[
  {
    "path": ".dvc/.gitignore",
    "content": "/config.local\n/tmp\n/cache\n"
  },
  {
    "path": ".dvc/config",
    "content": ""
  },
  {
    "path": ".dvc/plots/confusion.json",
    "content": "{\n    \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.json\",\n    \"data\": {\n        \"values\": \"<DVC_METRIC_DATA>\"\n    },\n    \"title\": \"<DVC_METRIC_TITLE>\",\n    \"mark\": \"rect\",\n    \"encoding\": {\n        \"x\": {\n            \"field\": \"<DVC_METRIC_X>\",\n            \"type\": \"nominal\",\n            \"sort\": \"ascending\",\n            \"title\": \"<DVC_METRIC_X_LABEL>\"\n        },\n        \"y\": {\n            \"field\": \"<DVC_METRIC_Y>\",\n            \"type\": \"nominal\",\n            \"sort\": \"ascending\",\n            \"title\": \"<DVC_METRIC_Y_LABEL>\"\n        },\n        \"color\": {\n            \"aggregate\": \"count\",\n            \"type\": \"quantitative\"\n        },\n        \"facet\": {\n            \"field\": \"rev\",\n            \"type\": \"nominal\"\n        }\n    }\n}\n"
  },
  {
    "path": ".dvc/plots/default.json",
    "content": "{\n    \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.json\",\n    \"data\": {\n        \"values\": \"<DVC_METRIC_DATA>\"\n    },\n    \"title\": \"<DVC_METRIC_TITLE>\",\n    \"mark\": {\n        \"type\": \"line\"\n    },\n    \"encoding\": {\n        \"x\": {\n            \"field\": \"<DVC_METRIC_X>\",\n            \"type\": \"quantitative\",\n            \"title\": \"<DVC_METRIC_X_LABEL>\"\n        },\n        \"y\": {\n            \"field\": \"<DVC_METRIC_Y>\",\n            \"type\": \"quantitative\",\n            \"title\": \"<DVC_METRIC_Y_LABEL>\",\n            \"scale\": {\n                \"zero\": false\n            }\n        },\n        \"color\": {\n            \"field\": \"rev\",\n            \"type\": \"nominal\"\n        }\n    }\n}\n"
  },
  {
    "path": ".dvc/plots/scatter.json",
    "content": "{\n    \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.json\",\n    \"data\": {\n        \"values\": \"<DVC_METRIC_DATA>\"\n    },\n    \"title\": \"<DVC_METRIC_TITLE>\",\n    \"mark\": \"point\",\n    \"encoding\": {\n        \"x\": {\n            \"field\": \"<DVC_METRIC_X>\",\n            \"type\": \"quantitative\",\n            \"title\": \"<DVC_METRIC_X_LABEL>\"\n        },\n        \"y\": {\n            \"field\": \"<DVC_METRIC_Y>\",\n            \"type\": \"quantitative\",\n            \"title\": \"<DVC_METRIC_Y_LABEL>\",\n            \"scale\": {\n                \"zero\": false\n            }\n        },\n        \"color\": {\n            \"field\": \"rev\",\n            \"type\": \"nominal\"\n        }\n    }\n}\n"
  },
  {
    "path": ".dvc/plots/smooth.json",
    "content": "{\n    \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.json\",\n    \"data\": {\n        \"values\": \"<DVC_METRIC_DATA>\"\n    },\n    \"title\": \"<DVC_METRIC_TITLE>\",\n    \"mark\": {\n        \"type\": \"line\"\n    },\n    \"encoding\": {\n        \"x\": {\n            \"field\": \"<DVC_METRIC_X>\",\n            \"type\": \"quantitative\",\n            \"title\": \"<DVC_METRIC_X_LABEL>\"\n        },\n        \"y\": {\n            \"field\": \"<DVC_METRIC_Y>\",\n            \"type\": \"quantitative\",\n            \"title\": \"<DVC_METRIC_Y_LABEL>\",\n            \"scale\": {\n                \"zero\": false\n            }\n        },\n        \"color\": {\n            \"field\": \"rev\",\n            \"type\": \"nominal\"\n        }\n    },\n    \"transform\": [\n        {\n            \"loess\": \"<DVC_METRIC_Y>\",\n            \"on\": \"<DVC_METRIC_X>\",\n            \"groupby\": [\n                \"rev\"\n            ],\n            \"bandwidth\": 0.3\n        }\n    ]\n}\n"
  },
  {
    "path": ".dvcignore",
    "content": "# Add patterns of files dvc should ignore, which could improve\n# the performance. Learn more at\n# https://dvc.org/doc/user-guide/dvcignore\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bug-report.yml",
    "content": "name: Bug report\ndescription: Create a report to help reproduce and fix the bug\nbody:\n  - type: textarea\n    id: description\n    attributes:\n      label: Describe the bug\n      description: A clear and concise description of what the bug is\n    validations:\n      required: true\n  \n  - type: textarea\n    id: reproduction\n    attributes:\n      label: Steps to reproduce the bug\n      description: |\n        Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.\n        If you have code snippets, error messages, stack traces please provide them here as well.\n        Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting\n        Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.\n      placeholder: |\n        Steps to reproduce the behavior:\n          \n          1.\n          2.\n          3.\n    validations:\n      required: true\n\n  - type: textarea\n    id: expected-behavior\n    validations:\n      required: true\n    attributes:\n      label: Expected behavior\n      description: A clear and concise description of the expected results.\n\n  - type: textarea\n    id: environment-info\n    attributes:\n      label: Environment info\n      description: Please share your environemnt info with us. You can run the command `datasets-cli env` and copy-paste its output below.\n      placeholder: datasets version, platform, python version, ...\n    validations:\n      required: true\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "content": "contact_links:\n  - name: Datasets on the Hugging Face Hub\n    url: https://huggingface.co/datasets\n    about: Please use the \"Community\" tab of the dataset on the Hugging Face Hub to open a discussion or a pull request\n  - name: Forum\n    url: https://discuss.huggingface.co/c/datasets/10\n    about: Please ask and answer questions here, and engage with other community members\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature-request.yml",
    "content": "name: Feature request\ndescription: Suggest an idea for this project\nlabels: [\"enhancement\"]\nbody:\n  - type: textarea\n    id: feature-request\n    attributes:\n      label: Feature request\n      description: A clear and concise description of the feature proposal.\n    validations:\n      required: true\n  \n  - type: textarea\n    id: motivation\n    validations:\n      required: true\n    attributes:\n      label: Motivation\n      description: |\n        Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.   \n\n  - type: textarea\n    id: contribution\n    validations:\n      required: true\n    attributes:\n      label: Your contribution\n      description: |\n        Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md).\n"
  },
  {
    "path": ".github/conda/build.sh",
    "content": "$PYTHON setup.py install --single-version-externally-managed --record=record.txt\n"
  },
  {
    "path": ".github/conda/meta.yaml",
    "content": "{% set name = \"datasets\" %}\n\npackage:\n  name: \"{{ name|lower }}\"\n  version: \"{{ DATASETS_VERSION }}\"\n\nsource:\n  path: ../../\n\nbuild:\n  noarch: python\n\nrequirements:\n  host:\n    - python\n    - pip\n    - numpy >=1.17\n    - pyarrow >=16.0.0\n    - python-xxhash\n    - dill\n    - pandas\n    - requests >=2.19.0\n    - httpx <1.0.0\n    - tqdm >=4.66.3\n    - dataclasses\n    - multiprocess\n    - fsspec\n    - huggingface_hub >=0.25.0,<2.0.0\n    - packaging\n  run:\n    - python\n    - pip\n    - numpy >=1.17\n    - pyarrow >=16.0.0\n    - python-xxhash\n    - dill\n    - pandas\n    - requests >=2.19.0\n    - httpx <1.0.0\n    - tqdm >=4.66.3\n    - dataclasses\n    - multiprocess\n    - fsspec\n    - huggingface_hub >=0.25.0,<2.0.0\n    - packaging\n\ntest:\n  imports:\n    - datasets\n\nabout:\n  home: https://huggingface.co\n  license: Apache License 2.0\n  license_file: LICENSE\n  summary: \"🤗 The largest hub of ready-to-use NLP datasets for ML models with fast, easy-to-use and efficient data manipulation tools\"\n"
  },
  {
    "path": ".github/workflows/build_documentation.yml",
    "content": "name: Build documentation\n\non:\n  push:\n    branches:\n      - main\n      - doc-builder*\n      - v*-release\n      - v*-patch\n\njobs:\n  build:\n    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main\n    with:\n      commit_sha: ${{ github.sha }}\n      package: datasets\n      notebook_folder: datasets_doc\n    secrets:\n      token: ${{ secrets.HUGGINGFACE_PUSH }}\n      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}\n"
  },
  {
    "path": ".github/workflows/build_pr_documentation.yml",
    "content": "name: Build PR Documentation\n\non:\n  pull_request:\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}\n  cancel-in-progress: true\n\njobs:\n  build:\n    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main\n    with:\n      commit_sha: ${{ github.event.pull_request.head.sha }}\n      pr_number: ${{ github.event.number }}\n      package: datasets\n"
  },
  {
    "path": ".github/workflows/ci.yml",
    "content": "name: CI\n\non:\n  pull_request:\n    branches:\n      - main\n  push:\n    branches:\n      - main\n      - ci-*\n\nenv:\n  CI_HEADERS: ${{ secrets.CI_HEADERS }}\n\njobs:\n\n  check_code_quality:\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v4\n      - name: Set up Python\n        uses: actions/setup-python@v5\n        with:\n          python-version: \"3.10\"\n      - name: Install dependencies\n        run: |\n          python -m pip install --upgrade pip\n          pip install .[quality]\n      - name: Check quality\n        run: |\n          ruff check tests src benchmarks utils setup.py # linter\n          ruff format --check tests src benchmarks utils setup.py # formatter\n\n  test:\n    needs: check_code_quality\n    strategy:\n      matrix:\n        test: ['unit', 'integration']\n        os: [ubuntu-latest, windows-latest]\n        deps_versions: [deps-latest, deps-minimum]\n    continue-on-error: ${{ matrix.test == 'integration' }}\n    runs-on: ${{ matrix.os }}\n    steps:\n      - uses: actions/checkout@v4\n        with:\n          fetch-depth: 0\n      - name: Setup FFmpeg\n        if: ${{ matrix.os == 'ubuntu-latest' }}\n        run: |\n          sudo apt update\n          sudo apt install -y ffmpeg \n      - name: Set up Python 3.10\n        uses: actions/setup-python@v5\n        with:\n          python-version: \"3.10\"\n      - name: Setup conda env (windows)\n        if: ${{ matrix.os == 'windows-latest' }}\n        uses: conda-incubator/setup-miniconda@v2\n        with:\n          auto-update-conda: true\n          miniconda-version: \"latest\"\n          activate-environment: test\n          python-version: \"3.10\"\n      - name: Setup FFmpeg (windows)\n        if: ${{ matrix.os == 'windows-latest' }}\n        run: conda install \"ffmpeg=7.0.1\" -c conda-forge\n      - name: Upgrade pip\n        run: python -m pip install --upgrade pip\n      - name: Install uv\n        run: pip install --upgrade uv\n  
    - name: Install dependencies\n        run: uv pip install --system \"datasets[tests] @ .\"\n      - name: Install dependencies (latest versions)\n        if: ${{ matrix.deps_versions == 'deps-latest' }}\n        run: uv pip install --system --upgrade pyarrow huggingface-hub \"dill<0.3.9\"\n      - name: Install dependencies (minimum versions)\n        if: ${{ matrix.deps_versions != 'deps-latest' }}\n        run: uv pip install --system pyarrow==21.0.0 huggingface-hub==0.25.0 transformers dill==0.3.1.1\n      - name: Print dependencies\n        run: uv pip list\n      - name: Test with pytest\n        run: |\n          python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/\n\n  test_py314:\n    needs: check_code_quality\n    strategy:\n      matrix:\n        test: ['unit']\n        os: [ubuntu-latest, windows-latest]\n        deps_versions: [deps-latest]\n    continue-on-error: false\n    runs-on: ${{ matrix.os }}\n    steps:\n      - uses: actions/checkout@v4\n        with:\n          fetch-depth: 0\n      - name: Setup FFmpeg\n        if: ${{ matrix.os == 'ubuntu-latest' }}\n        run: |\n          sudo apt update\n          sudo apt install -y ffmpeg\n      - name: Set up Python 3.14\n        uses: actions/setup-python@v5\n        with:\n          python-version: \"3.14\"\n      - name: Setup conda env (windows)\n        if: ${{ matrix.os == 'windows-latest' }}\n        uses: conda-incubator/setup-miniconda@v2\n        with:\n          auto-update-conda: true\n          miniconda-version: \"latest\"\n          activate-environment: test\n          python-version: \"3.14\"\n      - name: Setup FFmpeg (windows)\n        if: ${{ matrix.os == 'windows-latest' }}\n        run: conda install \"ffmpeg=7.0.1\" -c conda-forge\n      - name: Upgrade pip\n        run: python -m pip install --upgrade pip\n      - name: Install uv\n        run: pip install --upgrade uv\n      - name: Install dependencies\n        run: uv pip install --system 
\"datasets[tests] @ .\"\n      - name: Print dependencies\n        run: uv pip list\n      - name: Test with pytest\n        run: |\n          python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/\n\n  test_py314_future:\n    needs: check_code_quality\n    strategy:\n      matrix:\n        test: ['unit']\n        os: [ubuntu-latest, windows-latest]\n        deps_versions: [deps-latest]\n    continue-on-error: false\n    runs-on: ${{ matrix.os }}\n    steps:\n      - uses: actions/checkout@v4\n        with:\n          fetch-depth: 0\n      - name: Setup FFmpeg\n        if: ${{ matrix.os == 'ubuntu-latest' }}\n        run: |\n          sudo apt update\n          sudo apt install -y ffmpeg \n      - name: Set up Python 3.14\n        uses: actions/setup-python@v5\n        with:\n          python-version: \"3.14\"\n      - name: Setup conda env (windows)\n        if: ${{ matrix.os == 'windows-latest' }}\n        uses: conda-incubator/setup-miniconda@v2\n        with:\n          auto-update-conda: true\n          miniconda-version: \"latest\"\n          activate-environment: test\n          python-version: \"3.14\"\n      - name: Setup FFmpeg (windows)\n        if: ${{ matrix.os == 'windows-latest' }}\n        run: conda install \"ffmpeg=7.0.1\" -c conda-forge\n      - name: Upgrade pip\n        run: python -m pip install --upgrade pip\n      - name: Install uv\n        run: pip install --upgrade uv\n      - name: Install dependencies\n        run: uv pip install --system \"datasets[tests_numpy2] @ .\"\n      - name: Print dependencies\n        run: pip list\n\n      - name: Test with pytest\n        run: |\n          python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/\n"
  },
  {
    "path": ".github/workflows/release-conda.yml",
    "content": "name: Release - Conda\n\non:\n  push:\n    tags:\n      - \"[0-9]+.[0-9]+.[0-9]+*\"\n\nenv:\n  ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}\n\njobs:\n  build_and_package:\n    runs-on: ubuntu-22.04\n    defaults:\n      run:\n        shell: bash -l {0}\n\n    steps:\n      - name: Checkout repository\n        uses: actions/checkout@v4\n\n      - name: Install miniconda\n        uses: conda-incubator/setup-miniconda@v2\n        with:\n          auto-update-conda: true\n          auto-activate-base: false\n          activate-environment: \"build-datasets\"\n          python-version: \"3.10\"\n          channels: huggingface\n\n      - name: Setup conda env\n        run: |\n          conda install -c defaults anaconda-client conda-build\n\n      - name: Extract version\n        run: echo \"DATASETS_VERSION=`python setup.py --version`\" >> $GITHUB_ENV\n\n      - name: Build conda packages\n        run: |\n          conda info\n          conda build .github/conda\n\n      - name: Upload to Anaconda\n        run: |\n          anaconda upload `conda build .github/conda --output -c conda-forge` --force\n"
  },
  {
    "path": ".github/workflows/self-assign.yaml",
    "content": "name: Self-assign\non:\n  issue_comment:\n    types: created\njobs:\n  one:\n    runs-on: ubuntu-latest\n    if: >-\n      (github.event.comment.body == '#take' ||\n       github.event.comment.body == '#self-assign')\n      && !github.event.issue.assignee\n    steps:\n      - run: |\n          echo \"Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}\"\n          curl -H \"Authorization: token ${{ secrets.GITHUB_TOKEN }}\" -d '{\"assignees\": [\"${{ github.event.comment.user.login }}\"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees\n          curl -H \"Authorization: token ${{ secrets.GITHUB_TOKEN }}\" -X \"DELETE\" https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels/help%20wanted\n"
  },
  {
    "path": ".github/workflows/trufflehog.yml",
    "content": "on:\n  push:\n\nname: Secret Leaks\n\npermissions:\n  contents: read\n\njobs:\n  trufflehog:\n    runs-on: ubuntu-latest\n    steps:\n    - name: Checkout code\n      uses: actions/checkout@v5\n      with:\n        fetch-depth: 0\n    - name: Secret Scanning\n      uses: trufflesecurity/trufflehog@main\n      with:\n        extra_args: --results=verified\n"
  },
  {
    "path": ".github/workflows/upload_pr_documentation.yml",
    "content": "name: Upload PR Documentation\n\non:\n  workflow_run:\n    workflows: [\"Build PR Documentation\"]\n    types:\n      - completed\n\njobs:\n  build:\n    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main\n    with:\n      package_name: datasets\n    secrets:\n      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}\n      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}"
  },
  {
    "path": ".gitignore",
    "content": "# Locked files\n*.lock\n!dvc.lock\n\n# Extracted dummy data\ndatasets/**/dummy_data-zip-extracted/\n\n# Compiled python modules.\n*.pyc\n\n# Byte-compiled\n_pycache__/\n.cache/\n\n# Python egg metadata, regenerated from source files by setuptools.\n*.egg-info\n.eggs/\n\n# PyPI distribution artifacts.\nbuild/\ndist/\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# pyenv\n.python-version\n\n# Tests\n.pytest_cache/\n\n# Other\n*.DS_Store\n\n# PyCharm/vscode\n.idea\n.vscode\n\n# Vim\n.*.swp\n\n# playground\n/playground\n\n# Sphinx documentation\ndocs/_build/\ndocs/source/_build/\n\n# Benchmark results\nreport.json\nreport.md\n\n# Ruff\n.ruff_cache\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n  - repo: https://github.com/charliermarsh/ruff-pre-commit # https://github.com/charliermarsh/ruff#usage\n    rev: 'v0.11.8'\n    hooks:\n      # Run the linter.\n      - id: ruff\n        args: [ --fix ]\n      # Run the formatter.\n      - id: ruff-format\n"
  },
  {
    "path": ".zenodo.json",
    "content": "{\n    \"license\": \"Apache-2.0\",\n    \"creators\": [\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Quentin Lhoest\"\n        },\n        {\n            \"orcid\": \"0000-0003-1727-1045\",\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Albert Villanova del Moral\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Patrick von Platen\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Thomas Wolf\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Mario Šaško\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Yacine Jernite\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Abhishek Thakur\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Lewis Tunstall\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Suraj Patil\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Mariama Drame\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Julien Chaumond\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Julien Plu\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Joe Davison\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Simon Brandeis\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Victor Sanh\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Teven Le Scao\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            
\"name\": \"Kevin Canwen Xu\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Nicolas Patry\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Steven Liu\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Angelina McMillan-Major\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Philipp Schmid\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Sylvain Gugger\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Nathan Raw\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Sylvain Lesage\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Anton Lozhkov\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Matthew Carrigan\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Th\\u00e9o Matussi\\u00e8re\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Leandro von Werra\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Lysandre Debut\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Stas Bekman\"\n        },\n        {\n            \"affiliation\": \"Hugging Face\",\n            \"name\": \"Cl\\u00e9ment Delangue\"\n        }\n    ]\n}"
  },
  {
    "path": "ADD_NEW_DATASET.md",
    "content": "# How to add one new datasets\n\nAdd datasets directly to the 🤗 Hugging Face Hub!\n\nYou can share your dataset on https://huggingface.co/datasets directly using your account, see the documentation:\n\n* [Create a dataset and upload files on the website](https://huggingface.co/docs/datasets/upload_dataset)\n* [Advanced guide using the CLI](https://huggingface.co/docs/datasets/share)\n"
  },
  {
    "path": "AUTHORS",
    "content": "# This is the list of HuggingFace Datasets authors for copyright purposes.\n#\n# This does not necessarily list everyone who has contributed code, since in\n# some cases, their employer may be the copyright holder.  To see the full list\n# of contributors, see the revision history in source control.\n\nGoogle Inc.\nHuggingFace Inc.\n"
  },
  {
    "path": "CITATION.cff",
    "content": "cff-version: 1.2.0\nmessage: \"If you use this software, please cite it as below.\"\ntitle: \"huggingface/datasets\"\nauthors:\n- family-names: Lhoest\n  given-names: Quentin\n- family-names: Villanova del Moral\n  given-names: Albert\n  orcid: \"https://orcid.org/0000-0003-1727-1045\"\n- family-names: von Platen\n  given-names: Patrick\n- family-names: Wolf\n  given-names: Thomas\n- family-names: Šaško\n  given-names: Mario\n- family-names: Jernite\n  given-names: Yacine\n- family-names: Thakur\n  given-names: Abhishek\n- family-names: Tunstall\n  given-names: Lewis\n- family-names: Patil\n  given-names: Suraj\n- family-names: Drame\n  given-names: Mariama\n- family-names: Chaumond\n  given-names: Julien\n- family-names: Plu\n  given-names: Julien\n- family-names: Davison\n  given-names: Joe\n- family-names: Brandeis\n  given-names: Simon\n- family-names: Sanh\n  given-names: Victor\n- family-names: Le Scao\n  given-names: Teven\n- family-names: Canwen Xu\n  given-names: Kevin\n- family-names: Patry\n  given-names: Nicolas\n- family-names: Liu\n  given-names: Steven\n- family-names: McMillan-Major\n  given-names: Angelina\n- family-names: Schmid\n  given-names: Philipp\n- family-names: Gugger\n  given-names: Sylvain\n- family-names: Raw\n  given-names: Nathan\n- family-names: Lesage\n  given-names: Sylvain\n- family-names: Lozhkov\n  given-names: Anton\n- family-names: Carrigan\n  given-names: Matthew\n- family-names: Matussière\n  given-names: Théo\n- family-names: von Werra\n  given-names: Leandro\n- family-names: Debut\n  given-names: Lysandre\n- family-names: Bekman\n  given-names: Stas\n- family-names: Delangue\n  given-names: Clément\ndoi: 10.5281/zenodo.4817768\nrepository-code: \"https://github.com/huggingface/datasets\"\nlicense: Apache-2.0\npreferred-citation:\n  type: conference-paper\n  title: \"Datasets: A Community Library for Natural Language Processing\"\n  authors:\n  - family-names: Lhoest\n    given-names: Quentin\n  - 
family-names: Villanova del Moral\n    given-names: Albert\n    orcid: \"https://orcid.org/0000-0003-1727-1045\"\n  - family-names: von Platen\n    given-names: Patrick\n  - family-names: Wolf\n    given-names: Thomas\n  - family-names: Šaško\n    given-names: Mario\n  - family-names: Jernite\n    given-names: Yacine\n  - family-names: Thakur\n    given-names: Abhishek\n  - family-names: Tunstall\n    given-names: Lewis\n  - family-names: Patil\n    given-names: Suraj\n  - family-names: Drame\n    given-names: Mariama\n  - family-names: Chaumond\n    given-names: Julien\n  - family-names: Plu\n    given-names: Julien\n  - family-names: Davison\n    given-names: Joe\n  - family-names: Brandeis\n    given-names: Simon\n  - family-names: Sanh\n    given-names: Victor\n  - family-names: Le Scao\n    given-names: Teven\n  - family-names: Canwen Xu\n    given-names: Kevin\n  - family-names: Patry\n    given-names: Nicolas\n  - family-names: Liu\n    given-names: Steven\n  - family-names: McMillan-Major\n    given-names: Angelina\n  - family-names: Schmid\n    given-names: Philipp\n  - family-names: Gugger\n    given-names: Sylvain\n  - family-names: Raw\n    given-names: Nathan\n  - family-names: Lesage\n    given-names: Sylvain\n  - family-names: Lozhkov\n    given-names: Anton\n  - family-names: Carrigan\n    given-names: Matthew\n  - family-names: Matussière\n    given-names: Théo\n  - family-names: von Werra\n    given-names: Leandro\n  - family-names: Debut\n    given-names: Lysandre\n  - family-names: Bekman\n    given-names: Stas\n  - family-names: Delangue\n    given-names: Clément\n  collection-title: \"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations\"\n  collection-type: proceedings\n  month: 11\n  year: 2021\n  publisher:\n    name: \"Association for Computational Linguistics\"\n  url: \"https://aclanthology.org/2021.emnlp-demo.21\"\n  start: 175\n  end: 184\n  identifiers:\n    - type: other\n      
value: \"arXiv:2109.02846\"\n      description: \"The arXiv preprint of the paper\"\n"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "content": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, contributors, and leaders pledge to make participation in our\ncommunity a harassment-free experience for everyone, regardless of age, body\nsize, visible or invisible disability, ethnicity, sex characteristics, gender\nidentity and expression, level of experience, education, socio-economic status,\nnationality, personal appearance, race, caste, color, religion, or sexual identity\nand orientation.\n\nWe pledge to act and interact in ways that contribute to an open, welcoming,\ndiverse, inclusive, and healthy community.\n\n## Our Standards\n\nExamples of behavior that contributes to a positive environment for our\ncommunity include:\n\n* Demonstrating empathy and kindness toward other people\n* Being respectful of differing opinions, viewpoints, and experiences\n* Giving and gracefully accepting constructive feedback\n* Accepting responsibility and apologizing to those affected by our mistakes,\n  and learning from the experience\n* Focusing on what is best not just for us as individuals, but for the\n  overall community\n\nExamples of unacceptable behavior include:\n\n* The use of sexualized language or imagery, and sexual attention or\n  advances of any kind\n* Trolling, insulting or derogatory comments, and personal or political attacks\n* Public or private harassment\n* Publishing others' private information, such as a physical or email\n  address, without their explicit permission\n* Other conduct which could reasonably be considered inappropriate in a\n  professional setting\n\n## Enforcement Responsibilities\n\nCommunity leaders are responsible for clarifying and enforcing our standards of\nacceptable behavior and will take appropriate and fair corrective action in\nresponse to any behavior that they deem inappropriate, threatening, offensive,\nor harmful.\n\nCommunity leaders have the right and responsibility to remove, edit, or reject\ncomments, commits, code, wiki 
edits, issues, and other contributions that are\nnot aligned to this Code of Conduct, and will communicate reasons for moderation\ndecisions when appropriate.\n\n## Scope\n\nThis Code of Conduct applies within all community spaces, and also applies when\nan individual is officially representing the community in public spaces.\nExamples of representing our community include using an official e-mail address,\nposting via an official social media account, or acting as an appointed\nrepresentative at an online or offline event.\n\n## Enforcement\n\nInstances of abusive, harassing, or otherwise unacceptable behavior may be\nreported to the community leaders responsible for enforcement at\nfeedback@huggingface.co.\nAll complaints will be reviewed and investigated promptly and fairly.\n\nAll community leaders are obligated to respect the privacy and security of the\nreporter of any incident.\n\n## Enforcement Guidelines\n\nCommunity leaders will follow these Community Impact Guidelines in determining\nthe consequences for any action they deem in violation of this Code of Conduct:\n\n### 1. Correction\n\n**Community Impact**: Use of inappropriate language or other behavior deemed\nunprofessional or unwelcome in the community.\n\n**Consequence**: A private, written warning from community leaders, providing\nclarity around the nature of the violation and an explanation of why the\nbehavior was inappropriate. A public apology may be requested.\n\n### 2. Warning\n\n**Community Impact**: A violation through a single incident or series\nof actions.\n\n**Consequence**: A warning with consequences for continued behavior. No\ninteraction with the people involved, including unsolicited interaction with\nthose enforcing the Code of Conduct, for a specified period of time. This\nincludes avoiding interactions in community spaces as well as external channels\nlike social media. Violating these terms may lead to a temporary or\npermanent ban.\n\n### 3. 
Temporary Ban\n\n**Community Impact**: A serious violation of community standards, including\nsustained inappropriate behavior.\n\n**Consequence**: A temporary ban from any sort of interaction or public\ncommunication with the community for a specified period of time. No public or\nprivate interaction with the people involved, including unsolicited interaction\nwith those enforcing the Code of Conduct, is allowed during this period.\nViolating these terms may lead to a permanent ban.\n\n### 4. Permanent Ban\n\n**Community Impact**: Demonstrating a pattern of violation of community\nstandards, including sustained inappropriate behavior,  harassment of an\nindividual, or aggression toward or disparagement of classes of individuals.\n\n**Consequence**: A permanent ban from any sort of public interaction within\nthe community.\n\n## Attribution\n\nThis Code of Conduct is adapted from the [Contributor Covenant][homepage],\nversion 2.0, available at\n[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].\n\nCommunity Impact Guidelines were inspired by \n[Mozilla's code of conduct enforcement ladder][Mozilla CoC].\n\nFor answers to common questions about this code of conduct, see the FAQ at\n[https://www.contributor-covenant.org/faq][FAQ]. Translations are available \nat [https://www.contributor-covenant.org/translations][translations].\n\n[homepage]: https://www.contributor-covenant.org\n[v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html\n[Mozilla CoC]: https://github.com/mozilla/diversity\n[FAQ]: https://www.contributor-covenant.org/faq\n[translations]: https://www.contributor-covenant.org/translations\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# How to contribute to Datasets?\n[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](CODE_OF_CONDUCT.md)\n\nDatasets is an open source project, so all contributions and suggestions are welcome.\n\nYou can contribute in many different ways: giving ideas, answering questions, reporting bugs, proposing enhancements,\nimproving the documentation, fixing bugs,...\n\nMany thanks in advance to every contributor.\n\nIn order to facilitate healthy, constructive behavior in an open and inclusive community, we all respect and abide by\nour [code of conduct](CODE_OF_CONDUCT.md).\n\n## How to work on an open Issue?\nYou have the list of open Issues at: https://github.com/huggingface/datasets/issues\n\nSome of them may have the label `help wanted`: that means that any contributor is welcome!\n\nIf you would like to work on any of the open Issues:\n\n1. Make sure it is not already assigned to someone else. You have the assignee (if any) on the top of the right column of the Issue page.\n\n2. You can self-assign it by commenting on the Issue page with the keyword: `#self-assign`.\n\n3. Work on your self-assigned issue and eventually create a Pull Request.\n\n## How to create a Pull Request?\nIf you want to add a dataset, see specific instructions in the section [*How to add a dataset*](#how-to-add-a-dataset-on-hugging-face).\n\n1. Fork the [repository](https://github.com/huggingface/datasets) by clicking on the 'Fork' button on the repository's page. This creates a copy of the code under your GitHub user account.\n\n2. Clone your fork to your local disk, and add the base repository as a remote:\n\n    ```bash\n    git clone git@github.com:<your Github handle>/datasets.git\n    cd datasets\n    git remote add upstream https://github.com/huggingface/datasets.git\n    ```\n\n3. 
Create a new branch to hold your development changes:\n\n    ```bash\n    git checkout -b a-descriptive-name-for-my-changes\n    ```\n\n    **do not** work on the `main` branch.\n\n4. Set up a development environment by running the following command in a virtual environment:\n\n    Simple setup with code formatting only (recommended)\n    ```bash\n    pip install -e \".[quality]\"\n    ```\n    \n    Advanced setup with all the optional dependencies\n    ```bash\n    pip install -e \".[dev]\"\n    ```\n\n   (If datasets was already installed in the virtual environment, remove\n   it with `pip uninstall datasets` before reinstalling it in editable\n   mode with the `-e` flag.)\n\n5. Develop the features on your branch.\n\n6. Format your code. Run `ruff` so that your newly added files look nice with the following command:\n\n    ```bash\n    make style\n    ```\n   \n7. _(Optional)_ You can also use [`pre-commit`](https://pre-commit.com/) to format your code automatically each time you run `git commit`, instead of running `make style` manually. \nTo do this, install `pre-commit` via `pip install pre-commit` and then run `pre-commit install` in the project's root directory to set up the hooks.\nNote that if any files were formatted by `pre-commit` hooks during committing, you have to run `git commit` again.\n\n\n8. Once you're happy with your contribution, add your changed files and make a commit to record your changes locally:\n\n    ```bash\n    git add -u\n    git commit\n    ```\n\n    It is a good idea to sync your copy of the code with the original\n    repository regularly. This way you can quickly account for changes:\n\n    ```bash\n    git fetch upstream\n    git rebase upstream/main\n    ```\n\n9. Once you are satisfied, push the changes to your fork repo using:\n\n   ```bash\n   git push -u origin a-descriptive-name-for-my-changes\n   ```\n\n   Go to the webpage of your fork on GitHub. 
Click on \"Pull request\" to send your changes to the project maintainers for review.\n\n## Datasets on Hugging Face\n\n### How to add a dataset on Hugging Face\n\nYou can share your dataset on https://huggingface.co/datasets directly using your account (no need to open a PR on GitHub), see the documentation:\n\n* [Create a dataset and upload files on the website](https://huggingface.co/docs/datasets/upload_dataset)\n* [Advanced guide using the CLI](https://huggingface.co/docs/datasets/share)\n\n### How to contribute to the dataset cards\n\nImproving the documentation of datasets is an ever-increasing effort, and we invite users to contribute by sharing their insights with the community in the `README.md` dataset cards provided for each dataset.\n\nIf you see that a dataset card is missing information that you are in a position to provide (as an author of the dataset or as an experienced user), the best thing you can do is to open a Pull Request on the Hugging Face Hub. To do so, go to the \"Files and versions\" tab of the dataset page and edit the `README.md` file. We provide:\n\n* a [template](https://github.com/huggingface/datasets/blob/main/templates/README.md)\n* a [guide](https://github.com/huggingface/datasets/blob/main/templates/README_guide.md) describing what information should go into each of the paragraphs\n* and if you need inspiration, we recommend looking through a [completed example](https://huggingface.co/datasets/eli5/blob/main/README.md)\n\nIf you are a **dataset author**... you know what to do, it is your dataset after all ;) ! 
We would especially appreciate if you could help us fill in information about the process of creating the dataset, and take a moment to reflect on its social impact and possible limitations if you haven't already done so in the dataset paper or in another data statement.\n\nIf you are a **user of a dataset**, the main source of information should be the dataset paper if it is available: we recommend pulling information from there into the relevant paragraphs of the template. We also eagerly welcome discussions on the [Considerations for Using the Data](https://github.com/huggingface/datasets/blob/main/templates/README_guide.md#considerations-for-using-the-data) based on existing scholarship or personal experience that would benefit the whole community.\n\nFinally, if you want more information on the how and why of dataset cards, we strongly recommend reading the foundational works [Datasheets for Datasets](https://huggingface.co/papers/1803.09010) and [Data Statements for NLP](https://www.aclweb.org/anthology/Q18-1041/).\n\nThank you for your contribution!\n\n## Code of conduct\n\nThis project adheres to the HuggingFace [code of conduct](CODE_OF_CONDUCT.md).\nBy participating, you are expected to abide by this code.\n"
  },
  {
    "path": "LICENSE",
    "content": "\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "Makefile",
    "content": ".PHONY: quality style test\n\ncheck_dirs := tests src benchmarks utils\n\n# Check that source code meets quality standards\n\nquality:\n\truff check $(check_dirs) setup.py  # linter\n\truff format --check $(check_dirs) setup.py # formatter\n\n# Format source code automatically\n\nstyle:\n\truff check --fix $(check_dirs) setup.py # linter\n\truff format $(check_dirs) setup.py # formatter\n\n# Run tests for the library\n\ntest:\n\tpython -m pytest -n auto --dist=loadfile -s -v ./tests/\n"
  },
  {
    "path": "README.md",
    "content": "<p align=\"center\">\n  <picture>\n    <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://huggingface.co/datasets/huggingface/documentation-images/raw/main/datasets-logo-dark.svg\">\n    <source media=\"(prefers-color-scheme: light)\" srcset=\"https://huggingface.co/datasets/huggingface/documentation-images/raw/main/datasets-logo-light.svg\">\n    <img alt=\"Hugging Face Datasets Library\" src=\"https://huggingface.co/datasets/huggingface/documentation-images/raw/main/datasets-logo-light.svg\" width=\"352\" height=\"59\" style=\"max-width: 100%;\">\n  </picture>\n  <br/>\n  <br/>\n</p>\n\n<p align=\"center\">\n    <a href=\"https://github.com/huggingface/datasets/actions/workflows/ci.yml?query=branch%3Amain\"><img alt=\"Build\" src=\"https://github.com/huggingface/datasets/actions/workflows/ci.yml/badge.svg?branch=main\"></a>\n    <a href=\"https://github.com/huggingface/datasets/blob/main/LICENSE\"><img alt=\"GitHub\" src=\"https://img.shields.io/github/license/huggingface/datasets.svg?color=blue\"></a>\n    <a href=\"https://huggingface.co/docs/datasets/index.html\"><img alt=\"Documentation\" src=\"https://img.shields.io/website/http/huggingface.co/docs/datasets/index.html.svg?down_color=red&down_message=offline&up_message=online\"></a>\n    <a href=\"https://github.com/huggingface/datasets/releases\"><img alt=\"GitHub release\" src=\"https://img.shields.io/github/release/huggingface/datasets.svg\"></a>\n    <a href=\"https://huggingface.co/datasets/\"><img alt=\"Number of datasets\" src=\"https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/datasets&color=brightgreen\"></a>\n    <a href=\"CODE_OF_CONDUCT.md\"><img alt=\"Contributor Covenant\" src=\"https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg\"></a>\n    <a href=\"https://zenodo.org/badge/latestdoi/250213286\"><img src=\"https://zenodo.org/badge/250213286.svg\" alt=\"DOI\"></a>\n</p>\n\n🤗 Datasets is a lightweight library providing **two** main 
features:\n\n- **one-line dataloaders for many public datasets**: one-liners to download and pre-process any of the ![number of datasets](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/datasets&color=brightgreen) major public datasets (image datasets, audio datasets, text datasets in 467 languages and dialects, etc.) provided on the [HuggingFace Datasets Hub](https://huggingface.co/datasets). With a simple command like `squad_dataset = load_dataset(\"rajpurkar/squad\")`, get any of these datasets ready to use in a dataloader for training/evaluating a ML model (Numpy/Pandas/PyTorch/TensorFlow/JAX),\n- **efficient data pre-processing**: simple, fast and reproducible data pre-processing for the public datasets as well as your own local datasets in CSV, JSON, text, PNG, JPEG, WAV, MP3, Parquet, HDF5, etc. With simple commands like `processed_dataset = dataset.map(process_example)`, efficiently prepare the dataset for inspection and ML model evaluation and training.\n\n[🎓 **Documentation**](https://huggingface.co/docs/datasets/) [🔎 **Find a dataset in the Hub**](https://huggingface.co/datasets) [🌟 **Share a dataset on the Hub**](https://huggingface.co/docs/datasets/share)\n\n<h3 align=\"center\">\n    <a href=\"https://hf.co/course\"><img src=\"https://raw.githubusercontent.com/huggingface/datasets/main/docs/source/imgs/course_banner.png\"></a>\n</h3>\n\n🤗 Datasets is designed to let the community easily add and share new datasets.\n\n🤗 Datasets has many additional interesting features:\n\n- Thrive on large datasets: 🤗 Datasets naturally frees the user from RAM memory limitation, all datasets are memory-mapped using an efficient zero-serialization cost backend (Apache Arrow).\n- Smart caching: never wait for your data to process several times.\n- Lightweight and fast with a transparent and pythonic API (multi-processing/caching/memory-mapping).\n- Built-in interoperability with NumPy, PyTorch, TensorFlow 2, JAX, Pandas, Polars and more.\n- Native 
support for audio, image and video data.\n- Enable streaming mode to save disk space and start iterating over the dataset immediately.\n\n🤗 Datasets originated from a fork of the awesome [TensorFlow Datasets](https://github.com/tensorflow/datasets) and the HuggingFace team want to deeply thank the TensorFlow Datasets team for building this amazing library.\n\n# Installation\n\n## With pip\n\n🤗 Datasets can be installed from PyPI and has to be installed in a virtual environment (venv or conda for instance)\n\n```bash\npip install datasets\n```\n\n## With conda\n\n🤗 Datasets can be installed using conda as follows:\n\n```bash\nconda install -c huggingface -c conda-forge datasets\n```\n\nFollow the installation pages of TensorFlow and PyTorch to see how to install them with conda.\n\nFor more details on installation, check the installation page in the documentation: https://huggingface.co/docs/datasets/installation\n\n## Installation to use with Machine Learning & Data frameworks\n\nIf you plan to use 🤗 Datasets with PyTorch (2.0+), TensorFlow (2.6+) or JAX (0.4+) you should also install PyTorch, TensorFlow or JAX.\n🤗 Datasets is also well integrated with data frameworks like PyArrow, Pandas, Polars and Spark, which should be installed separately.\n\nFor more details on using the library with these frameworks, check the quick start page in the documentation: https://huggingface.co/docs/datasets/quickstart\n\n# Usage\n\n🤗 Datasets is made to be very simple to use - the API is centered around a single function, `datasets.load_dataset(dataset_name, **kwargs)`, that instantiates a dataset.\n\nThis library can be used for text/image/audio/etc. datasets. 
Here is a quick example to load a text dataset:\n\n```python\nfrom datasets import load_dataset\n\n# Print all the available datasets\nfrom huggingface_hub import list_datasets\nprint([dataset.id for dataset in list_datasets(limit=20)])\n\n# Load a dataset and print the first example in the training set\nsquad_dataset = load_dataset('rajpurkar/squad')\nprint(squad_dataset['train'][0])\n\n# Process the dataset - add a column with the length of the context texts\ndataset_with_length = squad_dataset.map(lambda x: {\"length\": len(x[\"context\"])})\n\n# Process the dataset - tokenize the context texts (using a tokenizer from the 🤗 Transformers library)\nfrom transformers import AutoTokenizer\ntokenizer = AutoTokenizer.from_pretrained('bert-base-cased')\n\ntokenized_dataset = squad_dataset.map(lambda x: tokenizer(x['context']), batched=True)\n```\n\nIf your dataset is bigger than your disk or if you don't want to wait to download the data, you can use streaming:\n\n```python\n# If you want to use the dataset immediately and efficiently stream the data as you iterate over the dataset\nimage_dataset = load_dataset('timm/imagenet-1k-wds', streaming=True)\nfor example in image_dataset[\"train\"]:\n    break\n```\n\nFor more details on using the library, check the quick start page in the documentation: https://huggingface.co/docs/datasets/quickstart and the specific pages on:\n\n- Loading a dataset: https://huggingface.co/docs/datasets/loading\n- What's in a Dataset: https://huggingface.co/docs/datasets/access\n- Processing data with 🤗 Datasets: https://huggingface.co/docs/datasets/process\n    - Processing audio data: https://huggingface.co/docs/datasets/audio_process\n    - Processing image data: https://huggingface.co/docs/datasets/image_process\n    - Processing text data: https://huggingface.co/docs/datasets/nlp_process\n- Streaming a dataset: https://huggingface.co/docs/datasets/stream\n- etc.\n\n# Add a new dataset to the Hub\n\nWe have a very 
detailed step-by-step guide to add a new dataset to the ![number of datasets](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/datasets&color=brightgreen) datasets already provided on the [HuggingFace Datasets Hub](https://huggingface.co/datasets).\n\nYou can find:\n- [how to upload a dataset to the Hub using your web browser or Python](https://huggingface.co/docs/datasets/upload_dataset) and also\n- [how to upload it using Git](https://huggingface.co/docs/datasets/share).\n\n# Disclaimers\n\nYou can use 🤗 Datasets to load datasets based on versioned git repositories maintained by the dataset authors. For reproducibility reasons, we ask users to pin the `revision` of the repositories they use.\n\nIf you're a dataset owner and wish to update any part of it (description, citation, license, etc.), or do not want your dataset to be included in the Hugging Face Hub, please get in touch by opening a discussion or a pull request in the Community tab of the dataset page. Thanks for your contribution to the ML community!\n\n## BibTeX\n\nIf you want to cite our 🤗 Datasets library, you can use our [paper](https://huggingface.co/papers/2109.02846):\n\n```bibtex\n@inproceedings{lhoest-etal-2021-datasets,\n    title = \"Datasets: A Community Library for Natural Language Processing\",\n    author = \"Lhoest, Quentin  and\n      Villanova del Moral, Albert  and\n      Jernite, Yacine  and\n      Thakur, Abhishek  and\n      von Platen, Patrick  and\n      Patil, Suraj  and\n      Chaumond, Julien  and\n      Drame, Mariama  and\n      Plu, Julien  and\n      Tunstall, Lewis  and\n      Davison, Joe  and\n      {\\v{S}}a{\\v{s}}ko, Mario  and\n      Chhablani, Gunjan  and\n      Malik, Bhavitvya  and\n      Brandeis, Simon  and\n      Le Scao, Teven  and\n      Sanh, Victor  and\n      Xu, Canwen  and\n      Patry, Nicolas  and\n      McMillan-Major, Angelina  and\n      Schmid, Philipp  and\n      Gugger, Sylvain  and\n      Delangue, Cl{\\'e}ment  and\n      
Matussi{\\`e}re, Th{\\'e}o  and\n      Debut, Lysandre  and\n      Bekman, Stas  and\n      Cistac, Pierric  and\n      Goehringer, Thibault  and\n      Mustar, Victor  and\n      Lagunas, Fran{\\c{c}}ois  and\n      Rush, Alexander  and\n      Wolf, Thomas\",\n    booktitle = \"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations\",\n    month = nov,\n    year = \"2021\",\n    address = \"Online and Punta Cana, Dominican Republic\",\n    publisher = \"Association for Computational Linguistics\",\n    url = \"https://aclanthology.org/2021.emnlp-demo.21\",\n    pages = \"175--184\",\n    abstract = \"The scale, variety, and quantity of publicly-available NLP datasets has grown rapidly as researchers propose new tasks, larger models, and novel benchmarks. Datasets is a community library for contemporary NLP designed to support this ecosystem. Datasets aims to standardize end-user interfaces, versioning, and documentation, while providing a lightweight front-end that behaves similarly for small datasets as for internet-scale corpora. The design of the library incorporates a distributed, community-driven approach to adding datasets and documenting usage. After a year of development, the library now includes more than 650 unique datasets, has more than 250 contributors, and has helped support a variety of novel cross-dataset research projects and shared tasks. The library is available at https://github.com/huggingface/datasets.\",\n    eprint={2109.02846},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n}\n```\n\nIf you need to cite a specific version of our 🤗 Datasets library for reproducibility, you can use the corresponding version Zenodo DOI from this [list](https://zenodo.org/search?q=conceptrecid:%224817768%22&sort=-version&all_versions=True).\n"
  },
  {
    "path": "SECURITY.md",
    "content": "# Security Policy\n\n## Supported Versions\n<!--\nUse this section to tell people about which versions of your project are\ncurrently being supported with security updates.\n\n| Version | Supported          |\n| ------- | ------------------ |\n| 5.1.x   | :white_check_mark: |\n| 5.0.x   | :x:                |\n| 4.0.x   | :white_check_mark: |\n| < 4.0   | :x:                |\n-->\n\nEach major version is currently being supported with security updates.\n\n| Version | Supported          |\n|---------|--------------------|\n| 1.x.x   | :white_check_mark: |\n| 2.x.x   | :white_check_mark: |\n\n\n## Reporting a Vulnerability\n<!--\nUse this section to tell people how to report a vulnerability.\n\nTell them where to go, how often they can expect to get an update on a\nreported vulnerability, what to expect if the vulnerability is accepted or\ndeclined, etc.\n-->\n\nTo report a security vulnerability, please contact: security@huggingface.co\n"
  },
  {
    "path": "benchmarks/benchmark_array_xd.py",
    "content": "import json\nimport os\nimport tempfile\n\nimport datasets\nfrom datasets.arrow_writer import ArrowWriter\nfrom datasets.features import Array2D\nfrom utils import generate_examples, get_duration\n\n\nSHAPE_TEST_1 = (30, 487)\nSHAPE_TEST_2 = (36, 1024)\nSPEED_TEST_SHAPE = (100, 100)\nSPEED_TEST_N_EXAMPLES = 100\n\nDEFAULT_FEATURES = datasets.Features(\n    {\"text\": Array2D(SHAPE_TEST_1, dtype=\"float32\"), \"image\": Array2D(SHAPE_TEST_2, dtype=\"float32\")}\n)\n\nRESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)\nRESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, \"results\", RESULTS_FILENAME.replace(\".py\", \".json\"))\n\n\n@get_duration\ndef write(my_features, dummy_data, tmp_dir):\n    with ArrowWriter(features=my_features, path=os.path.join(tmp_dir, \"beta.arrow\")) as writer:\n        for key, record in dummy_data:\n            example = my_features.encode_example(record)\n            writer.write(example)\n        num_examples, num_bytes = writer.finalize()\n\n\n@get_duration\ndef read_unformated(feats, tmp_dir):\n    dataset = datasets.Dataset.from_file(\n        filename=os.path.join(tmp_dir, \"beta.arrow\"), info=datasets.DatasetInfo(features=feats)\n    )\n    for _ in dataset:\n        pass\n\n\n@get_duration\ndef read_formatted_as_numpy(feats, tmp_dir):\n    dataset = datasets.Dataset.from_file(\n        filename=os.path.join(tmp_dir, \"beta.arrow\"), info=datasets.DatasetInfo(features=feats)\n    )\n    dataset.set_format(\"numpy\")\n    for _ in dataset:\n        pass\n\n\n@get_duration\ndef read_batch_unformated(feats, tmp_dir):\n    batch_size = 10\n    dataset = datasets.Dataset.from_file(\n        filename=os.path.join(tmp_dir, \"beta.arrow\"), info=datasets.DatasetInfo(features=feats)\n    )\n    for i in range(0, len(dataset), batch_size):\n        _ = dataset[i : i + batch_size]\n\n\n@get_duration\ndef read_batch_formatted_as_numpy(feats, tmp_dir):\n    batch_size = 10\n    dataset = datasets.Dataset.from_file(\n  
      filename=os.path.join(tmp_dir, \"beta.arrow\"), info=datasets.DatasetInfo(features=feats)\n    )\n    dataset.set_format(\"numpy\")\n    for i in range(0, len(dataset), batch_size):\n        _ = dataset[i : i + batch_size]\n\n\n@get_duration\ndef read_col_unformated(feats, tmp_dir):\n    dataset = datasets.Dataset.from_file(\n        filename=os.path.join(tmp_dir, \"beta.arrow\"), info=datasets.DatasetInfo(features=feats)\n    )\n    for col in feats:\n        _ = dataset[col]\n\n\n@get_duration\ndef read_col_formatted_as_numpy(feats, tmp_dir):\n    dataset = datasets.Dataset.from_file(\n        filename=os.path.join(tmp_dir, \"beta.arrow\"), info=datasets.DatasetInfo(features=feats)\n    )\n    dataset.set_format(\"numpy\")\n    for col in feats:\n        _ = dataset[col]\n\n\ndef benchmark_array_xd():\n    times = {}\n    read_functions = (\n        read_unformated,\n        read_formatted_as_numpy,\n        read_batch_unformated,\n        read_batch_formatted_as_numpy,\n        read_col_unformated,\n        read_col_formatted_as_numpy,\n    )\n    with tempfile.TemporaryDirectory() as tmp_dir:\n        feats = datasets.Features({\"image\": Array2D(SPEED_TEST_SHAPE, dtype=\"float32\")})\n        data = generate_examples(features=feats, num_examples=SPEED_TEST_N_EXAMPLES)\n        times[\"write_array2d\"] = write(feats, data, tmp_dir)\n        for read_func in read_functions:\n            times[read_func.__name__ + \" after write_array2d\"] = read_func(feats, tmp_dir)\n\n    with tempfile.TemporaryDirectory() as tmp_dir:\n        # don't use fixed length for fair comparison\n        # feats = datasets.Features(\n        #     {\"image\": datasets.Sequence(datasets.Sequence(datasets.Value(\"float32\"), SPEED_TEST_SHAPE[1]), SPEED_TEST_SHAPE[0])}\n        # )\n        feats = datasets.Features({\"image\": datasets.Sequence(datasets.Sequence(datasets.Value(\"float32\")))})\n        data = generate_examples(\n            features=feats, 
num_examples=SPEED_TEST_N_EXAMPLES, seq_shapes={\"image\": SPEED_TEST_SHAPE}\n        )\n        times[\"write_nested_sequence\"] = write(feats, data, tmp_dir)\n        for read_func in read_functions:\n            times[read_func.__name__ + \" after write_nested_sequence\"] = read_func(feats, tmp_dir)\n\n    with tempfile.TemporaryDirectory() as tmp_dir:\n        # don't use fixed length for fair comparison\n        # feats = datasets.Features(\n        #     {\"image\": datasets.Sequence(datasets.Value(\"float32\"), SPEED_TEST_SHAPE[0] * SPEED_TEST_SHAPE[1])}\n        # )\n        feats = datasets.Features({\"image\": datasets.Sequence(datasets.Value(\"float32\"))})\n        data = generate_examples(\n            features=feats,\n            num_examples=SPEED_TEST_N_EXAMPLES,\n            seq_shapes={\"image\": [SPEED_TEST_SHAPE[0] * SPEED_TEST_SHAPE[1]]},\n        )\n        times[\"write_flattened_sequence\"] = write(feats, data, tmp_dir)\n        for read_func in read_functions:\n            times[read_func.__name__ + \" after write_flattened_sequence\"] = read_func(feats, tmp_dir)\n\n    with open(RESULTS_FILE_PATH, \"wb\") as f:\n        f.write(json.dumps(times).encode(\"utf-8\"))\n\n\nif __name__ == \"__main__\":  # useful to run the profiler\n    benchmark_array_xd()\n"
  },
  {
    "path": "benchmarks/benchmark_getitem_100B.py",
    "content": "import json\nimport os\nfrom dataclasses import dataclass\n\nimport numpy as np\nimport pyarrow as pa\n\nimport datasets\nfrom utils import get_duration\n\n\nSPEED_TEST_N_EXAMPLES = 100_000_000_000\nSPEED_TEST_CHUNK_SIZE = 10_000\n\nRESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)\nRESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, \"results\", RESULTS_FILENAME.replace(\".py\", \".json\"))\n\n\ndef generate_100B_dataset(num_examples: int, chunk_size: int) -> datasets.Dataset:\n    table = pa.Table.from_pydict({\"col\": [0] * chunk_size})\n    table = pa.concat_tables([table] * (num_examples // chunk_size))\n    return datasets.Dataset(table, fingerprint=\"table_100B\")\n\n\n@dataclass\nclass RandIter:\n    low: int\n    high: int\n    size: int\n    seed: int\n\n    def __post_init__(self):\n        rng = np.random.default_rng(self.seed)\n        self._sampled_values = rng.integers(low=self.low, high=self.high, size=self.size).tolist()\n\n    def __iter__(self):\n        return iter(self._sampled_values)\n\n    def __len__(self):\n        return self.size\n\n\n@get_duration\ndef get_first_row(dataset: datasets.Dataset):\n    _ = dataset[0]\n\n\n@get_duration\ndef get_last_row(dataset: datasets.Dataset):\n    _ = dataset[-1]\n\n\n@get_duration\ndef get_batch_of_1024_rows(dataset: datasets.Dataset):\n    _ = dataset[range(len(dataset) // 2, len(dataset) // 2 + 1024)]\n\n\n@get_duration\ndef get_batch_of_1024_random_rows(dataset: datasets.Dataset):\n    _ = dataset[RandIter(0, len(dataset), 1024, seed=42)]\n\n\ndef benchmark_table_100B():\n    times = {\"num examples\": SPEED_TEST_N_EXAMPLES}\n    functions = (get_first_row, get_last_row, get_batch_of_1024_rows, get_batch_of_1024_random_rows)\n    print(\"generating dataset\")\n    dataset = generate_100B_dataset(num_examples=SPEED_TEST_N_EXAMPLES, chunk_size=SPEED_TEST_CHUNK_SIZE)\n    print(\"Functions\")\n    for func in functions:\n        print(func.__name__)\n        
times[func.__name__] = func(dataset)\n\n    with open(RESULTS_FILE_PATH, \"wb\") as f:\n        f.write(json.dumps(times).encode(\"utf-8\"))\n\n\nif __name__ == \"__main__\":  # useful to run the profiler\n    benchmark_table_100B()\n"
  },
  {
    "path": "benchmarks/benchmark_indices_mapping.py",
    "content": "import json\nimport os\nimport tempfile\n\nimport datasets\nfrom utils import generate_example_dataset, get_duration\n\n\nSPEED_TEST_N_EXAMPLES = 500_000\n\nRESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)\nRESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, \"results\", RESULTS_FILENAME.replace(\".py\", \".json\"))\n\n\n@get_duration\ndef select(dataset: datasets.Dataset):\n    _ = dataset.select(range(0, len(dataset), 2))\n\n\n@get_duration\ndef sort(dataset: datasets.Dataset):\n    _ = dataset.sort(\"numbers\")\n\n\n@get_duration\ndef shuffle(dataset: datasets.Dataset):\n    _ = dataset.shuffle()\n\n\n@get_duration\ndef train_test_split(dataset: datasets.Dataset):\n    _ = dataset.train_test_split(0.1)\n\n\n@get_duration\ndef shard(dataset: datasets.Dataset, num_shards=10):\n    for shard_id in range(num_shards):\n        _ = dataset.shard(num_shards, shard_id)\n\n\ndef benchmark_indices_mapping():\n    times = {\"num examples\": SPEED_TEST_N_EXAMPLES}\n    functions = (select, sort, shuffle, train_test_split, shard)\n    with tempfile.TemporaryDirectory() as tmp_dir:\n        print(\"generating dataset\")\n        features = datasets.Features({\"text\": datasets.Value(\"string\"), \"numbers\": datasets.Value(\"float32\")})\n        dataset = generate_example_dataset(\n            os.path.join(tmp_dir, \"dataset.arrow\"), features, num_examples=SPEED_TEST_N_EXAMPLES\n        )\n        print(\"Functions\")\n        for func in functions:\n            print(func.__name__)\n            times[func.__name__] = func(dataset)\n\n    with open(RESULTS_FILE_PATH, \"wb\") as f:\n        f.write(json.dumps(times).encode(\"utf-8\"))\n\n\nif __name__ == \"__main__\":  # useful to run the profiler\n    benchmark_indices_mapping()\n"
  },
  {
    "path": "benchmarks/benchmark_iterating.py",
    "content": "import json\nimport os\nimport tempfile\n\nimport datasets\nfrom utils import generate_example_dataset, get_duration\n\n\nSPEED_TEST_N_EXAMPLES = 50_000\nSMALL_TEST = 5_000\n\nRESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)\nRESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, \"results\", RESULTS_FILENAME.replace(\".py\", \".json\"))\n\n\n@get_duration\ndef read(dataset: datasets.Dataset, length):\n    for i in range(length):\n        _ = dataset[i]\n\n\n@get_duration\ndef read_batch(dataset: datasets.Dataset, length, batch_size):\n    for i in range(0, len(dataset), batch_size):\n        _ = dataset[i : i + batch_size]\n\n\n@get_duration\ndef read_formatted(dataset: datasets.Dataset, length, type):\n    with dataset.formatted_as(type=type):\n        for i in range(length):\n            _ = dataset[i]\n\n\n@get_duration\ndef read_formatted_batch(dataset: datasets.Dataset, length, batch_size, type):\n    with dataset.formatted_as(type=type):\n        for i in range(0, length, batch_size):\n            _ = dataset[i : i + batch_size]\n\n\ndef benchmark_iterating():\n    times = {\"num examples\": SPEED_TEST_N_EXAMPLES}\n    functions = [\n        (read, {\"length\": SMALL_TEST}),\n        (read, {\"length\": SPEED_TEST_N_EXAMPLES}),\n        (read_batch, {\"length\": SPEED_TEST_N_EXAMPLES, \"batch_size\": 10}),\n        (read_batch, {\"length\": SPEED_TEST_N_EXAMPLES, \"batch_size\": 100}),\n        (read_batch, {\"length\": SPEED_TEST_N_EXAMPLES, \"batch_size\": 1_000}),\n        (read_formatted, {\"type\": \"numpy\", \"length\": SMALL_TEST}),\n        (read_formatted, {\"type\": \"pandas\", \"length\": SMALL_TEST}),\n        (read_formatted, {\"type\": \"torch\", \"length\": SMALL_TEST}),\n        (read_formatted, {\"type\": \"tensorflow\", \"length\": SMALL_TEST}),\n        (read_formatted_batch, {\"type\": \"numpy\", \"length\": SMALL_TEST, \"batch_size\": 10}),\n        (read_formatted_batch, {\"type\": \"numpy\", \"length\": 
SMALL_TEST, \"batch_size\": 1_000}),\n    ]\n\n    functions_shuffled = [\n        (read, {\"length\": SMALL_TEST}),\n        (read, {\"length\": SPEED_TEST_N_EXAMPLES}),\n        (read_batch, {\"length\": SPEED_TEST_N_EXAMPLES, \"batch_size\": 10}),\n        (read_batch, {\"length\": SPEED_TEST_N_EXAMPLES, \"batch_size\": 100}),\n        (read_batch, {\"length\": SPEED_TEST_N_EXAMPLES, \"batch_size\": 1_000}),\n        (read_formatted, {\"type\": \"numpy\", \"length\": SMALL_TEST}),\n        (read_formatted_batch, {\"type\": \"numpy\", \"length\": SMALL_TEST, \"batch_size\": 10}),\n        (read_formatted_batch, {\"type\": \"numpy\", \"length\": SMALL_TEST, \"batch_size\": 1_000}),\n    ]\n    with tempfile.TemporaryDirectory() as tmp_dir:\n        print(\"generating dataset\")\n        features = datasets.Features(\n            {\"list\": datasets.Sequence(datasets.Value(\"float32\")), \"numbers\": datasets.Value(\"float32\")}\n        )\n        dataset = generate_example_dataset(\n            os.path.join(tmp_dir, \"dataset.arrow\"),\n            features,\n            num_examples=SPEED_TEST_N_EXAMPLES,\n            seq_shapes={\"list\": (100,)},\n        )\n        print(\"first set of iterations\")\n        for func, kwargs in functions:\n            print(func.__name__, str(kwargs))\n            times[func.__name__ + \" \" + \" \".join(str(v) for v in kwargs.values())] = func(dataset, **kwargs)\n\n        print(\"shuffling dataset\")\n        dataset = dataset.shuffle()\n        print(\"Second set of iterations (after shuffling\")\n        for func, kwargs in functions_shuffled:\n            print(\"shuffled \", func.__name__, str(kwargs))\n            times[\"shuffled \" + func.__name__ + \" \" + \" \".join(str(v) for v in kwargs.values())] = func(\n                dataset, **kwargs\n            )\n\n    with open(RESULTS_FILE_PATH, \"wb\") as f:\n        f.write(json.dumps(times).encode(\"utf-8\"))\n\n\nif __name__ == \"__main__\":  # useful to run the 
profiler\n    benchmark_iterating()\n"
  },
  {
    "path": "benchmarks/benchmark_map_filter.py",
    "content": "import json\nimport os\nimport tempfile\n\nimport transformers\n\nimport datasets\nfrom utils import generate_example_dataset, get_duration\n\n\nSPEED_TEST_N_EXAMPLES = 500_000\n\nRESULTS_BASEPATH, RESULTS_FILENAME = os.path.split(__file__)\nRESULTS_FILE_PATH = os.path.join(RESULTS_BASEPATH, \"results\", RESULTS_FILENAME.replace(\".py\", \".json\"))\n\n\n@get_duration\ndef map(dataset: datasets.Dataset, **kwargs):\n    _ = dataset.map(**kwargs)\n\n\n@get_duration\ndef filter(dataset: datasets.Dataset, **kwargs):\n    _ = dataset.filter(**kwargs)\n\n\ndef benchmark_map_filter():\n    times = {\"num examples\": SPEED_TEST_N_EXAMPLES}\n    with tempfile.TemporaryDirectory() as tmp_dir:\n        features = datasets.Features({\"text\": datasets.Value(\"string\"), \"numbers\": datasets.Value(\"float32\")})\n        dataset = generate_example_dataset(\n            os.path.join(tmp_dir, \"dataset.arrow\"), features, num_examples=SPEED_TEST_N_EXAMPLES\n        )\n\n        tokenizer = transformers.AutoTokenizer.from_pretrained(\"bert-base-cased\", use_fast=True)\n\n        def tokenize(examples):\n            return tokenizer(examples[\"text\"])\n\n        times[\"map identity\"] = map(dataset)\n\n        times[\"map identity batched\"] = map(dataset, batched=True)\n\n        times[\"map no-op batched\"] = map(dataset, function=lambda x: None, batched=True)\n\n        with dataset.formatted_as(type=\"numpy\"):\n            times[\"map no-op batched numpy\"] = map(dataset, function=lambda x: None, batched=True)\n\n        with dataset.formatted_as(type=\"pandas\"):\n            times[\"map no-op batched pandas\"] = map(dataset, function=lambda x: None, batched=True)\n\n        with dataset.formatted_as(type=\"torch\", columns=\"numbers\"):\n            times[\"map no-op batched pytorch\"] = map(dataset, function=lambda x: None, batched=True)\n\n        with dataset.formatted_as(type=\"tensorflow\", columns=\"numbers\"):\n            times[\"map no-op batched 
tensorflow\"] = map(dataset, function=lambda x: None, batched=True)\n\n        times[\"map fast-tokenizer batched\"] = map(dataset, function=tokenize, batched=True)\n\n        times[\"filter\"] = filter(dataset)\n\n        # Activate later when tokenizer support batched inputs\n        # with dataset.formatted_as(type='numpy'):\n        #     times[func.__name__ + \" fast-tokenizer batched numpy\"] = func(dataset, function=tokenize, batched=True)\n\n    with open(RESULTS_FILE_PATH, \"wb\") as f:\n        f.write(json.dumps(times).encode(\"utf-8\"))\n\n\nif __name__ == \"__main__\":  # useful to run the profiler\n    benchmark_map_filter()\n"
  },
  {
    "path": "benchmarks/format.py",
    "content": "import json\nimport sys\n\n\ndef format_json_to_md(input_json_file, output_md_file):\n    with open(input_json_file, encoding=\"utf-8\") as f:\n        results = json.load(f)\n\n    output_md = [\"<details>\", \"<summary>Show updated benchmarks!</summary>\", \" \"]\n\n    for benchmark_name in sorted(results):\n        benchmark_res = results[benchmark_name]\n\n        benchmark_file_name = benchmark_name.split(\"/\")[-1]\n        output_md.append(f\"### Benchmark: {benchmark_file_name}\")\n\n        title = \"| metric |\"\n        lines = \"|--------|\"\n        value = \"| new / old (diff) |\"\n        for metric_name in sorted(benchmark_res):\n            metric_vals = benchmark_res[metric_name]\n            new_val = metric_vals[\"new\"]\n            old_val = metric_vals.get(\"old\", None)\n            dif_val = metric_vals.get(\"diff\", None)\n\n            val_str = f\" {new_val:f}\" if isinstance(new_val, (int, float)) else \"None\"\n\n            if old_val is not None:\n                val_str += f\" / {old_val:f}\" if isinstance(old_val, (int, float)) else \"None\"\n            if dif_val is not None:\n                val_str += f\" ({dif_val:f})\" if isinstance(dif_val, (int, float)) else \"None\"\n\n            title += \" \" + metric_name + \" |\"\n            lines += \"---|\"\n            value += val_str + \" |\"\n\n        output_md += [title, lines, value, \" \"]\n\n    output_md.append(\"</details>\")\n\n    with open(output_md_file, \"w\", encoding=\"utf-8\") as f:\n        f.writelines(\"\\n\".join(output_md))\n\n\nif __name__ == \"__main__\":\n    input_json_file = sys.argv[1]\n    output_md_file = sys.argv[2]\n\n    format_json_to_md(input_json_file, output_md_file)\n"
  },
  {
    "path": "benchmarks/results/.gitkeep",
    "content": ""
  },
  {
    "path": "benchmarks/results/benchmark_array_xd.json",
    "content": "{\"write_array2d\": 0.14168284999323077, \"read_unformated after write_array2d\": 0.04353281999647152, \"read_formatted_as_numpy after write_array2d\": 0.1285462469968479, \"read_batch_unformated after write_array2d\": 0.023109222995117307, \"read_batch_formatted_as_numpy after write_array2d\": 0.011352884990628809, \"read_col_unformated after write_array2d\": 0.037052362007671036, \"read_col_formatted_as_numpy after write_array2d\": 0.007985618998645805, \"write_nested_sequence\": 1.4927163410029607, \"read_unformated after write_nested_sequence\": 0.28319963401008863, \"read_formatted_as_numpy after write_nested_sequence\": 0.419271487990045, \"read_batch_unformated after write_nested_sequence\": 0.3234798710036557, \"read_batch_formatted_as_numpy after write_nested_sequence\": 0.03850809299910907, \"read_col_unformated after write_nested_sequence\": 0.29384092400141526, \"read_col_formatted_as_numpy after write_nested_sequence\": 0.004250421989127062, \"write_flattened_sequence\": 1.4521546780015342, \"read_unformated after write_flattened_sequence\": 0.25513897799828555, \"read_formatted_as_numpy after write_flattened_sequence\": 0.07564631900459062, \"read_batch_unformated after write_flattened_sequence\": 0.2758980469952803, \"read_batch_formatted_as_numpy after write_flattened_sequence\": 0.011008214991306886, \"read_col_unformated after write_flattened_sequence\": 0.25848906899045687, \"read_col_formatted_as_numpy after write_flattened_sequence\": 0.004328447001171298}"
  },
  {
    "path": "benchmarks/results/benchmark_getitem_100B.json",
    "content": "{\"num examples\": 100000000000, \"get_first_row\": 0.00019991099999927542, \"get_last_row\": 5.4411000000698095e-05, \"get_batch_of_1024_rows\": 0.0004897069999998394, \"get_batch_of_1024_random_rows\": 0.01800621099999944}"
  },
  {
    "path": "benchmarks/results/benchmark_indices_mapping.json",
    "content": "{\"num examples\": 500000, \"select\": 0.03741131999413483, \"sort\": 0.7371353159978753, \"shuffle\": 0.17655655200360343, \"train_test_split\": 0.29633847798686475, \"shard\": 0.01452581599005498}"
  },
  {
    "path": "benchmarks/results/benchmark_iterating.json",
    "content": "{\"num examples\": 50000, \"read 5000\": 0.2152090710005723, \"read 50000\": 2.077654693988734, \"read_batch 50000 10\": 1.5041199039987987, \"read_batch 50000 100\": 1.5411947140091797, \"read_batch 50000 1000\": 1.4684901159926085, \"read_formatted numpy 5000\": 4.584776938994764, \"read_formatted pandas 5000\": 3.7457121399929747, \"read_formatted torch 5000\": 4.565676491998602, \"read_formatted tensorflow 5000\": 5.269861594992108, \"read_formatted_batch numpy 5000 10\": 0.4242750950070331, \"read_formatted_batch numpy 5000 1000\": 0.007607111998368055, \"shuffled read 5000\": 0.22604441999283154, \"shuffled read 50000\": 2.268928524994408, \"shuffled read_batch 50000 10\": 55.44462437101174, \"shuffled read_batch 50000 100\": 6.876476717996411, \"shuffled read_batch 50000 1000\": 2.1420724369963864, \"shuffled read_formatted numpy 5000\": 4.8052272600034485, \"shuffled read_formatted_batch numpy 5000 10\": 6.500664097999106, \"shuffled read_formatted_batch numpy 5000 1000\": 0.0754691059992183}"
  },
  {
    "path": "benchmarks/results/benchmark_map_filter.json",
    "content": "{\"num examples\": 500000, \"map identity\": 10.19139202599763, \"map identity batched\": 0.6804238399927272, \"map no-op batched\": 0.5342009569867514, \"map no-op batched numpy\": 0.5792830920108827, \"map no-op batched pandas\": 0.4343639040016569, \"map no-op batched pytorch\": 0.5403374370071106, \"map no-op batched tensorflow\": 1.3869360350072384, \"map fast-tokenizer batched\": 8.074308118986664, \"filter\": 1.841787679004483}"
  },
  {
    "path": "benchmarks/utils.py",
    "content": "import timeit\n\nimport numpy as np\n\nimport datasets\nfrom datasets.arrow_writer import ArrowWriter\nfrom datasets.features.features import _ArrayXD\n\n\ndef get_duration(func):\n    def wrapper(*args, **kwargs):\n        starttime = timeit.default_timer()\n        _ = func(*args, **kwargs)\n        delta = timeit.default_timer() - starttime\n        return delta\n\n    wrapper.__name__ = func.__name__\n\n    return wrapper\n\n\ndef generate_examples(features: dict, num_examples=100, seq_shapes=None):\n    dummy_data = []\n    seq_shapes = seq_shapes or {}\n    for i in range(num_examples):\n        example = {}\n        for col_id, (k, v) in enumerate(features.items()):\n            if isinstance(v, _ArrayXD):\n                data = np.random.rand(*v.shape).astype(v.dtype)\n            elif isinstance(v, datasets.Value):\n                if v.dtype == \"string\":\n                    data = \"The small grey turtle was surprisingly fast when challenged.\"\n                else:\n                    data = np.random.randint(10, size=1).astype(v.dtype).item()\n            elif isinstance(v, datasets.Sequence):\n                while isinstance(v, datasets.Sequence):\n                    v = v.feature\n                shape = seq_shapes[k]\n                data = np.random.rand(*shape).astype(v.dtype)\n            example[k] = data\n\n        dummy_data.append((i, example))\n\n    return dummy_data\n\n\ndef generate_example_dataset(dataset_path, features, num_examples=100, seq_shapes=None):\n    dummy_data = generate_examples(features, num_examples=num_examples, seq_shapes=seq_shapes)\n\n    with ArrowWriter(features=features, path=dataset_path) as writer:\n        for key, record in dummy_data:\n            example = features.encode_example(record)\n            writer.write(example)\n\n        num_final_examples, num_bytes = writer.finalize()\n\n    if not num_final_examples == num_examples:\n        raise ValueError(\n            f\"Error writing 
the dataset, wrote {num_final_examples} examples but should have written {num_examples}.\"\n        )\n\n    dataset = datasets.Dataset.from_file(filename=dataset_path, info=datasets.DatasetInfo(features=features))\n\n    return dataset\n"
  },
  {
    "path": "docs/README.md",
    "content": "<!---\nCopyright 2020 The HuggingFace Team. All rights reserved.\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n    http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License.\n-->\n\n# Generating the documentation\n\nTo generate the documentation, you first have to build it. Several packages are necessary to build the doc,\nyou can install them with the following command, at the root of the code repository:\n\n```bash\npip install -e \".[docs]\"\n```\n\nThen you need to install our special tool that builds the documentation:\n\n```bash\npip install git+https://github.com/huggingface/doc-builder\n```\n\n---\n**NOTE**\n\nYou only need to generate the documentation to inspect it locally (if you're planning changes and want to\ncheck how they look before committing for instance). You don't have to `git commit` the built documentation.\n\n---\n\n## Building the documentation\n\nOnce you have setup the `doc-builder` and additional packages, you can generate the documentation by typing\nthe following command:\n\n```bash\ndoc-builder build datasets docs/source/ --build_dir ~/tmp/test-build\n```\n\nYou can adapt the `--build_dir` to set any temporary folder that you prefer. This command will create it and generate\nthe MDX files that will be rendered as the documentation on the main website. 
You can inspect them in your favorite\nMarkdown editor.\n\n## Previewing the documentation\n\nTo preview the docs, first install the `watchdog` module with:\n\n```bash\npip install watchdog\n```\n\nThen run the following command:\n\n```bash\ndoc-builder preview datasets docs/source/\n```\n\nThe docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment with a link to where the documentation with your changes lives.\n\n---\n**NOTE**\n\nThe `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` & restart `preview` command (`ctrl-c` to stop it & call `doc-builder preview ...` again).\n\n## Adding a new element to the navigation bar\n\nAccepted files are Markdown (.md or .mdx).\n\nCreate a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting\nthe filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/datasets/blob/main/docs/source/_toctree.yml) file.\n\n## Renaming section headers and moving sections\n\nIt helps to keep the old links working when renaming the section header and/or moving sections from one document to another. This is because the old links are likely to be used in Issues, Forums and Social media and it'd make for a much better user experience if users reading those months later could still easily navigate to the originally intended information.\n\nTherefore we simply keep a little map of moved sections at the end of the document where the original section was. 
The key is to preserve the original anchor.\n\nSo if you renamed a section from: \"Section A\" to \"Section B\", then you can add at the end of the file:\n\n```\nSections that were moved:\n\n[ <a href=\"#section-b\">Section A</a><a id=\"section-a\"></a> ]\n```\nand of course if you moved it to another file, then:\n\n```\nSections that were moved:\n\n[ <a href=\"../new-file#section-b\">Section A</a><a id=\"section-a\"></a> ]\n```\n\nUse the relative style to link to the new file so that the versioned docs continue to work.\n\nFor an example of a rich moved sections set please see the very end of [the transformers Trainer doc](https://github.com/huggingface/transformers/blob/main/docs/source/en/main_classes/trainer.md).\n\n\n## Writing Documentation - Specification\n\nThe `huggingface/datasets` documentation follows the\n[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style for docstrings,\nalthough we can write them directly in Markdown.\n\n### Adding a new tutorial\n\nAdding a new tutorial or section is done in two steps:\n\n- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).\n- Link that file in `./source/_toctree.yml` on the correct toc-tree.\n\nMake sure to put your new file under the proper section. If you have a doubt, feel free to ask in a Github Issue or PR.\n\n### Writing source documentation\n\nValues that should be put in `code` should either be surrounded by backticks: \\`like so\\`. Note that argument names\nand objects like True, None or any strings should usually be put in `code`.\n\nWhen mentioning a class, function or method, it is recommended to use our syntax for internal links so that our tool\nadds a link to its documentation with this syntax: \\[\\`XXXClass\\`\\] or \\[\\`function\\`\\]. 
This requires the class or \nfunction to be in the main package.\n\nIf you want to create a link to some internal class or function, you need to\nprovide its path. For instance: \\[\\`table.InMemoryTable\\`\\]. This will be converted into a link with\n`table.InMemoryTable` in the description. To get rid of the path and only keep the name of the object you are\nlinking to in the description, add a ~: \\[\\`~table.InMemoryTable\\`\\] will generate a link with `InMemoryTable` in the description.\n\nThe same works for methods so you can either use \\[\\`XXXClass.method\\`\\] or \\[\\`~XXXClass.method\\`\\].\n\n#### Defining arguments in a method\n\nArguments should be defined with the `Args:` (or `Arguments:` or `Parameters:`) prefix, followed by a line return and\nan indentation. The argument should be followed by its type, with its shape if it is a tensor, a colon and its\ndescription:\n\n```\n    Args:\n        n_layers (`int`): The number of layers of the model.\n```\n\nIf the description is too long to fit in one line, another indentation is necessary before writing the description\nafter the argument.\n\nHere's an example showcasing everything so far:\n\n```\n    Args:\n        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):\n            Indices of input sequence tokens in the vocabulary.\n\n            Indices can be obtained using [`AlbertTokenizer`]. 
See [`~PreTrainedTokenizer.encode`] and\n            [`~PreTrainedTokenizer.__call__`] for details.\n\n            [What are input IDs?](../glossary#input-ids)\n```\n\nFor optional arguments or arguments with defaults we follow the following syntax: imagine we have a function with the\nfollowing signature:\n\n```\ndef my_function(x: str = None, a: float = 1):\n```\n\nthen its documentation should look like this:\n\n```\n    Args:\n        x (`str`, *optional*):\n            This argument controls ...\n        a (`float`, *optional*, defaults to 1):\n            This argument is used to ...\n```\n\nNote that we always omit the \"defaults to \\`None\\`\" when None is the default for any argument. Also note that even\nif the first line describing your argument type and its default gets long, you can't break it into several lines. You can\nhowever write as many lines as you want in the indented description (see the example above with `input_ids`).\n\n#### Writing a multi-line code block\n\nMulti-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as usual in Markdown:\n\n\n````\n```\n# first line of code\n# second line\n# etc\n```\n````\n\n#### Writing a return block\n\nThe return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation.\nThe first line should be the type of the return, followed by a line return. 
No need to indent further for the elements\nbuilding the return.\n\nHere's an example of a single value return:\n\n```\n    Returns:\n        `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.\n```\n\nHere's an example of tuple return, comprising several objects:\n\n```\n    Returns:\n        `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:\n        - ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --\n          Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.\n        - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --\n          Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).\n```\n\n#### Adding an image\n\nDue to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos and other non-text files. We prefer to leverage a hf.co hosted `dataset` like\nthe ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference\nthem by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).\nIf an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images\nto this dataset.\n\n## Writing documentation examples\n\nThe syntax for Example docstrings can look as follows:\n\n```\n    Example:\n\n    ```py\n    >>> from datasets import load_dataset\n    >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n    >>> def add_prefix(example):\n    ...     
example[\"text\"] = \"Review: \" + example[\"text\"]\n    ...     return example\n    >>> ds = ds.map(add_prefix)\n    >>> ds[0:3][\"text\"]\n    ['Review: compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',\n        'Review: the soundtrack alone is worth the price of admission .',\n        'Review: rodriguez does a splendid job of racial profiling hollywood style--casting excellent latin actors of all ages--a trend long overdue .']\n\n    # process a batch of examples\n    >>> ds = ds.map(lambda example: tokenizer(example[\"text\"]), batched=True)\n    # set number of processors\n    >>> ds = ds.map(add_prefix, num_proc=4)\n    ```\n```\n\nThe docstring should give a minimal, clear example of how the respective class or function is to be used in practice and also include the expected (ideally sensible) output.\nOften, readers will try out the example before even going through the function \nor class definitions. Therefore, it is of utmost importance that the example \nworks as expected.\n"
  },
  {
    "path": "docs/source/_config.py",
    "content": "# docstyle-ignore\nINSTALL_CONTENT = \"\"\"\n# Datasets installation\n! pip install datasets transformers\n# To install from source instead of the last release, comment the command above and uncomment the following one.\n# ! pip install git+https://github.com/huggingface/datasets.git\n\"\"\"\n\nnotebook_first_cells = [{\"type\": \"code\", \"content\": INSTALL_CONTENT}]\ndefault_branch_name = \"main\"\nversion_prefix = \"\"\n"
  },
  {
    "path": "docs/source/_redirects.yml",
    "content": "# This first_section was backported from nginx\nloading_datasets: loading\nshare_dataset: share\nquicktour: quickstart\ndataset_streaming: stream\ntorch_tensorflow: use_dataset\nsplits: loading#slice-splits\nprocessing: process\nfaiss_and_ea: faiss_es\nfeatures: about_dataset_features\nexploring: access\npackage_reference/logging_methods: package_reference/utilities\n# end of first_section\n"
  },
  {
    "path": "docs/source/_toctree.yml",
    "content": "- sections: \n  - local: index\n    title: 🤗 Datasets\n  - local: quickstart\n    title: Quickstart\n  - local: installation\n    title: Installation\n  title: Get started\n- sections:\n  - local: tutorial\n    title: Overview\n  - local: load_hub\n    title: Load a dataset from the Hub\n  - local: access\n    title: Know your dataset\n  - local: use_dataset\n    title: Preprocess\n  - local: create_dataset\n    title: Create a dataset\n  - local: upload_dataset\n    title: Share a dataset to the Hub\n  title: \"Tutorials\"\n- sections:\n  - local: how_to\n    title: Overview\n  - sections:\n    - local: loading\n      title: Load\n    - local: process\n      title: Process\n    - local: stream\n      title: Stream\n    - local: use_with_pytorch\n      title: Use with PyTorch\n    - local: use_with_tensorflow\n      title: Use with TensorFlow\n    - local: use_with_numpy\n      title: Use with NumPy\n    - local: use_with_jax\n      title: Use with JAX\n    - local: use_with_pandas\n      title: Use with Pandas\n    - local: use_with_polars\n      title: Use with Polars\n    - local: use_with_pyarrow\n      title: Use with PyArrow\n    - local: use_with_spark\n      title: Use with Spark\n    - local: cache\n      title: Cache management\n    - local: filesystems\n      title: Cloud storage\n    - local: faiss_es\n      title: Search index\n    - local: cli\n      title: CLI\n    - local: troubleshoot\n      title: Troubleshooting\n    title: \"General usage\"\n  - sections:\n    - local: audio_load\n      title: Load audio data\n    - local: audio_process\n      title: Process audio data\n    - local: audio_dataset\n      title: Create an audio dataset\n    title: \"Audio\"\n  - sections:\n    - local: image_load\n      title: Load image data\n    - local: image_process\n      title: Process image data\n    - local: image_dataset\n      title: Create an image dataset\n    - local: depth_estimation\n      title: Depth estimation\n    - local: 
image_classification\n      title: Image classification\n    - local: semantic_segmentation\n      title: Semantic segmentation\n    - local: object_detection\n      title: Object detection\n    - local: video_load\n      title: Load video data\n    - local: video_dataset\n      title: Create a video dataset\n    - local: document_load\n      title: Load document data\n    - local: document_dataset\n      title: Create a document dataset\n    - local: nifti_dataset\n      title: Create a medical imaging dataset\n    title: \"Vision\"\n  - sections:\n    - local: nlp_load\n      title: Load text data\n    - local: nlp_process\n      title: Process text data\n    title: \"Text\"\n  - sections:\n    - local: tabular_load\n      title: Load tabular data\n    title: \"Tabular\"\n  - sections:\n    - local: share\n      title: Share\n    - local: dataset_card\n      title: Create a dataset card\n    - local: repository_structure\n      title: Structure your repository\n    title: \"Dataset repository\"\n  title: \"How-to guides\"\n- sections:\n  - local: about_arrow\n    title: Datasets 🤝 Arrow\n  - local: about_cache\n    title: The cache\n  - local: about_mapstyle_vs_iterable\n    title: Dataset or IterableDataset\n  - local: about_dataset_features\n    title: Dataset features\n  - local: about_dataset_load\n    title: Build and load\n  - local: about_map_batch\n    title: Batch mapping\n  title: \"Conceptual guides\"\n- sections:\n  - local: package_reference/main_classes\n    title: Main classes\n  - local: package_reference/builder_classes\n    title: Builder classes\n  - local: package_reference/loading_methods\n    title: Loading methods\n  - local: package_reference/table_classes\n    title: Table Classes\n  - local: package_reference/utilities\n    title: Utilities\n  title: \"Reference\"\n"
  },
  {
    "path": "docs/source/about_arrow.md",
    "content": "# Datasets 🤝 Arrow\n\n## What is Arrow?\n\n[Arrow](https://arrow.apache.org/) enables large amounts of data to be processed and moved quickly. It is a specific data format that stores data in a columnar memory layout. This provides several significant advantages:\n\n* Arrow's standard format allows [zero-copy reads](https://en.wikipedia.org/wiki/Zero-copy) which removes virtually all serialization overhead.\n* Arrow is language-agnostic so it supports different programming languages.\n* Arrow is column-oriented so it is faster at querying and processing slices or columns of data.\n* Arrow allows for copy-free hand-offs to standard machine learning tools such as NumPy, Pandas, PyTorch, and TensorFlow.\n* Arrow supports many, possibly nested, column types.\n\n## Memory-mapping\n\n🤗 Datasets uses Arrow for its local caching system. It allows datasets to be backed by an on-disk cache, which is memory-mapped for fast lookup.\nThis architecture allows for large datasets to be used on machines with relatively small device memory.\n\nFor example, loading the full English Wikipedia dataset only takes a few MB of RAM:\n\n```python\n>>> import os; import psutil; import timeit\n>>> from datasets import load_dataset\n\n# Process.memory_info is expressed in bytes, so convert to megabytes \n>>> mem_before = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)\n>>> wiki = load_dataset(\"wikimedia/wikipedia\", \"20220301.en\", split=\"train\")\n>>> mem_after = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)\n\n>>> print(f\"RAM memory used: {(mem_after - mem_before)} MB\")\nRAM memory used: 50 MB\n```\n\nThis is possible because the Arrow data is actually memory-mapped from disk, and not loaded in memory.\nMemory-mapping allows access to data on disk, and leverages virtual memory capabilities for fast lookups.\n\n## Performance\n\nIterating over a memory-mapped dataset using Arrow is fast. 
Iterating over Wikipedia on a laptop gives you speeds of 1-3 Gbit/s:\n\n```python\n>>> s = \"\"\"batch_size = 1000\n... for batch in wiki.iter(batch_size):\n...     ...\n... \"\"\"\n\n>>> elapsed_time = timeit.timeit(stmt=s, number=1, globals=globals())\n>>> print(f\"Time to iterate over the {wiki.dataset_size >> 30} GB dataset: {elapsed_time:.1f} sec, \"\n...       f\"ie. {float(wiki.dataset_size >> 27)/elapsed_time:.1f} Gb/s\")\nTime to iterate over the 18 GB dataset: 31.8 sec, ie. 4.8 Gb/s\n```\n"
  },
  {
    "path": "docs/source/about_cache.mdx",
    "content": "# The cache\n\nThe cache is one of the reasons why 🤗 Datasets is so efficient. It stores previously downloaded and processed datasets so when you need to use them again, they are reloaded directly from the cache. This avoids having to download a dataset all over again, or reapplying processing functions. Even after you close and start another Python session, 🤗 Datasets will reload your dataset directly from the cache!\n\n## Fingerprint\n\nHow does the cache keeps track of what transforms are applied to a dataset? Well, 🤗 Datasets assigns a fingerprint to the cache file. A fingerprint keeps track of the current state of a dataset. The initial fingerprint is computed using a hash from the Arrow table, or a hash of the Arrow files if the dataset is on disk. Subsequent fingerprints are computed by combining the fingerprint of the previous state, and a hash of the latest transform applied. \n\n> [!TIP]\n> Transforms are any of the processing methods from the [How-to Process](./process) guides such as [`Dataset.map`] or [`Dataset.shuffle`].\n\nHere are what the actual fingerprints look like:\n\n```py\n>>> from datasets import Dataset\n>>> dataset1 = Dataset.from_dict({\"a\": [0, 1, 2]})\n>>> dataset2 = dataset1.map(lambda x: {\"a\": x[\"a\"] + 1})\n>>> print(dataset1._fingerprint, dataset2._fingerprint)\nd19493523d95e2dc 5b86abacd4b42434\n```\n\nIn order for a transform to be hashable, it needs to be picklable by [dill](https://dill.readthedocs.io/en/latest/) or [pickle](https://docs.python.org/3/library/pickle). \n\nWhen you use a non-hashable transform, 🤗 Datasets uses a random fingerprint instead and raises a warning. The non-hashable transform is considered different from the previous transforms. As a result, 🤗 Datasets will recompute all the transforms. Make sure your transforms are serializable with pickle or dill to avoid this!\n\nAn example of when 🤗 Datasets recomputes everything is when caching is disabled. 
When this happens, the cache files are generated every time and they get written to a temporary directory. Once your Python session ends, the cache files in the temporary directory are deleted. A random hash is assigned to these cache files, instead of a fingerprint. \n\n> [!TIP]\n> When caching is disabled, use [`Dataset.save_to_disk`] to save your transformed dataset or it will be deleted once the session ends.\n\n## Hashing\n\nThe fingerprint of a dataset is updated by hashing the function passed to `map` as well as the `map` parameters (`batch_size`, `remove_columns`, etc.).\n\nYou can check the hash of any Python object using the [`fingerprint.Hasher`]:\n\n```py\n>>> from datasets.fingerprint import Hasher\n>>> my_func = lambda example: {\"length\": len(example[\"text\"])}\n>>> print(Hasher.hash(my_func))\n'3d35e2b3e94c81d6'\n```\n\nThe hash is computed by dumping the object using a `dill` pickler and hashing the dumped bytes.\nThe pickler recursively dumps all the variables used in your function, so any change you do to an object that is used in your function, will cause the hash to change.\n\nIf one of your functions doesn't seem to have the same hash across sessions, it means at least one of its variables contains a Python object that is not deterministic.\nWhen this happens, feel free to hash any object you find suspicious to try to find the object that caused the hash to change.\nFor example, if you use a list for which the order of its elements is not deterministic across sessions, then the hash won't be the same across sessions either.\n"
  },
  {
    "path": "docs/source/about_dataset_features.mdx",
    "content": "# Dataset features\n\n[`Features`] defines the internal structure of a dataset. It is used to specify the underlying serialization format. What's more interesting to you though is that [`Features`] contains high-level information about everything from the column names and types, to the [`ClassLabel`]. You can think of [`Features`] as the backbone of a dataset.\n\nThe [`Features`] format is simple: `dict[column_name, column_type]`. It is a dictionary of column name and column type pairs. The column type provides a wide range of options for describing the type of data you have.\n\nLet's have a look at the features of the MRPC dataset from the GLUE benchmark:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset('nyu-mll/glue', 'mrpc', split='train')\n>>> dataset.features\n{'idx': Value('int32'),\n 'label': ClassLabel(names=['not_equivalent', 'equivalent']),\n 'sentence1': Value('string'),\n 'sentence2': Value('string'),\n}\n```\n\nThe [`Value`] feature tells 🤗 Datasets:\n\n- The `idx` data type is `int32`.\n- The `sentence1` and `sentence2` data types are `string`.\n\n🤗 Datasets supports many other data types such as `bool`, `float32` and `binary` to name just a few.\n\n> [!TIP]\n> Refer to [`Value`] for a full list of supported data types.\n\nThe [`ClassLabel`] feature informs 🤗 Datasets the `label` column contains two classes. The classes are labeled `not_equivalent` and `equivalent`. Labels are stored as integers in the dataset. When you retrieve the labels, [`ClassLabel.int2str`] and [`ClassLabel.str2int`] carries out the conversion from integer value to label name, and vice versa.\n\nIf your data type contains a list of objects, then you want to use the [`List`] feature. 
Remember the SQuAD dataset?\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset('rajpurkar/squad', split='train')\n>>> dataset.features\n{'id': Value('string'),\n 'title': Value('string'),\n 'context': Value('string'),\n 'question': Value('string'),\n 'answers': {'text': List(Value('string')),\n  'answer_start': List(Value('int32'))}}\n```\n\nThe `answers` field is constructed using a dict of features because it contains two subfields, `text` and `answer_start`, which are lists of `string` and `int32`, respectively.\n\n> [!TIP]\n> See the [flatten](./process#flatten) section to learn how you can extract the nested subfields as their own independent columns.\n\nThe array feature type is useful for creating arrays of various sizes. You can create arrays with two dimensions using [`Array2D`], and even arrays with five dimensions using [`Array5D`].\n\n```py\n>>> features = Features({'a': Array2D(shape=(1, 3), dtype='int32')})\n```\n\nThe array type also allows the first dimension of the array to be dynamic. 
This is useful for handling sequences with variable lengths such as sentences, without having to pad or truncate the input to a uniform shape.\n\n```py\n>>> features = Features({'a': Array3D(shape=(None, 5, 2), dtype='int32')})\n```\n\n## Audio feature\n\nAudio datasets have a column with type [`Audio`], which contains three important fields:\n\n- `array`: the decoded audio data represented as a 1-dimensional array.\n- `path`: the path to the downloaded audio file.\n- `sampling_rate`: the sampling rate of the audio data.\n\nWhen you load an audio dataset and call the audio column, the [`Audio`] feature automatically decodes and resamples the audio file:\n\n```py\n>>> from datasets import load_dataset, Audio\n\n>>> dataset = load_dataset(\"PolyAI/minds14\", \"en-US\", split=\"train\")\n>>> dataset[0][\"audio\"]\n<datasets.features._torchcodec.AudioDecoder object at 0x11642b6a0>\n```\n\n> [!WARNING]\n> Index into an audio dataset using the row index first and then the `audio` column - `dataset[0][\"audio\"]` - to avoid decoding and resampling all the audio files in the dataset. 
Otherwise, this can be a slow and time-consuming process if you have a large dataset.\n\nWith `decode=False`, the [`Audio`] type simply gives you the path or the bytes of the audio file, without decoding it into a torchcodec `AudioDecoder` object:\n\n```py\n>>> dataset = load_dataset(\"PolyAI/minds14\", \"en-US\", split=\"train\").cast_column(\"audio\", Audio(decode=False))\n>>> dataset[0]\n{'audio': {'bytes': None,\n  'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav'},\n 'english_transcription': 'I would like to set up a joint account with my partner',\n 'intent_class': 11,\n 'lang_id': 4,\n 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',\n 'transcription': 'I would like to set up a joint account with my partner'}\n```\n\n## Image feature\n\nImage datasets have a column with type [`Image`], which loads `PIL.Image` objects from images stored as bytes.\n\nWhen you load an image dataset and call the image column, the [`Image`] feature automatically decodes the image file:\n\n```py\n>>> from datasets import load_dataset, Image\n\n>>> dataset = load_dataset(\"AI-Lab-Makerere/beans\", split=\"train\")\n>>> dataset[0][\"image\"]\n<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x500 at 0x125506CF8>\n```\n\n> [!WARNING]\n> Index into an image dataset using the row index first and then the `image` column - `dataset[0][\"image\"]` - to avoid decoding all the image files in the dataset. 
Otherwise, this can be a slow and time-consuming process if you have a large dataset.\n\nWith `decode=False`, the [`Image`] type simply gives you the path or the bytes of the image file, without decoding it into a `PIL.Image`:\n\n```py\n>>> dataset = load_dataset(\"AI-Lab-Makerere/beans\", split=\"train\").cast_column(\"image\", Image(decode=False))\n>>> dataset[0][\"image\"]\n{'bytes': None,\n 'path': '/Users/username/.cache/huggingface/datasets/downloads/extracted/772e7c1fba622cff102b85dd74bcce46e8168634df4eaade7bedd3b8d91d3cd7/train/healthy/healthy_train.265.jpg'}\n```\n\nDepending on the dataset, you may get the path to the local downloaded image, or the content of the image as bytes if the dataset is not made of individual files.\n\nYou can also define a dataset of images from numpy arrays:\n\n```python\n>>> ds = Dataset.from_dict({\"i\": [np.zeros(shape=(16, 16, 3), dtype=np.uint8)]}, features=Features({\"i\": Image()}))\n```\n\nAnd in this case the numpy arrays are encoded into PNG (or TIFF if the pixel value precision is important).\n\nFor multi-channel arrays like RGB or RGBA, only uint8 is supported. If you use a larger precision, you get a warning and the array is downcasted to uint8.\nFor gray-scale images you can use the integer or float precision you want as long as it is compatible with `Pillow`. 
A warning is shown if your image integer or float precision is too high, and in this case the array is downcasted: an int64 array is downcasted to int32, and a float64 array is downcasted to float32.\n\n## Json feature\n\nDatasets are based on Arrow which is a columnar format, and therefore they expect every example to have the same type and subtypes, and dictionaries to have the same keys and value types.\nLoading a dataset errors out when fields have mismatching types, and fills missing fields in dictionaries with None so all dictionaries have the same keys and value types.\n\nTo avoid this and allow mixed-types without errors, you can use `on_mixed_types=\"use_json\"` or specify `features=` with a [`Json`] type:\n\n```python\n>>> ds = Dataset.from_dict({\"a\": [0, \"foo\", {\"subfield\": \"bar\"}]})\nTraceback (most recent call last):\n  ...\n  File \"pyarrow/error.pxi\", line 92, in pyarrow.lib.check_status\npyarrow.lib.ArrowInvalid: Could not convert 'foo' with type str: tried to convert to int64\n\n>>> features = Features({\"a\": Json()})\n>>> ds = Dataset.from_dict({\"a\": [0, \"foo\", {\"subfield\": \"bar\"}]}, features=features)\n>>> ds.features\n{'a': Json()}\n>>> list(ds[\"a\"])\n[0, \"foo\", {\"subfield\": \"bar\"}]\n```\n\nThis is also useful for lists of dictionaries with arbitrary keys and values, to avoid filling missing fields with None:\n\n```python\n>>> ds = Dataset.from_dict({\"a\": [[{\"b\": 0}, {\"c\": 0}]]})\n>>> ds.features\n{'a': List({'b': Value('int64'), 'c': Value('int64')})}\n>>> list(ds[\"a\"])\n[[{'b': 0, 'c': None}, {'b': None, 'c': 0}]]  # missing fields are filled with None\n\n>>> features = Features({\"a\": List(Json())})\n>>> ds = Dataset.from_dict({\"a\": [[{\"b\": 0}, {\"c\": 0}]]}, features=features)\n>>> ds.features\n{'a': List(Json())}\n>>> list(ds[\"a\"])\n[[{'b': 0}, {'c': 0}]]  # OK\n```\n\nAnother example with tool calling data and the `on_mixed_types=\"use_json\"` argument (useful to not have to specify `features=` 
manually):\n\n```python\n>>> messages = [\n...     {\"role\": \"user\", \"content\": \"Turn on the living room lights and play my electronic music playlist.\"},\n...     {\"role\": \"assistant\", \"tool_calls\": [\n...         {\"type\": \"function\", \"function\": {\n...             \"name\": \"control_light\",\n...             \"arguments\": {\"room\": \"living room\", \"state\": \"on\"}\n...         }},\n...         {\"type\": \"function\", \"function\": {\n...             \"name\": \"play_music\",\n...             \"arguments\": {\"playlist\": \"electronic\"}  # mixed-type here since keys [\"playlist\"] and [\"room\", \"state\"] are different\n...         }}]\n...     },\n...     {\"role\": \"tool\", \"name\": \"control_light\", \"content\": \"The lights in the living room are now on.\"},\n...     {\"role\": \"tool\", \"name\": \"play_music\", \"content\": \"The music is now playing.\"},\n...     {\"role\": \"assistant\", \"content\": \"Done!\"}\n... ]\n>>> ds = Dataset.from_dict({\"messages\": [messages]}, on_mixed_types=\"use_json\")\n>>> ds.features\n{'messages': List({'role': Value('string'), 'content': Value('string'), 'tool_calls': List(Json()), 'name': Value('string')})}\n>>> ds[0][1][\"tool_calls\"][0][\"function\"][\"arguments\"]\n{\"room\": \"living room\", \"state\": \"on\"}\n```\n"
  },
  {
    "path": "docs/source/about_dataset_load.mdx",
    "content": "# Build and load\n\nNearly every deep learning workflow begins with loading a dataset, which makes it one of the most important steps. With 🤗 Datasets, there are more than 900 datasets available to help you get started with your NLP task. All you have to do is call: [`load_dataset`] to take your first step. This function is a true workhorse in every sense because it builds and loads every dataset you use.\n\n## ELI5: `load_dataset`\n\nLet's begin with a basic Explain Like I'm Five.\n\nA dataset is a directory that contains:\n\n- Some data files in generic formats (JSON, CSV, Parquet, text, etc.)\n- A dataset card named `README.md` that contains documentation about the dataset as well as a YAML header to define the datasets tags and configurations\n\nThe [`load_dataset`] function fetches the requested dataset locally or from the Hugging Face Hub.\nThe Hub is a central repository where all the Hugging Face datasets and models are stored.\n\nIf the dataset only contains data files, then [`load_dataset`] automatically infers how to load the data files from their extensions (json, csv, parquet, txt, etc.).\nUnder the hood, 🤗 Datasets will use an appropriate [`DatasetBuilder`] based on the data files format. 
There exist one builder per data file format in 🤗 Datasets:\n\n* [`datasets.packaged_modules.text.Text`] for text\n* [`datasets.packaged_modules.csv.Csv`] for CSV and TSV\n* [`datasets.packaged_modules.json.Json`] for JSON and JSONL\n* [`datasets.packaged_modules.parquet.Parquet`] for Parquet\n* [`datasets.packaged_modules.arrow.Arrow`] for Arrow (streaming file format)\n* [`datasets.packaged_modules.sql.Sql`] for SQL databases\n* [`datasets.packaged_modules.imagefolder.ImageFolder`] for image folders\n* [`datasets.packaged_modules.audiofolder.AudioFolder`] for audio folders\n\n> [!TIP]\n> Read the [Share](./upload_dataset) section to learn more about how to share a dataset.\n\n🤗 Datasets downloads the dataset files from the original URL, generates the dataset and caches it in an Arrow table on your drive.\nIf you've downloaded the dataset before, then 🤗 Datasets will reload it from the cache to save you the trouble of downloading it again.\n\nNow that you have a high-level understanding about how datasets are built, let's take a closer look at the nuts and bolts of how all this works.\n\n## Building a dataset\n\nWhen you load a dataset for the first time, 🤗 Datasets takes the raw data file and builds it into a table of rows and typed columns. There are two main classes responsible for building a dataset: [`BuilderConfig`] and [`DatasetBuilder`].\n\n\n<div class=\"flex justify-center\">\n   <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/builderconfig.png\"/>\n</div>\n\n### BuilderConfig[[datasets-builderconfig]]\n\n[`BuilderConfig`] is the configuration class of [`DatasetBuilder`]. The [`BuilderConfig`] contains the following basic attributes about a dataset:\n\n| Attribute     | Description                                                  |\n|---------------|--------------------------------------------------------------|\n| `name`        | Short name of the dataset.                                   
|\n| `version`     | Dataset version identifier.                                  |\n| `data_dir`    | Stores the path to a local folder containing the data files. |\n| `data_files`  | Stores paths to local data files.                            |\n| `description` | Description of the dataset.                                  |\n\nIf you want to add additional attributes to your dataset such as the class labels, you can subclass the base [`BuilderConfig`] class. There are two ways to populate the attributes of a [`BuilderConfig`] class or subclass:\n\n- Provide a list of predefined [`BuilderConfig`] class (or subclass) instances in the datasets [`DatasetBuilder.BUILDER_CONFIGS`] attribute.\n\n- When you call [`load_dataset`], any keyword arguments that are not specific to the method will be used to set the associated attributes of the [`BuilderConfig`] class. This will override the predefined attributes if a specific configuration was selected.\n\nYou can also set the [`DatasetBuilder.BUILDER_CONFIG_CLASS`] to any custom subclass of [`BuilderConfig`].\n\n### DatasetBuilder[[datasets-datasetbuilder]]\n\n[`DatasetBuilder`] accesses all the attributes inside [`BuilderConfig`] to build the actual dataset.\n\n<div class=\"flex justify-center\">\n   <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/datasetbuilder.png\"/>\n</div>\n\nThere are three main methods in [`DatasetBuilder`]:\n\n1. [`DatasetBuilder._info`] is in charge of defining the dataset attributes. When you call `dataset.info`, 🤗 Datasets returns the information stored here. Likewise, the [`Features`] are also specified here. Remember, the [`Features`] are like the skeleton of the dataset. It provides the names and types of each column.\n\n2. [`DatasetBuilder._split_generator`] downloads or retrieves the requested data files, organizes them into splits, and defines specific arguments for the generation process. 
This method has a [`DownloadManager`] that downloads files or fetches them from your local filesystem. Within the [`DownloadManager`], there is a [`DownloadManager.download_and_extract`] method that accepts a dictionary of URLs to the original data files, and downloads the requested files. Accepted inputs include: a single URL or path, or a list/dictionary of URLs or paths. Any compressed file types like TAR, GZIP and ZIP archives will be automatically extracted.\n\n   Once the files are downloaded, [`SplitGenerator`] organizes them into splits. The [`SplitGenerator`] contains the name of the split, and any keyword arguments that are provided to the [`DatasetBuilder._generate_examples`] method. The keyword arguments can be specific to each split, and typically comprise at least the local path to the data files for each split.\n\n3. [`DatasetBuilder._generate_examples`] reads and parses the data files for a split. Then it yields dataset examples according to the format specified in the `features` from [`DatasetBuilder._info`]. The input of [`DatasetBuilder._generate_examples`] is actually the `filepath` provided in the keyword arguments of the last method.\n\n   The dataset is generated with a Python generator, which doesn't load all the data in memory. As a result, the generator can handle large datasets. However, before the generated samples are flushed to the dataset file on disk, they are stored in an `ArrowWriter` buffer. This means the generated samples are written by batch. If your dataset samples consume a lot of memory (images or videos), then make sure to specify a low value for the `DEFAULT_WRITER_BATCH_SIZE` attribute in [`DatasetBuilder`]. We recommend not exceeding a size of 200 MB.\n\n## Maintaining integrity\n\nTo ensure a dataset is complete, [`load_dataset`] will perform a series of tests on the downloaded files to make sure everything is there. 
This way, you don't encounter any surprises when your requested dataset doesn't get generated as expected. [`load_dataset`] verifies:\n\n- The number of splits in the generated `DatasetDict`.\n- The number of samples in each split of the generated `DatasetDict`.\n- The list of downloaded files.\n- The SHA256 checksums of the downloaded files (disabled by default).\n\nIf the dataset doesn't pass the verifications, it is likely that the dataset author made some changes in the data files.\n\nIn this case, an error is raised to alert that the dataset has changed.\nTo ignore the error, one needs to specify `verification_mode=\"no_checks\"` in [`load_dataset`].\nAnytime you see a verification error, feel free to open a discussion or pull request in the corresponding dataset \"Community\" tab, so that the integrity checks for that dataset are updated.\n\n## Security\n\nThe dataset repositories on the Hub are scanned for malware, see more information [here](https://huggingface.co/docs/hub/security#malware-scanning).\n"
  },
  {
    "path": "docs/source/about_map_batch.mdx",
    "content": "# Batch mapping\n\nCombining the utility of [`Dataset.map`] with batch mode is very powerful. It allows you to speed up processing, and freely control the size of the generated dataset. \n\n## Need for speed\n\nThe primary objective of batch mapping is to speed up processing. Often times, it is faster to work with batches of data instead of single examples. Naturally, batch mapping lends itself to tokenization. For example, the 🤗 [Tokenizers](https://huggingface.co/docs/tokenizers/python/latest/) library works faster with batches because it parallelizes the tokenization of all the examples in a batch.\n\n## Input size != output size\n\nThe ability to control the size of the generated dataset can be leveraged for many interesting use-cases. In the How-to [map](#map) section, there are examples of using batch mapping to:\n\n- Split long sentences into shorter chunks.\n- Augment a dataset with additional tokens.\n\nIt is helpful to understand how this works, so you can come up with your own ways to use batch mapping. At this point, you may be wondering how you can control the size of the generated dataset. The answer is: **the mapped function does not have to return an output batch of the same size**.\n\nIn other words, your mapped function input can be a batch of size `N` and return a batch of size `M`. The output `M` can be greater than or less than `N`. This means you can concatenate your examples, divide it up, and even add more examples!\n\nHowever, remember that all values in the output dictionary must contain the **same number of elements** as the other fields in the output dictionary. Otherwise, it is not possible to define the number of examples in the output returned by the mapped function. The number can vary between successive batches processed by the mapped function. 
For a single batch though, all values of the output dictionary should have the same length (i.e., the number of elements).\n\nFor example, from a dataset of 1 column and 3 rows, if you use `map` to return a new column with twice as many rows, then you will have an error.\nIn this case, you end up with one column with 3 rows, and one column with 6 rows. As you can see, the table will not be valid:\n\n```py\n>>> from datasets import Dataset\n>>> dataset = Dataset.from_dict({\"a\": [0, 1, 2]})\n>>> dataset.map(lambda batch: {\"b\": batch[\"a\"] * 2}, batched=True)  # new column with 6 elements: [0, 1, 2, 0, 1, 2]\n'ArrowInvalid: Column 1 named b expected length 3 but got length 6'\n```\n\nTo make it valid, you have to drop one of the columns:\n\n```py\n>>> from datasets import Dataset\n>>> dataset = Dataset.from_dict({\"a\": [0, 1, 2]})\n>>> dataset_with_duplicates = dataset.map(lambda batch: {\"b\": batch[\"a\"] * 2}, remove_columns=[\"a\"], batched=True)\n>>> len(dataset_with_duplicates)\n6\n```\nAlternatively, you can overwrite the existing column to achieve the same result.\nFor example, here’s how to duplicate every row in the dataset by overwriting column `\"a\"`:\n\n```py\n>>> from datasets import Dataset\n>>> dataset = Dataset.from_dict({\"a\": [0, 1, 2]})\n# overwrites the existing \"a\" column with duplicated values\n>>> duplicated_dataset = dataset.map(\n...     lambda batch: {\"a\": [x for x in batch[\"a\"] for _ in range(2)]},\n...     batched=True\n... )\n>>> duplicated_dataset\nDataset({\n    features: ['a'],\n    num_rows: 6\n})\n>>> duplicated_dataset[\"a\"]\n[0, 0, 1, 1, 2, 2]\n```\n"
  },
  {
    "path": "docs/source/about_mapstyle_vs_iterable.mdx",
    "content": "# Differences between Dataset and IterableDataset\n\nThere are two types of dataset objects, a [`Dataset`] and an [`IterableDataset`].\nWhichever type of dataset you choose to use or create depends on the size of the dataset.\nIn general, an [`IterableDataset`] is ideal for big datasets (think hundreds of GBs!) due to its lazy behavior and speed advantages, while a [`Dataset`] is great for everything else.\nThis page will compare the differences between a [`Dataset`] and an [`IterableDataset`] to help you pick the right dataset object for you.\n\n## Downloading and streaming\n\nWhen you have a regular [`Dataset`], you can access it using `my_dataset[0]`. This provides random access to the rows.\nSuch datasets are also called \"map-style\" datasets.\nFor example you can download ImageNet-1k like this and access any row:\n\n```python\nfrom datasets import load_dataset\n\nimagenet = load_dataset(\"timm/imagenet-1k-wds\", split=\"train\")  # downloads the full dataset\nprint(imagenet[0])\n```\n\nBut one caveat is that you must have the entire dataset stored on your disk or in memory, which blocks you from accessing datasets bigger than the disk.\nBecause it can become inconvenient for big datasets, there exists another type of dataset, the [`IterableDataset`].\nWhen you have an `IterableDataset`, you can access it using a `for` loop to load the data progressively as you iterate over the dataset.\nThis way, only a small fraction of examples is loaded in memory, and you don't write anything on disk.\n\nFor example, you can stream the ImageNet-1k dataset without downloading it on disk:\n\n```python\nfrom datasets import load_dataset\n\nimagenet = load_dataset(\"timm/imagenet-1k-wds\", split=\"train\", streaming=True)  # will start loading the data when iterated over\nfor example in imagenet:\n    print(example)\n    break\n```\n\nStreaming can read online data without writing any file to disk.\nFor example, you can stream datasets made out of multiple 
shards, each of which is hundreds of gigabytes like [C4](https://huggingface.co/datasets/c4)  or [LAION-2B](https://huggingface.co/datasets/laion/laion2B-en).\nLearn more about how to stream a dataset in the [Dataset Streaming Guide](./stream).\n\nThis is not the only difference though, because the \"lazy\" behavior of an `IterableDataset` is also present when it comes to dataset creation and processing.\n\n## Creating map-style datasets and iterable datasets\n\nYou can create a [`Dataset`] using lists or dictionaries, and the data is entirely converted to Arrow so you can easily access any row:\n```python\nmy_dataset = Dataset.from_dict({\"col_1\": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]})\nprint(my_dataset[0])\n```\n\nTo create an `IterableDataset` on the other hand, you must provide a \"lazy\" way to load the data.\nIn Python, we generally use generator functions. These functions `yield` one example at a time, which means you can't access a row by slicing it like a regular `Dataset`:\n```python\ndef my_generator(n):\n    for i in range(n):\n        yield {\"col_1\": i}\n\nmy_iterable_dataset = IterableDataset.from_generator(my_generator, gen_kwargs={\"n\": 10})\nfor example in my_iterable_dataset:\n    print(example)\n    break\n```\n\n## Loading local files entirely and progressively\n\nIt is possible to convert local or remote data files to an Arrow [`Dataset`] using [`load_dataset`]:\n```python\ndata_files = {\"train\": [\"path/to/data.csv\"]}\nmy_dataset = load_dataset(\"csv\", data_files=data_files, split=\"train\")\nprint(my_dataset[0])\n```\n\nHowever, this requires a conversion step from CSV to Arrow format, which takes time and disk space if your dataset is big.\n\nTo save disk space and skip the conversion step, you can define an `IterableDataset` by streaming from the local files directly.\nThis way, the data is read progressively from the local files as you iterate over the dataset:\n\n```python\ndata_files = {\"train\": 
[\"path/to/data.csv\"]}\nmy_iterable_dataset = load_dataset(\"csv\", data_files=data_files, split=\"train\", streaming=True)\nfor example in my_iterable_dataset:  # this reads the CSV file progressively as you iterate over the dataset\n    print(example)\n    break\n```\n\nMany file formats are supported, like CSV, JSONL, and Parquet, as well as image and audio files.\nYou can find more information in the corresponding guides for loading [tabular](./tabular_load), [text](./nlp_load), [vision](./image_load), and [audio](./audio_load) datasets.\n\n## Eager data processing and lazy data processing\n\nWhen you process a [`Dataset`] object using [`Dataset.map`], the entire dataset is processed immediately and returned.\nThis is similar to how `pandas` works for example.\n\n```python\nmy_dataset = my_dataset.map(process_fn)  # process_fn is applied on all the examples of the dataset\nprint(my_dataset[0])\n```\n\nOn the other hand, due to the \"lazy\" nature of an `IterableDataset`, calling [`IterableDataset.map`] does not apply your `map` function over the full dataset.\nInstead, your `map` function is applied on-the-fly.\n\nBecause of that, you can chain multiple processing steps and they will all run at once when you start iterating over the dataset:\n\n```python\nmy_iterable_dataset = my_iterable_dataset.map(process_fn_1)\nmy_iterable_dataset = my_iterable_dataset.filter(filter_fn)\nmy_iterable_dataset = my_iterable_dataset.map(process_fn_2)\n\n# process_fn_1, filter_fn and process_fn_2 are applied on-the-fly when iterating over the dataset\nfor example in my_iterable_dataset:  \n    print(example)\n    break\n```\n\n## Exact and fast approximate shuffling\n\nWhen you shuffle a [`Dataset`] using [`Dataset.shuffle`], you apply an exact shuffling of the dataset.\nIt works by taking a list of indices `[0, 1, 2, ... 
len(my_dataset) - 1]` and shuffling this list.\nThen, accessing `my_dataset[0]` returns the row and index defined by the first element of the indices mapping that has been shuffled:\n```python\nmy_dataset = my_dataset.shuffle(seed=42)\nprint(my_dataset[0])\n```\n\nSince we don't have random access to the rows in the case of an `IterableDataset`, we can't use a shuffled list of indices and access a row at an arbitrary position.\nThis prevents the use of exact shuffling.\nInstead, a fast approximate shuffling is used in [`IterableDataset.shuffle`].\nIt uses a shuffle buffer to sample random examples iteratively from the dataset.\nSince the dataset is still read iteratively, it provides excellent speed performance:\n```python\nmy_iterable_dataset = my_iterable_dataset.shuffle(seed=42, buffer_size=100)\nfor example in my_iterable_dataset:\n    print(example)\n    break\n```\n\nBut using a shuffle buffer is not enough to provide a satisfactory shuffling for machine learning model training. So [`IterableDataset.shuffle`] also shuffles the dataset shards if your dataset is made of multiple files or sources:\n\n```python\n# Stream from the internet\nmy_iterable_dataset = load_dataset(\"deepmind/code_contests\", split=\"train\", streaming=True)\nmy_iterable_dataset.num_shards  # 39\n\n# Stream from local files\ndata_files = {\"train\": [f\"path/to/data_{i}.csv\" for i in range(1024)]}\nmy_iterable_dataset = load_dataset(\"csv\", data_files=data_files, split=\"train\", streaming=True)\nmy_iterable_dataset.num_shards  # 1024\n\n# From a generator function\ndef my_generator(n, sources):\n    for source in sources:\n        for example_id_for_current_source in range(n):\n            yield {\"example_id\": f\"{source}_{example_id_for_current_source}\"}\n\ngen_kwargs = {\"n\": 10, \"sources\": [f\"path/to/data_{i}\" for i in range(1024)]}\nmy_iterable_dataset = IterableDataset.from_generator(my_generator, gen_kwargs=gen_kwargs)\nmy_iterable_dataset.num_shards  # 1024\n```\n\n## 
Speed differences\n\nRegular [`Dataset`] objects are based on Arrow which provides fast random access to the rows.\nThanks to memory mapping and the fact that Arrow is an in-memory format, reading data from disk doesn't do expensive system calls and deserialization.\nIt provides even faster data loading when iterating using a `for` loop by iterating on contiguous Arrow record batches.\n\nHowever as soon as your [`Dataset`] has an indices mapping (via [`Dataset.shuffle`] for example), the speed can become 10x slower.\nThis is because there is an extra step to get the row index to read using the indices mapping, and most importantly, you aren't reading contiguous chunks of data anymore.\nTo restore the speed, you'd need to rewrite the entire dataset on your disk again using [`Dataset.flatten_indices`], which removes the indices mapping.\nThis may take a lot of time depending on the size of your dataset though:\n\n```python\nmy_dataset[0]  # fast\nmy_dataset = my_dataset.shuffle(seed=42)\nmy_dataset[0]  # up to 10x slower\nmy_dataset = my_dataset.flatten_indices()  # rewrite the shuffled dataset on disk as contiguous chunks of data\nmy_dataset[0]  # fast again\n```\n\n\nIn this case, we recommend switching to an [`IterableDataset`] and leveraging its fast approximate shuffling method [`IterableDataset.shuffle`].\nIt only shuffles the shards order and adds a shuffle buffer to your dataset, which keeps the speed of your dataset optimal.\nYou can also reshuffle the dataset easily:\n\n```python\nfor example in enumerate(my_iterable_dataset):  # fast\n    pass\n\nshuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=42, buffer_size=100)\n\nfor example in enumerate(shuffled_iterable_dataset):  # as fast as before\n    pass\n\nshuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=1337, buffer_size=100)  # reshuffling using another seed is instantaneous\n\nfor example in enumerate(shuffled_iterable_dataset):  # still as fast as before\n    pass\n```\n\nIf you're 
using your dataset on multiple epochs, the effective seed used to shuffle the shard order and the shuffle buffer is `seed + epoch`.\nIt makes it easy to reshuffle a dataset between epochs:\n```python\nfor epoch in range(n_epochs):\n    my_iterable_dataset.set_epoch(epoch)\n    for example in my_iterable_dataset:  # fast + reshuffled at each epoch using `effective_seed = seed + epoch`\n        pass\n```\n\nTo restart the iteration of a map-style dataset, you can simply skip the first examples:\n\n```python\nmy_dataset = my_dataset.select(range(start_index, len(my_dataset)))\n```\n\nBut if you use a `DataLoader` with a `Sampler`, you should instead save the state of your sampler (you might have written a custom sampler that allows resuming).\n\nOn the other hand, iterable datasets don't provide random access to a specific example index to resume from. But you can use [`IterableDataset.state_dict`] and [`IterableDataset.load_state_dict`] to resume from a checkpoint instead, similarly to what you can do for models and optimizers:\n\n```python\n>>> iterable_dataset = Dataset.from_dict({\"a\": range(6)}).to_iterable_dataset(num_shards=3)\n>>> # save in the middle of training\n>>> state_dict = iterable_dataset.state_dict()\n>>> # and resume later\n>>> iterable_dataset.load_state_dict(state_dict)\n```\n\nUnder the hood, the iterable dataset keeps track of the current shard being read and the example index in the current shard and it stores this info in the `state_dict`.\n\nTo resume from a checkpoint, the dataset skips all the shards that were previously read to restart from the current shard. \nThen it reads the shard and skips examples until it reaches the exact example from the checkpoint.\n\nTherefore restarting a dataset is quite fast, since it will not re-read the shards that have already been iterated on. 
Still, resuming a dataset is generally not instantaneous since it has to restart reading from the beginning of the current shard and skip examples until it reaches the checkpoint location.\n\nThis can be used with the `StatefulDataLoader` from `torchdata`, see [streaming with a PyTorch DataLoader](./use_with_pytorch#stream-data).\n\n## Switch from map-style to iterable\n\nIf you want to benefit from the \"lazy\" behavior of an [`IterableDataset`] or their speed advantages, you can switch your map-style [`Dataset`] to an [`IterableDataset`]:\n```python\nmy_iterable_dataset = my_dataset.to_iterable_dataset()\n```\n\nIf you want to shuffle your dataset or [use it with a PyTorch DataLoader](./use_with_pytorch#stream-data), we recommend generating a sharded [`IterableDataset`]:\n```python\nmy_iterable_dataset = my_dataset.to_iterable_dataset(num_shards=1024)\nmy_iterable_dataset.num_shards  # 1024\n```\n"
  },
  {
    "path": "docs/source/access.mdx",
    "content": "# Know your dataset\n\nThere are two types of dataset objects, a regular [`Dataset`] and then an ✨ [`IterableDataset`] ✨. A [`Dataset`] provides fast random access to the rows, and memory-mapping so that loading even large datasets only uses a relatively small amount of device memory. But for really, really big datasets that won't even fit on disk or in memory, an [`IterableDataset`] allows you to access and use the dataset without waiting for it to download completely!\n\nThis tutorial will show you how to load and access a [`Dataset`] and an [`IterableDataset`].\n\n## Dataset\n\nWhen you load a dataset split, you'll get a [`Dataset`] object. You can do many things with a [`Dataset`] object, which is why it's important to learn how to manipulate and interact with the data stored inside. \n \nThis tutorial uses the [rotten_tomatoes](https://huggingface.co/datasets/rotten_tomatoes) dataset, but feel free to load any dataset you'd like and follow along!\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n```\n\n### Indexing\n\nA [`Dataset`] contains columns of data, and each column can be a different type of data. The *index*, or axis label, is used to access examples from the dataset. 
For example, indexing by the row returns a dictionary of an example from the dataset:\n\n```py\n# Get the first row in the dataset\n>>> dataset[0]\n{'label': 1,\n 'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}\n```\n\nUse the `-` operator to start from the end of the dataset:\n\n```py\n# Get the last row in the dataset\n>>> dataset[-1]\n{'label': 0,\n 'text': 'things really get weird , though not particularly scary : the movie is all portent and no content .'}\n```\n\nIndexing by the column name returns a list of all the values in the column:\n\n```py\n>>> dataset[\"text\"]\n['the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',\n 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . tolkien\\'s middle-earth .',\n 'effective but too-tepid biopic',\n ...,\n 'things really get weird , though not particularly scary : the movie is all portent and no content .']\n```\n\nYou can combine row and column name indexing to return a specific value at a position:\n\n```py\n>>> dataset[0][\"text\"]\n'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'\n```\n\nIndexing order doesn't matter. 
Indexing by the column name first returns a [`Column`] object that you can index as usual with row indices:\n\n```py\n>>> import time\n\n>>> start_time = time.time()\n>>> text = dataset[0][\"text\"]\n>>> end_time = time.time()\n>>> print(f\"Elapsed time: {end_time - start_time:.4f} seconds\")\nElapsed time: 0.0031 seconds\n\n>>> start_time = time.time()\n>>> text = dataset[\"text\"][0]\n>>> end_time = time.time()\n>>> print(f\"Elapsed time: {end_time - start_time:.4f} seconds\")\nElapsed time: 0.0042 seconds\n```\n\n### Slicing\n\nSlicing returns a slice - or subset - of the dataset, which is useful for viewing several rows at once. To slice a dataset, use the `:` operator to specify a range of positions. \n\n```py\n# Get the first three rows\n>>> dataset[:3]\n{'label': [1, 1, 1],\n 'text': ['the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',\n  'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . 
tolkien\\'s middle-earth .',\n  'effective but too-tepid biopic']}\n\n# Get rows between three and six\n>>> dataset[3:6]\n{'label': [1, 1, 1],\n 'text': ['if you sometimes like to go to the movies to have fun , wasabi is a good place to start .',\n  \"emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .\",\n  'the film provides some great insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game .']}\n```\n\n## IterableDataset\n\nAn [`IterableDataset`] is loaded when you set the `streaming` parameter to `True` in [`~datasets.load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> iterable_dataset = load_dataset(\"ethz/food101\", split=\"train\", streaming=True)\n>>> for example in iterable_dataset:\n...     print(example)\n...     break\n{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=384x512 at 0x7F0681F5C520>, 'label': 6}\n```\n\nYou can also create an [`IterableDataset`] from an *existing* [`Dataset`], but it is faster than streaming mode because the dataset is streamed from local files:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n>>> iterable_dataset = dataset.to_iterable_dataset()\n```\n\nAn [`IterableDataset`] progressively iterates over a dataset one example at a time, so you don't have to wait for the whole dataset to download before you can use it. As you can imagine, this is quite useful for large datasets you want to use immediately!\n\n### Indexing\n\nAn [`IterableDataset`]'s behavior is different from a regular [`Dataset`]. You don't get random access to examples in an [`IterableDataset`]. 
Instead, you should iterate over its elements, for example, by calling `next(iter())` or with a `for` loop to return the next item from the [`IterableDataset`]:\n\n```py\n>>> next(iter(iterable_dataset))\n{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=384x512 at 0x7F0681F59B50>,\n 'label': 6}\n\n>>> for example in iterable_dataset:\n...     print(example)\n...     break\n{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=384x512 at 0x7F7479DE82B0>, 'label': 6}\n```\n\nBut an [`IterableDataset`] supports column indexing that returns an iterable for the column values:\n\n```py\n>>> next(iter(iterable_dataset[\"label\"]))\n6\n```\n\n### Creating a subset\n\nYou can return a subset of the dataset with a specific number of examples in it with [`IterableDataset.take`]:\n\n```py\n# Get first three examples\n>>> list(iterable_dataset.take(3))\n[{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=384x512 at 0x7F7479DEE9D0>,\n  'label': 6},\n {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512 at 0x7F7479DE8190>,\n  'label': 6},\n {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x383 at 0x7F7479DE8310>,\n  'label': 6}]\n```\n\nBut unlike [slicing](access/#slicing), [`IterableDataset.take`] creates a new [`IterableDataset`]. \n\n## Next steps\n\nInterested in learning more about the differences between these two types of datasets? Learn more about them in the [Differences between `Dataset` and `IterableDataset`](about_mapstyle_vs_iterable) conceptual guide.\n\nTo get more hands-on with these datasets types, check out the [Process](process) guide to learn how to preprocess a [`Dataset`] or the [Stream](stream) guide to learn how to preprocess an [`IterableDataset`].\n"
  },
  {
    "path": "docs/source/audio_dataset.mdx",
    "content": "# Create an audio dataset\n\nYou can share a dataset with your team or with anyone in the community by creating a dataset repository on the Hugging Face Hub:\n\n```py\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"<username>/my_dataset\")\n```\n\nThere are several methods for creating and sharing an audio dataset:\n\n- Create an audio dataset from local files in python with [`Dataset.push_to_hub`]. This is an easy way that requires only a few steps in python.\n\n- Create an audio dataset repository with the `AudioFolder` builder. This is a no-code solution for quickly creating an audio dataset with several thousand audio files.\n\n> [!TIP]\n> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub.\n\n## Local files\n\nYou can load your own dataset using the paths to your audio files. 
Use the [`~Dataset.cast_column`] function to take a column of audio file paths, and cast it to the [`Audio`] feature:\n\n```py\n>>> audio_dataset = Dataset.from_dict({\"audio\": [\"path/to/audio_1\", \"path/to/audio_2\", ..., \"path/to/audio_n\"]}).cast_column(\"audio\", Audio())\n>>> audio_dataset[0][\"audio\"]\n<datasets.features._torchcodec.AudioDecoder object at 0x11642b6a0>\n```\n\nThen upload the dataset to the Hugging Face Hub using [`Dataset.push_to_hub`]:\n\n```py\naudio_dataset.push_to_hub(\"<username>/my_dataset\")\n```\n\nThis will create a dataset repository containing your audio dataset:\n\n```\nmy_dataset/\n├── README.md\n└── data/\n    └── train-00000-of-00001.parquet\n```\n\n## AudioFolder\n\nThe `AudioFolder` is a dataset builder designed to quickly load an audio dataset with several thousand audio files without requiring you to write any code.\n\n> [!TIP]\n> 💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `AudioFolder` creates dataset splits based on your dataset repository structure.\n\n`AudioFolder` automatically infers the class labels of your dataset based on the directory name. Store your dataset in a directory structure like:\n\n```\nfolder/train/dog/golden_retriever.mp3\nfolder/train/dog/german_shepherd.mp3\nfolder/train/dog/chihuahua.mp3\n\nfolder/train/cat/maine_coon.mp3\nfolder/train/cat/bengal.mp3\nfolder/train/cat/birman.mp3\n```\n\nIf the dataset follows the `AudioFolder` structure, then you can load it directly with [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_name\")\n```\n\nThis is equivalent to passing `audiofolder` manually in [`load_dataset`] and the directory in `data_dir`:\n\n```py\n>>> dataset = load_dataset(\"audiofolder\", data_dir=\"/path/to/folder\")\n```\n\nYou can also use `audiofolder` to load datasets involving multiple splits. 
To do so, your dataset directory should have the following structure:\n\n```\nfolder/train/dog/golden_retriever.mp3\nfolder/train/cat/maine_coon.mp3\nfolder/test/dog/german_shepherd.mp3\nfolder/test/cat/bengal.mp3\n```\n\n> [!WARNING]\n> If all audio files are contained in a single directory or if they are not on the same level of directory structure, the `label` column won't be added automatically. If you need it, set `drop_labels=False` explicitly.\n\nIf there is additional information you'd like to include about your dataset, like transcriptions or speaker labels, add it as a `metadata.csv` file in your folder. This lets you quickly create datasets for different audio tasks like speech recognition or audio classification. You can also use a JSONL file `metadata.jsonl` or a Parquet file `metadata.parquet`.\n\n```\nfolder/train/metadata.csv\nfolder/train/0001.mp3\nfolder/train/0002.mp3\nfolder/train/0003.mp3\n```\n\nYou can also zip your audio files, and in this case each zip should contain both the audio files and the metadata:\n\n```\nfolder/train.zip\nfolder/test.zip\nfolder/validation.zip\n```\n\nYour `metadata.csv` file must have a `file_name` or `*_file_name` field which links audio files with their metadata:\n\n```csv\nfile_name,additional_feature\n0001.mp3,This is a first value of a text feature you added to your audio files\n0002.mp3,This is a second value of a text feature you added to your audio files\n0003.mp3,This is a third value of a text feature you added to your audio files\n```\n\nor using `metadata.jsonl`:\n\n```jsonl\n{\"file_name\": \"0001.mp3\", \"additional_feature\": \"This is a first value of a text feature you added to your audio files\"}\n{\"file_name\": \"0002.mp3\", \"additional_feature\": \"This is a second value of a text feature you added to your audio files\"}\n{\"file_name\": \"0003.mp3\", \"additional_feature\": \"This is a third value of a text feature you added to your audio files\"}\n```\n\nHere the `file_name` must be the name 
of the audio file next to the metadata file. More generally, it must be the relative path from the directory containing the metadata to the audio file.\n\nIt's possible to point to more than one audio file in each row in your dataset, for example if both your input and output are audio files:\n\n```jsonl\n{\"input_file_name\": \"0001.mp3\", \"output_file_name\": \"0001_output.mp3\"}\n{\"input_file_name\": \"0002.mp3\", \"output_file_name\": \"0002_output.mp3\"}\n{\"input_file_name\": \"0003.mp3\", \"output_file_name\": \"0003_output.mp3\"}\n```\n\nYou can also define lists of audio files. In that case you need to name the field `file_names` or `*_file_names`. Here is an example:\n\n```jsonl\n{\"recordings_file_names\": [\"0001_r0.mp3\", \"0001_r1.mp3\"], \"label\": \"same_person\"}\n{\"recordings_file_names\": [\"0002_r0.mp3\", \"0002_r1.mp3\"], \"label\": \"same_person\"}\n{\"recordings_file_names\": [\"0003_r0.mp3\", \"0003_r1.mp3\"], \"label\": \"different_person\"}\n```\n\n## WebDataset\n\nThe [WebDataset](https://github.com/webdataset/webdataset) format is based on TAR archives and is suitable for big audio datasets.\nIndeed you can group your audio files in TAR archives (e.g. 
1GB of audio files per TAR archive) and have thousands of TAR archives:\n\n```\nfolder/train/00000.tar\nfolder/train/00001.tar\nfolder/train/00002.tar\n...\n```\n\nIn the archives, each example is made of files sharing the same prefix:\n\n```\ne39871fd9fd74f55.mp3\ne39871fd9fd74f55.json\nf18b91585c4d3f3e.mp3\nf18b91585c4d3f3e.json\nede6e66b2fb59aab.mp3\nede6e66b2fb59aab.json\ned600d57fcee4f94.mp3\ned600d57fcee4f94.json\n...\n```\n\nYou can store the labels, captions, or transcriptions of your audio files in JSON or text files, for example.\n\nLoad your WebDataset and it will create one column per file suffix (here \"mp3\" and \"json\"):\n\n```python\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"webdataset\", data_dir=\"/path/to/folder\", split=\"train\")\n>>> dataset[0][\"json\"]\n{\"transcript\": \"Hello there !\", \"speaker\": \"Obi-Wan Kenobi\"}\n```\n\nIt's also possible to have several audio files per example like this:\n\n```\ne39871fd9fd74f55.input.mp3\ne39871fd9fd74f55.output.mp3\ne39871fd9fd74f55.json\nf18b91585c4d3f3e.input.mp3\nf18b91585c4d3f3e.output.mp3\nf18b91585c4d3f3e.json\n...\n```\n\nFor more details on the WebDataset format and the python library, please check the [WebDataset documentation](https://webdataset.github.io/webdataset).\n"
  },
  {
    "path": "docs/source/audio_load.mdx",
    "content": "# Load audio data\n\nYou can load an audio dataset using the [`Audio`] feature that automatically decodes and resamples the audio files when you access the examples.\nAudio decoding is based on the [`torchcodec`](https://github.com/pytorch/torchcodec) python package, which uses the [`FFmpeg`](https://www.ffmpeg.org/) C library under the hood.\n\n## Installation\n\nTo work with audio datasets, you need to have the `audio` dependencies installed.\nCheck out the [installation](./installation#audio) guide to learn how to install it.\n\n## Local files\n\nYou can load your own dataset using the paths to your audio files. Use the [`~Dataset.cast_column`] function to take a column of audio file paths, and cast it to the [`Audio`] feature:\n\n```py\n>>> audio_dataset = Dataset.from_dict({\"audio\": [\"path/to/audio_1\", \"path/to/audio_2\", ..., \"path/to/audio_n\"]}).cast_column(\"audio\", Audio())\n>>> audio_dataset[0][\"audio\"]\n<datasets.features._torchcodec.AudioDecoder object at 0x11642b6a0>\n```\n\n## AudioFolder\n\nYou can also load a dataset with an `AudioFolder` dataset builder. It does not require writing a custom dataloader, making it useful for quickly creating and loading audio datasets with several thousand audio files.\n\n## AudioFolder with metadata\n\nTo link your audio files with metadata information, make sure your dataset has a `metadata.csv` file. Your dataset structure might look like:\n\n```\nfolder/train/metadata.csv\nfolder/train/first_audio_file.mp3\nfolder/train/second_audio_file.mp3\nfolder/train/third_audio_file.mp3\n```\n\nYour `metadata.csv` file must have a `file_name` column which links audio files with their metadata. 
An example `metadata.csv` file might look like:\n\n```text\nfile_name,transcription\nfirst_audio_file.mp3,znowu się duch z ciałem zrośnie w młodocianej wstaniesz wiosnie i możesz skutkiem tych leków umierać wstawać wiek wieków dalej tam były przestrogi jak siekać głowę jak nogi\nsecond_audio_file.mp3,już u źwierzyńca podwojów król zasiada przy nim książęta i panowie rada a gdzie wzniosły krążył ganek rycerze obok kochanek król skinął palcem zaczęto igrzysko\nthird_audio_file.mp3,pewnie kędyś w obłędzie ubite minęły szlaki zaczekajmy dzień jaki poślemy szukać wszędzie dziś jutro pewnie będzie posłali wszędzie sługi czekali dzień i drugi gdy nic nie doczekali z płaczem chcą jechać dali\n```\n\n`AudioFolder` will load audio data and create a `transcription` column containing texts from `metadata.csv`:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_name\")\n>>> # OR locally:\n>>> dataset = load_dataset(\"/path/to/folder\")\n```\n\nFor local datasets, this is equivalent to passing `audiofolder` manually in [`load_dataset`] and the directory in `data_dir`:\n\n```py\n>>> dataset = load_dataset(\"audiofolder\", data_dir=\"/path/to/folder\")\n```\n\nMetadata can also be specified as JSON Lines, in which case use `metadata.jsonl` as the name of the metadata file. This format is helpful in scenarios when one of the columns is complex, e.g. 
a list of floats, to avoid parsing errors or reading the complex values as strings.\n\nTo ignore the information in the metadata file, set `drop_metadata=True` in [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_with_metadata\", drop_metadata=True)\n```\n\nIf you don't have a metadata file, `AudioFolder` automatically infers the label name from the directory name.\nIf you want to drop automatically created labels, set `drop_labels=True`.\nIn this case, your dataset will only contain an audio column:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_without_metadata\", drop_labels=True)\n```\n\nFinally the `filters` argument lets you load only a subset of the dataset, based on a condition on the label or the metadata. This is especially useful if the metadata is in Parquet format, since this format enables fast filtering. It is also recommended to use this argument with `streaming=True`, because by default the dataset is fully downloaded before filtering.\n\n```python\n>>> filters = [(\"label\", \"=\", 0)]\n>>> dataset = load_dataset(\"username/dataset_name\", streaming=True, filters=filters)\n```\n\n> [!TIP]\n> For more information about creating your own `AudioFolder` dataset, take a look at the [Create an audio dataset](./audio_dataset) guide.\n\nFor a guide on how to load any type of dataset, take a look at the <a class=\"underline decoration-sky-400 decoration-2 font-semibold\" href=\"./loading\">general loading guide</a>.\n\n## Audio decoding\n\nBy default, audio files are decoded sequentially as torchcodec [`AudioDecoder`](https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.html#torchcodec.decoders.AudioDecoder) objects when you iterate on a dataset.\nHowever it is possible to speed up the dataset significantly using multithreaded decoding:\n\n```python\n>>> import os\n>>> num_threads = min(32, 
(os.cpu_count() or 1) + 4)\n>>> dataset = dataset.decode(num_threads=num_threads)\n>>> for example in dataset:  # up to 20 times faster !\n...     ...\n```\n\nYou can enable multithreading using `num_threads`. This is especially useful to speed up remote data streaming.\nHowever it can be slower than `num_threads=0` for local data on fast disks.\n\nIf you are not interested in the audio decoded as `AudioDecoder` objects and would like to access the path/bytes instead, you can disable decoding:\n\n```python\n>>> dataset = dataset.decode(False)\n```\n\nNote: [`IterableDataset.decode`] is only available for streaming datasets at the moment.\n"
  },
  {
    "path": "docs/source/audio_process.mdx",
    "content": "# Process audio data\n\nThis guide shows specific methods for processing audio datasets. Learn how to:\n\n- Resample the sampling rate.\n- Use [`~Dataset.map`] with audio datasets.\n\nFor a guide on how to process any type of dataset, take a look at the <a class=\"underline decoration-sky-400 decoration-2 font-semibold\" href=\"./process\">general process guide</a>.\n\n## Cast\n\nThe [`~Dataset.cast_column`] function is used to cast a column to another feature to be decoded. When you use this function with the [`Audio`] feature, you can resample the sampling rate:\n\n```py\n>>> from datasets import load_dataset, Audio\n\n>>> dataset = load_dataset(\"PolyAI/minds14\", \"en-US\", split=\"train\")\n>>> dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=16000))\n```\n\nAudio files are decoded and resampled on-the-fly, so the next time you access an example, the audio file is resampled to 16kHz:\n\n```py\n>>> audio = dataset[0][\"audio\"]\n<datasets.features._torchcodec.AudioDecoder object at 0x11642b6a0>\n>>> audio = audio_dataset[0][\"audio\"]\n>>> samples = audio.get_all_samples()\n>>> samples.data\ntensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  2.3447e-06,\n         -1.9127e-04, -5.3330e-05]]\n>>> samples.sample_rate\n16000\n```\n\n<div class=\"flex justify-center\">\n  <img\n    class=\"block dark:hidden\"\n    src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/resample.gif\"\n  />\n  <img\n    class=\"hidden dark:block\"\n    src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/resample-dark.gif\"\n  />\n</div>\n\n## Map\n\nThe [`~Dataset.map`] function helps preprocess your entire dataset at once. 
Depending on the type of model you're working with, you'll need to either load a [feature extractor](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoFeatureExtractor) or a [processor](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoProcessor).\n\n- For pretrained speech recognition models, load a feature extractor and tokenizer and combine them in a `processor`:\n\n  ```py\n  >>> from transformers import AutoTokenizer, AutoFeatureExtractor, AutoProcessor\n\n  >>> model_checkpoint = \"facebook/wav2vec2-large-xlsr-53\"\n  # after defining a vocab.json file you can instantiate a tokenizer object:\n  >>> tokenizer = AutoTokenizer(\"./vocab.json\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n  >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)\n  >>> processor = AutoProcessor.from_pretrained(feature_extractor=feature_extractor, tokenizer=tokenizer)\n  ```\n\n- For fine-tuned speech recognition models, you only need to load a `processor`:\n\n  ```py\n  >>> from transformers import AutoProcessor\n\n  >>> processor = AutoProcessor.from_pretrained(\"facebook/wav2vec2-base-960h\")\n  ```\n\nWhen you use [`~Dataset.map`] with your preprocessing function, include the `audio` column to ensure you're actually resampling the audio data:\n\n```py\n>>> def prepare_dataset(batch):\n...     audio = batch[\"audio\"]\n...     batch[\"input_values\"] = processor(audio.get_all_samples().data, sampling_rate=audio[\"sampling_rate\"]).input_values[0]\n...     batch[\"input_length\"] = len(batch[\"input_values\"])\n...     with processor.as_target_processor():\n...         batch[\"labels\"] = processor(batch[\"sentence\"]).input_ids\n...     return batch\n>>> dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)\n```\n"
  },
  {
    "path": "docs/source/cache.mdx",
    "content": "# Cache management\n\nWhen you download a dataset from Hugging Face, the data are stored locally on your computer.\nFiles from Hugging Face are stored as usual in the `huggingface_hub` cache, which is at `~/.cache/huggingface/hub` by default.\nSee the [Hub cache documentation](https://huggingface.co/docs/huggingface_hub/guides/manage-cache) for more details and how to change its location.\n\nThe Hub cache allows 🤗 Datasets to avoid re-downloading dataset files from Hugging Face every time you use them. \n\n🤗 Datasets also has its own cache to store datasets converted in Arrow format (the format used by [`Dataset`] objects).\n\nThis guide focuses on the 🤗 Datasets cache and will show you how to:\n\n- Change the cache directory.\n- Control how a dataset is loaded from the cache.\n- Clean up cache files in the directory.\n- Enable or disable caching.\n\n## Cache directory\n\nThe default 🤗 Datasets cache directory is `~/.cache/huggingface/datasets`. Change the cache location by setting the shell environment variable, `HF_HOME` to another directory:\n\n```\n$ export HF_HOME=\"/path/to/another/directory/datasets\"\n```\n\nAlternatively, you can set the `HF_DATASETS_CACHE` environment variable to control only the datasets-specific cache directory:\n\n```\n$ export HF_DATASETS_CACHE=\"/path/to/datasets_cache\"\n```\n\n⚠️ This only applies to files written by the `datasets` library (e.g., Arrow files and indices).  
\nIt does **not** affect files downloaded from the Hugging Face Hub (such as models, tokenizers, or raw dataset sources), which are located in `~/.cache/huggingface/hub` by default and controlled separately via the `HF_HUB_CACHE` variable:\n\n```\n$ export HF_HUB_CACHE=\"/path/to/hub_cache\"\n```\n\n💡 If you'd like to relocate all Hugging Face caches — including datasets and hub downloads — use the `HF_HOME` variable instead:\n\n```\n$ export HF_HOME=\"/path/to/cache_root\"\n```\n\nThis results in:\n- datasets cache → `/path/to/cache_root/datasets`\n- hub cache → `/path/to/cache_root/hub`\n\nThese distinctions are especially useful when working in shared environments or networked file systems (e.g., NFS).  \nSee [issue #7480](https://github.com/huggingface/datasets/issues/7480) for discussion on how users encountered unexpected cache locations when `HF_HUB_CACHE` was not set alongside `HF_DATASETS_CACHE`.\n\nWhen you load a dataset, you also have the option to change where the data is cached. Change the `cache_dir` parameter to the path you want:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset('username/dataset', cache_dir=\"/path/to/another/directory/datasets\")\n```\n\n## Download mode\n\nAfter you download a dataset, control how it is loaded by [`load_dataset`] with the `download_mode` parameter. By default, 🤗 Datasets will reuse a dataset if it exists. 
But if you need the original dataset without any processing functions applied, re-download the files as shown below:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset('rajpurkar/squad', download_mode='force_redownload')\n```\n\nRefer to [`DownloadMode`] for a full list of download modes.\n\n## Cache files\n\nClean up the Arrow cache files in the directory with [`Dataset.cleanup_cache_files`]:\n\n```py\n# Returns the number of removed cache files\n>>> dataset.cleanup_cache_files()\n2\n```\n\n## Enable or disable caching\n\nIf you're using a cached file locally, it will automatically reload the dataset with any previous transforms you applied to the dataset. Disable this behavior by setting the argument `load_from_cache_file=False` in [`Dataset.map`]:\n\n```py\n>>> updated_dataset = small_dataset.map(add_prefix, load_from_cache_file=False)\n```\n\nIn the example above, 🤗 Datasets will execute the function `add_prefix` over the entire dataset again instead of loading the dataset from its previous state.\n\nDisable caching on a global scale with [`disable_caching`]:\n\n```py\n>>> from datasets import disable_caching\n>>> disable_caching()\n```\n\nWhen you disable caching, 🤗 Datasets will no longer reload cached files when applying transforms to datasets. Any transform you apply on your dataset will need to be reapplied.\n\n> [!TIP]\n> If you want to reuse a dataset from scratch, try setting the `download_mode` parameter in [`load_dataset`] instead.\n\n<a id='load_dataset_enhancing_performance'></a>\n\n## Improve performance\n\nDisabling the cache and copying the dataset in-memory will speed up dataset operations. There are two options for copying the dataset in-memory:\n\n1. Set `datasets.config.IN_MEMORY_MAX_SIZE` to a nonzero value (in bytes) that fits in your RAM.\n\n2. Set the environment variable `HF_DATASETS_IN_MEMORY_MAX_SIZE` to a nonzero value. Note that the first method takes higher precedence.\n"
  },
  {
    "path": "docs/source/cli.mdx",
    "content": "# Command Line Interface (CLI)\n\n🤗 Datasets provides a command line interface (CLI) with useful shell commands to interact with your dataset.\n\nYou can check the available commands:\n```bash\n>>> datasets-cli --help\nusage: datasets-cli <command> [<args>]\n\npositional arguments:\n  {env,test,delete_from_hub}\n                        datasets-cli command helpers\n    env                 Print relevant system environment info.\n    test                Test dataset loading.\n    delete_from_hub     Delete dataset config from the Hub\n\noptional arguments:\n  -h, --help            show this help message and exit\n```\n\n## Delete from Hub\n\nDelete a dataset configuration from a [supported dataset](repository_structure) on the Hub.\n\n```bash\n>>> datasets-cli delete_from_hub --help\nusage: datasets-cli <command> [<args>] delete_from_hub [-h] [--token TOKEN] [--revision REVISION] dataset_id config_name\n\npositional arguments:\n  dataset_id           source dataset ID, e.g. USERNAME/DATASET_NAME or ORGANIZATION/DATASET_NAME\n  config_name          config name to delete\n\noptional arguments:\n  -h, --help           show this help message and exit\n  --token TOKEN        access token to the Hugging Face Hub\n  --revision REVISION  source revision\n```\n\nFor example:\n```bash\n>>> datasets-cli delete_from_hub USERNAME/DATASET_NAME CONFIG_NAME\n```\n\n> [!TIP]\n> Do not forget that you need to log in first to your Hugging Face account:\n> ```bash\n> >>> hf auth login\n> ```\n"
  },
  {
    "path": "docs/source/create_dataset.mdx",
    "content": "# Create a dataset\n\nSometimes, you may need to create a dataset if you're working with your own data. Creating a dataset with 🤗 Datasets confers all the advantages of the library to your dataset: fast loading and processing, [stream enormous datasets](stream), [memory-mapping](https://huggingface.co/course/chapter5/4?fw=pt#the-magic-of-memory-mapping), and more. You can easily and rapidly create a dataset with 🤗 Datasets low-code approaches, reducing the time it takes to start training a model. In many cases, it is as easy as [dragging and dropping](upload_dataset#upload-with-the-hub-ui) your data files into a dataset repository on the Hub.\n\nIn this tutorial, you'll learn how to use 🤗 Datasets low-code methods for creating all types of datasets:\n\n- Folder-based builders for quickly creating an image or audio dataset\n- `from_` methods for creating datasets from local files\n\n## File-based builders\n\n🤗 Datasets supports many common formats such as `csv`, `json/jsonl`, `parquet`, `txt`.\n\nFor example it can read a dataset made up of one or several CSV files (in this case, pass your CSV files as a list):\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"csv\", data_files=\"my_file.csv\")\n```\n\nTo get the list of supported formats and code examples, follow this guide [here](https://huggingface.co/docs/datasets/loading#local-and-remote-files).\n\n## Folder-based builders\n\nThere are two folder-based builders, [`ImageFolder`] and [`AudioFolder`]. These are low-code methods for quickly creating an image or speech and audio dataset with several thousand examples. They are great for rapidly prototyping computer vision and speech models before scaling to a larger dataset. Folder-based builders takes your data and automatically generates the dataset's features, splits, and labels. Under the hood:\n\n- [`ImageFolder`] uses the [`~datasets.Image`] feature to decode an image file. 
Many image extension formats are supported, such as jpg and png, but other formats are also supported. You can check the complete [list](https://github.com/huggingface/datasets/blob/b5672a956d5de864e6f5550e493527d962d6ae55/src/datasets/packaged_modules/imagefolder/imagefolder.py#L39) of supported image extensions.\n- [`AudioFolder`] uses the [`~datasets.Audio`] feature to decode an audio file. Extensions such as wav, mp3, and even mp4 are supported, and you can check the complete [list](https://ffmpeg.org/ffmpeg-formats.html) of supported audio extensions. Decoding is done via ffmpeg.\n\nThe dataset splits are generated from the repository structure, and the label names are automatically inferred from the directory name.\n\nFor example, if your image dataset (it is the same for an audio dataset) is stored like this:\n\n```\npokemon/train/grass/bulbasaur.png\npokemon/train/fire/charmander.png\npokemon/train/water/squirtle.png\n\npokemon/test/grass/ivysaur.png\npokemon/test/fire/charmeleon.png\npokemon/test/water/wartortle.png\n```\n\nThen this is how the folder-based builder generates an example:\n\n<div class=\"flex justify-center\">\n  <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/folder-based-builder.png\" />\n</div>\n\nCreate the image dataset by specifying `imagefolder` in [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"imagefolder\", data_dir=\"/path/to/pokemon\")\n```\n\nAn audio dataset is created in the same way, except you specify `audiofolder` in [`load_dataset`] instead:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"audiofolder\", data_dir=\"/path/to/folder\")\n```\n\nAny additional information about your dataset, such as text captions or transcriptions, can be included with a `metadata.csv` file in the folder containing your dataset. 
The metadata file needs to have a `file_name` column that links the image or audio file to its corresponding metadata:\n\n```\nfile_name, text\nbulbasaur.png, There is a plant seed on its back right from the day this Pokémon is born.\ncharmander.png, It has a preference for hot things.\nsquirtle.png, When it retracts its long neck into its shell, it squirts out water with vigorous force.\n```\n\nTo learn more about each of these folder-based builders, check out the <a href=\"https://huggingface.co/docs/datasets/image_dataset#imagefolder\"><span class=\"underline decoration-yellow-400 decoration-2 font-semibold\">ImageFolder</span></a> or <a href=\"https://huggingface.co/docs/datasets/audio_dataset#audiofolder\"><span class=\"underline decoration-pink-400 decoration-2 font-semibold\">AudioFolder</span></a> guides.\n\n## From Python dictionaries\n\nYou can also create a dataset from data in Python dictionaries. There are two ways you can create a dataset using the `from_` methods:\n\n    * The [`~Dataset.from_generator`] method is the most memory-efficient way to create a dataset from a [generator](https://wiki.python.org/moin/Generators) due to a generator's iterative behavior. This is especially useful when you're working with a really large dataset that may not fit in memory, since the dataset is generated on disk progressively and then memory-mapped.\n\n    ```py\n    >>> from datasets import Dataset\n    >>> def gen():\n    ...     yield {\"pokemon\": \"bulbasaur\", \"type\": \"grass\"}\n    ...     yield {\"pokemon\": \"squirtle\", \"type\": \"water\"}\n    >>> ds = Dataset.from_generator(gen)\n    >>> ds[0]\n    {\"pokemon\": \"bulbasaur\", \"type\": \"grass\"}\n    ```\n\n    A generator-based [`IterableDataset`] needs to be iterated over with a `for` loop, for example:\n\n    ```py\n    >>> from datasets import IterableDataset\n    >>> ds = IterableDataset.from_generator(gen)\n    >>> for example in ds:\n    ...     
print(example)\n    {\"pokemon\": \"bulbasaur\", \"type\": \"grass\"}\n    {\"pokemon\": \"squirtle\", \"type\": \"water\"}\n    ```\n\n    * The [`~Dataset.from_dict`] method is a straightforward way to create a dataset from a dictionary:\n\n    ```py\n    >>> from datasets import Dataset\n    >>> ds = Dataset.from_dict({\"pokemon\": [\"bulbasaur\", \"squirtle\"], \"type\": [\"grass\", \"water\"]})\n    >>> ds[0]\n    {\"pokemon\": \"bulbasaur\", \"type\": \"grass\"}\n    ```\n\n    To create an image or audio dataset, chain the [`~Dataset.cast_column`] method with [`~Dataset.from_dict`] and specify the column and feature type. For example, to create an audio dataset:\n\n    ```py\n    >>> audio_dataset = Dataset.from_dict({\"audio\": [\"path/to/audio_1\", ..., \"path/to/audio_n\"]}).cast_column(\"audio\", Audio())\n    ```\n\nNow that you know how to create a dataset, consider sharing it on the Hub so the community can also benefit from your work! Go on to the next section to learn how to share your dataset.\n"
  },
  {
    "path": "docs/source/dataset_card.mdx",
    "content": "# Create a dataset card\n\nEach dataset should have a dataset card to promote responsible usage and inform users of any potential biases within the dataset.\nThis idea was inspired by the Model Cards proposed by [Mitchell, 2018](https://huggingface.co/papers/1810.03993).\nDataset cards help users understand a dataset's contents, the context for using the dataset, how it was created, and any other considerations a user should be aware of.\n\nCreating a dataset card is easy and can be done in just a few steps:\n\n1. Go to your dataset repository on the [Hub](https://hf.co/new-dataset) and click on **Create Dataset Card** to create a new `README.md` file in your repository.\n\n2. Use the **Metadata UI** to select the tags that describe your dataset. You can add a license, language, pretty_name, the task_categories, size_categories, and any other tags that you think are relevant. These tags help users discover and find your dataset on the Hub.\n\n<div class=\"flex justify-center\">\n    <img class=\"block dark:hidden\" src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/datasets-metadata-ui.png\"/>\n    <img class=\"hidden dark:block\" src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/datasets-metadata-ui-dark.png\"/>\n</div>\n\n  > [!TIP]\n  > For a complete, but not required, set of tag options you can also look at the [Dataset Card specifications](https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1). This'll have a few more tag options like `multilinguality` and `language_creators` which are useful but not absolutely necessary.\n\n3. Click on the **Import dataset card template** link to automatically create a template with all the relevant fields to complete. Fill out the template sections to the best of your ability. 
Take a look at the [Dataset Card Creation Guide](https://github.com/huggingface/datasets/blob/main/templates/README_guide.md) for more detailed information about what to include in each section of the card. For fields you are unable to complete, you can write **[More Information Needed]**.\n\n4. Once you're done, commit the changes to the `README.md` file and you'll see the completed dataset card on your repository.\n\nYAML also allows you to customize the way your dataset is loaded by [defining splits and/or configurations](./repository_structure#define-your-splits-and-subsets-in-yaml) without the need to write any code.\n\nFeel free to take a look at the [SNLI](https://huggingface.co/datasets/stanfordnlp/snli), [CNN/DailyMail](https://huggingface.co/datasets/abisee/cnn_dailymail), and [Allociné](https://huggingface.co/datasets/tblard/allocine) dataset cards as examples to help you get started.\n"
  },
  {
    "path": "docs/source/depth_estimation.mdx",
    "content": "# Depth estimation\n\nDepth estimation datasets are used to train a model to approximate the relative distance of every pixel in an\nimage from the camera, also known as depth. The applications enabled by these datasets primarily lie in areas like visual machine\nperception and perception in robotics. Example applications include mapping streets for self-driving cars. This guide will show you how to apply transformations\nto a depth estimation dataset.\n\nBefore you start, make sure you have up-to-date versions of `albumentations` installed:\n\n```bash\npip install -U albumentations \n```\n\n[Albumentations](https://albumentations.ai/) is a Python library for performing data augmentation\nfor computer vision. It supports various computer vision tasks such as image classification, object\ndetection, segmentation, and keypoint estimation.\n\nThis guide uses the [NYU Depth V2](https://huggingface.co/datasets/sayakpaul/nyu_depth_v2) dataset which is \ncomprised of video sequences from various indoor scenes, recorded by RGB and depth cameras. 
The dataset consists of scenes from 3 cities and provides images along with\ntheir depth maps as labels.\n\nLoad the `train` split of the dataset and take a look at an example:\n\n```py\n>>> from datasets import load_dataset\n\n>>> train_dataset = load_dataset(\"sayakpaul/nyu_depth_v2\", split=\"train\")\n>>> index = 17\n>>> example = train_dataset[index]\n>>> example\n{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=640x480>,\n 'depth_map': <PIL.TiffImagePlugin.TiffImageFile image mode=F size=640x480>}\n```\n\nThe dataset has two fields:\n\n* `image`: a PIL PNG image object with `uint8` data type.\n* `depth_map`: a PIL Tiff image object with `float32` data type which is the depth map of the image.\n\nHere the depth maps are using TIFF format as it supports a wide range of data types, including `float32` data.\nHowever it is mention-worthy that JPEG/PNG format can only store `uint8` or `uint16` data.\nTherefore if you have depth maps saved as JPEG/PNG, use the `Image(mode=\"F\")` type to load them as single channel `float32` like normal depth maps:\n\n```python\n>>> from datasets import Image\n\n>>> train_dataset = train_dataset.cast_column(\"depth_map\", Image(mode=\"F\"))\n```\n\nNext, check out an image with:\n\n```py\n>>> example[\"image\"]\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/depth_est_sample.png\">\n</div>\n\nBefore we look at the depth map, we need to first convert its data type to `uint8` using `.convert('RGB')` as PIL can't display `float32` images. Now take a look at its corresponding depth map:\n\n```py\n>>> example[\"depth_map\"].convert(\"RGB\")\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/depth_est_target.png\">\n</div>\n\nIt's all black! You'll need to add some color to the depth map to visualize it properly. 
To do that, either we can apply color automatically during display using `plt.imshow()` or create a colored depth map using `plt.cm` and then display it. In this example, we have used the latter one, as we can save/write the colored depth map later. (the utility below is taken from the [FastDepth repository](https://github.com/dwofk/fast-depth/blob/master/utils.py)).\n\n```py \n>>> import numpy as np\n>>> import matplotlib.pyplot as plt\n\n>>> cmap = plt.cm.viridis\n\n>>> def colored_depthmap(depth, d_min=None, d_max=None):\n...     if d_min is None:\n...         d_min = np.min(depth)\n...     if d_max is None:\n...         d_max = np.max(depth)\n...     depth_relative = (depth - d_min) / (d_max - d_min)\n...     return 255 * cmap(depth_relative)[:,:,:3]\n\n>>> def show_depthmap(depth_map):\n...    if not isinstance(depth_map, np.ndarray):\n...        depth_map = np.array(depth_map)\n...    if depth_map.ndim == 3:\n...        depth_map = depth_map.squeeze()\n\n...    d_min = np.min(depth_map)\n...    d_max = np.max(depth_map)\n...    depth_map = colored_depthmap(depth_map, d_min, d_max)\n\n...    plt.imshow(depth_map.astype(\"uint8\"))\n...    plt.axis(\"off\")\n...    plt.show()\n\n>>> show_depthmap(example[\"depth_map\"])\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/depth_est_target_viz.png\">\n</div>\n\nYou can also visualize several different images and their corresponding depth maps.\n\n```py\n>>> def merge_into_row(input_image, depth_target):\n...     if not isinstance(input_image, np.ndarray):\n...         input_image = np.array(input_image)\n...\n...     d_min = np.min(depth_target)\n...     d_max = np.max(depth_target)\n...     depth_target_col = colored_depthmap(depth_target, d_min, d_max)\n...     img_merge = np.hstack([input_image, depth_target_col])\n...\n...     
return img_merge\n\n>>> random_indices = np.random.choice(len(train_dataset), 9).tolist()\n>>> plt.figure(figsize=(15, 6))\n>>> for i, idx in enumerate(random_indices):\n...     example = train_dataset[idx]\n...     ax = plt.subplot(3, 3, i + 1)\n...     image_viz = merge_into_row(\n...         example[\"image\"], example[\"depth_map\"]\n...     )\n...     plt.imshow(image_viz.astype(\"uint8\"))\n...     plt.axis(\"off\")\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/depth_est_collage.png\">\n</div>\n\nNow apply some augmentations with `albumentations`. The augmentation transformations include:\n\n* Random horizontal flipping\n* Random cropping \n* Random brightness and contrast \n* Random gamma correction \n* Random hue saturation\n\n```py \n>>> import albumentations as A\n\n>>> crop_size = (448, 576)\n>>> transforms = [\n...     A.HorizontalFlip(p=0.5),\n...     A.RandomCrop(crop_size[0], crop_size[1]),\n...     A.RandomBrightnessContrast(),\n...     A.RandomGamma(),\n...     A.HueSaturationValue()\n... ]\n```\n\nAdditionally, define a mapping to better reflect the target key name.\n\n```py \n>>> additional_targets = {\"depth\": \"mask\"}\n>>> aug = A.Compose(transforms=transforms, additional_targets=additional_targets)\n```\n\nWith `additional_targets` defined, you can pass the target depth maps to the `depth` argument of `aug` instead of `mask`. You'll notice this change\nin the `apply_transforms()` function defined below.\n\nCreate a function to apply the transformation to the images as well as their depth maps:\n\n```py \n>>> def apply_transforms(examples):\n...     transformed_images, transformed_maps = [], []\n...     for image, depth_map in zip(examples[\"image\"], examples[\"depth_map\"]):\n...         image, depth_map = np.array(image), np.array(depth_map)\n...         transformed = aug(image=image, depth=depth_map)\n...         
transformed_images.append(transformed[\"image\"])\n...         transformed_maps.append(transformed[\"depth\"])\n...\n...     examples[\"pixel_values\"] = transformed_images\n...     examples[\"labels\"] = transformed_maps\n...     return examples\n```\n\nUse the [`~Dataset.set_transform`] function to apply the transformation on-the-fly to batches of the dataset to consume less disk space:\n\n```py\n>>> train_dataset.set_transform(apply_transforms)\n```\n\nYou can verify the transformation worked by indexing into the `pixel_values` and `labels` of an example image:\n\n```py\n>>> example = train_dataset[index]\n\n>>> plt.imshow(example[\"pixel_values\"])\n>>> plt.axis(\"off\")\n>>> plt.show()\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/depth_est_sample_aug.png\">\n</div>\n\nVisualize the same transformation on the image's corresponding depth map:\n\n```py \n>>> show_depthmap(example[\"labels\"])\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/depth_est_target_aug.png\">\n</div>\n\nYou can also visualize multiple training samples reusing the previous `random_indices`: \n\n```py \n>>> plt.figure(figsize=(15, 6))\n\n>>> for i, idx in enumerate(random_indices):\n...     ax = plt.subplot(3, 3, i + 1)\n...     example = train_dataset[idx]\n...     image_viz = merge_into_row(\n...         example[\"pixel_values\"], example[\"labels\"]\n...     )\n...     plt.imshow(image_viz.astype(\"uint8\"))\n...     plt.axis(\"off\")\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/depth_est_aug_collage.png\">\n</div>"
  },
  {
    "path": "docs/source/document_dataset.mdx",
    "content": "# Create a document dataset\n\nThis guide will show you how to create a document dataset with `PdfFolder` and some metadata. This is a no-code solution for quickly creating a document dataset with several thousand PDFs.\n\n> [!TIP]\n> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub.\n\n## PdfFolder\n\nThe `PdfFolder` is a dataset builder designed to quickly load a document dataset with several thousand PDFs without requiring you to write any code.\n\n> [!TIP]\n> 💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `PdfFolder` creates dataset splits based on your dataset repository structure.\n\n`PdfFolder` automatically infers the class labels of your dataset based on the directory name. Store your dataset in a directory structure like:\n\n```\nfolder/train/resume/0001.pdf\nfolder/train/resume/0002.pdf\nfolder/train/resume/0003.pdf\n\nfolder/train/invoice/0001.pdf\nfolder/train/invoice/0002.pdf\nfolder/train/invoice/0003.pdf\n```\n\nIf the dataset follows the `PdfFolder` structure, then you can load it directly with [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"path/to/folder\")\n```\n\nThis is equivalent to passing `pdffolder` manually in [`load_dataset`] and the directory in `data_dir`:\n\n```py\n>>> dataset = load_dataset(\"pdffolder\", data_dir=\"/path/to/folder\")\n```\n\nYou can also use `pdffolder` to load datasets involving multiple splits. 
To do so, your dataset directory should have the following structure:\n\n```\nfolder/train/resume/0001.pdf\nfolder/train/resume/0002.pdf\nfolder/test/invoice/0001.pdf\nfolder/test/invoice/0002.pdf\n```\n\n> [!WARNING]\n> If all PDF files are contained in a single directory or if they are not on the same level of directory structure, `label` column won't be added automatically. If you need it, set `drop_labels=False` explicitly.\n\n\nIf there is additional information you'd like to include about your dataset, like text captions or bounding boxes, add it as a `metadata.csv` file in your folder. This lets you quickly create datasets for different computer vision tasks like text captioning or object detection. You can also use a JSONL file `metadata.jsonl` or a Parquet file `metadata.parquet`.\n\n```\nfolder/train/metadata.csv\nfolder/train/0001.pdf\nfolder/train/0002.pdf\nfolder/train/0003.pdf\n```\n\nYour `metadata.csv` file must have a `file_name` or `*_file_name` field which links PDF files with their metadata:\n\n```csv\nfile_name,additional_feature\n0001.pdf,This is a first value of a text feature you added to your pdfs\n0002.pdf,This is a second value of a text feature you added to your pdfs\n0003.pdf,This is a third value of a text feature you added to your pdfs\n```\n\nor using `metadata.jsonl`:\n\n```jsonl\n{\"file_name\": \"0001.pdf\", \"additional_feature\": \"This is a first value of a text feature you added to your PDFs\"}\n{\"file_name\": \"0002.pdf\", \"additional_feature\": \"This is a second value of a text feature you added to your PDFs\"}\n{\"file_name\": \"0003.pdf\", \"additional_feature\": \"This is a third value of a text feature you added to your PDFs\"}\n```\n\nHere the `file_name` must be the name of the PDF file next to the metadata file. 
More generally, it must be the relative path from the directory containing the metadata to the PDF file.\n\nIt's possible to point to more than one PDF in each row in your dataset, for example if both your input and output are pdfs:\n\n```jsonl\n{\"input_file_name\": \"0001.pdf\", \"output_file_name\": \"0001_output.pdf\"}\n{\"input_file_name\": \"0002.pdf\", \"output_file_name\": \"0002_output.pdf\"}\n{\"input_file_name\": \"0003.pdf\", \"output_file_name\": \"0003_output.pdf\"}\n```\n\nYou can also define lists of PDFs. In that case you need to name the field `file_names` or `*_file_names`. Here is an example:\n\n```jsonl\n{\"pdfs_file_names\": [\"0001_part1.pdf\", \"0001_part2.pdf\"], \"label\": \"urgent\"}\n{\"pdfs_file_names\": [\"0002_part1.pdf\", \"0002_part2.pdf\"], \"label\": \"urgent\"}\n{\"pdfs_file_names\": [\"0003_part1.pdf\", \"0003_part2.pdf\"], \"label\": \"normal\"}\n```\n\n### OCR (Optical Character Recognition)\n\nOCR datasets have the text contained in a PDF. An example `metadata.csv` may look like:\n\n```csv\nfile_name,text\n0001.pdf,Invoice 1234 from 01/01/1970...\n0002.pdf,Software Engineer Resume. Education: ...\n0003.pdf,Attention is all you need. Abstract. The ...\n```\n\nLoad the dataset with `PdfFolder`, and it will create a `text` column for the PDF captions:\n\n```py\n>>> dataset = load_dataset(\"pdffolder\", data_dir=\"/path/to/folder\", split=\"train\")\n>>> dataset[0][\"text\"]\n\"Invoice 1234 from 01/01/1970...\"\n```\n\n### Upload dataset to the Hub\n\nOnce you've created a dataset, you can share it to the Hub using `huggingface_hub`, for example. 
Make sure you have the [huggingface_hub](https://huggingface.co/docs/huggingface_hub/index) library installed and you're logged in to your Hugging Face account (see the [Upload with Python tutorial](upload_dataset#upload-with-python) for more details).\n\nUpload your dataset with `huggingface_hub.HfApi.upload_folder`:\n\n```py\nfrom huggingface_hub import HfApi\napi = HfApi()\n\napi.upload_folder(\n    folder_path=\"/path/to/local/dataset\",\n    repo_id=\"username/my-cool-dataset\",\n    repo_type=\"dataset\",\n)\n```\n"
  },
  {
    "path": "docs/source/document_load.mdx",
    "content": "# Load pdf data\n\n> [!WARNING]\n> Pdf support is experimental and is subject to change.\n\nPdf datasets have [`Pdf`] type columns, which contain `pdfplumber` objects. \n\n> [!TIP]\n> To work with pdf datasets, you need to have the `pdfplumber` package installed. Check out the [installation](https://github.com/jsvine/pdfplumber#installation) guide to learn how to install it.\n\nWhen you load a pdf dataset and call the pdf column, the pdfs are decoded as `pdfplumber` Pdfs:\n\n```py\n>>> from datasets import load_dataset, Pdf\n\n>>> dataset = load_dataset(\"path/to/pdf/folder\", split=\"train\")\n>>> dataset[0][\"pdf\"]\n<pdfplumber.pdf.PDF at 0x1075bc320>\n```\n\n> [!WARNING]\n> Index into a pdf dataset using the row index first and then the `pdf` column - `dataset[0][\"pdf\"]` - to avoid creating all the pdf objects in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset.\n\nFor a guide on how to load any type of dataset, take a look at the <a class=\"underline decoration-sky-400 decoration-2 font-semibold\" href=\"./loading\">general loading guide</a>.\n\n## Read pages\n\nAccess pages directly from a pdf using the `.pages` attribute.\n\nThen you can use the `pdfplumber` functions to read texts, tables and images, e.g.:\n\n```python\n>>> pdf = dataset[0][\"pdf\"]\n>>> first_page = pdf.pages[0]\n>>> first_page\n<Page:1>\n>>> first_page.extract_text()\nDocling Technical Report\nVersion1.0\nChristophAuer MaksymLysak AhmedNassar MicheleDolfi NikolaosLivathinos\nPanosVagenas CesarBerrospiRamis MatteoOmenetti FabianLindlbauer\nKasperDinkla LokeshMishra YusikKim ShubhamGupta RafaelTeixeiradeLima\nValeryWeber LucasMorin IngmarMeijer ViktorKuropiatnyk PeterW.J.Staar\nAI4KGroup,IBMResearch\nRu¨schlikon,Switzerland\nAbstract\nThis technical report introduces Docling, an easy to use, self-contained, MIT-\nlicensed open-source package for PDF document conversion.\n...\n>>> first_page.images\nIn [24]: 
first_page.images\nOut[24]: \n[{'x0': 256.5,\n  'y0': 621.0,\n  'x1': 355.49519999999995,\n  'y1': 719.9952,\n  'width': 98.99519999999995,\n  'height': 98.99519999999995,\n  'name': 'Im1',\n  'stream': <PDFStream(44): raw=88980, {'Type': /'XObject', 'Subtype': /'Image', 'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'DCTDecode', 'Height': 1024, 'Length': 88980, 'Width': 1024}>,\n  'srcsize': (1024, 1024),\n  'imagemask': None,\n  'bits': 8,\n  'colorspace': [/'DeviceRGB'],\n  'mcid': None,\n  'tag': None,\n  'object_type': 'image',\n  'page_number': 1,\n  'top': 72.00480000000005,\n  'bottom': 171.0,\n  'doctop': 72.00480000000005}]\n>>> first_page.extract_tables()\n[]\n```\n\nYou can also load each page as a `PIL.Image`:\n\n```python\n>>> import PIL.Image\n>>> import io\n>>> first_page.to_image()\n<pdfplumber.display.PageImage at 0x107d68dd0>\n>>> buffer = io.BytesIO()\n>>> first_page.to_image().save(buffer)\n>>> img = PIL.Image.open(buffer)\n>>> img\n<PIL.PngImagePlugin.PngImageFile image mode=P size=612x792>\n```\n\nNote that you can pass `resolution=` to `.to_image()` to render the image in a higher resolution than the default (72 ppi).\n\n## Local files\n\nYou can load a dataset from the pdf path. 
Use the [`~Dataset.cast_column`] function to accept a column of pdf file paths, and decode it into a `pdfplumber` pdf with the [`Pdf`] feature:\n\n```py\n>>> from datasets import Dataset, Pdf\n\n>>> dataset = Dataset.from_dict({\"pdf\": [\"path/to/pdf_1\", \"path/to/pdf_2\", ..., \"path/to/pdf_n\"]}).cast_column(\"pdf\", Pdf())\n>>> dataset[0][\"pdf\"]\n<pdfplumber.pdf.PDF at 0x1657d0280>\n```\n\nIf you only want to load the underlying path to the pdf dataset without decoding the pdf object, set `decode=False` in the [`Pdf`] feature:\n\n```py\n>>> dataset = dataset.cast_column(\"pdf\", Pdf(decode=False))\n>>> dataset[0][\"pdf\"]\n{'bytes': None,\n 'path': 'path/to/pdf/folder/pdf0.pdf'}\n```\n\n## PdfFolder\n\nYou can also load a dataset with a `PdfFolder` dataset builder which does not require writing a custom dataloader. This makes `PdfFolder` ideal for quickly creating and loading pdf datasets with several thousand pdfs for different vision tasks. Your pdf dataset structure should look like this:\n\n```\nfolder/train/resume/0001.pdf\nfolder/train/resume/0002.pdf\nfolder/train/resume/0003.pdf\n\nfolder/train/invoice/0001.pdf\nfolder/train/invoice/0002.pdf\nfolder/train/invoice/0003.pdf\n```\n\nIf the dataset follows the `PdfFolder` structure, then you can load it directly with [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_name\")\n>>> # OR locally:\n>>> dataset = load_dataset(\"/path/to/folder\")\n```\n\nFor local datasets, this is equivalent to passing `pdffolder` manually in [`load_dataset`] and the directory in `data_dir`:\n\n```py\n>>> dataset = load_dataset(\"pdffolder\", data_dir=\"/path/to/folder\")\n```\n\nThen you can access the pdfs as `pdfplumber.pdf.PDF` objects:\n\n```\n>>> dataset[\"train\"][0]\n{\"pdf\": <pdfplumber.pdf.PDF at 0x161715e50>, \"label\": 0}\n\n>>> dataset[\"train\"][-1]\n{\"pdf\": <pdfplumber.pdf.PDF at 0x16170bd90>, \"label\": 1}\n```\n\nTo ignore the information in 
the metadata file, set `drop_metadata=True` in [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_with_metadata\", drop_metadata=True)\n```\n\nIf you don't have a metadata file, `PdfFolder` automatically infers the label name from the directory name.\nIf you want to drop automatically created labels, set `drop_labels=True`.\nIn this case, your dataset will only contain a pdf column:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_without_metadata\", drop_labels=True)\n```\n\nFinally the `filters` argument lets you load only a subset of the dataset, based on a condition on the label or the metadata. This is especially useful if the metadata is in Parquet format, since this format enables fast filtering. It is also recommended to use this argument with `streaming=True`, because by default the dataset is fully downloaded before filtering.\n\n```python\n>>> filters = [(\"label\", \"=\", 0)]\n>>> dataset = load_dataset(\"username/dataset_name\", streaming=True, filters=filters)\n```\n\n> [!TIP]\n> For more information about creating your own `PdfFolder` dataset, take a look at the [Create a document dataset](./document_dataset) guide.\n\n## Pdf decoding\n\nBy default, pdfs are decoded sequentially as pdfplumber `PDFs` when you iterate on a dataset.\nIt sequentially decodes the metadata of the pdfs, and doesn't read the pdf pages until you access them.\n\nHowever it is possible to speed up the dataset significantly using multithreaded decoding:\n\n```python\n>>> import os\n>>> num_threads = min(32, (os.cpu_count() or 1) + 4)\n>>> dataset = dataset.decode(num_threads=num_threads)\n>>> for example in dataset:  # up to 20 times faster !\n...     ...\n```\n\nYou can enable multithreading using `num_threads`. 
This is especially useful to speed up remote data streaming.\nHowever it can be slower than `num_threads=0` for local data on fast disks.\n\nIf you are not interested in the documents decoded as pdfplumber `PDFs` and would like to access the path/bytes instead, you can disable decoding:\n\n```python\n>>> dataset = dataset.decode(False)\n```\n\nNote: [`IterableDataset.decode`] is only available for streaming datasets at the moment.\n"
  },
  {
    "path": "docs/source/faiss_es.mdx",
    "content": "# Search index\n\n[FAISS](https://github.com/facebookresearch/faiss) and [Elasticsearch](https://www.elastic.co/elasticsearch/) enable searching for examples in a dataset. This can be useful when you want to retrieve specific examples from a dataset that are relevant to your NLP task. For example, if you are working on an Open Domain Question Answering task, you may want to only return examples that are relevant to answering your question.\n\nThis guide will show you how to build an index for your dataset that will allow you to search it.\n\n## FAISS\n\nFAISS retrieves documents based on the similarity of their vector representations. In this example, you will generate the vector representations with the [DPR](https://huggingface.co/transformers/model_doc/dpr.html) model.\n\n1. Download the DPR model from 🤗 Transformers:\n\n```py\n>>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer\n>>> import torch\n>>> torch.set_grad_enabled(False)\n>>> ctx_encoder = DPRContextEncoder.from_pretrained(\"facebook/dpr-ctx_encoder-single-nq-base\")\n>>> ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(\"facebook/dpr-ctx_encoder-single-nq-base\")\n```\n\n2. Load your dataset and compute the vector representations:\n\n```py\n>>> from datasets import load_dataset\n>>> ds = load_dataset('community-datasets/crime_and_punish', split='train[:100]')\n>>> ds_with_embeddings = ds.map(lambda example: {'embeddings': ctx_encoder(**ctx_tokenizer(example[\"line\"], return_tensors=\"pt\"))[0][0].numpy()})\n```\n\n3. Create the index with [`Dataset.add_faiss_index`]:\n\n```py\n>>> ds_with_embeddings.add_faiss_index(column='embeddings')\n```\n\n4. Now you can query your dataset with the `embeddings` index. 
Load the DPR Question Encoder, and search for a question with [`Dataset.get_nearest_examples`]:\n\n```py\n>>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer\n>>> q_encoder = DPRQuestionEncoder.from_pretrained(\"facebook/dpr-question_encoder-single-nq-base\")\n>>> q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(\"facebook/dpr-question_encoder-single-nq-base\")\n\n>>> question = \"Is it serious ?\"\n>>> question_embedding = q_encoder(**q_tokenizer(question, return_tensors=\"pt\"))[0][0].numpy()\n>>> scores, retrieved_examples = ds_with_embeddings.get_nearest_examples('embeddings', question_embedding, k=10)\n>>> retrieved_examples[\"line\"][0]\n'_that_ serious? It is not serious at all. It’s simply a fantasy to amuse\\r\\n'\n```\n\n5. You can access the index with [`Dataset.get_index`] and use it for special operations, e.g. query it using `range_search`:\n\n```py\n>>> faiss_index = ds_with_embeddings.get_index('embeddings').faiss_index\n>>> limits, distances, indices = faiss_index.range_search(x=question_embedding.reshape(1, -1), thresh=0.95)\n```\n\n6. When you are done querying, save the index on disk with [`Dataset.save_faiss_index`]:\n\n```py\n>>> ds_with_embeddings.save_faiss_index('embeddings', 'my_index.faiss')\n```\n\n7. Reload it at a later time with [`Dataset.load_faiss_index`]:\n\n```py\n>>> ds = load_dataset('community-datasets/crime_and_punish', split='train[:100]')\n>>> ds.load_faiss_index('embeddings', 'my_index.faiss')\n```\n\n## Elasticsearch\n\nUnlike FAISS, Elasticsearch retrieves documents based on exact matches. \n\nStart Elasticsearch on your machine, or see the [Elasticsearch installation guide](https://www.elastic.co/guide/en/elasticsearch/reference/current/setup.html) if you don't already have it installed.\n\n1. Load the dataset you want to index:\n\n```py\n>>> from datasets import load_dataset\n>>> squad = load_dataset('rajpurkar/squad', split='validation')\n```\n\n2. 
Build the index with [`Dataset.add_elasticsearch_index`]:\n\n```py\n>>> squad.add_elasticsearch_index(\"context\", host=\"localhost\", port=\"9200\")\n```\n\n3. Then you can query the `context` index with [`Dataset.get_nearest_examples`]:\n\n```py\n>>> query = \"machine\"\n>>> scores, retrieved_examples = squad.get_nearest_examples(\"context\", query, k=10)\n>>> retrieved_examples[\"title\"][0]\n'Computational_complexity_theory'\n```\n\n4. If you want to reuse the index, define the `es_index_name` parameter when you build the index:\n\n```py\n>>> from datasets import load_dataset\n>>> squad = load_dataset('rajpurkar/squad', split='validation')\n>>> squad.add_elasticsearch_index(\"context\", host=\"localhost\", port=\"9200\", es_index_name=\"hf_squad_val_context\")\n>>> squad.get_index(\"context\").es_index_name\nhf_squad_val_context\n```\n\n5. Reload it later with the index name when you call [`Dataset.load_elasticsearch_index`]:\n\n```py\n>>> from datasets import load_dataset\n>>> squad = load_dataset('rajpurkar/squad', split='validation')\n>>> squad.load_elasticsearch_index(\"context\", host=\"localhost\", port=\"9200\", es_index_name=\"hf_squad_val_context\")\n>>> query = \"machine\"\n>>> scores, retrieved_examples = squad.get_nearest_examples(\"context\", query, k=10)\n```\n\nFor more advanced Elasticsearch usage, you can specify your own configuration with custom settings:\n\n```py\n>>> import elasticsearch as es\n>>> import elasticsearch.helpers\n>>> from elasticsearch import Elasticsearch\n>>> es_client = Elasticsearch([{\"host\": \"localhost\", \"port\": \"9200\"}])  # default client\n>>> es_config = {\n...     \"settings\": {\n...         \"number_of_shards\": 1,\n...         \"analysis\": {\"analyzer\": {\"stop_standard\": {\"type\": \"standard\", \"stopwords\": \"_english_\"}}},\n...     },\n...     \"mappings\": {\"properties\": {\"text\": {\"type\": \"text\", \"analyzer\": \"standard\", \"similarity\": \"BM25\"}}},\n... 
}  # default config\n>>> es_index_name = \"hf_squad_context\"  # name of the index in Elasticsearch\n>>> squad.add_elasticsearch_index(\"context\", es_client=es_client, es_config=es_config, es_index_name=es_index_name)\n```\n"
  },
  {
    "path": "docs/source/filesystems.mdx",
    "content": "# Cloud storage\n\n## Hugging Face Datasets\n\nThe Hugging Face Dataset Hub is home to a growing collection of datasets that span a variety of domains and tasks.\n\nIt's more than a cloud storage: the Dataset Hub is a platform that provides data versioning thanks to git, as well as a Dataset Viewer to explore the data, making it a great place to store AI-ready datasets.\n\nThis guide shows how to import data from other cloud storage using the filesystems implementations from `fsspec`.\n\n## Hugging Face Storage Buckets\n\nStorage Buckets are a repo type on the Hugging Face Hub providing S3-like object storage, powered by the Xet storage backend. Unlike Git-based dataset repositories, buckets are non-versioned and mutable, designed for use cases where you need simple, fast storage such as logs, intermediate artifacts, or any large collection of files that doesn’t need version control.\n\n## Import data from a cloud storage\n\nMost cloud storage providers have a `fsspec` FileSystem implementation, which is useful to import data from any cloud provider with the same code.\nThis is especially useful to publish datasets on Hugging Face.\n\nTake a look at the following table for some examples of supported cloud storage providers:\n\n| Storage provider     | Filesystem implementation                                     |\n|----------------------|---------------------------------------------------------------|\n| Amazon S3            | [s3fs](https://s3fs.readthedocs.io/en/latest/)                |\n| Google Cloud Storage | [gcsfs](https://gcsfs.readthedocs.io/en/latest/)              |\n| Azure Blob/DataLake  | [adlfs](https://github.com/fsspec/adlfs)                      |\n| Oracle Cloud Storage | [ocifs](https://ocifs.readthedocs.io/en/latest/)              |\n\nThis guide will show you how to import data files from any cloud storage and save a dataset on Hugging Face.\n\nLet's say we want to publish a dataset on Hugging Face from Parquet files from a 
cloud storage.\n\nFirst, instantiate your cloud storage filesystem and list the files you'd like to import:\n\n```python\n>>> import fsspec\n>>> fs = fsspec.filesystem(\"...\")  # s3 / gcs / abfs / adl / oci / ...\n>>> data_dir = \"path/to/my/data/\"\n>>> pattern = \"*.parquet\"\n>>> data_files = fs.glob(data_dir + pattern)\n[\"path/to/my/data/0001.parquet\", \"path/to/my/data/0002.parquet\", ...]\n```\n\n### Publish a Dataset\n\nThen you can create a dataset on Hugging Face and import the data files, using for example:\n\n```python\n>>> from huggingface_hub import create_repo, upload_folder\n>>> from tqdm.auto import tqdm\n>>> from itertools import batched\n>>> from tempfile import TemporaryDirectory\n>>> import os\n>>> destination_dataset = \"username/my-dataset\"\n>>> create_repo(destination_dataset, repo_type=\"dataset\")\n>>> batch_size = 100\n>>> for data_files in batched(tqdm(fs.glob(data_dir + pattern)), batch_size):\n...     with TemporaryDirectory() as tmp_dir:\n...         tmp_files = [os.path.join(tmp_dir, x[len(data_dir):]) for x in data_files]\n...         fs.download(data_files, tmp_files)\n...         upload_folder(\n...             repo_id=destination_dataset,\n...             folder_path=tmp_dir,\n...             repo_type=\"dataset\",\n...         
)\n```\n\nCheck out the [huggingface_hub](https://huggingface.co/docs/huggingface_hub) documentation on files uploads [here](https://huggingface.co/docs/huggingface_hub/en/guides/upload) if you're looking for more upload options.\n\nFinally you can now load the dataset using 🤗 Datasets:\n\n```python\n>>> from datasets import load_dataset\n>>> ds = load_dataset(\"username/my-dataset\")\n```\n\n### Import raw data to Storage Buckets\n\nAlternatively if you wish not to publish a dataset but simply import raw data files in a Hugging Face [Storage Bucket](https://huggingface.co/docs/hub/storage-buckets), you can use:\n\n```python\n>>> from huggingface_hub import create_bucket, sync_bucket\n>>> from tqdm.auto import tqdm\n>>> from itertools import batched\n>>> from tempfile import TemporaryDirectory\n>>> import os\n>>> create_bucket(\"username/my-bucket\")\n>>> bucket_files_location = \"hf://buckets/username/my-bucket/path/to/raw/files\"\n>>> batch_size = 100\n>>> for data_files in batched(tqdm(fs.glob(data_dir + pattern)), batch_size):\n...     with TemporaryDirectory() as tmp_dir:\n...         tmp_files = [os.path.join(tmp_dir, x[len(data_dir):]) for x in data_files]\n...         fs.download(data_files, tmp_files)\n...         sync_bucket(tmp_dir, bucket_files_location)\n```\n\nCheck out the [huggingface_hub](https://huggingface.co/docs/huggingface_hub) documentation on Storage Buckets [here](https://huggingface.co/docs/hub/storage-buckets) if you're looking for more upload options.\n\nThen later you can load the raw files using 🤗 Datasets, transform them and upload the final AI-ready datasets, e.g. 
in a streaming manner:\n\nIf the files are in a format supported by 🤗 Datasets:\n\n```python\n>>> from datasets import load_dataset\n>>> ds = load_dataset(bucket_files_location, streaming=True)\n>>> ds = ds.map(...).filter(...)\n>>> ds.push_to_hub(\"username/my-dataset\", num_proc=4)\n>>> # and later\n>>> ds = load_dataset(\"username/my-dataset\")\n```\n\nOtherwise you can use your own file parsing function:\n\n```python\n>>> from datasets import IterableDataset\n>>> from huggingface_hub import hffs\n>>> data_files = hffs.find(bucket_files_location)\n>>> num_shards = 1024  # For parallelism. PS: every shard should fit in RAM\n>>> ds = IterableDataset.from_dict({\"data_file\": data_files}, num_shards=num_shards)\n>>> def parse_data_files(data_files):\n...     ...\n...     return {\"col_1\": [...], \"col_2\": [...]}\n>>> ds = ds.map(parse_data_files, batched=True, input_column=[\"data_file\"])\n>>> ds.push_to_hub(\"username/my-dataset\", num_proc=4)\n>>> # and later\n>>> ds = load_dataset(\"username/my-dataset\")\n```\n"
  },
  {
    "path": "docs/source/how_to.md",
    "content": "# Overview\n\nThe how-to guides offer a more comprehensive overview of all the tools 🤗 Datasets offers and how to use them. This will help you tackle messier real-world datasets where you may need to manipulate the dataset structure or content to get it ready for training.\n\nThe guides assume you are familiar and comfortable with the 🤗 Datasets basics. We recommend newer users check out our [tutorials](tutorial) first.\n\n> [!TIP]\n> Interested in learning more? Take a look at [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course!\n\nThe guides are organized into six sections:\n\n- <span class=\"underline decoration-sky-400 decoration-2 font-semibold\">General usage</span>: Functions for general dataset loading and processing. The functions shown in this section are applicable across all dataset modalities.\n- <span class=\"underline decoration-pink-400 decoration-2 font-semibold\">Audio</span>: How to load, process, and share audio datasets.\n- <span class=\"underline decoration-yellow-400 decoration-2 font-semibold\">Vision</span>: How to load, process, and share image and video datasets.\n- <span class=\"underline decoration-green-400 decoration-2 font-semibold\">Text</span>: How to load, process, and share text datasets.\n- <span class=\"underline decoration-orange-400 decoration-2 font-semibold\">Tabular</span>: How to load, process, and share tabular datasets.\n- <span class=\"underline decoration-indigo-400 decoration-2 font-semibold\">Dataset repository</span>: How to share and upload a dataset to the <a href=\"https://huggingface.co/datasets\">Hub</a>.\n\nIf you have any questions about 🤗 Datasets, feel free to join and ask the community on our [forum](https://discuss.huggingface.co/c/datasets/10).\n"
  },
  {
    "path": "docs/source/image_classification.mdx",
    "content": "# Image classification\n\nImage classification datasets are used to train a model to classify an entire image. There are a wide variety of applications enabled by these datasets such as identifying endangered wildlife species or screening for disease in medical images. This guide will show you how to apply transformations to an image classification dataset.\n\nBefore you start, make sure you have up-to-date versions of `albumentations` and `cv2` installed:\n\n```bash\npip install -U albumentations opencv-python\n```\n\nThis guide uses the [Beans](https://huggingface.co/datasets/beans) dataset for identifying the type of bean plant disease based on an image of its leaf.\n\nLoad the dataset and take a look at an example:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"AI-Lab-Makerere/beans\")\n>>> dataset[\"train\"][10]\n{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x500 at 0x7F8D2F4D7A10>,\n 'image_file_path': '/root/.cache/huggingface/datasets/downloads/extracted/b0a21163f78769a2cf11f58dfc767fb458fc7cea5c05dccc0144a2c0f0bc1292/train/angular_leaf_spot/angular_leaf_spot_train.204.jpg',\n 'labels': 0}\n```\n\nThe dataset has three fields:\n\n* `image`: a PIL image object.\n* `image_file_path`: the path to the image file.\n* `labels`: the label or category of the image.\n\nNext, check out an image:\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/img_clf.png\">\n</div>\n\nNow apply some augmentations with `albumentations`. You'll randomly crop the image, flip it horizontally, and adjust its brightness.\n\n```py\n>>> import cv2\n>>> import albumentations\n>>> import numpy as np\n\n>>> transform = albumentations.Compose([\n...     albumentations.RandomCrop(width=256, height=256),\n...     albumentations.HorizontalFlip(p=0.5),\n...     albumentations.RandomBrightnessContrast(p=0.2),\n... 
])\n```\n\nCreate a function to apply the transformation to the images:\n\n```py\n>>> def transforms(examples):\n...     examples[\"pixel_values\"] = [\n...         transform(image=np.array(image))[\"image\"] for image in examples[\"image\"]\n...     ]\n... \n...     return examples\n```\n\nUse the [`~Dataset.set_transform`] function to apply the transformation on-the-fly to batches of the dataset to consume less disk space:\n\n```py\n>>> dataset.set_transform(transforms)\n```\n\nYou can verify the transformation worked by indexing into the `pixel_values` of the first example:\n\n```py\n>>> import numpy as np\n>>> import matplotlib.pyplot as plt\n\n>>> img = dataset[\"train\"][0][\"pixel_values\"]\n>>> plt.imshow(img)\n```\n\n<div class=\"flex justify-center\">\n    <img class=\"block dark:hidden\" src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/img_clf_aug.png\">\n    <img class=\"hidden dark:block\" src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/img_clf_aug.png\"/>\n</div>\n\n> [!TIP]\n> Now that you know how to process a dataset for image classification, learn\n> [how to train an image classification model](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)\n> and use it for inference."
  },
  {
    "path": "docs/source/image_dataset.mdx",
    "content": "# Create an image dataset\n\nThere are two methods for creating and sharing an image dataset. This guide will show you how to:\n\n* Create an image dataset from local files in python with [`Dataset.push_to_hub`]. This is an easy way that requires only a few steps in python.\n\n* Create an image dataset with `ImageFolder` and some metadata. This is a no-code solution for quickly creating an image dataset with several thousand images.\n\n> [!TIP]\n> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub.\n\n## ImageFolder\n\nThe `ImageFolder` is a dataset builder designed to quickly load an image dataset with several thousand images without requiring you to write any code.\n\n> [!TIP]\n> 💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `ImageFolder` creates dataset splits based on your dataset repository structure.\n\n`ImageFolder` automatically infers the class labels of your dataset based on the directory name. Store your dataset in a directory structure like:\n\n```\nfolder/train/dog/golden_retriever.png\nfolder/train/dog/german_shepherd.png\nfolder/train/dog/chihuahua.png\n\nfolder/train/cat/maine_coon.png\nfolder/train/cat/bengal.png\nfolder/train/cat/birman.png\n```\n\nIf the dataset follows the `ImageFolder` structure, then you can load it directly with [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"path/to/folder\")\n```\n\nThis is equivalent to passing `imagefolder` manually in [`load_dataset`] and the directory in `data_dir`:\n\n```py\n>>> dataset = load_dataset(\"imagefolder\", data_dir=\"/path/to/folder\")\n```\n\nYou can also use `imagefolder` to load datasets involving multiple splits. 
To do so, your dataset directory should have the following structure:\n\n```\nfolder/train/dog/golden_retriever.png\nfolder/train/cat/maine_coon.png\nfolder/test/dog/german_shepherd.png\nfolder/test/cat/bengal.png\n```\n\n> [!WARNING]\n> If all image files are contained in a single directory or if they are not on the same level of directory structure, `label` column won't be added automatically. If you need it, set `drop_labels=False` explicitly.\n\n\nIf there is additional information you'd like to include about your dataset, like text captions or bounding boxes, add it as a `metadata.csv` file in your folder. This lets you quickly create datasets for different computer vision tasks like text captioning or object detection. You can also use a JSONL file `metadata.jsonl` or a Parquet file `metadata.parquet`.\n\n```\nfolder/train/metadata.csv\nfolder/train/0001.png\nfolder/train/0002.png\nfolder/train/0003.png\n```\n\nYou can also zip your images, and in this case each zip should contain both the images and the metadata\n\n```\nfolder/train.zip\nfolder/test.zip\nfolder/validation.zip\n```\n\nYour `metadata.csv` file must have a `file_name` or `*_file_name` field which links image files with their metadata:\n\n```csv\nfile_name,additional_feature\n0001.png,This is a first value of a text feature you added to your images\n0002.png,This is a second value of a text feature you added to your images\n0003.png,This is a third value of a text feature you added to your images\n```\n\nor using `metadata.jsonl`:\n\n```jsonl\n{\"file_name\": \"0001.png\", \"additional_feature\": \"This is a first value of a text feature you added to your images\"}\n{\"file_name\": \"0002.png\", \"additional_feature\": \"This is a second value of a text feature you added to your images\"}\n{\"file_name\": \"0003.png\", \"additional_feature\": \"This is a third value of a text feature you added to your images\"}\n```\n\nHere the `file_name` must be the name of the image file next to the metadata 
file. More generally, it must be the relative path from the directory containing the metadata to the image file.\n\nIt's possible to point to more than one image in each row in your dataset, for example if both your input and output are images:\n\n```jsonl\n{\"input_file_name\": \"0001.png\", \"output_file_name\": \"0001_output.png\"}\n{\"input_file_name\": \"0002.png\", \"output_file_name\": \"0002_output.png\"}\n{\"input_file_name\": \"0003.png\", \"output_file_name\": \"0003_output.png\"}\n```\n\nYou can also define lists of images. In that case you need to name the field `file_names` or `*_file_names`. Here is an example:\n\n```jsonl\n{\"frames_file_names\": [\"0001_t0.png\", \"0001_t1.png\"], \"label\": \"moving_up\"}\n{\"frames_file_names\": [\"0002_t0.png\", \"0002_t1.png\"], \"label\": \"moving_down\"}\n{\"frames_file_names\": [\"0003_t0.png\", \"0003_t1.png\"], \"label\": \"moving_right\"}\n```\n\n### Image captioning\n\nImage captioning datasets have text describing an image. An example `metadata.csv` may look like:\n\n```csv\nfile_name,text\n0001.png,This is a golden retriever playing with a ball\n0002.png,A german shepherd\n0003.png,One chihuahua\n```\n\nLoad the dataset with `ImageFolder`, and it will create a `text` column for the image captions:\n\n```py\n>>> dataset = load_dataset(\"imagefolder\", data_dir=\"/path/to/folder\", split=\"train\")\n>>> dataset[0][\"text\"]\n\"This is a golden retriever playing with a ball\"\n```\n\n### Object detection\n\nObject detection datasets have bounding boxes and categories identifying objects in an image. 
An example `metadata.jsonl` may look like:\n\n```jsonl\n{\"file_name\": \"0001.png\", \"objects\": {\"bbox\": [[302.0, 109.0, 73.0, 52.0]], \"categories\": [0]}}\n{\"file_name\": \"0002.png\", \"objects\": {\"bbox\": [[810.0, 100.0, 57.0, 28.0]], \"categories\": [1]}}\n{\"file_name\": \"0003.png\", \"objects\": {\"bbox\": [[160.0, 31.0, 248.0, 616.0], [741.0, 68.0, 202.0, 401.0]], \"categories\": [2, 2]}}\n```\n\nLoad the dataset with `ImageFolder`, and it will create an `objects` column with the bounding boxes and the categories:\n\n```py\n>>> dataset = load_dataset(\"imagefolder\", data_dir=\"/path/to/folder\", split=\"train\")\n>>> dataset[0][\"objects\"]\n{\"bbox\": [[302.0, 109.0, 73.0, 52.0]], \"categories\": [0]}\n```\n\n### Upload dataset to the Hub\n\nOnce you've created a dataset, you can share it to the Hub with the [`~datasets.DatasetDict.push_to_hub`] method. Make sure you have the [huggingface_hub](https://huggingface.co/docs/huggingface_hub/index) library installed and you're logged in to your Hugging Face account (see the [Upload with Python tutorial](upload_dataset#upload-with-python) for more details).\n\nUpload your dataset with [`~datasets.DatasetDict.push_to_hub`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"imagefolder\", data_dir=\"/path/to/folder\", split=\"train\")\n>>> dataset.push_to_hub(\"stevhliu/my-image-captioning-dataset\")\n```\n\n## WebDataset\n\nThe [WebDataset](https://github.com/webdataset/webdataset) format is based on TAR archives and is suitable for big image datasets.\nIndeed you can group your images in TAR archives (e.g. 
1GB of images per TAR archive) and have thousands of TAR archives:\n\n```\nfolder/train/00000.tar\nfolder/train/00001.tar\nfolder/train/00002.tar\n...\n```\n\nIn the archives, each example is made of files sharing the same prefix:\n\n```\ne39871fd9fd74f55.jpg\ne39871fd9fd74f55.json\nf18b91585c4d3f3e.jpg\nf18b91585c4d3f3e.json\nede6e66b2fb59aab.jpg\nede6e66b2fb59aab.json\ned600d57fcee4f94.jpg\ned600d57fcee4f94.json\n...\n```\n\nYou can store your image labels, captions, or bounding boxes in JSON or text files, for example.\n\nLoad your WebDataset and it will create one column per file suffix (here \"jpg\" and \"json\"):\n\n```python\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"webdataset\", data_dir=\"/path/to/folder\", split=\"train\")\n>>> dataset[0][\"json\"]\n{\"bbox\": [[302.0, 109.0, 73.0, 52.0]], \"categories\": [0]}\n```\n\nIt's also possible to have several images per example like this:\n\n```\ne39871fd9fd74f55.input.jpg\ne39871fd9fd74f55.output.jpg\ne39871fd9fd74f55.json\nf18b91585c4d3f3e.input.jpg\nf18b91585c4d3f3e.output.jpg\nf18b91585c4d3f3e.json\n...\n```\n\nFor more details on the WebDataset format and the Python library, please check the [WebDataset documentation](https://webdataset.github.io/webdataset).\n\n## Lance\n\n[Lance](https://lance.org) is an open multimodal lakehouse table format. Lance tables can natively store not only text and scalar values,\nbut also large binary objects (blobs) such as images, audio, and video alongside your tabular data.\n\nStarting from image files on disk plus associated metadata (for example, captions and dimensions), you can write a self-contained Lance dataset to a\nlocal `*.lance` directory. 
The resulting table can store your metadata columns alongside an `image` column containing the encoded image bytes.\n\nFor example, you might start with metadata like:\n\n```text\n{'caption': 'Cordelia and Dudley on their wedding  day last year', 'height': 315, 'width': 233}\n{'caption': 'Statistics on challenges for automation in 2021', 'height': 299, 'width': 701}\n```\n\nYou can define a `pyarrow` schema for your metadata and image bytes, build a table, and write it as a Lance dataset:\n\n```python\nimport lance\nimport pyarrow as pa\n\nschema = pa.schema(\n    [\n        pa.field(\"caption\", pa.utf8()),\n        pa.field(\"height\", pa.int32()),\n        pa.field(\"width\", pa.int32()),\n        # ... add any additional metadata columns you want here ...\n        pa.field(\"image\", pa.binary()),\n    ]\n)\n\n# Provide image files alongside metadata\nrows = [\n    {\n        \"image_path\": \"/path/to/images/0001.jpg\",\n        \"caption\": \"Cordelia and Dudley on their wedding  day last year\",\n        \"height\": 315,\n        \"width\": 233,\n    },\n    {\n        \"image_path\": \"/path/to/images/0002.jpg\",\n        \"caption\": \"Statistics on challenges for automation in 2021\",\n        \"height\": 299,\n        \"width\": 701,\n    },\n]\n\nimage_bytes = []\nfor r in rows:\n    with open(r[\"image_path\"], \"rb\") as f:\n        image_bytes.append(f.read())\n\ntable = pa.table(\n    {\n        \"caption\": [r[\"caption\"] for r in rows],\n        \"height\": [r[\"height\"] for r in rows],\n        \"width\": [r[\"width\"] for r in rows],\n        \"image\": image_bytes,\n    },\n    schema=schema,\n)\n\nds = lance.write_dataset(\n    table,\n    \"./images.lance\",\n    schema=schema,\n    mode=\"create\",\n)\n```\n\nHere's a representative view of what a Lance table storing images might look like (the `image` column contains encoded 
bytes):\n\n```text\n+-----------------------------------------------+--------+-------+-----+------------------------------+\n| caption                                       | height | width | ... | image                        |\n+-----------------------------------------------+--------+-------+-----+------------------------------+\n| \"Cordelia and Dudley on their wedding ...\"    | 315    | 233   | ... | b\"\\\\xff\\\\xd8\\\\xff...\\\\xd9\"   |\n| \"Statistics on challenges for automation ...\" | 299    | 701   | ... | b\"\\\\xff\\\\xd8\\\\xff...\\\\xd9\"   |\n+-----------------------------------------------+--------+-------+-----+------------------------------+\n```\n\nUsing this approach, you can store arbitrarily large image datasets in Lance. The resulting `images.lance/` directory with\nits `*.lance` files can be uploaded to the Hugging Face Hub, just like the other examples above. See the `lance-format/laion-1m`\n[on the Hub](https://huggingface.co/datasets/lance-format/laion-1m) dataset for an example of a Lance image dataset.\n\nFor more details on working with Lance datasets, see the [Lance documentation](https://lance.org)."
  },
  {
    "path": "docs/source/image_load.mdx",
    "content": "# Load image data\n\nImage datasets have [`Image`] type columns, which contain PIL objects. \n\n> [!TIP]\n> To work with image datasets, you need to have the `vision` dependency installed. Check out the [installation](./installation#vision) guide to learn how to install it.\n\nWhen you load an image dataset and call the image column, the images are decoded as PIL Images:\n\n```py\n>>> from datasets import load_dataset, Image\n\n>>> dataset = load_dataset(\"AI-Lab-Makerere/beans\", split=\"train\")\n>>> dataset[0][\"image\"]\n```\n\n> [!WARNING]\n> Index into an image dataset using the row index first and then the `image` column - `dataset[0][\"image\"]` - to avoid decoding and resampling all the image objects in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset.\n\nFor a guide on how to load any type of dataset, take a look at the <a class=\"underline decoration-sky-400 decoration-2 font-semibold\" href=\"./loading\">general loading guide</a>.\n\n## Local files\n\nYou can load a dataset from the image path. 
Use the [`~Dataset.cast_column`] function to accept a column of image file paths, and decode it into a PIL image with the [`Image`] feature:\n\n```py\n>>> from datasets import Dataset, Image\n\n>>> dataset = Dataset.from_dict({\"image\": [\"path/to/image_1\", \"path/to/image_2\", ..., \"path/to/image_n\"]}).cast_column(\"image\", Image())\n>>> dataset[0][\"image\"]\n<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1200x215 at 0x15E6D7160>\n```\n\nIf you only want to load the underlying path to the image dataset without decoding the image object, set `decode=False` in the [`Image`] feature:\n\n```py\n>>> dataset = load_dataset(\"AI-Lab-Makerere/beans\", split=\"train\").cast_column(\"image\", Image(decode=False))\n>>> dataset[0][\"image\"]\n{'bytes': None,\n 'path': '/root/.cache/huggingface/datasets/downloads/extracted/b0a21163f78769a2cf11f58dfc767fb458fc7cea5c05dccc0144a2c0f0bc1292/train/bean_rust/bean_rust_train.29.jpg'}\n```\n\n## ImageFolder\n\nYou can also load a dataset with an `ImageFolder` dataset builder which does not require writing a custom dataloader. This makes `ImageFolder` ideal for quickly creating and loading image datasets with several thousand images for different vision tasks. 
Your image dataset structure should look like this:\n\n```\nfolder/train/dog/golden_retriever.png\nfolder/train/dog/german_shepherd.png\nfolder/train/dog/chihuahua.png\n\nfolder/train/cat/maine_coon.png\nfolder/train/cat/bengal.png\nfolder/train/cat/birman.png\n```\n\nAlternatively it should have metadata, for example:\n\n```\nfolder/train/metadata.csv\nfolder/train/0001.png\nfolder/train/0002.png\nfolder/train/0003.png\n```\n\nIf the dataset follows the `ImageFolder` structure, then you can load it directly with [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_name\")\n>>> # OR locally:\n>>> dataset = load_dataset(\"/path/to/folder\")\n```\n\nFor local datasets, this is equivalent to passing `imagefolder` manually in [`load_dataset`] and the directory in `data_dir`:\n\n```py\n>>> dataset = load_dataset(\"imagefolder\", data_dir=\"/path/to/folder\")\n```\n\nThen you can access the images as `PIL.Image` objects:\n\n```py\n>>> dataset[\"train\"][0]\n{\"image\": <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1200x215 at 0x15E6D7160>, \"label\": 0}\n\n>>> dataset[\"train\"][-1]\n{\"image\": <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1200x215 at 0x15E8DAD30>, \"label\": 1}\n```\n\nTo ignore the information in the metadata file, set `drop_metadata=True` in [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_with_metadata\", drop_metadata=True)\n```\n\nIf you don't have a metadata file, `ImageFolder` automatically infers the label name from the directory name.\nIf you want to drop automatically created labels, set `drop_labels=True`.\nIn this case, your dataset will only contain an image column:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_without_metadata\", drop_labels=True)\n```\n\nFinally the `filters` argument lets you load only a subset of the dataset, based on a condition 
on the label or the metadata. This is especially useful if the metadata is in Parquet format, since this format enables fast filtering. It is also recommended to use this argument with `streaming=True`, because by default the dataset is fully downloaded before filtering.\n\n```python\n>>> filters = [(\"label\", \"=\", 0)]\n>>> dataset = load_dataset(\"username/dataset_name\", streaming=True, filters=filters)\n```\n\n> [!TIP]\n> For more information about creating your own `ImageFolder` dataset, take a look at the [Create an image dataset](./image_dataset) guide.\n\n\n## WebDataset\n\nThe [WebDataset](https://github.com/webdataset/webdataset) format is based on a folder of TAR archives and is suitable for big image datasets.\nBecause of their size, WebDatasets are generally loaded in streaming mode (using `streaming=True`).\n\nYou can load a WebDataset like this:\n\n```python\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"webdataset\", data_dir=\"/path/to/folder\", streaming=True)\n```\n\n## Lance\n\n[Lance](https://lance.org) is an open multimodal lakehouse table format. Lance tables can natively store not only text and scalar values,\nbut also large binary objects (blobs) such as images, audio, and video alongside your tabular data.\n\nLance keeps your metadata and image blobs together in one place, while still letting you efficiently scan only the metadata columns you care about\nwithout loading image bytes. 
When you're ready, you can fetch a small subset of rows (including the image blobs) and write them directly to files on\nyour local filesystem.\n\n```python\nfrom pathlib import Path\n\nfrom datasets import load_dataset\n\n# Return as a Hugging Face dataset\nds = load_dataset(\n    \"lance-format/laion-1m\",\n    split=\"train\",\n    streaming=True\n)\n\ndir_name = \"laion_samples\"\nPath(dir_name).mkdir(exist_ok=True)\n\nfor idx, row in enumerate(ds.take(3)):\n    with open(f\"{dir_name}/{idx}.jpg\", \"wb\") as f:\n        f.write(row[\"image\"])\n```\n\nIn this example, the `image` column contains the encoded image bytes, so you can write them directly to `.jpg` files.\n\n> [!NOTE]\n> The `datasets` API doesn't currently push down operations to the Lance table, so for larger datasets it may be slow.\n> For now, you'll get much better performance using the `lance` Python package directly. See the\n> documentation on [the Hub](https://huggingface.co/docs/datasets-lance) for examples on usage.\n\n## Image decoding\n\nBy default, images are decoded sequentially as `PIL.Images` when you iterate on a dataset.\nHowever it is possible to speed up the dataset significantly using multithreaded decoding:\n\n```python\n>>> import os\n>>> num_threads = min(32, (os.cpu_count() or 1) + 4)\n>>> dataset = dataset.decode(num_threads=num_threads)\n>>> for example in dataset:  # up to 20 times faster !\n...     ...\n```\n\nYou can enable multithreading using `num_threads`. This is especially useful to speed up remote data streaming.\nHowever it can be slower than `num_threads=0` for local data on fast disks.\n\nIf you are not interested in the images decoded as `PIL.Images` and would like to access the path/bytes instead, you can disable decoding:\n\n```python\n>>> dataset = dataset.decode(False)\n```\n\nNote: [`IterableDataset.decode`] is only available for streaming datasets at the moment.\n"
  },
  {
    "path": "docs/source/image_process.mdx",
    "content": "# Process image data\n\nThis guide shows specific methods for processing image datasets. Learn how to:\n\n- Use [`~Dataset.map`] with image dataset.\n- Apply data augmentations to a dataset with [`~Dataset.set_transform`].\n\nFor a guide on how to process any type of dataset, take a look at the <a class=\"underline decoration-sky-400 decoration-2 font-semibold\" href=\"./process\">general process guide</a>.\n\n## Map\n\nThe [`~Dataset.map`] function can apply transforms over an entire dataset.\n\nFor example, create a basic [`Resize`](https://pytorch.org/vision/stable/generated/torchvision.transforms.Resize.html) function:\n\n```py\n>>> def transforms(examples):\n...     examples[\"pixel_values\"] = [image.convert(\"RGB\").resize((100,100)) for image in examples[\"image\"]]\n...     return examples\n```\n\nNow use the [`~Dataset.map`] function to resize the entire dataset, and set `batched=True` to speed up the process by accepting batches of examples. The transform returns `pixel_values` as a cacheable `PIL.Image` object:\n\n```py\n>>> dataset = dataset.map(transforms, remove_columns=[\"image\"], batched=True)\n>>> dataset[0]\n{'label': 6,\n 'pixel_values': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=100x100 at 0x7F058237BB10>}\n```\n\nThe cache file saves time because you don't have to execute the same transform twice. 
The [`~Dataset.map`] function is best for operations you only run once per training - like resizing an image - instead of using it for operations executed for each epoch, like data augmentations.\n\n[`~Dataset.map`] takes up some memory, but you can reduce its memory requirements with the following parameters:\n\n- [`batch_size`](./package_reference/main_classes#datasets.DatasetDict.map.batch_size) determines the number of examples that are processed in one call to the transform function.\n- [`writer_batch_size`](./package_reference/main_classes#datasets.DatasetDict.map.writer_batch_size) determines the number of processed examples that are kept in memory before they are stored away.\n\nBoth parameter values default to 1000, which can be expensive if you are storing images. Lower these values to use less memory when you use [`~Dataset.map`].\n\n## Apply transforms\n\n🤗 Datasets applies data augmentations from any library or package to your dataset. Transforms can be applied on-the-fly on batches of data with [`~Dataset.set_transform`], which consumes less disk space.\n\n> [!TIP]\n> The following example uses [torchvision](https://pytorch.org/vision/stable/index.html), but feel free to use other data augmentation libraries like [Albumentations](https://albumentations.ai/docs/), [Kornia](https://kornia.readthedocs.io/en/latest/), and [imgaug](https://imgaug.readthedocs.io/en/latest/).\n\nFor example, if you'd like to change the color properties of an image randomly:\n\n```py\n>>> from torchvision.transforms import Compose, ColorJitter, ToTensor\n\n>>> jitter = Compose(\n...     [\n...          ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.7),\n...          ToTensor(),\n...     ]\n... )\n```\n\nCreate a function to apply the `ColorJitter` transform:\n\n```py\n>>> def transforms(examples):\n...     examples[\"pixel_values\"] = [jitter(image.convert(\"RGB\")) for image in examples[\"image\"]]\n...     
return examples\n```\n\nApply the transform with the [`~Dataset.set_transform`] function:\n\n```py\n>>> dataset.set_transform(transforms)\n```"
  },
  {
    "path": "docs/source/index.mdx",
    "content": "# Datasets\n\n<img class=\"float-left !m-0 !border-0 !dark:border-0 !shadow-none !max-w-lg w-[150px]\" src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/datasets_logo.png\"/>\n\n🤗 Datasets is a library for easily accessing and sharing AI datasets for Audio, Computer Vision, and Natural Language Processing (NLP) tasks.\n\nLoad a dataset in a single line of code, and use our powerful data processing and streaming methods to quickly get your dataset ready for training in a deep learning model. Backed by the Apache Arrow format, process large datasets with zero-copy reads without any memory constraints for optimal speed and efficiency. We also feature a deep integration with the [Hugging Face Hub](https://huggingface.co/datasets), allowing you to easily load and share a dataset with the wider machine learning community.\n\nFind your dataset today on the [Hugging Face Hub](https://huggingface.co/datasets), and take an in-depth look inside of it with the live viewer.\n\n<div class=\"mt-10\">\n  <div class=\"w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5\">\n    <a class=\"!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg\" href=\"./tutorial\"\n      ><div class=\"w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed\">Tutorials</div>\n      <p class=\"text-gray-700\">Learn the basics and become familiar with loading, accessing, and processing a dataset. 
Start here if you are using 🤗 Datasets for the first time!</p>\n    </a>\n    <a class=\"!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg\" href=\"./how_to\"\n      ><div class=\"w-full text-center bg-gradient-to-br from-indigo-400 to-indigo-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed\">How-to guides</div>\n      <p class=\"text-gray-700\">Practical guides to help you achieve a specific goal. Take a look at these guides to learn how to use 🤗 Datasets to solve real-world problems.</p>\n    </a>\n    <a class=\"!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg\" href=\"./about_arrow\"\n      ><div class=\"w-full text-center bg-gradient-to-br from-pink-400 to-pink-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed\">Conceptual guides</div>\n      <p class=\"text-gray-700\">High-level explanations for building a better understanding about important topics such as the underlying data format, the cache, and how datasets are generated.</p>\n   </a>\n    <a class=\"!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg\" href=\"./package_reference/main_classes\"\n      ><div class=\"w-full text-center bg-gradient-to-br from-purple-400 to-purple-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed\">Reference</div>\n      <p class=\"text-gray-700\">Technical descriptions of how 🤗 Datasets classes and methods work.</p>\n    </a>\n  </div>\n</div>\n"
  },
  {
    "path": "docs/source/installation.md",
    "content": "# Installation\n\nBefore you start, you'll need to set up your environment and install the appropriate packages. 🤗 Datasets is tested on **Python 3.10+**.\n\n> [!TIP]\n> If you want to use 🤗 Datasets with TensorFlow or PyTorch, you'll need to install them separately. Refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2-packages-are-available) or the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) for the specific install command for your framework.\n\n## Virtual environment\n\nYou should install 🤗 Datasets in a [virtual environment](https://docs.python.org/3/library/venv.html) to keep things tidy and avoid dependency conflicts.\n\n1. Create and navigate to your project directory:\n\n   ```bash\n   mkdir ~/my-project\n   cd ~/my-project\n   ```\n\n2. Start a virtual environment inside your directory:\n\n   ```bash\n   python -m venv .env\n   ```\n\n3. Activate and deactivate the virtual environment with the following commands:\n\n   ```bash\n   # Activate the virtual environment\n   source .env/bin/activate\n\n   # Deactivate the virtual environment\n   deactivate\n   ```\n\nOnce you've created your virtual environment, you can install 🤗 Datasets in it.\n\n## pip\n\nThe most straightforward way to install 🤗 Datasets is with pip:\n\n```bash\npip install datasets\n```\n\nRun the following command to check if 🤗 Datasets has been properly installed:\n\n```bash\npython -c \"from datasets import load_dataset; print(load_dataset('rajpurkar/squad', split='train')[0])\"\n```\n\nThis command downloads version 1 of the [Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/), loads the training split, and prints the first training example. You should see:\n\n```python\n{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, 'context': 'Architecturally, the school has a Catholic character. 
Atop the Main Building\\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'id': '5733be284776f41900661182', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'title': 'University_of_Notre_Dame'}\n```\n\n## Audio\n\nTo work with audio datasets, you need to install the [`Audio`] feature as an extra dependency:\n\n```bash\npip install datasets[audio]\n```\n\n## Vision\n\nTo work with image datasets, you need to install the [`Image`] feature as an extra dependency:\n\n```bash\npip install datasets[vision]\n```\n\n## source\n\nBuilding 🤗 Datasets from source lets you make changes to the code base. To install from the source, clone the repository and install with the following commands:\n\n```bash\ngit clone https://github.com/huggingface/datasets.git\ncd datasets\npip install -e .\n```\n\nAgain, you can check if 🤗 Datasets was properly installed with the following command:\n\n```bash\npython -c \"from datasets import load_dataset; print(load_dataset('rajpurkar/squad', split='train')[0])\"\n```\n\n## conda\n\n🤗 Datasets can also be installed from conda, a package management system:\n\n```bash\nconda install -c huggingface -c conda-forge datasets\n```\n"
  },
  {
    "path": "docs/source/load_hub.mdx",
    "content": "# Load a dataset from the Hub\n\nFinding high-quality datasets that are reproducible and accessible can be difficult. One of 🤗 Datasets main goals is to provide a simple way to load a dataset of any format or type. The easiest way to get started is to discover an existing dataset on the [Hugging Face Hub](https://huggingface.co/datasets) - a community-driven collection of datasets for tasks in NLP, computer vision, and audio - and use 🤗 Datasets to download and generate the dataset.\n\nThis tutorial uses the [rotten_tomatoes](https://huggingface.co/datasets/rotten_tomatoes) and [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) datasets, but feel free to load any dataset you want and follow along. Head over to the Hub now and find a dataset for your task!\n\n## Load a dataset\n\nBefore you take the time to download a dataset, it's often helpful to quickly get some general information about a dataset. A dataset's information is stored inside [`DatasetInfo`] and can include information such as the dataset description, features, and dataset size. \n\nUse the [`load_dataset_builder`] function to load a dataset builder and inspect a dataset's attributes without committing to downloading it:\n\n```py\n>>> from datasets import load_dataset_builder\n>>> ds_builder = load_dataset_builder(\"cornell-movie-review-data/rotten_tomatoes\")\n\n# Inspect dataset description\n>>> ds_builder.info.description\nMovie Review Dataset. This is a dataset of containing 5,331 positive and 5,331 negative processed sentences from Rotten Tomatoes movie reviews. 
This data was first used in Bo Pang and Lillian Lee, ``Seeing stars: Exploiting class relationships for sentiment categorization with respect to rating scales.'', Proceedings of the ACL, 2005.\n\n# Inspect dataset features\n>>> ds_builder.info.features\n{'label': ClassLabel(names=['neg', 'pos']),\n 'text': Value('string')}\n```\n\nIf you're happy with the dataset, then load it with [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n```\n\n## Splits\n\nA split is a specific subset of a dataset like `train` and `test`. List a dataset's split names with the [`get_dataset_split_names`] function:\n\n```py\n>>> from datasets import get_dataset_split_names\n\n>>> get_dataset_split_names(\"cornell-movie-review-data/rotten_tomatoes\")\n['train', 'validation', 'test']\n```\n\nThen you can load a specific split with the `split` parameter. Loading a dataset `split` returns a [`Dataset`] object:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n>>> dataset\nDataset({\n    features: ['text', 'label'],\n    num_rows: 8530\n})\n```\n\nIf you don't specify a `split`, 🤗 Datasets returns a [`DatasetDict`] object instead:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\nDatasetDict({\n    train: Dataset({\n        features: ['text', 'label'],\n        num_rows: 8530\n    })\n    validation: Dataset({\n        features: ['text', 'label'],\n        num_rows: 1066\n    })\n    test: Dataset({\n        features: ['text', 'label'],\n        num_rows: 1066\n    })\n})\n```\n\n## Configurations\n\nSome datasets contain several sub-datasets. For example, the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset has several sub-datasets, each one containing audio data in a different language. 
These sub-datasets are known as *configurations* or *subsets*, and you must explicitly select one when loading the dataset. If you don't provide a configuration name, 🤗 Datasets will raise a `ValueError` and remind you to choose a configuration.\n\nUse the [`get_dataset_config_names`] function to retrieve a list of all the possible configurations available to your dataset:\n\n```py\n>>> from datasets import get_dataset_config_names\n\n>>> configs = get_dataset_config_names(\"PolyAI/minds14\")\n>>> print(configs)\n['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN', 'all']\n```\n\nThen load the configuration you want:\n\n```py\n>>> from datasets import load_dataset\n\n>>> mindsFR = load_dataset(\"PolyAI/minds14\", \"fr-FR\", split=\"train\")\n```\n"
  },
  {
    "path": "docs/source/loading.mdx",
    "content": "# Load\n\nYour data can be stored in various places; they can be on your local machine's disk, in a Github repository, and in in-memory data structures like Python dictionaries and Pandas DataFrames. Wherever a dataset is stored, 🤗 Datasets can help you load it.\n\nThis guide will show you how to load a dataset from:\n\n- The Hugging Face Hub\n- Local files\n- In-memory data\n- Offline\n- A specific slice of a split\n\nFor more details specific to loading other dataset modalities, take a look at the <a class=\"underline decoration-pink-400 decoration-2 font-semibold\" href=\"./audio_load\">load audio dataset guide</a>, the <a class=\"underline decoration-yellow-400 decoration-2 font-semibold\" href=\"./image_load\">load image dataset guide</a>, the <a class=\"underline decoration-blue-400 decoration-2 font-semibold\" href=\"./video_load\">load video dataset guide</a>, or the <a class=\"underline decoration-green-400 decoration-2 font-semibold\" href=\"./nlp_load\">load text dataset guide</a>.\n\n<a id='load-from-the-hub'></a>\n\n## Hugging Face Hub\n\nYou can also load a dataset from any dataset repository on the Hub! Begin by [creating a dataset repository](share#create-the-repository) and upload your data files. Now you can use the [`load_dataset`] function to load the dataset. \n\nFor example, try loading the files from this [demo repository](https://huggingface.co/datasets/lhoestq/demo1) by providing the repository namespace and dataset name. This dataset repository contains CSV files, and the code below loads the dataset from the CSV files:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"lhoestq/demo1\")\n```\n\nSome datasets may have more than one version based on Git tags, branches, or commits. Use the `revision` parameter to specify the dataset version you want to load:\n\n```py\n>>> dataset = load_dataset(\n...   \"lhoestq/custom_squad\",\n...   revision=\"main\"  # tag name, or branch name, or commit hash\n... 
)\n```\n\n> [!TIP]\n> Refer to the [Upload a dataset to the Hub](./upload_dataset) tutorial for more details on how to create a dataset repository on the Hub, and how to upload your data files.\n\nBy default, a dataset loads all the data into the `train` split, or checks for mentions of split names in the data file names (e.g. \"train\", \"test\" and \"validation\"). Use the `data_files` parameter to map data files to splits like `train`, `validation` and `test`:\n\n```py\n>>> data_files = {\"train\": \"train.csv\", \"test\": \"test.csv\"}\n>>> dataset = load_dataset(\"namespace/your_dataset_name\", data_files=data_files)\n```\n\n> [!WARNING]\n> If you don't specify which data files to use, [`load_dataset`] will return all the data files. This can take a long time if you load a large dataset like C4, which is approximately 13TB of data.\n\nYou can also load a specific subset of the files with the `data_files` or `data_dir` parameter. These parameters can accept a relative path which resolves to the base path corresponding to where the dataset is loaded from.\n\n```py\n>>> from datasets import load_dataset\n\n# load files that match the glob pattern\n>>> c4_subset = load_dataset(\"allenai/c4\", data_files=\"en/c4-train.0000*-of-01024.json.gz\")\n\n# load dataset from the en directory on the Hub\n>>> c4_subset = load_dataset(\"allenai/c4\", data_dir=\"en\")\n```\n\nThe `split` parameter can also map a data file to a specific split:\n\n```py\n>>> data_files = {\"validation\": \"en/c4-validation.*.json.gz\"}\n>>> c4_validation = load_dataset(\"allenai/c4\", data_files=data_files, split=\"validation\")\n```\n\n## Local and remote files\n\nDatasets can be loaded from local files stored on your computer and from remote files. The datasets are most likely stored as a `csv`, `json`, `txt` or `parquet` file. 
The [`load_dataset`] function can load each of these file types.\n\n### CSV\n\n🤗 Datasets can read a dataset made up of one or several CSV files (in this case, pass your CSV files as a list):\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"csv\", data_files=\"my_file.csv\")\n```\n\n> [!TIP]\n> For more details, check out the [how to load tabular datasets from CSV files](tabular_load#csv-files) guide.\n\n### JSON\n\nJSON files are loaded directly with [`load_dataset`] as shown below:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"json\", data_files=\"my_file.json\")\n```\n\nJSON files have diverse formats, but we think the most efficient format is to have multiple JSON objects; each line represents an individual row of data. For example:\n\n```json\n{\"a\": 1, \"b\": 2.0, \"c\": \"foo\", \"d\": false}\n{\"a\": 4, \"b\": -5.5, \"c\": null, \"d\": true}\n```\n\nAnother JSON format you may encounter is a nested field, in which case you'll need to specify the `field` argument as shown in the following:\n\n```py\n{\"version\": \"0.1.0\",\n \"data\": [{\"a\": 1, \"b\": 2.0, \"c\": \"foo\", \"d\": false},\n          {\"a\": 4, \"b\": -5.5, \"c\": null, \"d\": true}]\n}\n\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"json\", data_files=\"my_file.json\", field=\"data\")\n```\n\nTo load remote JSON files via HTTP, pass the URLs instead:\n\n```py\n>>> base_url = \"https://rajpurkar.github.io/SQuAD-explorer/dataset/\"\n>>> dataset = load_dataset(\"json\", data_files={\"train\": base_url + \"train-v1.1.json\", \"validation\": base_url + \"dev-v1.1.json\"}, field=\"data\")\n```\n\nWhile these are the most common JSON formats, you'll see other datasets that are formatted differently. 
🤗 Datasets recognizes these other formats and will fall back accordingly on the Python JSON loading methods to handle them.\n\n### Parquet\n\nParquet files are stored in a columnar format, unlike row-based files like CSV. Large datasets may be stored in a Parquet file because it is more efficient and faster at returning your query. \n\nTo load a Parquet file:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"parquet\", data_files={'train': 'train.parquet', 'test': 'test.parquet'})\n```\n\nTo load remote Parquet files via HTTP, pass the URLs instead:\n\n```py\n>>> base_url = \"https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.ab/\"\n>>> data_files = {\"train\": base_url + \"train-00000-of-00001.parquet\"}\n>>> wiki = load_dataset(\"parquet\", data_files=data_files, split=\"train\")\n```\n\n### Arrow\n\nArrow files are stored in an uncompressed, in-memory columnar format, unlike row-based formats like CSV and compressed formats like Parquet.\n\nTo load an Arrow file:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"arrow\", data_files={'train': 'train.arrow', 'test': 'test.arrow'})\n```\n\nTo load remote Arrow files via HTTP, pass the URLs instead:\n\n```py\n>>> base_url = \"https://huggingface.co/datasets/croissantllm/croissant_dataset/resolve/main/english_660B_11/\"\n>>> data_files = {\"train\": base_url + \"train/data-00000-of-00080.arrow\"}\n>>> wiki = load_dataset(\"arrow\", data_files=data_files, split=\"train\")\n```\n\nArrow is the file format used by 🤗 Datasets under the hood, therefore you can load a local Arrow file using [`Dataset.from_file`] directly:\n\n```py\n>>> from datasets import Dataset\n>>> dataset = Dataset.from_file(\"data.arrow\")\n```\n\nUnlike [`load_dataset`], [`Dataset.from_file`] memory maps the Arrow file without preparing the dataset in the cache, saving you disk space.\nThe cache directory to store intermediate processing results will be the Arrow file directory in 
that case.\n\nFor now only the Arrow streaming format is supported. The Arrow IPC file format (also known as Feather V2) is not supported.\n\n### Lance\n\n[Lance](https://lance.org) is an open multimodal lakehouse table format for AI. Lance tables can natively store not only text and scalar values, but also large binary objects (blobs) such as images, audio, and video alongside your tabular data.\n\n```py\n>>> from datasets import load_dataset\n>>> lance_base_url = \"lance-format/laion-1m\"\n```\n\nTo stream the dataset without copying it to your local machine, specify the `streaming=True` parameter:\n\n```py\nds = load_dataset(lance_base_url, split=\"train\", streaming=True)\n# Take first three rows\nfor row in ds.take(3):\n    print(row[\"caption\"], row[\"image\"])\n```\n\nThis will return the image caption and the image bytes in a single request.\n\n### HDF5 files\n\n[HDF5](https://www.hdfgroup.org/solutions/hdf5/) files are commonly used for storing large amounts of numerical data in scientific computing and machine learning. Loading HDF5 files with 🤗 Datasets is similar to loading CSV files:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"hdf5\", data_files=\"data.h5\")\n```\n\nNote that the HDF5 loader assumes that the file has \"tabular\" structure, i.e. that all datasets in the file have (the same number of) rows on their first dimension.\n\n### SQL\n\nRead database contents with [`~datasets.Dataset.from_sql`] by specifying the URI to connect to your database. 
You can read both table names and queries:\n\n```py\n>>> from datasets import Dataset\n# load entire table\n>>> dataset = Dataset.from_sql(\"data_table_name\", con=\"sqlite:///sqlite_file.db\")\n# load from query\n>>> dataset = Dataset.from_sql(\"SELECT text FROM table WHERE length(text) > 100 LIMIT 10\", con=\"sqlite:///sqlite_file.db\")\n```\n\n> [!TIP]\n> For more details, check out the [how to load tabular datasets from SQL databases](tabular_load#databases) guide.\n\n### WebDataset\n\nThe [WebDataset](https://github.com/webdataset/webdataset) format is based on TAR archives and is suitable for big image datasets.\nBecause of their size, WebDatasets are generally loaded in streaming mode (using `streaming=True`).\n\nYou can load a WebDataset like this:\n\n```python\n>>> from datasets import load_dataset\n>>>\n>>> path = \"path/to/train/*.tar\"\n>>> dataset = load_dataset(\"webdataset\", data_files={\"train\": path}, split=\"train\", streaming=True)\n```\n\nTo load remote WebDatasets via HTTP, pass the URLs instead:\n\n```python\n>>> from datasets import load_dataset\n>>>\n>>> base_url = \"https://huggingface.co/datasets/lhoestq/small-publaynet-wds/resolve/main/publaynet-train-{i:06d}.tar\"\n>>> urls = [base_url.format(i=i) for i in range(4)]\n>>> dataset = load_dataset(\"webdataset\", data_files={\"train\": urls}, split=\"train\", streaming=True)\n```\n\n## Remote files\n\nIf you have remote files likely stored as a `csv`, `json`, `txt`, `parquet` or any supported format, the [`load_dataset`] function can load them if you specify their remote paths:\n\n- `https://` URLs for public online files, e.g. `data_files=[\"https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json\"]`\n- `hf://` URLs for files in any [Dataset repository](https://huggingface.co/docs/hub/datasets-overview) or [Storage Bucket](https://huggingface.co/docs/hub/storage-buckets) on Hugging Face, e.g. 
`data_files=[\"hf://datasets/karpathy/tinystories-gpt4-clean/tinystories_gpt4_clean.parquet\"]` or `data_files=[\"hf://buckets/julien-c/my-training-bucket/julien/affluence.csv\"]`\n\n## Multiprocessing\n\nWhen a dataset is made of several files (that we call \"shards\"), it is possible to significantly speed up the dataset downloading and preparation step.\n\nYou can choose how many processes you'd like to use to prepare a dataset in parallel using `num_proc`.\nIn this case, each process is given a subset of shards to prepare:\n\n```python\nfrom datasets import load_dataset\n\nimagenet = load_dataset(\"timm/imagenet-1k-wds\", num_proc=8)\nml_librispeech_spanish = load_dataset(\"facebook/multilingual_librispeech\", \"spanish\", num_proc=8)\n```\n\n## In-memory data\n\n🤗 Datasets will also allow you to create a [`Dataset`] directly from in-memory data structures like Python dictionaries and Pandas DataFrames.\n\n### Python dictionary\n\nLoad Python dictionaries with [`~Dataset.from_dict`]:\n\n```py\n>>> from datasets import Dataset\n>>> my_dict = {\"a\": [1, 2, 3]}\n>>> dataset = Dataset.from_dict(my_dict)\n```\n\n### Python list of dictionaries\n\nLoad a list of Python dictionaries with [`~Dataset.from_list`]:\n\n```py\n>>> from datasets import Dataset\n>>> my_list = [{\"a\": 1}, {\"a\": 2}, {\"a\": 3}]\n>>> dataset = Dataset.from_list(my_list)\n```\n\n### Python generator\n\nCreate a dataset from a Python generator with [`~Dataset.from_generator`]:\n\n```py\n>>> from datasets import Dataset\n>>> def my_gen():\n...     for i in range(1, 4):\n...         yield {\"a\": i}\n...\n>>> dataset = Dataset.from_generator(my_gen)\n```\n\nThis approach supports loading data larger than available memory.\n\nYou can also define a sharded dataset by passing lists to `gen_kwargs`:\n\n```py\n>>> def gen(shards):\n...     for shard in shards:\n...         with open(shard) as f:\n...             for line in f:\n...                 
yield {\"line\": line}\n...\n>>> shards = [f\"data{i}.txt\" for i in range(32)]\n>>> ds = IterableDataset.from_generator(gen, gen_kwargs={\"shards\": shards})\n>>> ds = ds.shuffle(seed=42, buffer_size=10_000)  # shuffles the shards order + uses a shuffle buffer\n>>> from torch.utils.data import DataLoader\n>>> dataloader = DataLoader(ds.with_format(\"torch\"), num_workers=4)  # give each worker a subset of 32/4=8 shards\n```\n\n### Pandas DataFrame\n\nLoad Pandas DataFrames with [`~Dataset.from_pandas`]:\n\n```py\n>>> from datasets import Dataset\n>>> import pandas as pd\n>>> df = pd.DataFrame({\"a\": [1, 2, 3]})\n>>> dataset = Dataset.from_pandas(df)\n```\n\n> [!TIP]\n> For more details, check out the [how to load tabular datasets from Pandas DataFrames](tabular_load#pandas-dataframes) guide.\n\n## Offline\n\nEven if you don't have an internet connection, it is still possible to load a dataset. As long as you've downloaded a dataset from the Hub repository before, it should be cached. This means you can reload the dataset from the cache and use it offline.\n\nIf you know you won't have internet access, you can run 🤗 Datasets in full offline mode. This saves time because instead of waiting for the Dataset builder download to time out, 🤗 Datasets will look directly in the cache. Set the environment variable `HF_HUB_OFFLINE` to `1` to enable full offline mode.\n\n## Slice splits\n\nYou can also choose only to load specific slices of a split. There are two options for slicing a split: using strings or the [`ReadInstruction`] API. 
Strings are more compact and readable for simple cases, while [`ReadInstruction`] is easier to use with variable slicing parameters.\n\nConcatenate a `train` and `test` split by:\n\n```py\n>>> train_test_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=\"train+test\")\n===STRINGAPI-READINSTRUCTION-SPLIT===\n>>> ri = datasets.ReadInstruction(\"train\") + datasets.ReadInstruction(\"test\")\n>>> train_test_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=ri)\n```\n\nSelect specific rows of the `train` split:\n\n```py\n>>> train_10_20_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=\"train[10:20]\")\n===STRINGAPI-READINSTRUCTION-SPLIT===\n>>> train_10_20_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=datasets.ReadInstruction(\"train\", from_=10, to=20, unit=\"abs\"))\n```\n\nOr select a percentage of a split with:\n\n```py\n>>> train_10pct_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=\"train[:10%]\")\n===STRINGAPI-READINSTRUCTION-SPLIT===\n>>> train_10pct_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=datasets.ReadInstruction(\"train\", to=10, unit=\"%\"))\n```\n\nSelect a combination of percentages from each split:\n\n```py\n>>> train_10_80pct_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=\"train[:10%]+train[-80%:]\")\n===STRINGAPI-READINSTRUCTION-SPLIT===\n>>> ri = (datasets.ReadInstruction(\"train\", to=10, unit=\"%\") + datasets.ReadInstruction(\"train\", from_=-80, unit=\"%\"))\n>>> train_10_80pct_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=ri)\n```\n\nFinally, you can even create cross-validated splits. The example below creates 10-fold cross-validated splits. 
Each validation dataset is a 10% chunk, and the training dataset makes up the remaining complementary 90% chunk:\n\n```py\n>>> val_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=[f\"train[{k}%:{k+10}%]\" for k in range(0, 100, 10)])\n>>> train_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=[f\"train[:{k}%]+train[{k+10}%:]\" for k in range(0, 100, 10)])\n===STRINGAPI-READINSTRUCTION-SPLIT===\n>>> val_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=[datasets.ReadInstruction(\"train\", from_=k, to=k+10, unit=\"%\") for k in range(0, 100, 10)])\n>>> train_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=[(datasets.ReadInstruction(\"train\", to=k, unit=\"%\") + datasets.ReadInstruction(\"train\", from_=k+10, unit=\"%\")) for k in range(0, 100, 10)])\n```\n\n### Percent slicing and rounding\n\nThe default behavior is to round the boundaries to the nearest integer for datasets where the requested slice boundaries do not divide evenly by 100. As shown below, some slices may contain more examples than others. For instance, if the following train split includes 999 records, then:\n\n```py\n# 19 records, from 500 (included) to 519 (excluded).\n>>> train_50_52_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=\"train[50%:52%]\")\n# 20 records, from 519 (included) to 539 (excluded).\n>>> train_52_54_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=\"train[52%:54%]\")\n```\n\nIf you want equal sized splits, use `pct1_dropremainder` rounding instead. This treats the specified percentage boundaries as multiples of 1%. 
\n\n```py\n# 18 records, from 450 (included) to 468 (excluded).\n>>> train_50_52pct1_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=datasets.ReadInstruction(\"train\", from_=50, to=52, unit=\"%\", rounding=\"pct1_dropremainder\"))\n# 18 records, from 468 (included) to 486 (excluded).\n>>> train_52_54pct1_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=datasets.ReadInstruction(\"train\",from_=52, to=54, unit=\"%\", rounding=\"pct1_dropremainder\"))\n# Or equivalently:\n>>> train_50_52pct1_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=\"train[50%:52%](pct1_dropremainder)\")\n>>> train_52_54pct1_ds = datasets.load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=\"train[52%:54%](pct1_dropremainder)\")\n```\n\n> [!WARNING]\n> `pct1_dropremainder` rounding may truncate the last examples in a dataset if the number of examples in your dataset doesn't divide evenly by 100.\n\n<a id='troubleshoot'></a>\n\n## Troubleshooting\n\nSometimes, you may get unexpected results when you load a dataset. Two of the most common issues you may encounter are manually downloading a dataset and specifying features of a dataset.\n\n### Specify features\n\nWhen you create a dataset from local files, the [`Features`] are automatically inferred by [Apache Arrow](https://arrow.apache.org/docs/). However, the dataset's features may not always align with your expectations, or you may want to define the features yourself. The following example shows how you can add custom labels with the [`ClassLabel`] feature. 
\n\nStart by defining your own labels with the [`Features`] class:\n\n```py\n>>> class_names = [\"sadness\", \"joy\", \"love\", \"anger\", \"fear\", \"surprise\"]\n>>> emotion_features = Features({'text': Value('string'), 'label': ClassLabel(names=class_names)})\n```\n\nNext, specify the `features` parameter in [`load_dataset`] with the features you just created:\n\n```py\n>>> dataset = load_dataset('csv', data_files=file_dict, delimiter=';', column_names=['text', 'label'], features=emotion_features)\n```\n\nNow when you look at your dataset features, you can see it uses the custom labels you defined:\n\n```py\n>>> dataset['train'].features\n{'text': Value('string'),\n'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'])}\n```\n"
  },
  {
    "path": "docs/source/nifti_dataset.mdx",
    "content": "# Create a NIfTI dataset\n\nThis page shows how to create and share a dataset of medical images in NIfTI format (.nii / .nii.gz) using the `datasets` library.\n\nYou can share a dataset with your team or with anyone in the community by creating a dataset repository on the Hugging Face Hub:\n\n```py\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"<username>/my_nifti_dataset\")\n```\n\nThere are two common ways to create a NIfTI dataset:\n\n- Create a dataset from local NIfTI files in Python and upload it with `Dataset.push_to_hub`.\n- Use a folder-based convention (one file per example) and a small helper to convert it into a `Dataset`.\n\n> [!TIP]\n> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information.\n\n## Local files\n\nIf you already have a list of file paths to NIfTI files, the easiest workflow is to create a `Dataset` from that list and cast the column to the `Nifti` feature.\n\n```py\nfrom datasets import Dataset\nfrom datasets import Nifti\n\n# simple example: create a dataset from file paths\nfiles = [\"/path/to/scan_001.nii.gz\", \"/path/to/scan_002.nii.gz\"]\nds = Dataset.from_dict({\"nifti\": files}).cast_column(\"nifti\", Nifti())\n\n# access a decoded nibabel image (if decode=True)\n# ds[0][\"nifti\"] will be a nibabel.Nifti1Image object when decode=True\n# or a dict {'bytes': None, 'path': '...'} when decode=False\n```\n\nThe `Nifti` feature supports a `decode` parameter. When `decode=True` (the default), it loads the NIfTI file into a `nibabel.nifti1.Nifti1Image` object. You can access the image data as a numpy array with `img.get_fdata()`. 
When `decode=False`, it returns a dict with the file path and bytes.\n\n```py\nfrom datasets import Dataset, Nifti\n\nds = Dataset.from_dict({\"nifti\": [\"/path/to/scan.nii.gz\"]}).cast_column(\"nifti\", Nifti(decode=True))\nimg = ds[0][\"nifti\"]  # instance of: nibabel.nifti1.Nifti1Image\narr = img.get_fdata()\n```\n\nAfter preparing the dataset you can push it to the Hub:\n\n```py\nds.push_to_hub(\"<username>/my_nifti_dataset\")\n```\n\nThis will create a dataset repository containing your NIfTI dataset with a `data/` folder of parquet shards.\n\n## Folder conventions and metadata\n\nIf you organize your dataset in folders you can create splits automatically (train/test/validation) by following a structure like:\n\n```\ndataset/train/scan_0001.nii\ndataset/train/scan_0002.nii\ndataset/validation/scan_1001.nii\ndataset/test/scan_2001.nii\n```\n\nIf you have labels or other metadata, provide a `metadata.csv`, `metadata.jsonl`, or `metadata.parquet` in the folder so files can be linked to metadata rows. The metadata must contain a `file_name` (or `*_file_name`) field with the relative path to the NIfTI file next to the metadata file.\n\nExample `metadata.csv`:\n\n```csv\nfile_name,patient_id,age,diagnosis\nscan_0001.nii.gz,P001,45,healthy\nscan_0002.nii.gz,P002,59,disease_x\n```\n\nThe `Nifti` feature works with zipped datasets too — each zip can contain NIfTI files and a metadata file. 
This is useful when uploading large datasets as archives.\nThis means your dataset structure could look like this (mixed compressed and uncompressed files):\n```\ndataset/train/scan_0001.nii.gz\ndataset/train/scan_0002.nii\ndataset/validation/scan_1001.nii.gz\ndataset/test/scan_2001.nii\n```\n\n## Converting to PyTorch tensors\n\nUse the [`~Dataset.set_transform`] function to apply the transformation on-the-fly to batches of the dataset:\n\n```py\nimport torch \nimport nibabel\nimport numpy as np\n\ndef transform_to_pytorch(example):\n    example[\"nifti_torch\"] = [torch.tensor(ex.get_fdata()) for ex in example[\"nifti\"]]\n    return example\n\nds.set_transform(transform_to_pytorch)\n\n```\nAccessing elements now (e.g. `ds[0]`) will yield torch tensors in the `\"nifti_torch\"` key.\n\n\n## Usage of Nifti1Image\n\nNIfTI is a format to store the result of 3 (or even 4) dimensional brain scans. This includes 3 spatial dimensions (x,y,z)\nand optionally a time dimension (t). Furthermore, the given positions here are only relative to the scanner, therefore \nthe dimensions (4, 5, 6) are used to lift this to real world coordinates.\n\nYou can visualize NIfTI files for instance leveraging `matplotlib` as follows:\n```python\nimport matplotlib.pyplot as plt\nfrom datasets import load_dataset\n\ndef show_slices(slices):\n   \"\"\" Function to display row of image slices \"\"\"\n   fig, axes = plt.subplots(1, len(slices))\n   for i, slice in enumerate(slices):\n       axes[i].imshow(slice.T, cmap=\"gray\", origin=\"lower\")\n\nnifti_ds = load_dataset(\"<username>/my_nifti_dataset\", split=\"train\")\nfor epi_img in nifti_ds:\n    nifti_img = epi_img[\"nifti\"].get_fdata()\n    show_slices([nifti_img[:, :, 16], nifti_img[26, :, :], nifti_img[:, 30, :]])\n    plt.show()\n```\n\nFor further reading we refer to the [nibabel documentation](https://nipy.org/nibabel/index.html) and especially [this nibabel tutorial](https://nipy.org/nibabel/coordinate_systems.html).\n---\n"
  },
  {
    "path": "docs/source/nlp_load.mdx",
    "content": "# Load text data\n\nThis guide shows you how to load text datasets. To learn how to load any type of dataset, take a look at the <a class=\"underline decoration-sky-400 decoration-2 font-semibold\" href=\"./loading\">general loading guide</a>.\n\nText files are one of the most common file types for storing a dataset. By default, 🤗 Datasets samples a text file line by line to build the dataset.\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"text\", data_files={\"train\": [\"my_text_1.txt\", \"my_text_2.txt\"], \"test\": \"my_test_file.txt\"})\n\n# Load from a directory\n>>> dataset = load_dataset(\"text\", data_dir=\"path/to/text/dataset\")\n```\n\nTo sample a text file by paragraph or even an entire document, use the `sample_by` parameter:\n\n```py\n# Sample by paragraph\n>>> dataset = load_dataset(\"text\", data_files={\"train\": \"my_train_file.txt\", \"test\": \"my_test_file.txt\"}, sample_by=\"paragraph\")\n\n# Sample by document\n>>> dataset = load_dataset(\"text\", data_files={\"train\": \"my_train_file.txt\", \"test\": \"my_test_file.txt\"}, sample_by=\"document\")\n```\n\nYou can also use grep patterns to load specific files:\n\n```py\n>>> from datasets import load_dataset\n>>> c4_subset = load_dataset(\"allenai/c4\", data_files=\"en/c4-train.0000*-of-01024.json.gz\")\n```\n\nTo load remote text files via HTTP, pass the URLs instead:\n\n```py\n>>> dataset = load_dataset(\"text\", data_files=\"https://huggingface.co/datasets/hf-internal-testing/dataset_with_data_files/resolve/main/data/train.txt\")\n```\n\nTo load XML data you can use the \"xml\" loader, which is equivalent to \"text\" with sample_by=\"document\":\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"xml\", data_files={\"train\": [\"my_xml_1.xml\", \"my_xml_2.xml\"], \"test\": \"my_xml_file.xml\"})\n\n# Load from a directory\n>>> dataset = load_dataset(\"xml\", data_dir=\"path/to/xml/dataset\")\n```\n"
  },
  {
    "path": "docs/source/nlp_process.mdx",
    "content": "# Process text data\n\nThis guide shows specific methods for processing text datasets. Learn how to:\n\n- Tokenize a dataset with [`~Dataset.map`].\n- Align dataset labels with label ids for NLI datasets.\n\nFor a guide on how to process any type of dataset, take a look at the <a class=\"underline decoration-sky-400 decoration-2 font-semibold\" href=\"./process\">general process guide</a>.\n\n## Map\n\nThe [`~Dataset.map`] function supports processing batches of examples at once which speeds up tokenization.\n\nLoad a tokenizer from 🤗 [Transformers](https://huggingface.co/transformers/):\n\n```py\n>>> from transformers import AutoTokenizer\n\n>>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n```\n\nSet the `batched` parameter to `True` in the [`~Dataset.map`] function to apply the tokenizer to batches of examples:\n\n```py\n>>> dataset = dataset.map(lambda examples: tokenizer(examples[\"text\"]), batched=True)\n>>> dataset[0]\n{'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', \n 'label': 1, \n 'input_ids': [101, 1996, 2600, 2003, 16036, 2000, 2022, 1996, 7398, 2301, 1005, 1055, 2047, 1000, 16608, 1000, 1998, 2008, 2002, 1005, 1055, 2183, 2000, 2191, 1037, 17624, 2130, 3618, 2084, 7779, 29058, 8625, 13327, 1010, 3744, 1011, 18856, 19513, 3158, 5477, 4168, 2030, 7112, 16562, 2140, 1012, 102], \n 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], \n 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n```\n\nThe [`~Dataset.map`] function converts the returned values to a PyArrow-supported format. 
But explicitly returning the tensors as NumPy arrays is faster because it is a natively supported PyArrow format. Set `return_tensors=\"np\"` when you tokenize your text:\n\n```py\n>>> dataset = dataset.map(lambda examples: tokenizer(examples[\"text\"], return_tensors=\"np\"), batched=True)\n```\n\n## Align\n\nThe [`~Dataset.align_labels_with_mapping`] function aligns a dataset label id with the label name. Not all 🤗 Transformers models follow the prescribed label mapping of the original dataset, especially for NLI datasets. For example, the [MNLI](https://huggingface.co/datasets/glue) dataset uses the following label mapping:\n\n```py\n>>> label2id = {\"entailment\": 0, \"neutral\": 1, \"contradiction\": 2}\n```\n\nTo align the dataset label mapping with the mapping used by a model, create a dictionary of the label name and id to align on:\n\n```py\n>>> label2id = {\"contradiction\": 0, \"neutral\": 1, \"entailment\": 2}\n```\n\nPass the dictionary of the label mappings to the [`~Dataset.align_labels_with_mapping`] function, and the column to align on:\n\n```py\n>>> from datasets import load_dataset\n\n>>> mnli = load_dataset(\"nyu-mll/glue\", \"mnli\", split=\"train\")\n>>> mnli_aligned = mnli.align_labels_with_mapping(label2id, \"label\")\n```\n\nYou can also use this function to assign a custom mapping of labels to ids."
  },
  {
    "path": "docs/source/object_detection.mdx",
    "content": "# Object detection\n\nObject detection models identify something in an image, and object detection datasets are used for applications such as autonomous driving and detecting natural hazards like wildfire. This guide will show you how to apply transformations to an object detection dataset following the [tutorial](https://albumentations.ai/docs/examples/example_bboxes/) from [Albumentations](https://albumentations.ai/docs/).\n\nTo run these examples, make sure you have up-to-date versions of [albumentations](https://albumentations.ai/docs/) and [cv2](https://docs.opencv.org/4.10.0/) installed:\n\n```bash\npip install -U albumentations opencv-python\n```\n\nIn this example, you'll use the [`cppe-5`](https://huggingface.co/datasets/rishitdagli/cppe-5) dataset for identifying medical personal protective equipment (PPE) in the context of the COVID-19 pandemic.\n\nLoad the dataset and take a look at an example:\n\n```py\n>>> from datasets import load_dataset\n\n>>> ds = load_dataset(\"rishitdagli/cppe-5\")\n>>> example = ds['train'][0]\n>>> example\n{'height': 663,\n 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=943x663 at 0x7FC3DC756250>,\n 'image_id': 15,\n 'objects': {'area': [3796, 1596, 152768, 81002],\n  'bbox': [[302.0, 109.0, 73.0, 52.0],\n   [810.0, 100.0, 57.0, 28.0],\n   [160.0, 31.0, 248.0, 616.0],\n   [741.0, 68.0, 202.0, 401.0]],\n  'category': [4, 4, 0, 0],\n  'id': [114, 115, 116, 117]},\n 'width': 943}\n```\n\nThe dataset has the following fields:\n\n- `image`: PIL.Image.Image object containing the image.\n- `image_id`: The image ID.\n- `height`: The image height.\n- `width`: The image width.\n- `objects`: A dictionary containing bounding box metadata for the objects in the image:\n  - `id`: The annotation id.\n  - `area`: The area of the bounding box.\n  - `bbox`: The object's bounding box (in the [coco](https://albumentations.ai/docs/3-basic-usage/bounding-boxes-augmentations/#understanding-bounding-box-formats) 
format).\n  - `category`: The object's category, with possible values including `Coverall (0)`, `Face_Shield (1)`, `Gloves (2)`, `Goggles (3)` and `Mask (4)`.\n\nYou can visualize the `bboxes` on the image using some `torchvision` utilities. To do that, you will need to reference the [`~datasets.ClassLabel`] feature associated with the category IDs so you can look up the string labels:\n\n\n```py\n>>> import torch\n>>> from torchvision.ops import box_convert\n>>> from torchvision.utils import draw_bounding_boxes\n>>> from torchvision.transforms.functional import pil_to_tensor, to_pil_image\n\n>>> categories = ds['train'].features['objects'].feature['category']\n\n>>> boxes_xywh = torch.tensor(example['objects']['bbox'])\n>>> boxes_xyxy = box_convert(boxes_xywh, 'xywh', 'xyxy')\n>>> labels = [categories.int2str(x) for x in example['objects']['category']]\n>>> to_pil_image(\n...     draw_bounding_boxes(\n...         pil_to_tensor(example['image']),\n...         boxes_xyxy,\n...         colors=\"red\",\n...         labels=labels,\n...     )\n... )\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/visualize_detection_example.png\"/>\n</div>\n\n\nWith `albumentations`, you can apply transforms that will affect the image while also updating the `bboxes` accordingly. In this case, the image is resized to (480, 480), flipped horizontally, and brightened. \n\n```py\n>>> import albumentations\n>>> import numpy as np\n\n>>> transform = albumentations.Compose([\n...     albumentations.Resize(480, 480),\n...     albumentations.HorizontalFlip(p=1.0),\n...     albumentations.RandomBrightnessContrast(p=1.0),\n... ], bbox_params=albumentations.BboxParams(format='coco',  label_fields=['category']))\n\n>>> image = np.array(example['image'])\n>>> out = transform(\n...     image=image,\n...     bboxes=example['objects']['bbox'],\n...     category=example['objects']['category'],\n... 
)\n```\n\nNow when you visualize the result, the image should be flipped, but the `bboxes` should still be in the right places.\n\n```py\n>>> image = torch.tensor(out['image']).permute(2, 0, 1)\n>>> boxes_xywh = torch.stack([torch.tensor(x) for x in out['bboxes']])\n>>> boxes_xyxy = box_convert(boxes_xywh, 'xywh', 'xyxy')\n>>> labels = [categories.int2str(x) for x in out['category']]\n>>> to_pil_image(\n...     draw_bounding_boxes(\n...         image,\n...         boxes_xyxy,\n...         colors='red',\n...         labels=labels\n...     )\n... )\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/visualize_detection_example_transformed.png\"/>\n</div>\n\nCreate a function to apply the transform to a batch of examples:\n\n```py\n>>> def transforms(examples):\n...     images, bboxes, categories = [], [], []\n...     for image, objects in zip(examples['image'], examples['objects']):\n...         image = np.array(image.convert(\"RGB\"))\n...         out = transform(\n...             image=image,\n...             bboxes=objects['bbox'],\n...             category=objects['category']\n...         )\n...         images.append(torch.tensor(out['image']).permute(2, 0, 1))\n...         bboxes.append(torch.tensor(out['bboxes']))\n...         categories.append(out['category'])\n...     return {'image': images, 'bbox': bboxes, 'category': categories}\n```\n\nUse the [`~Dataset.set_transform`] function to apply the transform on-the-fly which consumes less disk space. The randomness of data augmentation may return a different image if you access the same example twice. It is especially useful when training a model for several epochs.\n\n```py\n>>> ds['train'].set_transform(transforms)\n```\n\nYou can verify the transform works by visualizing the 10th example:\n\n```py\n>>> example = ds['train'][10]\n>>> to_pil_image(\n...     draw_bounding_boxes(\n...         example['image'],\n...      
   box_convert(example['bbox'], 'xywh', 'xyxy'),\n...         colors='red',\n...         labels=[categories.int2str(x) for x in example['category']]\n...     )\n... )\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/visualize_detection_example_transformed_2.png\"/>\n</div>\n\n> [!TIP]\n> Now that you know how to process a dataset for object detection, learn\n> [how to train an object detection model](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/YOLOS/Fine_tuning_YOLOS_for_object_detection_on_custom_dataset_(balloon).ipynb)\n> and use it for inference.\n"
  },
  {
    "path": "docs/source/package_reference/builder_classes.mdx",
    "content": "# Builder classes\n\n## Builders\n\n🤗 Datasets relies on two main classes during the dataset building process: [`DatasetBuilder`] and [`BuilderConfig`].\n\n[[autodoc]] datasets.DatasetBuilder\n\n[[autodoc]] datasets.GeneratorBasedBuilder\n\n[[autodoc]] datasets.ArrowBasedBuilder\n\n[[autodoc]] datasets.BuilderConfig\n\n## Download\n\n[[autodoc]] datasets.DownloadManager\n\n[[autodoc]] datasets.StreamingDownloadManager\n\n[[autodoc]] datasets.DownloadConfig\n\n[[autodoc]] datasets.DownloadMode\n\n## Verification\n\n[[autodoc]] datasets.VerificationMode\n\n## Splits\n\n[[autodoc]] datasets.SplitGenerator\n\n[[autodoc]] datasets.Split\n\n[[autodoc]] datasets.NamedSplit\n\n[[autodoc]] datasets.NamedSplitAll\n\n[[autodoc]] datasets.ReadInstruction\n\n## Version\n\n[[autodoc]] datasets.utils.Version\n"
  },
  {
    "path": "docs/source/package_reference/loading_methods.mdx",
    "content": "# Loading methods\n\nMethods for listing and loading datasets:\n\n## Datasets\n\n[[autodoc]] datasets.load_dataset\n\n[[autodoc]] datasets.load_from_disk\n\n[[autodoc]] datasets.load_dataset_builder\n\n[[autodoc]] datasets.get_dataset_config_names\n\n[[autodoc]] datasets.get_dataset_infos\n\n[[autodoc]] datasets.get_dataset_split_names\n\n## From files\n\nConfigurations used to load data files.\nThey are used when loading local files or a dataset repository:\n\n- local files: `load_dataset(\"parquet\", data_dir=\"path/to/data/dir\")`\n- dataset repository: `load_dataset(\"allenai/c4\")`\n\nYou can pass arguments to `load_dataset` to configure data loading.\nFor example you can specify the `sep` parameter to define the [`~datasets.packaged_modules.csv.CsvConfig`] that is used to load the data:\n\n```python\nload_dataset(\"csv\", data_dir=\"path/to/data/dir\", sep=\"\\t\")\n```\n\n### Text\n\n[[autodoc]] datasets.packaged_modules.text.TextConfig\n\n[[autodoc]] datasets.packaged_modules.text.Text\n\n### CSV\n\n[[autodoc]] datasets.packaged_modules.csv.CsvConfig\n\n[[autodoc]] datasets.packaged_modules.csv.Csv\n\n### JSON\n\n[[autodoc]] datasets.packaged_modules.json.JsonConfig\n\n[[autodoc]] datasets.packaged_modules.json.Json\n\n### XML\n\n[[autodoc]] datasets.packaged_modules.xml.XmlConfig\n\n[[autodoc]] datasets.packaged_modules.xml.Xml\n\n### Parquet\n\n[[autodoc]] datasets.packaged_modules.parquet.ParquetConfig\n\n[[autodoc]] datasets.packaged_modules.parquet.Parquet\n\n### Arrow\n\n[[autodoc]] datasets.packaged_modules.arrow.ArrowConfig\n\n[[autodoc]] datasets.packaged_modules.arrow.Arrow\n\n### SQL\n\n[[autodoc]] datasets.packaged_modules.sql.SqlConfig\n\n[[autodoc]] datasets.packaged_modules.sql.Sql\n\n### Images\n\n[[autodoc]] datasets.packaged_modules.imagefolder.ImageFolderConfig\n\n[[autodoc]] datasets.packaged_modules.imagefolder.ImageFolder\n\n### Audio\n\n[[autodoc]] datasets.packaged_modules.audiofolder.AudioFolderConfig\n\n[[autodoc]] 
datasets.packaged_modules.audiofolder.AudioFolder\n\n### Videos\n\n[[autodoc]] datasets.packaged_modules.videofolder.VideoFolderConfig\n\n[[autodoc]] datasets.packaged_modules.videofolder.VideoFolder\n\n### HDF5\n\n[[autodoc]] datasets.packaged_modules.hdf5.HDF5Config\n\n[[autodoc]] datasets.packaged_modules.hdf5.HDF5\n\n### Pdf\n\n[[autodoc]] datasets.packaged_modules.pdffolder.PdfFolderConfig\n\n[[autodoc]] datasets.packaged_modules.pdffolder.PdfFolder\n\n### Nifti\n\n[[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolderConfig\n\n[[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolder\n\n### WebDataset\n\n[[autodoc]] datasets.packaged_modules.webdataset.WebDataset\n"
  },
  {
    "path": "docs/source/package_reference/main_classes.mdx",
    "content": "# Main classes\n\n\n## DatasetInfo\n\n[[autodoc]] datasets.DatasetInfo\n\n## Dataset\n\nThe base class [`Dataset`] implements a Dataset backed by an Apache Arrow table.\n\n[[autodoc]] datasets.Dataset\n    - add_column\n    - add_item\n    - from_file\n    - from_buffer\n    - from_pandas\n    - from_dict\n    - from_list\n    - from_generator\n    - data\n    - cache_files\n    - num_columns\n    - num_rows\n    - column_names\n    - shape\n    - unique\n    - flatten\n    - cast\n    - cast_column\n    - remove_columns\n    - rename_column\n    - rename_columns\n    - select_columns\n    - class_encode_column\n    - __len__\n    - __iter__\n    - iter\n    - formatted_as\n    - set_format\n    - set_transform\n    - reset_format\n    - with_format\n    - with_transform\n    - __getitem__\n    - cleanup_cache_files\n    - map\n    - filter\n    - select\n    - sort\n    - shuffle\n    - skip\n    - take\n    - train_test_split\n    - shard\n    - repeat\n    - to_tf_dataset\n    - push_to_hub\n    - save_to_disk\n    - load_from_disk\n    - flatten_indices\n    - to_csv\n    - to_pandas\n    - to_dict\n    - to_json\n    - to_parquet\n    - to_sql\n    - to_iterable_dataset\n    - add_faiss_index\n    - add_faiss_index_from_external_arrays\n    - save_faiss_index\n    - load_faiss_index\n    - add_elasticsearch_index\n    - load_elasticsearch_index\n    - list_indexes\n    - get_index\n    - drop_index\n    - search\n    - search_batch\n    - get_nearest_examples\n    - get_nearest_examples_batch\n    - info\n    - split\n    - builder_name\n    - citation\n    - config_name\n    - dataset_size\n    - description\n    - download_checksums\n    - download_size\n    - features\n    - homepage\n    - license\n    - size_in_bytes\n    - supervised_keys\n    - version\n    - from_csv\n    - from_json\n    - from_parquet\n    - from_text\n    - from_sql\n    - align_labels_with_mapping\n\n[[autodoc]] datasets.concatenate_datasets\n\n[[autodoc]] 
datasets.interleave_datasets\n\n[[autodoc]] datasets.distributed.split_dataset_by_node\n\n[[autodoc]] datasets.enable_caching\n\n[[autodoc]] datasets.disable_caching\n\n[[autodoc]] datasets.is_caching_enabled\n\n[[autodoc]] datasets.Column\n\n## DatasetDict\n\nDictionary with split names as keys ('train', 'test' for example), and `Dataset` objects as values.\nIt also has dataset transform methods like map or filter, to process all the splits at once.\n\n[[autodoc]] datasets.DatasetDict\n    - data\n    - cache_files\n    - num_columns\n    - num_rows\n    - column_names\n    - shape\n    - unique\n    - cleanup_cache_files\n    - map\n    - filter\n    - sort\n    - shuffle\n    - set_format\n    - reset_format\n    - formatted_as\n    - with_format\n    - with_transform\n    - flatten\n    - cast\n    - cast_column\n    - remove_columns\n    - rename_column\n    - rename_columns\n    - select_columns\n    - class_encode_column\n    - push_to_hub\n    - save_to_disk\n    - load_from_disk\n    - from_csv\n    - from_json\n    - from_parquet\n    - from_text\n\n<a id='package_reference_features'></a>\n\n## IterableDataset\n\nThe base class [`IterableDataset`] implements an iterable Dataset backed by python generators.\n\n[[autodoc]] datasets.IterableDataset\n    - from_file\n    - from_pandas\n    - from_dict\n    - from_list\n    - from_generator\n    - remove_columns\n    - select_columns\n    - cast_column\n    - cast\n    - decode\n    - __iter__\n    - iter\n    - map\n    - rename_column\n    - filter\n    - shuffle\n    - batch\n    - skip\n    - take\n    - shard\n    - reshard\n    - repeat\n    - to_csv\n    - to_pandas\n    - to_dict\n    - to_json\n    - to_parquet\n    - to_sql\n    - push_to_hub\n    - load_state_dict\n    - state_dict\n    - info\n    - split\n    - builder_name\n    - citation\n    - config_name\n    - dataset_size\n    - description\n    - download_checksums\n    - download_size\n    - features\n    - homepage\n    - license\n    - 
size_in_bytes\n    - supervised_keys\n    - version\n    - from_csv\n    - from_json\n    - from_parquet\n    - from_text\n\n[[autodoc]] datasets.IterableColumn\n\n## IterableDatasetDict\n\nDictionary with split names as keys ('train', 'test' for example), and `IterableDataset` objects as values.\n\n[[autodoc]] datasets.IterableDatasetDict\n    - map\n    - filter\n    - shuffle\n    - with_format\n    - cast\n    - cast_column\n    - remove_columns\n    - rename_column\n    - rename_columns\n    - select_columns\n    - push_to_hub\n\n## Features\n\n[[autodoc]] datasets.Features\n\n### Scalar\n\n[[autodoc]] datasets.Value\n\n[[autodoc]] datasets.ClassLabel\n\n### Composite\n\n[[autodoc]] datasets.LargeList\n\n[[autodoc]] datasets.List\n\n[[autodoc]] datasets.Sequence\n\n### Translation\n\n[[autodoc]] datasets.Translation\n\n[[autodoc]] datasets.TranslationVariableLanguages\n\n### Arrays\n\n[[autodoc]] datasets.Array2D\n\n[[autodoc]] datasets.Array3D\n\n[[autodoc]] datasets.Array4D\n\n[[autodoc]] datasets.Array5D\n\n### Audio\n\n[[autodoc]] datasets.Audio\n\n### Image\n\n[[autodoc]] datasets.Image\n\n### Video\n\n[[autodoc]] datasets.Video\n\n### Json\n\n[[autodoc]] datasets.Json\n\n### Pdf\n\n[[autodoc]] datasets.Pdf\n\n### Nifti\n\n[[autodoc]] datasets.Nifti\n\n## Filesystems\n\n[[autodoc]] datasets.filesystems.is_remote_filesystem\n\n## Fingerprint\n\n[[autodoc]] datasets.fingerprint.Hasher\n"
  },
  {
    "path": "docs/source/package_reference/table_classes.mdx",
    "content": "# Table Classes\n\nEach `Dataset` object is backed by a PyArrow Table.\nA Table can be loaded from either the disk (memory mapped) or in memory.\nSeveral Table types are available, and they all inherit from [`table.Table`].\n\n## Table\n\n[[autodoc]] datasets.table.Table\n    - validate\n    - equals\n    - to_batches\n    - to_pydict\n    - to_pandas\n    - to_string\n    - field\n    - column\n    - itercolumns\n    - schema\n    - columns\n    - num_columns\n    - num_rows\n    - shape\n    - nbytes\n\n## InMemoryTable\n\n[[autodoc]] datasets.table.InMemoryTable\n    - validate\n    - equals\n    - to_batches\n    - to_pydict\n    - to_pandas\n    - to_string\n    - field\n    - column\n    - itercolumns\n    - schema\n    - columns\n    - num_columns\n    - num_rows\n    - shape\n    - nbytes\n    - column_names\n    - slice\n    - filter\n    - flatten\n    - combine_chunks\n    - cast\n    - replace_schema_metadata\n    - add_column\n    - append_column\n    - remove_column\n    - set_column\n    - rename_columns\n    - select\n    - drop\n    - from_file\n    - from_buffer\n    - from_pandas\n    - from_arrays\n    - from_pydict\n    - from_batches\n\n## MemoryMappedTable\n\n[[autodoc]] datasets.table.MemoryMappedTable\n    - validate\n    - equals\n    - to_batches\n    - to_pydict\n    - to_pandas\n    - to_string\n    - field\n    - column\n    - itercolumns\n    - schema\n    - columns\n    - num_columns\n    - num_rows\n    - shape\n    - nbytes\n    - column_names\n    - slice\n    - filter\n    - flatten\n    - combine_chunks\n    - cast\n    - replace_schema_metadata\n    - add_column\n    - append_column\n    - remove_column\n    - set_column\n    - rename_columns\n    - select\n    - drop\n    - from_file\n\n## ConcatenationTable\n\n[[autodoc]] datasets.table.ConcatenationTable\n    - validate\n    - equals\n    - to_batches\n    - to_pydict\n    - to_pandas\n    - to_string\n    - field\n    - column\n    - itercolumns\n    - 
schema\n    - columns\n    - num_columns\n    - num_rows\n    - shape\n    - nbytes\n    - column_names\n    - slice\n    - filter\n    - flatten\n    - combine_chunks\n    - cast\n    - replace_schema_metadata\n    - add_column\n    - append_column\n    - remove_column\n    - set_column\n    - rename_columns\n    - select\n    - drop\n    - from_blocks\n    - from_tables\n\n## Utils\n\n[[autodoc]] datasets.table.concat_tables\n\n[[autodoc]] datasets.table.list_table_cache_files\n"
  },
  {
    "path": "docs/source/package_reference/utilities.mdx",
    "content": "# Utilities\n\n## Configure logging\n\n🤗 Datasets strives to be transparent and explicit about how it works, but this can be quite verbose at times. We have included a series of logging methods which allow you to easily adjust the level of verbosity of the entire library. Currently the default verbosity of the library is set to `WARNING`.\n\nTo change the level of verbosity, use one of the direct setters. For instance, here is how to change the verbosity to the `INFO` level:\n\n```py\nimport datasets\ndatasets.logging.set_verbosity_info()\n```\n\nYou can also use the environment variable `DATASETS_VERBOSITY` to override the default verbosity, and set it to one of the following: `debug`, `info`, `warning`, `error`, `critical`:\n\n```bash\nDATASETS_VERBOSITY=error ./myprogram.py\n```\n\nAll the methods of this logging module are documented below. The main ones are:\n\n- [`logging.get_verbosity`] to get the current level of verbosity in the logger\n- [`logging.set_verbosity`] to set the verbosity to the level of your choice\n\nIn order from the least to the most verbose (with their corresponding `int` values):\n\n1. `logging.CRITICAL` or `logging.FATAL` (int value, 50): only report the most critical errors.\n2. `logging.ERROR` (int value, 40): only report errors.\n3. `logging.WARNING` or `logging.WARN` (int value, 30): only reports errors and warnings. This is the default level used by the library.\n4. `logging.INFO` (int value, 20): reports errors, warnings and basic information.\n5. 
`logging.DEBUG` (int value, 10): report all information.\n\n[[autodoc]] datasets.logging.get_verbosity\n\n[[autodoc]] datasets.logging.set_verbosity\n\n[[autodoc]] datasets.logging.set_verbosity_info\n\n[[autodoc]] datasets.logging.set_verbosity_warning\n\n[[autodoc]] datasets.logging.set_verbosity_debug\n\n[[autodoc]] datasets.logging.set_verbosity_error\n\n[[autodoc]] datasets.logging.disable_propagation\n\n[[autodoc]] datasets.logging.enable_propagation\n\n## Configure progress bars\n\nBy default, `tqdm` progress bars will be displayed during dataset download and preprocessing. You can disable them globally by setting the `HF_DATASETS_DISABLE_PROGRESS_BARS`\nenvironment variable. You can also enable/disable them using [`~utils.enable_progress_bars`] and [`~utils.disable_progress_bars`]. If set, the environment variable takes priority over the helpers.\n\n[[autodoc]] datasets.utils.enable_progress_bars\n\n[[autodoc]] datasets.utils.disable_progress_bars\n\n[[autodoc]] datasets.utils.are_progress_bars_disabled"
  },
  {
    "path": "docs/source/process.mdx",
    "content": "# Process\n\n🤗 Datasets provides many tools for modifying the structure and content of a dataset. These tools are important for tidying up a dataset, creating additional columns, converting between features and formats, and much more.\n\nThis guide will show you how to:\n\n- Reorder rows and split the dataset.\n- Rename and remove columns, and other common column operations.\n- Apply processing functions to each example in a dataset.\n- Concatenate datasets.\n- Apply a custom formatting transform.\n- Save and export processed datasets.\n\nFor more details specific to processing other dataset modalities, take a look at the <a class=\"underline decoration-pink-400 decoration-2 font-semibold\" href=\"./audio_process\">process audio dataset guide</a>, the <a class=\"underline decoration-yellow-400 decoration-2 font-semibold\" href=\"./image_process\">process image dataset guide</a>, or the <a class=\"underline decoration-green-400 decoration-2 font-semibold\" href=\"./nlp_process\">process text dataset guide</a>.\n\nThe examples in this guide use the MRPC dataset, but feel free to load any dataset of your choice and follow along!\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"nyu-mll/glue\", \"mrpc\", split=\"train\")\n```\n\n> [!WARNING]\n> All processing methods in this guide return a new [`Dataset`] object. Modification is not done in-place. Be careful about overriding your previous dataset!\n\n## Sort, shuffle, select, split, and shard\n\nThere are several functions for rearranging the structure of a dataset.\nThese functions are useful for selecting only the rows you want, creating train and test splits, and sharding very large datasets into smaller chunks.\n\n### Sort\n\nUse [`~Dataset.sort`] to sort column values according to their numerical values. 
The provided column must be NumPy compatible.\n\n```py\n>>> dataset[\"label\"][:10]\n[1, 0, 1, 0, 1, 1, 0, 1, 0, 0]\n>>> sorted_dataset = dataset.sort(\"label\")\n>>> sorted_dataset[\"label\"][:10]\n[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n>>> sorted_dataset[\"label\"][-10:]\n[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n```\n\nUnder the hood, this creates a list of indices that is sorted according to values of the column.\nThis indices mapping is then used to access the right rows in the underlying Arrow table.\n\n### Shuffle\n\nThe [`~Dataset.shuffle`] function randomly rearranges the column values. You can specify the `generator` parameter in this function to use a different `numpy.random.Generator` if you want more control over the algorithm used to shuffle the dataset.\n\n```py\n>>> shuffled_dataset = sorted_dataset.shuffle(seed=42)\n>>> shuffled_dataset[\"label\"][:10]\n[1, 1, 1, 0, 1, 1, 1, 1, 1, 0]\n```\n\nShuffling takes the list of indices `[0:len(my_dataset)]` and shuffles it to create an indices mapping.\nHowever as soon as your [`Dataset`] has an indices mapping, the speed can become 10x slower.\nThis is because there is an extra step to get the row index to read using the indices mapping, and most importantly, you aren't reading contiguous chunks of data anymore.\nTo restore the speed, you'd need to rewrite the entire dataset on your disk again using [`Dataset.flatten_indices`], which removes the indices mapping.\nAlternatively, you can switch to an [`IterableDataset`] and leverage its fast approximate shuffling [`IterableDataset.shuffle`]:\n\n```py\n>>> iterable_dataset = dataset.to_iterable_dataset(num_shards=128)\n>>> shuffled_iterable_dataset = iterable_dataset.shuffle(seed=42, buffer_size=1000)\n```\n\n### Select and Filter\n\nThere are two options for filtering rows in a dataset: [`~Dataset.select`] and [`~Dataset.filter`].\n\n- [`~Dataset.select`] returns rows according to a list of indices:\n\n```py\n>>> small_dataset = dataset.select([0, 10, 20, 30, 40, 50])\n>>> 
len(small_dataset)\n6\n```\n\n- [`~Dataset.filter`] returns rows that match a specified condition:\n\n```py\n>>> start_with_ar = dataset.filter(lambda example: example[\"sentence1\"].startswith(\"Ar\"))\n>>> len(start_with_ar)\n6\n>>> start_with_ar[\"sentence1\"]\n['Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',\n'Arison said Mann may have been one of the pioneers of the world music movement and he had a deep love of Brazilian music .',\n'Arts helped coach the youth on an eighth-grade football team at Lombardi Middle School in Green Bay .',\n'Around 9 : 00 a.m. EDT ( 1300 GMT ) , the euro was at $ 1.1566 against the dollar , up 0.07 percent on the day .',\n\"Arguing that the case was an isolated example , Canada has threatened a trade backlash if Tokyo 's ban is not justified on scientific grounds .\",\n'Artists are worried the plan would harm those who need help most - performers who have a difficult time lining up shows .'\n]\n```\n\n[`~Dataset.filter`] can also filter by indices if you set `with_indices=True`:\n\n```py\n>>> even_dataset = dataset.filter(lambda example, idx: idx % 2 == 0, with_indices=True)\n>>> len(even_dataset)\n1834\n>>> len(dataset) / 2\n1834.0\n```\n\nUnless the list of indices to keep is contiguous, those methods also create an indices mapping under the hood.\n\n### Split\n\nThe [`~Dataset.train_test_split`] function creates train and test splits if your dataset doesn't already have them. This allows you to adjust the relative proportions or an absolute number of samples in each split. 
In the example below, use the `test_size` parameter to create a test split that is 10% of the original dataset:\n\n```py\n>>> dataset.train_test_split(test_size=0.1)\n{'train': Dataset(schema: {'sentence1': 'string', 'sentence2': 'string', 'label': 'int64', 'idx': 'int32'}, num_rows: 3301),\n'test': Dataset(schema: {'sentence1': 'string', 'sentence2': 'string', 'label': 'int64', 'idx': 'int32'}, num_rows: 367)}\n>>> 0.1 * len(dataset)\n366.8\n```\n\nThe splits are shuffled by default, but you can set `shuffle=False` to prevent shuffling.\n\n### Shard\n\n🤗 Datasets supports sharding to divide a very large dataset into a predefined number of chunks. Specify the `num_shards` parameter in [`~Dataset.shard`] to determine the number of shards to split the dataset into. You'll also need to provide the shard you want to return with the `index` parameter.\n\nFor example, the [stanfordnlp/imdb](https://huggingface.co/datasets/stanfordnlp/imdb) dataset has 25000 examples:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"stanfordnlp/imdb\", split=\"train\")\n>>> print(dataset)\nDataset({\n    features: ['text', 'label'],\n    num_rows: 25000\n})\n```\n\nAfter sharding the dataset into four chunks, the first shard will only have 6250 examples:\n\n```py\n>>> dataset.shard(num_shards=4, index=0)\nDataset({\n    features: ['text', 'label'],\n    num_rows: 6250\n})\n>>> print(25000/4)\n6250.0\n```\n\n## Rename, remove, cast, and flatten\n\nThe following functions allow you to modify the columns of a dataset. These functions are useful for renaming or removing columns, changing columns to a new set of features, and flattening nested column structures.\n\n### Rename\n\nUse [`~Dataset.rename_column`] when you need to rename a column in your dataset. 
Features associated with the original column are actually moved under the new column name, instead of just replacing the original column in-place.\n\nProvide [`~Dataset.rename_column`] with the name of the original column, and the new column name:\n\n```py\n>>> dataset\nDataset({\n    features: ['sentence1', 'sentence2', 'label', 'idx'],\n    num_rows: 3668\n})\n>>> dataset = dataset.rename_column(\"sentence1\", \"sentenceA\")\n>>> dataset = dataset.rename_column(\"sentence2\", \"sentenceB\")\n>>> dataset\nDataset({\n    features: ['sentenceA', 'sentenceB', 'label', 'idx'],\n    num_rows: 3668\n})\n```\n\n### Remove\n\nWhen you need to remove one or more columns, provide the column name to remove to the [`~Dataset.remove_columns`] function. Remove more than one column by providing a list of column names:\n\n```py\n>>> dataset = dataset.remove_columns(\"label\")\n>>> dataset\nDataset({\n    features: ['sentence1', 'sentence2', 'idx'],\n    num_rows: 3668\n})\n>>> dataset = dataset.remove_columns([\"sentence1\", \"sentence2\"])\n>>> dataset\nDataset({\n    features: ['idx'],\n    num_rows: 3668\n})\n```\n\nConversely, [`~Dataset.select_columns`] selects one or more columns to keep and removes the rest. This function takes either one or a list of column names:\n\n```py\n>>> dataset\nDataset({\n    features: ['sentence1', 'sentence2', 'label', 'idx'],\n    num_rows: 3668\n})\n>>> dataset = dataset.select_columns(['sentence1', 'sentence2', 'idx'])\n>>> dataset\nDataset({\n    features: ['sentence1', 'sentence2', 'idx'],\n    num_rows: 3668\n})\n>>> dataset = dataset.select_columns('idx')\n>>> dataset\nDataset({\n    features: ['idx'],\n    num_rows: 3668\n})\n```\n\n### Cast\n\nThe [`~Dataset.cast`] function transforms the feature type of one or more columns. This function accepts your new [`Features`] as its argument. 
The example below demonstrates how to change the [`ClassLabel`] and [`Value`] features:\n\n```py\n>>> dataset.features\n{'sentence1': Value('string'),\n'sentence2': Value('string'),\n'label': ClassLabel(names=['not_equivalent', 'equivalent']),\n'idx': Value('int32')}\n\n>>> from datasets import ClassLabel, Value\n>>> new_features = dataset.features.copy()\n>>> new_features[\"label\"] = ClassLabel(names=[\"negative\", \"positive\"])\n>>> new_features[\"idx\"] = Value(\"int64\")\n>>> dataset = dataset.cast(new_features)\n>>> dataset.features\n{'sentence1': Value('string'),\n'sentence2': Value('string'),\n'label': ClassLabel(names=['negative', 'positive']),\n'idx': Value('int64')}\n```\n\n> [!TIP]\n> Casting only works if the original feature type and new feature type are compatible. For example, you can cast a column with the feature type `Value(\"int32\")` to `Value(\"bool\")` if the original column only contains ones and zeros.\n\nUse the [`~Dataset.cast_column`] function to change the feature type of a single column. Pass the column name and its new feature type as arguments:\n\n```py\n>>> dataset.features\n{'audio': Audio(sampling_rate=44100, mono=True)}\n\n>>> dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=16000))\n>>> dataset.features\n{'audio': Audio(sampling_rate=16000, mono=True)}\n```\n\n### Flatten\n\nSometimes a column can be a nested structure of several types. Take a look at the nested structure below from the SQuAD dataset:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"rajpurkar/squad\", split=\"train\")\n>>> dataset.features\n{'id': Value('string'),\n 'title': Value('string'),\n 'context': Value('string'),\n 'question': Value('string'),\n 'answers': {'text': List(Value('string')),\n  'answer_start': List(Value('int32'))}}\n```\n\nThe `answers` field contains two subfields: `text` and `answer_start`. 
Use the [`~Dataset.flatten`] function to extract the subfields into their own separate columns:\n\n```py\n>>> flat_dataset = dataset.flatten()\n>>> flat_dataset\nDataset({\n    features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],\n num_rows: 87599\n})\n```\n\nNotice how the subfields are now their own independent columns: `answers.text` and `answers.answer_start`.\n\n## Map\n\nSome of the more powerful applications of 🤗 Datasets come from using the [`~Dataset.map`] function. The primary purpose of [`~Dataset.map`] is to speed up processing functions. It allows you to apply a processing function to each example in a dataset, independently or in batches. This function can even create new rows and columns.\n\nIn the following example, prefix each `sentence1` value in the dataset with `'My sentence: '`.\n\nStart by creating a function that adds `'My sentence: '` to the beginning of each sentence. The function needs to accept and output a `dict`:\n\n```py\n>>> def add_prefix(example):\n...     example[\"sentence1\"] = 'My sentence: ' + example[\"sentence1\"]\n...     return example\n```\n\nNow use [`~Dataset.map`] to apply the `add_prefix` function to the entire dataset:\n\n```py\n>>> updated_dataset = small_dataset.map(add_prefix)\n>>> updated_dataset[\"sentence1\"][:5]\n['My sentence: Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n\"My sentence: Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .\",\n'My sentence: They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',\n'My sentence: Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',\n]\n```\n\nLet's take a look at another example, except this time, you'll remove a column with [`~Dataset.map`]. 
When you remove a column, it is only removed after the example has been provided to the mapped function. This allows the mapped function to use the content of the columns before they are removed.\n\nSpecify the column to remove with the `remove_columns` parameter in [`~Dataset.map`]:\n\n```py\n>>> updated_dataset = dataset.map(lambda example: {\"new_sentence\": example[\"sentence1\"]}, remove_columns=[\"sentence1\"])\n>>> updated_dataset.column_names\n['sentence2', 'label', 'idx', 'new_sentence']\n```\n\n> [!TIP]\n> 🤗 Datasets also has a [`~Dataset.remove_columns`] function which is faster because it doesn't copy the data of the remaining columns.\n\nYou can also use [`~Dataset.map`] with indices if you set `with_indices=True`. The example below adds the index to the beginning of each sentence:\n\n```py\n>>> updated_dataset = dataset.map(lambda example, idx: {\"sentence2\": f\"{idx}: \" + example[\"sentence2\"]}, with_indices=True)\n>>> updated_dataset[\"sentence2\"][:5]\n['0: Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .',\n \"1: Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .\",\n \"2: On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .\",\n '3: Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .',\n '4: PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .'\n]\n```\n\n### Multiprocessing\n\nMultiprocessing significantly speeds up processing by parallelizing processes on the CPU. 
Set the `num_proc` parameter in [`~Dataset.map`] to set the number of processes to use:\n\n```py\n>>> updated_dataset = dataset.map(lambda example, idx: {\"sentence2\": f\"{idx}: \" + example[\"sentence2\"]}, with_indices=True, num_proc=4)\n```\n\nThe [`~Dataset.map`] also works with the rank of the process if you set `with_rank=True`. This is analogous to the `with_indices` parameter. The `with_rank` parameter in the mapped function goes after the `index` one if it is already present.\n\n```py\n>>> import torch\n>>> from multiprocess import set_start_method\n>>> from transformers import AutoTokenizer, AutoModelForCausalLM\n>>> from datasets import load_dataset\n>>>\n>>> # Get an example dataset\n>>> dataset = load_dataset(\"fka/awesome-chatgpt-prompts\", split=\"train\")\n>>>\n>>> # Get an example model and its tokenizer\n>>> model = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen1.5-0.5B-Chat\").eval()\n>>> tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen1.5-0.5B-Chat\")\n>>>\n>>> def gpu_computation(batch, rank):\n...     # Move the model on the right GPU if it's not there already\n...     device = f\"cuda:{(rank or 0) % torch.cuda.device_count()}\"\n...     model.to(device)\n...\n...     # Your big GPU call goes here, for example:\n...     chats = [[\n...         {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n...         {\"role\": \"user\", \"content\": prompt}\n...     ] for prompt in batch[\"prompt\"]]\n...     texts = [tokenizer.apply_chat_template(\n...         chat,\n...         tokenize=False,\n...         add_generation_prompt=True\n...     ) for chat in chats]\n...     model_inputs = tokenizer(texts, padding=True, return_tensors=\"pt\").to(device)\n...     with torch.no_grad():\n...         outputs = model.generate(**model_inputs, max_new_tokens=512)\n...     batch[\"output\"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)\n...     return batch\n>>>\n>>> if __name__ == \"__main__\":\n...     
set_start_method(\"spawn\")\n...     updated_dataset = dataset.map(\n...         gpu_computation,\n...         batched=True,\n...         batch_size=16,\n...         with_rank=True,\n...         num_proc=torch.cuda.device_count(),  # one process per GPU\n...     )\n```\n\nThe main use-case for rank is to parallelize computation across several GPUs. This requires setting `multiprocess.set_start_method(\"spawn\")`. If you don't you'll receive the following CUDA error:\n\n```bash\nRuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method.\n```\n\n### Batch processing\n\nThe [`~Dataset.map`] function supports working with batches of examples. Operate on batches by setting `batched=True`. The default batch size is 1000, but you can adjust it with the `batch_size` parameter. Batch processing enables interesting applications such as splitting long sentences into shorter chunks and data augmentation.\n\n#### Split long examples\n\nWhen examples are too long, you may want to split them into several smaller chunks. Begin by creating a function that:\n\n1. Splits the `sentence1` field into chunks of 50 characters.\n\n2. Stacks all the chunks together to create the new dataset.\n\n```py\n>>> def chunk_examples(examples):\n...     chunks = []\n...     for sentence in examples[\"sentence1\"]:\n...         chunks += [sentence[i:i + 50] for i in range(0, len(sentence), 50)]\n...     
return {\"chunks\": chunks}\n```\n\nApply the function with [`~Dataset.map`]:\n\n```py\n>>> chunked_dataset = dataset.map(chunk_examples, batched=True, remove_columns=dataset.column_names)\n>>> chunked_dataset[:10]\n{'chunks': ['Amrozi accused his brother , whom he called \" the ',\n            'witness \" , of deliberately distorting his evidenc',\n            'e .',\n            \"Yucaipa owned Dominick 's before selling the chain\",\n            ' to Safeway in 1998 for $ 2.5 billion .',\n            'They had published an advertisement on the Interne',\n            't on June 10 , offering the cargo for sale , he ad',\n            'ded .',\n            'Around 0335 GMT , Tab shares were up 19 cents , or',\n            ' 4.4 % , at A $ 4.56 , having earlier set a record']}\n```\n\nNotice how the sentences are split into shorter chunks now, and there are more rows in the dataset.\n\n```py\n>>> dataset\nDataset({\n features: ['sentence1', 'sentence2', 'label', 'idx'],\n num_rows: 3668\n})\n>>> chunked_dataset\nDataset({\n    features: ['chunks'],\n    num_rows: 10470\n})\n```\n\n#### Data augmentation\n\nThe [`~Dataset.map`] function could also be used for data augmentation. The following example generates additional words for a masked token in a sentence.\n\nLoad and use the [RoBERTA](https://huggingface.co/roberta-base) model in 🤗 Transformers' [FillMaskPipeline](https://huggingface.co/transformers/main_classes/pipelines#transformers.FillMaskPipeline):\n\n```py\n>>> from random import randint\n>>> from transformers import pipeline\n\n>>> fillmask = pipeline(\"fill-mask\", model=\"roberta-base\")\n>>> mask_token = fillmask.tokenizer.mask_token\n>>> smaller_dataset = dataset.filter(lambda e, i: i<100, with_indices=True)\n```\n\nCreate a function to randomly select a word to mask in the sentence. The function should also return the original sentence and the top two replacements generated by RoBERTA.\n\n```py\n>>> def augment_data(examples):\n...     
outputs = []\n...     for sentence in examples[\"sentence1\"]:\n...         words = sentence.split(' ')\n...         K = randint(1, len(words)-1)\n...         masked_sentence = \" \".join(words[:K]  + [mask_token] + words[K+1:])\n...         predictions = fillmask(masked_sentence)\n...         augmented_sequences = [predictions[i][\"sequence\"] for i in range(3)]\n...         outputs += [sentence] + augmented_sequences\n...\n...     return {\"data\": outputs}\n```\n\nUse [`~Dataset.map`] to apply the function over the whole dataset:\n\n```py\n>>> augmented_dataset = smaller_dataset.map(augment_data, batched=True, remove_columns=dataset.column_names, batch_size=8)\n>>> augmented_dataset[:9][\"data\"]\n['Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n 'Amrozi accused his brother, whom he called \" the witness \", of deliberately withholding his evidence.',\n 'Amrozi accused his brother, whom he called \" the witness \", of deliberately suppressing his evidence.',\n 'Amrozi accused his brother, whom he called \" the witness \", of deliberately destroying his evidence.',\n \"Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .\",\n 'Yucaipa owned Dominick Stores before selling the chain to Safeway in 1998 for $ 2.5 billion.',\n \"Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $ 2.5 billion.\",\n 'Yucaipa owned Dominick Pizza before selling the chain to Safeway in 1998 for $ 2.5 billion.'\n]\n```\n\nFor each original sentence, RoBERTA augmented a random word with three alternatives. 
The original word `distorting` is supplemented by `withholding`, `suppressing`, and `destroying`.\n\n### Asynchronous processing\n\nAsynchronous functions are useful to call API endpoints in parallel, for example to download content like images or call a model endpoint.\n\nYou can define an asynchronous function using the `async` and `await` keywords, here is an example function to call a chat model from Hugging Face:\n\n```python\n>>> import aiohttp\n>>> import asyncio\n>>> from huggingface_hub import get_token\n>>> sem = asyncio.Semaphore(20)  # max number of simultaneous queries\n>>> async def query_model(model, prompt):\n...     api_url = f\"https://api-inference.huggingface.co/models/{model}/v1/chat/completions\"\n...     headers = {\"Authorization\": f\"Bearer {get_token()}\", \"Content-Type\": \"application/json\"}\n...     json = {\"messages\": [{\"role\": \"user\", \"content\": prompt}], \"max_tokens\": 20, \"seed\": 42}\n...     async with sem, aiohttp.ClientSession() as session, session.post(api_url, headers=headers, json=json) as response:\n...         output = await response.json()\n...         return {\"Output\": output[\"choices\"][0][\"message\"][\"content\"]}\n```\n\nAsynchronous functions run in parallel, which accelerates the process a lot. The same code takes a lot more time if it's run sequentially, because it does nothing while waiting for the model response. It is generally recommended to use `async` / `await` when your function has to wait for a response from an API for example, or if it downloads data and it can take some time.\n\nNote the presence of a `Semaphore`: it sets the maximum number of queries that can run in parallel. 
It is recommended to use a `Semaphore` when calling APIs to avoid rate limit errors.\n\nLet's use it to call the [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) model and ask it to return the main topic of each math problem in the [Maxwell-Jia/AIME_2024](https://huggingface.co/Maxwell-Jia/AIME_2024) dataset:\n\n````python\n>>> from datasets import load_dataset\n>>> ds = load_dataset(\"Maxwell-Jia/AIME_2024\", split=\"train\")\n>>> model = \"microsoft/Phi-3-mini-4k-instruct\"\n>>> prompt = 'What is this text mainly about ? Here is the text:\\n\\n```\\n{Problem}\\n```\\n\\nReply using one or two words max, e.g. \"The main topic is Linear Algebra\".'\n>>> async def get_topic(example):\n...     return await query_model(model, prompt.format(Problem=example['Problem']))\n>>> ds = ds.map(get_topic)\n>>> ds[0]\n{'ID': '2024-II-4',\n 'Problem': 'Let $x,y$ and $z$ be positive real numbers that...',\n 'Solution': 'Denote $\\\\log_2(x) = a$, $\\\\log_2(y) = b$, and...,\n 'Answer': 33,\n 'Output': 'The main topic is Logarithms.'}\n````\n\nHere, [`Dataset.map`] runs many `get_topic` functions asynchronously so it doesn't have to wait for every single model response which would take a lot of time to do sequentially.\n\nBy default, [`Dataset.map`] runs up to one thousand map functions in parallel, so don't forget to set the maximum number of API calls that can run in parallel with a `Semaphore`, otherwise the model could return rate limit errors or overload. For advanced use cases, you can change the maximum number of queries in parallel in `datasets.config`.\n\n### Process multiple splits\n\nMany datasets have splits that can be processed simultaneously with [`DatasetDict.map`]. 
For example, tokenize the `sentence1` field in the train and test split by:\n\n```py\n>>> from datasets import load_dataset\n\n# load all the splits\n>>> dataset = load_dataset('nyu-mll/glue', 'mrpc')\n>>> encoded_dataset = dataset.map(lambda examples: tokenizer(examples[\"sentence1\"]), batched=True)\n>>> encoded_dataset[\"train\"][0]\n{'sentence1': 'Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n'sentence2': 'Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .',\n'label': 1,\n'idx': 0,\n'input_ids': [  101,  7277,  2180,  5303,  4806,  1117,  1711,   117,  2292, 1119,  1270,   107,  1103,  7737,   107,   117,  1104,  9938, 4267, 12223, 21811,  1117,  2554,   119,   102],\n'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n}\n```\n\n### Distributed usage\n\nWhen you use [`~Dataset.map`] in a distributed setting, you should also use [torch.distributed.barrier](https://pytorch.org/docs/stable/distributed?highlight=barrier#torch.distributed.barrier). This ensures the main process performs the mapping, while the other processes load the results, thereby avoiding duplicate work.\n\nThe following example shows how you can use `torch.distributed.barrier` to synchronize the processes:\n\n```py\n>>> from datasets import Dataset\n>>> import torch.distributed\n\n>>> dataset1 = Dataset.from_dict({\"a\": [0, 1, 2]})\n\n>>> if training_args.local_rank > 0:\n...     print(\"Waiting for main process to perform the mapping\")\n...     torch.distributed.barrier()\n\n>>> dataset2 = dataset1.map(lambda x: {\"a\": x[\"a\"] + 1})\n\n>>> if training_args.local_rank == 0:\n...     print(\"Loading results from main process\")\n...     
torch.distributed.barrier()\n```\n\n## Batch\n\nThe [`~Dataset.batch`] method allows you to group samples from the dataset into batches. This is particularly useful when you want to create batches of data for training or evaluation, especially when working with deep learning models.\n\nHere's an example of how to use the `batch()` method:\n\n```python\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n>>> batched_dataset = dataset.batch(batch_size=4)\n>>> batched_dataset[0]\n{'text': ['the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',\n        'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . tolkien\\'s middle-earth .',\n        'effective but too-tepid biopic',\n        'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .'],\n'label': [1, 1, 1, 1]}\n```\n\nThe `batch()` method accepts the following parameters:\n\n- `batch_size` (`int`): The number of samples in each batch.\n- `drop_last_batch` (`bool`, defaults to `False`): Whether to drop the last incomplete batch if the dataset size is not divisible by the batch size.\n- `num_proc` (`int`, optional, defaults to `None`): The number of processes to use for multiprocessing. If None, no multiprocessing is used. This can significantly speed up batching for large datasets.\n\nNote that `Dataset.batch()` returns a new [`Dataset`] where each item is a batch of multiple samples from the original dataset. 
If you want to process data in batches, you should use a batched [`~Dataset.map`] directly, which applies a function to batches but the output dataset is unbatched.\n\n## Concatenate\n\nSeparate datasets can be concatenated if they share the same column types. Concatenate datasets with [`concatenate_datasets`]:\n\n```py\n>>> from datasets import concatenate_datasets, load_dataset\n\n>>> stories = load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=\"train\")\n>>> stories = stories.select_columns([\"text\"])  # only keep the 'text' column\n>>> wiki = load_dataset(\"wikimedia/wikipedia\", \"20231101.en\", split=\"train\")\n>>> wiki = wiki.select_columns([\"text\"])  # only keep the 'text' column\n\n>>> assert stories.features.type == wiki.features.type\n>>> bert_dataset = concatenate_datasets([stories, wiki])\n```\n\nYou can also concatenate two datasets horizontally by setting `axis=1` as long as the datasets have the same number of rows:\n\n```py\n>>> from datasets import Dataset\n>>> stories_ids = Dataset.from_dict({\"ids\": list(range(len(stories)))})\n>>> stories_with_ids = concatenate_datasets([stories, stories_ids], axis=1)\n```\n\n### Interleave\n\nYou can also mix several datasets together by taking alternating examples from each one to create a new dataset. This is known as _interleaving_, which is enabled by the [`interleave_datasets`] function. 
Both [`interleave_datasets`] and [`concatenate_datasets`] work with regular [`Dataset`] and [`IterableDataset`] objects.\nRefer to the [Stream](./stream#interleave) guide for an example of how to interleave [`IterableDataset`] objects.\n\nYou can define sampling probabilities for each of the original datasets to specify how to interleave the datasets.\nIn this case, the new dataset is constructed by getting examples one by one from a random dataset until one of the datasets runs out of samples.\n\n```py\n>>> from datasets import Dataset, interleave_datasets\n>>> seed = 42\n>>> probabilities = [0.3, 0.5, 0.2]\n>>> d1 = Dataset.from_dict({\"a\": [0, 1, 2]})\n>>> d2 = Dataset.from_dict({\"a\": [10, 11, 12, 13]})\n>>> d3 = Dataset.from_dict({\"a\": [20, 21, 22]})\n>>> dataset = interleave_datasets([d1, d2, d3], probabilities=probabilities, seed=seed)\n>>> dataset[\"a\"]\n[10, 11, 20, 12, 0, 21, 13]\n```\n\nYou can also specify the `stopping_strategy`. The default strategy, `first_exhausted`, is a subsampling strategy, i.e. the dataset construction is stopped as soon as one of the datasets runs out of samples.\nYou can specify `stopping_strategy=all_exhausted` to execute an oversampling strategy. In this case, the dataset construction is stopped as soon as every sample in every dataset has been added at least once. 
In practice, it means that if a dataset is exhausted, it will return to the beginning of this dataset until the stop criterion has been reached.\nNote that if no sampling probabilities are specified, the new dataset will have `max_length_datasets*nb_dataset` samples.\nThere is also `stopping_strategy=all_exhausted_without_replacement` to ensure that every sample is seen exactly once.\n\n```py\n>>> d1 = Dataset.from_dict({\"a\": [0, 1, 2]})\n>>> d2 = Dataset.from_dict({\"a\": [10, 11, 12, 13]})\n>>> d3 = Dataset.from_dict({\"a\": [20, 21, 22]})\n>>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy=\"all_exhausted\")\n>>> dataset[\"a\"]\n[0, 10, 20, 1, 11, 21, 2, 12, 22, 0, 13, 20]\n```\n\n## Format\n\nThe [`~Dataset.with_format`] function changes the format of a column to be compatible with some common data formats. Specify the output you'd like in the `type` parameter. You can also choose which columns you want to format using `columns=`. Formatting is applied on-the-fly.\n\nFor example, create PyTorch tensors by setting `type=\"torch\"`:\n\n```py\n>>> dataset = dataset.with_format(type=\"torch\")\n```\n\nThe [`~Dataset.set_format`] function also changes the format of a column, except it runs in-place:\n\n```py\n>>> dataset.set_format(type=\"torch\")\n```\n\nIf you need to reset the dataset to its original format, set the format to `None` (or use [`~Dataset.reset_format`]):\n\n```py\n>>> dataset.format\n{'type': 'torch', 'format_kwargs': {}, 'columns': [...], 'output_all_columns': False}\n>>> dataset = dataset.with_format(None)\n>>> dataset.format\n{'type': None, 'format_kwargs': {}, 'columns': [...], 'output_all_columns': False}\n```\n\n### Tensors formats\n\nSeveral tensors or arrays formats are supported. 
It is generally recommended to use these formats instead of converting outputs of a dataset to tensors or arrays manually to avoid unnecessary data copies and accelerate data loading.\n\nHere is the list of supported tensors or arrays formats:\n\n- NumPy: format name is \"numpy\", for more information see [Using Datasets with NumPy](use_with_numpy)\n- PyTorch: format name is \"torch\", for more information see [Using Datasets with PyTorch](use_with_pytorch)\n- TensorFlow: format name is \"tensorflow\", for more information see [Using Datasets with TensorFlow](use_with_tensorflow)\n- JAX: format name is \"jax\", for more information see [Using Datasets with JAX](use_with_jax)\n\n> [!TIP]\n> Check out the [Using Datasets with TensorFlow](use_with_tensorflow#using-totfdataset) guide for more details on how to efficiently create a TensorFlow dataset.\n\nWhen a dataset is formatted in a tensor or array format, all the data are formatted as tensors or arrays (except unsupported types like strings for example for PyTorch):\n\n```python\n>>> ds = Dataset.from_dict({\"text\": [\"foo\", \"bar\"], \"tokens\": [[0, 1, 2], [3, 4, 5]]})\n>>> ds = ds.with_format(\"torch\")\n>>> ds[0]\n{'text': 'foo', 'tokens': tensor([0, 1, 2])}\n>>> ds[:2]\n{'text': ['foo', 'bar'],\n 'tokens': tensor([[0, 1, 2],\n         [3, 4, 5]])}\n```\n\n### Tabular formats\n\nYou can use a dataframes or tables format to optimize data loading and data processing, since they generally offer zero-copy operations and transforms written in low-level languages.\n\nHere is the list of supported dataframes or tables formats:\n\n- Pandas: format name is \"pandas\", for more information see [Using Datasets with Pandas](use_with_pandas)\n- Polars: format name is \"polars\", for more information see [Using Datasets with Polars](use_with_polars)\n- PyArrow: format name is \"arrow\", for more information see [Using Datasets with PyArrow](use_with_pyarrow)\n\nWhen a dataset is formatted in a dataframe or table format, 
every dataset row or batch of rows is formatted as a dataframe or table, and dataset columns are formatted as a series or array:\n\n```python\n>>> ds = Dataset.from_dict({\"text\": [\"foo\", \"bar\"], \"label\": [0, 1]})\n>>> ds = ds.with_format(\"pandas\")\n>>> ds[:2]\n  text  label\n0  foo      0\n1  bar      1\n```\n\nThose formats make it possible to iterate on the data faster by avoiding data copies, and also enable faster data processing in [`~Dataset.map`] or [`~Dataset.filter`]:\n\n```python\n>>> ds = ds.map(lambda df: df.assign(upper_text=df.text.str.upper()), batched=True)\n>>> ds[:2]\n  text  label upper_text\n0  foo      0        FOO\n1  bar      1        BAR\n```\n\n### Custom format transform\n\nThe [`~Dataset.with_transform`] function applies a custom formatting transform on-the-fly. This function replaces any previously specified format. For example, you can use this function to tokenize and pad tokens on-the-fly. Tokenization is only applied when examples are accessed:\n\n```py\n>>> from transformers import AutoTokenizer\n\n>>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n>>> def encode(batch):\n...     
return tokenizer(batch[\"sentence1\"], batch[\"sentence2\"], padding=\"longest\", truncation=True, max_length=512, return_tensors=\"pt\")\n>>> dataset = dataset.with_transform(encode)\n>>> dataset.format\n{'type': 'custom', 'format_kwargs': {'transform': <function __main__.encode(batch)>}, 'columns': ['idx', 'label', 'sentence1', 'sentence2'], 'output_all_columns': False}\n```\n\nThere is also [`~Dataset.set_transform`] which does the same but runs in-place.\n\nYou can also use the [`~Dataset.with_transform`] function for custom decoding on [`Features`].\n\nThe example below uses the [`pydub`](http://pydub.com/) package as an alternative to `torchcodec` decoding:\n\n```py\n>>> import numpy as np\n>>> from pydub import AudioSegment\n\n>>> audio_dataset_amr = Dataset.from_dict({\"audio\": [\"audio_samples/audio.amr\"]})\n\n>>> def decode_audio_with_pydub(batch, sampling_rate=16_000):\n...     def pydub_decode_file(audio_path):\n...         sound = AudioSegment.from_file(audio_path)\n...         if sound.frame_rate != sampling_rate:\n...             sound = sound.set_frame_rate(sampling_rate)\n...         channel_sounds = sound.split_to_mono()\n...         samples = [s.get_array_of_samples() for s in channel_sounds]\n...         fp_arr = np.array(samples).T.astype(np.float32)\n...         fp_arr /= np.iinfo(samples[0].typecode).max\n...         return fp_arr\n...\n...     batch[\"audio\"] = [pydub_decode_file(audio_path) for audio_path in batch[\"audio\"]]\n...     return batch\n\n>>> audio_dataset_amr.set_transform(decode_audio_with_pydub)\n```\n\n## Save\n\nOnce your dataset is ready, you can save it as a Hugging Face Dataset in Parquet format and reuse it later with [`load_dataset`].\n\nSave your dataset by providing the name of the dataset repository on Hugging Face you wish to save it to to [`~Dataset.push_to_hub`]:\n\n```python\nencoded_dataset.push_to_hub(\"username/my_dataset\")\n```\n\nYou can use multiple processes to upload it in parallel. 
This is especially useful if you want to speed up the process:\n\n```python\ndataset.push_to_hub(\"username/my_dataset\", num_proc=8)\n```\n\nUse the [`load_dataset`] function to reload the dataset (in streaming mode or not):\n\n```python\nfrom datasets import load_dataset\nreloaded_dataset = load_dataset(\"username/my_dataset\", streaming=True)\n```\n\nAlternatively, you can save it locally in Arrow format on disk. Compared to Parquet, Arrow is uncompressed which makes it much faster to reload which is great for local use on disk and ephemeral caching. But since it's larger and with less metadata, it is slower to upload/download/query than Parquet and less suited for long term storage.\n\nUse the [`~Dataset.save_to_disk`] and [`load_from_disk`] functions to save the dataset to your disk and reload it:\n\n```py\n>>> encoded_dataset.save_to_disk(\"path/of/my/dataset/directory\")\n>>> # later\n>>> from datasets import load_from_disk\n>>> reloaded_dataset = load_from_disk(\"path/of/my/dataset/directory\")\n```\n\n## Export\n\n🤗 Datasets supports exporting as well so you can work with your dataset in other applications. 
The following table shows currently supported file formats you can export to:\n\n| File type               | Export method                                                       |\n| ----------------------- | ------------------------------------------------------------------- |\n| CSV                     | [`Dataset.to_csv`]                                                  |\n| JSON                    | [`Dataset.to_json`]                                                 |\n| Parquet                 | [`Dataset.to_parquet`]                                              |\n| SQL                     | [`Dataset.to_sql`]                                                  |\n| In-memory Python object | [`Dataset.to_pandas`], [`Dataset.to_polars`] or [`Dataset.to_dict`] |\n\nFor example, export your dataset to a CSV file like this:\n\n```py\n>>> encoded_dataset.to_csv(\"path/of/my/dataset.csv\")\n```\n\nUse a `hf://` path to export to a [Dataset repository](https://huggingface.co/docs/hub/datasets-overview) or a [Storage Bucket](https://huggingface.co/docs/hub/storage-buckets) on Hugging Face:\n\n```py\n>>> encoded_dataset.to_csv(\"hf://datasets/username/dataset_name/path/of/my/dataset.csv\")\n>>> encoded_dataset.to_csv(\"hf://buckets/username/raw_data_bucket/path/of/my/dataset.csv\")\n```\n"
  },
  {
    "path": "docs/source/quickstart.mdx",
    "content": "<!--Copyright 2023 The HuggingFace Team. All rights reserved.\n\nLicensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with\nthe License. You may obtain a copy of the License at\n\nhttp://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software distributed under the License is distributed on\nan \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the\nspecific language governing permissions and limitations under the License.\n-->\n\n# Quickstart\n\n[[open-in-colab]]\n\nThis quickstart is intended for developers who are ready to dive into the code and see an example of how to integrate 🤗 Datasets into their model training workflow. If you're a beginner, we recommend starting with our [tutorials](./tutorial), where you'll get a more thorough introduction.\n\nEach dataset is unique, and depending on the task, some datasets may require additional steps to prepare it for training. But you can always use 🤗 Datasets tools to load and process a dataset. The fastest and easiest way to get started is by loading an existing dataset from the [Hugging Face Hub](https://huggingface.co/datasets). There are thousands of datasets to choose from, spanning many tasks. 
Choose the type of dataset you want to work with, and let's get started!\n\n<div class=\"mt-4\">\n  <div class=\"w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-3 md:gap-y-4 md:gap-x-5\">\n    <a\n      class=\"!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg\"\n      href=\"#audio\"\n    >\n      <div class=\"w-full text-center bg-gradient-to-r from-violet-300 via-sky-400 to-green-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed\">\n        Audio\n      </div>\n      <p class=\"text-gray-700\">\n        Resample an audio dataset and get it ready for a model to classify what\n        type of banking issue a speaker is calling about.\n      </p>\n    </a>\n    <a\n      class=\"!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg\"\n      href=\"#vision\"\n    >\n      <div class=\"w-full text-center bg-gradient-to-r from-pink-400 via-purple-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed\">\n        Vision\n      </div>\n      <p class=\"text-gray-700\">\n        Apply data augmentation to an image dataset and get it ready for a model\n        to diagnose disease in bean plants.\n      </p>\n    </a>\n    <a\n      class=\"!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg\"\n      href=\"#nlp\"\n    >\n      <div class=\"w-full text-center bg-gradient-to-r from-orange-300 via-red-400 to-violet-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed\">\n        NLP\n      </div>\n      <p class=\"text-gray-700\">\n        Tokenize a dataset and get it ready for a model to determine whether a\n        pair of sentences have the same meaning.\n      </p>\n    </a>\n  </div>\n</div>\n\n> [!TIP]\n> Check out [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course to learn more about other important topics such as loading remote or local datasets, 
tools for cleaning up a dataset, and creating your own dataset.\n\nStart by installing 🤗 Datasets:\n\n```bash\npip install datasets\n```\n\n🤗 Datasets also supports audio and image data formats:\n\n- To work with audio datasets, install the [`Audio`] feature:\n\n  ```bash\n  pip install datasets[audio]\n  ```\n\n- To work with image datasets, install the [`Image`] feature:\n\n  ```bash\n  pip install datasets[vision]\n  ```\n\nBesides 🤗 Datasets, make sure your preferred machine learning framework is installed:\n\n<frameworkcontent>\n  <pt>```bash pip install torch ```</pt>\n  <tf>```bash pip install tensorflow ```</tf>\n</frameworkcontent>\n\n## Audio\n\nAudio datasets are loaded just like text datasets. However, an audio dataset is preprocessed a bit differently. Instead of a tokenizer, you'll need a [feature extractor](https://huggingface.co/docs/transformers/main_classes/feature_extractor#feature-extractor). An audio input may also require resampling its sampling rate to match the sampling rate of the pretrained model you're using. In this quickstart, you'll prepare the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset for a model to train on and classify the banking issue a customer is having.\n\n**1**. Load the MInDS-14 dataset by providing the [`load_dataset`] function with the dataset name, dataset configuration (not all datasets will have a configuration), and a dataset split:\n\n```py\n>>> from datasets import load_dataset, Audio\n\n>>> dataset = load_dataset(\"PolyAI/minds14\", \"en-US\", split=\"train\")\n```\n\n**2**. Next, load a pretrained [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) model and its corresponding feature extractor from the [🤗 Transformers](https://huggingface.co/transformers/) library. It is totally normal to see a warning after you load the model about some weights not being initialized. 
This is expected because you are loading this model checkpoint for training with another task.\n\n```py\n>>> from transformers import AutoModelForAudioClassification, AutoFeatureExtractor\n\n>>> model = AutoModelForAudioClassification.from_pretrained(\"facebook/wav2vec2-base\")\n>>> feature_extractor = AutoFeatureExtractor.from_pretrained(\"facebook/wav2vec2-base\")\n```\n\n**3**. The [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset card indicates the sampling rate is 8kHz, but the Wav2Vec2 model was pretrained on a sampling rate of 16kHz. You'll need to upsample the `audio` column with the [`~Dataset.cast_column`] function and [`Audio`] feature to match the model's sampling rate.\n\n```py\n>>> dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=16000))\n>>> dataset[0][\"audio\"]\n<datasets.features._torchcodec.AudioDecoder object at 0x11642b6a0>\n```\n\n**4**. Create a function to preprocess the audio `array` with the feature extractor, and truncate and pad the sequences into tidy rectangular tensors. The most important thing to remember is to call the audio `array` in the feature extractor since the `array` - the actual speech signal - is the model input.\n\nOnce you have a preprocessing function, use the [`~Dataset.map`] function to speed up processing by applying the function to batches of examples in the dataset.\n\n```py\n>>> def preprocess_function(examples):\n...     audio_arrays = [x.get_all_samples().data for x in examples[\"audio\"]]\n...     inputs = feature_extractor(\n...         audio_arrays,\n...         sampling_rate=16000,\n...         padding=True,\n...         max_length=100000,\n...         truncation=True,\n...     )\n...     return inputs\n\n>>> dataset = dataset.map(preprocess_function, batched=True)\n```\n\n**5**. 
Use the [`~Dataset.rename_column`] function to rename the `intent_class` column to `labels`, which is the expected input name in [Wav2Vec2ForSequenceClassification](https://huggingface.co/docs/transformers/main/en/model_doc/wav2vec2#transformers.Wav2Vec2ForSequenceClassification):\n\n```py\n>>> dataset = dataset.rename_column(\"intent_class\", \"labels\")\n```\n\n**6**. Set the dataset format according to the machine learning framework you're using.\n\n<frameworkcontent>\n<pt>\nUse the [`~Dataset.set_format`] function to set the dataset format to `torch` and specify the columns you want to format. This function applies formatting on-the-fly. After converting to PyTorch tensors, wrap the dataset in [`torch.utils.data.DataLoader`](https://alband.github.io/doc_view/data.html?highlight=torch%20utils%20data%20dataloader#torch.utils.data.DataLoader):\n\n```py\n>>> from torch.utils.data import DataLoader\n\n>>> dataset.set_format(type=\"torch\", columns=[\"input_values\", \"labels\"])\n>>> dataloader = DataLoader(dataset, batch_size=4)\n```\n\n</pt>\n<tf>\n\nUse the [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] method from 🤗 Transformers to prepare the dataset to be compatible with\nTensorFlow, and ready to train/fine-tune a model, as it wraps a HuggingFace [`~datasets.Dataset`] as a `tf.data.Dataset`\nwith collation and batching, so one can pass it directly to Keras methods like `fit()` without further modification.\n\n```py\n>>> import tensorflow as tf\n\n>>> tf_dataset = model.prepare_tf_dataset(\n...     dataset,\n...     batch_size=4,\n...     shuffle=True,\n... )\n```\n\n</tf>\n</frameworkcontent>\n\n**7**. Start training with your machine learning framework! Check out the 🤗 Transformers [audio classification guide](https://huggingface.co/docs/transformers/tasks/audio_classification) for an end-to-end example of how to train a model on an audio dataset.\n\n## Vision\n\nImage datasets are loaded just like text datasets. 
However, instead of a tokenizer, you'll need a [feature extractor](https://huggingface.co/docs/transformers/main_classes/feature_extractor#feature-extractor) to preprocess the dataset. Applying data augmentation to an image is common in computer vision to make the model more robust against overfitting. You're free to use any data augmentation library you want, and then you can apply the augmentations with 🤗 Datasets. In this quickstart, you'll load the [Beans](https://huggingface.co/datasets/beans) dataset and get it ready for the model to train on and identify disease from the leaf images.\n\n**1**. Load the Beans dataset by providing the [`load_dataset`] function with the dataset name and a dataset split:\n\n```py\n>>> from datasets import load_dataset, Image\n\n>>> dataset = load_dataset(\"AI-Lab-Makerere/beans\", split=\"train\")\n```\n\nMost image models work with RGB images. If your dataset contains images in a different mode, you can use the [`~Dataset.cast_column`] function to set the mode to RGB:\n\n```py\n>>> dataset = dataset.cast_column(\"image\", Image(mode=\"RGB\"))\n```\n\nThe Beans dataset contains only RGB images, so this step is unnecessary here.\n\n**2**. Now you can add some data augmentations with any library ([Albumentations](https://albumentations.ai/), [imgaug](https://imgaug.readthedocs.io/en/latest/), [Kornia](https://kornia.readthedocs.io/en/latest/)) you like. Here, you'll use [torchvision](https://pytorch.org/vision/stable/transforms.html) to randomly change the color properties of an image:\n\n```py\n>>> from torchvision.transforms import Compose, ColorJitter, ToTensor\n\n>>> jitter = Compose(\n...     [ColorJitter(brightness=0.5, hue=0.5), ToTensor()]\n... )\n```\n\n**3**. Create a function to apply your transform to the dataset and generate the model input: `pixel_values`.\n\n```python\n>>> def transforms(examples):\n...     examples[\"pixel_values\"] = [jitter(image.convert(\"RGB\")) for image in examples[\"image\"]]\n...     
return examples\n```\n\n**4**. Use the [`~Dataset.with_transform`] function to apply the data augmentations on-the-fly:\n\n```py\n>>> dataset = dataset.with_transform(transforms)\n```\n\n**5**. Set the dataset format according to the machine learning framework you're using.\n\n<frameworkcontent>\n<pt>\nWrap the dataset in [`torch.utils.data.DataLoader`](https://alband.github.io/doc_view/data.html?highlight=torch%20utils%20data%20dataloader#torch.utils.data.DataLoader). You'll also need to create a collate function to collate the samples into batches:\n\n```py\n>>> from torch.utils.data import DataLoader\n\n>>> def collate_fn(examples):\n...     images = []\n...     labels = []\n...     for example in examples:\n...         images.append((example[\"pixel_values\"]))\n...         labels.append(example[\"labels\"])\n...\n...     pixel_values = torch.stack(images)\n...     labels = torch.tensor(labels)\n...     return {\"pixel_values\": pixel_values, \"labels\": labels}\n>>> dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=4)\n```\n\n</pt>\n<tf>\n\nUse the [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] method from 🤗 Transformers to prepare the dataset to be compatible with\nTensorFlow, and ready to train/fine-tune a model, as it wraps a HuggingFace [`~datasets.Dataset`] as a `tf.data.Dataset`\nwith collation and batching, so one can pass it directly to Keras methods like `fit()` without further modification.\n\nBefore you start, make sure you have up-to-date versions of `albumentations` and `cv2` installed:\n\n```bash\npip install -U albumentations opencv-python\n```\n\n```py\n>>> import albumentations\n>>> import numpy as np\n\n>>> transform = albumentations.Compose([\n...     albumentations.RandomCrop(width=256, height=256),\n...     albumentations.HorizontalFlip(p=0.5),\n...     albumentations.RandomBrightnessContrast(p=0.2),\n... ])\n\n>>> def transforms(examples):\n...     examples[\"pixel_values\"] = [\n...         
transform(image=np.array(image))[\"image\"] for image in examples[\"image\"]\n...     ]\n...     return examples\n\n>>> dataset.set_transform(transforms)\n>>> tf_dataset = model.prepare_tf_dataset(\n...     dataset,\n...     batch_size=4,\n...     shuffle=True,\n... )\n```\n\n</tf>\n</frameworkcontent>\n\n**6**. Start training with your machine learning framework! Check out the 🤗 Transformers [image classification guide](https://huggingface.co/docs/transformers/tasks/image_classification) for an end-to-end example of how to train a model on an image dataset.\n\n## NLP\n\nText needs to be tokenized into individual tokens by a [tokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer). For the quickstart, you'll load the [Microsoft Research Paraphrase Corpus (MRPC)](https://huggingface.co/datasets/nyu-mll/glue/viewer/mrpc) training dataset to train a model to determine whether a pair of sentences mean the same thing.\n\n**1**. Load the MRPC dataset by providing the [`load_dataset`] function with the dataset name, dataset configuration (not all datasets will have a configuration), and dataset split:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"nyu-mll/glue\", \"mrpc\", split=\"train\")\n```\n\n**2**. Next, load a pretrained [BERT](https://huggingface.co/bert-base-uncased) model and its corresponding tokenizer from the [🤗 Transformers](https://huggingface.co/transformers/) library. It is totally normal to see a warning after you load the model about some weights not being initialized. 
This is expected because you are loading this model checkpoint for training with another task.\n\n```py\n>>> from transformers import AutoModelForSequenceClassification, AutoTokenizer\n\n>>> model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-uncased\")\n>>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n===PT-TF-SPLIT===\n>>> from transformers import TFAutoModelForSequenceClassification, AutoTokenizer\n\n>>> model = TFAutoModelForSequenceClassification.from_pretrained(\"bert-base-uncased\")\n>>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n```\n\n**3**. Create a function to tokenize the dataset, and you should also truncate and pad the text into tidy rectangular tensors. The tokenizer generates three new columns in the dataset: `input_ids`, `token_type_ids`, and an `attention_mask`. These are the model inputs.\n\nUse the [`~Dataset.map`] function to speed up processing by applying your tokenization function to batches of examples in the dataset:\n\n```py\n>>> def encode(examples):\n...     
return tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, padding=\"max_length\")\n\n>>> dataset = dataset.map(encode, batched=True)\n>>> dataset[0]\n{'sentence1': 'Amrozi accused his brother , whom he called \" the witness \" , of deliberately distorting his evidence .',\n'sentence2': 'Referring to him as only \" the witness \" , Amrozi accused his brother of deliberately distorting his evidence .',\n'label': 1,\n'idx': 0,\n'input_ids': [  101,  7277,  2180,  5303,  4806,  1117,  1711,   117,  2292, 1119,  1270,   107,  1103,  7737,   107,   117,  1104,  9938, 4267, 12223, 21811,  1117,  2554,   119,   102, 11336,  6732, 3384,  1106,  1140,  1112,  1178,   107,  1103,  7737,   107, 117,  7277,  2180,  5303,  4806,  1117,  1711,  1104,  9938, 4267, 12223, 21811,  1117,  2554,   119,   102, 0, 0, ...],\n'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...],\n'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...]}\n```\n\n**4**. Rename the `label` column to `labels`, which is the expected input name in [BertForSequenceClassification](https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification):\n\n```py\n>>> dataset = dataset.map(lambda examples: {\"labels\": examples[\"label\"]}, batched=True)\n```\n\n**5**. Set the dataset format according to the machine learning framework you're using.\n\n<frameworkcontent>\n<pt>\nUse the [`~Dataset.with_format`] function to set the dataset format to `torch` and specify the columns you want to format. This function applies formatting on-the-fly. 
After converting to PyTorch tensors, wrap the dataset in [`torch.utils.data.DataLoader`](https://alband.github.io/doc_view/data.html?highlight=torch%20utils%20data%20dataloader#torch.utils.data.DataLoader):\n\n```py\n>>> import torch\n\n>>> dataset = dataset.select_columns([\"input_ids\", \"token_type_ids\", \"attention_mask\", \"labels\"])\n>>> dataset = dataset.with_format(type=\"torch\")\n>>> dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)\n```\n\n</pt>\n<tf>\n\nUse the [`~transformers.TFPreTrainedModel.prepare_tf_dataset`] method from 🤗 Transformers to prepare the dataset to be compatible with\nTensorFlow, and ready to train/fine-tune a model, as it wraps a HuggingFace [`~datasets.Dataset`] as a `tf.data.Dataset`\nwith collation and batching, so one can pass it directly to Keras methods like `fit()` without further modification.\n\n```py\n>>> import tensorflow as tf\n\n>>> tf_dataset = model.prepare_tf_dataset(\n...     dataset,\n...     batch_size=4,\n...     shuffle=True,\n... )\n```\n\n</tf>\n</frameworkcontent>\n\n**6**. Start training with your machine learning framework! Check out the 🤗 Transformers [text classification guide](https://huggingface.co/docs/transformers/tasks/sequence_classification) for an end-to-end example of how to train a model on a text dataset.\n\n## What's next?\n\nThis completes the 🤗 Datasets quickstart! You can load any text, audio, or image dataset with a single function and get it ready for your model to train on.\n\nFor your next steps, take a look at our [How-to guides](./how_to) and learn how to do more specific things like loading different dataset formats, aligning labels, and streaming large datasets. If you're interested in learning more about 🤗 Datasets core concepts, grab a cup of coffee and read our [Conceptual Guides](./about_arrow)!\n"
  },
  {
    "path": "docs/source/repository_structure.mdx",
    "content": "# Structure your repository\n\nTo host and share your dataset, create a dataset repository on the Hugging Face Hub and upload your data files.\n\nThis guide will show you how to structure your dataset repository when you upload it.\nA dataset with a supported structure and file format (`.txt`, `.csv`, `.parquet`, `.jsonl`, `.mp3`, `.jpg`, `.zip` etc.) are loaded automatically with [`~datasets.load_dataset`], and it'll have a dataset viewer on its dataset page on the Hub.\n\n## Main use-case\n\nThe simplest dataset structure has two files: `train.csv` and `test.csv` (this works with any supported file format).\n\nYour repository will also contain a `README.md` file, the [dataset card](dataset_card) displayed on your dataset page.\n\n```\nmy_dataset_repository/\n├── README.md\n├── train.csv\n└── test.csv\n```\n\nIn this simple case, you'll get a dataset with two splits: `train` (containing examples from `train.csv`) and `test` (containing examples from `test.csv`).\n\n## Define your splits and subsets in YAML\n\n## Splits\n\nIf you have multiple files and want to define which file goes into which split, you can use the YAML `configs` field at the top of your README.md.\n\nFor example, given a repository like this one:\n\n```\nmy_dataset_repository/\n├── README.md\n├── data.csv\n└── holdout.csv\n```\n\nYou can define your splits by adding the `configs` field in the YAML block at the top of your README.md:\n\n```yaml\n---\nconfigs:\n- config_name: default\n  data_files:\n  - split: train\n    path: \"data.csv\"\n  - split: test\n    path: \"holdout.csv\"\n---\n```\n\n\nYou can select multiple files per split using a list of paths:\n\n```\nmy_dataset_repository/\n├── README.md\n├── data/\n│   ├── abc.csv\n│   └── def.csv\n└── holdout/\n    └── ghi.csv\n```\n\n```yaml\n---\nconfigs:\n- config_name: default\n  data_files:\n  - split: train\n    path:\n    - \"data/abc.csv\"\n    - \"data/def.csv\"\n  - split: test\n    path: 
\"holdout/ghi.csv\"\n---\n```\n\nOr you can use glob patterns to automatically list all the files you need:\n\n```yaml\n---\nconfigs:\n- config_name: default\n  data_files:\n  - split: train\n    path: \"data/*.csv\"\n  - split: test\n    path: \"holdout/*.csv\"\n---\n```\n\n> [!WARNING]\n> Note that `config_name` field is required even if you have a single configuration.\n\n## Configurations\n\nYour dataset might have several subsets of data that you want to be able to load separately. In that case you can define a list of configurations inside the `configs` field in YAML:\n\n```\nmy_dataset_repository/\n├── README.md\n├── main_data.csv\n└── additional_data.csv\n```\n\n```yaml\n---\nconfigs:\n- config_name: main_data\n  data_files: \"main_data.csv\"\n- config_name: additional_data\n  data_files: \"additional_data.csv\"\n---\n```\n\nEach configuration is shown separately on the Hugging Face Hub, and can be loaded by passing its name as a second parameter:\n\n```python\nfrom datasets import load_dataset\n\nmain_data = load_dataset(\"my_dataset_repository\", \"main_data\")\nadditional_data = load_dataset(\"my_dataset_repository\", \"additional_data\")\n```\n\n## Builder parameters\n\nNot only `data_files`, but other builder-specific parameters can be passed via YAML, allowing for more flexibility on how to load the data while not requiring any custom code. For example, define which separator to use in which configuration to load your `csv` files:\n\n```yaml\n---\nconfigs:\n- config_name: tab\n  data_files: \"main_data.csv\"\n  sep: \"\\t\"\n- config_name: comma\n  data_files: \"additional_data.csv\"\n  sep: \",\"\n---\n```\n\nRefer to [specific builders' documentation](./package_reference/builder_classes) to see what configuration parameters they have.\n\n> [!TIP]\n> You can set a default configuration using `default: true`, e.g. 
you can run `main_data = load_dataset(\"my_dataset_repository\")` if you set:\n>\n> ```yaml\n> - config_name: main_data\n>   data_files: \"main_data.csv\"\n>   default: true\n> ```\n\n## Automatic splits detection\n\nIf no YAML is provided, 🤗 Datasets searches for certain patterns in the dataset repository to automatically infer the dataset splits.\nThe patterns are checked in order, from the custom filename split format down to treating all files as a single split if no pattern is found.\n\n### Directory name\n\nYour data files may also be placed into different directories named `train`, `test`, and `validation` where each directory contains the data files for that split:\n\n```\nmy_dataset_repository/\n├── README.md\n└── data/\n    ├── train/\n    │   └── bees.csv\n    ├── test/\n    │   └── more_bees.csv\n    └── validation/\n        └── even_more_bees.csv\n```\n\n### Filename splits\n\nIf you don't have any non-traditional splits, then you can place the split name anywhere in the data file name and it is automatically inferred. The only rule is that the split name must be delimited by non-word characters, like `test-file.csv` for example instead of `testfile.csv`. 
Supported delimiters include underscores, dashes, spaces, dots, and numbers.\n\nFor example, the following file names are all acceptable:\n\n- train split: `train.csv`, `my_train_file.csv`, `train1.csv`\n- validation split: `validation.csv`, `my_validation_file.csv`, `validation1.csv`\n- test split: `test.csv`, `my_test_file.csv`, `test1.csv`\n\nHere is an example where all the files are placed into a directory named `data`:\n\n```\nmy_dataset_repository/\n├── README.md\n└── data/\n    ├── train.csv\n    ├── test.csv\n    └── validation.csv\n```\n\n### Custom filename split\n\nIf your dataset splits have custom names that aren't `train`, `test`, or `validation`, then you can name your data files like `data/<split_name>-xxxxx-of-xxxxx.csv`.\n\nHere is an example with three splits, `train`, `test`, and `random`:\n\n```\nmy_dataset_repository/\n├── README.md\n└── data/\n    ├── train-00000-of-00003.csv\n    ├── train-00001-of-00003.csv\n    ├── train-00002-of-00003.csv\n    ├── test-00000-of-00001.csv\n    ├── random-00000-of-00003.csv\n    ├── random-00001-of-00003.csv\n    └── random-00002-of-00003.csv\n```\n\n### Single split\n\nWhen 🤗 Datasets can't find any of the above patterns, then it'll treat all the files as a single train split. If your dataset splits aren't loading as expected, it may be due to an incorrect pattern.\n\n### Split name keywords\n\nThere are several ways to name splits. 
Validation splits are sometimes called \"dev\", and test splits may be referred to as \"eval\".\nThese other split names are also supported, and the following keywords are equivalent:\n\n- train, training\n- validation, valid, val, dev\n- test, testing, eval, evaluation\n\nThe structure below is a valid repository:\n\n```\nmy_dataset_repository/\n├── README.md\n└── data/\n    ├── training.csv\n    ├── eval.csv\n    └── valid.csv\n```\n\n### Multiple files per split\n\nIf one of your splits comprises several files, 🤗 Datasets can still infer whether it is the train, validation, or test split from the file name.\nFor example, if your train and test splits span several files:\n\n```\nmy_dataset_repository/\n├── README.md\n├── train_0.csv\n├── train_1.csv\n├── train_2.csv\n├── train_3.csv\n├── test_0.csv\n└── test_1.csv\n```\n\nMake sure all the files of your `train` set have *train* in their names (same for test and validation).\nEven if you add a prefix or suffix to `train` in the file name (like `my_train_file_00001.csv` for example),\n🤗 Datasets can still infer the appropriate split.\n\nFor convenience, you can also place your data files into different directories.\nIn this case, the split name is inferred from the directory name.\n\n```\nmy_dataset_repository/\n├── README.md\n└── data/\n    ├── train/\n    │   ├── shard_0.csv\n    │   ├── shard_1.csv\n    │   ├── shard_2.csv\n    │   └── shard_3.csv\n    └── test/\n        ├── shard_0.csv\n        └── shard_1.csv\n```\n"
  },
  {
    "path": "docs/source/semantic_segmentation.mdx",
    "content": "# Semantic segmentation\n\nSemantic segmentation datasets are used to train a model to classify every pixel in an image. There are\na wide variety of applications enabled by these datasets such as background removal from images, stylizing\nimages, or scene understanding for autonomous driving. This guide will show you how to apply transformations\nto an image segmentation dataset.\n\nBefore you start, make sure you have up-to-date versions of `albumentations` and `cv2` installed:\n\n```bash\npip install -U albumentations opencv-python\n```\n\n[Albumentations](https://albumentations.ai/) is a Python library for performing data augmentation\nfor computer vision. It supports various computer vision tasks such as image classification, object\ndetection, segmentation, and keypoint estimation.\n\nThis guide uses the [Scene Parsing](https://huggingface.co/datasets/scene_parse_150) dataset for segmenting\nand parsing an image into different image regions associated with semantic categories, such as sky, road, person, and bed.\n\nLoad the `train` split of the dataset and take a look at an example:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"scene_parse_150\", split=\"train\")\n>>> index = 10\n>>> dataset[index]\n{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=683x512 at 0x7FB37B0EC810>,\n 'annotation': <PIL.PngImagePlugin.PngImageFile image mode=L size=683x512 at 0x7FB37B0EC9D0>,\n 'scene_category': 927}\n```\n\nThe dataset has three fields:\n\n* `image`: a PIL image object.\n* `annotation`: segmentation mask of the image.\n* `scene_category`: the label or scene category of the image (like “kitchen” or “office”).\n\nNext, check out an image with:\n\n```py\n>>> dataset[index][\"image\"]\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/image_seg.png\">\n</div>\n\nSimilarly, you can check out the respective 
segmentation mask:\n\n```py\n>>> dataset[index][\"annotation\"]\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/seg_mask.png\">\n</div>\n\nWe can also add a [color palette](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51) on the\nsegmentation mask and overlay it on top of the original image to visualize the dataset:\n\nAfter defining the color palette, you should be ready to visualize some overlays. \n\n```py\n>>> import matplotlib.pyplot as plt\n\n>>> def visualize_seg_mask(image: np.ndarray, mask: np.ndarray):\n...    color_seg = np.zeros((mask.shape[0], mask.shape[1], 3), dtype=np.uint8)\n...    palette = np.array(create_ade20k_label_colormap())\n...    for label, color in enumerate(palette):\n...        color_seg[mask == label, :] = color\n...    color_seg = color_seg[..., ::-1]  # convert to BGR\n\n...    img = np.array(image) * 0.5 + color_seg * 0.5  # plot the image with the segmentation map\n...    img = img.astype(np.uint8)\n\n...    plt.figure(figsize=(15, 10))\n...    plt.imshow(img)\n...    plt.axis(\"off\")\n...    plt.show()\n\n\n>>> visualize_seg_mask(\n...     np.array(dataset[index][\"image\"]),\n...     np.array(dataset[index][\"annotation\"])\n... )\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/seg_overlay.png\">\n</div>\n\nNow apply some augmentations with `albumentations`. You’ll first resize the image and adjust its brightness.\n\n```py\n>>> import albumentations\n\n>>> transform = albumentations.Compose(\n...     [\n...         albumentations.Resize(256, 256),\n...         albumentations.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, p=0.5),\n...     ]\n... 
)\n```\n\nCreate a function to apply the transformation to the images:\n\n```py\n>>> def transforms(examples):\n...     transformed_images, transformed_masks = [], []\n...\n...     for image, seg_mask in zip(examples[\"image\"], examples[\"annotation\"]):\n...         image, seg_mask = np.array(image), np.array(seg_mask)\n...         transformed = transform(image=image, mask=seg_mask)\n...         transformed_images.append(transformed[\"image\"])\n...         transformed_masks.append(transformed[\"mask\"])\n...\n...     examples[\"pixel_values\"] = transformed_images\n...     examples[\"label\"] = transformed_masks\n...     return examples\n```\n\nUse the [`~Dataset.set_transform`] function to apply the transformation on-the-fly to batches of the dataset to consume less disk space:\n\n```py\n>>> dataset.set_transform(transforms)\n```\n\nYou can verify the transformation worked by indexing into the `pixel_values` and `label` of an example:\n\n```py\n>>> image = np.array(dataset[index][\"pixel_values\"])\n>>> mask = np.array(dataset[index][\"label\"])\n\n>>> visualize_seg_mask(image, mask)\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/albumentations_seg.png\">\n</div>\n\nIn this guide, you have used `albumentations` for augmenting the dataset. It's also possible to use `torchvision` to apply some similar transforms. \n\n```py \n>>> from torchvision.transforms import Resize, ColorJitter, Compose\n\n>>> transformation_chain = Compose([\n...     Resize((256, 256)),\n...     ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)\n... ])\n>>> resize = Resize((256, 256))\n\n>>> def train_transforms(example_batch):\n...     example_batch[\"pixel_values\"] = [transformation_chain(x) for x in example_batch[\"image\"]]\n...     example_batch[\"label\"] = [resize(x) for x in example_batch[\"annotation\"]]\n...     
return example_batch\n\n>>> dataset.set_transform(train_transforms)\n\n>>> image = np.array(dataset[index][\"pixel_values\"])\n>>> mask = np.array(dataset[index][\"label\"])\n\n>>> visualize_seg_mask(image, mask)\n```\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/torchvision_seg.png\">\n</div>\n\n> [!TIP]\n> Now that you know how to process a dataset for semantic segmentation, learn\n> [how to train a semantic segmentation model](https://huggingface.co/docs/transformers/tasks/semantic_segmentation)\n> and use it for inference."
  },
  {
    "path": "docs/source/share.mdx",
    "content": "# Share a dataset using the CLI\n\nAt Hugging Face, we are on a mission to democratize good Machine Learning and we believe in the value of open source. That's why we designed 🤗 Datasets so that anyone can share a dataset with the greater ML community. There are currently thousands of datasets in over 100 languages in the Hugging Face Hub, and the Hugging Face team always welcomes new contributions!\n\nDataset repositories offer features such as:\n\n- Free dataset hosting\n- Dataset versioning\n- Commit history and diffs\n- Metadata for discoverability\n- Dataset cards for documentation, licensing, limitations, etc.\n- [Dataset Viewer](https://huggingface.co/docs/hub/datasets-viewer)\n\nThis guide will show you how to share a dataset folder or repository that can be easily accessed by anyone.\n\n<a id='upload_dataset_repo'></a>\n\n## Add a dataset\n\nYou can share your dataset with the community with a dataset repository on the Hugging Face Hub.\nIt can also be a private dataset if you want to control who has access to it.\n\nIn a dataset repository, you can host all your data files and [configure your dataset](./repository_structure#define-your-splits-in-yaml) to define which file goes to which split.\nThe following formats are supported: CSV, TSV, JSON, JSON lines, text, Parquet, Arrow, SQLite, WebDataset.\nMany kinds of compressed file types are also supported: GZ, BZ2, LZ4, LZMA or ZSTD.\nFor example, your dataset can be made of `.json.gz` files.\n\nWhen loading a dataset from the Hub, all the files in the supported formats are loaded, following the [repository structure](./repository_structure).\n\nFor more information on how to load a dataset from the Hub, take a look at the [load a dataset from the Hub](./load_hub) tutorial.\n\n### Create the repository\n\nSharing a community dataset will require you to create an account on [hf.co](https://huggingface.co/join) if you don't have one yet.\nYou can directly create a [new dataset 
repository](https://huggingface.co/login?next=%2Fnew-dataset) from your account on the Hugging Face Hub, but this guide will show you how to upload a dataset from the terminal.\n\n1. Make sure you are in the virtual environment where you installed Datasets, and run the following command:\n\n```\nhuggingface-cli login\n```\n\n2. Login using your Hugging Face Hub credentials, and create a new dataset repository:\n\n```\nhuggingface-cli repo create my-cool-dataset --type dataset\n```\n\nAdd the `--organization` flag to create a repository under a specific organization:\n\n```\nhuggingface-cli repo create my-cool-dataset --type dataset --organization your-org-name\n```\n\n## Prepare your files\n\nCheck your directory to ensure the only files you're uploading are:\n\n- The data files of the dataset\n\n- The dataset card `README.md`\n\n\n## huggingface-cli upload\n\nUse the `huggingface-cli upload` command to upload files to the Hub directly. Internally, it uses the same [`upload_file`] and [`upload_folder`] helpers described in the [Upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload). In the examples below, we will walk through the most common use cases. For a full list of available options, you can run:\n\n```bash\n>>> huggingface-cli upload --help\n```\n\nFor more general information about `huggingface-cli` you can check the [CLI guide](https://huggingface.co/docs/huggingface_hub/guides/cli).\n\n### Upload an entire folder\n\nThe default usage for this command is:\n\n```bash\n# Usage:  huggingface-cli upload [dataset_repo_id] [local_path] [path_in_repo] --repo-type dataset\n```\n\nTo upload the current directory at the root of the repo, use:\n\n```bash\n>>> huggingface-cli upload my-cool-dataset . . 
--repo-type dataset\nhttps://huggingface.co/datasets/Wauplin/my-cool-dataset/tree/main/\n```\n\n> [!TIP]\n> If the repo doesn't exist yet, it will be created automatically.\n\nYou can also upload a specific folder:\n\n```bash\n>>> huggingface-cli upload my-cool-dataset ./data . --repo-type dataset\nhttps://huggingface.co/datasets/Wauplin/my-cool-dataset/tree/main/\n```\n\nFinally, you can upload a folder to a specific destination on the repo:\n\n```bash\n>>> huggingface-cli upload my-cool-dataset ./path/to/curated/data /data/train --repo-type dataset\nhttps://huggingface.co/datasets/Wauplin/my-cool-dataset/tree/main/data/train\n```\n\n### Upload a single file\n\nYou can also upload a single file by setting `local_path` to point to a file on your machine. If that's the case, `path_in_repo` is optional and will default to the name of your local file:\n\n```bash\n>>> huggingface-cli upload Wauplin/my-cool-dataset ./files/train.csv --repo-type dataset\nhttps://huggingface.co/datasets/Wauplin/my-cool-dataset/blob/main/train.csv\n```\n\nIf you want to upload a single file to a specific directory, set `path_in_repo` accordingly:\n\n```bash\n>>> huggingface-cli upload Wauplin/my-cool-dataset ./files/train.csv /data/train.csv --repo-type dataset\nhttps://huggingface.co/datasets/Wauplin/my-cool-dataset/blob/main/data/train.csv\n```\n\n### Upload multiple files\n\nTo upload multiple files from a folder at once without uploading the entire folder, use the `--include` and `--exclude` patterns. It can also be combined with the `--delete` option to delete files on the repo while uploading new ones. 
In the example below, we sync the local dataset by deleting remote files and uploading all CSV files:\n\n```bash\n# Sync local dataset with Hub (upload new CSV files, delete removed files)\n>>> huggingface-cli upload Wauplin/my-cool-dataset --repo-type dataset --include=\"/data/*.csv\" --delete=\"*\" --commit-message=\"Sync local dataset with Hub\"\n...\n```\n\n### Upload to an organization\n\nTo upload content to a repo owned by an organization instead of a personal repo, you must explicitly specify it in the `repo_id`:\n\n```bash\n>>> huggingface-cli upload MyCoolOrganization/my-cool-dataset . . --repo-type dataset\nhttps://huggingface.co/datasets/MyCoolOrganization/my-cool-dataset/tree/main/\n```\n\n### Upload to a specific revision\n\nBy default, files are uploaded to the `main` branch. If you want to upload files to another branch or reference, use the `--revision` option:\n\n```bash\n# Upload files to a PR\nhuggingface-cli upload bigcode/the-stack . . --repo-type dataset --revision refs/pr/104\n...\n```\n\n**Note:** if `revision` does not exist and `--create-pr` is not set, a branch will be created automatically from the `main` branch.\n\n### Upload and create a PR\n\nIf you don't have the permission to push to a repo, you must open a PR and let the authors know about the changes you want to make. This can be done by setting the `--create-pr` option:\n\n```bash\n# Create a PR and upload the files to it\n>>> huggingface-cli upload bigcode/the-stack --repo-type dataset --revision refs/pr/104 --create-pr . .\nhttps://huggingface.co/datasets/bigcode/the-stack/blob/refs%2Fpr%2F104/\n```\n\n### Upload at regular intervals\n\nIn some cases, you might want to push regular updates to a repo. For example, this is useful if your dataset is growing over time and you want to upload the data folder every 10 minutes. 
You can do this using the `--every` option:\n\n```bash\n# Upload new logs every 10 minutes\nhuggingface-cli upload my-cool-dynamic-dataset data/ --every=10\n```\n\n### Specify a commit message\n\nUse the `--commit-message` and `--commit-description` options to set a custom message and description for your commit instead of the default one:\n\n```bash\n>>> huggingface-cli upload Wauplin/my-cool-dataset ./data . --repo-type dataset --commit-message=\"Version 2\" --commit-description=\"Train size: 4321. Check Dataset Viewer for more details.\"\n...\nhttps://huggingface.co/datasets/Wauplin/my-cool-dataset/tree/main\n```\n\n### Specify a token\n\nTo upload files, you must use a token. By default, the token saved locally (using `huggingface-cli login`) will be used. If you want to authenticate explicitly, use the `--token` option:\n\n```bash\n>>> huggingface-cli upload Wauplin/my-cool-dataset ./data . --repo-type dataset --token=hf_****\n...\nhttps://huggingface.co/datasets/Wauplin/my-cool-dataset/tree/main\n```\n\n### Quiet mode\n\nBy default, the `huggingface-cli upload` command will be verbose. It will print details such as warning messages, information about the uploaded files, and progress bars. If you want to silence all of this, use the `--quiet` option. Only the last line (i.e. the URL to the uploaded files) is printed. This can prove useful if you want to pass the output to another command in a script.\n\n```bash\n>>> huggingface-cli upload Wauplin/my-cool-dataset ./data . --repo-type dataset --quiet\nhttps://huggingface.co/datasets/Wauplin/my-cool-dataset/tree/main\n```\n\n## Enjoy !\n\nCongratulations, your dataset has now been uploaded to the Hugging Face Hub where anyone can load it in a single line of code! 
🥳\n\n```\ndataset = load_dataset(\"Wauplin/my-cool-dataset\")\n```\n\nIf your dataset is supported, it should also have a [Dataset Viewer](https://huggingface.co/docs/hub/datasets-viewer) for everyone to explore the dataset content.\n\nFinally, don't forget to enrich the dataset card to document your dataset and make it discoverable! Check out the [Create a dataset card](dataset_card) guide to learn more.\n"
  },
  {
    "path": "docs/source/stream.mdx",
    "content": "# Stream\n\nDataset streaming lets you work with a dataset without downloading it.\nThe data is streamed as you iterate over the dataset.\nThis is especially helpful when:\n\n- You don't want to wait for an extremely large dataset to download.\n- The dataset size exceeds the amount of available disk space on your computer.\n- You want to quickly explore just a few samples of a dataset.\n\n<div class=\"flex justify-center\">\n    <img class=\"block dark:hidden\" src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/streaming.gif\"/>\n    <img class=\"hidden dark:block\" src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/streaming-dark.gif\"/>\n</div>\n\nFor example, the English split of the [HuggingFaceFW/fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) dataset is 45 terabytes, but you can use it instantly with streaming. Stream a dataset by setting `streaming=True` in [`load_dataset`] as shown below:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset('HuggingFaceFW/fineweb', split='train', streaming=True)\n>>> print(next(iter(dataset)))\n{'text': 'How AP reported in all formats from tornado-stricken regionsMarch 8, 2012\\nWhen the first serious bout of tornadoes of 2012 blew through middle America in the middle of the night, they touched down in places hours from any AP bureau...', ...,\n 'language_score': 0.9721424579620361, 'token_count': 717}\n```\n\nDataset streaming also lets you work with a dataset made of local files without doing any conversion.\nIn this case, the data is streamed from the local files as you iterate over the dataset.\nThis is especially helpful when:\n\n- You don't want to wait for an extremely large local dataset to be converted to Arrow.\n- The converted files size would exceed the amount of available disk space on your computer.\n- You want to quickly explore just a few samples of a dataset.\n- You want 
to load only certain columns or efficiently filter a Parquet dataset.\n\nFor example, you can stream a local dataset of hundreds of compressed JSONL files like [oscar-corpus/OSCAR-2201](https://huggingface.co/datasets/oscar-corpus/OSCAR-2201) to use it instantly:\n\n```py\n>>> from datasets import load_dataset\n>>> data_files = {'train': 'path/to/OSCAR-2201/compressed/en_meta/*.jsonl.gz'}\n>>> dataset = load_dataset('json', data_files=data_files, split='train', streaming=True)\n>>> print(next(iter(dataset)))\n{'id': 0, 'text': 'Founded in 2015, Golden Bees is a leading programmatic recruitment platform dedicated to employers, HR agencies and job boards. The company has developed unique HR-custom technologies and predictive algorithms to identify and attract the best candidates for a job opportunity.', ...\n```\n\nParquet is a columnar format that allows you to stream and load only a subset of columns and ignore unwanted columns. Parquet also stores metadata such as column statistics (at the file and row group level), enabling efficient filtering. Use the `columns` and `filters` arguments of [`datasets.packaged_modules.parquet.ParquetConfig`] to stream Parquet datasets, select columns, and apply filters:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset('HuggingFaceFW/fineweb', split='train', streaming=True, columns=[\"url\", \"date\"])\n>>> print(next(iter(dataset)))\n{'url': 'http://%20jwashington@ap.org/Content/Press-Release/2012/How-AP-reported-in-all-formats-from-tornado-stricken-regions', 'date': '2013-05-18T05:48:54Z'}\n>>> dataset = load_dataset('HuggingFaceFW/fineweb', split='train', streaming=True, filters=[(\"language_score\", \">=\", 0.99)])\n>>> print(next(iter(dataset)))\n{'text': 'Everyone wishes for something. And lots of people believe they know how to make their wishes come true with magical thinking.\\nWhat is it? 
\"Magical thinking is a belief in forms of causation, with no known physical basis,\" said Professor Emily Pronin of Princeton...', ...,\n 'language_score': 0.9900368452072144, 'token_count': 716}\n```\n\nLoading a dataset in streaming mode creates a new dataset type instance (instead of the classic [`Dataset`] object), known as an [`IterableDataset`].\nThis special type of dataset has its own set of processing methods shown below.\n\n> [!TIP]\n> An [`IterableDataset`] is useful for iterative jobs like training a model.\n> You shouldn't use a [`IterableDataset`] for jobs that require random access to examples because you have to iterate all over it using a for loop. Getting the last example in an iterable dataset would require you to iterate over all the previous examples.\n> You can find more details in the [Dataset vs. IterableDataset guide](./about_mapstyle_vs_iterable).\n\n\n## Column indexing\n\nSometimes it is convenient to iterate over values of a specific column. Fortunately, an [`IterableDataset`] supports column indexing:\n```python\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"allenai/c4\", \"en\", streaming=True, split=\"train\")\n>>> print(next(iter(dataset[\"text\"])))\nBeginners BBQ Class Taking Place in Missoula!...\n```\n\n## Convert from a Dataset\n\nIf you have an existing [`Dataset`] object, you can convert it to an [`IterableDataset`] with the [`~Dataset.to_iterable_dataset`] function. This is actually faster than setting the `streaming=True` argument in [`load_dataset`] because the data is streamed from local files.\n\n```py\n>>> from datasets import load_dataset\n\n# faster 🐇\n>>> dataset = load_dataset(\"ethz/food101\")\n>>> iterable_dataset = dataset.to_iterable_dataset()\n\n# slower 🐢\n>>> iterable_dataset = load_dataset(\"ethz/food101\", streaming=True)\n```\n\nThe [`~Dataset.to_iterable_dataset`] function supports sharding when the [`IterableDataset`] is instantiated. 
This is useful when working with big datasets, and you'd like to shuffle the dataset or to enable fast parallel loading with a PyTorch DataLoader.\n\n```py\n>>> import torch\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"ethz/food101\")\n>>> iterable_dataset = dataset.to_iterable_dataset(num_shards=64) # shard the dataset\n>>> iterable_dataset = iterable_dataset.shuffle(buffer_size=10_000)  # shuffles the shards order and use a shuffle buffer when you start iterating\ndataloader = torch.utils.data.DataLoader(iterable_dataset, num_workers=4)  # assigns 64 / 4 = 16 shards from the shuffled list of shards to each worker when you start iterating\n```\n\n## Shuffle\n\nLike a regular [`Dataset`] object, you can also shuffle a [`IterableDataset`] with [`IterableDataset.shuffle`].\n\nThe `buffer_size` argument controls the size of the buffer to randomly sample examples from. Let's say your dataset has one million examples, and you set the `buffer_size` to ten thousand. [`IterableDataset.shuffle`] will randomly select examples from the first ten thousand examples in the buffer. Selected examples in the buffer are replaced with new examples. By default, the buffer size is 1,000.\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset('HuggingFaceFW/fineweb', split='train', streaming=True)\n>>> shuffled_dataset = dataset.shuffle(seed=42, buffer_size=10_000)\n```\n\n> [!TIP]\n> \n> [`IterableDataset.shuffle`] will also shuffle the order of the shards if the dataset is sharded into multiple files.\n\n## Reshuffle\n\nSometimes you may want to reshuffle the dataset after each epoch. This will require you to set a different seed for each epoch. Use [`IterableDataset.set_epoch`] in between epochs to tell the dataset what epoch you're on.\n\nYour seed effectively becomes: `initial seed + current epoch`.\n\n```py\n>>> for epoch in range(epochs):\n...     shuffled_dataset.set_epoch(epoch)\n...     for example in shuffled_dataset:\n...        
 ...\n```\n\n## Split dataset\n\nYou can split your dataset one of two ways:\n\n- [`IterableDataset.take`] returns the first `n` examples in a dataset:\n\n```py\n>>> dataset = load_dataset('HuggingFaceFW/fineweb', split='train', streaming=True)\n>>> dataset_head = dataset.take(2)\n>>> list(dataset_head)\n[{'text': \"How AP reported in all formats from tor...},\n {'text': 'Did you know you have two little yellow...}]\n```\n\n- [`IterableDataset.skip`] omits the first `n` examples in a dataset and returns the remaining examples:\n\n```py\n>>> train_dataset = shuffled_dataset.skip(1000)\n```\n\n> [!WARNING]\n> `take` and `skip` prevent future calls to `shuffle` because they lock in the order of the shards. You should `shuffle` your dataset before splitting it.\n\n<a id='interleave_datasets'></a>\n\n\n### Shard\n\n🤗 Datasets supports sharding to divide a very large dataset into a predefined number of chunks. Specify the `num_shards` parameter in [`~IterableDataset.shard`] to determine the number of shards to split the dataset into. 
You'll also need to provide the shard you want to return with the `index` parameter.\n\nFor example, the [amazon_polarity](https://huggingface.co/datasets/fancyzhx/amazon_polarity) dataset has 4 shards (in this case they are 4 Parquet files):\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"fancyzhx/amazon_polarity\", split=\"train\", streaming=True)\n>>> print(dataset)\nIterableDataset({\n    features: ['label', 'title', 'content'],\n    num_shards: 4\n})\n```\n\nAfter sharding the dataset into two chunks, the first one will only have 2 shards:\n\n```py\n>>> dataset.shard(num_shards=2, index=0)\nIterableDataset({\n    features: ['label', 'title', 'content'],\n    num_shards: 2\n})\n```\n\nTo increase the number of shards of a dataset, you can use [`IterableDataset.reshard`]:\n\n```py\n>>> dataset.reshard()\nIterableDataset({\n    features: ['label', 'title', 'content'],\n    num_shards: 3600\n})\n```\n\nThe resharding mechanism depends on the dataset file format.\nFor example for Parquet, it reshards using row groups instead of having one file per shard.\nSee how it works for every format in [`IterableDataset.reshard`]'s documentation.\n\nIf your dataset has `dataset.num_shards==1` even after resharding, you should chunk it using [`IterableDataset.skip`] and [`IterableDataset.take`] instead.\n\n## Concatenate\n\nSeparate datasets can be concatenated if they share the same column types. 
Concatenate datasets with [`concatenate_datasets`]:\n\n```py\n>>> from datasets import concatenate_datasets, load_dataset\n\n>>> stories = load_dataset(\"ajibawa-2023/General-Stories-Collection\", split=\"train\", streaming=True)\n>>> stories = stories.select_columns([\"text\"])  # only keep the 'text' column\nIterableDataset({\n    features: Unknown,\n    num_shards: 10\n})\n\n>>> wiki = load_dataset(\"wikimedia/wikipedia\", \"20231101.en\", split=\"train\", streaming=True)\n>>> wiki = wiki.select_columns([\"text\"])  # only keep the 'text' column\nIterableDataset({\n    features: ['text'],\n    num_shards: 41\n})\n\n>>> bert_dataset = concatenate_datasets([stories, wiki])\nIterableDataset({\n    features: ['text'],\n    num_shards: 51\n})\n```\n\nThe shards of the concatenated dataset are the shards of the input datasets.\n\nYou can also concatenate two datasets horizontally by setting `axis=1` as long as the datasets have the same number of rows:\n\n```py\n>>> from datasets import IterableDataset\n>>> stories_ids = IterableDataset.from_dict({\"ids\": list(range(num_stories))})\n>>> stories_with_ids = concatenate_datasets([stories, stories_ids], axis=1)\n```\n\nIn this case, the concatenated dataset only has 1 shard, to avoid ending with unaligned shards from the input datasets.\n\n## Interleave\n\n[`interleave_datasets`] can combine an [`IterableDataset`] with other datasets if they have the same column types. 
The combined dataset returns alternating examples from each of the original datasets.\n\n```py\n>>> from datasets import interleave_datasets\n>>> es_dataset = load_dataset('allenai/c4', 'es', split='train', streaming=True)\nIterableDataset({\n    features: Unknown,\n    num_shards: 2048\n})\n>>> fr_dataset = load_dataset('allenai/c4', 'fr', split='train', streaming=True)\nIterableDataset({\n    features: Unknown,\n    num_shards: 2048\n})\n\n>>> multilingual_dataset = interleave_datasets([es_dataset, fr_dataset])\nIterableDataset({\n    features: ['text', 'timestamp', 'url'],\n    num_shards: 2048\n})\n>>> list(multilingual_dataset.take(2))\n[{'text': 'Comprar Zapatillas para niña en chancla con goma por...'},\n {'text': 'Le sacre de philippe ier, 23 mai 1059 - Compte Rendu...'}]\n```\n\nDefine sampling probabilities from each of the original datasets for more control over how each of them is sampled and combined. Set the `probabilities` argument with your desired sampling probabilities:\n\n```py\n>>> multilingual_dataset_with_oversampling = interleave_datasets([es_dataset, fr_dataset], probabilities=[0.8, 0.2], seed=42)\n>>> list(multilingual_dataset_with_oversampling.take(2))\n[{'text': 'Comprar Zapatillas para niña en chancla con goma por...'},\n {'text': 'Chevrolet Cavalier Usados en Bogota - Carros en Vent...'}]\n```\n\nAround 80% of the final dataset is made of the `es_dataset`, and 20% of the `fr_dataset`.\n\nYou can also specify the `stopping_strategy`. The default strategy, `first_exhausted`, is a subsampling strategy, i.e. the dataset construction is stopped as soon as one of the datasets runs out of samples.\nYou can specify `stopping_strategy=all_exhausted` to execute an oversampling strategy. In this case, the dataset construction is stopped as soon as every sample in every dataset has been added at least once. 
In practice, it means that if a dataset is exhausted, it will return to the beginning of this dataset until the stop criterion has been reached.\nNote that if no sampling probabilities are specified, the new dataset will have `max_length_datasets*nb_dataset samples`.\nThere is also `stopping_strategy=all_exhausted_without_replacement` to ensure that every sample is seen exactly once.\n\nTo ensure proper parallelism using sharding, the shards of the interleaved dataset contain at least 1 shard of every input dataset. Therefore the sharding level of the interleaved dataset is the minimum sharding level of the input datasets.\n\nE.g. if the input datasets have respectively 32, 48 and 128 shards, then the interleaved dataset has 32 = min(32, 48, 128) shards, and each new shard has 1 shard from the first dataset, 1-2 shards from the second dataset and 4 shards from the third dataset.\n\n## Rename, remove, and cast\n\nThe following methods allow you to modify the columns of a dataset. These methods are useful for renaming or removing columns and changing columns to a new set of features.\n\n### Rename\n\nUse [`IterableDataset.rename_column`] when you need to rename a column in your dataset. Features associated with the original column are actually moved under the new column name, instead of just replacing the original column in-place.\n\nProvide [`IterableDataset.rename_column`] with the name of the original column, and the new column name:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset('allenai/c4', 'en', streaming=True, split='train')\n>>> dataset = dataset.rename_column(\"text\", \"content\")\n```\n\n### Remove\n\nWhen you need to remove one or more columns, give [`IterableDataset.remove_columns`] the name of the column to remove. 
Remove more than one column by providing a list of column names:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset('allenai/c4', 'en', streaming=True, split='train')\n>>> dataset = dataset.remove_columns('timestamp')\n```\n\n### Cast\n\n[`IterableDataset.cast`] changes the feature type of one or more columns. This method takes your new `Features` as its argument. The following sample code shows how to change the feature types of `ClassLabel` and `Value`:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset('nyu-mll/glue', 'mrpc', split='train', streaming=True)\n>>> dataset.features\n{'sentence1': Value('string'),\n'sentence2': Value('string'),\n'label': ClassLabel(names=['not_equivalent', 'equivalent']),\n'idx': Value('int32')}\n\n>>> from datasets import ClassLabel, Value\n>>> new_features = dataset.features.copy()\n>>> new_features[\"label\"] = ClassLabel(names=['negative', 'positive'])\n>>> new_features[\"idx\"] = Value('int64')\n>>> dataset = dataset.cast(new_features)\n>>> dataset.features\n{'sentence1': Value('string'),\n'sentence2': Value('string'),\n'label': ClassLabel(names=['negative', 'positive']),\n'idx': Value('int64')}\n```\n\n> [!TIP]\n> Casting only works if the original feature type and new feature type are compatible. For example, you can cast a column with the feature type `Value('int32')` to `Value('bool')` if the original column only contains ones and zeros.\n\nUse [`IterableDataset.cast_column`] to change the feature type of just one column. 
Pass the column name and its new feature type as arguments:\n\n```py\n>>> dataset.features\n{'audio': Audio(sampling_rate=44100, mono=True)}\n\n>>> dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=16000))\n>>> dataset.features\n{'audio': Audio(sampling_rate=16000, mono=True)}\n```\n\n## Map\n\nSimilar to the [`Dataset.map`] function for a regular [`Dataset`], 🤗  Datasets features [`IterableDataset.map`] for processing an [`IterableDataset`].\n[`IterableDataset.map`] applies processing on-the-fly when examples are streamed.\n\nIt allows you to apply a processing function to each example in a dataset, independently or in batches. This function can even create new rows and columns.\n\nThe following example demonstrates how to add a prefix to the `text` column of each example in an [`IterableDataset`]. The function needs to accept and output a `dict`:\n\n```py\n>>> def add_prefix(example):\n...     example['text'] = 'My text: ' + example['text']\n...     return example\n```\n\nNext, apply this function to the dataset with [`IterableDataset.map`]:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset('allenai/c4', 'en', streaming=True, split='train')\n>>> updated_dataset = dataset.map(add_prefix)\n>>> list(updated_dataset.take(3))\n[{'text': 'My text: Beginners BBQ Class Taking Place in Missoula!\\nDo you want to get better at making...',\n  'timestamp': '2019-04-25 12:57:54',\n  'url': 'https://klyq.com/beginners-bbq-class-taking-place-in-missoula/'},\n {'text': 'My text: Discussion in \\'Mac OS X Lion (10.7)\\' started by axboi87, Jan 20, 2012.\\nI\\'ve go...',\n  'timestamp': '2019-04-21 10:07:13',\n  'url': 'https://forums.macrumors.com/threads/restore-from-larger-disk-to-smaller-disk.1311329/'},\n {'text': 'My text: Foil plaid lycra and spandex shortall with metallic slinky insets. 
Attached metall...',\n  'timestamp': '2019-04-25 10:40:23',\n  'url': 'https://awishcometrue.com/Catalogs/Clearance/Tweens/V1960-Find-A-Way'}]\n```\n\nLet's take a look at another example, except this time, you will remove columns with [`IterableDataset.map`]. When you remove a column, it is only removed after the example has been provided to the mapped function. This allows the mapped function to use the content of the columns before they are removed.\n\nSpecify the column to remove with the `remove_columns` argument in [`IterableDataset.map`]:\n\n```py\n>>> updated_dataset = dataset.map(add_prefix, remove_columns=[\"timestamp\", \"url\"])\n>>> list(updated_dataset.take(3))\n[{'text': 'My text: Beginners BBQ Class Taking Place in Missoula!\\nDo you want to get better at making...'},\n {'text': 'My text: Discussion in \\'Mac OS X Lion (10.7)\\' started by axboi87, Jan 20, 2012.\\nI\\'ve go...'},\n {'text': 'My text: Foil plaid lycra and spandex shortall with metallic slinky insets. Attached metall...'}]\n```\n\n### Batch processing\n\n[`IterableDataset.map`] also supports working with batches of examples. Operate on batches by setting `batched=True`. The default batch size is 1000, but you can adjust it with the `batch_size` argument. This opens the door to many interesting applications such as tokenization, splitting long sentences into shorter chunks, and data augmentation.\n\n#### Tokenization\n\n```py\n>>> from datasets import load_dataset\n>>> from transformers import AutoTokenizer\n>>> dataset = load_dataset(\"allenai/c4\", \"en\", streaming=True, split=\"train\")\n>>> tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')\n>>> def encode(examples):\n...     
return tokenizer(examples['text'], truncation=True, padding='max_length')\n>>> dataset = dataset.map(encode, batched=True, remove_columns=[\"text\", \"timestamp\", \"url\"])\n>>> next(iter(dataset))\n{'input_ids': [101, 4088, 16912, 22861, 4160, 2465, 2635, 2173, 1999, 3335, ..., 0, 0, 0],\n'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..., 0, 0]}\n```\n\n> [!TIP]\n> See other examples of batch processing in the [batched map processing](./process#batch-processing) documentation. They work the same for iterable datasets.\n\n### Filter\n\nYou can filter rows in the dataset based on a predicate function using [`IterableDataset.filter`]. It returns rows that match a specified condition:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset('HuggingFaceFW/fineweb', streaming=True, split='train')\n>>> start_with_ar = dataset.filter(lambda example: example['text'].startswith('San Francisco'))\n>>> next(iter(start_with_ar))\n{'text': 'San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the sea...}\n```\n\n[`IterableDataset.filter`] can also filter by indices if you set `with_indices=True`:\n\n```py\n>>> even_dataset = dataset.filter(lambda example, idx: idx % 2 == 0, with_indices=True)\n>>> list(even_dataset.take(3))\n[{'text': 'How AP reported in all formats from tornado-stricken regionsMarch 8, 2012 Whe...},\n {'text': 'Car Wash For Clara! Now is your chance to help! 2 year old Clara Woodward has...},\n {'text': 'Log In Please enter your ECode to log in. Forgotten your eCode? If you create...}]\n```\n\n## Batch\n\nThe `batch` method transforms your `IterableDataset` into an iterable of batches. This is particularly useful when you want to work with batches in your training loop or when using frameworks that expect batched inputs.\n\n> [!TIP]\n> There is also a \"Batch Processing\" option when using the `map` function to apply a function to batches of data, which is discussed in the [Map section](#map) above. 
The `batch` method described here is different and provides a more direct way to create batches from your dataset.\n\nYou can use the `batch` method like this:\n\n```python\nfrom datasets import load_dataset\n\n# Load a dataset in streaming mode\ndataset = load_dataset(\"some_dataset\", split=\"train\", streaming=True)\n\n# Create batches of 32 samples\nbatched_dataset = dataset.batch(batch_size=32)\n\n# Iterate over the batched dataset\nfor batch in batched_dataset:\n    print(batch)\n    break\n```\n\nIn this example, batched_dataset is still an IterableDataset, but each item yielded is now a batch of 32 samples instead of a single sample.\nThis batching is done on-the-fly as you iterate over the dataset, preserving the memory-efficient nature of IterableDataset.\n\nThe batch method also provides a drop_last_batch parameter. \nWhen set to True, it will discard the last batch if it's smaller than the specified batch_size. \nThis can be useful in scenarios where your downstream processing requires all batches to be of the same size:\n\n```python\nbatched_dataset = dataset.batch(batch_size=32, drop_last_batch=True)\n```\n\n## Stream in a training loop\n\n[`IterableDataset`] can be integrated into a training loop. 
First, shuffle the dataset:\n\n<frameworkcontent>\n<pt>\n```py\n>>> seed, buffer_size = 42, 10_000\n>>> dataset = dataset.shuffle(seed, buffer_size=buffer_size)\n```\n\nLastly, create a simple training loop and start training:\n\n```py\n>>> import torch\n>>> from torch.utils.data import DataLoader\n>>> from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling\n>>> from tqdm import tqdm\n>>> dataset = dataset.with_format(\"torch\")\n>>> dataloader = DataLoader(dataset, collate_fn=DataCollatorForLanguageModeling(tokenizer))\n>>> device = 'cuda' if torch.cuda.is_available() else 'cpu' \n>>> model = AutoModelForMaskedLM.from_pretrained(\"distilbert-base-uncased\")\n>>> model.train().to(device)\n>>> optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)\n>>> for epoch in range(3):\n...     dataset.set_epoch(epoch)\n...     for i, batch in enumerate(tqdm(dataloader, total=5)):\n...         if i == 5:\n...             break\n...         batch = {k: v.to(device) for k, v in batch.items()}\n...         outputs = model(**batch)\n...         loss = outputs[0]\n...         loss.backward()\n...         optimizer.step()\n...         optimizer.zero_grad()\n...         if i % 10 == 0:\n...             print(f\"loss: {loss}\")\n```\n</pt>\n</frameworkcontent>\n\n<!-- TODO: Write the TF content! -->\n\n### Save a dataset checkpoint and resume iteration\n\nIf your training loop stops, you may want to restart the training from where it was. 
To do so you can save a checkpoint of your model and optimizers, as well as your data loader.\n\nIterable datasets don't provide random access to a specific example index to resume from, but you can use [`IterableDataset.state_dict`] and [`IterableDataset.load_state_dict`] to resume from a checkpoint instead, similarly to what you can do for models and optimizers:\n\n```python\n>>> iterable_dataset = Dataset.from_dict({\"a\": range(6)}).to_iterable_dataset(num_shards=3)\n>>> for idx, example in enumerate(iterable_dataset):\n...     print(example)\n...     if idx == 2:\n...         state_dict = iterable_dataset.state_dict()\n...         print(\"checkpoint\")\n...         break\n>>> iterable_dataset.load_state_dict(state_dict)\n>>> print(f\"restart from checkpoint\")\n>>> for example in iterable_dataset:\n...     print(example)\n```\n\nReturns:\n\n```\n{'a': 0}\n{'a': 1}\n{'a': 2}\ncheckpoint\nrestart from checkpoint\n{'a': 3}\n{'a': 4}\n{'a': 5}\n```\n\nUnder the hood, the iterable dataset keeps track of the current shard being read and the example index in the current shard and it stores this info in the `state_dict`.\n\nTo resume from a checkpoint, the dataset skips all the shards that were previously read to restart from the current shard. \nThen it reads the shard and skips examples until it reaches the exact example from the checkpoint.\n\nTherefore restarting a dataset is quite fast, since it will not re-read the shards that have already been iterated on. 
Still, resuming a dataset is generally not instantaneous since it has to restart reading from the beginning of the current shard and skip examples until it reaches the checkpoint location.\n\nThis can be used with the `StatefulDataLoader` from `torchdata`:\n\n```python\n>>> from torchdata.stateful_dataloader import StatefulDataLoader\n>>> iterable_dataset = load_dataset(\"deepmind/code_contests\", streaming=True, split=\"train\")\n>>> dataloader = StatefulDataLoader(iterable_dataset, batch_size=32, num_workers=4)\n>>> # checkpoint\n>>> state_dict = dataloader.state_dict()  # uses iterable_dataset.state_dict() under the hood\n>>> # resume from checkpoint\n>>> dataloader.load_state_dict(state_dict)  # uses iterable_dataset.load_state_dict() under the hood\n```\n\n> [!TIP]\n> Resuming returns exactly where the checkpoint was saved except if `.shuffle()` is used: examples from shuffle buffers are lost when resuming and the buffers are refilled with new data.\n\n\n## Save\n\nOnce your iterable dataset is ready, you can save it as a Hugging Face Dataset in Parquet format and reuse it later with [`load_dataset`].\n\nSave your dataset by passing the name of the Hugging Face dataset repository you wish to save it to as the argument of [`~Dataset.push_to_hub`]. This iterates over the dataset and progressively uploads the data to Hugging Face:\n\n```python\ndataset.push_to_hub(\"username/my_dataset\")\n```\n\nIf the dataset consists of multiple shards (`dataset.num_shards > 1`), you can use multiple processes to upload it in parallel. 
This is especially useful if you applied `map()` or `filter()` steps since they will run faster in parallel:\n\n```python\ndataset.push_to_hub(\"username/my_dataset\", num_proc=8)\n```\n\nUse the [`load_dataset`] function to reload the dataset:\n\n```python\nfrom datasets import load_dataset\nreloaded_dataset = load_dataset(\"username/my_dataset\")\n```\n\n## Export\n\n🤗 Datasets supports exporting as well so you can work with your dataset in other applications. The following table shows currently supported file formats you can export to:\n\n| File type               | Export method                                                  |\n|-------------------------|----------------------------------------------------------------|\n| CSV                     | [`IterableDataset.to_csv`]                                    |\n| JSON                    | [`IterableDataset.to_json`]                                   |\n| Parquet                 | [`IterableDataset.to_parquet`]                                |\n| SQL                     | [`IterableDataset.to_sql`]                                    |\n| In-memory Python object | [`IterableDataset.to_pandas`], [`IterableDataset.to_polars`] or [`IterableDataset.to_dict`] |\n\nFor example, export your dataset to a CSV file like this:\n\n```py\n>>> dataset.to_csv(\"path/of/my/dataset.csv\")\n```\n\nIf you have a large dataset, you can save one file per shard, e.g.\n\n```py\n>>> num_shards = dataset.num_shards\n>>> for index in range(num_shards):\n...     shard = dataset.shard(index, num_shards)\n...     shard.to_parquet(f\"path/of/my/dataset/data-{index:05d}.parquet\")\n```\n"
  },
  {
    "path": "docs/source/tabular_load.mdx",
    "content": "# Load tabular data\n\nA tabular dataset is a generic dataset used to describe any data stored in rows and columns, where the rows represent an example and the columns represent a feature (can be continuous or categorical). These datasets are commonly stored in CSV files, Pandas DataFrames, and in database tables. This guide will show you how to load and create a tabular dataset from:\n\n- CSV files\n- Pandas DataFrames\n- HDF5 files\n- Databases\n\n## CSV files\n\n🤗 Datasets can read CSV files by specifying the generic `csv` dataset builder name in the [`~datasets.load_dataset`] method. To load more than one CSV file, pass them as a list to the `data_files` parameter:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"csv\", data_files=\"my_file.csv\")\n\n# load multiple CSV files\n>>> dataset = load_dataset(\"csv\", data_files=[\"my_file_1.csv\", \"my_file_2.csv\", \"my_file_3.csv\"])\n```\n\nYou can also map specific CSV files to the train and test splits:\n\n```py\n>>> dataset = load_dataset(\"csv\", data_files={\"train\": [\"my_train_file_1.csv\", \"my_train_file_2.csv\"], \"test\": \"my_test_file.csv\"})\n```\n\nTo load remote CSV files, pass the URLs instead:\n\n```py\n>>> base_url = \"https://huggingface.co/datasets/lhoestq/demo1/resolve/main/data/\"\n>>> dataset = load_dataset('csv', data_files={\"train\": base_url + \"train.csv\", \"test\": base_url + \"test.csv\"})\n```\n\nTo load zipped CSV files:\n\n```py\n>>> url = \"https://domain.org/train_data.zip\"\n>>> data_files = {\"train\": url}\n>>> dataset = load_dataset(\"csv\", data_files=data_files)\n```\n\n## Pandas DataFrames\n\n🤗 Datasets also supports loading datasets from [Pandas DataFrames](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) with the [`~datasets.Dataset.from_pandas`] method:\n\n```py\n>>> from datasets import Dataset\n>>> import pandas as pd\n\n# create a Pandas DataFrame\n>>> df = 
pd.read_csv(\"https://huggingface.co/datasets/imodels/credit-card/raw/main/train.csv\")\n>>> df = pd.DataFrame(df)\n# load Dataset from Pandas DataFrame\n>>> dataset = Dataset.from_pandas(df)\n```\n\nUse the `split` parameter to specify the name of the dataset split:\n\n```py\n>>> train_ds = Dataset.from_pandas(train_df, split=\"train\")\n>>> test_ds = Dataset.from_pandas(test_df, split=\"test\")\n```\n\nIf the dataset doesn't look as expected, you should explicitly [specify your dataset features](loading#specify-features). A [pandas.Series](https://pandas.pydata.org/docs/reference/api/pandas.Series.html) may not always carry enough information for Arrow to automatically infer a data type. For example, if a DataFrame is of length `0` or if the Series only contains `None/NaN` objects, the type is set to `null`.\n\n## HDF5 files\n\n[HDF5](https://www.hdfgroup.org/solutions/hdf5/) files are commonly used for storing large amounts of numerical data in scientific computing and machine learning. Loading HDF5 files with 🤗 Datasets is similar to loading CSV files:\n\n```py\n>>> from datasets import load_dataset\n>>> dataset = load_dataset(\"hdf5\", data_files=\"data.h5\")\n```\n\nNote that the HDF5 loader assumes that the file has \"tabular\" structure, i.e. that all datasets in the file have (the same number of) rows on their first dimension.\n\n## Databases\n\nDatasets stored in databases are typically accessed with SQL queries. With 🤗 Datasets, you can connect to a database, query for the data you need, and create a dataset out of it. Then you can use all the processing features of 🤗 Datasets to prepare your dataset for training.\n\n### SQLite\n\nSQLite is a small, lightweight database that is fast and easy to set up. 
You can use an existing database if you'd like, or follow along and start from scratch.\n\nStart by creating a quick SQLite database with this [Covid-19 data](https://github.com/nytimes/covid-19-data/blob/master/us-states.csv) from the New York Times:\n\n```py\n>>> import sqlite3\n>>> import pandas as pd\n\n>>> conn = sqlite3.connect(\"us_covid_data.db\")\n>>> df = pd.read_csv(\"https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv\")\n>>> df.to_sql(\"states\", conn, if_exists=\"replace\")\n```\n\nThis creates a `states` table in the `us_covid_data.db` database which you can now load into a dataset.\n\nTo connect to the database, you'll need the [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) that identifies your database. Connecting to a database with a URI caches the returned dataset. The URI string differs for each database dialect, so be sure to check the [Database URLs](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) for whichever database you're using.\n\nFor SQLite, it is:\n\n```py\n>>> uri = \"sqlite:///us_covid_data.db\"\n```\n\nLoad the table by passing the table name and URI to [`~datasets.Dataset.from_sql`]:\n\n```py\n>>> from datasets import Dataset\n\n>>> ds = Dataset.from_sql(\"states\", uri)\n>>> ds\nDataset({\n    features: ['index', 'date', 'state', 'fips', 'cases', 'deaths'],\n    num_rows: 54382\n})\n```\n\nThen you can use all of 🤗 Datasets process features like [`~datasets.Dataset.filter`] for example:\n\n```py\n>>> ds.filter(lambda x: x[\"state\"] == \"California\")\n```\n\nYou can also load a dataset from a SQL query instead of an entire table, which is useful for querying and joining multiple tables. 
\n\nLoad the dataset by passing your query and URI to [`~datasets.Dataset.from_sql`]:\n\n```py\n>>> from datasets import Dataset\n\n>>> ds = Dataset.from_sql('SELECT * FROM states WHERE state=\"California\";', uri)\n>>> ds\nDataset({\n    features: ['index', 'date', 'state', 'fips', 'cases', 'deaths'],\n    num_rows: 1019\n})\n```\n\nThen you can use all of 🤗 Datasets process features like [`~datasets.Dataset.filter`] for example:\n\n```py\n>>> ds.filter(lambda x: x[\"cases\"] > 10000)\n```\n\n### PostgreSQL\n\nYou can also connect and load a dataset from a PostgreSQL database; however, we won't directly demonstrate how in the documentation because the example is only meant to be run in a notebook. Instead, take a look at how to install and set up a PostgreSQL server in this [notebook](https://colab.research.google.com/github/nateraw/huggingface-hub-examples/blob/main/sql_with_huggingface_datasets.ipynb#scrollTo=d83yGQMPHGFi)!\n\nAfter you've set up your PostgreSQL database, you can use the [`~datasets.Dataset.from_sql`] method to load a dataset from a table or query."
  },
  {
    "path": "docs/source/troubleshoot.mdx",
    "content": "# Troubleshooting\n\nThis guide aims to provide you the tools and knowledge required to navigate some common issues. If the suggestions listed \nin this guide do not cover your such situation, please refer to the [Asking for Help](#asking-for-help) section to learn where to\nfind help with your specific issue.\n\n## Issues when uploading datasets with `push_to_hub`\n\n### Authentication issues\n\nIf you are experiencing authentication issues when sharing a dataset on 🤗 Hub using [`Dataset.push_to_hub`] and a Hugging Face \naccess token:\n\n* Make sure that the Hugging Face token you're using to authenticate yourself is a token with **write** permission.\n* On OSX, it may help to clean up all the huggingface.co passwords on your keychain access, as well as reconfigure `git config --global credential.helper osxkeychain`, before using `huggingface-cli login`.\n\nAlternatively, you can use SSH keys to authenticate yourself - read more in the [🤗 Hub documentation](https://huggingface.co/docs/hub/security-git-ssh).\n \n### Lost connection on large dataset upload \n\nWhen uploading large datasets to Hub, if the number of dataset shards is large, it can create too many commits for the Hub in a \nshort period. This will result in a connection error.\nThe connection error can also be caused by a HTTP 500 error returned by AWS S3 bucket that Hub uses internally.\nIn either situation, you can re-run [`Dataset.push_to_hub`] to proceed with the dataset upload. Hub will check the SHAs \nof already uploaded shards to avoid reuploading them. \nWe are working on making upload process more robust to transient errors, so updating to the latest library version is \nalways a good idea.\n\n### `Too Many Requests`\n\nUploading large datasets via `push_to_hub()` can result in an error: \n\n```bash\nHfHubHTTPError: 429 Client Error: Too Many Requests for url: ...\nYou have exceeded our hourly quotas for action: commit. 
We invite you to retry later.\n```\n\nIf you encounter this issue, you need to upgrade the `datasets` library to the latest version (or at least `2.15.0`).\n\n## Issues when creating datasets from custom data\n\n### Loading images and audio from a folder\n\nWhen creating a dataset from a folder, one of the most common issues is that the file structure does not follow the \nexpected format, or there's an issue with the metadata file.\n\nLearn more about required folder structure in corresponding documentation pages:\n\n* [AudioFolder](https://huggingface.co/docs/datasets/audio_dataset#audiofolder)\n* [ImageFolder](https://huggingface.co/docs/datasets/image_dataset#imagefolder)\n\n\n### Pickling issues \n\n#### Pickling issues when using `Dataset.from_generator`\n\nWhen creating a dataset, [`IterableDataset.from_generator`] and [`Dataset.from_generator`] expect a \"picklable\" generator function.\nThis is required to hash the function using [`pickle`](https://docs.python.org/3/library/pickle.html) to be able to cache the dataset on disk.\n\nWhile generator functions are generally \"picklable\", note that generator objects are not. So if you're using a generator object, \nyou will encounter a `TypeError` like this:\n\n```bash\nTypeError: cannot pickle 'generator' object\n```\n\nThis error can also occur when using a generator function that uses a global object that is not \"picklable\", such as a \nDB connection, for example. If that's the case, you can initialize such object directly inside the generator function to \navoid this error.\n\n#### Pickling issues with `Dataset.map`\n\nPickling errors can also happen in the multiprocess [`Dataset.map`] - objects are pickled to be passed to child processes.\nIf the objects used in the transformation are not picklable, it's not possible to cache the result of `map`, which leads to an error being raised. 
\n\nHere are some ways to address this issue:\n* A universal solution to pickle issues is to make sure the objects (or generator classes) are picklable by manually implementing `__getstate__` / `__setstate__` / `__reduce__`.\n* You can also provide your own unique hash in `map` with the `new_fingerprint` argument.\n* You can also disable caching by calling `datasets.disable_caching()`; however, this is undesirable - [read more about the importance of the cache](cache).\n\n## Asking for help\n\nIf the above troubleshooting advice did not help you resolve your issue, reach out for help to the community and the team.\n\n### Forums \n\nAsk for help on the Hugging Face forums - post your question in the [🤗Datasets category](https://discuss.huggingface.co/c/datasets/10). \nMake sure to write a descriptive post with relevant context about your setup and reproducible code to maximize the likelihood that your problem is solved!\n\n### Discord\n\nPost a question on [Discord](http://hf.co/join/discord), and let the team and the community help you.\n\n### Community Discussions on 🤗 Hub\n\nIf you are facing issues creating a custom dataset on Hub, you can ask the Hugging Face team for help by opening a discussion in the Community tab of your dataset with this message:\n\n```text\n# Dataset review request for <Dataset name>\n\n## Description\n\n<brief description of the dataset>\n\n## Files to review\n\n- file1\n- file2\n- ...\n\ncc @lhoestq @albertvillanova\n```\n\n### GitHub Issues\n\nFinally, if you suspect you have found a bug related to the library itself, create an Issue on the 🤗 Datasets \n[GitHub repository](https://github.com/huggingface/datasets/issues). Include context regarding the bug: code snippet to reproduce,\ndetails about your environment and data, etc. to help us figure out what's wrong and how we can fix it.\n"
  },
  {
    "path": "docs/source/tutorial.md",
    "content": "# Overview\n\nWelcome to the 🤗 Datasets tutorials! These beginner-friendly tutorials will guide you through the fundamentals of working with 🤗 Datasets. You'll load and prepare a dataset for training with your machine learning framework of choice. Along the way, you'll learn how to load different dataset configurations and splits, interact with and see what's inside your dataset, preprocess, and share a dataset to the [Hub](https://huggingface.co/datasets).\n\nThe tutorials assume some basic knowledge of Python and a machine learning framework like PyTorch or TensorFlow. If you're already familiar with these, feel free to check out the [quickstart](./quickstart) to see what you can do with 🤗 Datasets.\n\n> [!TIP]\n> The tutorials only cover the basic skills you need to use 🤗 Datasets. There are many other useful functionalities and applications that aren't discussed here. If you're interested in learning more, take a look at [Chapter 5](https://huggingface.co/course/chapter5/1?fw=pt) of the Hugging Face course.\n\nIf you have any questions about 🤗 Datasets, feel free to join and ask the community on our [forum](https://discuss.huggingface.co/c/datasets/10).\n\nLet's get started! 🏁\n"
  },
  {
    "path": "docs/source/upload_dataset.mdx",
    "content": "# Share a dataset to the Hub\n\nThe [Hub](https://huggingface.co/datasets) is home to an extensive collection of community-curated and popular research datasets. We encourage you to share your dataset to the Hub to help grow the ML community and accelerate progress for everyone. All contributions are welcome; adding a dataset is just a drag and drop away!\n\nStart by [creating a Hugging Face Hub account](https://huggingface.co/join) if you don't have one yet.\n\n## Upload with the Hub UI\n\nThe Hub's web-based interface allows users without any developer experience to upload a dataset.\n\n### Create a repository\n\nA repository hosts all your dataset files, including the revision history, making storing more than one dataset version possible.\n\n1. Click on your profile and select **New Dataset** to create a new dataset repository. \n2. Pick a name for your dataset, and choose whether it is a public or private dataset. A public dataset is visible to anyone, whereas a private dataset can only be viewed by you or members of your organization.\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/create_repo.png\"/>\n</div>\n\n### Upload dataset\n\n1. Once you've created a repository, navigate to the **Files and versions** tab to add a file. Select **Add file** to upload your dataset files. We support many text, audio, and image data extensions such as `.csv`, `.mp3`, and `.jpg` among many others. For text data extensions like `.csv`, `.json`, `.jsonl`, and `.txt`, we recommend compressing them before uploading to the Hub (to `.zip` or `.gz` file extension for example).\n\n    Text file extensions are not tracked by Git LFS by default, and if they're greater than 10MB, they will not be committed and uploaded. Take a look at the `.gitattributes` file in your repository for a complete list of tracked file extensions. 
For this tutorial, you can use the following sample `.csv` files since they're small: <a href=\"https://huggingface.co/datasets/stevhliu/demo/raw/main/train.csv\" download>train.csv</a>, <a href=\"https://huggingface.co/datasets/stevhliu/demo/raw/main/test.csv\" download>test.csv</a>.\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/upload_files.png\"/>\n</div>\n\n2. Drag and drop your dataset files and add a brief descriptive commit message.\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/commit_files.png\"/>\n</div>\n\n3. After uploading your dataset files, they are stored in your dataset repository.\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/files_stored.png\"/>\n</div>\n\n### Create a Dataset card\n\nAdding a Dataset card is super valuable for helping users find your dataset and understand how to use it responsibly.\n\n1. Click on **Create Dataset Card** to create a Dataset card. This button creates a `README.md` file in your repository.\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/dataset_card.png\"/>\n</div>\n\n2. At the top, you'll see the **Metadata UI** with several fields to select from like license, language, and task categories. These are the most important tags for helping users discover your dataset on the Hub. 
When you select an option from each field, they'll be automatically added to the top of the dataset card.\n\n    You can also look at the [Dataset Card specifications](https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1), which has a complete set of (but not required) tag options like `annotations_creators`, to help you choose the appropriate tags.\n\n<div class=\"flex justify-center\">\n    <img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/metadata_ui.png\"/>\n</div>\n\n3. Click on the **Import dataset card template** link at the top of the editor to automatically create a dataset card template. Filling out the template is a great way to introduce your dataset to the community and help users understand how to use it. For a detailed example of what a good Dataset card should look like, take a look at the [CNN DailyMail Dataset card](https://huggingface.co/datasets/cnn_dailymail).\n\n### Load dataset\n\nOnce your dataset is stored on the Hub, anyone can load it with the [`load_dataset`] function:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"stevhliu/demo\")\n```\n\n## Upload with Python\n\nUsers who prefer to upload a dataset programmatically can use the [huggingface_hub](https://huggingface.co/docs/huggingface_hub/index) library. This library allows users to interact with the Hub from Python. \n\n1. Begin by installing the library:\n\n```bash\npip install huggingface_hub\n```\n\n2. To upload a dataset on the Hub in Python, you need to log in to your Hugging Face account:\n\n```bash\nhuggingface-cli login\n```\n\n3. Use the [`push_to_hub()`](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.DatasetDict.push_to_hub) function to help you add, commit, and push a file to your repository:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"stevhliu/demo\")\n# dataset = dataset.map(...)  
# do all your processing here\n>>> dataset.push_to_hub(\"stevhliu/processed_demo\")\n```\n\nTo set your dataset as private, set the `private` parameter to `True`. This parameter will only work if you are creating a repository for the first time.\n\n```py\n>>> dataset.push_to_hub(\"stevhliu/private_processed_demo\", private=True)\n```\n\nTo add a new configuration (or subset) to a dataset or to add a new split (train/validation/test), please refer to the [`Dataset.push_to_hub`] documentation.\n\n### Privacy\n\nA private dataset is only accessible by you. Similarly, if you share a dataset within your organization, then members of the organization can also access the dataset.\n\nLoad a private dataset by providing your authentication token to the `token` parameter:\n\n```py\n>>> from datasets import load_dataset\n\n# Load a private individual dataset\n>>> dataset = load_dataset(\"stevhliu/demo\", token=True)\n\n# Load a private organization dataset\n>>> dataset = load_dataset(\"organization/dataset_name\", token=True)\n```\n\n## What's next?\n\nCongratulations, you've completed the tutorials! 🥳\n\nFrom here, you can go on to:\n\n- Learn more about how to use 🤗 Datasets' other functions to [process your dataset](process).\n- [Stream large datasets](stream) without downloading them locally.\n- [Define your dataset splits and configurations](repository_structure) and share your dataset with the community.\n\nIf you have any questions about 🤗 Datasets, feel free to join and ask the community on our [forum](https://discuss.huggingface.co/c/datasets/10).\n"
  },
  {
    "path": "docs/source/use_dataset.mdx",
    "content": "# Preprocess\n\nIn addition to loading datasets, 🤗 Datasets other main goal is to offer a diverse set of preprocessing functions to get a dataset into an appropriate format for training with your machine learning framework.\n\nThere are many possible ways to preprocess a dataset, and it all depends on your specific dataset. Sometimes you may need to rename a column, and other times you might need to unflatten nested fields. 🤗 Datasets provides a way to do most of these things. But in nearly all preprocessing cases, depending on your dataset modality, you'll need to:\n\n- Tokenize a text dataset.\n- Resample an audio dataset.\n- Apply transforms to an image dataset.\n\nThe last preprocessing step is usually setting your dataset format to be compatible with your machine learning framework's expected input format.\n\nIn this tutorial, you'll also need to install the 🤗 Transformers library:\n\n```bash\npip install transformers\n```\n\nGrab a dataset of your choice and follow along!\n\n## Tokenize text\n\nModels cannot process raw text, so you'll need to convert the text into numbers. Tokenization provides a way to do this by dividing text into individual words called _tokens_. Tokens are finally converted to numbers.\n\n> [!TIP]\n> Check out the [Tokenizers](https://huggingface.co/course/chapter2/4?fw=pt) section in Chapter 2 of the Hugging Face course to learn more about tokenization and different tokenization algorithms.\n\n**1**. Start by loading the [rotten_tomatoes](https://huggingface.co/datasets/rotten_tomatoes) dataset and the tokenizer corresponding to a pretrained [BERT](https://huggingface.co/bert-base-uncased) model. 
Using the same tokenizer as the pretrained model is important because you want to make sure the text is split in the same way.\n\n```py\n>>> from transformers import AutoTokenizer\n>>> from datasets import load_dataset\n\n>>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n>>> dataset = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n```\n\n**2**. Call your tokenizer on the first row of `text` in the dataset:\n\n```py\n>>> tokenizer(dataset[0][\"text\"])\n{'input_ids': [101, 1103, 2067, 1110, 17348, 1106, 1129, 1103, 6880, 1432, 112, 188, 1207, 107, 14255, 1389, 107, 1105, 1115, 1119, 112, 188, 1280, 1106, 1294, 170, 24194, 1256, 3407, 1190, 170, 11791, 5253, 188, 1732, 7200, 10947, 12606, 2895, 117, 179, 7766, 118, 172, 15554, 1181, 3498, 6961, 3263, 1137, 188, 1566, 7912, 14516, 6997, 119, 102],\n 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n```\n\nThe tokenizer returns a dictionary with three items:\n\n- `input_ids`: the numbers representing the tokens in the text.\n- `token_type_ids`: indicates which sequence a token belongs to if there is more than one sequence.\n- `attention_mask`: indicates whether a token should be masked or not.\n\nThese values are actually the model inputs.\n\n**3**. The fastest way to tokenize your entire dataset is to use the [`~Dataset.map`] function. This function speeds up tokenization by applying the tokenizer to batches of examples instead of individual examples. Set the `batched` parameter to `True`:\n\n```py\n>>> def tokenization(example):\n...     
return tokenizer(example[\"text\"])\n\n>>> dataset = dataset.map(tokenization, batched=True)\n```\n\n**4**. Set the format of your dataset to be compatible with your machine learning framework:\n\n<frameworkcontent>\n<pt>\nUse the [`~Dataset.set_format`] function to set the dataset format to be compatible with PyTorch:\n\n```py\n>>> dataset.set_format(type=\"torch\", columns=[\"input_ids\", \"token_type_ids\", \"attention_mask\", \"label\"])\n>>> dataset.format['type']\n'torch'\n```\n\n</pt>\n<tf>\nUse the [`~Dataset.to_tf_dataset`] function to set the dataset format to be compatible with TensorFlow. You'll also need to import a [data collator](https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorWithPadding) from 🤗 Transformers to combine the varying sequence lengths into a single batch of equal lengths:\n\n```py\n>>> from transformers import DataCollatorWithPadding\n\n>>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors=\"tf\")\n>>> tf_dataset = dataset.to_tf_dataset(\n...     columns=[\"input_ids\", \"token_type_ids\", \"attention_mask\"],\n...     label_cols=[\"label\"],\n...     batch_size=2,\n...     collate_fn=data_collator,\n...     shuffle=True\n... )\n```\n\n</tf>\n</frameworkcontent>\n\n**5**. The dataset is now ready for training with your machine learning framework!\n\n## Resample audio signals\n\nAudio inputs like text datasets need to be divided into discrete data points. This is known as _sampling_; the sampling rate tells you how much of the speech signal is captured per second. It is important to make sure the sampling rate of your dataset matches the sampling rate of the data used to pretrain the model you're using. If the sampling rates are different, the pretrained model may perform poorly on your dataset because it doesn't recognize the differences in the sampling rate.\n\n**1**. 
Start by loading the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset, the [`Audio`] feature, and the feature extractor corresponding to a pretrained [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h) model:\n\n```py\n>>> from transformers import AutoFeatureExtractor\n>>> from datasets import load_dataset, Audio\n\n>>> feature_extractor = AutoFeatureExtractor.from_pretrained(\"facebook/wav2vec2-base-960h\")\n>>> dataset = load_dataset(\"PolyAI/minds14\", \"en-US\", split=\"train\")\n```\n\n**2**. Index into the first row of the dataset. When you call the `audio` column of the dataset, it is automatically decoded and resampled:\n\n```py\n>>> audio = dataset[0][\"audio\"]\n>>> print(audio)\n<datasets.features._torchcodec.AudioDecoder object at 0x11642b6a0>\n>>> audio.get_all_samples().sample_rate\n8000\n```\n\n**3**. Reading a dataset card is incredibly useful and can give you a lot of information about the dataset. A quick look at the MInDS-14 dataset card tells you the sampling rate is 8kHz. Likewise, you can get many details about a model from its model card. The Wav2Vec2 model card says it was sampled on 16kHz speech audio. This means you'll need to upsample the MInDS-14 dataset to match the sampling rate of the model.\n\nUse the [`~Dataset.cast_column`] function and set the `sampling_rate` parameter in the [`Audio`] feature to upsample the audio signal. When you call the `audio` column now, it is decoded and resampled to 16kHz:\n\n```py\n>>> dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=16_000))\n>>> audio = dataset[0][\"audio\"]\n>>> print(audio)\n<datasets.features._torchcodec.AudioDecoder object at 0x11642b6a0>\n>>> audio.get_all_samples().sample_rate\n16000\n```\n\n**4**. Use the [`~Dataset.map`] function to resample the entire dataset to 16kHz. This function speeds up resampling by applying the feature extractor to batches of examples instead of individual examples. 
Set the `batched` parameter to `True`:\n\n```py\n>>> def preprocess_function(examples):\n...     audio_arrays = [x.get_all_samples().data for x in examples[\"audio\"]]\n...     inputs = feature_extractor(\n...         audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True\n...     )\n...     return inputs\n\n>>> dataset = dataset.map(preprocess_function, batched=True)\n```\n\n**5**. The dataset is now ready for training with your machine learning framework!\n\n## Apply data augmentations\n\nThe most common preprocessing you'll do with image datasets is _data augmentation_, a process that introduces random variations to an image without changing the meaning of the data. This can mean changing the color properties of an image or randomly cropping an image. You are free to use any data augmentation library you like, and 🤗 Datasets will help you apply your data augmentations to your dataset.\n\n**1**. Start by loading the [Beans](https://huggingface.co/datasets/AI-Lab-Makerere/beans) dataset, the `Image` feature, and the feature extractor corresponding to a pretrained [ViT](https://huggingface.co/google/vit-base-patch16-224-in21k) model:\n\n```py\n>>> from transformers import AutoFeatureExtractor\n>>> from datasets import load_dataset, Image\n\n>>> feature_extractor = AutoFeatureExtractor.from_pretrained(\"google/vit-base-patch16-224-in21k\")\n>>> dataset = load_dataset(\"AI-Lab-Makerere/beans\", split=\"train\")\n```\n\n**2**. Index into the first row of the dataset. When you call the `image` column of the dataset, the underlying PIL object is automatically decoded into an image.\n\n```py\n>>> dataset[0][\"image\"]\n<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x500 at 0x7FE5A047CC70>\n```\n\nMost image models expect the image to be in the RGB mode. 
The Beans images are already in the RGB mode, but if your dataset contains images in a different mode, you can use the [`~Dataset.cast_column`] function to set the mode to RGB:\n\n```py\n>>> dataset = dataset.cast_column(\"image\", Image(mode=\"RGB\"))\n```\n\n**3**. Now let's apply data augmentations to your images. 🤗 Datasets works with any augmentation library, and in this example we'll use Albumentations.\n\n[Albumentations](https://albumentations.ai) is a popular image augmentation library that provides a [rich set of transforms](https://albumentations.ai/docs/reference/supported-targets-by-transform/) including spatial-level transforms, pixel-level transforms, and mixing-level transforms.\n\nInstall Albumentations:\n\n```bash\npip install albumentations\n```\n\n**4**. Create a typical augmentation pipeline with Albumentations:\n\n```py\n>>> import albumentations as A\n>>> import numpy as np\n>>> from PIL import Image\n\n>>> transform = A.Compose([\n...     A.RandomCrop(height=256, width=256, pad_if_needed=True, p=1),\n...     A.HorizontalFlip(p=0.5),\n...     A.ColorJitter(p=0.5)\n... ])\n```\n\n**5**. Since 🤗 Datasets uses PIL images but Albumentations expects NumPy arrays, you need to convert between formats:\n\n```py\n>>> def albumentations_transforms(examples):\n...     # Apply Albumentations transforms\n...     transformed_images = []\n...     for image in examples[\"image\"]:\n...         # Convert PIL to numpy array (OpenCV format)\n...         image_np = np.array(image.convert(\"RGB\"))\n...         \n...         # Apply Albumentations transforms\n...         transformed_image = transform(image=image_np)[\"image\"]\n...         \n...         # Convert back to PIL Image\n...         pil_image = Image.fromarray(transformed_image)\n...         transformed_images.append(pil_image)\n...     \n...     examples[\"pixel_values\"] = transformed_images\n...     return examples\n```\n\n**6**. 
Apply the transform using [`~Dataset.with_transform`]:\n\n```py\n>>> dataset = dataset.with_transform(albumentations_transforms)\n>>> dataset[0][\"pixel_values\"]\n```\n\n**Key points when using Albumentations with 🤗 Datasets:**\n- Convert PIL images to NumPy arrays before applying transforms\n- Albumentations returns a dictionary with the transformed image under the \"image\" key\n- Convert the result back to PIL format after transformation\n\n**7**. The dataset is now ready for training with your machine learning framework!\n"
  },
  {
    "path": "docs/source/use_with_jax.mdx",
    "content": "# Use with JAX\n\nThis document is a quick introduction to using `datasets` with JAX, with a particular focus on how to get\n`jax.Array` objects out of our datasets, and how to use them to train JAX models.\n\n> [!TIP]\n> `jax` and `jaxlib` are required to reproduce to code above, so please make sure you\n> install them as `pip install datasets[jax]`.\n\n## Dataset format\n\nBy default, datasets return regular Python objects: integers, floats, strings, lists, etc., and \nstring and binary objects are unchanged, since JAX only supports numbers.\n\nTo get JAX arrays (numpy-like) instead, you can set the format of the dataset to `jax`:\n\n```py\n>>> from datasets import Dataset\n>>> data = [[1, 2], [3, 4]]\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> ds = ds.with_format(\"jax\")\n>>> ds[0]\n{'data': DeviceArray([1, 2], dtype=int32)}\n>>> ds[:2]\n{'data': DeviceArray([\n    [1, 2],\n    [3, 4]], dtype=int32)}\n```\n\n> [!TIP]\n> A [`Dataset`] object is a wrapper of an Arrow table, which allows fast reads from arrays in the dataset to JAX arrays.\n\nNote that the exact same procedure applies to `DatasetDict` objects, so that\nwhen setting the format of a `DatasetDict` to `jax`, all the `Dataset`s there\nwill be formatted as `jax`:\n\n```py\n>>> from datasets import DatasetDict\n>>> data = {\"train\": {\"data\": [[1, 2], [3, 4]]}, \"test\": {\"data\": [[5, 6], [7, 8]]}}\n>>> dds = DatasetDict.from_dict(data)\n>>> dds = dds.with_format(\"jax\")\n>>> dds[\"train\"][:2]\n{'data': DeviceArray([\n    [1, 2],\n    [3, 4]], dtype=int32)}\n```\n\nAnother thing you'll need to take into consideration is that the formatting is not applied\nuntil you actually access the data. 
So if you want to get a JAX array out of a dataset,\nyou'll need to access the data first, otherwise the format will remain the same.\n\nFinally, to load the data on the device of your choice, you can specify the `device` argument,\nbut note that `jaxlib.xla_extension.Device` is not supported as it's not serializable with either\n`pickle` or `dill`, so you'll need to use its string identifier instead:\n\n```py\n>>> import jax\n>>> from datasets import Dataset\n>>> data = [[1, 2], [3, 4]]\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> device = str(jax.devices()[0])  # Not casting to `str` before passing it to `with_format` will raise a `ValueError`\n>>> ds = ds.with_format(\"jax\", device=device)\n>>> ds[0]\n{'data': DeviceArray([1, 2], dtype=int32)}\n>>> ds[0][\"data\"].device()\nTFRT_CPU_0\n>>> assert ds[0][\"data\"].device() == jax.devices()[0]\nTrue\n```\n\nNote that if the `device` argument is not provided to `with_format` then it will use the default\ndevice which is `jax.devices()[0]`.\n\n### N-dimensional arrays\n\nIf your dataset consists of N-dimensional arrays, you will see that by default they are considered as the same tensor if the shape is fixed:\n\n```py\n>>> from datasets import Dataset\n>>> data = [[[1, 2],[3, 4]], [[5, 6],[7, 8]]]  # fixed shape\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> ds = ds.with_format(\"jax\")\n>>> ds[0]\n{'data': Array([[1, 2],\n        [3, 4]], dtype=int32)}\n```\n\n```py\n>>> from datasets import Dataset\n>>> data = [[[1, 2],[3]], [[4, 5, 6],[7, 8]]]  # varying shape\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> ds = ds.with_format(\"jax\")\n>>> ds[0]\n{'data': [Array([1, 2], dtype=int32), Array([3], dtype=int32)]}\n```\n\nHowever this logic often requires slow shape comparisons and data copies.\nTo avoid this, you must explicitly use the [`Array`] feature type and specify the shape of your tensors:\n\n```py\n>>> from datasets import Dataset, Features, Array2D\n>>> data = [[[1, 2],[3, 4]],[[5, 6],[7, 
8]]]\n>>> features = Features({\"data\": Array2D(shape=(2, 2), dtype='int32')})\n>>> ds = Dataset.from_dict({\"data\": data}, features=features)\n>>> ds = ds.with_format(\"jax\")\n>>> ds[0]\n{'data': Array([[1, 2],\n        [3, 4]], dtype=int32)}\n>>> ds[:2]\n{'data': Array([[[1, 2],\n         [3, 4]],\n \n        [[5, 6],\n         [7, 8]]], dtype=int32)}\n```\n\n### Other feature types\n\n[`ClassLabel`] data is properly converted to arrays:\n\n```py\n>>> from datasets import Dataset, Features, ClassLabel\n>>> labels = [0, 0, 1]\n>>> features = Features({\"label\": ClassLabel(names=[\"negative\", \"positive\"])})\n>>> ds = Dataset.from_dict({\"label\": labels}, features=features)\n>>> ds = ds.with_format(\"jax\")\n>>> ds[:3]\n{'label': DeviceArray([0, 0, 1], dtype=int32)}\n```\n\nString and binary objects are unchanged, since JAX only supports numbers.\n\nThe [`Image`] and [`Audio`] feature types are also supported.\n\n> [!TIP]\n> To use the [`Image`] feature type, you'll need to install the `vision` extra as\n> `pip install datasets[vision]`.\n\n```py\n>>> from datasets import Dataset, Features, Image\n>>> images = [\"path/to/image.png\"] * 10\n>>> features = Features({\"image\": Image()})\n>>> ds = Dataset.from_dict({\"image\": images}, features=features)\n>>> ds = ds.with_format(\"jax\")\n>>> ds[0][\"image\"].shape\n(512, 512, 3)\n>>> ds[0]\n{'image': DeviceArray([[[ 255, 255, 255],\n              [ 255, 255, 255],\n              ...,\n              [ 255, 255, 255],\n              [ 255, 255, 255]]], dtype=uint8)}\n>>> ds[:2][\"image\"].shape\n(2, 512, 512, 3)\n>>> ds[:2]\n{'image': DeviceArray([[[[ 255, 255, 255],\n              [ 255, 255, 255],\n              ...,\n              [ 255, 255, 255],\n              [ 255, 255, 255]]]], dtype=uint8)}\n```\n\n> [!TIP]\n> To use the [`Audio`] feature type, you'll need to install the `audio` extra as\n> `pip install datasets[audio]`.\n\n```py\n>>> from datasets import Dataset, Features, Audio\n>>> audio = 
[\"path/to/audio.wav\"] * 10\n>>> features = Features({\"audio\": Audio()})\n>>> ds = Dataset.from_dict({\"audio\": audio}, features=features)\n>>> ds = ds.with_format(\"jax\")\n>>> ds[0][\"audio\"][\"array\"]\nDeviceArray([-0.059021  , -0.03894043, -0.00735474, ...,  0.0133667 ,\n              0.01809692,  0.00268555], dtype=float32)\n>>> ds[0][\"audio\"][\"sampling_rate\"]\nDeviceArray(44100, dtype=int32, weak_type=True)\n```\n\n## Data loading\n\nJAX doesn't have any built-in data loading capabilities, so you'll need to use a library such\nas [PyTorch](https://pytorch.org/) to load your data using a `DataLoader` or [TensorFlow](https://www.tensorflow.org/)\nusing a `tf.data.Dataset`. Citing the [JAX documentation](https://jax.readthedocs.io/en/latest/notebooks/Neural_Network_and_Data_Loading.html#data-loading-with-pytorch) on this topic:\n\"JAX is laser-focused on program transformations and accelerator-backed NumPy, so we don’t\ninclude data loading or munging in the JAX library. There are already a lot of great data loaders\nout there, so let’s just use them instead of reinventing anything. We’ll grab PyTorch’s data loader,\nand make a tiny shim to make it work with NumPy arrays.\".\n\nSo that's the reason why JAX-formatting in `datasets` is so useful, because it lets you use\nany model from the HuggingFace Hub with JAX, without having to worry about the data loading\npart.\n\n### Using `with_format('jax')`\n\nThe easiest way to get JAX arrays out of a dataset is to use the `with_format('jax')` method. 
Let's assume\nthat we want to train a neural network on the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) available\non the Hugging Face Hub at https://huggingface.co/datasets/ylecun/mnist.\n\n```py\n>>> from datasets import load_dataset\n>>> ds = load_dataset(\"ylecun/mnist\")\n>>> ds = ds.with_format(\"jax\")\n>>> ds[\"train\"][0]\n{'image': DeviceArray([[  0,   0,   0, ...],\n                       [  0,   0,   0, ...],\n                       ...,\n                       [  0,   0,   0, ...],\n                       [  0,   0,   0, ...]], dtype=uint8),\n 'label': DeviceArray(5, dtype=int32)}\n```\n\nOnce the format is set we can feed the dataset to the JAX model in batches using the `Dataset.iter()`\nmethod:\n\n```py\n>>> for epoch in range(epochs):\n...     for batch in ds[\"train\"].iter(batch_size=32):\n...         x, y = batch[\"image\"], batch[\"label\"]\n...         ...\n```\n"
  },
  {
    "path": "docs/source/use_with_numpy.mdx",
    "content": "# Use with NumPy\n\nThis document is a quick introduction to using `datasets` with NumPy, with a particular focus on how to get\n`numpy.ndarray` objects out of our datasets, and how to use them to train models based on NumPy such as `scikit-learn` models.\n\n\n## Dataset format\n\nBy default, datasets return regular Python objects: integers, floats, strings, lists, etc..\n\nTo get NumPy arrays instead, you can set the format of the dataset to `numpy`:\n\n```py\n>>> from datasets import Dataset\n>>> data = [[1, 2], [3, 4]]\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> ds = ds.with_format(\"numpy\")\n>>> ds[0]\n{'data': array([1, 2])}\n>>> ds[:2]\n{'data': array([\n    [1, 2],\n    [3, 4]])}\n```\n\n> [!TIP]\n> A [`Dataset`] object is a wrapper of an Arrow table, which allows fast reads from arrays in the dataset to NumPy arrays.\n\nNote that the exact same procedure applies to `DatasetDict` objects, so that\nwhen setting the format of a `DatasetDict` to `numpy`, all the `Dataset`s there\nwill be formatted as `numpy`:\n\n```py\n>>> from datasets import DatasetDict\n>>> data = {\"train\": {\"data\": [[1, 2], [3, 4]]}, \"test\": {\"data\": [[5, 6], [7, 8]]}}\n>>> dds = DatasetDict.from_dict(data)\n>>> dds = dds.with_format(\"numpy\")\n>>> dds[\"train\"][:2]\n{'data': array([\n    [1, 2],\n    [3, 4]])}\n```\n\n\n### N-dimensional arrays\n\nIf your dataset consists of N-dimensional arrays, you will see that by default they are considered as the same array if the shape is fixed:\n\n```py\n>>> from datasets import Dataset\n>>> data = [[[1, 2],[3, 4]], [[5, 6],[7, 8]]]  # fixed shape\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> ds = ds.with_format(\"numpy\")\n>>> ds[0]\n{'data': array([[1, 2],\n        [3, 4]])}\n```\n\n```py\n>>> from datasets import Dataset\n>>> data = [[[1, 2],[3]], [[4, 5, 6],[7, 8]]]  # varying shape\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> ds = ds.with_format(\"numpy\")\n>>> ds[0]\n{'data': array([array([1, 2]), 
array([3])], dtype=object)}\n```\n\nHowever this logic often requires slow shape comparisons and data copies.\nTo avoid this, you must explicitly use the [`Array`] feature type and specify the shape of your tensors:\n\n```py\n>>> from datasets import Dataset, Features, Array2D\n>>> data = [[[1, 2],[3, 4]],[[5, 6],[7, 8]]]\n>>> features = Features({\"data\": Array2D(shape=(2, 2), dtype='int32')})\n>>> ds = Dataset.from_dict({\"data\": data}, features=features)\n>>> ds = ds.with_format(\"numpy\")\n>>> ds[0]\n{'data': array([[1, 2],\n        [3, 4]])}\n>>> ds[:2]\n{'data': array([[[1, 2],\n         [3, 4]],\n \n        [[5, 6],\n         [7, 8]]])}\n```\n\n### Other feature types\n\n[`ClassLabel`] data is properly converted to arrays:\n\n```py\n>>> from datasets import Dataset, Features, ClassLabel\n>>> labels = [0, 0, 1]\n>>> features = Features({\"label\": ClassLabel(names=[\"negative\", \"positive\"])})\n>>> ds = Dataset.from_dict({\"label\": labels}, features=features)\n>>> ds = ds.with_format(\"numpy\")\n>>> ds[:3]\n{'label': array([0, 0, 1])}\n```\n\nString and binary objects are unchanged, since NumPy only supports numbers.\n\nThe [`Image`] and [`Audio`] feature types are also supported.\n\n> [!TIP]\n> To use the [`Image`] feature type, you'll need to install the `vision` extra as\n> `pip install datasets[vision]`.\n\n```py\n>>> from datasets import Dataset, Features, Image\n>>> images = [\"path/to/image.png\"] * 10\n>>> features = Features({\"image\": Image()})\n>>> ds = Dataset.from_dict({\"image\": images}, features=features)\n>>> ds = ds.with_format(\"numpy\")\n>>> ds[0][\"image\"].shape\n(512, 512, 3)\n>>> ds[0]\n{'image': array([[[ 255, 255, 255],\n              [ 255, 255, 255],\n              ...,\n              [ 255, 255, 255],\n              [ 255, 255, 255]]], dtype=uint8)}\n>>> ds[:2][\"image\"].shape\n(2, 512, 512, 3)\n>>> ds[:2]\n{'image': array([[[[ 255, 255, 255],\n              [ 255, 255, 255],\n              ...,\n              [ 255, 255, 
255],\n              [ 255, 255, 255]]]], dtype=uint8)}\n```\n\n> [!TIP]\n> To use the [`Audio`] feature type, you'll need to install the `audio` extra as\n> `pip install datasets[audio]`.\n\n```py\n>>> from datasets import Dataset, Features, Audio\n>>> audio = [\"path/to/audio.wav\"] * 10\n>>> features = Features({\"audio\": Audio()})\n>>> ds = Dataset.from_dict({\"audio\": audio}, features=features)\n>>> ds = ds.with_format(\"numpy\")\n>>> ds[0][\"audio\"][\"array\"]\narray([-0.059021  , -0.03894043, -0.00735474, ...,  0.0133667 ,\n              0.01809692,  0.00268555], dtype=float32)\n>>> ds[0][\"audio\"][\"sampling_rate\"]\narray(44100, weak_type=True)\n```\n\n## Data loading\n\nNumPy doesn't have any built-in data loading capabilities, so you'll either need to materialize the NumPy arrays like `X, y` to use in `scikit-learn` or use a library such as [PyTorch](https://pytorch.org/) to load your data using a `DataLoader`.\n\n### Using `with_format('numpy')`\n\nThe easiest way to get NumPy arrays out of a dataset is to use the `with_format('numpy')` method. Let's assume\nthat we want to train a neural network on the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) available\non the Hugging Face Hub at https://huggingface.co/datasets/ylecun/mnist.\n\n```py\n>>> from datasets import load_dataset\n>>> ds = load_dataset(\"ylecun/mnist\")\n>>> ds = ds.with_format(\"numpy\")\n>>> ds[\"train\"][0]\n{'image': array([[  0,   0,   0, ...],\n                       [  0,   0,   0, ...],\n                       ...,\n                       [  0,   0,   0, ...],\n                       [  0,   0,   0, ...]], dtype=uint8),\n 'label': array(5)}\n```\n\nOnce the format is set we can feed the dataset to the model based on NumPy in batches using the `Dataset.iter()`\nmethod:\n\n```py\n>>> for epoch in range(epochs):\n...     for batch in ds[\"train\"].iter(batch_size=32):\n...         x, y = batch[\"image\"], batch[\"label\"]\n...         ...\n```\n"
  },
  {
    "path": "docs/source/use_with_pandas.mdx",
    "content": "# Use with Pandas\n\nThis document is a quick introduction to using `datasets` with Pandas, with a particular focus on how to process\ndatasets using Pandas functions, and how to convert a dataset to Pandas or from Pandas.\n\nThis is particularly useful as it allows fast operations, since `datasets` uses PyArrow under the hood and PyArrow is well integrated with Pandas.\n\n## Dataset format\n\nBy default, datasets return regular Python objects: integers, floats, strings, lists, etc.\n\nTo get Pandas DataFrames or Series instead, you can set the format of the dataset to `pandas` using [`Dataset.with_format`]:\n\n```py\n>>> from datasets import Dataset\n>>> data = {\"col_0\": [\"a\", \"b\", \"c\", \"d\"], \"col_1\": [0., 0., 1., 1.]}\n>>> ds = Dataset.from_dict(data)\n>>> ds = ds.with_format(\"pandas\")\n>>> ds[0]       # pd.DataFrame\n  col_0  col_1\n0     a    0.0\n>>> ds[:2]      # pd.DataFrame\n  col_0  col_1\n0     a    0.0\n1     b    0.0\n>>> ds[\"data\"]  # pd.Series\n0    a\n1    b\n2    c\n3    d\nName: col_0, dtype: object\n```\n\nThis also works for `IterableDataset` objects obtained e.g. using `load_dataset(..., streaming=True)`:\n\n```py\n>>> ds = ds.with_format(\"pandas\")\n>>> for df in ds.iter(batch_size=2):\n...     print(df)\n...     break\n  col_0  col_1\n0     a    0.0\n1     b    0.0\n```\n\n## Process data\n\nPandas functions are generally faster than regular hand-written python functions, and therefore they are a good option to optimize data processing. 
You can use Pandas functions to process a dataset in [`Dataset.map`] or [`Dataset.filter`]:\n\n```python\n>>> from datasets import Dataset\n>>> data = {\"col_0\": [\"a\", \"b\", \"c\", \"d\"], \"col_1\": [0., 0., 1., 1.]}\n>>> ds = Dataset.from_dict(data)\n>>> ds = ds.with_format(\"pandas\")\n>>> ds = ds.map(lambda df: df.assign(col_2=df.col_1 + 1), batched=True)\n>>> ds[:2]\n  col_0  col_1  col_2\n0     a    0.0    1.0\n1     b    0.0    1.0\n>>> ds = ds.filter(lambda df: df.col_0 == \"b\", batched=True)\n>>> ds[0]\n  col_0  col_1  col_2\n0     b    0.0    1.0\n```\n\nWe use `batched=True` because it is faster to process batches of data in Pandas rather than row by row. It's also possible to use `batch_size=` in `map()` to set the size of each `df`.\n\nThis also works for [`IterableDataset.map`] and [`IterableDataset.filter`].\n\n## Import or Export from Pandas\n\nTo import data from Pandas, you can use [`Dataset.from_pandas`]:\n\n```python\nds = Dataset.from_pandas(df)\n```\n\nAnd you can use [`Dataset.to_pandas`] to export a Dataset to a Pandas DataFrame:\n\n\n```python\ndf = Dataset.to_pandas()\n```\n"
  },
  {
    "path": "docs/source/use_with_polars.mdx",
    "content": "# Use with Polars\n\nThis document is a quick introduction to using `datasets` with Polars, with a particular focus on how to process\ndatasets using Polars functions, and how to convert a dataset to Polars or from Polars.\n\nThis is particularly useful as it allows fast zero-copy operations, since both `datasets` and Polars use Arrow under the hood.\n\n## Dataset format\n\nBy default, datasets return regular Python objects: integers, floats, strings, lists, etc.\n\nTo get Polars DataFrames or Series instead, you can set the format of the dataset to `polars` using [`Dataset.with_format`]:\n\n```py\n>>> from datasets import Dataset\n>>> data = {\"col_0\": [\"a\", \"b\", \"c\", \"d\"], \"col_1\": [0., 0., 1., 1.]}\n>>> ds = Dataset.from_dict(data)\n>>> ds = ds.with_format(\"polars\")\n>>> ds[0]       # pl.DataFrame\nshape: (1, 2)\n┌───────┬───────┐\n│ col_0 ┆ col_1 │\n│ ---   ┆ ---   │\n│ str   ┆ f64   │\n╞═══════╪═══════╡\n│ a     ┆ 0.0   │\n└───────┴───────┘\n>>> ds[:2]      # pl.DataFrame\nshape: (2, 2)\n┌───────┬───────┐\n│ col_0 ┆ col_1 │\n│ ---   ┆ ---   │\n│ str   ┆ f64   │\n╞═══════╪═══════╡\n│ a     ┆ 0.0   │\n│ b     ┆ 0.0   │\n└───────┴───────┘\n>>> ds[\"data\"]  # pl.Series\nshape: (4,)\nSeries: 'col_0' [str]\n[\n        \"a\"\n        \"b\"\n        \"c\"\n        \"d\"\n]\n```\n\nThis also works for `IterableDataset` objects obtained e.g. using `load_dataset(..., streaming=True)`:\n\n```py\n>>> ds = ds.with_format(\"polars\")\n>>> for df in ds.iter(batch_size=2):\n...     print(df)\n...     break\nshape: (2, 2)\n┌───────┬───────┐\n│ col_0 ┆ col_1 │\n│ ---   ┆ ---   │\n│ str   ┆ f64   │\n╞═══════╪═══════╡\n│ a     ┆ 0.0   │\n│ b     ┆ 0.0   │\n└───────┴───────┘\n```\n\n## Process data\n\nPolars functions are generally faster than regular hand-written python functions, and therefore they are a good option to optimize data processing. 
You can use Polars functions to process a dataset in [`Dataset.map`] or [`Dataset.filter`]:\n\n```python\n>>> import polars as pl\n>>> from datasets import Dataset\n>>> data = {\"col_0\": [\"a\", \"b\", \"c\", \"d\"], \"col_1\": [0., 0., 1., 1.]}\n>>> ds = Dataset.from_dict(data)\n>>> ds = ds.with_format(\"polars\")\n>>> ds = ds.map(lambda df: df.with_columns(pl.col(\"col_1\").add(1).alias(\"col_2\")), batched=True)\n>>> ds[:2]\nshape: (2, 3)\n┌───────┬───────┬───────┐\n│ col_0 ┆ col_1 ┆ col_2 │\n│ ---   ┆ ---   ┆ ---   │\n│ str   ┆ f64   ┆ f64   │\n╞═══════╪═══════╪═══════╡\n│ a     ┆ 0.0   ┆ 1.0   │\n│ b     ┆ 0.0   ┆ 1.0   │\n└───────┴───────┴───────┘\n>>> ds = ds.filter(lambda df: df[\"col_0\"] == \"b\", batched=True)\n>>> ds[0]\nshape: (1, 3)\n┌───────┬───────┬───────┐\n│ col_0 ┆ col_1 ┆ col_2 │\n│ ---   ┆ ---   ┆ ---   │\n│ str   ┆ f64   ┆ f64   │\n╞═══════╪═══════╪═══════╡\n│ b     ┆ 0.0   ┆ 1.0   │\n└───────┴───────┴───────┘\n```\n\nWe use `batched=True` because it is faster to process batches of data in Polars rather than row by row. It's also possible to use `batch_size=` in `map()` to set the size of each `df`.\n\nThis also works for [`IterableDataset.map`] and [`IterableDataset.filter`].\n\n### Example: data extraction\n\nMany functions are available in Polars and for any data type: string, floats, integers, etc. You can find the full list [here](https://docs.pola.rs/api/python/stable/reference/expressions/functions.html). 
Those functions are written in Rust and run on batches of data, which enables fast data processing.\n\nHere is an example that shows a 5x speed boost using Polars instead of a regular Python function to extract solutions from an LLM reasoning dataset:\n\n```python\nfrom datasets import load_dataset\n\nds = load_dataset(\"ServiceNow-AI/R1-Distill-SFT\", \"v0\", split=\"train\")\n\n# Using a regular python function\npattern = re.compile(\"boxed\\\\{(.*)\\\\}\")\nresult_ds = ds.map(lambda x: {\"value_solution\": m.group(1) if (m:=pattern.search(x[\"solution\"])) else None})\n# Time: 10s\n\n# Using a Polars function\nexpr = pl.col(\"solution\").str.extract(\"boxed\\\\{(.*)\\\\}\").alias(\"value_solution\")\nresult_ds = ds.with_format(\"polars\").map(lambda df: df.with_columns(expr), batched=True)\n# Time: 2s\n```\n\n## Import or Export from Polars\n\nTo import data from Polars, you can use [`Dataset.from_polars`]:\n\n```python\nds = Dataset.from_polars(df)\n```\n\nAnd you can use [`Dataset.to_polars`] to export a Dataset to a Polars DataFrame:\n\n\n```python\ndf = Dataset.to_polars(ds)\n```\n"
  },
  {
    "path": "docs/source/use_with_pyarrow.mdx",
    "content": "# Use with PyArrow\n\nThis document is a quick introduction to using `datasets` with PyArrow, with a particular focus on how to process\ndatasets using Arrow compute functions, and how to convert a dataset to PyArrow or from PyArrow.\n\nThis is particularly useful as it allows fast zero-copy operations, since `datasets` uses PyArrow under the hood.\n\n## Dataset format\n\nBy default, datasets return regular Python objects: integers, floats, strings, lists, etc.\n\nTo get PyArrow Tables or Arrays instead, you can set the format of the dataset to `pyarrow` using [`Dataset.with_format`]:\n\n```py\n>>> from datasets import Dataset\n>>> data = {\"col_0\": [\"a\", \"b\", \"c\", \"d\"], \"col_1\": [0., 0., 1., 1.]}\n>>> ds = Dataset.from_dict(data)\n>>> ds = ds.with_format(\"arrow\")\n>>> ds[0]       # pa.Table\npyarrow.Table\ncol_0: string\ncol_1: double\n----\ncol_0: [[\"a\"]]\ncol_1: [[0]]\n>>> ds[:2]      # pa.Table\npyarrow.Table\ncol_0: string\ncol_1: double\n----\ncol_0: [[\"a\",\"b\"]]\ncol_1: [[0,0]]\n>>> ds[\"data\"]  # pa.array\n<pyarrow.lib.ChunkedArray object at 0x1394312a0>\n[\n  [\n    \"a\",\n    \"b\",\n    \"c\",\n    \"d\"\n  ]\n]\n```\n\nThis also works for `IterableDataset` objects obtained e.g. using `load_dataset(..., streaming=True)`:\n\n```py\n>>> ds = ds.with_format(\"arrow\")\n>>> for table in ds.iter(batch_size=2):\n...     print(table)\n...     break\npyarrow.Table\ncol_0: string\ncol_1: double\n----\ncol_0: [[\"a\",\"b\"]]\ncol_1: [[0,0]]\n```\n\n## Process data\n\nPyArrow functions are generally faster than regular hand-written python functions, and therefore they are a good option to optimize data processing. 
You can use Arrow compute functions to process a dataset in [`Dataset.map`] or [`Dataset.filter`]:\n\n```python\n>>> import pyarrow.compute as pc\n>>> from datasets import Dataset\n>>> data = {\"col_0\": [\"a\", \"b\", \"c\", \"d\"], \"col_1\": [0., 0., 1., 1.]}\n>>> ds = Dataset.from_dict(data)\n>>> ds = ds.with_format(\"arrow\")\n>>> ds = ds.map(lambda t: t.append_column(\"col_2\", pc.add(t[\"col_1\"], 1)), batched=True)\n>>> ds[:2]\npyarrow.Table\ncol_0: string\ncol_1: double\ncol_2: double\n----\ncol_0: [[\"a\",\"b\"]]\ncol_1: [[0,0]]\ncol_2: [[1,1]]\n>>> ds = ds.filter(lambda t: pc.equal(t[\"col_0\"], \"b\"), batched=True)\n>>> ds[0]\npyarrow.Table\ncol_0: string\ncol_1: double\ncol_2: double\n----\ncol_0: [[\"b\"]]\ncol_1: [[0]]\ncol_2: [[1]]\n```\n\nWe use `batched=True` because it is faster to process batches of data in PyArrow rather than row by row. It's also possible to use `batch_size=` in `map()` to set the size of each `table`.\n\nThis also works for [`IterableDataset.map`] and [`IterableDataset.filter`].\n\n## Import or Export from PyArrow\n\nA [`Dataset`] is a wrapper of a PyArrow Table, so you can instantiate a Dataset directly from the Table:\n\n```python\nds = Dataset(table)\n```\n\nYou can access the PyArrow Table of a dataset using [`Dataset.data`], which returns a [`MemoryMappedTable`], an [`InMemoryTable`], or a [`ConcatenationTable`], depending on the origin of the Arrow data and the operations that were applied.\n\nThose objects wrap the underlying PyArrow table accessible at `Dataset.data.table`. This table contains all the data of the dataset, but there might also be an indices mapping at `Dataset._indices` which maps the dataset row indices to the PyArrow Table row indices. This can happen if the dataset has been shuffled with [`Dataset.shuffle`] or if only a subset of the rows are used (e.g. after a [`Dataset.select`]).\n\nIn the general case, you can export a dataset to a PyArrow Table using `table = ds.with_format(\"arrow\")[:]`.\n"
  },
  {
    "path": "docs/source/use_with_pytorch.mdx",
    "content": "# Use with PyTorch\n\nThis document is a quick introduction to using `datasets` with PyTorch, with a particular focus on how to get\n`torch.Tensor` objects out of our datasets, and how to use a PyTorch `DataLoader` and a Hugging Face `Dataset`\nwith the best performance.\n\n## Dataset format\n\nBy default, datasets return regular python objects: integers, floats, strings, lists, etc.\n\nTo get PyTorch tensors instead, you can set the format of the dataset to `pytorch` using [`Dataset.with_format`]:\n\n```py\n>>> from datasets import Dataset\n>>> data = [[1, 2],[3, 4]]\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> ds = ds.with_format(\"torch\")\n>>> ds[0]\n{'data': tensor([1, 2])}\n>>> ds[:2]\n{'data': tensor([[1, 2],\n         [3, 4]])}\n```\n\n> [!TIP]\n> A [`Dataset`] object is a wrapper of an Arrow table, which allows fast zero-copy reads from arrays in the dataset to PyTorch tensors.\n\n\nTo load the data as tensors on a GPU, specify the `device` argument:\n```py\n>>> import torch\n>>> device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n>>> ds = ds.with_format(\"torch\", device=device)\n>>> ds[0]\n{'data': tensor([1, 2], device='cuda:0')}\n```\n\n### N-dimensional arrays\n\nIf your dataset consists of N-dimensional arrays, you will see that by default they are considered as the same tensor if the shape is fixed:\n\n```py\n>>> from datasets import Dataset\n>>> data = [[[1, 2],[3, 4]],[[5, 6],[7, 8]]]  # fixed shape\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> ds = ds.with_format(\"torch\")\n>>> ds[0]\n{'data': tensor([[1, 2],\n         [3, 4]])}\n```\n\n```py\n>>> from datasets import Dataset\n>>> data = [[[1, 2],[3]],[[4, 5, 6],[7, 8]]]  # varying shape\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> ds = ds.with_format(\"torch\")\n>>> ds[0]\n{'data': [tensor([1, 2]), tensor([3])]}\n```\n\nHowever this logic often requires slow shape comparisons and data copies.\nTo avoid this, you must explicitly use the 
[`Array`] feature type and specify the shape of your tensors:\n\n```py\n>>> from datasets import Dataset, Features, Array2D\n>>> data = [[[1, 2],[3, 4]],[[5, 6],[7, 8]]]\n>>> features = Features({\"data\": Array2D(shape=(2, 2), dtype='int32')})\n>>> ds = Dataset.from_dict({\"data\": data}, features=features)\n>>> ds = ds.with_format(\"torch\")\n>>> ds[0]\n{'data': tensor([[1, 2],\n         [3, 4]])}\n>>> ds[:2]\n{'data': tensor([[[1, 2],\n          [3, 4]],\n \n         [[5, 6],\n          [7, 8]]])}\n```\n\n\n### Other feature types\n\n[`ClassLabel`] data are properly converted to tensors:\n\n```py\n>>> from datasets import Dataset, Features, ClassLabel\n>>> labels = [0, 0, 1]\n>>> features = Features({\"label\": ClassLabel(names=[\"negative\", \"positive\"])})\n>>> ds = Dataset.from_dict({\"label\": labels}, features=features) \n>>> ds = ds.with_format(\"torch\")  \n>>> ds[:3]\n{'label': tensor([0, 0, 1])}\n```\n\nString and binary objects are unchanged, since PyTorch only supports numbers.\n\nThe [`Image`] and [`Audio`] feature types are also supported.\n\n> [!TIP]\n> To use the [`Image`] feature type, you'll need to install the `vision` extra as\n> `pip install datasets[vision]`.\n\n```py\n>>> from datasets import Dataset, Features, Audio, Image\n>>> images = [\"path/to/image.png\"] * 10\n>>> features = Features({\"image\": Image()})\n>>> ds = Dataset.from_dict({\"image\": images}, features=features) \n>>> ds = ds.with_format(\"torch\")\n>>> ds[0][\"image\"].shape\ntorch.Size([512, 512, 4])\n>>> ds[0]\n{'image': tensor([[[255, 215, 106, 255],\n         [255, 215, 106, 255],\n         ...,\n         [255, 255, 255, 255],\n         [255, 255, 255, 255]]], dtype=torch.uint8)}\n>>> ds[:2][\"image\"].shape\ntorch.Size([2, 512, 512, 4])\n>>> ds[:2]\n{'image': tensor([[[[255, 215, 106, 255],\n          [255, 215, 106, 255],\n          ...,\n          [255, 255, 255, 255],\n          [255, 255, 255, 255]]]], dtype=torch.uint8)}\n```\n\n> [!TIP]\n> To use the [`Audio`] 
feature type, you'll need to install the `audio` extra as\n> `pip install datasets[audio]`.\n\n```py\n>>> from datasets import Dataset, Features, Audio, Image\n>>> audio = [\"path/to/audio.wav\"] * 10\n>>> features = Features({\"audio\": Audio()})\n>>> ds = Dataset.from_dict({\"audio\": audio}, features=features) \n>>> ds = ds.with_format(\"torch\")  \n>>> ds[0][\"audio\"][\"array\"]\ntensor([ 6.1035e-05,  1.5259e-05,  1.6785e-04,  ..., -1.5259e-05,\n        -1.5259e-05,  1.5259e-05])\n>>> ds[0][\"audio\"][\"sampling_rate\"]\ntensor(44100)\n```\n\n## Data loading\n\nLike `torch.utils.data.Dataset` objects, a [`Dataset`] can be passed directly to a PyTorch `DataLoader`:\n\n```py\n>>> import numpy as np\n>>> from datasets import Dataset \n>>> from torch.utils.data import DataLoader\n>>> data = np.random.rand(16)\n>>> label = np.random.randint(0, 2, size=16)\n>>> ds = Dataset.from_dict({\"data\": data, \"label\": label}).with_format(\"torch\")\n>>> dataloader = DataLoader(ds, batch_size=4)\n>>> for batch in dataloader:\n...     
print(batch)                                                                                            \n{'data': tensor([0.0047, 0.4979, 0.6726, 0.8105]), 'label': tensor([0, 1, 0, 1])}\n{'data': tensor([0.4832, 0.2723, 0.4259, 0.2224]), 'label': tensor([0, 0, 0, 0])}\n{'data': tensor([0.5837, 0.3444, 0.4658, 0.6417]), 'label': tensor([0, 1, 0, 0])}\n{'data': tensor([0.7022, 0.1225, 0.7228, 0.8259]), 'label': tensor([1, 1, 1, 1])}\n```\n\n### Optimize data loading\n\nThere are several ways you can increase the speed your data is loaded which can save you time, especially if you are working with large datasets.\nPyTorch offers parallelized data loading, retrieving batches of indices instead of individually, and streaming to iterate over the dataset without downloading it on disk.\n\n#### Use multiple Workers\n\nYou can parallelize data loading with the `num_workers` argument of a PyTorch `DataLoader` and get a higher throughput.\n\nUnder the hood, the `DataLoader` starts `num_workers` processes.\nEach process reloads the dataset passed to the `DataLoader` and is used to query examples.\nReloading the dataset inside a worker doesn't fill up your RAM, since it simply memory-maps the dataset again from your disk.\n\n```py\n>>> import numpy as np\n>>> from datasets import Dataset, load_from_disk\n>>> from torch.utils.data import DataLoader\n>>> data = np.random.rand(10_000)\n>>> Dataset.from_dict({\"data\": data}).save_to_disk(\"my_dataset\")\n>>> ds = load_from_disk(\"my_dataset\").with_format(\"torch\")\n>>> dataloader = DataLoader(ds, batch_size=32, num_workers=4)\n```\n\n### Stream data\n\nStream a dataset by loading it as an [`IterableDataset`]. 
This allows you to progressively iterate over a remote dataset without downloading it on disk, or over local data files.\nLearn more about which type of dataset is best for your use case in the [choosing between a regular dataset or an iterable dataset](./about_mapstyle_vs_iterable) guide.\n\n\nAn iterable dataset from `datasets` inherits from `torch.utils.data.IterableDataset` so you can pass it to a `torch.utils.data.DataLoader`:\n\n```py\n>>> import numpy as np\n>>> from datasets import Dataset, load_dataset\n>>> from torch.utils.data import DataLoader\n>>> data = np.random.rand(10_000)\n>>> Dataset.from_dict({\"data\": data}).push_to_hub(\"<username>/my_dataset\")  # Upload to the Hugging Face Hub\n>>> my_iterable_dataset = load_dataset(\"<username>/my_dataset\", streaming=True, split=\"train\")\n>>> dataloader = DataLoader(my_iterable_dataset, batch_size=32)\n```\n\nIf the dataset is split in several shards (i.e. if the dataset consists of multiple data files), then you can stream in parallel using `num_workers`:\n\n```py\n>>> my_iterable_dataset = load_dataset(\"deepmind/code_contests\", streaming=True, split=\"train\")\n>>> my_iterable_dataset.num_shards\n39\n>>> dataloader = DataLoader(my_iterable_dataset, batch_size=32, num_workers=4)\n```\n\nIn this case each worker is given a subset of the list of shards to stream from.\n\n### Checkpoint and resume\n\nIf you need a DataLoader that you can checkpoint and resume in the middle of training, you can use the `StatefulDataLoader` from [torchdata](https://github.com/pytorch/data):\n\n```py\n>>> from torchdata.stateful_dataloader import StatefulDataLoader\n>>> my_iterable_dataset = load_dataset(\"deepmind/code_contests\", streaming=True, split=\"train\")\n>>> dataloader = StatefulDataLoader(my_iterable_dataset, batch_size=32, num_workers=4)\n>>> # save in the middle of training\n>>> state_dict = dataloader.state_dict()\n>>> # and resume later\n>>> dataloader.load_state_dict(state_dict)\n```\n\nThis is possible thanks to [`IterableDataset.state_dict`] and [`IterableDataset.load_state_dict`].\n\n### Distributed\n\nTo split your dataset across your training nodes, you can use [`datasets.distributed.split_dataset_by_node`]:\n\n```python\nimport os\nfrom datasets.distributed import split_dataset_by_node\n\nds = split_dataset_by_node(ds, rank=int(os.environ[\"RANK\"]), world_size=int(os.environ[\"WORLD_SIZE\"]))\n```\n\nThis works for both map-style datasets and iterable datasets.\nThe dataset is split for the node at rank `rank` in a pool of nodes of size `world_size`.\n\nFor map-style datasets:\n\nEach node is assigned a chunk of data, e.g. rank 0 is given the first chunk of the dataset.\n\nFor iterable datasets:\n\nIf the dataset has a number of shards that is a multiple of `world_size` (i.e. if `dataset.num_shards % world_size == 0`),\nthen the shards are evenly assigned across the nodes, which is the most optimized.\nOtherwise, each node keeps 1 example out of `world_size`, skipping the other examples.\n\nThis can also be combined with a `torch.utils.data.DataLoader` if you want each node to use multiple workers to load the data.\n\n> [!WARNING]\n> If you shuffle your iterable dataset in a distributed setup, make sure to set a fixed `seed` in [`IterableDataset.shuffle`] so the same shuffled list of shards is used on every node to know which shards the node should skip.\n"
  },
  {
    "path": "docs/source/use_with_spark.mdx",
    "content": "# Use with Spark\n\nThis document is a quick introduction to using 🤗 Datasets with Spark, with a particular focus on how to load a Spark DataFrame into a [`Dataset`] object.\n\nFrom there, you have fast access to any element and you can use it as a data loader to train models.\n\n## Load from Spark\n\nA [`Dataset`] object is a wrapper of an Arrow table, which allows fast reads from arrays in the dataset to PyTorch, TensorFlow and JAX tensors.\nThe Arrow table is memory mapped from disk, which can load datasets bigger than your available RAM.\n\nYou can get a [`Dataset`] from a Spark DataFrame using [`Dataset.from_spark`]:\n\n```py\n>>> from datasets import Dataset\n>>> df = spark.createDataFrame(\n...     data=[[1, \"Elia\"], [2, \"Teo\"], [3, \"Fang\"]],\n...     columns=[\"id\", \"name\"],\n... )\n>>> ds = Dataset.from_spark(df)\n```\n\nThe Spark workers write the dataset on disk in a cache directory as Arrow files, and the [`Dataset`] is loaded from there.\n\nAlternatively, you can skip materialization by using [`IterableDataset.from_spark`], which returns an [`IterableDataset`]:\n\n ```py\n >>> from datasets import IterableDataset\n >>> df = spark.createDataFrame(\n ...     data=[[1, \"Elia\"], [2, \"Teo\"], [3, \"Fang\"]],\n ...     columns=[\"id\", \"name\"],\n ... 
)\n >>> ds = IterableDataset.from_spark(df)\n >>> print(next(iter(ds)))\n {\"id\": 1, \"name\": \"Elia\"}\n ```\n\n### Caching\n\nWhen using [`Dataset.from_spark`], the resulting [`Dataset`] is cached; if you call [`Dataset.from_spark`] multiple\ntimes on the same DataFrame it won't re-run the Spark job that writes the dataset as Arrow files on disk.\n\nYou can set the cache location by passing `cache_dir=` to [`Dataset.from_spark`].\nMake sure to use a disk that is available to both your workers and your current machine (the driver).\n\n> [!WARNING]\n> In a different session, a Spark DataFrame doesn't have the same [semantic hash](https://spark.apache.org/docs/3.2.0/api/python/reference/api/pyspark.sql.DataFrame.semanticHash.html), and it will rerun a Spark job and store it in a new cache.\n\n### Feature types\n\nIf your dataset is made of images, audio data or N-dimensional arrays, you can specify the `features=` argument in\n[`Dataset.from_spark`] (or [`IterableDataset.from_spark`]):\n\n```py\n>>> from datasets import Dataset, Features, Image, Value\n>>> data = [(0, open(\"image.png\", \"rb\").read())]\n>>> df = spark.createDataFrame(data, \"idx: int, image: binary\")\n>>> # Also works if you have arrays\n>>> # data = [(0, np.zeros(shape=(32, 32, 3), dtype=np.int32).tolist())]\n>>> # df = spark.createDataFrame(data, \"idx: int, image: array<array<array<int>>>\")\n>>> features = Features({\"idx\": Value(\"int64\"), \"image\": Image()})\n>>> dataset = Dataset.from_spark(df, features=features)\n>>> dataset[0]\n{'idx': 0, 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32>}\n```\n\nYou can check the [`Features`] documentation to know about all the feature types available.\n"
  },
  {
    "path": "docs/source/use_with_tensorflow.mdx",
    "content": "# Using Datasets with TensorFlow\n\nThis document is a quick introduction to using `datasets` with TensorFlow, with a particular focus on how to get\n`tf.Tensor` objects out of our datasets, and how to stream data from Hugging Face `Dataset` objects to Keras methods\nlike `model.fit()`.\n\n## Dataset format\n\nBy default, datasets return regular Python objects: integers, floats, strings, lists, etc.\n\nTo get TensorFlow tensors instead, you can set the format of the dataset to `tf`:\n\n```py\n>>> from datasets import Dataset\n>>> data = [[1, 2],[3, 4]]\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> ds = ds.with_format(\"tf\")\n>>> ds[0]\n{'data': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([1, 2])>}\n>>> ds[:2]\n{'data': <tf.Tensor: shape=(2, 2), dtype=int64, numpy=\narray([[1, 2],\n       [3, 4]])>}\n```\n\n> [!TIP]\n> A [`Dataset`] object is a wrapper of an Arrow table, which allows fast reads from arrays in the dataset to TensorFlow tensors.\n\nThis can be useful for converting your dataset to a dict of `Tensor` objects, or for writing a generator to load TF\nsamples from it. 
If you wish to convert the entire dataset to `Tensor`, simply query the full dataset:\n\n```py\n>>> ds[:]\n{'data': <tf.Tensor: shape=(2, 2), dtype=int64, numpy=\narray([[1, 2],\n       [3, 4]])>}\n```\n\n### N-dimensional arrays\n\nIf your dataset consists of N-dimensional arrays, you will see that by default they are considered as the same tensor if the shape is fixed:\n\n```py\n>>> from datasets import Dataset\n>>> data = [[[1, 2],[3, 4]],[[5, 6],[7, 8]]]  # fixed shape\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> ds = ds.with_format(\"tf\")\n>>> ds[0]\n{'data': <tf.Tensor: shape=(2, 2), dtype=int64, numpy=\n array([[1, 2],\n        [3, 4]])>}\n```\n\nOtherwise, a TensorFlow formatted dataset outputs a `RaggedTensor` instead of a single tensor:\n\n```py\n>>> from datasets import Dataset\n>>> data = [[[1, 2],[3]],[[4, 5, 6],[7, 8]]]  # varying shape\n>>> ds = Dataset.from_dict({\"data\": data})\n>>> ds = ds.with_format(\"tf\")\n>>> ds[0]\n{'data': <tf.RaggedTensor [[1, 2], [3]]>}\n```\n\nHowever this logic often requires slow shape comparisons and data copies.\nTo avoid this, you must explicitly use the [`Array`] feature type and specify the shape of your tensors:\n\n```py\n>>> from datasets import Dataset, Features, Array2D\n>>> data = [[[1, 2],[3, 4]],[[5, 6],[7, 8]]]\n>>> features = Features({\"data\": Array2D(shape=(2, 2), dtype='int32')})\n>>> ds = Dataset.from_dict({\"data\": data}, features=features)\n>>> ds = ds.with_format(\"tf\")\n>>> ds[0]\n{'data': <tf.Tensor: shape=(2, 2), dtype=int64, numpy=\n array([[1, 2],\n        [3, 4]])>}\n>>> ds[:2]\n{'data': <tf.Tensor: shape=(2, 2, 2), dtype=int64, numpy=\n array([[[1, 2],\n         [3, 4]],\n \n        [[5, 6],\n         [7, 8]]])>}\n```\n\n\n### Other feature types\n\n[`ClassLabel`] data are properly converted to tensors:\n\n```py\n>>> from datasets import Dataset, Features, ClassLabel\n>>> labels = [0, 0, 1]\n>>> features = Features({\"label\": ClassLabel(names=[\"negative\", \"positive\"])})\n>>> ds = Dataset.from_dict({\"label\": labels}, features=features) \n>>> ds = ds.with_format(\"tf\")  \n>>> ds[:3]\n{'label': <tf.Tensor: shape=(3,), dtype=int64, numpy=array([0, 0, 1])>}\n```\n\nStrings and binary objects are also supported:\n\n```py\n>>> from datasets import Dataset, Features \n>>> text = [\"foo\", \"bar\"]\n>>> data = [0, 1] \n>>> ds = Dataset.from_dict({\"text\": text, \"data\": data})  \n>>> ds = ds.with_format(\"tf\") \n>>> ds[:2]\n{'text': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'foo', b'bar'], dtype=object)>,\n 'data': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([0, 1])>}\n```\n\nYou can also explicitly format certain columns and leave the other columns unformatted:\n\n```py\n>>> ds = ds.with_format(\"tf\", columns=[\"data\"], output_all_columns=True)\n>>> ds[:2]\n{'data': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([0, 1])>,\n 'text': ['foo', 'bar']}\n```\n\nHere the `text` column is left unformatted, since only the `data` column was selected for formatting.\n\nThe [`Image`] and [`Audio`] feature types are also supported.\n\n> [!TIP]\n> To use the [`Image`] feature type, you'll need to install the `vision` extra as\n> `pip install datasets[vision]`.\n\n```py\n>>> from datasets import Dataset, Features, Audio, Image\n>>> images = [\"path/to/image.png\"] * 10\n>>> features = Features({\"image\": Image()})\n>>> ds = Dataset.from_dict({\"image\": images}, features=features) \n>>> ds = ds.with_format(\"tf\")  \n>>> ds[0]\n{'image': <tf.Tensor: shape=(512, 512, 4), dtype=uint8, numpy=\n array([[[255, 215, 106, 255],\n         [255, 215, 106, 255],\n         ...,\n         [255, 255, 255, 255],\n         [255, 255, 255, 255]]], dtype=uint8)>}\n>>> ds[:2]\n{'image': <tf.Tensor: shape=(2, 512, 512, 4), dtype=uint8, numpy=\n array([[[[255, 215, 106, 255],\n          [255, 215, 106, 255],\n          ...,\n          [255, 255, 255, 255],\n          [255, 255, 255, 255]]]], dtype=uint8)>}\n```\n\n> [!TIP]\n> To use the [`Audio`] feature 
type, you'll need to install the `audio` extra as\n> `pip install datasets[audio]`.\n\n```py\n>>> from datasets import Dataset, Features, Audio, Image\n>>> audio = [\"path/to/audio.wav\"] * 10\n>>> features = Features({\"audio\": Audio()})\n>>> ds = Dataset.from_dict({\"audio\": audio}, features=features) \n>>> ds = ds.with_format(\"tf\")  \n>>> ds[0][\"audio\"][\"array\"]\n<tf.Tensor: shape=(202311,), dtype=float32, numpy=\narray([ 6.1035156e-05,  1.5258789e-05,  1.6784668e-04, ...,\n       -1.5258789e-05, -1.5258789e-05,  1.5258789e-05], dtype=float32)>\n>>> ds[0][\"audio\"][\"sampling_rate\"]\n<tf.Tensor: shape=(), dtype=int32, numpy=44100>\n```\n\n## Data loading\n\nAlthough you can load individual samples and batches just by indexing into your dataset, this won't work if you want\nto use Keras methods like `fit()` and `predict()`. You could write a generator function that shuffles and loads batches\nfrom your dataset and `fit()` on that, but that sounds like a lot of unnecessary work. Instead, if you want to stream\ndata from your dataset on-the-fly, we recommend converting your dataset to a `tf.data.Dataset` using the\n`to_tf_dataset()` method.\n\nThe `tf.data.Dataset` class covers a wide range of use-cases - it is often created from Tensors in memory, or using a load function to read files on disc\nor external storage. The dataset can be transformed arbitrarily with the `map()` method, or methods like `batch()`\nand `shuffle()` can be used to create a dataset that's ready for training. These methods do not modify the stored data\nin any way - instead, the methods build a data pipeline graph that will be executed when the dataset is iterated over,\nusually during model training or inference. 
This is different from the `map()` method of Hugging Face `Dataset` objects,\nwhich runs the map function immediately and saves the new or changed columns.\n\nSince the entire data preprocessing pipeline can be compiled in a `tf.data.Dataset`, this approach allows for massively\nparallel, asynchronous data loading and training. However, the requirement for graph compilation can be a limitation,\nparticularly for Hugging Face tokenizers, which are usually not (yet!) compilable as part of a TF graph. As a result, \nwe usually advise pre-processing the dataset as a Hugging Face dataset, where arbitrary Python functions can be\nused, and then converting to `tf.data.Dataset` afterwards using `to_tf_dataset()` to get a batched dataset ready for\ntraining. To see examples of this approach, please see the [examples](https://github.com/huggingface/transformers/tree/main/examples) or [notebooks](https://huggingface.co/docs/transformers/notebooks) for `transformers`.\n\n### Using `to_tf_dataset()`\n\nUsing `to_tf_dataset()` is straightforward. Once your dataset is preprocessed and ready, simply call it like so:\n\n```py\n>>> from datasets import Dataset\n>>> data = {\"inputs\": [[1, 2],[3, 4]], \"labels\": [0, 1]}\n>>> ds = Dataset.from_dict(data)\n>>> tf_ds = ds.to_tf_dataset(\n            columns=[\"inputs\"],\n            label_cols=[\"labels\"],\n            batch_size=2,\n            shuffle=True\n            )\n```\n\nThe returned `tf_ds` object here is now fully ready to train on, and can be passed directly to `model.fit()`. Note\nthat you set the batch size when creating the dataset, and so you don't need to specify it when calling `fit()`:\n\n```py\n>>> model.fit(tf_ds, epochs=2)\n```\n\nFor a full description of the arguments, please see the [`~Dataset.to_tf_dataset`] documentation. In many cases,\nyou will also need to add a `collate_fn` to your call. This is a function that takes multiple elements of the dataset\nand combines them into a single batch. 
When all elements have the same length, the built-in default collator will\nsuffice, but for more complex tasks a custom collator may be necessary. In particular, many tasks have samples\nwith varying sequence lengths which will require a [data collator](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator) that can pad batches correctly. You can see examples\nof this in the `transformers` NLP [examples](https://github.com/huggingface/transformers/tree/main/examples) and\n[notebooks](https://huggingface.co/docs/transformers/notebooks), where variable sequence lengths are very common.\n\nIf you find that loading with `to_tf_dataset` is slow, you can also use the `num_workers` argument. This spins\nup multiple subprocesses to load data in parallel. This feature is recent and still somewhat experimental - please file\nan issue if you encounter any bugs while using it!\n\n### When to use to_tf_dataset\n\nThe astute reader may have noticed at this point that we have offered two approaches to achieve the same goal - if you\nwant to pass your dataset to a TensorFlow model, you can either convert the dataset to a `Tensor` or `dict` of `Tensors`\nusing `.with_format('tf')`, or you can convert the dataset to a `tf.data.Dataset` with `to_tf_dataset()`. Either of these\ncan be passed to `model.fit()`, so which should you choose?\n\nThe key thing to recognize is that when you convert the whole dataset to `Tensor`s, it is static and fully loaded into\nRAM. This is simple and convenient, but if any of the following apply, you should probably use `to_tf_dataset()`\ninstead:\n\n- Your dataset is too large to fit in RAM. `to_tf_dataset()` streams only one batch at a time, so even very large\n  datasets can be handled with this method.\n- You want to apply random transformations using `dataset.with_transform()` or the `collate_fn`. 
This is\n  common in several modalities, such as image augmentations when training vision models, or random masking when training\n  masked language models. Using `to_tf_dataset()` will apply those transformations\n  at the moment when a batch is loaded, which means the same samples will get different augmentations each time\n  they are loaded. This is usually what you want.\n- Your data has a variable dimension, such as input texts in NLP that consist of varying\n  numbers of tokens. When you create a batch with samples with a variable dimension, the standard solution is to\n  pad the shorter samples to the length of the longest one. When you stream samples from a dataset with `to_tf_dataset`,\n  you can apply this padding to each batch via your `collate_fn`. However, if you want to convert\n  such a dataset to dense `Tensor`s, then you will have to pad samples to the length of the longest sample in *the\n  entire dataset!* This can result in huge amounts of padding, which wastes memory and reduces your model's speed.\n\n### Caveats and limitations\n\nRight now, `to_tf_dataset()` always returns a batched dataset - we will add support for unbatched datasets soon!\n"
  },
  {
    "path": "docs/source/video_dataset.mdx",
    "content": "# Create a video dataset\n\nThis guide will show you how to create a video dataset with `VideoFolder` and some metadata. This is a no-code solution for quickly creating a video dataset with several thousand videos.\n\n> [!TIP]\n> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub.\n\n## VideoFolder\n\nThe `VideoFolder` is a dataset builder designed to quickly load a video dataset with several thousand videos without requiring you to write any code.\n\n> [!TIP]\n> 💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `VideoFolder` creates dataset splits based on your dataset repository structure.\n\n`VideoFolder` automatically infers the class labels of your dataset based on the directory name. Store your dataset in a directory structure like:\n\n```\nfolder/train/dog/golden_retriever.mp4\nfolder/train/dog/german_shepherd.mp4\nfolder/train/dog/chihuahua.mp4\n\nfolder/train/cat/maine_coon.mp4\nfolder/train/cat/bengal.mp4\nfolder/train/cat/birman.mp4\n```\n\nIf the dataset follows the `VideoFolder` structure, then you can load it directly with [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"path/to/folder\")\n```\n\nThis is equivalent to passing `videofolder` manually in [`load_dataset`] and the directory in `data_dir`:\n\n```py\n>>> dataset = load_dataset(\"videofolder\", data_dir=\"/path/to/folder\")\n```\n\nYou can also use `videofolder` to load datasets involving multiple splits. 
To do so, your dataset directory should have the following structure:\n\n```\nfolder/train/dog/golden_retriever.mp4\nfolder/train/cat/maine_coon.mp4\nfolder/test/dog/german_shepherd.mp4\nfolder/test/cat/bengal.mp4\n```\n\n> [!WARNING]\n> If all video files are contained in a single directory or if they are not on the same level of directory structure, `label` column won't be added automatically. If you need it, set `drop_labels=False` explicitly.\n\n\nIf there is additional information you'd like to include about your dataset, like text captions or bounding boxes, add it as a `metadata.csv` file in your folder. This lets you quickly create datasets for different computer vision tasks like text captioning or object detection. You can also use a JSONL file `metadata.jsonl` or a Parquet file `metadata.parquet`.\n\n```\nfolder/train/metadata.csv\nfolder/train/0001.mp4\nfolder/train/0002.mp4\nfolder/train/0003.mp4\n```\n\nYour `metadata.csv` file must have a `file_name` or `*_file_name` field which links video files with their metadata:\n\n```csv\nfile_name,additional_feature\n0001.mp4,This is a first value of a text feature you added to your videos\n0002.mp4,This is a second value of a text feature you added to your videos\n0003.mp4,This is a third value of a text feature you added to your videos\n```\n\nor using `metadata.jsonl`:\n\n```jsonl\n{\"file_name\": \"0001.mp4\", \"additional_feature\": \"This is a first value of a text feature you added to your videos\"}\n{\"file_name\": \"0002.mp4\", \"additional_feature\": \"This is a second value of a text feature you added to your videos\"}\n{\"file_name\": \"0003.mp4\", \"additional_feature\": \"This is a third value of a text feature you added to your videos\"}\n```\n\nHere the `file_name` must be the name of the video file next to the metadata file. 
More generally, it must be the relative path from the directory containing the metadata to the video file.\n\nIt's possible to point to more than one video in each row in your dataset, for example if both your input and output are videos:\n\n```jsonl\n{\"input_file_name\": \"0001.mp4\", \"output_file_name\": \"0001_output.mp4\"}\n{\"input_file_name\": \"0002.mp4\", \"output_file_name\": \"0002_output.mp4\"}\n{\"input_file_name\": \"0003.mp4\", \"output_file_name\": \"0003_output.mp4\"}\n```\n\nYou can also define lists of videos. In that case you need to name the field `file_names` or `*_file_names`. Here is an example:\n\n```jsonl\n{\"videos_file_names\": [\"0001_left.mp4\", \"0001_right.mp4\"], \"label\": \"moving_up\"}\n{\"videos_file_names\": [\"0002_left.mp4\", \"0002_right.mp4\"], \"label\": \"moving_down\"}\n{\"videos_file_names\": [\"0003_left.mp4\", \"0003_right.mp4\"], \"label\": \"moving_right\"}\n```\n\n### Video captioning\n\nVideo captioning datasets have text describing a video. An example `metadata.csv` may look like:\n\n```csv\nfile_name,text\n0001.mp4,This is a golden retriever playing with a ball\n0002.mp4,A german shepherd\n0003.mp4,One chihuahua\n```\n\nLoad the dataset with `VideoFolder`, and it will create a `text` column for the video captions:\n\n```py\n>>> dataset = load_dataset(\"videofolder\", data_dir=\"/path/to/folder\", split=\"train\")\n>>> dataset[0][\"text\"]\n\"This is a golden retriever playing with a ball\"\n```\n\n### Upload dataset to the Hub\n\nOnce you've created a dataset, you can share it to the Hub using `huggingface_hub`, for example. Make sure you have the [huggingface_hub](https://huggingface.co/docs/huggingface_hub/index) library installed and you're logged in to your Hugging Face account (see the [Upload with Python tutorial](upload_dataset#upload-with-python) for more details).\n\nUpload your dataset with `huggingface_hub.HfApi.upload_folder`:\n\n```py\nfrom huggingface_hub import HfApi\napi = HfApi()\n\napi.upload_folder(\n    folder_path=\"/path/to/local/dataset\",\n    repo_id=\"username/my-cool-dataset\",\n    repo_type=\"dataset\",\n)\n```\n\n## WebDataset\n\nThe [WebDataset](https://github.com/webdataset/webdataset) format is based on TAR archives and is suitable for big video datasets.\nIndeed you can group your videos in TAR archives (e.g. 1GB of videos per TAR archive) and have thousands of TAR archives:\n\n```\nfolder/train/00000.tar\nfolder/train/00001.tar\nfolder/train/00002.tar\n...\n```\n\nIn the archives, each example is made of files sharing the same prefix:\n\n```\ne39871fd9fd74f55.mp4\ne39871fd9fd74f55.json\nf18b91585c4d3f3e.mp4\nf18b91585c4d3f3e.json\nede6e66b2fb59aab.mp4\nede6e66b2fb59aab.json\ned600d57fcee4f94.mp4\ned600d57fcee4f94.json\n...\n```\n\nYou can store your videos' labels/captions/features in JSON or text files, for example.\n\nFor more details on the WebDataset format and the python library, please check the [WebDataset documentation](https://webdataset.github.io/webdataset).\n\nLoad your WebDataset and it will create one column per file suffix (here \"mp4\" and \"json\"):\n\n```python\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"webdataset\", data_dir=\"/path/to/folder\", split=\"train\")\n>>> dataset[0][\"json\"]\n{\"bbox\": [[302.0, 109.0, 73.0, 52.0]], \"categories\": [0]}\n```\n\n## Lance\n\n[Lance](https://lance.org) is an open multimodal lakehouse table format. 
Lance tables can natively store not only text and scalar values,\nbut also large binary objects (blobs) such as images, audio, and video alongside your tabular data.\n\nLance provides a [blob API](https://lance.org/guide/blob/) that makes it convenient to store and retrieve large blobs in Lance datasets.\nThe following example shows how to efficiently browse metadata without loading the heavier video blobs, then fetch the relevant video\nblobs on demand.\n\nHere's a representative view of what a Lance table storing videos might look like (the `video_blob` column uses Lance's blob encoding):\n\n```text\n+------------------------------------------+-----------------+-----+------------------------------------------+\n| caption                                  | aesthetic_score | ... | video_blob                               |\n+------------------------------------------+-----------------+-----+------------------------------------------+\n| \"a breathtaking view of a mounta...\"     | 5.2401          | ... | {position: 0, size: 4873879}             |\n| \"a captivating view of the sun, b...\"    | 5.2401          | ... 
| {position: 4873920, size: 3370571}       |\n+------------------------------------------+-----------------+-----+------------------------------------------+\n```\n\n### Write a Lance dataset from raw video files\n\nStarting from raw video files on disk plus associated metadata (for example, captions and scores), you can write a self-contained Lance dataset\nto a local `*.lance` directory (a Lance dataset is a directory on disk, and it's common to name it with a `.lance` suffix):\n\n```py\nimport lance\nimport pyarrow as pa\n\nimport urllib.request\n\nschema = pa.schema(\n    [\n        pa.field(\"caption\", pa.utf8()),\n        pa.field(\"aesthetic_score\", pa.float64()),\n        pa.field(\n            \"video_blob\",\n            pa.large_binary(),\n            metadata={\"lance-encoding:blob\": \"true\"},\n        ),\n    ]\n)\n\n# Provide video files alongside metadata\nrows = [\n    {\n        \"video_path\": \"/path/to/videos/0001.mp4\",\n        \"caption\": \"a breathtaking view of a mountainous landscape ...\",\n        \"aesthetic_score\": 5.240138053894043,\n    },\n    {\n        \"video_path\": \"0002.mp4\",\n        \"caption\": \"a captivating view of the sun, bathed in hues ...\",\n        \"aesthetic_score\": 5.240137100219727,\n    },\n]\n\nvideo_bytes = []\nfor r in rows:\n    with open(r[\"video_path\"], \"rb\") as f:\n        video_bytes.append(f.read())\n\ntable = pa.table(\n    {\n        \"caption\": [r[\"caption\"] for r in rows],\n        \"aesthetic_score\": [r[\"aesthetic_score\"] for r in rows],\n        \"video_blob\": video_bytes,\n    },\n    schema=schema,\n)\n\nds = lance.write_dataset(\n    table,\n    \"./videos.lance\",\n    schema=schema,\n    mode=\"create\",\n)\n```\n\nThis stores your metadata and video bytes together inside `videos.lance/`, so you can move/copy a single directory without having to keep\nseparate `*.mp4` files in sync.\n\nHere's a representative view of what a Lance table storing videos might look like (the 
`video_blob` column contains data that's\nstored natively as blobs inside the Lance dataset):\n\n```text\n+------------------------------------------+-----------------+-----+------------------------------------------+\n| caption                                  | aesthetic_score | ... | video_blob                               |\n+------------------------------------------+-----------------+-----+------------------------------------------+\n| \"a breathtaking view of a mounta...\"     | 5.2401          | ... | {position: 0, size: 4873879}             |\n| \"a captivating view of the sun, b...\"    | 5.2401          | ... | {position: 4873920, size: 3370571}       |\n+------------------------------------------+-----------------+-----+------------------------------------------+\n```\n\nYou can upload the resulting `videos.lance/` directory to the Hub (for example with `huggingface_hub.HfApi.upload_folder`) and share it as a\ndataset repository, keeping the metadata and videos together as a single artifact.\n\n> [!TIP]\n> Lance datasets scale to very large sizes (terabytes and beyond) since the data is stored in a columnar format on disk.\n> See the [blob API](https://lance.org/guide/blob/) guide for the latest information on best practices for storing and retrieving\n> large blobs in Lance.\n\nWhen writing large datasets, it's typically best to limit the size of each individual `*.lance` file to a few gigabytes at most.\nSimply gather the data via an iterator and specify the `max_bytes_per_file` parameter when writing the dataset:\n\n```python\nMAX_BYTES_PER_FILE = 5 * 1024 * 1024 * 1024  # ~5 GB per file\n\n# Write as Lance dataset with file size limits for each *.lance file\nds = lance.write_dataset(\n    table,\n    \"./videos.lance\",\n    schema=schema,\n    mode=\"create\",\n    max_bytes_per_file=MAX_BYTES_PER_FILE,\n)\n```\n\nFor more details on working with Lance datasets, see the [Lance documentation](https://lance.org)."
  },
  {
    "path": "docs/source/video_load.mdx",
    "content": "# Load video data\n\n> [!WARNING]\n> Video support is experimental and is subject to change.\n\nVideo datasets have [`Video`] type columns, which contain `torchvision` objects.\n\n> [!TIP]\n> To work with video datasets, you need to have the `torchvision` and `av` packages installed. Check out the [installation](https://github.com/pytorch/vision#installation) guide to learn how to install them.\n\nWhen you load a video dataset and call the video column, the videos are decoded as `torchvision` Videos:\n\n```py\n>>> from datasets import load_dataset, Video\n\n>>> dataset = load_dataset(\"path/to/video/folder\", split=\"train\")\n>>> dataset[0][\"video\"]\n<torchcodec.decoders._video_decoder.VideoDecoder object at 0x14a61d5a0>\n```\n\n> [!WARNING]\n> Index into a video dataset using the row index first and then the `video` column - `dataset[0][\"video\"]` - to avoid creating all the video objects in the dataset. Otherwise, this can be a slow and time-consuming process if you have a large dataset.\n\nFor a guide on how to load any type of dataset, take a look at the <a class=\"underline decoration-sky-400 decoration-2 font-semibold\" href=\"./loading\">general loading guide</a>.\n\n## Read frames\n\nAccess frames directly from a video using the `VideoReader` using `next()`:\n\n```python\n>>> video = dataset[0][\"video\"]\n>>> first_frame = video.get_frame_at(0)\n>>> first_frame.data.shape\n(3, 240, 320)\n>>> first_frame.pts_seconds  # timestamp\n0.0\n```\n\nTo get multiple frames at once, you can call `.get_frames_in_range(start: int, stop: int, step: int)`. 
This will return a frame batch.\nThis is the efficient way to obtain a long list of frames refer to the [torchcodec docs](https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.VideoDecoder.html) to see more functions for effiently accessing the data:\n\n```python\n>>> import torch\n>>> frames = video.get_frames_in_range(0, 6, 1)\n>>> frames.data.shape\ntorch.Size([5, 3, 240, 320])\n```\n\nThere is also `.get_frames_played_in_range(start_seconds: float, stop_seconds: float)` to access all frames played whithin a certain time range.\n\n```python\n>>> frames = video.get_frames_played_in_range(.5, 1.2)\n>>> frames.data.shape\ntorch.Size([42, 3, 240, 320])\n```\n\n## Local files\n\nYou can load a dataset from the video path. Use the [`~Dataset.cast_column`] function to accept a column of video file paths, and decode it into a `torchcodec` video with the [`Video`] feature:\n\n```py\n>>> from datasets import Dataset, Video\n\n>>> dataset = Dataset.from_dict({\"video\": [\"path/to/video_1\", \"path/to/video_2\", ..., \"path/to/video_n\"]}).cast_column(\"video\", Video())\n>>> dataset[0][\"video\"]\n<torchcodec.decoders._video_decoder.VideoDecoder object at 0x14a61e080>\n```\n\nIf you only want to load the underlying path to the video dataset without decoding the video object, set `decode=False` in the [`Video`] feature:\n\n```py\n>>> dataset = dataset.cast_column(\"video\", Video(decode=False))\n>>> dataset[0][\"video\"]\n{'bytes': None,\n 'path': 'path/to/video/folder/video0.mp4'}\n```\n\n## VideoFolder\n\nYou can also load a dataset with an `VideoFolder` dataset builder which does not require writing a custom dataloader. This makes `VideoFolder` ideal for quickly creating and loading video datasets with several thousand videos for different vision tasks. 
Your video dataset structure should look like this:\n\n```\nfolder/train/dog/golden_retriever.mp4\nfolder/train/dog/german_shepherd.mp4\nfolder/train/dog/chihuahua.mp4\n\nfolder/train/cat/maine_coon.mp4\nfolder/train/cat/bengal.mp4\nfolder/train/cat/birman.mp4\n```\n\nIf the dataset follows the `VideoFolder` structure, then you can load it directly with [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_name\")\n>>> # OR locally:\n>>> dataset = load_dataset(\"/path/to/folder\")\n```\n\nFor local datasets, this is equivalent to passing `videofolder` manually in [`load_dataset`] and the directory in `data_dir`:\n\n```py\n>>> dataset = load_dataset(\"videofolder\", data_dir=\"/path/to/folder\")\n```\n\nThen you can access the videos as `torchcodec.decoders._video_decoder.VideoDecoder` objects:\n\n```\n>>> dataset[\"train\"][0]\n{\"video\": <torchcodec.decoders._video_decoder.VideoDecoder object at 0x14a61e080>, \"label\": 0}\n\n>>> dataset[\"train\"][-1]\n{\"video\": <torchcodec.decoders._video_decoder.VideoDecoder object at 0x14a61e090>, \"label\": 1}\n```\n\nTo ignore the information in the metadata file, set `drop_metadata=True` in [`load_dataset`]:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_with_metadata\", drop_metadata=True)\n```\n\nIf you don't have a metadata file, `VideoFolder` automatically infers the label name from the directory name.\nIf you want to drop automatically created labels, set `drop_labels=True`.\nIn this case, your dataset will only contain a video column:\n\n```py\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"username/dataset_without_metadata\", drop_labels=True)\n```\n\nFinally the `filters` argument lets you load only a subset of the dataset, based on a condition on the label or the metadata. This is especially useful if the metadata is in Parquet format, since this format enables fast filtering. 
It is also recommended to use this argument with `streaming=True`, because by default the dataset is fully downloaded before filtering.\n\n```python\n>>> filters = [(\"label\", \"=\", 0)]\n>>> dataset = load_dataset(\"username/dataset_name\", streaming=True, filters=filters)\n```\n\n> [!TIP]\n> For more information about creating your own `VideoFolder` dataset, take a look at the [Create a video dataset](./video_dataset) guide.\n\n## WebDataset\n\nThe [WebDataset](https://github.com/webdataset/webdataset) format is based on a folder of TAR archives and is suitable for big video datasets.\nBecause of their size, WebDatasets are generally loaded in streaming mode (using `streaming=True`).\n\nYou can load a WebDataset like this:\n\n```python\n>>> from datasets import load_dataset\n\n>>> dataset = load_dataset(\"webdataset\", data_dir=\"/path/to/folder\", streaming=True)\n```\n\n## Lance\n\n[Lance](https://lance.org) is an open multimodal lakehouse table format. Lance tables can natively store not only text and scalar values,\nbut also large binary objects (blobs) such as images, audio, and video alongside your tabular data. Inside a Lance table, large\nblobs like videos are stored as bytes with offsets (see the [blob guide](https://lance.org/guide/blob/) for more details), so this\nmakes it easy to scan and filter metadata without loading heavier video blobs, and then fetch only the specific video blobs you need on demand.\n\nAlso, because Lance is a columnar columnar format, you can project and filter only the metadata columns you care about\n(without fetching large video files), and then retrieve a small subset of rows (including the video) when you're ready. This\nkeeps your metadata and videos in one place, without needing a separate file store or an external index.\n\n```python\nimport lance\n\nds = lance.dataset(\"hf://datasets/lance-format/openvid-lance/data/train.lance\")\n\n# 1. 
Browse metadata without loading video blobs.\nmetadata = ds.scanner(\n    columns=[\"caption\", \"aesthetic_score\"],\n    filter=\"aesthetic_score >= 4.5\",\n    limit=2,\n).to_table().to_pylist()\n\n# 2. Fetch a single video blob by row index.\nselected_index = 0\nblob_file = ds.take_blobs(\"video_blob\", ids=[selected_index])[0]\nwith open(\"video_0.mp4\", \"wb\") as f:\n    f.write(blob_file.read())\n```\n\nIn this example, the video is stored natively (as its encoded bytes) in the Lance table, so you can write it directly to an `mp4` file on your local\nfilesystem without any extra conversion step.\n\nFor more details on working with Lance datasets, see the [Lance documentation](https://lance.org).\n\n## Video decoding\n\nBy default, videos are decoded sequentially as torchvision `VideoReaders` when you iterate on a dataset.\nIt sequentially decodes the metadata of the videos, and doesn't read the video frames until you access them.\n\nHowever it is possible to speed up the dataset significantly using multithreaded decoding:\n\n```python\n>>> import os\n>>> num_threads = num_threads = min(32, (os.cpu_count() or 1) + 4)\n>>> dataset = dataset.decode(num_threads=num_threads)\n>>> for example in dataset:  # up to 20 times faster !\n...     ...\n```\n\nYou can enable multithreading using `num_threads`. This is especially useful to speed up remote data streaming.\nHowever it can be slower than `num_threads=0` for local data on fast disks.\n\nIf you are not interested in the videos decoded as torchvision `VideoReaders` and would like to access the path/bytes instead, you can disable decoding:\n\n```python\n>>> dataset = dataset.decode(False)\n```\n\nNote: [`IterableDataset.decode`] is only available for streaming datasets at the moment.\n"
  },
  {
    "path": "notebooks/README.md",
    "content": "<!---\nCopyright 2023 The HuggingFace Team. All rights reserved.\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n    http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License.\n-->\n\n# 🤗 Datasets Notebooks\n\nYou can find here a list of the official notebooks provided by Hugging Face.\n\nAlso, we would like to list here interesting content created by the community.\nIf you wrote some notebook(s) leveraging 🤗 Datasets and would like it to be listed here, please open a\nPull Request so it can be included under the Community notebooks.\n\n## Hugging Face's notebooks 🤗\n\n### Documentation notebooks\n\nYou can open any page of the documentation as a notebook in Colab (there is a button directly on said pages) but they are also listed here if you need them:\n\n| Notebook     |      Description      |   |   |\n|:----------|:-------------|:-------------|------:|\n| [Quickstart](https://github.com/huggingface/notebooks/blob/main/datasets_doc/en/quickstart.ipynb) | A quick presentation on integrating Datasets into a model training workflow |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/datasets_doc/en/quickstart.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/datasets_doc/en/quickstart.ipynb)|\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[tool.ruff]\nline-length = 119\n\n[tool.ruff.lint]\n# Ignored rules:\n#   \"E501\" -> line length violation\n#   \"F821\" -> undefined named in type annotation (e.g. Literal[\"something\"])\n#   \"C901\" -> `function_name` is too complex\nignore = [\"E501\", \"F821\", \"C901\"]\nselect = [\"C\", \"E\", \"F\", \"I\", \"W\"]\n\n[tool.ruff.lint.isort]\nlines-after-imports = 2\nknown-first-party = [\"datasets\"]\n\n[tool.ruff.lint.per-file-ignores]\n\"__init__.py\" = [\"F401\", \"F403\", \"F405\"]\n\n[tool.pytest.ini_options]\n# Test fails if a FutureWarning is thrown by `huggingface_hub`\nfilterwarnings = [\n    \"error::FutureWarning:huggingface_hub*\",\n]\nmarkers = [\n    \"unit: unit test\",\n    \"integration: integration test\",\n]\n"
  },
  {
    "path": "setup.py",
    "content": "# Lint as: python3\n\"\"\"HuggingFace/Datasets is an open library of datasets.\n\nNote:\n\n   VERSION needs to be formatted following the MAJOR.MINOR.PATCH convention\n\nSimple check list for release from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py\n\nSteps to make a release:\n\n0. Prerequisites:\n   - Dependencies:\n     - twine: `pip install twine`\n   - Create an account in (and join the 'datasets' project):\n     - PyPI: https://pypi.org/\n     - Test PyPI: https://test.pypi.org/\n   - Don't break `transformers`: run the `transformers` CI using the `main` branch and make sure it's green.\n     - In `transformers`, use `datasets @ git+https://github.com/huggingface/datasets@main#egg=datasets`\n       Add a step to install `datasets@main` after `save_cache` in .circleci/create_circleci_config.py:\n       ```\n       {\"run\": {\"name\": \"Install `datasets@main`\", \"command\": 'pip uninstall datasets -y && pip install \"datasets @ git+https://github.com/huggingface/datasets@main#egg=datasets\"'}}\n       ```\n     - and then run the CI\n\n1. Create the release branch from main branch:\n     ```\n     git checkout main\n     git pull upstream main\n     git checkout -b release-VERSION\n     ```\n\n2. Change the version to the release VERSION in:\n   - __init__.py\n   - setup.py\n\n3. Commit these changes, push and create a Pull Request:\n     ```\n     git add -u\n     git commit -m \"Release: VERSION\"\n     git push upstream release-VERSION\n     ```\n   - Go to: https://github.com/huggingface/datasets/pull/new/release-VERSION\n   - Create pull request\n\n4. From your local release branch, build both the sources and the wheel. 
Do not change anything in setup.py between\n   creating the wheel and the source distribution (obviously).\n   - First, delete any building directories that may exist from previous builds:\n     - build\n     - dist\n   - From the top level directory, build the wheel and the sources:\n       ```\n       python setup.py bdist_wheel\n       python setup.py sdist\n       ```\n   - You should now have a /dist directory with both .whl and .tar.gz source versions.\n\n5. Check that everything looks correct by uploading the package to the test PyPI server:\n     ```\n     twine upload dist/* -r testpypi\n     ```\n   Check that you can install it in a virtualenv/notebook by running:\n     ```\n     !pip install -U --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ datasets\n     ```\n\n6. Upload the final version to the actual PyPI:\n     ```\n     twine upload dist/* -r pypi\n     ```\n\n7. Make the release on GitHub once everything is looking hunky-dory:\n   - Merge the release Pull Request\n   - Create a new release: https://github.com/huggingface/datasets/releases/new\n   - Choose a tag: Introduce the new VERSION as tag, that will be created when you publish the release\n     - Create new tag VERSION on publish\n   - Release title: Introduce the new VERSION as well\n   - Describe the release\n     - Use \"Generate release notes\" button for automatic generation\n   - Publish release\n\n8. Set the dev version\n   - Create the dev-version branch from the main branch:\n       ```\n       git checkout main\n       git pull upstream main\n       git branch -D dev-version\n       git checkout -b dev-version\n       ```\n   - Change the version to X.X.X+1.dev0 (e.g. 
VERSION=1.18.3 -> 1.18.4.dev0) in:\n     - __init__.py\n     - setup.py\n   - Commit these changes, push and create a Pull Request:\n       ```\n       git add -u\n       git commit -m \"Set dev version\"\n       git push upstream dev-version\n       ```\n     - Go to: https://github.com/huggingface/datasets/pull/new/dev-version\n     - Create pull request\n   - Merge the dev version Pull Request\n\"\"\"\n\nfrom setuptools import find_packages, setup\n\n\nREQUIRED_PKGS = [\n    # For file locking\n    \"filelock\",\n    # We use numpy>=1.17 to have np.random.Generator (Dataset shuffling)\n    \"numpy>=1.17\",\n    # Backend and serialization.\n    # Minimum 21.0.0 to support `use_content_defined_chunking` in ParquetWriter\n    \"pyarrow>=21.0.0\",\n    # For smart caching dataset processing\n    \"dill>=0.3.0,<0.4.2\",  # tmp pin until dill has official support for determinism see https://github.com/uqfoundation/dill/issues/19\n    # For performance gains with apache arrow\n    \"pandas\",\n    # for downloading datasets over HTTPS\n    \"requests>=2.32.2\",\n    \"httpx<1.0.0\",\n    # progress bars in downloads and data operations\n    \"tqdm>=4.66.3\",\n    # for fast hashing\n    \"xxhash\",\n    # for better multiprocessing\n    \"multiprocess<0.70.20\",  # to align with dill<0.3.9 (see above)\n    # to save datasets locally or on any filesystem\n    # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143\n    \"fsspec[http]>=2023.1.0,<=2026.2.0\",\n    # To get datasets from the Datasets Hub on huggingface.co\n    \"huggingface-hub>=0.25.0,<2.0\",\n    # Utilities from PyPA to e.g., compare versions\n    \"packaging\",\n    # To parse YAML metadata from dataset cards\n    \"pyyaml>=5.1\",\n]\n\nAUDIO_REQUIRE = [\n    \"torchcodec>=0.6.0\",\n    \"torch>=2.8.0\",\n]\n\nVISION_REQUIRE = [\n    \"Pillow>=9.4.0\",  # When PIL.Image.ExifTags was 
introduced\n]\n\nBENCHMARKS_REQUIRE = [\n    \"tensorflow==2.12.0\",\n    \"torch==2.0.1\",\n    \"transformers==4.30.1\",\n]\n\nTESTS_REQUIRE = [\n    # fix pip install issues for windows\n    \"numba>=0.56.4; python_version < '3.14'\",  # to get recent versions of llvmlite for windows ci, not available on 3.14\n    # test dependencies\n    \"absl-py\",\n    \"decorator\",\n    \"joblib<1.3.0\",  # joblibspark doesn't support recent joblib versions\n    \"joblibspark; python_version < '3.14'\",  # python 3.14 gives AttributeError: module 'ast' has no attribute 'Num'\n    \"pytest\",\n    \"pytest-datadir\",\n    \"pytest-xdist\",\n    # optional dependencies\n    \"aiohttp\",\n    \"elasticsearch>=7.17.12,<8.0.0\",  # 8.0 asks users to provide hosts or cloud_id when instantiating ElasticSearch(); 7.9.1 has legacy numpy.float_ which was fixed in https://github.com/elastic/elasticsearch-py/pull/2551.\n    \"faiss-cpu>=1.8.0.post1\",  # Pins numpy < 2\n    \"h5py\",\n    \"pylance\",\n    \"jax>=0.3.14; sys_platform != 'win32'\",\n    \"jaxlib>=0.3.14; sys_platform != 'win32'\",\n    \"lz4; python_version < '3.14'\",  # python 3.14 gives ImportError: cannot import name '_compression' from partially initialized module 'lz4.frame\n    \"moto[server]\",\n    \"pyspark>=3.4\",  # https://issues.apache.org/jira/browse/SPARK-40991 fixed in 3.4.0\n    \"py7zr\",\n    \"rarfile>=4.0\",\n    \"sqlalchemy\",\n    \"protobuf<4.0.0\",  # 4.0.0 breaks compatibility with tensorflow<2.12\n    \"tensorflow>=2.6.0; python_version<'3.10' and sys_platform != 'win32'\",  # numpy-2 is not supported for Python < 3.10\n    \"tensorflow>=2.16.0; python_version>='3.10' and sys_platform != 'win32' and python_version < '3.14'\",  # Pins numpy < 2\n    \"tiktoken\",\n    \"torch>=2.8.0\",\n    \"torchdata\",\n    \"transformers>=4.42.0\",  # Pins numpy < 2\n    \"zstandard\",\n    \"polars[timezone]>=0.20.0\",\n    \"Pillow>=9.4.0\",  # When PIL.Image.ExifTags was introduced\n    
\"torchcodec>=0.7.0; python_version < '3.14'\",  # minium version to get windows support, torchcodec doesn't have wheels for 3.14 yet\n    \"nibabel>=5.3.1\",\n]\n\nNUMPY2_INCOMPATIBLE_LIBRARIES = [\n    \"faiss-cpu\",\n    \"tensorflow\",\n]\nTESTS_NUMPY2_REQUIRE = [\n    library for library in TESTS_REQUIRE if library.partition(\">\")[0] not in NUMPY2_INCOMPATIBLE_LIBRARIES\n]\n\nQUALITY_REQUIRE = [\"ruff>=0.3.0\"]\n\nDOCS_REQUIRE = [\n    # Following dependencies are required for the Python reference to be built properly\n    \"transformers\",\n    \"torch\",\n    \"tensorflow>=2.6.0\",\n]\n\nPDFS_REQUIRE = [\"pdfplumber>=0.11.4\"]\n\nNIBABEL_REQUIRE = [\"nibabel>=5.3.2\", \"ipyniivue==2.4.2\"]\n\nEXTRAS_REQUIRE = {\n    \"audio\": AUDIO_REQUIRE,\n    \"vision\": VISION_REQUIRE,\n    \"tensorflow\": [\n        \"tensorflow>=2.6.0\",\n    ],\n    \"tensorflow_gpu\": [\"tensorflow>=2.6.0\"],\n    \"torch\": [\"torch\"],\n    \"jax\": [\"jax>=0.3.14\", \"jaxlib>=0.3.14\"],\n    \"streaming\": [],  # for backward compatibility\n    \"dev\": TESTS_REQUIRE + QUALITY_REQUIRE + DOCS_REQUIRE,\n    \"tests\": TESTS_REQUIRE,\n    \"tests_numpy2\": TESTS_NUMPY2_REQUIRE,\n    \"quality\": QUALITY_REQUIRE,\n    \"benchmarks\": BENCHMARKS_REQUIRE,\n    \"docs\": DOCS_REQUIRE,\n    \"pdfs\": PDFS_REQUIRE,\n    \"nibabel\": NIBABEL_REQUIRE,\n}\n\nsetup(\n    name=\"datasets\",\n    version=\"4.8.4.dev0\",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)\n    description=\"HuggingFace community-driven open-source library of datasets\",\n    long_description=open(\"README.md\", encoding=\"utf-8\").read(),\n    long_description_content_type=\"text/markdown\",\n    author=\"HuggingFace Inc.\",\n    author_email=\"thomas@huggingface.co\",\n    url=\"https://github.com/huggingface/datasets\",\n    download_url=\"https://github.com/huggingface/datasets/tags\",\n    license=\"Apache 2.0\",\n    package_dir={\"\": \"src\"},\n    
packages=find_packages(\"src\"),\n    package_data={\n        \"datasets\": [\"py.typed\"],\n        \"datasets.utils.resources\": [\"*.json\", \"*.yaml\", \"*.tsv\"],\n    },\n    entry_points={\"console_scripts\": [\"datasets-cli=datasets.commands.datasets_cli:main\"]},\n    python_requires=\">=3.10.0\",\n    install_requires=REQUIRED_PKGS,\n    extras_require=EXTRAS_REQUIRE,\n    classifiers=[\n        \"Development Status :: 5 - Production/Stable\",\n        \"Intended Audience :: Developers\",\n        \"Intended Audience :: Education\",\n        \"Intended Audience :: Science/Research\",\n        \"License :: OSI Approved :: Apache Software License\",\n        \"Operating System :: OS Independent\",\n        \"Programming Language :: Python :: 3\",\n        \"Programming Language :: Python :: 3.10\",\n        \"Programming Language :: Python :: 3.11\",\n        \"Programming Language :: Python :: 3.12\",\n        \"Programming Language :: Python :: 3.13\",\n        \"Programming Language :: Python :: 3.14\",\n        \"Topic :: Scientific/Engineering :: Artificial Intelligence\",\n    ],\n    keywords=\"datasets machine learning datasets\",\n    zip_safe=False,  # Required for mypy to find the py.typed file\n)\n"
  },
  {
    "path": "src/datasets/__init__.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n__version__ = \"4.8.4.dev0\"\n\nfrom .arrow_dataset import Column, Dataset\nfrom .arrow_reader import ReadInstruction\nfrom .builder import ArrowBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder\nfrom .combine import concatenate_datasets, interleave_datasets\nfrom .dataset_dict import DatasetDict, IterableDatasetDict\nfrom .download import *\nfrom .features import *\nfrom .fingerprint import disable_caching, enable_caching, is_caching_enabled\nfrom .info import DatasetInfo\nfrom .inspect import (\n    get_dataset_config_info,\n    get_dataset_config_names,\n    get_dataset_default_config_name,\n    get_dataset_infos,\n    get_dataset_split_names,\n)\nfrom .iterable_dataset import IterableColumn, IterableDataset\nfrom .load import load_dataset, load_dataset_builder, load_from_disk\nfrom .splits import (\n    NamedSplit,\n    NamedSplitAll,\n    Split,\n    SplitBase,\n    SplitDict,\n    SplitGenerator,\n    SplitInfo,\n    SubSplitInfo,\n    percent,\n)\nfrom .utils import *\nfrom .utils import logging\n"
  },
  {
    "path": "src/datasets/arrow_dataset.py",
    "content": "# Copyright 2020 The HuggingFace Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"Simple Dataset wrapping an Arrow Table.\"\"\"\n\nimport asyncio\nimport contextlib\nimport copy\nimport glob\nimport inspect\nimport itertools\nimport json\nimport math\nimport os\nimport posixpath\nimport random\nimport re\nimport shutil\nimport string\nimport sys\nimport tempfile\nimport time\nimport warnings\nimport weakref\nfrom collections import Counter, defaultdict\nfrom collections.abc import Iterable, Iterator, Mapping\nfrom collections.abc import Sequence as Sequence_\nfrom copy import deepcopy\nfrom functools import partial, wraps\nfrom math import ceil, floor\nfrom pathlib import Path\nfrom random import sample\nfrom typing import (\n    TYPE_CHECKING,\n    Any,\n    BinaryIO,\n    Callable,\n    Optional,\n    Union,\n    overload,\n)\n\nimport fsspec\nimport multiprocess as mp\nimport numpy as np\nimport pandas as pd\nimport pyarrow as pa\nimport pyarrow.compute as pc\nimport pyarrow.dataset as pds\nfrom fsspec.core import url_to_fs\nfrom fsspec.implementations.dirfs import DirFileSystem\nfrom huggingface_hub import (\n    CommitInfo,\n    CommitOperationAdd,\n    CommitOperationDelete,\n    DatasetCard,\n    DatasetCardData,\n    HfApi,\n    HfFileSystem,\n    HfFileSystemResolvedPath,\n)\nfrom huggingface_hub.utils import EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError\nfrom packaging import 
version\nfrom tqdm.contrib.concurrent import thread_map\n\nfrom . import config\nfrom .arrow_reader import ArrowReader\nfrom .arrow_writer import ArrowWriter, OptimizedTypedSequence\nfrom .data_files import sanitize_patterns\nfrom .download.streaming_download_manager import xgetsize\nfrom .features import Audio, ClassLabel, Features, Image, List, Value, Video\nfrom .features.features import (\n    FeatureType,\n    _align_features,\n    _check_if_features_can_be_aligned,\n    _fix_for_backward_compatible_features,\n    generate_from_arrow_type,\n    pandas_types_mapper,\n    require_decoding,\n)\nfrom .filesystems import is_remote_filesystem\nfrom .fingerprint import (\n    fingerprint_transform,\n    format_kwargs_for_fingerprint,\n    format_transform_for_fingerprint,\n    generate_fingerprint,\n    generate_random_fingerprint,\n    get_temporary_cache_files_directory,\n    is_caching_enabled,\n    maybe_register_dataset_for_temp_dir_deletion,\n    update_fingerprint,\n    validate_fingerprint,\n)\nfrom .formatting import format_table, get_format_type_from_alias, get_formatter, query_table\nfrom .formatting.formatting import LazyDict, _is_range_contiguous\nfrom .info import DatasetInfo, DatasetInfosDict\nfrom .naming import _split_re\nfrom .search import IndexableMixin\nfrom .splits import NamedSplit, Split, SplitDict, SplitInfo\nfrom .table import (\n    InMemoryTable,\n    MemoryMappedTable,\n    Table,\n    _memory_mapped_record_batch_reader_from_file,\n    cast_array_to_feature,\n    concat_tables,\n    embed_table_storage,\n    list_table_cache_files,\n    table_cast,\n    table_iter,\n    table_visitor,\n)\nfrom .utils import logging\nfrom .utils import tqdm as hf_tqdm\nfrom .utils.file_utils import estimate_dataset_size\nfrom .utils.info_utils import is_small_dataset\nfrom .utils.metadata import MetadataConfigs\nfrom .utils.py_utils import (\n    Literal,\n    asdict,\n    convert_file_size_to_int,\n    glob_pattern_to_regex,\n    iflatmap_unordered,\n    
string_to_dict,\n)\nfrom .utils.stratify import stratified_shuffle_split_generate_indices\nfrom .utils.tf_utils import dataset_to_tf, minimal_tf_collate_fn, multiprocess_dataset_to_tf\nfrom .utils.typing import ListLike, PathLike\n\n\nif config.HF_HUB_VERSION >= version.parse(\"1.6.0\"):\n    from huggingface_hub.errors import BucketNotFoundError\n    from huggingface_hub.hf_file_system import HfFileSystemResolvedBucketPath, HfFileSystemResolvedRepositoryPath\n\nelse:\n    BucketNotFoundError = None\n    HfFileSystemResolvedBucketPath = None\n    HfFileSystemResolvedRepositoryPath = HfFileSystemResolvedPath\n\n\nif TYPE_CHECKING:\n    import sqlite3\n\n    import polars as pl\n    import pyspark\n    import sqlalchemy\n\n    from .dataset_dict import DatasetDict\n    from .iterable_dataset import IterableDataset\n\nlogger = logging.get_logger(__name__)\n\nPUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED = (\n    \"data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.parquet\"\n)\n\n\nclass DatasetInfoMixin:\n    \"\"\"This base class exposes some attributes of DatasetInfo\n    at the base level of the Dataset for easy access.\n    \"\"\"\n\n    def __init__(self, info: DatasetInfo, split: Optional[NamedSplit]):\n        self._info = info\n        self._split = split\n\n    @property\n    def info(self):\n        \"\"\"[`~datasets.DatasetInfo`] object containing all the metadata in the dataset.\"\"\"\n        return self._info\n\n    @property\n    def split(self):\n        \"\"\"[`~datasets.NamedSplit`] object corresponding to a named dataset split.\"\"\"\n        return self._split\n\n    @property\n    def builder_name(self) -> str:\n        return self._info.builder_name\n\n    @property\n    def citation(self) -> str:\n        return self._info.citation\n\n    @property\n    def config_name(self) -> str:\n        return self._info.config_name\n\n    @property\n    def dataset_size(self) -> Optional[int]:\n        return 
self._info.dataset_size\n\n    @property\n    def description(self) -> str:\n        return self._info.description\n\n    @property\n    def download_checksums(self) -> Optional[dict]:\n        return self._info.download_checksums\n\n    @property\n    def download_size(self) -> Optional[int]:\n        return self._info.download_size\n\n    @property\n    def features(self) -> Optional[Features]:\n        return self._info.features.copy() if self._info.features is not None else None\n\n    @property\n    def homepage(self) -> Optional[str]:\n        return self._info.homepage\n\n    @property\n    def license(self) -> Optional[str]:\n        return self._info.license\n\n    @property\n    def size_in_bytes(self) -> Optional[int]:\n        return self._info.size_in_bytes\n\n    @property\n    def supervised_keys(self):\n        return self._info.supervised_keys\n\n    @property\n    def version(self):\n        return self._info.version\n\n\nclass TensorflowDatasetMixin:\n    _TF_DATASET_REFS = set()\n\n    @staticmethod\n    def _get_output_signature(\n        dataset: \"Dataset\",\n        collate_fn: Callable,\n        collate_fn_args: dict,\n        cols_to_retain: Optional[list[str]] = None,\n        batch_size: Optional[int] = None,\n        num_test_batches: int = 20,\n    ):\n        \"\"\"Private method used by `to_tf_dataset()` to find the shapes and dtypes of samples from this dataset\n           after being passed through the collate_fn. Tensorflow needs an exact signature for tf.numpy_function, so\n           the only way to do this is to run test batches - the collator may add or rename columns, so we can't figure\n           it out just by inspecting the dataset.\n\n        Args:\n            dataset (`Dataset`): Dataset to load samples from.\n            collate_fn(`bool`): Shuffle the dataset order when loading. 
Recommended True for training, False for\n                validation/evaluation.\n            collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate\n                lists of samples into a batch.\n            collate_fn_args (`Dict`): A `dict` of keyword arguments to be passed to the\n                `collate_fn`.\n            batch_size (`int`, optional): The size of batches loaded from the dataset. Used for shape inference.\n                Can be None, which indicates that batch sizes can be variable.\n            num_test_batches (`int`): The number of batches to load from the dataset for shape inference.\n\n        Returns:\n            `dict`: Dict mapping column names to tf.Tensorspec objects\n            `dict`: Dict mapping column names to np.dtype objects\n        \"\"\"\n        if config.TF_AVAILABLE:\n            import tensorflow as tf\n        else:\n            raise ImportError(\"Called a Tensorflow-specific function but Tensorflow is not installed.\")\n\n        if len(dataset) == 0:\n            raise ValueError(\"Unable to get the output signature because the dataset is empty.\")\n        if batch_size is not None:\n            batch_size = min(len(dataset), batch_size)\n        test_batch_size = 1\n\n        if cols_to_retain is not None:\n            cols_to_retain = list(set(cols_to_retain + [\"label_ids\", \"label\", \"labels\"]))\n\n        test_batches = []\n        for _ in range(num_test_batches):\n            indices = sample(range(len(dataset)), test_batch_size)\n            test_batch = dataset[indices]\n            if cols_to_retain is not None:\n                test_batch = {key: value for key, value in test_batch.items() if key in cols_to_retain}\n            test_batch = [{key: value[i] for key, value in test_batch.items()} for i in range(test_batch_size)]\n            test_batch = collate_fn(test_batch, **collate_fn_args)\n            test_batches.append(test_batch)\n\n        
tf_columns_to_signatures = {}\n        np_columns_to_dtypes = {}\n        for column in test_batches[0].keys():\n            raw_arrays = [batch[column] for batch in test_batches]\n            # In case the collate_fn returns something strange\n            np_arrays = []\n            for array in raw_arrays:\n                if isinstance(array, np.ndarray):\n                    np_arrays.append(array)\n                elif isinstance(array, tf.Tensor):\n                    np_arrays.append(array.numpy())\n                else:\n                    np_arrays.append(np.array(array))\n\n            if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == bool:\n                tf_dtype = tf.int64\n                np_dtype = np.int64\n            elif np.issubdtype(np_arrays[0].dtype, np.number):\n                tf_dtype = tf.float32\n                np_dtype = np.float32\n            elif np_arrays[0].dtype.kind == \"U\":  # Unicode strings\n                np_dtype = np.str_\n                tf_dtype = tf.string\n            else:\n                raise RuntimeError(\n                    f\"Unrecognized array dtype {np_arrays[0].dtype}. 
\\n\"\n                    \"Nested types and image/audio types are not supported yet.\"\n                )\n            shapes = [array.shape for array in np_arrays]\n            static_shape = []\n            for dim in range(len(shapes[0])):\n                sizes = {shape[dim] for shape in shapes}\n                if dim == 0:\n                    static_shape.append(batch_size)\n                    continue\n                if len(sizes) == 1:  # This dimension looks constant\n                    static_shape.append(sizes.pop())\n                else:  # Use None for variable dimensions\n                    static_shape.append(None)\n            tf_columns_to_signatures[column] = tf.TensorSpec(shape=static_shape, dtype=tf_dtype)\n            np_columns_to_dtypes[column] = np_dtype\n\n        return tf_columns_to_signatures, np_columns_to_dtypes\n\n    def to_tf_dataset(\n        self,\n        batch_size: Optional[int] = None,\n        columns: Optional[Union[str, list[str]]] = None,\n        shuffle: bool = False,\n        collate_fn: Optional[Callable] = None,\n        drop_remainder: bool = False,\n        collate_fn_args: Optional[dict[str, Any]] = None,\n        label_cols: Optional[Union[str, list[str]]] = None,\n        prefetch: bool = True,\n        num_workers: int = 0,\n        num_test_batches: int = 20,\n    ):\n        \"\"\"Create a `tf.data.Dataset` from the underlying Dataset. This `tf.data.Dataset` will load and collate batches from\n        the Dataset, and is suitable for passing to methods like `model.fit()` or `model.predict()`. The dataset will yield\n        `dicts` for both inputs and labels unless the `dict` would contain only a single key, in which case a raw\n        `tf.Tensor` is yielded instead.\n\n        Args:\n            batch_size (`int`, *optional*):\n                Size of batches to load from the dataset. 
Defaults to `None`, which implies that the dataset won't be\n                batched, but the returned dataset can be batched later with `tf_dataset.batch(batch_size)`.\n            columns (`List[str]` or `str`, *optional*):\n                Dataset column(s) to load in the `tf.data.Dataset`.\n                Column names that are created by the `collate_fn` and that do not exist in the original dataset can be used.\n            shuffle(`bool`, defaults to `False`):\n                Shuffle the dataset order when loading. Recommended `True` for training, `False` for\n                validation/evaluation.\n            drop_remainder(`bool`, defaults to `False`):\n                Drop the last incomplete batch when loading. Ensures\n                that all batches yielded by the dataset will have the same length on the batch dimension.\n            collate_fn(`Callable`, *optional*):\n                A function or callable object (such as a `DataCollator`) that will collate\n                lists of samples into a batch.\n            collate_fn_args (`Dict`, *optional*):\n                An optional `dict` of keyword arguments to be passed to the\n                `collate_fn`.\n            label_cols (`List[str]` or `str`, defaults to `None`):\n                Dataset column(s) to load as labels.\n                Note that many models compute loss internally rather than letting Keras do it, in which case\n                passing the labels here is optional, as long as they're in the input `columns`.\n            prefetch (`bool`, defaults to `True`):\n                Whether to run the dataloader in a separate thread and maintain\n                a small buffer of batches for training. 
Improves performance by allowing data to be loaded in the\n                background while the model is training.\n            num_workers (`int`, defaults to `0`):\n                Number of workers to use for loading the dataset.\n            num_test_batches (`int`, defaults to `20`):\n                Number of batches to use to infer the output signature of the dataset.\n                The higher this number, the more accurate the signature will be, but the longer it will take to\n                create the dataset.\n\n        Returns:\n            `tf.data.Dataset`\n\n        Example:\n\n        ```py\n        >>> ds_train = ds[\"train\"].to_tf_dataset(\n        ...    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],\n        ...    shuffle=True,\n        ...    batch_size=16,\n        ...    collate_fn=data_collator,\n        ... )\n        ```\n        \"\"\"\n        if config.TF_AVAILABLE:\n            import tensorflow as tf\n        else:\n            raise ImportError(\"Called a Tensorflow-specific function but Tensorflow is not installed.\")\n\n        if (isinstance(columns, list) and len(columns) == 1) or (\n            isinstance(label_cols, list) and len(label_cols) == 1\n        ):\n            warnings.warn(\n                \"The output of `to_tf_dataset` will change when passing a single element list for `labels` or \"\n                \"`columns` in the next datasets version. 
To return a tuple structure rather than dict, pass a \"\n                \"single string.\\n\"\n                \"Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  \\n\"\n                \"             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  \\n\"\n                \"New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  \\n\"\n                \"             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) \",\n                FutureWarning,\n            )\n\n        if isinstance(tf.distribute.get_strategy(), tf.distribute.TPUStrategy):\n            logger.warning(\n                \"Note that to_tf_dataset() loads the data with a generator rather than a full tf.data \"\n                \"pipeline and is not compatible with remote TPU connections. If you encounter errors, please \"\n                \"try using a TPU VM or, if your data can fit in memory, loading it into memory as a dict of \"\n                \"Tensors instead of streaming with to_tf_dataset().\"\n            )\n\n        if collate_fn is None:\n            # Set a very simple default collator that just stacks things together\n            collate_fn = minimal_tf_collate_fn\n        if collate_fn_args is None:\n            collate_fn_args = {}\n        if label_cols and not columns:\n            raise ValueError(\"Cannot specify label_cols without specifying columns!\")\n        if label_cols is None:\n            label_cols = []\n        elif isinstance(label_cols, str):\n            label_cols = [label_cols]\n        if len(set(label_cols)) < len(label_cols):\n            raise ValueError(\"List of label_cols contains duplicates.\")\n        if columns:\n            if isinstance(columns, str):\n                columns = [columns]\n            if len(set(columns)) < len(columns):\n                raise ValueError(\"List of columns contains duplicates.\")\n            cols_to_retain = list(set(columns + 
label_cols))\n        else:\n            cols_to_retain = None  # Indicates keeping all valid columns\n            columns = []\n\n        if self.format[\"type\"] not in [\"custom\", \"numpy\"]:\n            dataset = self.with_format(\"numpy\")\n        else:\n            dataset = self\n\n        # TODO(Matt, QL): deprecate the retention of label_ids and label\n\n        output_signature, columns_to_np_types = dataset._get_output_signature(\n            dataset,\n            collate_fn=collate_fn,\n            collate_fn_args=collate_fn_args,\n            cols_to_retain=cols_to_retain,\n            batch_size=batch_size if drop_remainder else None,\n            num_test_batches=num_test_batches,\n        )\n\n        if \"labels\" in output_signature:\n            if (\"label_ids\" in columns or \"label\" in columns) and \"labels\" not in columns:\n                columns = [col for col in columns if col not in [\"label_ids\", \"label\"]] + [\"labels\"]\n            if (\"label_ids\" in label_cols or \"label\" in label_cols) and \"labels\" not in label_cols:\n                label_cols = [col for col in label_cols if col not in [\"label_ids\", \"label\"]] + [\"labels\"]\n\n        for col in columns:\n            if col not in output_signature:\n                raise ValueError(f\"Column {col} not found in dataset!\")\n\n        for col in label_cols:\n            if col not in output_signature:\n                raise ValueError(f\"Label column {col} not found in dataset!\")\n\n        if num_workers == 0:\n            tf_dataset = dataset_to_tf(\n                dataset=dataset,\n                cols_to_retain=cols_to_retain,\n                collate_fn=collate_fn,\n                collate_fn_args=collate_fn_args,\n                columns_to_np_types=columns_to_np_types,\n                output_signature=output_signature,\n                shuffle=shuffle,\n                batch_size=batch_size,\n                drop_remainder=drop_remainder,\n            )\n    
    elif num_workers > 0:\n            if batch_size is None:\n                raise NotImplementedError(\n                    \"`batch_size` must be specified when using multiple workers, as unbatched multiprocessing \"\n                    \"is not supported yet. Please provide a `batch_size` if `num_workers` is greater than 0.\"\n                )\n            tf_dataset = multiprocess_dataset_to_tf(\n                dataset=dataset,\n                cols_to_retain=cols_to_retain,\n                collate_fn=collate_fn,\n                collate_fn_args=collate_fn_args,\n                columns_to_np_types=columns_to_np_types,\n                output_signature=output_signature,\n                shuffle=shuffle,\n                batch_size=batch_size,\n                drop_remainder=drop_remainder,\n                num_workers=num_workers,\n            )\n        else:\n            raise ValueError(\"num_workers must be >= 0\")\n\n        def split_features_and_labels(input_batch):\n            # TODO(Matt, QL): deprecate returning the dict content when there's only one key\n            features = {key: tensor for key, tensor in input_batch.items() if key in columns}\n            labels = {key: tensor for key, tensor in input_batch.items() if key in label_cols}\n            if len(features) == 1:\n                features = list(features.values())[0]\n            if len(labels) == 1:\n                labels = list(labels.values())[0]\n            if isinstance(labels, dict) and len(labels) == 0:\n                return features\n            else:\n                return features, labels\n\n        if cols_to_retain is not None:\n            tf_dataset = tf_dataset.map(split_features_and_labels)\n\n        if prefetch:\n            tf_dataset = tf_dataset.prefetch(tf.data.experimental.AUTOTUNE)\n\n        # Remove a reference to the open Arrow file on delete\n        def cleanup_callback(ref):\n            dataset.__del__()\n            
self._TF_DATASET_REFS.remove(ref)\n\n        self._TF_DATASET_REFS.add(weakref.ref(tf_dataset, cleanup_callback))\n\n        return tf_dataset\n\n\nclass DatasetTransformationNotAllowedError(Exception):\n    pass\n\n\ndef transmit_format(func):\n    \"\"\"Wrapper for dataset transforms that recreate a new Dataset to transmit the format of the original dataset to the new dataset\"\"\"\n\n    @wraps(func)\n    def wrapper(*args, **kwargs):\n        if args:\n            self: \"Dataset\" = args[0]\n            args = args[1:]\n        else:\n            self: \"Dataset\" = kwargs.pop(\"self\")\n        # don't use self.format since it returns a list of columns for 'columns' even if self_format_columns is None\n        unformatted_columns = set(self.column_names) - set(self._format_columns or [])\n        self_format = {\n            \"type\": self._format_type,\n            \"format_kwargs\": self._format_kwargs,\n            \"columns\": self._format_columns,\n            \"output_all_columns\": self._output_all_columns,\n        }\n        # apply actual function\n        out: Union[\"Dataset\", \"DatasetDict\"] = func(self, *args, **kwargs)\n        datasets: list[\"Dataset\"] = list(out.values()) if isinstance(out, dict) else [out]\n        # re-apply format to the output\n        for dataset in datasets:\n            new_format = self_format.copy()\n            if new_format[\"columns\"] is not None:  # new formatted columns = (columns - previously unformatted columns)\n                # sort the columns to have a deterministic list of columns that we can compare with `out_format`\n                new_format[\"columns\"] = sorted(set(dataset.column_names) - unformatted_columns)\n            out_format = {\n                \"type\": dataset._format_type,\n                \"format_kwargs\": dataset._format_kwargs,\n                \"columns\": sorted(dataset._format_columns) if dataset._format_columns is not None else None,\n                \"output_all_columns\": 
dataset._output_all_columns,\n            }\n            if out_format != new_format:\n                fingerprint = dataset._fingerprint\n                dataset.set_format(**new_format)\n                dataset._fingerprint = fingerprint\n        return out\n\n    wrapper._decorator_name_ = \"transmit_format\"\n    return wrapper\n\n\ndef update_metadata_with_features(table: Table, features: Features):\n    \"\"\"To be used in dataset transforms that modify the features of the dataset, in order to update the features stored in the metadata of its schema.\"\"\"\n    features = Features({col_name: features[col_name] for col_name in table.column_names})\n    if table.schema.metadata is None or b\"huggingface\" not in table.schema.metadata:\n        pa_metadata = ArrowWriter._build_metadata(DatasetInfo(features=features))\n    else:\n        metadata = json.loads(table.schema.metadata[b\"huggingface\"].decode())\n        if \"info\" not in metadata:\n            metadata[\"info\"] = asdict(DatasetInfo(features=features))\n        else:\n            metadata[\"info\"][\"features\"] = asdict(DatasetInfo(features=features))[\"features\"]\n        pa_metadata = {\"huggingface\": json.dumps(metadata)}\n    table = table.replace_schema_metadata(pa_metadata)\n    return table\n\n\ndef _check_table(table) -> Table:\n    \"\"\"We check the table type to make sure it's an instance of :class:`datasets.table.Table`\"\"\"\n    if isinstance(table, pa.Table):\n        # for a pyarrow table, we can just consider it as a in-memory table\n        # this is here for backward compatibility\n        return InMemoryTable(table)\n    elif isinstance(table, Table):\n        return table\n    else:\n        raise TypeError(f\"Expected a pyarrow.Table or a datasets.table.Table object, but got {table}.\")\n\n\ndef _check_column_names(column_names: list[str]):\n    \"\"\"Check the column names to make sure they don't contain duplicates.\"\"\"\n    counter = Counter(column_names)\n    if not 
all(count == 1 for count in counter.values()):\n        duplicated_columns = [col for col in counter if counter[col] > 1]\n        raise ValueError(f\"The table can't have duplicated columns but columns {duplicated_columns} are duplicated.\")\n\n\ndef _check_valid_indices_value(index, size):\n    if (index < 0 and index + size < 0) or (index >= size):\n        raise IndexError(f\"Index {index} out of range for dataset of size {size}.\")\n\n\nclass NonExistentDatasetError(Exception):\n    \"\"\"Used when we expect the existence of a dataset\"\"\"\n\n    pass\n\n\nclass Column(Sequence_):\n    \"\"\"\n    An iterable for a specific column of a [`Dataset`].\n\n    Example:\n\n    Iterate on the texts of the \"text\" column of a dataset:\n\n    ```python\n    for text in dataset[\"text\"]:\n        ...\n    ```\n\n    It also works with nested columns:\n\n    ```python\n    for source in dataset[\"metadata\"][\"source\"]:\n        ...\n    ```\n    \"\"\"\n\n    def __init__(self, source: Union[\"Dataset\", \"Column\"], column_name: str):\n        self.source = source\n        self.column_name = column_name\n        if not isinstance(source.features, dict) or column_name not in source.features:\n            raise ValueError(f\"Column '{column_name}' doesn't exist.\")\n        self.features = source.features[column_name]\n\n    def __iter__(self) -> Iterator[Any]:\n        if isinstance(self.source, Dataset):\n            if self.source._format_type == \"custom\":\n                # the formatting transform may require all columns\n                source = self.source\n            else:\n                source = self.source._fast_select_column(self.column_name)\n        else:\n            source = self.source\n        for example in source:\n            yield example[self.column_name]\n\n    def __getitem__(self, key: Union[int, str, list[int]]) -> Any:\n        if isinstance(key, str):\n            return Column(self, key)\n        elif isinstance(self.source, 
Dataset):\n            if self.source._format_type == \"custom\":\n                # the formatting transform may require all columns\n                source = self.source\n            else:\n                source = self.source._fast_select_column(self.column_name)\n            return source[key][self.column_name]\n        elif isinstance(key, int):\n            return self.source[key][self.column_name]\n        else:\n            return [item[self.column_name] for item in self.source[key]]\n\n    def __len__(self) -> int:\n        return len(self.source)\n\n    def __repr__(self):\n        return \"Column(\" + repr(list(self[:5]))[:-1] + (\", ...])\" if len(self) > 5 else \"])\")\n\n    def __str__(self):\n        return \"Column(\" + str(list(self[:5]))[:-1] + (\", ...])\" if len(self) > 5 else \"])\")\n\n    def __eq__(self, value):\n        if isinstance(value, Column):\n            return list(self) == list(value)\n        else:\n            return value == list(self)\n\n\nclass Dataset(DatasetInfoMixin, IndexableMixin, TensorflowDatasetMixin):\n    \"\"\"A Dataset backed by an Arrow table.\"\"\"\n\n    def __init__(\n        self,\n        arrow_table: Table,\n        info: Optional[DatasetInfo] = None,\n        split: Optional[NamedSplit] = None,\n        indices_table: Optional[Table] = None,\n        fingerprint: Optional[str] = None,\n    ):\n        info = info.copy() if info is not None else DatasetInfo()\n        DatasetInfoMixin.__init__(self, info=info, split=split)\n        IndexableMixin.__init__(self)\n\n        self._data: Table = _check_table(arrow_table)\n        self._indices: Optional[Table] = _check_table(indices_table) if indices_table is not None else None\n        maybe_register_dataset_for_temp_dir_deletion(self)\n\n        self._format_type: Optional[str] = None\n        self._format_kwargs: dict = {}\n        self._format_columns: Optional[list] = None\n        self._output_all_columns: bool = False\n        self._fingerprint: str = 
fingerprint\n\n        # Read metadata\n\n        if self._data.schema.metadata is not None and b\"huggingface\" in self._data.schema.metadata:\n            metadata = json.loads(self._data.schema.metadata[b\"huggingface\"].decode())\n            if (\n                \"fingerprint\" in metadata and self._fingerprint is None\n            ):  # try to load fingerprint from the arrow file metadata\n                self._fingerprint = metadata[\"fingerprint\"]\n\n        # Infer features if None\n        inferred_features = Features.from_arrow_schema(arrow_table.schema)\n        if self.info.features is None:\n            self.info.features = inferred_features\n        else:  # make sure the nested columns are in the right order\n            try:\n                self.info.features = self.info.features.reorder_fields_as(inferred_features)\n            except ValueError as e:\n                raise ValueError(\n                    f\"{e}\\nThe 'source' features come from dataset_info.json, and the 'target' ones are those of the dataset arrow file.\"\n                )\n\n        # In case there are types like pa.dictionary that we need to convert to the underlying type\n\n        if self.data.schema != self.info.features.arrow_schema:\n            self._data = self.data.cast(self.info.features.arrow_schema)\n\n        # Infer fingerprint if None\n\n        if self._fingerprint is None:\n            self._fingerprint = generate_fingerprint(self)\n\n        # Sanity checks\n\n        if self._info.features is None:\n            raise ValueError(\"Features can't be None in a Dataset object\")\n        if self._fingerprint is None:\n            raise ValueError(\"Fingerprint can't be None in a Dataset object\")\n        if self.info.features.type != inferred_features.type:\n            raise ValueError(\n                f\"External features info don't match the dataset:\\nGot\\n{self.info.features}\\nwith type\\n{self.info.features.type}\\n\\nbut expected something 
like\\n{inferred_features}\\nwith type\\n{inferred_features.type}\"\n            )\n\n        if self._indices is not None:\n            if not pa.types.is_unsigned_integer(self._indices.column(0).type):\n                raise ValueError(\n                    f\"indices must be an Arrow table of unsigned integers, current type is {self._indices.column(0).type}\"\n                )\n        _check_column_names(self._data.column_names)\n\n        self._data = update_metadata_with_features(self._data, self._info.features)\n\n    @property\n    def features(self) -> Features:\n        features = super().features\n        if features is None:  # this is already checked in __init__\n            raise ValueError(\"Features can't be None in a Dataset object\")\n        return features\n\n    @classmethod\n    def from_file(\n        cls,\n        filename: str,\n        info: Optional[DatasetInfo] = None,\n        split: Optional[NamedSplit] = None,\n        indices_filename: Optional[str] = None,\n        in_memory: bool = False,\n    ) -> \"Dataset\":\n        \"\"\"Instantiate a Dataset backed by an Arrow table at filename.\n\n        Args:\n            filename (`str`):\n                File name of the dataset.\n            info (`DatasetInfo`, *optional*):\n                Dataset information, like description, citation, etc.\n            split (`NamedSplit`, *optional*):\n                Name of the dataset split.\n            indices_filename (`str`, *optional*):\n                File names of the indices.\n            in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n\n        Returns:\n            [`Dataset`]\n        \"\"\"\n        table = ArrowReader.read_table(filename, in_memory=in_memory)\n\n        if indices_filename is not None:\n            indices_pa_table = ArrowReader.read_table(indices_filename, in_memory=in_memory)\n        else:\n            indices_pa_table = None\n\n        return cls(\n            
arrow_table=table,\n            info=info,\n            split=split,\n            indices_table=indices_pa_table,\n        )\n\n    @classmethod\n    def from_buffer(\n        cls,\n        buffer: pa.Buffer,\n        info: Optional[DatasetInfo] = None,\n        split: Optional[NamedSplit] = None,\n        indices_buffer: Optional[pa.Buffer] = None,\n    ) -> \"Dataset\":\n        \"\"\"Instantiate a Dataset backed by an Arrow buffer.\n\n        Args:\n            buffer (`pyarrow.Buffer`):\n                Arrow buffer.\n            info (`DatasetInfo`, *optional*):\n                Dataset information, like description, citation, etc.\n            split (`NamedSplit`, *optional*):\n                Name of the dataset split.\n            indices_buffer (`pyarrow.Buffer`, *optional*):\n                Indices Arrow buffer.\n\n        Returns:\n            [`Dataset`]\n        \"\"\"\n        table = InMemoryTable.from_buffer(buffer)\n\n        if indices_buffer is not None:\n            indices_table = InMemoryTable.from_buffer(indices_buffer)\n        else:\n            indices_table = None\n\n        return cls(table, info=info, split=split, indices_table=indices_table)\n\n    @classmethod\n    def from_pandas(\n        cls,\n        df: pd.DataFrame,\n        features: Optional[Features] = None,\n        info: Optional[DatasetInfo] = None,\n        split: Optional[NamedSplit] = None,\n        preserve_index: Optional[bool] = None,\n    ) -> \"Dataset\":\n        \"\"\"\n        Convert `pandas.DataFrame` to a `pyarrow.Table` to create a [`Dataset`].\n\n        The column types in the resulting Arrow Table are inferred from the dtypes of the `pandas.Series` in the\n        DataFrame. In the case of non-object Series, the NumPy dtype is translated to its Arrow equivalent. 
In the\n        case of `object`, we need to guess the datatype by looking at the Python objects in this Series.\n\n        Be aware that Series of the `object` dtype don't carry enough information to always lead to a meaningful Arrow\n        type. In the case that we cannot infer a type, e.g. because the DataFrame is of length 0 or the Series only\n        contains `None/nan` objects, the type is set to `null`. This behavior can be avoided by constructing explicit\n        features and passing it to this function.\n\n        Important: a dataset created with from_pandas() lives in memory\n        and therefore doesn't have an associated cache directory.\n        This may change in the future, but in the meantime if you\n        want to reduce memory usage you should write it back on disk\n        and reload using e.g. save_to_disk / load_from_disk.\n\n        Args:\n            df (`pandas.DataFrame`):\n                Dataframe that contains the dataset.\n            features ([`Features`], *optional*):\n                Dataset features.\n            info (`DatasetInfo`, *optional*):\n                Dataset information, like description, citation, etc.\n            split (`NamedSplit`, *optional*):\n                Name of the dataset split.\n            preserve_index (`bool`, *optional*):\n                Whether to store the index as an additional column in the resulting Dataset.\n                The default of `None` will store the index as a column, except for `RangeIndex` which is stored as metadata only.\n                Use `preserve_index=True` to force it to be stored as a column.\n\n        Returns:\n            [`Dataset`]\n\n        Example:\n\n        ```py\n        >>> ds = Dataset.from_pandas(df)\n        ```\n        \"\"\"\n        if info is not None and features is not None and info.features != features:\n            raise ValueError(\n                f\"Features specified in `features` and `info.features` can't be 
different:\\n{features}\\n{info.features}\"\n            )\n        features = features if features is not None else info.features if info is not None else None\n        if features is not None:\n            features = _fix_for_backward_compatible_features(features)\n        if info is None:\n            info = DatasetInfo()\n        info.features = features\n        table = InMemoryTable.from_pandas(\n            df=df,\n            preserve_index=preserve_index,\n        )\n        if features is not None:\n            # more expensive cast than InMemoryTable.from_pandas(..., schema=features.arrow_schema)\n            # needed to support the str to Audio conversion for instance\n            table = table.cast(features.arrow_schema)\n        return cls(table, info=info, split=split)\n\n    @classmethod\n    def from_polars(\n        cls,\n        df: Union[\"pl.DataFrame\", \"pl.LazyFrame\"],\n        features: Optional[Features] = None,\n        info: Optional[DatasetInfo] = None,\n        split: Optional[NamedSplit] = None,\n    ) -> \"Dataset\":\n        \"\"\"\n        Collect the underlying arrow arrays in an Arrow Table.\n\n        This operation is mostly zero copy.\n\n        Data types that do copy:\n            * CategoricalType\n\n        Args:\n            df (`polars.DataFrame`): DataFrame to convert to Arrow Table\n            features (`Features`, optional): Dataset features.\n            info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.\n            split (`NamedSplit`, optional): Name of the dataset split.\n\n        Examples:\n        ```py\n        >>> ds = Dataset.from_polars(df)\n        ```\n        \"\"\"\n        import polars as pl\n\n        if info is not None and features is not None and info.features != features:\n            raise ValueError(\n                f\"Features specified in `features` and `info.features` can't be different:\\n{features}\\n{info.features}\"\n            )\n        features 
= features if features is not None else info.features if info is not None else None\n        if features is not None:\n            features = _fix_for_backward_compatible_features(features)\n        if info is None:\n            info = DatasetInfo()\n        info.features = features\n        if isinstance(df, pl.LazyFrame):\n            df = df.collect()\n        table = InMemoryTable(df.to_arrow())\n        if features is not None:\n            # more expensive cast than InMemoryTable.from_polars(..., schema=features.arrow_schema)\n            # needed to support the str to Audio conversion for instance\n            table = table.cast(features.arrow_schema)\n        return cls(table, info=info, split=split)\n\n    @classmethod\n    def from_dict(\n        cls,\n        mapping: dict,\n        features: Optional[Features] = None,\n        info: Optional[DatasetInfo] = None,\n        split: Optional[NamedSplit] = None,\n        on_mixed_types: Optional[Literal[\"use_json\"]] = None,\n    ) -> \"Dataset\":\n        \"\"\"\n        Convert `dict` to a `pyarrow.Table` to create a [`Dataset`].\n\n        Important: a dataset created with from_dict() lives in memory\n        and therefore doesn't have an associated cache directory.\n        This may change in the future, but in the meantime if you\n        want to reduce memory usage you should write it back on disk\n        and reload using e.g. 
save_to_disk / load_from_disk.\n\n        Args:\n            mapping (`Mapping`):\n                Mapping of strings to Arrays or Python lists.\n            features ([`Features`], *optional*):\n                Dataset features.\n            info (`DatasetInfo`, *optional*):\n                Dataset information, like description, citation, etc.\n            split (`NamedSplit`, *optional*):\n                Name of the dataset split.\n            on_mixed_types (`Literal[\"use_json\"]`, *optional*, defaults to `None`):\n                If \"use_json\", use the Json() type for mixed-types fields,\n                i.e. unstructured fields that contain data without a predefined schema.\n                In this case, a field with mixed type is set to Json().\n\n                This allows loading lists with a mix of strings/integers/floats\n                for example, or dictionaries with arbitrary value types.\n\n                <Added version=\"4.7.0\"/>\n\n        Returns:\n            [`Dataset`]\n\n        Examples:\n\n        Get a Dataset from a dictionary containing one list per column:\n\n        ```py\n        >>> ds = Dataset.from_dict({\"text\": [\"hello there !\", \"general kenobi !\"]})\n        ```\n\n        Pass features to set the column types, e.g. 
for an image dataset:\n\n        ```py\n        >>> features = Features({\"image\": Image()})\n        >>> ds = Dataset.from_dict({\"image\": [\"path/to/image.png\"]}, features=features)\n        ```\n\n        Datasets are based on Arrow which is a columnar format, and therefore they expect every example to have the same\n        type and subtypes, and dictionaries to have the same keys and values types.\n        Loading a dataset errors out when fields have mismatching types, and fills missing fields in dictionaries with None so all dictionaries have the same keys and value types.\n\n        To avoid this and allow mixed-types without errors, you can use `on_mixed_types=\"use_json\"` or specify `features=` with a [`Json`] type:\n\n        ```py\n        >>> ds = Dataset.from_dict({\"a\": [0, \"foo\", {\"subfield\": \"bar\"}]})\n        Traceback (most recent call last):\n          ...\n          File \"pyarrow/error.pxi\", line 92, in pyarrow.lib.check_status\n        pyarrow.lib.ArrowInvalid: Could not convert 'foo' with type str: tried to convert to int64\n\n        >>> ds = Dataset.from_dict({\"a\": [0, \"foo\", {\"subfield\": \"bar\"}]}, on_mixed_types=\"use_json\")\n        >>> ds.features\n        {'a': Json()}\n        >>> list(ds[\"a\"])\n        [0, \"foo\", {\"subfield\": \"bar\"}]\n\n        >>> features = Features({\"a\": Json()})\n        >>> ds = Dataset.from_dict({\"a\": [0, \"foo\", {\"subfield\": \"bar\"}]}, features=features)\n        >>> ds.features\n        {'a': Json()}\n        >>> list(ds[\"a\"])\n        [0, \"foo\", {\"subfield\": \"bar\"}]\n        ```\n\n        This is also useful for lists of dictionaries with arbitrary keys and values, to avoid filling missing fields with None:\n\n        ```py\n        >>> ds = Dataset.from_dict({\"a\": [[{\"b\": 0}, {\"c\": 0}]]})\n        >>> ds.features\n        {'a': List({'b': Value('int64'), 'c': Value('int64')})}\n        >>> list(ds[\"a\"])\n        [[{'b': 0, 'c': None}, {'b': None, 'c': 
0}]]  # missing fields are filled with None\n\n        >>> features = Features({\"a\": List(Json())})\n        >>> ds = Dataset.from_dict({\"a\": [[{\"b\": 0}, {\"c\": 0}]]}, features=features)\n        >>> ds.features\n        {'a': List(Json())}\n        >>> list(ds[\"a\"])\n        [[{'b': 0}, {'c': 0}]]  # OK\n\n        >>> ds = Dataset.from_dict({\"a\": [[{\"b\": 0}, {\"c\": 0}]]}, on_mixed_types=\"use_json\")\n        >>> ds.features\n        {'a': List(Json())}\n        >>> list(ds[\"a\"])\n        [[{'b': 0}, {'c': 0}]]  # OK\n        ```\n\n        Another example with tool calling data:\n\n        ```py\n        >>> messages = [\n        ...     {\"role\": \"user\", \"content\": \"Turn on the living room lights and play my electronic music playlist.\"},\n        ...     {\"role\": \"assistant\", \"tool_calls\": [\n        ...         {\"type\": \"function\", \"function\": {\n        ...             \"name\": \"control_light\",\n        ...             \"arguments\": {\"room\": \"living room\", \"state\": \"on\"}\n        ...         }},\n        ...         {\"type\": \"function\", \"function\": {\n        ...             \"name\": \"play_music\",\n        ...             \"arguments\": {\"playlist\": \"electronic\"}  # mixed-type here since keys [\"playlist\"] and [\"room\", \"state\"] are different\n        ...         }}]\n        ...     },\n        ...     {\"role\": \"tool\", \"name\": \"control_light\", \"content\": \"The lights in the living room are now on.\"},\n        ...     {\"role\": \"tool\", \"name\": \"play_music\", \"content\": \"The music is now playing.\"},\n        ...     {\"role\": \"assistant\", \"content\": \"Done!\"}\n        ... 
]\n        >>> ds = Dataset.from_dict({\"messages\": [messages]}, on_mixed_types=\"use_json\")\n        >>> ds.features\n        {'messages': List({'role': Value('string'), 'content': Value('string'), 'tool_calls': List(Json()), 'name': Value('string')})}\n        >>> ds[0][\"messages\"][1][\"tool_calls\"][0][\"function\"][\"arguments\"]\n        {\"room\": \"living room\", \"state\": \"on\"}\n        ```\n        \"\"\"\n        if info is not None and features is not None and info.features != features:\n            raise ValueError(\n                f\"Features specified in `features` and `info.features` can't be different:\\n{features}\\n{info.features}\"\n            )\n        features = features if features is not None else info.features if info is not None else None\n        if features is not None:\n            features = _fix_for_backward_compatible_features(features)\n        arrow_typed_mapping = {}\n        for col, data in mapping.items():\n            if isinstance(data, (pa.Array, pa.ChunkedArray)):\n                data = cast_array_to_feature(data, features[col]) if features is not None else data\n            else:\n                data = OptimizedTypedSequence(\n                    features.encode_column(data, col) if features is not None else data,\n                    type=features[col] if features is not None else None,\n                    col=col,\n                    on_mixed_types=on_mixed_types,\n                )\n            arrow_typed_mapping[col] = data\n        mapping = arrow_typed_mapping\n        pa_table = InMemoryTable.from_pydict(mapping=mapping)\n        if info is None:\n            info = DatasetInfo()\n        info.features = features\n        if info.features is None:\n            info.features = Features(\n                {\n                    col: generate_from_arrow_type(data.type)\n                    if isinstance(data, (pa.Array, pa.ChunkedArray))\n                    else data.get_inferred_type()\n                  
  for col, data in mapping.items()\n                }\n            )\n        return cls(pa_table, info=info, split=split)\n\n    @classmethod\n    def from_list(\n        cls,\n        mapping: list[dict],\n        features: Optional[Features] = None,\n        info: Optional[DatasetInfo] = None,\n        split: Optional[NamedSplit] = None,\n        on_mixed_types: Optional[Literal[\"use_json\"]] = None,\n    ) -> \"Dataset\":\n        \"\"\"\n        Convert a list of dicts to a `pyarrow.Table` to create a [`Dataset`]`.\n\n        Note that the keys of the first entry will be used to determine the dataset columns,\n        regardless of what is passed to features.\n\n        Important: a dataset created with from_list() lives in memory\n        and therefore doesn't have an associated cache directory.\n        This may change in the future, but in the meantime if you\n        want to reduce memory usage you should write it back on disk\n        and reload using e.g. save_to_disk / load_from_disk.\n\n        Args:\n            mapping (`List[dict]`): A list of mappings of strings to row values.\n            features (`Features`, optional): Dataset features.\n            info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.\n            split (`NamedSplit`, optional): Name of the dataset split.\n            on_mixed_types (`Literal[\"use_json\"]`, *optional*, defaults to `None`):\n                If \"use_json\", use the Json() type for mixed-types fields,\n                i.e. 
unstructured fields that contain data without a predefined schema.\n                In this case, a field with mixed type is set to Json().\n\n                This allows loading lists with a mix of strings/integers/floats\n                for example, or dictionaries with arbitrary value types.\n\n                <Added version=\"4.7.0\"/>\n\n        Returns:\n            [`Dataset`]\n\n        Examples:\n\n        Get a Dataset from a list containing the examples:\n\n        ```py\n        >>> ds = Dataset.from_list([{\"text\": \"hello there !\"}, {\"text\": \"general kenobi !\"}])\n        ```\n\n        Pass features to set the column types, e.g. for an image dataset:\n\n        ```py\n        >>> features = Features({\"image\": Image()})\n        >>> ds = Dataset.from_list([{\"image\": \"path/to/image.png\"}], features=features)\n        ```\n\n        Datasets are based on Arrow which is a columnar format, and therefore they expect every example to have the same\n        type and subtypes, and dictionaries to have the same keys and values types.\n        Loading a dataset errors out when fields have mismatching types, and fills missing fields in dictionaries with None so all dictionaries have the same keys and value types.\n\n        To avoid this and allow mixed-types without errors, you can use `on_mixed_types=\"use_json\"` or specify `features=` with a [`Json`] type:\n\n        ```py\n        >>> ds = Dataset.from_list([{\"a\": 0}, {\"a\": \"foo\"}, {\"a\": {\"subfield\": \"bar\"}}])\n        Traceback (most recent call last):\n          ...\n          File \"pyarrow/error.pxi\", line 92, in pyarrow.lib.check_status\n        pyarrow.lib.ArrowInvalid: Could not convert 'foo' with type str: tried to convert to int64\n\n        >>> ds = Dataset.from_list([{\"a\": 0}, {\"a\": \"foo\"}, {\"a\": {\"subfield\": \"bar\"}}], on_mixed_types=\"use_json\")\n        >>> ds.features\n        {'a': Json()}\n        >>> list(ds[\"a\"])\n        [0, \"foo\", {\"subfield\": 
\"bar\"}]\n\n        >>> features = Features({\"a\": Json()})\n        >>> ds = Dataset.from_list([{\"a\": 0}, {\"a\": \"foo\"}, {\"a\": {\"subfield\": \"bar\"}}], features=features)\n        >>> ds.features\n        {'a': Json()}\n        >>> list(ds[\"a\"])\n        [0, \"foo\", {\"subfield\": \"bar\"}]\n        ```\n\n        This is also useful for lists of dictionaries with arbitrary keys and values, to avoid filling missing fields with None:\n\n        ```py\n        >>> ds = Dataset.from_list([{\"a\": [{\"b\": 0}, {\"c\": 0}]}])\n        >>> ds.features\n        {'a': List({'b': Value('int64'), 'c': Value('int64')})}\n        >>> list(ds[\"a\"])\n        [[{'b': 0, 'c': None}, {'b': None, 'c': 0}]]  # missing fields are filled with None\n\n        >>> features = Features({\"a\": List(Json())})\n        >>> ds = Dataset.from_list([{\"a\": [{\"b\": 0}, {\"c\": 0}]}], features=features)\n        >>> ds.features\n        {'a': List(Json())}\n        >>> list(ds[\"a\"])\n        [[{'b': 0}, {'c': 0}]]  # OK\n\n        >>> ds = Dataset.from_list([{\"a\": [{\"b\": 0}, {\"c\": 0}]}], on_mixed_types=\"use_json\")\n        >>> ds.features\n        {'a': List(Json())}\n        >>> list(ds[\"a\"])\n        [[{'b': 0}, {'c': 0}]]  # OK\n        ```\n\n        Another example with tool calling data:\n\n        ```py\n        >>> messages = [\n        ...     {\"role\": \"user\", \"content\": \"Turn on the living room lights and play my electronic music playlist.\"},\n        ...     {\"role\": \"assistant\", \"tool_calls\": [\n        ...         {\"type\": \"function\", \"function\": {\n        ...             \"name\": \"control_light\",\n        ...             \"arguments\": {\"room\": \"living room\", \"state\": \"on\"}\n        ...         }},\n        ...         {\"type\": \"function\", \"function\": {\n        ...             \"name\": \"play_music\",\n        ...             
\"arguments\": {\"playlist\": \"electronic\"}  # mixed-type here since keys [\"playlist\"] and [\"room\", \"state\"] are different\n        ...         }}]\n        ...     },\n        ...     {\"role\": \"tool\", \"name\": \"control_light\", \"content\": \"The lights in the living room are now on.\"},\n        ...     {\"role\": \"tool\", \"name\": \"play_music\", \"content\": \"The music is now playing.\"},\n        ...     {\"role\": \"assistant\", \"content\": \"Done!\"}\n        ... ]\n        >>> ds = Dataset.from_list([{\"messages\": messages}], on_mixed_types=\"use_json\")\n        >>> ds.features\n        {'messages': List({'role': Value('string'), 'content': Value('string'), 'tool_calls': List(Json()), 'name': Value('string')})}\n        >>> ds[0][\"messages\"][1][\"tool_calls\"][0][\"function\"][\"arguments\"]\n        {\"room\": \"living room\", \"state\": \"on\"}\n        ```\n        \"\"\"\n        # for simplicity and consistency wrt OptimizedTypedSequence we do not use InMemoryTable.from_pylist here\n        mapping = {k: [r.get(k) for r in mapping] for k in mapping[0]} if mapping else {}\n        return cls.from_dict(mapping, features, info, split, on_mixed_types=on_mixed_types)\n\n    @staticmethod\n    def from_csv(\n        path_or_paths: Union[PathLike, list[PathLike]],\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        num_proc: Optional[int] = None,\n        **kwargs,\n    ) -> \"Dataset\":\n        \"\"\"Create Dataset from CSV file(s).\n\n        Read the CSV files, cache the data in Arrow format on disk and return the Dataset from the memory-mapped Arrow data on disk.\n\n        Args:\n            path_or_paths (`path-like` or list of `path-like`):\n                Path(s) of the CSV file(s).\n            split ([`NamedSplit`], *optional*):\n                Split name to be assigned to the dataset.\n            features 
([`Features`], *optional*):\n                Dataset features.\n            cache_dir (`str`, *optional*, defaults to `\"~/.cache/huggingface/datasets\"`):\n                Directory to cache data.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            num_proc (`int`, *optional*, defaults to `None`):\n                Number of processes when downloading and generating the dataset locally.\n                This is helpful if the dataset is made of multiple files. Multiprocessing is disabled by default.\n\n                <Added version=\"2.8.0\"/>\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`pandas.read_csv`].\n\n        Returns:\n            [`Dataset`]\n\n        Example:\n\n        ```py\n        >>> ds = Dataset.from_csv('path/to/dataset.csv')\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.csv import CsvDatasetReader\n\n        return CsvDatasetReader(\n            path_or_paths,\n            split=split,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            num_proc=num_proc,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_generator(\n        generator: Callable,\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        gen_kwargs: Optional[dict] = None,\n        num_proc: Optional[int] = None,\n        split: NamedSplit = Split.TRAIN,\n        fingerprint: Optional[str] = None,\n        **kwargs,\n    ) -> \"Dataset\":\n        \"\"\"Create a Dataset from a generator.\n\n        Load the data from the generator, cache the data in Arrow format on disk and return the Dataset from the memory-mapped Arrow data on disk.\n\n        Args:\n            generator (:`Callable`):\n                A generator function that 
`yields` examples.\n            features ([`Features`], *optional*):\n                Dataset features.\n            cache_dir (`str`, *optional*, defaults to `\"~/.cache/huggingface/datasets\"`):\n                Directory to cache data.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            gen_kwargs(`dict`, *optional*):\n                Keyword arguments to be passed to the `generator` callable.\n                You can define a sharded dataset by passing the list of shards in `gen_kwargs` and setting `num_proc` greater than 1.\n            num_proc (`int`, *optional*, defaults to `None`):\n                Number of processes when downloading and generating the dataset locally.\n                This is helpful if the dataset is made of multiple files. Multiprocessing is disabled by default.\n                If `num_proc` is greater than one, then all list values in `gen_kwargs` must be the same length. These values will be split between calls to the generator. The number of shards will be the minimum of the shortest list in `gen_kwargs` and `num_proc`.\n\n                <Added version=\"2.7.0\"/>\n            split ([`NamedSplit`], defaults to `Split.TRAIN`):\n                Split name to be assigned to the dataset.\n\n                <Added version=\"2.21.0\"/>\n            fingerprint (`str`, *optional*):\n                Fingerprint that will be used to generate dataset ID.\n                By default `fingerprint` is generated by hashing the generator function and all the args which can be slow\n                if it uses large objects like AI models.\n\n                <Added version=\"4.3.0\"/>\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to :[`GeneratorConfig`].\n\n        Returns:\n            [`Dataset`]\n\n        Example:\n\n        ```py\n        >>> def gen():\n        ...     
yield {\"text\": \"Good\", \"label\": 0}\n        ...     yield {\"text\": \"Bad\", \"label\": 1}\n        ...\n        >>> ds = Dataset.from_generator(gen)\n        ```\n\n        ```py\n        >>> def gen(shards):\n        ...     for shard in shards:\n        ...         with open(shard) as f:\n        ...             for line in f:\n        ...                 yield {\"line\": line}\n        ...\n        >>> shards = [f\"data{i}.txt\" for i in range(32)]\n        >>> ds = Dataset.from_generator(gen, gen_kwargs={\"shards\": shards})\n        ```\n        \"\"\"\n        from .io.generator import GeneratorDatasetInputStream\n\n        return GeneratorDatasetInputStream(\n            generator=generator,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            gen_kwargs=gen_kwargs,\n            num_proc=num_proc,\n            split=split,\n            fingerprint=fingerprint,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_json(\n        path_or_paths: Union[PathLike, list[PathLike]],\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        field: Optional[str] = None,\n        num_proc: Optional[int] = None,\n        **kwargs,\n    ) -> \"Dataset\":\n        \"\"\"Create Dataset from JSON or JSON Lines file(s).\n\n        Read the JSON files, cache the data in Arrow format on disk and return the Dataset from the memory-mapped Arrow data on disk.\n\n        Args:\n            path_or_paths (`path-like` or list of `path-like`):\n                Path(s) of the JSON or JSON Lines file(s).\n            split ([`NamedSplit`], *optional*):\n                Split name to be assigned to the dataset.\n            features ([`Features`], *optional*):\n                 Dataset features.\n            cache_dir (`str`, *optional*, defaults to 
`\"~/.cache/huggingface/datasets\"`):\n                Directory to cache data.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            field (`str`, *optional*):\n                Field name of the JSON file where the dataset is contained in.\n            num_proc (`int`, *optional* defaults to `None`):\n                Number of processes when downloading and generating the dataset locally.\n                This is helpful if the dataset is made of multiple files. Multiprocessing is disabled by default.\n\n                <Added version=\"2.8.0\"/>\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`JsonConfig`].\n\n        Returns:\n            [`Dataset`]\n\n        Example:\n\n        ```py\n        >>> ds = Dataset.from_json('path/to/dataset.json')\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.json import JsonDatasetReader\n\n        return JsonDatasetReader(\n            path_or_paths,\n            split=split,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            field=field,\n            num_proc=num_proc,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_parquet(\n        path_or_paths: Union[PathLike, list[PathLike]],\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        columns: Optional[list[str]] = None,\n        num_proc: Optional[int] = None,\n        filters: Optional[Union[pds.Expression, list[tuple], list[list[tuple]]]] = None,\n        fragment_scan_options: Optional[pds.ParquetFragmentScanOptions] = None,\n        on_bad_files: Literal[\"error\", \"warn\", \"skip\"] = \"error\",\n        **kwargs,\n    ) -> \"Dataset\":\n        \"\"\"Create Dataset from 
Parquet file(s).\n\n        Read the Parquet files, cache the data in Arrow format on disk and return the Dataset from the memory-mapped Arrow data on disk.\n\n        Args:\n            path_or_paths (`path-like` or list of `path-like`):\n                Path(s) of the Parquet file(s).\n            split (`NamedSplit`, *optional*):\n                Split name to be assigned to the dataset.\n            features (`Features`, *optional*):\n                Dataset features.\n            cache_dir (`str`, *optional*, defaults to `\"~/.cache/huggingface/datasets\"`):\n                Directory to cache data.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            columns (`List[str]`, *optional*):\n                If not `None`, only these columns will be read from the file.\n                A column name may be a prefix of a nested field, e.g. 'a' will select\n                'a.b', 'a.c', and 'a.d.e'.\n            num_proc (`int`, *optional*, defaults to `None`):\n                Number of processes when downloading and generating the dataset locally.\n                This is helpful if the dataset is made of multiple files. Multiprocessing is disabled by default.\n\n                <Added version=\"2.8.0\"/>\n            filters (`Union[pyarrow.dataset.Expression, list[tuple], list[list[tuple]]]`, *optional*):\n                Return only the rows matching the filter.\n                If possible the predicate will be pushed down to exploit the partition information\n                or internal metadata found in the data source, e.g. 
Parquet statistics.\n                Otherwise filters the loaded RecordBatches before yielding them.\n            fragment_scan_options (`pyarrow.dataset.ParquetFragmentScanOptions`, *optional*)\n                Scan-specific options for Parquet fragments.\n                This is especially useful to configure buffering and caching.\n\n                <Added version=\"4.2.0\"/>\n            on_bad_files (`Literal[\"error\", \"warn\", \"skip\"]`, *optional*, defaults to \"error\")\n                Specify what to do upon encountering a bad file (a file that can't be read). Allowed values are :\n                * 'error', raise an Exception when a bad file is encountered.\n                * 'warn', raise a warning when a bad file is encountered and skip that file.\n                * 'skip', skip bad files without raising or warning when they are encountered.\n\n                <Added version=\"4.2.0\"/>\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`ParquetConfig`].\n\n        Returns:\n            [`Dataset`]\n\n        Example:\n\n        ```py\n        >>> ds = Dataset.from_parquet('path/to/dataset.parquet')\n        ```\n\n        Load a subset of columns:\n\n        ```python\n        >>> ds = Dataset.from_parquet('path/to/dataset.parquet', columns=[\"col_0\", \"col_1\"])\n        ```\n\n        Efficiently filter data, possibly skipping entire files or row groups:\n\n        ```python\n        >>> filters = [(\"col_0\", \"==\", 0)]\n        >>> ds = Dataset.from_parquet(parquet_files_list, filters=filters)\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.parquet import ParquetDatasetReader\n\n        return ParquetDatasetReader(\n            path_or_paths,\n            split=split,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            columns=columns,\n            
num_proc=num_proc,\n            filters=filters,\n            fragment_scan_options=fragment_scan_options,\n            on_bad_files=on_bad_files,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_text(\n        path_or_paths: Union[PathLike, list[PathLike]],\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        num_proc: Optional[int] = None,\n        keep_linebreaks: bool = False,\n        sample_by: Literal[\"line\", \"paragraph\", \"document\"] = \"line\",\n        **kwargs,\n    ) -> \"Dataset\":\n        \"\"\"Create Dataset from text file(s).\n\n        Read the text files, cache the data in Arrow format on disk and return the Dataset from the memory-mapped Arrow data on disk.\n\n        Args:\n            path_or_paths (`path-like` or list of `path-like`):\n                Path(s) of the text file(s).\n            split (`NamedSplit`, *optional*):\n                Split name to be assigned to the dataset.\n            features (`Features`, *optional*):\n                Dataset features.\n            cache_dir (`str`, *optional*, defaults to `\"~/.cache/huggingface/datasets\"`):\n                Directory to cache data.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            num_proc (`int`, *optional*, defaults to `None`):\n                Number of processes when downloading and generating the dataset locally.\n                This is helpful if the dataset is made of multiple files. 
Multiprocessing is disabled by default.\n\n                <Added version=\"2.8.0\"/>\n            keep_linebreaks (`bool`, defaults to `False`):\n                Whether to keep line breaks.\n            sample_by (`Literal[\"line\", \"paragraph\", \"document\"]`, defaults to \"line\"):\n                Whether to load data per line, paragraph or document.\n                By default one row in the dataset = one line.\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`TextConfig`].\n\n        Returns:\n            [`Dataset`]\n\n        Example:\n\n        ```py\n        >>> ds = Dataset.from_text('path/to/dataset.txt')\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.text import TextDatasetReader\n\n        return TextDatasetReader(\n            path_or_paths,\n            split=split,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            num_proc=num_proc,\n            keep_linebreaks=keep_linebreaks,\n            sample_by=sample_by,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_spark(\n        df: \"pyspark.sql.DataFrame\",\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        keep_in_memory: bool = False,\n        cache_dir: str = None,\n        working_dir: str = None,\n        load_from_cache_file: bool = True,\n        **kwargs,\n    ) -> \"Dataset\":\n        \"\"\"Create a Dataset from a Spark DataFrame. 
Dataset downloading is distributed over Spark workers.\n\n        Read the Spark DataFrame, cache the data in Arrow format on disk and return the Dataset from the memory-mapped Arrow data on disk.\n\n        Args:\n            df (`pyspark.sql.DataFrame`):\n                The DataFrame containing the desired data.\n            split (`NamedSplit`, *optional*):\n                Split name to be assigned to the dataset.\n            features (`Features`, *optional*):\n                Dataset features.\n            cache_dir (`str`, *optional*, defaults to `\"~/.cache/huggingface/datasets\"`):\n                Directory to cache data. When using a multi-node Spark cluster, the cache_dir must be accessible to both\n                workers and the driver.\n            keep_in_memory (`bool`):\n                Whether to copy the data in-memory.\n            working_dir (`str`, *optional*)\n                Intermediate directory for each Spark worker to write data to before moving it to `cache_dir`. 
Setting\n                a non-NFS intermediate directory may improve performance.\n            load_from_cache_file (`bool`):\n                Whether to load the dataset from the cache if possible.\n\n        Returns:\n            [`Dataset`]\n\n        Example:\n\n        ```py\n        >>> df = spark.createDataFrame(\n        ...     data=[[1, \"Elia\"], [2, \"Teo\"], [3, \"Fang\"]],\n        ...     schema=[\"id\", \"name\"],\n        ... )\n        >>> ds = Dataset.from_spark(df)\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.spark import SparkDatasetReader\n\n        if sys.platform == \"win32\":\n            raise OSError(\"Dataset.from_spark is not currently supported on Windows\")\n\n        return SparkDatasetReader(\n            df,\n            split=split,\n            features=features,\n            streaming=False,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            working_dir=working_dir,\n            load_from_cache_file=load_from_cache_file,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_sql(\n        sql: Union[str, \"sqlalchemy.sql.Selectable\"],\n        con: Union[str, \"sqlalchemy.engine.Connection\", \"sqlalchemy.engine.Engine\", \"sqlite3.Connection\"],\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        **kwargs,\n    ) -> \"Dataset\":\n        \"\"\"Create Dataset from SQL query or database table.\n\n        Query the SQL database, cache the data in Arrow format on disk and return the Dataset from the memory-mapped Arrow data on disk.\n\n        Args:\n            sql (`str` or `sqlalchemy.sql.Selectable`):\n                SQL query to be executed or a table name.\n            con (`str` or `sqlite3.Connection` or `sqlalchemy.engine.Connection` or `sqlalchemy.engine.Engine`):\n                A [URI 
string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) used to instantiate a database connection or a SQLite3/SQLAlchemy connection object.\n            features ([`Features`], *optional*):\n                Dataset features.\n            cache_dir (`str`, *optional*, defaults to `\"~/.cache/huggingface/datasets\"`):\n                Directory to cache data.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`SqlConfig`].\n\n        Returns:\n            [`Dataset`]\n\n        Example:\n\n        ```py\n        >>> # Fetch a database table\n        >>> ds = Dataset.from_sql(\"test_data\", \"postgres:///db_name\")\n        >>> # Execute a SQL query on the table\n        >>> ds = Dataset.from_sql(\"SELECT sentence FROM test_data\", \"postgres:///db_name\")\n        >>> # Use a Selectable object to specify the query\n        >>> from sqlalchemy import select, text\n        >>> stmt = select([text(\"sentence\")]).select_from(text(\"test_data\"))\n        >>> ds = Dataset.from_sql(stmt, \"postgres:///db_name\")\n        ```\n\n        > [!TIP]\n        > The returned dataset can only be cached if `con` is specified as URI string.\n        \"\"\"\n        from .io.sql import SqlDatasetReader\n\n        return SqlDatasetReader(\n            sql,\n            con,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            **kwargs,\n        ).read()\n\n    def __setstate__(self, state):\n        self.__dict__.update(state)\n        maybe_register_dataset_for_temp_dir_deletion(self)\n        return self\n\n    def __del__(self):\n        if hasattr(self, \"_data\"):\n            del self._data\n        if hasattr(self, \"_indices\"):\n            del self._indices\n\n    def __enter__(self):\n        return self\n\n    def 
__exit__(self, exc_type, exc_val, exc_tb):\n        # Here `del` is used to del the pyarrow tables. This properly closes the files used for memory mapped tables\n        self.__del__()\n\n    def save_to_disk(\n        self,\n        dataset_path: PathLike,\n        max_shard_size: Optional[Union[str, int]] = None,\n        num_shards: Optional[int] = None,\n        num_proc: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n    ):\n        \"\"\"\n        Saves a dataset to a dataset directory, or in a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n        For [`Image`], [`Audio`] and [`Video`] data:\n\n        All the Image(), Audio() and Video() data are stored in the arrow files.\n        If you want to store paths or urls, please use the Value(\"string\") type.\n\n        Args:\n            dataset_path (`path-like`):\n                Path (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)\n                of the dataset directory where the dataset will be saved to.\n            max_shard_size (`int` or `str`, *optional*, defaults to `\"500MB\"`):\n                The maximum size of the dataset shards to be saved to the filesystem. If expressed as a string, needs to be digits followed by a unit\n                (like `\"50MB\"`).\n            num_shards (`int`, *optional*):\n                Number of shards to write. 
By default the number of shards depends on `max_shard_size` and `num_proc`.\n\n                <Added version=\"2.8.0\"/>\n            num_proc (`int`, *optional*):\n                Number of processes when downloading and generating the dataset locally.\n                Multiprocessing is disabled by default.\n\n                <Added version=\"2.8.0\"/>\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n\n                <Added version=\"2.8.0\"/>\n\n        Example:\n\n        ```py\n        >>> ds.save_to_disk(\"path/to/dataset/directory\")\n        >>> ds.save_to_disk(\"path/to/dataset/directory\", max_shard_size=\"1GB\")\n        >>> ds.save_to_disk(\"path/to/dataset/directory\", num_shards=1024)\n        ```\n        \"\"\"\n        if max_shard_size is not None and num_shards is not None:\n            raise ValueError(\n                \"Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both.\"\n            )\n        if self.list_indexes():\n            raise ValueError(\"please remove all the indexes using `dataset.drop_index` before saving a dataset\")\n\n        if num_shards is None:\n            dataset_nbytes = self._estimate_nbytes()\n            max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)\n            num_shards = int(dataset_nbytes / max_shard_size) + 1\n            num_shards = max(num_shards, num_proc or 1)\n            # if we have only a few large samples, we should only create as many shards as samples\n            num_shards = min(len(self.data), num_shards)\n\n        fs: fsspec.AbstractFileSystem\n        fs, _ = url_to_fs(dataset_path, **(storage_options or {}))\n\n        if not is_remote_filesystem(fs):\n            parent_cache_files_paths = {\n                Path(cache_filename[\"filename\"]).resolve().parent for cache_filename in self.cache_files\n            }\n            # 
Check that the dataset doesn't overwrite itself. It can cause a permission error on Windows and a segfault on linux.\n            if Path(dataset_path).expanduser().resolve() in parent_cache_files_paths:\n                raise PermissionError(\n                    f\"Tried to overwrite {Path(dataset_path).expanduser().resolve()} but a dataset can't overwrite itself.\"\n                )\n\n        fs.makedirs(dataset_path, exist_ok=True)\n\n        # Get json serializable state\n        state = {\n            key: self.__dict__[key]\n            for key in [\n                \"_fingerprint\",\n                \"_format_columns\",\n                \"_format_kwargs\",\n                \"_format_type\",\n                \"_output_all_columns\",\n            ]\n        }\n        state[\"_split\"] = str(self.split) if self.split is not None else self.split\n        state[\"_data_files\"] = [\n            {\"filename\": f\"data-{shard_idx:05d}-of-{num_shards:05d}.arrow\"} for shard_idx in range(num_shards)\n        ]\n        for k in state[\"_format_kwargs\"].keys():\n            try:\n                json.dumps(state[\"_format_kwargs\"][k])\n            except TypeError as e:\n                raise TypeError(\n                    str(e) + f\"\\nThe format kwargs must be JSON serializable, but key '{k}' isn't.\"\n                ) from None\n        # Get json serializable dataset info\n        dataset_info = asdict(self._info)\n\n        shards_done = 0\n        pbar = hf_tqdm(\n            unit=\" examples\",\n            total=len(self),\n            desc=f\"Saving the dataset ({shards_done}/{num_shards} shards)\",\n        )\n        kwargs_per_job = (\n            {\n                \"job_id\": shard_idx,\n                \"shard\": self.shard(num_shards=num_shards, index=shard_idx, contiguous=True),\n                \"fpath\": posixpath.join(dataset_path, f\"data-{shard_idx:05d}-of-{num_shards:05d}.arrow\"),\n                \"storage_options\": 
storage_options,\n            }\n            for shard_idx in range(num_shards)\n        )\n        shard_lengths = [None] * num_shards\n        shard_sizes = [None] * num_shards\n        if num_proc is not None and num_proc >= 1:\n            with mp.get_context(\"spawn\").Pool(num_proc) as pool:\n                with pbar:\n                    for job_id, done, content in iflatmap_unordered(\n                        pool, Dataset._save_to_disk_single, kwargs_iterable=kwargs_per_job\n                    ):\n                        if done:\n                            shards_done += 1\n                            pbar.set_description(f\"Saving the dataset ({shards_done}/{num_shards} shards)\")\n                            logger.debug(f\"Finished writing shard number {job_id} of {num_shards}.\")\n                            shard_lengths[job_id], shard_sizes[job_id] = content\n                        else:\n                            pbar.update(content)\n        else:\n            with pbar:\n                for kwargs in kwargs_per_job:\n                    for job_id, done, content in Dataset._save_to_disk_single(**kwargs):\n                        if done:\n                            shards_done += 1\n                            pbar.set_description(f\"Saving the dataset ({shards_done}/{num_shards} shards)\")\n                            logger.debug(f\"Finished writing shard number {job_id} of {num_shards}.\")\n                            shard_lengths[job_id], shard_sizes[job_id] = content\n                        else:\n                            pbar.update(content)\n        with fs.open(\n            posixpath.join(dataset_path, config.DATASET_STATE_JSON_FILENAME), \"w\", encoding=\"utf-8\"\n        ) as state_file:\n            json.dump(state, state_file, indent=2, sort_keys=True)\n        with fs.open(\n            posixpath.join(dataset_path, config.DATASET_INFO_FILENAME), \"w\", encoding=\"utf-8\"\n        ) as dataset_info_file:\n            # 
Sort only the first level of keys, or we might shuffle fields of nested features if we use sort_keys=True\n            sorted_keys_dataset_info = {key: dataset_info[key] for key in sorted(dataset_info)}\n            json.dump(sorted_keys_dataset_info, dataset_info_file, indent=2)\n\n    @staticmethod\n    def _save_to_disk_single(job_id: int, shard: \"Dataset\", fpath: str, storage_options: Optional[dict]):\n        batch_size = config.DEFAULT_MAX_BATCH_SIZE\n\n        num_examples_progress_update = 0\n        writer = ArrowWriter(\n            features=shard.features,\n            path=fpath,\n            storage_options=storage_options,\n            embed_local_files=True,\n        )\n        try:\n            _time = time.time()\n            for pa_table in shard.with_format(\"arrow\").iter(batch_size):\n                writer.write_table(pa_table)\n                num_examples_progress_update += len(pa_table)\n                if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:\n                    _time = time.time()\n                    yield job_id, False, num_examples_progress_update\n                    num_examples_progress_update = 0\n        finally:\n            yield job_id, False, num_examples_progress_update\n            num_examples, num_bytes = writer.finalize()\n            writer.close()\n\n        yield job_id, True, (num_examples, num_bytes)\n\n    @staticmethod\n    def _build_local_temp_path(uri_or_path: str) -> Path:\n        \"\"\"\n        Builds and returns a Path concatenating a local temporary dir with the dir path (or absolute/relative\n        path extracted from the uri) passed.\n\n        Args:\n            uri_or_path (`str`): Path (e.g. 
`\"dataset/train\"`) or remote URI (e.g.\n                `\"s3://my-bucket/dataset/train\"`) to concatenate.\n\n        Returns:\n            :class:`Path`: the concatenated path (temp dir + path)\n        \"\"\"\n        src_dataset_path = Path(uri_or_path)\n        tmp_dir = get_temporary_cache_files_directory()\n        return Path(tmp_dir, src_dataset_path.relative_to(src_dataset_path.anchor))\n\n    @staticmethod\n    def load_from_disk(\n        dataset_path: PathLike,\n        keep_in_memory: Optional[bool] = None,\n        storage_options: Optional[dict] = None,\n    ) -> \"Dataset\":\n        \"\"\"\n        Loads a dataset that was previously saved using [`save_to_disk`] from a dataset directory, or from a\n        filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n        Args:\n            dataset_path (`path-like`):\n                Path (e.g. `\"dataset/train\"`) or remote URI (e.g. `\"s3://my-bucket/dataset/train\"`)\n                of the dataset directory where the dataset will be loaded from.\n            keep_in_memory (`bool`, defaults to `None`):\n                Whether to copy the dataset in-memory. If `None`, the\n                dataset will not be copied in-memory unless explicitly enabled by setting\n                `datasets.config.IN_MEMORY_MAX_SIZE` to nonzero. 
See more details in the\n                [improve performance](../cache#improve-performance) section.\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n\n                <Added version=\"2.8.0\"/>\n\n        Returns:\n            [`Dataset`] or [`DatasetDict`]:\n            - If `dataset_path` is a path of a dataset directory, the dataset requested.\n            - If `dataset_path` is a path of a dataset dict directory, a `datasets.DatasetDict` with each split.\n\n        Example:\n\n        ```py\n        >>> ds = load_from_disk(\"path/to/dataset/directory\")\n        ```\n        \"\"\"\n        fs: fsspec.AbstractFileSystem\n        fs, dataset_path = url_to_fs(dataset_path, **(storage_options or {}))\n\n        dest_dataset_path = dataset_path\n        dataset_dict_json_path = posixpath.join(dest_dataset_path, config.DATASETDICT_JSON_FILENAME)\n        dataset_state_json_path = posixpath.join(dest_dataset_path, config.DATASET_STATE_JSON_FILENAME)\n        dataset_info_path = posixpath.join(dest_dataset_path, config.DATASET_INFO_FILENAME)\n\n        dataset_dict_is_file = fs.isfile(dataset_dict_json_path)\n        dataset_info_is_file = fs.isfile(dataset_info_path)\n        dataset_state_is_file = fs.isfile(dataset_state_json_path)\n        if not dataset_info_is_file and not dataset_state_is_file:\n            if dataset_dict_is_file:\n                raise FileNotFoundError(\n                    f\"No such files: '{dataset_info_path}', nor '{dataset_state_json_path}' found. Expected to load a `Dataset` object, but got a `DatasetDict`. Please use either `datasets.load_from_disk` or `DatasetDict.load_from_disk` instead.\"\n                )\n            raise FileNotFoundError(\n                f\"No such files: '{dataset_info_path}', nor '{dataset_state_json_path}' found. 
Expected to load a `Dataset` object but provided path is not a `Dataset`.\"\n            )\n        if not dataset_info_is_file:\n            if dataset_dict_is_file:\n                raise FileNotFoundError(\n                    f\"No such file: '{dataset_info_path}' found. Expected to load a `Dataset` object, but got a `DatasetDict`. Please use either `datasets.load_from_disk` or `DatasetDict.load_from_disk` instead.\"\n                )\n            raise FileNotFoundError(\n                f\"No such file: '{dataset_info_path}'. Expected to load a `Dataset` object but provided path is not a `Dataset`.\"\n            )\n        if not dataset_state_is_file:\n            if dataset_dict_is_file:\n                raise FileNotFoundError(\n                    f\"No such file: '{dataset_state_json_path}' found. Expected to load a `Dataset` object, but got a `DatasetDict`. Please use either `datasets.load_from_disk` or `DatasetDict.load_from_disk` instead.\"\n                )\n            raise FileNotFoundError(\n                f\"No such file: '{dataset_state_json_path}'. 
Expected to load a `Dataset` object but provided path is not a `Dataset`.\"\n            )\n\n        # copies file from filesystem if it is remote filesystem to local filesystem and modifies dataset_path to temp directory containing local copies\n        if is_remote_filesystem(fs):\n            src_dataset_path = dest_dataset_path\n            dest_dataset_path = Dataset._build_local_temp_path(src_dataset_path)\n            fs.download(src_dataset_path, dest_dataset_path.as_posix(), recursive=True)\n            dataset_state_json_path = posixpath.join(dest_dataset_path, config.DATASET_STATE_JSON_FILENAME)\n            dataset_info_path = posixpath.join(dest_dataset_path, config.DATASET_INFO_FILENAME)\n\n        with open(dataset_state_json_path, encoding=\"utf-8\") as state_file:\n            state = json.load(state_file)\n        with open(dataset_info_path, encoding=\"utf-8\") as dataset_info_file:\n            dataset_info = DatasetInfo.from_dict(json.load(dataset_info_file))\n\n        dataset_size = estimate_dataset_size(\n            Path(dest_dataset_path, data_file[\"filename\"]) for data_file in state[\"_data_files\"]\n        )\n        keep_in_memory = keep_in_memory if keep_in_memory is not None else is_small_dataset(dataset_size)\n        table_cls = InMemoryTable if keep_in_memory else MemoryMappedTable\n\n        arrow_table = concat_tables(\n            thread_map(\n                table_cls.from_file,\n                [posixpath.join(dest_dataset_path, data_file[\"filename\"]) for data_file in state[\"_data_files\"]],\n                tqdm_class=hf_tqdm,\n                desc=\"Loading dataset from disk\",\n                # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached\n                disable=len(state[\"_data_files\"]) <= 16 or None,\n            )\n        )\n\n        split = state[\"_split\"]\n        split = Split(split) if split is not None else split\n\n        dataset = Dataset(\n 
           arrow_table=arrow_table,\n            info=dataset_info,\n            split=split,\n            fingerprint=state[\"_fingerprint\"],\n        )\n\n        format = {\n            \"type\": state[\"_format_type\"],\n            \"format_kwargs\": state[\"_format_kwargs\"],\n            \"columns\": state[\"_format_columns\"],\n            \"output_all_columns\": state[\"_output_all_columns\"],\n        }\n        dataset = dataset.with_format(**format)\n\n        return dataset\n\n    @property\n    def data(self) -> Table:\n        \"\"\"The Apache Arrow table backing the dataset.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.data\n        MemoryMappedTable\n        text: string\n        label: int64\n        ----\n        text: [[\"compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .\",\"the soundtrack alone is worth the price of admission .\",\"rodriguez does a splendid job of racial profiling hollywood style--casting excellent latin actors of all ages--a trend long overdue .\",\"beneath the film's obvious determination to shock at any cost lies considerable skill and determination , backed by sheer nerve .\",\"bielinsky is a filmmaker of impressive talent .\",\"so beautifully acted and directed , it's clear that washington most certainly has a new career ahead of him if he so chooses .\",\"a visual spectacle full of stunning images and effects .\",\"a gentle and engrossing character study .\",\"it's enough to watch huppert scheming , with her small , intelligent eyes as steady as any noir villain , and to enjoy the perfectly pitched web of tension that chabrol spins .\",\"an engrossing portrait of uncompromising artists trying to create something original against the backdrop of a corporate music industry 
that only seems to care about the bottom line .\",...,\"ultimately , jane learns her place as a girl , softens up and loses some of the intensity that made her an interesting character to begin with .\",\"ah-nuld's action hero days might be over .\",\"it's clear why deuces wild , which was shot two years ago , has been gathering dust on mgm's shelf .\",\"feels like nothing quite so much as a middle-aged moviemaker's attempt to surround himself with beautiful , half-naked women .\",\"when the precise nature of matthew's predicament finally comes into sharp focus , the revelation fails to justify the build-up .\",\"this picture is murder by numbers , and as easy to be bored by as your abc's , despite a few whopping shootouts .\",\"hilarious musical comedy though stymied by accents thick as mud .\",\"if you are into splatter movies , then you will probably have a reasonably good time with the salton sea .\",\"a dull , simple-minded and stereotypical tale of drugs , death and mind-numbing indifference on the inner-city streets .\",\"the feature-length stretch . . . 
strains the show's concept .\"]]\n        label: [[1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0]]\n        ```\n        \"\"\"\n        return self._data\n\n    @property\n    def cache_files(self) -> list[dict]:\n        \"\"\"The cache files containing the Apache Arrow table backing the dataset.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.cache_files\n        [{'filename': '/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46/rotten_tomatoes_movie_review-validation.arrow'}]\n        ```\n        \"\"\"\n        cache_files = list_table_cache_files(self._data)\n        if self._indices is not None:\n            cache_files += list_table_cache_files(self._indices)\n        return [{\"filename\": cache_filename} for cache_filename in cache_files]\n\n    @property\n    def num_columns(self) -> int:\n        \"\"\"Number of columns in the dataset.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.num_columns\n        2\n        ```\n        \"\"\"\n        return self._data.num_columns\n\n    @property\n    def num_rows(self) -> int:\n        \"\"\"Number of rows in the dataset (same as [`Dataset.__len__`]).\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.num_rows\n        1066\n        ```\n        \"\"\"\n        if self._indices is not None:\n            return self._indices.num_rows\n        return self._data.num_rows\n\n    @property\n    def column_names(self) -> list[str]:\n        \"\"\"Names of the columns in the 
dataset.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.column_names\n        ['text', 'label']\n        ```\n        \"\"\"\n        return self._data.column_names\n\n    @property\n    def shape(self) -> tuple[int, int]:\n        \"\"\"Shape of the dataset (number of rows, number of columns).\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.shape\n        (1066, 2)\n        ```\n        \"\"\"\n        if self._indices is not None:\n            return (self._indices.num_rows, self._data.num_columns)\n        return self._data.shape\n\n    def unique(self, column: str) -> list:\n        \"\"\"Return a list of the unique elements in a column.\n\n        This is implemented in the low-level backend and as such, very fast.\n\n        Args:\n            column (`str`):\n                Column name (list all the column names with [`~datasets.Dataset.column_names`]).\n\n        Returns:\n            `list`: List of unique elements in the given column.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.unique('label')\n        [1, 0]\n        ```\n        \"\"\"\n        if column not in self._data.column_names:\n            raise ValueError(f\"Column ({column}) not in table columns ({self._data.column_names}).\")\n\n        if self._indices is not None and self._indices.num_rows != self._data.num_rows:\n            dataset = self.flatten_indices()\n        else:\n            dataset = self\n\n        return dataset._data.column(column).unique().to_pylist()\n\n    def class_encode_column(self, column: str, include_nulls: bool 
= False) -> \"Dataset\":\n        \"\"\"Casts the given column as [`~datasets.features.ClassLabel`] and updates the table.\n\n        Args:\n            column (`str`):\n                The name of the column to cast (list all the column names with [`~datasets.Dataset.column_names`])\n            include_nulls (`bool`, defaults to `False`):\n                Whether to include null values in the class labels. If `True`, the null values will be encoded as the `\"None\"` class label.\n\n                <Added version=\"1.14.2\"/>\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"google/boolq\", split=\"validation\")\n        >>> ds.features\n        {'answer': Value('bool'),\n         'passage': Value('string'),\n         'question': Value('string')}\n        >>> ds = ds.class_encode_column('answer')\n        >>> ds.features\n        {'answer': ClassLabel(num_classes=2, names=['False', 'True']),\n         'passage': Value('string'),\n         'question': Value('string')}\n        ```\n        \"\"\"\n        # Sanity checks\n        if column not in self._data.column_names:\n            raise ValueError(f\"Column ({column}) not in table columns ({self._data.column_names}).\")\n        src_feat = self._info.features[column]\n        if not isinstance(src_feat, Value):\n            raise ValueError(\n                f\"Class encoding is only supported for {Value.__name__} column, and column {column} is {type(src_feat).__name__}.\"\n            )\n\n        if src_feat.dtype != \"string\" or (include_nulls and None in self.unique(column)):\n\n            def stringify_column(batch):\n                batch[column] = [\n                    str(sample) if include_nulls or sample is not None else None for sample in batch[column]\n                ]\n                return batch\n\n            dset = self.map(\n                stringify_column,\n                batched=True,\n                desc=\"Stringifying 
the column\",\n            )\n        else:\n            dset = self\n\n        # Create the new feature\n        class_names = sorted(str(sample) for sample in dset.unique(column) if include_nulls or sample is not None)\n        dst_feat = ClassLabel(names=class_names)\n\n        def cast_to_class_labels(batch):\n            batch[column] = [\n                dst_feat.str2int(str(sample)) if include_nulls or sample is not None else None\n                for sample in batch[column]\n            ]\n            return batch\n\n        new_features = dset.features.copy()\n        new_features[column] = dst_feat\n\n        dset = dset.map(\n            cast_to_class_labels,\n            batched=True,\n            features=new_features,\n            desc=\"Casting to class labels\",\n        )\n\n        return dset\n\n    @fingerprint_transform(inplace=False)\n    def flatten(self, new_fingerprint: Optional[str] = None, max_depth=16) -> \"Dataset\":\n        \"\"\"Flatten the table.\n        Each column with a struct type is flattened into one column per struct field.\n        Other columns are left unchanged.\n\n        Args:\n            new_fingerprint (`str`, *optional*):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.\n\n        Returns:\n            [`Dataset`]: A copy of the dataset with flattened columns.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"rajpurkar/squad\", split=\"train\")\n        >>> ds.features\n        {'id': Value('string'),\n         'title': Value('string'),\n         'context': Value('string'),\n         'question': Value('string'),\n         'answers': {'text': List(Value('string')),\n         'answer_start': List(Value('int32'))}}\n        >>> ds = ds.flatten()\n        >>> ds\n        Dataset({\n            features: 
['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],\n            num_rows: 87599\n        })\n        ```\n        \"\"\"\n        dataset = copy.deepcopy(self)\n        for depth in range(1, max_depth):\n            if any(isinstance(field.type, pa.StructType) for field in dataset._data.schema):\n                dataset._data = dataset._data.flatten()\n            else:\n                break\n        dataset.info.features = self._info.features.flatten(max_depth=max_depth)\n        dataset.info.features = Features({col: dataset.info.features[col] for col in dataset.data.column_names})\n        dataset._data = update_metadata_with_features(dataset._data, dataset.features)\n        logger.info(f\"Flattened dataset from depth {depth} to depth {1 if depth + 1 < max_depth else 'unknown'}.\")\n        dataset._fingerprint = new_fingerprint\n        return dataset\n\n    def cast(\n        self,\n        features: Features,\n        batch_size: Optional[int] = 1000,\n        keep_in_memory: bool = False,\n        load_from_cache_file: Optional[bool] = None,\n        cache_file_name: Optional[str] = None,\n        writer_batch_size: Optional[int] = 1000,\n        num_proc: Optional[int] = None,\n    ) -> \"Dataset\":\n        \"\"\"\n        Cast the dataset to a new set of features.\n\n        Args:\n            features ([`Features`]):\n                New features to cast the dataset to.\n                The name of the fields in the features must match the current column names.\n                The type of the data must also be convertible from one type to the other.\n                For non-trivial conversion, e.g. 
`str` <-> `ClassLabel` you should use [`~datasets.Dataset.map`] to update the Dataset.\n            batch_size (`int`, defaults to `1000`):\n                Number of examples per batch provided to cast.\n                If `batch_size <= 0` or `batch_size == None` then provide the full dataset as a single batch to cast.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            load_from_cache_file (`bool`, defaults to `True` if caching is enabled):\n                If a cache file storing the current computation from `function`\n                can be identified, use it instead of recomputing.\n            cache_file_name (`str`, *optional*, defaults to `None`):\n                Provide the name of a path for the cache file. It is used to store the\n                results of the computation instead of the automatically generated cache file name.\n            writer_batch_size (`int`, defaults to `1000`):\n                Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running [`~datasets.Dataset.map`].\n            num_proc (`int`, *optional*, defaults to `None`):\n                Number of processes for multiprocessing. 
By default it doesn't\n                use multiprocessing.\n\n        Returns:\n            [`Dataset`]: A copy of the dataset with casted features.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset, ClassLabel, Value\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.features\n        {'label': ClassLabel(names=['neg', 'pos']),\n         'text': Value('string')}\n        >>> new_features = ds.features.copy()\n        >>> new_features['label'] = ClassLabel(names=['bad', 'good'])\n        >>> new_features['text'] = Value('large_string')\n        >>> ds = ds.cast(new_features)\n        >>> ds.features\n        {'label': ClassLabel(names=['bad', 'good']),\n         'text': Value('large_string')}\n        ```\n        \"\"\"\n        if sorted(features) != sorted(self._data.column_names):\n            raise ValueError(\n                f\"The columns in features ({list(features)}) must be identical \"\n                f\"as the columns in the dataset: {self._data.column_names}\"\n            )\n\n        features = _fix_for_backward_compatible_features(features)\n        schema = features.arrow_schema\n        format = self.format\n        dataset = self.with_format(\"arrow\")\n        # capture the PyArrow version here to make the lambda serializable on Windows\n        dataset = dataset.map(\n            partial(table_cast, schema=schema),\n            batched=True,\n            batch_size=batch_size,\n            keep_in_memory=keep_in_memory,\n            load_from_cache_file=load_from_cache_file,\n            cache_file_name=cache_file_name,\n            writer_batch_size=writer_batch_size,\n            num_proc=num_proc,\n            features=features,\n            desc=\"Casting the dataset\",\n        )\n        dataset = dataset.with_format(**format)\n        return dataset\n\n    @fingerprint_transform(inplace=False)\n    def cast_column(self, column: str, 
feature: FeatureType, new_fingerprint: Optional[str] = None) -> \"Dataset\":\n        \"\"\"Cast column to feature for decoding.\n\n        Args:\n            column (`str`):\n                Column name.\n            feature (`FeatureType`):\n                Target feature.\n            new_fingerprint (`str`, *optional*):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.\n\n        Returns:\n            [`Dataset`]\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset, ClassLabel\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.features\n        {'label': ClassLabel(names=['neg', 'pos']),\n         'text': Value('string')}\n        >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))\n        >>> ds.features\n        {'label': ClassLabel(names=['bad', 'good']),\n         'text': Value('string')}\n        ```\n        \"\"\"\n        feature = _fix_for_backward_compatible_features(feature)\n        if hasattr(feature, \"decode_example\"):\n            dataset = copy.deepcopy(self)\n            dataset._info.features[column] = feature\n            dataset._fingerprint = new_fingerprint\n            dataset._data = dataset._data.cast(dataset.features.arrow_schema)\n            dataset._data = update_metadata_with_features(dataset._data, dataset.features)\n            return dataset\n        else:\n            features = self.features\n            features[column] = feature\n            return self.cast(features)\n\n    @transmit_format\n    @fingerprint_transform(inplace=False)\n    def remove_columns(self, column_names: Union[str, list[str]], new_fingerprint: Optional[str] = None) -> \"Dataset\":\n        \"\"\"\n        Remove one or several column(s) in the dataset and the features associated to them.\n\n  
      You can also remove a column using [`~datasets.Dataset.map`] with `remove_columns` but the present method\n        doesn't copy the data of the remaining columns and is thus faster.\n\n        Args:\n            column_names (`Union[str, List[str]]`):\n                Name of the column(s) to remove.\n            new_fingerprint (`str`, *optional*):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.\n\n        Returns:\n            [`Dataset`]: A copy of the dataset object without the columns to remove.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds = ds.remove_columns('label')\n        Dataset({\n            features: ['text'],\n            num_rows: 1066\n        })\n        >>> ds = ds.remove_columns(column_names=ds.column_names) # Removing all the columns returns an empty dataset with the `num_rows` property set to 0\n        Dataset({\n            features: [],\n            num_rows: 0\n        })\n        ```\n        \"\"\"\n        dataset = copy.deepcopy(self)\n        if isinstance(column_names, str):\n            column_names = [column_names]\n\n        missing_columns = set(column_names) - set(self._data.column_names)\n        if missing_columns:\n            raise ValueError(\n                f\"Column name {list(missing_columns)} not in the dataset. 
\"\n                f\"Current columns in the dataset: {dataset._data.column_names}\"\n            )\n\n        for column_name in column_names:\n            del dataset._info.features[column_name]\n\n        dataset._data = dataset._data.drop(column_names)\n        dataset._data = update_metadata_with_features(dataset._data, dataset.features)\n        dataset._fingerprint = new_fingerprint\n        return dataset\n\n    @fingerprint_transform(inplace=False)\n    def rename_column(\n        self, original_column_name: str, new_column_name: str, new_fingerprint: Optional[str] = None\n    ) -> \"Dataset\":\n        \"\"\"\n        Rename a column in the dataset, and move the features associated to the original column under the new column\n        name.\n\n        Args:\n            original_column_name (`str`):\n                Name of the column to rename.\n            new_column_name (`str`):\n                New name for the column.\n            new_fingerprint (`str`, *optional*):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.\n\n        Returns:\n            [`Dataset`]: A copy of the dataset with a renamed column.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds = ds.rename_column('label', 'label_new')\n        Dataset({\n            features: ['text', 'label_new'],\n            num_rows: 1066\n        })\n        ```\n        \"\"\"\n        dataset = copy.deepcopy(self)\n        if original_column_name not in dataset._data.column_names:\n            raise ValueError(\n                f\"Original column name {original_column_name} not in the dataset. 
\"\n                f\"Current columns in the dataset: {dataset._data.column_names}\"\n            )\n        if new_column_name in dataset._data.column_names:\n            raise ValueError(\n                f\"New column name {new_column_name} already in the dataset. \"\n                f\"Please choose a column name which is not already in the dataset. \"\n                f\"Current columns in the dataset: {dataset._data.column_names}\"\n            )\n        if not new_column_name:\n            raise ValueError(\"New column name is empty.\")\n\n        def rename(columns):\n            return [new_column_name if col == original_column_name else col for col in columns]\n\n        new_column_names = rename(self._data.column_names)\n        if self._format_columns is not None:\n            dataset._format_columns = rename(self._format_columns)\n\n        dataset._info.features = Features(\n            {\n                new_column_name if col == original_column_name else col: feature\n                for col, feature in self._info.features.items()\n            }\n        )\n\n        dataset._data = dataset._data.rename_columns(new_column_names)\n        dataset._data = update_metadata_with_features(dataset._data, dataset.features)\n        dataset._fingerprint = new_fingerprint\n        return dataset\n\n    @fingerprint_transform(inplace=False)\n    def rename_columns(self, column_mapping: dict[str, str], new_fingerprint: Optional[str] = None) -> \"Dataset\":\n        \"\"\"\n        Rename several columns in the dataset, and move the features associated to the original columns under\n        the new column names.\n\n        Args:\n            column_mapping (`Dict[str, str]`):\n                A mapping of columns to rename to their new names\n            new_fingerprint (`str`, *optional*):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, 
and the transform arguments.\n\n        Returns:\n            [`Dataset`]: A copy of the dataset with renamed columns\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds = ds.rename_columns({'text': 'text_new', 'label': 'label_new'})\n        Dataset({\n            features: ['text_new', 'label_new'],\n            num_rows: 1066\n        })\n        ```\n        \"\"\"\n        dataset = copy.deepcopy(self)\n\n        extra_columns = set(column_mapping.keys()) - set(dataset.column_names)\n        if extra_columns:\n            raise ValueError(\n                f\"Original column names {extra_columns} not in the dataset. \"\n                f\"Current columns in the dataset: {dataset._data.column_names}\"\n            )\n\n        number_of_duplicates_in_new_columns = len(column_mapping.values()) - len(set(column_mapping.values()))\n        if number_of_duplicates_in_new_columns != 0:\n            raise ValueError(\n                \"New column names must all be different, but this column mapping \"\n                f\"has {number_of_duplicates_in_new_columns} duplicates\"\n            )\n\n        empty_new_columns = [new_col for new_col in column_mapping.values() if not new_col]\n        if empty_new_columns:\n            raise ValueError(f\"New column names {empty_new_columns} are empty.\")\n\n        def rename(columns):\n            return [column_mapping[col] if col in column_mapping else col for col in columns]\n\n        new_column_names = rename(self._data.column_names)\n        if self._format_columns is not None:\n            dataset._format_columns = rename(self._format_columns)\n\n        dataset._info.features = Features(\n            {\n                column_mapping[col] if col in column_mapping else col: feature\n                for col, feature in (self._info.features or {}).items()\n            }\n    
    )\n\n        dataset._data = dataset._data.rename_columns(new_column_names)\n        dataset._data = update_metadata_with_features(dataset._data, dataset.features)\n        dataset._fingerprint = new_fingerprint\n        return dataset\n\n    @transmit_format\n    @fingerprint_transform(inplace=False)\n    def select_columns(self, column_names: Union[str, list[str]], new_fingerprint: Optional[str] = None) -> \"Dataset\":\n        \"\"\"Select one or several column(s) in the dataset and the features\n        associated to them.\n\n        Args:\n            column_names (`Union[str, List[str]]`):\n                Name of the column(s) to keep.\n            new_fingerprint (`str`, *optional*):\n                The new fingerprint of the dataset after transform. If `None`,\n                the new fingerprint is computed using a hash of the previous\n                fingerprint, and the transform arguments.\n\n        Returns:\n            [`Dataset`]: A copy of the dataset object which only consists of\n            selected columns.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds = ds.select_columns(['text'])\n        >>> ds\n        Dataset({\n            features: ['text'],\n            num_rows: 1066\n        })\n        ```\n        \"\"\"\n        if isinstance(column_names, str):\n            column_names = [column_names]\n\n        missing_columns = set(column_names) - set(self._data.column_names)\n        if missing_columns:\n            raise ValueError(\n                f\"Column name {list(missing_columns)} not in the \"\n                \"dataset. 
Current columns in the dataset: \"\n                f\"{self._data.column_names}.\"\n            )\n\n        dataset = copy.deepcopy(self)\n        dataset._data = dataset._data.select(column_names)\n        dataset._info.features = Features({col: self._info.features[col] for col in dataset._data.column_names})\n        dataset._data = update_metadata_with_features(dataset._data, dataset.features)\n        dataset._fingerprint = new_fingerprint\n        return dataset\n\n    @transmit_format\n    def _fast_select_column(self, column_name: str) -> \"Dataset\":\n        dataset = copy.copy(self)\n        dataset._data = dataset._data.select([column_name])\n        dataset._info = DatasetInfo(features=Features({column_name: self._info.features[column_name]}))\n        return dataset\n\n    def __len__(self):\n        \"\"\"Number of rows in the dataset.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.__len__\n        <bound method Dataset.__len__ of Dataset({\n            features: ['text', 'label'],\n            num_rows: 1066\n        })>\n        ```\n        \"\"\"\n        return self.num_rows\n\n    def __iter__(self):\n        \"\"\"Iterate through the examples.\n\n        If a formatting is set with [`Dataset.set_format`] rows will be returned with the\n        selected format.\n        \"\"\"\n        if self._indices is None:\n            # Fast iteration\n            # Benchmark: https://gist.github.com/mariosasko/0248288a2e3a7556873969717c1fe52b (fast_iter_batch)\n            format_kwargs = self._format_kwargs if self._format_kwargs is not None else {}\n            formatter = get_formatter(self._format_type, features=self._info.features, **format_kwargs)\n            batch_size = config.ARROW_READER_BATCH_SIZE_IN_DATASET_ITER\n            for pa_subtable in table_iter(self.data, batch_size=batch_size):\n 
               for i in range(pa_subtable.num_rows):\n                    pa_subtable_ex = pa_subtable.slice(i, 1)\n                    formatted_output = format_table(\n                        pa_subtable_ex,\n                        0,\n                        formatter=formatter,\n                        format_columns=self._format_columns,\n                        output_all_columns=self._output_all_columns,\n                    )\n                    yield formatted_output\n        else:\n            for i in range(self.num_rows):\n                yield self._getitem(\n                    i,\n                )\n\n    def iter(self, batch_size: int, drop_last_batch: bool = False):\n        \"\"\"Iterate through the batches of size `batch_size`.\n\n        If a formatting is set with [`~datasets.Dataset.set_format`] rows will be returned with the\n        selected format.\n\n        Args:\n            batch_size (:obj:`int`): size of each batch to yield.\n            drop_last_batch (:obj:`bool`, default `False`): Whether a last batch smaller than the batch_size should be\n                dropped\n        \"\"\"\n        if self._indices is None:\n            # Fast iteration\n            # Benchmark: https://gist.github.com/mariosasko/0248288a2e3a7556873969717c1fe52b (fast_iter_batch)\n            format_kwargs = self._format_kwargs if self._format_kwargs is not None else {}\n            formatter = get_formatter(self._format_type, features=self._info.features, **format_kwargs)\n            for pa_subtable in table_iter(self.data, batch_size=batch_size, drop_last_batch=drop_last_batch):\n                formatted_batch = format_table(\n                    pa_subtable,\n                    range(pa_subtable.num_rows),\n                    formatter=formatter,\n                    format_columns=self._format_columns,\n                    output_all_columns=self._output_all_columns,\n                )\n                yield formatted_batch\n        else:\n         
   num_rows = self.num_rows if not drop_last_batch else self.num_rows // batch_size * batch_size\n            for i in range(0, num_rows, batch_size):\n                yield self._getitem(\n                    slice(i, i + batch_size),\n                )\n\n    def __repr__(self):\n        return f\"Dataset({{\\n    features: {list(self._info.features.keys())},\\n    num_rows: {self.num_rows}\\n}})\"\n\n    @property\n    def format(self):\n        return {\n            \"type\": self._format_type,\n            \"format_kwargs\": self._format_kwargs,\n            \"columns\": self.column_names if self._format_columns is None else self._format_columns,\n            \"output_all_columns\": self._output_all_columns,\n        }\n\n    @contextlib.contextmanager\n    def formatted_as(\n        self,\n        type: Optional[str] = None,\n        columns: Optional[list] = None,\n        output_all_columns: bool = False,\n        **format_kwargs,\n    ):\n        \"\"\"To be used in a `with` statement. 
Set `__getitem__` return format (type and columns).\n\n        Args:\n            type (`str`, *optional*):\n                Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`.\n                `None` means `__getitem__` returns python objects (default).\n            columns (`List[str]`, *optional*):\n                Columns to format in the output.\n                `None` means `__getitem__` returns all columns (default).\n            output_all_columns (`bool`, defaults to `False`):\n                Keep un-formatted columns as well in the output (as python objects).\n            **format_kwargs (additional keyword arguments):\n                Keyword arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.\n        \"\"\"\n        old_format_type = self._format_type\n        old_format_kwargs = self._format_kwargs\n        old_format_columns = self._format_columns\n        old_output_all_columns = self._output_all_columns\n        try:\n            self.set_format(type, columns, output_all_columns, **format_kwargs)\n            yield\n        finally:\n            self.set_format(old_format_type, old_format_columns, old_output_all_columns, **old_format_kwargs)\n\n    @fingerprint_transform(inplace=True)\n    def set_format(\n        self,\n        type: Optional[str] = None,\n        columns: Optional[list] = None,\n        output_all_columns: bool = False,\n        **format_kwargs,\n    ):\n        \"\"\"Set `__getitem__` return format (type and columns). 
The data formatting is applied on-the-fly.\n        The format `type` (for example \"numpy\") is used to format batches when using `__getitem__`.\n        It's also possible to use custom transforms for formatting using [`~datasets.Dataset.set_transform`].\n\n        Args:\n            type (`str`, *optional*):\n                Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`.\n                `None` means `__getitem__` returns python objects (default).\n            columns (`List[str]`, *optional*):\n                Columns to format in the output.\n                `None` means `__getitem__` returns all columns (default).\n            output_all_columns (`bool`, defaults to `False`):\n                Keep un-formatted columns as well in the output (as python objects).\n            **format_kwargs (additional keyword arguments):\n                Keywords arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.\n\n        It is possible to call [`~datasets.Dataset.map`] after calling `set_format`. Since `map` may add new columns, then the list of formatted columns\n        gets updated. 
In this case, if you apply `map` on a dataset to add a new column, then this column will be formatted as:\n\n            ```\n            new formatted columns = (all columns - previously unformatted columns)\n            ```\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> from transformers import AutoTokenizer\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n        >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True)\n        >>> ds.set_format(type='numpy', columns=['text', 'label'])\n        >>> ds.format\n        {'type': 'numpy',\n        'format_kwargs': {},\n        'columns': ['text', 'label'],\n        'output_all_columns': False}\n        ```\n        \"\"\"\n        format_kwargs.update(format_kwargs.pop(\"format_kwargs\", {}))  # allow to use self.set_format(**self.format)\n\n        # Check that the format_type and format_kwargs are valid and make it possible to have a Formatter\n        type = get_format_type_from_alias(type)\n        get_formatter(type, features=self._info.features, **format_kwargs)\n\n        # Check filter column\n        if isinstance(columns, str):\n            columns = [columns]\n        if isinstance(columns, tuple):\n            columns = list(columns)\n        if columns is not None:\n            missing_columns = set(columns) - set(self._data.column_names)\n            if missing_columns:\n                raise ValueError(\n                    f\"Columns {list(missing_columns)} not in the dataset. 
Current columns in the dataset: {self._data.column_names}\"\n                )\n        if columns is not None:\n            columns = columns.copy()  # Ensures modifications made to the list after this call don't cause bugs\n\n        self._format_type = type\n        self._format_kwargs = format_kwargs\n        self._format_columns = columns\n        self._output_all_columns = output_all_columns\n        logger.debug(\n            \"Set __getitem__(key) output type to %s for %s columns \"\n            \" (when key is int or slice) and %s output other (un-formatted) columns.\",\n            \"python objects\" if type is None else type,\n            \"no\" if columns is None else str(columns),\n            \"do\" if output_all_columns else \"don't\",\n        )\n\n    def reset_format(self):\n        \"\"\"Reset `__getitem__` return format to python objects and all columns.\n\n        Same as `self.set_format()`\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> from transformers import AutoTokenizer\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n        >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True)\n        >>> ds.set_format(type='numpy', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])\n        >>> ds.format\n        {'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],\n         'format_kwargs': {},\n         'output_all_columns': False,\n         'type': 'numpy'}\n        >>> ds.reset_format()\n        >>> ds.format\n        {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],\n         'format_kwargs': {},\n         'output_all_columns': False,\n         'type': None}\n        ```\n        \"\"\"\n        self.set_format()\n\n    def set_transform(\n        self,\n        
transform: Optional[Callable],\n        columns: Optional[list] = None,\n        output_all_columns: bool = False,\n    ):\n        \"\"\"Set `__getitem__` return format using this transform. The transform is applied on-the-fly on batches when `__getitem__` is called.\n        As [`~datasets.Dataset.set_format`], this can be reset using [`~datasets.Dataset.reset_format`].\n\n        Args:\n            transform (`Callable`, *optional*):\n                User-defined formatting transform, replaces the format defined by [`~datasets.Dataset.set_format`].\n                A formatting function is a callable that takes a batch (as a `dict`) as input and returns a batch.\n                This function is applied right before returning the objects in `__getitem__`.\n            columns (`List[str]`, *optional*):\n                Columns to format in the output.\n                If specified, then the input batch of the transform only contains those columns.\n            output_all_columns (`bool`, defaults to `False`):\n                Keep un-formatted columns as well in the output (as python objects).\n                If set to True, then the other un-formatted columns are kept with the output of the transform.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> from transformers import AutoTokenizer\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n        >>> def encode(batch):\n        ...     
return tokenizer(batch['text'], padding=True, truncation=True, return_tensors='pt')\n        >>> ds.set_transform(encode)\n        >>> ds[0]\n        {'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n         1, 1]),\n         'input_ids': tensor([  101, 29353,  2135, 15102,  1996,  9428, 20868,  2890,  8663,  6895,\n                 20470,  2571,  3663,  2090,  4603,  3017,  3008,  1998,  2037, 24211,\n                 5637,  1998, 11690,  2336,  1012,   102]),\n         'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                 0, 0])}\n        ```\n        \"\"\"\n        self.set_format(\"custom\", columns=columns, output_all_columns=output_all_columns, transform=transform)\n\n    def with_format(\n        self,\n        type: Optional[str] = None,\n        columns: Optional[list] = None,\n        output_all_columns: bool = False,\n        **format_kwargs,\n    ):\n        \"\"\"Set `__getitem__` return format (type and columns). 
The data formatting is applied on-the-fly.\n        The format `type` (for example \"numpy\") is used to format batches when using `__getitem__`.\n\n        It's also possible to use custom transforms for formatting using [`~datasets.Dataset.with_transform`].\n\n        Contrary to [`~datasets.Dataset.set_format`], `with_format` returns a new [`Dataset`] object.\n\n        Args:\n            type (`str`, *optional*):\n                Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`.\n                `None` means `__getitem__` returns python objects (default).\n            columns (`List[str]`, *optional*):\n                Columns to format in the output.\n                `None` means `__getitem__` returns all columns (default).\n            output_all_columns (`bool`, defaults to `False`):\n                Keep un-formatted columns as well in the output (as python objects).\n            **format_kwargs (additional keyword arguments):\n                Keywords arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> from transformers import AutoTokenizer\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n        >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True)\n        >>> ds.format\n        {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],\n         'format_kwargs': {},\n         'output_all_columns': False,\n         'type': None}\n        >>> ds = ds.with_format(\"torch\")\n        >>> ds.format\n        {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],\n         'format_kwargs': {},\n         'output_all_columns': False,\n         
'type': 'torch'}\n        >>> ds[0]\n        {'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',\n         'label': tensor(1),\n         'input_ids': tensor([  101, 18027, 16310, 16001,  1103,  9321,   178, 11604,  7235,  6617,\n                1742,  2165,  2820,  1206,  6588, 22572, 12937,  1811,  2153,  1105,\n                1147, 12890, 19587,  6463,  1105, 15026,  1482,   119,   102,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0]),\n         'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),\n         'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n                1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}\n        ```\n        \"\"\"\n        dataset = copy.deepcopy(self)\n        dataset.set_format(type=type, columns=columns, output_all_columns=output_all_columns, **format_kwargs)\n        return dataset\n\n    def with_transform(\n        self,\n        transform: Optional[Callable],\n        columns: Optional[list] = 
None,\n        output_all_columns: bool = False,\n    ):\n        \"\"\"Set `__getitem__` return format using this transform. The transform is applied on-the-fly on batches when `__getitem__` is called.\n\n        As [`~datasets.Dataset.set_format`], this can be reset using [`~datasets.Dataset.reset_format`].\n\n        Contrary to [`~datasets.Dataset.set_transform`], `with_transform` returns a new [`Dataset`] object.\n\n        Args:\n            transform (`Callable`, `optional`):\n                User-defined formatting transform, replaces the format defined by [`~datasets.Dataset.set_format`].\n                A formatting function is a callable that takes a batch (as a `dict`) as input and returns a batch.\n                This function is applied right before returning the objects in `__getitem__`.\n            columns (`List[str]`, `optional`):\n                Columns to format in the output.\n                If specified, then the input batch of the transform only contains those columns.\n            output_all_columns (`bool`, defaults to `False`):\n                Keep un-formatted columns as well in the output (as python objects).\n                If set to `True`, then the other un-formatted columns are kept with the output of the transform.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> from transformers import AutoTokenizer\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n        >>> def encode(example):\n        ...     
return tokenizer(example[\"text\"], padding=True, truncation=True, return_tensors='pt')\n        >>> ds = ds.with_transform(encode)\n        >>> ds[0]\n        {'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n         1, 1, 1, 1, 1]),\n         'input_ids': tensor([  101, 18027, 16310, 16001,  1103,  9321,   178, 11604,  7235,  6617,\n                 1742,  2165,  2820,  1206,  6588, 22572, 12937,  1811,  2153,  1105,\n                 1147, 12890, 19587,  6463,  1105, 15026,  1482,   119,   102]),\n         'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                 0, 0, 0, 0, 0])}\n        ```\n        \"\"\"\n        dataset = copy.deepcopy(self)\n        dataset.set_transform(transform=transform, columns=columns, output_all_columns=output_all_columns)\n        return dataset\n\n    def _getitem(self, key: Union[int, slice, str, ListLike[int]], **kwargs) -> Union[dict, list]:\n        \"\"\"\n        Can be used to index columns (by string names) or rows (by integer, slice, or list-like of integer indices)\n        \"\"\"\n        if isinstance(key, bool):\n            raise TypeError(\"dataset index must be int, str, slice or collection of int, not bool\")\n        format_type = kwargs[\"format_type\"] if \"format_type\" in kwargs else self._format_type\n        format_columns = kwargs[\"format_columns\"] if \"format_columns\" in kwargs else self._format_columns\n        output_all_columns = (\n            kwargs[\"output_all_columns\"] if \"output_all_columns\" in kwargs else self._output_all_columns\n        )\n        format_kwargs = kwargs[\"format_kwargs\"] if \"format_kwargs\" in kwargs else self._format_kwargs\n        format_kwargs = format_kwargs if format_kwargs is not None else {}\n        formatter = get_formatter(format_type, features=self._info.features, **format_kwargs)\n        pa_subtable = query_table(self._data, key, 
indices=self._indices)\n        formatted_output = format_table(\n            pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns\n        )\n        return formatted_output\n\n    @overload\n    def __getitem__(self, key: Union[int, slice, Iterable[int]]) -> dict:  # noqa: F811\n        ...\n\n    @overload\n    def __getitem__(self, key: str) -> list:  # noqa: F811\n        ...\n\n    def __getitem__(self, key):  # noqa: F811\n        \"\"\"Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools).\"\"\"\n        if isinstance(key, str):\n            if self._format_type is None or self._format_type not in (\"arrow\", \"pandas\", \"polars\"):\n                return Column(self, key)\n        return self._getitem(key)\n\n    def __getitems__(self, keys: list) -> list:\n        \"\"\"Can be used to get a batch using a list of integer indices.\"\"\"\n        batch = self.__getitem__(keys)\n        n_examples = len(batch[next(iter(batch))])\n        return [{col: array[i] for col, array in batch.items()} for i in range(n_examples)]\n\n    def cleanup_cache_files(self) -> int:\n        \"\"\"Clean up all cache files in the dataset cache directory, except the currently used cache file if there is\n        one.\n\n        Be careful when running this command that no other process is currently using other cache files.\n\n        Returns:\n            `int`: Number of removed files.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.cleanup_cache_files()\n        10\n        ```\n        \"\"\"\n        current_cache_files = [os.path.abspath(cache_file[\"filename\"]) for cache_file in self.cache_files]\n        if not current_cache_files:\n            return 0\n        cache_directory = 
os.path.dirname(current_cache_files[0])\n        logger.info(f\"Listing files in {cache_directory}\")\n        files: list[str] = os.listdir(cache_directory)\n        files_to_remove = []\n        for f_name in files:\n            full_name = os.path.abspath(os.path.join(cache_directory, f_name))\n            if f_name.startswith(\"cache-\") and f_name.endswith(\".arrow\"):\n                if full_name in current_cache_files:\n                    logger.info(f\"Keeping currently used cache file at {full_name}\")\n                    continue\n                files_to_remove.append(full_name)\n        for file_path in files_to_remove:\n            logger.info(f\"Removing {file_path}\")\n            os.remove(file_path)\n        return len(files_to_remove)\n\n    def _get_cache_file_path(self, fingerprint):\n        if is_caching_enabled() and self.cache_files:\n            cache_file_name = \"cache-\" + fingerprint + \".arrow\"\n            cache_directory = os.path.dirname(self.cache_files[0][\"filename\"])\n        else:\n            cache_file_name = \"cache-\" + generate_random_fingerprint() + \".arrow\"\n            cache_directory = get_temporary_cache_files_directory()\n        cache_file_path = os.path.join(cache_directory, cache_file_name)\n        return cache_file_path\n\n    @transmit_format\n    def map(\n        self,\n        function: Optional[Callable] = None,\n        with_indices: bool = False,\n        with_rank: bool = False,\n        input_columns: Optional[Union[str, list[str]]] = None,\n        batched: bool = False,\n        batch_size: Optional[int] = 1000,\n        drop_last_batch: bool = False,\n        remove_columns: Optional[Union[str, list[str]]] = None,\n        keep_in_memory: bool = False,\n        load_from_cache_file: Optional[bool] = None,\n        cache_file_name: Optional[str] = None,\n        writer_batch_size: Optional[int] = 1000,\n        features: Optional[Features] = None,\n        disable_nullable: bool = False,\n      
  fn_kwargs: Optional[dict] = None,\n        num_proc: Optional[int] = None,\n        suffix_template: str = \"_{rank:05d}_of_{num_proc:05d}\",\n        new_fingerprint: Optional[str] = None,\n        desc: Optional[str] = None,\n        try_original_type: Optional[bool] = True,\n        on_mixed_types: Optional[Literal[\"use_json\"]] = \"use_json\",\n    ) -> \"Dataset\":\n        \"\"\"\n        Apply a function to all the examples in the table (individually or in batches) and update the table.\n        If your function returns a column that already exists, then it overwrites it.\n\n        You can specify whether the function should be batched or not with the `batched` parameter:\n\n        - If batched is `False`, then the function takes 1 example in and should return 1 example.\n          An example is a dictionary, e.g. `{\"text\": \"Hello there !\"}`.\n        - If batched is `True` and `batch_size` is 1, then the function takes a batch of 1 example as input and can return a batch with 1 or more examples.\n          A batch is a dictionary, e.g. a batch of 1 example is `{\"text\": [\"Hello there !\"]}`.\n        - If batched is `True` and `batch_size` is `n > 1`, then the function takes a batch of `n` examples as input and can return a batch with `n` examples, or with an arbitrary number of examples.\n          Note that the last batch may have less than `n` examples.\n          A batch is a dictionary, e.g. 
a batch of `n` examples is `{\"text\": [\"Hello there !\"] * n}`.\n\n        If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls.\n        It is recommended to use an `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.\n\n        Args:\n            function (`Callable`): Function with one of the following signatures:\n\n                - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False` and `with_rank=False`\n                - `function(example: Dict[str, Any], *extra_args) -> Dict[str, Any]` if `batched=False` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)\n                - `function(batch: Dict[str, List]) -> Dict[str, List]` if `batched=True` and `with_indices=False` and `with_rank=False`\n                - `function(batch: Dict[str, List], *extra_args) -> Dict[str, List]` if `batched=True` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)\n\n                For advanced usage, the function can also return a `pyarrow.Table`.\n                If the function is asynchronous, then `map` will run your function in parallel.\n                Moreover, if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged.\n                If no function is provided, it defaults to the identity function: `lambda x: x`.\n            with_indices (`bool`, defaults to `False`):\n                Provide example indices to `function`. Note that in this case the\n                signature of `function` should be `def function(example, idx[, rank]): ...`.\n            with_rank (`bool`, defaults to `False`):\n                Provide process rank to `function`. 
Note that in this case the\n                signature of `function` should be `def function(example[, idx], rank): ...`.\n            input_columns (`Optional[Union[str, List[str]]]`, defaults to `None`):\n                The columns to be passed into `function`\n                as positional arguments. If `None`, a `dict` mapping to all formatted columns is passed as one argument.\n            batched (`bool`, defaults to `False`):\n                Provide batch of examples to `function`.\n            batch_size (`int`, *optional*, defaults to `1000`):\n                Number of examples per batch provided to `function` if `batched=True`.\n                If `batch_size <= 0` or `batch_size == None`, provide the full dataset as a single batch to `function`.\n            drop_last_batch (`bool`, defaults to `False`):\n                Whether a last batch smaller than the batch_size should be\n                dropped instead of being processed by the function.\n            remove_columns (`Optional[Union[str, List[str]]]`, defaults to `None`):\n                Remove a selection of columns while doing the mapping.\n                Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding\n                columns with names in `remove_columns`, these columns will be kept.\n            keep_in_memory (`bool`, defaults to `False`):\n                Keep the dataset in memory instead of writing it to a cache file.\n            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):\n                If a cache file storing the current computation from `function`\n                can be identified, use it instead of recomputing.\n            cache_file_name (`str`, *optional*, defaults to `None`):\n                Provide the name of a path for the cache file. 
It is used to store the\n                results of the computation instead of the automatically generated cache file name.\n            writer_batch_size (`int`, defaults to `1000`):\n                Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                A higher value makes the processing do fewer lookups; a lower value consumes less temporary memory while running `map`.\n            features (`Optional[datasets.Features]`, defaults to `None`):\n                Use a specific Features to store the cache file\n                instead of the automatically generated one.\n            disable_nullable (`bool`, defaults to `False`):\n                Disallow null values in the table.\n            fn_kwargs (`Dict`, *optional*, defaults to `None`):\n                Keyword arguments to be passed to `function`.\n            num_proc (`int`, *optional*, defaults to `None`):\n                The number of processes to use for multiprocessing.\n                - If `None` or `0`, no multiprocessing is used and the operation runs in the main process.\n                - If `1` or greater, one or multiple worker processes are used to process data in parallel.\n                Note: The function passed to `map()` must be picklable for multiprocessing to work correctly\n                (i.e., prefer functions defined at the top level of a module, not inside another function or class).\n            suffix_template (`str`):\n                If `cache_file_name` is specified, then this suffix\n                will be added at the end of the base name of each cache file. Defaults to `\"_{rank:05d}_of_{num_proc:05d}\"`. 
For example, if `cache_file_name` is \"processed.arrow\", then for\n                `rank=1` and `num_proc=4`, the resulting file would be `\"processed_00001_of_00004.arrow\"` for the default suffix.\n            new_fingerprint (`str`, *optional*, defaults to `None`):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.\n            desc (`str`, *optional*, defaults to `None`):\n                Meaningful description to be displayed alongside the progress bar while mapping examples.\n            try_original_type (`Optional[bool]`, defaults to `True`):\n                Try to keep the types of the original columns (e.g. int32 -> int32).\n                Set to False if you want to always infer new types.\n            on_mixed_types (`Literal[\"use_json\"]`, *optional*, defaults to `\"use_json\"`):\n                If \"use_json\", use the Json() type for mixed-types fields,\n                i.e. unstructured fields that contain data without a predefined schema.\n                In this case, a field with mixed type is set to Json().\n\n                This allows loading lists with a mix of strings/integers/floats\n                for example, or dictionaries with arbitrary value types.\n\n                <Added version=\"4.7.0\"/>\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> def add_prefix(example):\n        ...     example[\"text\"] = \"Review: \" + example[\"text\"]\n        ...     
return example\n        >>> ds = ds.map(add_prefix)\n        >>> ds[0:3][\"text\"]\n        ['Review: compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',\n         'Review: the soundtrack alone is worth the price of admission .',\n         'Review: rodriguez does a splendid job of racial profiling hollywood style--casting excellent latin actors of all ages--a trend long overdue .']\n\n        # process a batch of examples\n        >>> ds = ds.map(lambda example: tokenizer(example[\"text\"]), batched=True)\n        # set number of processors\n        >>> ds = ds.map(add_prefix, num_proc=4)\n        ```\n        \"\"\"\n        if keep_in_memory and cache_file_name is not None:\n            raise ValueError(\"Please use either `keep_in_memory` or `cache_file_name` but not both.\")\n\n        if num_proc == 0:\n            num_proc = None\n        elif num_proc is not None and num_proc < 0:\n            raise ValueError(\"num_proc must be >= 0 or None.\")\n\n        string_formatter = string.Formatter()\n        fields = {field_name for _, field_name, _, _ in string_formatter.parse(suffix_template) if field_name}\n        if fields != {\"rank\", \"num_proc\"}:\n            raise ValueError(f\"suffix_template must contain exactly the fields 'rank' and 'num_proc', got: {fields}\")\n\n        # If the array is empty we do nothing (but we make sure to handle an empty indices mapping and remove the requested columns anyway)\n        if len(self) == 0:\n            if self._indices is not None:  # empty indices mapping\n                self = Dataset(\n                    self.data.slice(0, 0),\n                    info=self.info.copy(),\n                    split=self.split,\n                    fingerprint=new_fingerprint,\n                )\n            if remove_columns:\n                return self.remove_columns(remove_columns)\n            else:\n                return 
self\n\n        if function is None:\n            function = lambda x: x  # noqa: E731\n\n        if isinstance(input_columns, str):\n            input_columns = [input_columns]\n\n        if input_columns is not None:\n            missing_columns = set(input_columns) - set(self._data.column_names)\n            if missing_columns:\n                raise ValueError(\n                    f\"Input column {list(missing_columns)} not in the dataset. Current columns in the dataset: {self._data.column_names}\"\n                )\n\n        if isinstance(remove_columns, str):\n            remove_columns = [remove_columns]\n\n        if remove_columns is not None:\n            missing_columns = set(remove_columns) - set(self._data.column_names)\n            if missing_columns:\n                raise ValueError(\n                    f\"Column to remove {list(missing_columns)} not in the dataset. Current columns in the dataset: {self._data.column_names}\"\n                )\n\n        load_from_cache_file = load_from_cache_file if load_from_cache_file is not None else is_caching_enabled()\n\n        if fn_kwargs is None:\n            fn_kwargs = {}\n\n        if features is not None:\n            features = _fix_for_backward_compatible_features(features)\n\n        if num_proc is not None and num_proc > len(self):\n            num_proc = len(self)\n            logger.warning(\n                f\"num_proc must be <= {len(self)}. 
Reducing num_proc to {num_proc} for dataset of size {len(self)}.\"\n            )\n\n        dataset_kwargs = {\n            \"shard\": self,\n            \"function\": function,\n            \"with_indices\": with_indices,\n            \"with_rank\": with_rank,\n            \"input_columns\": input_columns,\n            \"batched\": batched,\n            \"batch_size\": batch_size,\n            \"drop_last_batch\": drop_last_batch,\n            \"remove_columns\": remove_columns,\n            \"keep_in_memory\": keep_in_memory,\n            \"writer_batch_size\": writer_batch_size,\n            \"features\": features,\n            \"disable_nullable\": disable_nullable,\n            \"fn_kwargs\": fn_kwargs,\n            \"try_original_type\": try_original_type,\n            \"on_mixed_types\": on_mixed_types,\n        }\n\n        if new_fingerprint is None:\n            # we create a unique hash from the function,\n            # current dataset file and the mapping args\n            transform = format_transform_for_fingerprint(Dataset._map_single)\n            kwargs_for_fingerprint = format_kwargs_for_fingerprint(Dataset._map_single, (), dataset_kwargs)\n            kwargs_for_fingerprint[\"fingerprint_name\"] = \"new_fingerprint\"\n            new_fingerprint = update_fingerprint(self._fingerprint, transform, kwargs_for_fingerprint)\n        else:\n            validate_fingerprint(new_fingerprint)\n        dataset_kwargs[\"new_fingerprint\"] = new_fingerprint\n\n        if self.cache_files:\n            if cache_file_name is None:\n                cache_file_name = self._get_cache_file_path(new_fingerprint)\n        dataset_kwargs[\"cache_file_name\"] = cache_file_name\n\n        if cache_file_name is not None:\n            cache_file_prefix, cache_file_ext = os.path.splitext(cache_file_name)\n            if not cache_file_ext:\n                raise ValueError(f\"Expected cache_file_name to have an extension, but got: {cache_file_name}\")\n        else:\n     
       cache_file_prefix = cache_file_ext = None\n\n        def load_processed_shard_from_cache(shard_kwargs: dict[str, Any]) -> Dataset:\n            \"\"\"Load a processed shard from cache if it exists, otherwise throw an error.\"\"\"\n            shard = shard_kwargs[\"shard\"]\n            # Check if we've already cached this computation (indexed by a hash)\n            if shard_kwargs[\"cache_file_name\"] is not None:\n                if os.path.exists(shard_kwargs[\"cache_file_name\"]) and load_from_cache_file:\n                    info = shard.info.copy()\n                    info.features = features\n                    return Dataset.from_file(shard_kwargs[\"cache_file_name\"], info=info, split=shard.split)\n            raise NonExistentDatasetError\n\n        existing_cache_file_map: dict[int, list[str]] = defaultdict(list)\n        if cache_file_name is not None:\n            if os.path.exists(cache_file_name):\n                existing_cache_file_map[1] = [cache_file_name]\n\n            assert cache_file_prefix is not None and cache_file_ext is not None\n            cache_file_with_suffix_pattern = cache_file_prefix + suffix_template + cache_file_ext\n\n            for cache_file in glob.iglob(f\"{cache_file_prefix}*{cache_file_ext}\"):\n                suffix_variable_map = string_to_dict(\n                    Path(cache_file).as_posix(), Path(cache_file_with_suffix_pattern).as_posix()\n                )\n                if suffix_variable_map is not None:\n                    file_num_proc = int(suffix_variable_map[\"num_proc\"])\n                    existing_cache_file_map[file_num_proc].append(cache_file)\n\n        num_shards = num_proc or 1\n        if existing_cache_file_map:\n            # to avoid remapping when a different num_proc is given than when originally cached, update num_shards to\n            # what was used originally\n\n            def select_existing_cache_files(mapped_num_proc: int) -> tuple[float, ...]:\n                
percent_missing = (mapped_num_proc - len(existing_cache_file_map[mapped_num_proc])) / mapped_num_proc\n                num_shards_diff = abs(mapped_num_proc - num_shards)\n                return (\n                    percent_missing,  # choose the most complete set of existing cache files\n                    num_shards_diff,  # then choose the mapped_num_proc closest to the current num_proc\n                    mapped_num_proc,  # finally, choose whichever mapped_num_proc is lower\n                )\n\n            num_shards = min(existing_cache_file_map, key=select_existing_cache_files)\n\n        existing_cache_files = existing_cache_file_map[num_shards]\n\n        def format_cache_file_name(\n            cache_file_name: Optional[str],\n            rank: Union[int, Literal[\"*\"]],  # noqa: F722\n        ) -> Optional[str]:\n            if not cache_file_name:\n                return cache_file_name\n\n            assert cache_file_prefix is not None and cache_file_ext is not None\n\n            if isinstance(rank, int):\n                cache_file_name = (\n                    cache_file_prefix + suffix_template.format(rank=rank, num_proc=num_shards) + cache_file_ext\n                )\n                if not os.path.exists(cache_file_name):\n                    process_name = (\n                        \"Main process\" if num_proc is None or num_proc == 1 else f\"Process #{rank % num_shards + 1}\"\n                    )\n                    logger.info(f\"{process_name} will write at {cache_file_name}\")\n            else:\n                # TODO: this assumes the format_spec of rank in suffix_template\n                cache_file_name = (\n                    cache_file_prefix\n                    + suffix_template.replace(\"{rank:05d}\", \"{rank}\").format(rank=rank, num_proc=num_shards)\n                    + cache_file_ext\n                )\n            return cache_file_name\n\n        def format_new_fingerprint(new_fingerprint: str, rank: int) -> 
str:\n            new_fingerprint = new_fingerprint + suffix_template.format(rank=rank, num_proc=num_shards)\n            validate_fingerprint(new_fingerprint)\n            return new_fingerprint\n\n        if num_proc is not None and num_proc >= 1:\n            prev_env = deepcopy(os.environ)\n            # check if parallelism if off\n            # from https://github.com/huggingface/tokenizers/blob/bb668bc439dc34389b71dbb8ce0c597f15707b53/tokenizers/src/utils/parallelism.rs#L22\n            if prev_env.get(\"TOKENIZERS_PARALLELISM\", \"false\").lower() not in (\n                \"\",\n                \"off\",\n                \"false\",\n                \"f\",\n                \"no\",\n                \"n\",\n                \"0\",\n            ):\n                logger.warning(\"Setting TOKENIZERS_PARALLELISM=false for forked processes.\")\n            os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n        else:\n            prev_env = os.environ\n\n        kwargs_per_job: list[Optional[dict[str, Any]]]\n        if num_shards == 1:\n            shards = [self]\n            kwargs_per_job = [dataset_kwargs]\n        else:\n            shards = [\n                self.shard(num_shards=num_shards, index=rank, contiguous=True, keep_in_memory=keep_in_memory)\n                for rank in range(num_shards)\n            ]\n            kwargs_per_job = [\n                {\n                    **dataset_kwargs,\n                    \"shard\": shards[rank],\n                    \"cache_file_name\": format_cache_file_name(cache_file_name, rank),\n                    \"rank\": rank,\n                    \"offset\": sum(len(s) for s in shards[:rank]),\n                    \"new_fingerprint\": format_new_fingerprint(new_fingerprint, rank),\n                }\n                for rank in range(num_shards)\n            ]\n\n        transformed_shards: list[Optional[Dataset]] = [None] * num_shards\n        for rank in range(num_shards):\n            try:\n                
job_kwargs = kwargs_per_job[rank]\n                assert job_kwargs is not None\n                transformed_shards[rank] = load_processed_shard_from_cache(job_kwargs)\n                kwargs_per_job[rank] = None\n            except NonExistentDatasetError:\n                pass\n\n        if unprocessed_kwargs_per_job := [kwargs for kwargs in kwargs_per_job if kwargs is not None]:\n            if len(unprocessed_kwargs_per_job) != num_shards:\n                logger.info(\n                    f\"Reprocessing {len(unprocessed_kwargs_per_job)}/{num_shards} shards because some of them were \"\n                    \"missing from the cache.\"\n                )\n\n            pbar_total = len(self)\n            pbar_initial = len(existing_cache_files) * pbar_total // num_shards\n            if batched and drop_last_batch:\n                batch_size = batch_size or 1\n                pbar_initial = pbar_initial // num_shards // batch_size * num_shards * batch_size\n                pbar_total = pbar_total // num_shards // batch_size * num_shards * batch_size\n\n            with hf_tqdm(\n                unit=\" examples\",\n                initial=pbar_initial,\n                total=pbar_total,\n                desc=(desc or \"Map\") + (f\" (num_proc={num_proc})\" if num_proc is not None and num_proc >= 1 else \"\"),\n            ) as pbar:\n                shards_done = 0\n\n                def check_if_shard_done(rank: Optional[int], done: bool, content: Union[Dataset, int]) -> None:\n                    nonlocal shards_done\n                    if done:\n                        shards_done += 1\n                        logger.debug(f\"Finished processing shard number {rank} of {num_shards}.\")\n                        assert isinstance(content, Dataset)\n                        transformed_shards[rank or 0] = content\n                    else:\n                        assert isinstance(content, int)\n                        pbar.update(content)\n\n                
if num_proc is not None and num_proc >= 1:\n                    with mp.Pool(num_proc) as pool:\n                        os.environ = prev_env\n                        logger.info(f\"Spawning {num_proc} processes\")\n\n                        for rank, done, content in iflatmap_unordered(\n                            pool, Dataset._map_single, kwargs_iterable=unprocessed_kwargs_per_job\n                        ):\n                            check_if_shard_done(rank, done, content)\n\n                        pool.close()\n                        pool.join()\n                else:\n                    for unprocessed_kwargs in unprocessed_kwargs_per_job:\n                        for rank, done, content in Dataset._map_single(**unprocessed_kwargs):\n                            check_if_shard_done(rank, done, content)\n\n            # Avoids PermissionError on Windows (the error: https://github.com/huggingface/datasets/actions/runs/4026734820/jobs/6921621805)\n            for job_kwargs in unprocessed_kwargs_per_job:\n                if \"shard\" in job_kwargs:\n                    del job_kwargs[\"shard\"]\n        else:\n            logger.info(f\"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}\")\n\n        all_transformed_shards = [shard for shard in transformed_shards if shard is not None]\n        if len(transformed_shards) != len(all_transformed_shards):\n            raise ValueError(\n                f\"Failed to retrieve results from map: result list {transformed_shards} still contains None - \"\n                \"at least one worker failed to return its results\"\n            )\n\n        if num_shards == 1:\n            result = all_transformed_shards[0]\n        else:\n            logger.info(f\"Concatenating {num_shards} shards\")\n            result = _concatenate_map_style_datasets(all_transformed_shards)\n\n        # update fingerprint if the dataset changed\n        result._fingerprint = (\n            
new_fingerprint\n            if any(\n                transformed_shard._fingerprint != shard._fingerprint\n                for transformed_shard, shard in zip(all_transformed_shards, shards)\n            )\n            else self._fingerprint\n        )\n\n        return result\n\n    @staticmethod\n    def _map_single(\n        shard: \"Dataset\",\n        function: Optional[Callable] = None,\n        with_indices: bool = False,\n        with_rank: bool = False,\n        input_columns: Optional[list[str]] = None,\n        batched: bool = False,\n        batch_size: Optional[int] = 1000,\n        drop_last_batch: bool = False,\n        remove_columns: Optional[list[str]] = None,\n        keep_in_memory: bool = False,\n        cache_file_name: Optional[str] = None,\n        writer_batch_size: Optional[int] = 1000,\n        features: Optional[Features] = None,\n        disable_nullable: bool = False,\n        fn_kwargs: Optional[dict] = None,\n        new_fingerprint: Optional[str] = None,\n        rank: Optional[int] = None,\n        offset: int = 0,\n        try_original_type: Optional[bool] = True,\n        on_mixed_types: Optional[Literal[\"use_json\"]] = \"use_json\",\n    ) -> Iterable[tuple[Optional[int], bool, Union[int, \"Dataset\"]]]:\n        \"\"\"Apply a function to all the elements in the table (individually or in batches)\n        and update the table (if function does update examples).\n\n        Args:\n            shard (`datasets.Dataset`): Dataset to map the transform on.\n            function (`Callable`): with one of the following signature:\n                - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False` and `with_rank=False`\n                - `function(example: Dict[str, Any], *extra_args) -> Dict[str, Any]` if `batched=False` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)\n                - `function(batch: Dict[str, List]) -> Dict[str, List]` if `batched=True` and 
`with_indices=False` and `with_rank=False`\n                - `function(batch: Dict[str, List], *extra_args) -> Dict[str, List]` if `batched=True` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)\n\n                For advanced usage, the function can also return a `pyarrow.Table`.\n                Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged.\n                If no function is provided, default to identity function: lambda x: x\n            with_indices (`bool`, defaults to `False`): Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx[, rank]): ...`.\n            with_rank (`bool`, default `False`): Provide process rank to `function`. Note that in this case the signature of `function` should be `def function(example[, idx], rank): ...`.\n            input_columns (`Optional[List[str]]`, defaults to `None`): The columns to be passed into `function` as\n                positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.\n            batched (`bool`, defaults to `False`): Provide batch of examples to `function`\n            batch_size (`int`, optional, defaults to `1000`): Number of examples per batch provided to `function` if `batched=True`\n                `batch_size <= 0` or `batch_size == None`: Provide the full dataset as a single batch to `function`\n            drop_last_batch (`bool`, default: `False`): Whether a last batch smaller than the batch_size should be\n                dropped instead of being processed by the function.\n            remove_columns (`Optional[List[str]]`, defaults to `None`): Remove a selection of columns while doing the mapping.\n                Columns will be removed before updating the examples with the output of `function`, i.e. 
if `function` is adding\n                columns with names in `remove_columns`, these columns will be kept.\n            keep_in_memory (`bool`, defaults to `False`): Keep the dataset in memory instead of writing it to a cache file.\n            cache_file_name (`str`, optional, defaults to `None`): Provide the name of a path for the cache file. It is used to store the\n                results of the computation instead of the automatically generated cache file name.\n            writer_batch_size (`int`, default `1000`): Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `.map()`.\n            features (`Optional[datasets.Features]`, defaults to `None`): Use a specific Features to store the cache file\n                instead of the automatically generated one.\n            disable_nullable (`bool`, defaults to `False`): Disallow null values in the table.\n            fn_kwargs (`Dict`, optional, defaults to `None`): Keyword arguments to be passed to `function`\n            new_fingerprint (`str`, optional, defaults to `None`): the new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments\n            rank: (`int`, optional, defaults to `None`): If specified, this is the process rank when doing multiprocessing\n            offset: (`int`, defaults to 0): If specified, this is an offset applied to the indices passed to `function` if `with_indices=True`.\n            try_original_type: (`Optional[bool]`, defaults to `True`):\n                Try to keep the types of the original columns (e.g. 
int32 -> int32).\n                Set to False if you want to always infer new types.\n            on_mixed_types (`Literal[\"use_json\"]`, *optional*, defaults to `None`):\n                If \"use_json\", use the Json() type for mixed-types fields,\n                i.e. unstructured fields that contain data without a predefined schema.\n                In this case, a field with mixed type is set to Json().\n\n                This allow loading lists with a mix of strings/integers/floats\n                for example, or dictionaries with arbitrary value types.\n\n                <Added version=\"4.7.0\"/>\n        \"\"\"\n        if fn_kwargs is None:\n            fn_kwargs = {}\n\n        # If we do batch computation but no batch size is provided, default to the full dataset\n        if batched and (batch_size is None or batch_size <= 0):\n            batch_size = shard.num_rows\n\n        # We set this variable to True after processing the first example/batch in\n        # `apply_function_on_filtered_inputs` if the map function returns a dict.\n        # If set to False, no new arrow table will be created\n\n        update_data = None\n\n        format_kwargs = shard._format_kwargs.copy()\n        # Lazy formatting is only available for the default format (None/python)\n        if not input_columns and shard._format_type is None:\n            format_kwargs[\"lazy\"] = True\n        input_formatter = get_formatter(\n            shard._format_type,\n            features=shard.features,\n            **format_kwargs,\n        )\n\n        check_same_num_examples = batched and len(shard.list_indexes()) > 0\n\n        def validate_function_output(processed_inputs):\n            \"\"\"Validate output of the map function.\"\"\"\n            allowed_processed_inputs_types = (Mapping, pa.Table, pd.DataFrame)\n            if config.POLARS_AVAILABLE and \"polars\" in sys.modules:\n                import polars as pl\n\n                allowed_processed_inputs_types += 
(pl.DataFrame,)\n            if processed_inputs is not None and not isinstance(processed_inputs, allowed_processed_inputs_types):\n                raise TypeError(\n                    f\"Provided `function` which is applied to all elements of table returns a variable of type {type(processed_inputs)}. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects.\"\n                )\n            if batched and isinstance(processed_inputs, Mapping):\n                allowed_batch_return_types = (list, np.ndarray, pd.Series)\n                if config.POLARS_AVAILABLE and \"polars\" in sys.modules:\n                    import polars as pl\n\n                    allowed_batch_return_types += (pl.Series, pl.DataFrame)\n                if config.TF_AVAILABLE and \"tensorflow\" in sys.modules:\n                    import tensorflow as tf\n\n                    allowed_batch_return_types += (tf.Tensor,)\n                if config.TORCH_AVAILABLE and \"torch\" in sys.modules:\n                    import torch\n\n                    allowed_batch_return_types += (torch.Tensor,)\n                if config.JAX_AVAILABLE and \"jax\" in sys.modules:\n                    import jax.numpy as jnp\n\n                    allowed_batch_return_types += (jnp.ndarray,)\n                all_dict_values_are_lists = all(\n                    isinstance(value, allowed_batch_return_types) for value in processed_inputs.values()\n                )\n                if all_dict_values_are_lists is False:\n                    raise TypeError(\n                        f\"Provided `function` which is applied to all elements of table returns a `dict` of types {[type(x) for x in processed_inputs.values()]}. 
When using `batched=True`, make sure provided `function` returns a `dict` of types like `{allowed_batch_return_types}`.\"\n                    )\n\n        def prepare_inputs(pa_inputs, indices, offset=0):\n            \"\"\"Utility to apply the function on a selection of columns.\"\"\"\n            inputs = format_table(\n                pa_inputs,\n                0 if not batched else range(pa_inputs.num_rows),\n                format_columns=input_columns,\n                formatter=input_formatter,\n            )\n            fn_args = [inputs] if input_columns is None else [inputs[col] for col in input_columns]\n            if offset == 0:\n                effective_indices = indices\n            else:\n                effective_indices = [i + offset for i in indices] if isinstance(indices, list) else indices + offset\n            additional_args = ()\n            if with_indices:\n                additional_args += (effective_indices,)\n            if with_rank:\n                additional_args += (rank,)\n            return inputs, fn_args, additional_args, fn_kwargs\n\n        def prepare_outputs(pa_inputs, inputs, processed_inputs):\n            nonlocal update_data\n            if not (update_data := (processed_inputs is not None)):\n                return None\n            if isinstance(processed_inputs, LazyDict):\n                processed_inputs = {\n                    k: v for k, v in processed_inputs.data.items() if k not in processed_inputs.keys_to_format\n                }\n                returned_lazy_dict = True\n            else:\n                returned_lazy_dict = False\n            validate_function_output(processed_inputs)\n            if shard._format_type or input_columns:\n                # TODO(QL, MS): ideally the behavior should be the same even if the dataset is formatted (may require major release)\n                inputs_to_merge = dict(zip(pa_inputs.column_names, pa_inputs.itercolumns()))\n            elif isinstance(inputs, 
LazyDict):\n                inputs_to_merge = {\n                    k: (v if k not in inputs.keys_to_format else pa_inputs[k]) for k, v in inputs.data.items()\n                }\n            else:\n                inputs_to_merge = inputs\n            if remove_columns is not None:\n                for column in remove_columns:\n                    # `function` can modify input in-place causing column to be already removed.\n                    if column in inputs_to_merge:\n                        inputs_to_merge.pop(column)\n                    if returned_lazy_dict and column in processed_inputs:\n                        processed_inputs.pop(column)\n            if check_same_num_examples:\n                input_num_examples = len(pa_inputs)\n                processed_inputs_num_examples = len(processed_inputs[next(iter(processed_inputs.keys()))])\n                if input_num_examples != processed_inputs_num_examples:\n                    raise DatasetTransformationNotAllowedError(\n                        \"Using `.map` in batched mode on a dataset with attached indexes is allowed only if it doesn't create or remove existing examples. 
You can first run `.drop_index() to remove your index and then re-add it.\"\n                    ) from None\n            if isinstance(inputs, Mapping) and isinstance(processed_inputs, Mapping):\n                # The .map() transform *updates* the dataset:\n                # the output dictionary contains both the input data and the output data.\n                # The output dictionary may contain Arrow values from `inputs_to_merge` so that we can re-write them efficiently.\n                return {**inputs_to_merge, **processed_inputs}\n            else:\n                return processed_inputs\n\n        def apply_function(pa_inputs, indices, offset=0):\n            \"\"\"Utility to apply the function on a selection of columns.\"\"\"\n            inputs, fn_args, additional_args, fn_kwargs = prepare_inputs(pa_inputs, indices, offset=offset)\n            processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n            return prepare_outputs(pa_inputs, inputs, processed_inputs)\n\n        async def async_apply_function(pa_inputs, indices, offset=0):\n            \"\"\"Utility to apply the function on a selection of columns. 
Same code but async\"\"\"\n            inputs, fn_args, additional_args, fn_kwargs = prepare_inputs(pa_inputs, indices, offset=offset)\n            processed_inputs = await function(*fn_args, *additional_args, **fn_kwargs)\n            return prepare_outputs(pa_inputs, inputs, processed_inputs)\n\n        def init_buffer_and_writer():\n            # Prepare output buffer and batched writer in memory or on file if we update the table\n            writer_features = features\n            if writer_features is None:\n                writer_features = shard.features\n                update_features = True\n            else:\n                update_features = False\n            if keep_in_memory or cache_file_name is None:\n                buf_writer = pa.BufferOutputStream()\n                tmp_file = None\n                writer = ArrowWriter(\n                    features=writer_features,\n                    stream=buf_writer,\n                    writer_batch_size=writer_batch_size,\n                    update_features=update_features,\n                    fingerprint=new_fingerprint,\n                    disable_nullable=disable_nullable,\n                    on_mixed_types=on_mixed_types,\n                )\n            else:\n                buf_writer = None\n                logger.info(f\"Caching processed dataset at {cache_file_name}\")\n                cache_dir = os.path.dirname(cache_file_name)\n                os.makedirs(cache_dir, exist_ok=True)\n                tmp_file = tempfile.NamedTemporaryFile(\"wb\", dir=cache_dir, delete=False)\n                writer = ArrowWriter(\n                    features=writer_features,\n                    path=tmp_file.name,\n                    writer_batch_size=writer_batch_size,\n                    update_features=update_features,\n                    fingerprint=new_fingerprint,\n                    disable_nullable=disable_nullable,\n                    on_mixed_types=on_mixed_types,\n                )\n        
    return buf_writer, writer, tmp_file\n\n        tasks: list[asyncio.Task] = []\n        if inspect.iscoroutinefunction(function):\n            try:\n                loop = asyncio.get_running_loop()\n            except RuntimeError:\n                loop = asyncio.new_event_loop()\n        else:\n            loop = None\n\n        def iter_outputs(shard_iterable):\n            nonlocal tasks, loop\n            if inspect.iscoroutinefunction(function):\n                indices: Union[list[int], list[list[int]]] = []\n                for i, example in shard_iterable:\n                    indices.append(i)\n                    tasks.append(loop.create_task(async_apply_function(example, i, offset=offset)))\n                    # keep the total active tasks under a certain number\n                    if len(tasks) >= config.MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL:\n                        done, pending = loop.run_until_complete(\n                            asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)\n                        )\n                        while tasks and len(pending) >= config.MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL:\n                            done, pending = loop.run_until_complete(\n                                asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)\n                            )\n                    # yield finished tasks\n                    while tasks and tasks[0].done():\n                        yield indices.pop(0), tasks.pop(0).result()\n                while tasks:\n                    yield indices[0], loop.run_until_complete(tasks[0])\n                    indices.pop(0), tasks.pop(0)\n            else:\n                for i, example in shard_iterable:\n                    yield i, apply_function(example, i, offset=offset)\n\n        num_examples_progress_update = 0\n        # If `update_data` is True after processing the first example/batch, initialize these resources with 
`init_buffer_and_writer`\n        buf_writer, writer, tmp_file = None, None, None\n\n        # Optionally initialize the writer as a context manager\n        with contextlib.ExitStack() as stack:\n            try:\n                arrow_formatted_shard = shard.with_format(\"arrow\")\n\n                # Loop over single examples or batches and write to buffer/file if examples are to be updated\n                if not batched:\n                    shard_iterable = enumerate(arrow_formatted_shard)\n                else:\n                    num_rows = len(shard) if not drop_last_batch else len(shard) // batch_size * batch_size\n                    shard_iterable = zip(\n                        (list(range(i, min(i + batch_size, num_rows))) for i in range(0, num_rows, batch_size)),\n                        arrow_formatted_shard.iter(batch_size, drop_last_batch=drop_last_batch),\n                    )\n                if not batched:\n                    _time = time.time()\n                    for i, example in iter_outputs(shard_iterable):\n                        if update_data:\n                            if i == 0:\n                                buf_writer, writer, tmp_file = init_buffer_and_writer()\n                                stack.enter_context(writer)\n                            if isinstance(example, pa.Table):\n                                writer.write_row(example)\n                            elif isinstance(example, pd.DataFrame):\n                                writer.write_row(pa.Table.from_pandas(example))\n                            elif config.POLARS_AVAILABLE and \"polars\" in sys.modules:\n                                import polars as pl\n\n                                if isinstance(example, pl.DataFrame):\n                                    writer.write_row(example.to_arrow())\n                                else:\n                                    writer.write(example)\n                            else:\n                    
            writer.write(example)\n                        num_examples_progress_update += 1\n                        if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:\n                            _time = time.time()\n                            yield rank, False, num_examples_progress_update\n                            num_examples_progress_update = 0\n                else:\n                    _time = time.time()\n                    for i, batch in iter_outputs(shard_iterable):\n                        num_examples_in_batch = len(i)\n                        if update_data:\n                            if i and i[0] == 0:\n                                buf_writer, writer, tmp_file = init_buffer_and_writer()\n                                stack.enter_context(writer)\n                            if isinstance(batch, pa.Table):\n                                writer.write_table(batch)\n                            elif isinstance(batch, pd.DataFrame):\n                                writer.write_table(pa.Table.from_pandas(batch))\n                            elif config.POLARS_AVAILABLE and \"polars\" in sys.modules:\n                                import polars as pl\n\n                                if isinstance(batch, pl.DataFrame):\n                                    writer.write_table(batch.to_arrow())\n                                else:\n                                    writer.write_batch(batch, try_original_type=try_original_type)\n                            else:\n                                writer.write_batch(batch, try_original_type=try_original_type)\n                        num_examples_progress_update += num_examples_in_batch\n                        if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:\n                            _time = time.time()\n                            yield rank, False, num_examples_progress_update\n                            num_examples_progress_update = 0\n                if 
update_data and writer is not None:\n                    writer.finalize()  # close_stream=bool(buf_writer is None))  # We only close if we are writing in a file\n            except (Exception, KeyboardInterrupt):\n                yield rank, False, num_examples_progress_update\n                if update_data:\n                    if writer is not None:\n                        writer.finalize()\n                    if tmp_file is not None:\n                        tmp_file.close()\n                        if os.path.exists(tmp_file.name):\n                            os.remove(tmp_file.name)\n                if loop:\n                    logger.debug(f\"Canceling {len(tasks)} async tasks.\")\n                    for task in tasks:\n                        task.cancel(msg=\"KeyboardInterrupt\")\n                    try:\n                        loop.run_until_complete(asyncio.gather(*tasks))\n                    except (asyncio.CancelledError, ValueError):\n                        logger.debug(\"Tasks canceled.\")\n                raise\n\n        yield rank, False, num_examples_progress_update\n        if update_data and tmp_file is not None:\n            tmp_file.close()\n            shutil.move(tmp_file.name, cache_file_name)\n            umask = os.umask(0o666)\n            os.umask(umask)\n            os.chmod(cache_file_name, 0o666 & ~umask)\n\n        if update_data:\n            # Create new Dataset from buffer or file\n            info = shard.info.copy()\n            info.features = writer._features\n            if buf_writer is None:\n                yield rank, True, Dataset.from_file(cache_file_name, info=info, split=shard.split)\n            else:\n                yield rank, True, Dataset.from_buffer(buf_writer.getvalue(), info=info, split=shard.split)\n        else:\n            yield rank, True, shard\n\n    @transmit_format\n    @fingerprint_transform(inplace=False)\n    def batch(\n        self,\n        batch_size: int,\n        drop_last_batch: 
bool = False,\n        num_proc: Optional[int] = None,\n        new_fingerprint: Optional[str] = None,\n    ) -> \"Dataset\":\n        \"\"\"\n        Group samples from the dataset into batches.\n\n        Args:\n            batch_size (`int`):\n                The number of samples in each batch.\n            drop_last_batch (`bool`, defaults to `False`):\n                Whether to drop the last incomplete batch.\n            num_proc (`int`, *optional*, defaults to `None`):\n                Max number of processes when generating cache. Already cached shards are loaded sequentially.\n            new_fingerprint (`str`, *optional*, defaults to `None`):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.\n\n        Returns:\n            [`Dataset`]: A new Dataset where each item is a batch of multiple samples from the original dataset.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n        >>> batched_ds = ds.batch(batch_size=4)\n        >>> batched_ds[0]\n        {'text': ['compassionately explores the seemingly irreconcilable situation...', ...],  # 4 items\n        'label': [1, 1, 1, 1]}\n        ```\n        \"\"\"\n\n        def batch_fn(example):\n            return {k: [v] for k, v in example.items()}\n\n        return self.map(\n            batch_fn,\n            batched=True,\n            batch_size=batch_size,\n            drop_last_batch=drop_last_batch,\n            num_proc=num_proc,\n            new_fingerprint=new_fingerprint,\n            desc=\"Batching examples\",\n        )\n\n    @transmit_format\n    @fingerprint_transform(\n        inplace=False, ignore_kwargs=[\"load_from_cache_file\", \"cache_file_name\", \"desc\"], version=\"2.0.1\"\n    )\n    def filter(\n      
  self,\n        function: Optional[Callable] = None,\n        with_indices: bool = False,\n        with_rank: bool = False,\n        input_columns: Optional[Union[str, list[str]]] = None,\n        batched: bool = False,\n        batch_size: Optional[int] = 1000,\n        keep_in_memory: bool = False,\n        load_from_cache_file: Optional[bool] = None,\n        cache_file_name: Optional[str] = None,\n        writer_batch_size: Optional[int] = 1000,\n        fn_kwargs: Optional[dict] = None,\n        num_proc: Optional[int] = None,\n        suffix_template: str = \"_{rank:05d}_of_{num_proc:05d}\",\n        new_fingerprint: Optional[str] = None,\n        desc: Optional[str] = None,\n    ) -> \"Dataset\":\n        \"\"\"Apply a filter function to all the elements in the table in batches\n        and update the table so that the dataset only includes examples according to the filter function.\n\n        If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simultaneous calls (configurable).\n        It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.\n\n        Args:\n            function (`Callable`): Callable with one of the following signatures:\n\n                - `function(example: Dict[str, Any]) -> bool` if `batched=False` and `with_indices=False` and `with_rank=False`\n                - `function(example: Dict[str, Any], *extra_args) -> bool` if `batched=False` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)\n                - `function(batch: Dict[str, List]) -> List[bool]` if `batched=True` and `with_indices=False` and `with_rank=False`\n                - `function(batch: Dict[str, List], *extra_args) -> List[bool]` if `batched=True` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)\n\n                If the function is asynchronous, then `filter` will run your 
function in parallel.\n                If no function is provided, defaults to an always `True` function: `lambda x: True`.\n            with_indices (`bool`, defaults to `False`):\n                Provide example indices to `function`. Note that in this case the\n                signature of `function` should be `def function(example, idx[, rank]): ...`.\n            with_rank (`bool`, defaults to `False`):\n                Provide process rank to `function`. Note that in this case the\n                signature of `function` should be `def function(example[, idx], rank): ...`.\n            input_columns (`str` or `List[str]`, *optional*):\n                The columns to be passed into `function` as\n                positional arguments. If `None`, a `dict` mapping to all formatted columns is passed as one argument.\n            batched (`bool`, defaults to `False`):\n                Provide batch of examples to `function`.\n            batch_size (`int`, *optional*, defaults to `1000`):\n                Number of examples per batch provided to `function` if\n                `batched = True`. If `batched = False`, one example per batch is passed to `function`.\n                If `batch_size <= 0` or `batch_size == None`, provide the full dataset as a single batch to `function`.\n            keep_in_memory (`bool`, defaults to `False`):\n                Keep the dataset in memory instead of writing it to a cache file.\n            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):\n                If a cache file storing the current computation from `function`\n                can be identified, use it instead of recomputing.\n            cache_file_name (`str`, *optional*):\n                Provide the name of a path for the cache file. 
It is used to store the\n                results of the computation instead of the automatically generated cache file name.\n            writer_batch_size (`int`, defaults to `1000`):\n                Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consumes less temporary memory while running `map`.\n            fn_kwargs (`dict`, *optional*):\n                Keyword arguments to be passed to `function`.\n            num_proc (`int`, *optional*, defaults to `None`):\n                 The number of processes to use for multiprocessing.\n                - If `None` or `0`, no multiprocessing is used and the operation runs in the main process.\n                - If greater than `1`, one or multiple worker processes are used to process data in parallel.\n                 Note: The function passed to `filter()` must be picklable for multiprocessing to work correctly\n                 (i.e., prefer functions defined at the top level of a module, not inside another function or class).\n            suffix_template (`str`):\n                If `cache_file_name` is specified, then this suffix will be added at the end of the base name of each cache file.\n                For example, if `cache_file_name` is `\"processed.arrow\"`, then for `rank = 1` and `num_proc = 4`,\n                the resulting file would be `\"processed_00001_of_00004.arrow\"` for the default suffix (default\n                `_{rank:05d}_of_{num_proc:05d}`).\n            new_fingerprint (`str`, *optional*):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.\n            desc (`str`, *optional*, defaults to `None`):\n                Meaningful description to be 
displayed alongside with the progress bar while filtering examples.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds = ds.filter(lambda x: x[\"label\"] == 1)\n        >>> ds\n        Dataset({\n            features: ['text', 'label'],\n            num_rows: 533\n        })\n        ```\n\n        \"\"\"\n        if len(self.list_indexes()) > 0:\n            raise DatasetTransformationNotAllowedError(\n                \"Using `.filter` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it.`\"\n            )\n\n        if function is None:\n            function = lambda x: True  # noqa: E731\n\n        if len(self) == 0:\n            return self\n\n        # We generally batch the underlying map() to get faster throughput,\n        # but in case of async we force batch_size=1 to enable parallelism\n        if inspect.iscoroutinefunction(function) and not batched:\n            batch_size = 1\n\n        indices = self.map(\n            function=partial(\n                async_get_indices_from_mask_function\n                if inspect.iscoroutinefunction(function)\n                else get_indices_from_mask_function,\n                function,\n                batched,\n                with_indices,\n                with_rank,\n                input_columns,\n                self._indices,\n            ),\n            with_indices=True,\n            with_rank=True,\n            features=Features({\"indices\": Value(\"uint64\")}),\n            batched=True,\n            batch_size=batch_size,\n            remove_columns=self.column_names,\n            keep_in_memory=keep_in_memory,\n            load_from_cache_file=load_from_cache_file,\n            cache_file_name=cache_file_name,\n            writer_batch_size=writer_batch_size,\n            
fn_kwargs=fn_kwargs,\n            num_proc=num_proc,\n            suffix_template=suffix_template,\n            new_fingerprint=new_fingerprint,\n            input_columns=input_columns,\n            desc=desc or \"Filter\",\n        )\n        new_dataset = copy.deepcopy(self)\n        new_dataset._indices = indices.data\n        new_dataset._fingerprint = new_fingerprint\n        return new_dataset\n\n    @transmit_format\n    @fingerprint_transform(inplace=False, ignore_kwargs=[\"cache_file_name\"])\n    def flatten_indices(\n        self,\n        keep_in_memory: bool = False,\n        cache_file_name: Optional[str] = None,\n        writer_batch_size: Optional[int] = 1000,\n        features: Optional[Features] = None,\n        disable_nullable: bool = False,\n        num_proc: Optional[int] = None,\n        new_fingerprint: Optional[str] = None,\n    ) -> \"Dataset\":\n        \"\"\"Create and cache a new Dataset by flattening the indices mapping.\n\n        Args:\n            keep_in_memory (`bool`, defaults to `False`):\n                Keep the dataset in memory instead of writing it to a cache file.\n            cache_file_name (`str`, *optional*, default `None`):\n                Provide the name of a path for the cache file. 
It is used to store the\n                results of the computation instead of the automatically generated cache file name.\n            writer_batch_size (`int`, defaults to `1000`):\n                Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consumes less temporary memory while running `map`.\n            features (`Optional[datasets.Features]`, defaults to `None`):\n                Use a specific [`Features`] to store the cache file\n                instead of the automatically generated one.\n            disable_nullable (`bool`, defaults to `False`):\n                Disallow null values in the table.\n            num_proc (`int`, optional, default `None`):\n                Max number of processes when generating cache. Already cached shards are loaded sequentially.\n            new_fingerprint (`str`, *optional*, defaults to `None`):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.\n        \"\"\"\n\n        return self.map(\n            batched=True,  # for speed\n            keep_in_memory=keep_in_memory,\n            cache_file_name=cache_file_name,\n            writer_batch_size=writer_batch_size,\n            features=features,\n            disable_nullable=disable_nullable,\n            new_fingerprint=new_fingerprint,\n            desc=\"Flattening the indices\",\n            num_proc=num_proc,\n        )\n\n    def _new_dataset_with_indices(\n        self,\n        indices_cache_file_name: Optional[str] = None,\n        indices_buffer: Optional[pa.Buffer] = None,\n        fingerprint: Optional[str] = None,\n    ) -> \"Dataset\":\n        \"\"\"Return a new Dataset obtained by adding indices 
(provided in indices_cache_file_name or in a buffer) to the\n        current Dataset.\n        \"\"\"\n\n        if indices_cache_file_name is None and indices_buffer is None:\n            raise ValueError(\"At least one of indices_cache_file_name or indices_buffer must be provided.\")\n\n        if fingerprint is None:\n            raise ValueError(\"please specify a fingerprint for the dataset with indices\")\n\n        if indices_cache_file_name is not None:\n            indices_table = MemoryMappedTable.from_file(indices_cache_file_name)\n        else:\n            indices_table = InMemoryTable.from_buffer(indices_buffer)\n\n        # Return new Dataset object\n        # don't forget to copy the objects\n        return Dataset(\n            self._data,\n            info=self.info.copy(),\n            split=self.split,\n            indices_table=indices_table,\n            fingerprint=fingerprint,\n        )\n\n    @transmit_format\n    @fingerprint_transform(inplace=False, ignore_kwargs=[\"indices_cache_file_name\"])\n    def select(\n        self,\n        indices: Iterable,\n        keep_in_memory: bool = False,\n        indices_cache_file_name: Optional[str] = None,\n        writer_batch_size: Optional[int] = 1000,\n        new_fingerprint: Optional[str] = None,\n    ) -> \"Dataset\":\n        \"\"\"Create a new dataset with rows selected following the list/array of indices.\n\n        Args:\n            indices (`range`, `list`, `iterable`, `ndarray` or `Series`):\n                Range, list or 1D-array of integer indices for indexing.\n                If the indices correspond to a contiguous range, the Arrow table is simply sliced.\n                However passing a list of indices that are not contiguous creates indices mapping, which is much less efficient,\n                but still faster than recreating an Arrow table made of the requested rows.\n            keep_in_memory (`bool`, defaults to `False`):\n                Keep the indices mapping in 
memory instead of writing it to a cache file.\n            indices_cache_file_name (`str`, *optional*, defaults to `None`):\n                Provide the name of a path for the cache file. It is used to store the\n                indices mapping instead of the automatically generated cache file name.\n            writer_batch_size (`int`, defaults to `1000`):\n                Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.\n            new_fingerprint (`str`, *optional*, defaults to `None`):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds = ds.select(range(4))\n        >>> ds\n        Dataset({\n            features: ['text', 'label'],\n            num_rows: 4\n        })\n        ```\n        \"\"\"\n        if keep_in_memory and indices_cache_file_name is not None:\n            raise ValueError(\"Please use either `keep_in_memory` or `indices_cache_file_name` but not both.\")\n\n        if len(self.list_indexes()) > 0:\n            raise DatasetTransformationNotAllowedError(\n                \"Using `.select` on a dataset with attached indexes is not allowed. 
You can first run `.drop_index() to remove your index and then re-add it.\"\n            )\n\n        # If the array is empty we do nothing\n        if len(self) == 0:\n            return self\n\n        # If indices is a PyArrow array, we convert to NumPy\n        if isinstance(indices, (pa.Array, pa.ChunkedArray)):\n            indices = indices.to_numpy().astype(np.int64)\n\n        # Convert generator objects to lists\n        if isinstance(indices, Iterator):\n            indices = list(indices)\n\n        # If the indices are contiguous, simply slice the arrow table\n        if isinstance(indices, range):\n            if _is_range_contiguous(indices) and indices.start >= 0:\n                start, length = indices.start, indices.stop - indices.start\n                return self._select_contiguous(start, length, new_fingerprint=new_fingerprint)\n        else:\n            try:\n                start = next(iter(indices))\n            except StopIteration:\n                # if `indices` is an empty iterable, we return an empty dataset\n                return self._select_contiguous(0, 0, new_fingerprint=new_fingerprint)\n            if start >= 0:\n                counter_from_start = itertools.count(start=start)\n                if all(i == j for i, j in zip(indices, counter_from_start)):\n                    length = next(counter_from_start) - start\n                    return self._select_contiguous(start, length, new_fingerprint=new_fingerprint)\n\n        # If not contiguous, we need to create a new indices mapping\n        return self._select_with_indices_mapping(\n            indices,\n            keep_in_memory=keep_in_memory,\n            indices_cache_file_name=indices_cache_file_name,\n            writer_batch_size=writer_batch_size,\n            new_fingerprint=new_fingerprint,\n        )\n\n    @transmit_format\n    @fingerprint_transform(inplace=False)\n    def _select_contiguous(\n        self,\n        start: int,\n        length: int,\n        
new_fingerprint: Optional[str] = None,\n    ) -> \"Dataset\":\n        \"\"\"Create a new dataset with rows from a contiguous slice of data.\n        The slice is defined by its start index and its length.\n\n        Args:\n            start (`int`): start index.\n            length (`int`): length of the slice to select.\n            new_fingerprint (`str`, optional, default `None`): the new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds._select_contiguous(0, 4)\n        Dataset({\n            features: ['text', 'label'],\n            num_rows: 4\n        })\n        ```\n        \"\"\"\n        if len(self.list_indexes()) > 0:\n            raise DatasetTransformationNotAllowedError(\n                \"Using `.select` on a dataset with attached indexes is not allowed. 
You can first run `.drop_index() to remove your index and then re-add it.\"\n            )\n\n        # If the array is empty we do nothing\n        if len(self) == 0:\n            return self\n\n        _check_valid_indices_value(start, len(self))\n        _check_valid_indices_value(start + length - 1, len(self))\n        if self._indices is None or length == 0:\n            return Dataset(\n                self.data.slice(start, length),\n                info=self.info.copy(),\n                split=self.split,\n                fingerprint=new_fingerprint,\n            )\n        else:\n            return Dataset(\n                self.data,\n                info=self.info.copy(),\n                split=self.split,\n                indices_table=self._indices.slice(start, length),\n                fingerprint=new_fingerprint,\n            )\n\n    @transmit_format\n    @fingerprint_transform(inplace=False, ignore_kwargs=[\"indices_cache_file_name\"])\n    def _select_with_indices_mapping(\n        self,\n        indices: Iterable,\n        keep_in_memory: bool = False,\n        indices_cache_file_name: Optional[str] = None,\n        writer_batch_size: Optional[int] = 1000,\n        new_fingerprint: Optional[str] = None,\n    ) -> \"Dataset\":\n        \"\"\"Create a new dataset with rows selected following the list/array of indices.\n        The new dataset is made by creating a new indices mapping on top of the main arrow table.\n\n        Args:\n            indices (sequence, iterable, range, ndarray or Series): List or 1D-array of integer indices for indexing.\n            keep_in_memory (`bool`, default `False`): Keep the indices mapping in memory instead of writing it to a cache file.\n            indices_cache_file_name (`str`, optional, default `None`): Provide the name of a path for the cache file. 
It is used to store the\n                indices mapping instead of the automatically generated cache file name.\n            writer_batch_size (`int`, default `1000`): Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `.map()`.\n            new_fingerprint (`str`, optional, default `None`): the new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds._select_with_indices_mapping(range(4))\n        Dataset({\n            features: ['text', 'label'],\n            num_rows: 4\n        })\n        ```\n        \"\"\"\n        if keep_in_memory and indices_cache_file_name is not None:\n            raise ValueError(\"Please use either `keep_in_memory` or `indices_cache_file_name` but not both.\")\n\n        if len(self.list_indexes()) > 0:\n            raise DatasetTransformationNotAllowedError(\n                \"Using `.select` on a dataset with attached indexes is not allowed. 
You can first run `.drop_index() to remove your index and then re-add it.\"\n            )\n\n        # If the array is empty we do nothing\n        if len(self) == 0:\n            return self\n\n        # Prepare the writer for our indices arrow table\n        if keep_in_memory or indices_cache_file_name is None:\n            buf_writer = pa.BufferOutputStream()\n            tmp_file = None\n            writer = ArrowWriter(\n                stream=buf_writer, writer_batch_size=writer_batch_size, fingerprint=new_fingerprint, unit=\"indices\"\n            )\n        else:\n            buf_writer = None\n            logger.info(f\"Caching indices mapping at {indices_cache_file_name}\")\n            cache_dir = os.path.dirname(indices_cache_file_name)\n            os.makedirs(cache_dir, exist_ok=True)\n            tmp_file = tempfile.NamedTemporaryFile(\"wb\", dir=cache_dir, delete=False)\n            writer = ArrowWriter(\n                path=tmp_file.name, writer_batch_size=writer_batch_size, fingerprint=new_fingerprint, unit=\"indices\"\n            )\n\n        indices = indices if isinstance(indices, list) else list(indices)\n\n        size = len(self)\n        if indices:\n            _check_valid_indices_value(int(max(indices)), size=size)\n            _check_valid_indices_value(int(min(indices)), size=size)\n        else:\n            return self._select_contiguous(0, 0, new_fingerprint=new_fingerprint)\n\n        indices_array = pa.array(indices, type=pa.uint64())\n        # Check if we need to convert indices\n        if self._indices is not None:\n            indices_array = self._indices.column(0).take(indices_array)\n\n        indices_table = pa.Table.from_arrays([indices_array], names=[\"indices\"])\n\n        with writer:\n            try:\n                writer.write_table(indices_table)\n                writer.finalize()  # close_stream=bool(buf_writer is None))  We only close if we are writing in a file\n            except (Exception, 
KeyboardInterrupt):\n                if tmp_file is not None:\n                    tmp_file.close()\n                    if os.path.exists(tmp_file.name):\n                        os.remove(tmp_file.name)\n                raise\n\n        if tmp_file is not None:\n            tmp_file.close()\n            shutil.move(tmp_file.name, indices_cache_file_name)\n            umask = os.umask(0o666)\n            os.umask(umask)\n            os.chmod(indices_cache_file_name, 0o666 & ~umask)\n\n        # Return new Dataset object\n        if buf_writer is None:\n            return self._new_dataset_with_indices(\n                indices_cache_file_name=indices_cache_file_name, fingerprint=new_fingerprint\n            )\n        else:\n            return self._new_dataset_with_indices(indices_buffer=buf_writer.getvalue(), fingerprint=new_fingerprint)\n\n    def skip(self, n: int) -> \"Dataset\":\n        \"\"\"\n        Create a new [`Dataset`] that skips the first `n` elements.\n\n        Args:\n            n (`int`):\n                Number of elements to skip.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n        >>> list(ds.take(3))\n        [{'label': 1,\n         'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},\n         {'label': 1,\n         'text': 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . 
tolkien\\'s middle-earth .'},\n         {'label': 1, 'text': 'effective but too-tepid biopic'}]\n        >>> ds = ds.skip(1)\n        >>> list(ds.take(3))\n        [{'label': 1,\n         'text': 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . tolkien\\'s middle-earth .'},\n         {'label': 1, 'text': 'effective but too-tepid biopic'},\n         {'label': 1,\n         'text': 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .'}]\n        ```\n        \"\"\"\n        return self.select(range(n, len(self)))\n\n    def repeat(self, num_times: int) -> \"Dataset\":\n        \"\"\"\n        Create a new [`Dataset`] that repeats the underlying dataset `num_times` times.\n\n        Like itertools.repeat, repeating once just returns the full dataset.\n\n        Args:\n            num_times (`int`):\n                Number of times to repeat the dataset.\n\n        Example:\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n        >>> ds = ds.take(2).repeat(2)\n        >>> list(ds)\n        [{'label': 1,\n         'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},\n         {'label': 1,\n         'text': 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . 
tolkien\\'s middle-earth .'},\n         {'label': 1, 'text': 'effective but too-tepid biopic'},\n         {'label': 1,\n         'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},\n         {'label': 1,\n         'text': 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . tolkien\\'s middle-earth .'},\n         {'label': 1, 'text': 'effective but too-tepid biopic'}]\n        ```\n        \"\"\"\n        if num_times is None:\n            raise ValueError(\"Map style datasets do not support indefinite repetition.\")\n        return _concatenate_map_style_datasets([self] * num_times) if num_times > 0 else self.select([])\n\n    def take(self, n: int) -> \"Dataset\":\n        \"\"\"\n        Create a new [`Dataset`] with only the first `n` elements.\n\n        Args:\n            n (`int`):\n                Number of elements to take.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n        >>> small_ds = ds.take(2)\n        >>> list(small_ds)\n        [{'label': 1,\n         'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},\n         {'label': 1,\n         'text': 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . 
tolkien\\'s middle-earth .'}]\n        ```\n        \"\"\"\n        return self.select(range(n))\n\n    @transmit_format\n    @fingerprint_transform(inplace=False, ignore_kwargs=[\"load_from_cache_file\", \"indices_cache_file_name\"])\n    def sort(\n        self,\n        column_names: Union[str, Sequence_[str]],\n        reverse: Union[bool, Sequence_[bool]] = False,\n        null_placement: str = \"at_end\",\n        keep_in_memory: bool = False,\n        load_from_cache_file: Optional[bool] = None,\n        indices_cache_file_name: Optional[str] = None,\n        writer_batch_size: Optional[int] = 1000,\n        new_fingerprint: Optional[str] = None,\n    ) -> \"Dataset\":\n        \"\"\"Create a new dataset sorted according to a single or multiple columns.\n\n        Args:\n            column_names (`Union[str, Sequence[str]]`):\n                Column name(s) to sort by.\n            reverse (`Union[bool, Sequence[bool]]`, defaults to `False`):\n                If `True`, sort by descending order rather than ascending. If a single bool is provided,\n                the value is applied to the sorting of all column names. Otherwise a list of bools with the\n                same length and order as column_names must be provided.\n            null_placement (`str`, defaults to `at_end`):\n                Put `None` values at the beginning if `at_start` or `first` or at the end if `at_end` or `last`\n\n                <Added version=\"1.14.2\"/>\n            keep_in_memory (`bool`, defaults to `False`):\n                Keep the sorted indices in memory instead of writing it to a cache file.\n            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):\n                If a cache file storing the sorted indices\n                can be identified, use it instead of recomputing.\n            indices_cache_file_name (`str`, *optional*, defaults to `None`):\n                Provide the name of a path for the cache file. 
It is used to store the\n                sorted indices instead of the automatically generated cache file name.\n            writer_batch_size (`int`, defaults to `1000`):\n                Number of rows per write operation for the cache file writer.\n                Higher value gives smaller cache files, lower value consume less temporary memory.\n            new_fingerprint (`str`, *optional*, defaults to `None`):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='validation')\n        >>> ds['label'][:10]\n        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n        >>> sorted_ds = ds.sort('label')\n        >>> sorted_ds['label'][:10]\n        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n        >>> another_sorted_ds = ds.sort(['label', 'text'], reverse=[True, False])\n        >>> another_sorted_ds['label'][:10]\n        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n        ```\n        \"\"\"\n        if len(self.list_indexes()) > 0:\n            raise DatasetTransformationNotAllowedError(\n                \"Using `.sort` on a dataset with attached indexes is not allowed. 
You can first run `.drop_index() to remove your index and then re-add it.\"\n            )\n        # If the array is empty we do nothing\n        if len(self) == 0:\n            return self\n\n        # Check proper format of and for duplicates in column_names\n        if isinstance(column_names, str):\n            column_names = [column_names]\n\n        # Check proper format and length of reverse\n        if not isinstance(reverse, bool):\n            if len(reverse) != len(column_names):\n                raise ValueError(\n                    \"Parameter 'reverse' should be either a boolean or a list of booleans with the same length as 'column_names'.\"\n                )\n        else:\n            reverse = [reverse] * len(column_names)\n\n        # Check whether column name(s) exist in dataset\n        for column in column_names:\n            if not isinstance(column, str) or column not in self._data.column_names:\n                raise ValueError(\n                    f\"Column '{column}' not found in the dataset. Please provide a column selected in: {self._data.column_names}\"\n                )\n\n        # Change null_placement to conform to pyarrow's sort_indices() while ensuring backwards compatibility\n        if null_placement not in [\"at_start\", \"at_end\"]:\n            if null_placement == \"first\":\n                null_placement = \"at_start\"\n            elif null_placement == \"last\":\n                null_placement = \"at_end\"\n            else:\n                raise ValueError(\n                    f\"null_placement '{null_placement}' is an invalid parameter value. 
Must be either 'last', 'at_end', 'first' or 'at_start'.\"\n                )\n\n        load_from_cache_file = load_from_cache_file if load_from_cache_file is not None else is_caching_enabled()\n\n        # Check if we've already cached this computation (indexed by a hash)\n        if self.cache_files:\n            if indices_cache_file_name is None:\n                # we create a unique hash from the function, current dataset file and the mapping args\n                indices_cache_file_name = self._get_cache_file_path(new_fingerprint)\n            if os.path.exists(indices_cache_file_name) and load_from_cache_file:\n                logger.info(f\"Loading cached sorted indices for dataset at {indices_cache_file_name}\")\n                return self._new_dataset_with_indices(\n                    fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name\n                )\n\n        sort_table = query_table(\n            table=self._data,\n            key=slice(0, len(self)),\n            indices=self._indices,\n        )\n\n        sort_keys = [\n            (col, \"ascending\" if not col_reverse else \"descending\") for col, col_reverse in zip(column_names, reverse)\n        ]\n\n        indices = pc.sort_indices(sort_table, sort_keys=sort_keys, null_placement=null_placement)\n\n        return self.select(\n            indices=indices,\n            keep_in_memory=keep_in_memory,\n            indices_cache_file_name=indices_cache_file_name,\n            writer_batch_size=writer_batch_size,\n            new_fingerprint=new_fingerprint,\n        )\n\n    @transmit_format\n    @fingerprint_transform(\n        inplace=False, randomized_function=True, ignore_kwargs=[\"load_from_cache_file\", \"indices_cache_file_name\"]\n    )\n    def shuffle(\n        self,\n        seed: Optional[int] = None,\n        generator: Optional[np.random.Generator] = None,\n        keep_in_memory: bool = False,\n        load_from_cache_file: Optional[bool] = None,\n      
  indices_cache_file_name: Optional[str] = None,\n        writer_batch_size: Optional[int] = 1000,\n        new_fingerprint: Optional[str] = None,\n    ) -> \"Dataset\":\n        \"\"\"Create a new Dataset where the rows are shuffled.\n\n        Currently shuffling uses numpy random generators.\n        You can either supply a NumPy Generator to use, or a seed to initiate NumPy's default random generator (PCG64).\n\n        Shuffling takes the list of indices `[0:len(my_dataset)]` and shuffles it to create an indices mapping.\n        However as soon as your [`Dataset`] has an indices mapping, the speed can become 10x slower.\n        This is because there is an extra step to get the row index to read using the indices mapping, and most importantly, you aren't reading contiguous chunks of data anymore.\n        To restore the speed, you'd need to rewrite the entire dataset on your disk again using [`Dataset.flatten_indices`], which removes the indices mapping.\n        This may take a lot of time depending on the size of your dataset though:\n\n        ```python\n        my_dataset[0]  # fast\n        my_dataset = my_dataset.shuffle(seed=42)\n        my_dataset[0]  # up to 10x slower\n        my_dataset = my_dataset.flatten_indices()  # rewrite the shuffled dataset on disk as contiguous chunks of data\n        my_dataset[0]  # fast again\n        ```\n\n        In this case, we recommend switching to an [`IterableDataset`] and leveraging its fast approximate shuffling method [`IterableDataset.shuffle`].\n        It only shuffles the shards order and adds a shuffle buffer to your dataset, which keeps the speed of your dataset optimal:\n\n        ```python\n        my_iterable_dataset = my_dataset.to_iterable_dataset(num_shards=128)\n        for example in enumerate(my_iterable_dataset):  # fast\n            pass\n\n        shuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=42, buffer_size=100)\n\n        for example in 
enumerate(shuffled_iterable_dataset):  # as fast as before\n            pass\n        ```\n\n        Args:\n            seed (`int`, *optional*):\n                A seed to initialize the default BitGenerator if `generator=None`.\n                If `None`, then fresh, unpredictable entropy will be pulled from the OS.\n                If an `int` or `array_like[ints]` is passed, then it will be passed to SeedSequence to derive the initial BitGenerator state.\n            generator (`numpy.random.Generator`, *optional*):\n                Numpy random Generator to use to compute the permutation of the dataset rows.\n                If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy).\n            keep_in_memory (`bool`, default `False`):\n                Keep the shuffled indices in memory instead of writing it to a cache file.\n            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):\n                If a cache file storing the shuffled indices\n                can be identified, use it instead of recomputing.\n            indices_cache_file_name (`str`, *optional*):\n                Provide the name of a path for the cache file. 
It is used to store the\n                shuffled indices instead of the automatically generated cache file name.\n            writer_batch_size (`int`, defaults to `1000`):\n                Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.\n            new_fingerprint (`str`, *optional*, defaults to `None`):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds['label'][:10]\n        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n\n        # set a seed\n        >>> shuffled_ds = ds.shuffle(seed=42)\n        >>> shuffled_ds['label'][:10]\n        [1, 0, 1, 1, 0, 0, 0, 0, 0, 0]\n        ```\n        \"\"\"\n        if len(self.list_indexes()) > 0:\n            raise DatasetTransformationNotAllowedError(\n                \"Using `.shuffle` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it.\"\n            )\n        # If the array is empty we do nothing\n        if len(self) == 0:\n            return self\n\n        if keep_in_memory and indices_cache_file_name is not None:\n            raise ValueError(\"Please use either `keep_in_memory` or `indices_cache_file_name` but not both.\")\n\n        if seed is not None and generator is not None:\n            raise ValueError(\"Both `seed` and `generator` were provided. 
Please specify just one of them.\")\n\n        if generator is not None and not isinstance(generator, np.random.Generator):\n            raise ValueError(\"The provided generator must be an instance of numpy.random.Generator\")\n\n        load_from_cache_file = load_from_cache_file if load_from_cache_file is not None else is_caching_enabled()\n\n        if generator is None:\n            if seed is None:\n                _, seed, pos, *_ = np.random.get_state()\n                seed = seed[pos] if pos < 624 else seed[0]\n                _ = np.random.random()  # do 1 step of rng\n            generator = np.random.default_rng(seed)\n\n        # Check if we've already cached this computation (indexed by a hash)\n        if self.cache_files:\n            if indices_cache_file_name is None:\n                # we create a unique hash from the function, current dataset file and the mapping args\n                indices_cache_file_name = self._get_cache_file_path(new_fingerprint)\n            if os.path.exists(indices_cache_file_name) and load_from_cache_file:\n                logger.info(f\"Loading cached shuffled indices for dataset at {indices_cache_file_name}\")\n                return self._new_dataset_with_indices(\n                    fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name\n                )\n\n        permutation = generator.permutation(len(self))\n\n        return self.select(\n            indices=permutation,\n            keep_in_memory=keep_in_memory,\n            indices_cache_file_name=indices_cache_file_name if not keep_in_memory else None,\n            writer_batch_size=writer_batch_size,\n            new_fingerprint=new_fingerprint,\n        )\n\n    @transmit_format\n    @fingerprint_transform(\n        inplace=False,\n        randomized_function=True,\n        fingerprint_names=[\"train_new_fingerprint\", \"test_new_fingerprint\"],\n        ignore_kwargs=[\"load_from_cache_file\", \"train_indices_cache_file_name\", 
\"test_indices_cache_file_name\"],\n    )\n    def train_test_split(\n        self,\n        test_size: Union[float, int, None] = None,\n        train_size: Union[float, int, None] = None,\n        shuffle: bool = True,\n        stratify_by_column: Optional[str] = None,\n        seed: Optional[int] = None,\n        generator: Optional[np.random.Generator] = None,\n        keep_in_memory: bool = False,\n        load_from_cache_file: Optional[bool] = None,\n        train_indices_cache_file_name: Optional[str] = None,\n        test_indices_cache_file_name: Optional[str] = None,\n        writer_batch_size: Optional[int] = 1000,\n        train_new_fingerprint: Optional[str] = None,\n        test_new_fingerprint: Optional[str] = None,\n    ) -> \"DatasetDict\":\n        \"\"\"Return a dictionary ([`datasets.DatasetDict`]) with two random train and test subsets (`train` and `test` `Dataset` splits).\n        Splits are created from the dataset according to `test_size`, `train_size` and `shuffle`.\n\n        This method is similar to scikit-learn `train_test_split`.\n\n        Args:\n            test_size (`Union[float, int, None]`, *optional*):\n                Size of the test split\n                If `float`, should be between `0.0` and `1.0` and represent the proportion of the dataset to include in the test split.\n                If `int`, represents the absolute number of test samples.\n                If `None`, the value is set to the complement of the train size.\n                If `train_size` is also `None`, it will be set to `0.25`.\n            train_size (`Union[float, int, None]`, *optional*):\n                Size of the train split\n                If `float`, should be between `0.0` and `1.0` and represent the proportion of the dataset to include in the train split.\n                If `int`, represents the absolute number of train samples.\n                If `None`, the value is automatically set to the complement of the test size.\n            
shuffle (`bool`, *optional*, defaults to `True`):\n                Whether or not to shuffle the data before splitting.\n            stratify_by_column (`str`, *optional*, defaults to `None`):\n                The column name of labels to be used to perform stratified split of data.\n            seed (`int`, *optional*):\n                A seed to initialize the default BitGenerator if `generator=None`.\n                If `None`, then fresh, unpredictable entropy will be pulled from the OS.\n                If an `int` or `array_like[ints]` is passed, then it will be passed to SeedSequence to derive the initial BitGenerator state.\n            generator (`numpy.random.Generator`, *optional*):\n                Numpy random Generator to use to compute the permutation of the dataset rows.\n                If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy).\n            keep_in_memory (`bool`, defaults to `False`):\n                Keep the split indices in memory instead of writing them to a cache file.\n            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):\n                If a cache file storing the splits indices\n                can be identified, use it instead of recomputing.\n            train_indices_cache_file_name (`str`, *optional*):\n                Provide the name of a path for the cache file. It is used to store the\n                train split indices instead of the automatically generated cache file name.\n            test_indices_cache_file_name (`str`, *optional*):\n                Provide the name of a path for the cache file. 
It is used to store the\n                test split indices instead of the automatically generated cache file name.\n            writer_batch_size (`int`, defaults to `1000`):\n                Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.\n            train_new_fingerprint (`str`, *optional*, defaults to `None`):\n                The new fingerprint of the train set after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments\n            test_new_fingerprint (`str`, *optional*, defaults to `None`):\n                The new fingerprint of the test set after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds = ds.train_test_split(test_size=0.2, shuffle=True)\n        DatasetDict({\n            train: Dataset({\n                features: ['text', 'label'],\n                num_rows: 852\n            })\n            test: Dataset({\n                features: ['text', 'label'],\n                num_rows: 214\n            })\n        })\n\n        # set a seed\n        >>> ds = ds.train_test_split(test_size=0.2, seed=42)\n\n        # stratified split\n        >>> ds = load_dataset(\"stanfordnlp/imdb\",split=\"train\")\n        Dataset({\n            features: ['text', 'label'],\n            num_rows: 25000\n        })\n        >>> ds = ds.train_test_split(test_size=0.2, stratify_by_column=\"label\")\n        DatasetDict({\n            train: 
Dataset({\n                features: ['text', 'label'],\n                num_rows: 20000\n            })\n            test: Dataset({\n                features: ['text', 'label'],\n                num_rows: 5000\n            })\n        })\n        ```\n        \"\"\"\n        from .dataset_dict import DatasetDict  # import here because of circular dependency\n\n        if len(self.list_indexes()) > 0:\n            raise DatasetTransformationNotAllowedError(\n                \"Using `.train_test_split` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it.\"\n            )\n        # If the array is empty we do nothing\n        if len(self) == 0:\n            return DatasetDict({\"train\": self, \"test\": self})\n\n        if test_size is None and train_size is None:\n            test_size = 0.25\n\n        # Safety checks similar to scikit-learn's ones.\n        # (adapted from https://github.com/scikit-learn/scikit-learn/blob/fd237278e895b42abe8d8d09105cbb82dc2cbba7/sklearn/model_selection/_split.py#L1750)\n        n_samples = len(self)\n        if (\n            isinstance(test_size, int)\n            and (test_size >= n_samples or test_size <= 0)\n            or isinstance(test_size, float)\n            and (test_size <= 0 or test_size >= 1)\n        ):\n            raise ValueError(\n                f\"test_size={test_size} should be either positive and smaller \"\n                f\"than the number of samples {n_samples} or a float in the (0, 1) range\"\n            )\n\n        if (\n            isinstance(train_size, int)\n            and (train_size >= n_samples or train_size <= 0)\n            or isinstance(train_size, float)\n            and (train_size <= 0 or train_size >= 1)\n        ):\n            raise ValueError(\n                f\"train_size={train_size} should be either positive and smaller \"\n                f\"than the number of samples {n_samples} or a float in the (0, 1) 
range\"\n            )\n\n        if train_size is not None and not isinstance(train_size, (int, float)):\n            raise ValueError(f\"Invalid value for train_size: {train_size} of type {type(train_size)}\")\n        if test_size is not None and not isinstance(test_size, (int, float)):\n            raise ValueError(f\"Invalid value for test_size: {test_size} of type {type(test_size)}\")\n\n        if isinstance(train_size, float) and isinstance(test_size, float) and train_size + test_size > 1:\n            raise ValueError(\n                f\"The sum of test_size and train_size = {train_size + test_size}, should be in the (0, 1)\"\n                \" range. Reduce test_size and/or train_size.\"\n            )\n\n        if isinstance(test_size, float):\n            n_test = ceil(test_size * n_samples)\n        elif isinstance(test_size, int):\n            n_test = float(test_size)\n\n        if isinstance(train_size, float):\n            n_train = floor(train_size * n_samples)\n        elif isinstance(train_size, int):\n            n_train = float(train_size)\n\n        if train_size is None:\n            n_train = n_samples - n_test\n        elif test_size is None:\n            n_test = n_samples - n_train\n\n        if n_train + n_test > n_samples:\n            raise ValueError(\n                f\"The sum of train_size and test_size = {n_train + n_test}, \"\n                \"should be smaller than the number of \"\n                f\"samples {n_samples}. Reduce test_size and/or \"\n                \"train_size.\"\n            )\n\n        n_train, n_test = int(n_train), int(n_test)\n\n        if n_train == 0:\n            raise ValueError(\n                f\"With n_samples={n_samples}, test_size={test_size} and train_size={train_size}, the \"\n                \"resulting train set will be empty. 
Adjust any of the \"\n                \"aforementioned parameters.\"\n            )\n\n        load_from_cache_file = load_from_cache_file if load_from_cache_file is not None else is_caching_enabled()\n\n        if generator is None and shuffle is True:\n            if seed is None:\n                _, seed, pos, *_ = np.random.get_state()\n                seed = seed[pos] if pos < 624 else seed[0]\n                _ = np.random.random()  # do 1 step of rng\n            generator = np.random.default_rng(seed)\n\n        # Check if we've already cached this computation (indexed by a hash)\n        if self.cache_files:\n            if train_indices_cache_file_name is None or test_indices_cache_file_name is None:\n                # we create a unique hash from the function, current dataset file and the mapping args\n\n                if train_indices_cache_file_name is None:\n                    train_indices_cache_file_name = self._get_cache_file_path(train_new_fingerprint)\n                if test_indices_cache_file_name is None:\n                    test_indices_cache_file_name = self._get_cache_file_path(test_new_fingerprint)\n            if (\n                os.path.exists(train_indices_cache_file_name)\n                and os.path.exists(test_indices_cache_file_name)\n                and load_from_cache_file\n            ):\n                logger.info(\n                    f\"Loading cached split indices for dataset at {train_indices_cache_file_name} and {test_indices_cache_file_name}\"\n                )\n                return DatasetDict(\n                    {\n                        \"train\": self._new_dataset_with_indices(\n                            fingerprint=train_new_fingerprint, indices_cache_file_name=train_indices_cache_file_name\n                        ),\n                        \"test\": self._new_dataset_with_indices(\n                            fingerprint=test_new_fingerprint, indices_cache_file_name=test_indices_cache_file_name\n      
                  ),\n                    }\n                )\n        if not shuffle:\n            if stratify_by_column is not None:\n                raise ValueError(\"Stratified train/test split is not implemented for `shuffle=False`\")\n            train_indices = np.arange(n_train)\n            test_indices = np.arange(n_train, n_train + n_test)\n        else:\n            # stratified partition\n            if stratify_by_column is not None:\n                if stratify_by_column not in self._info.features.keys():\n                    raise ValueError(f\"Key {stratify_by_column} not found in {self._info.features.keys()}\")\n                if not isinstance(self._info.features[stratify_by_column], ClassLabel):\n                    raise ValueError(\n                        f\"Stratifying by column is only supported for {ClassLabel.__name__} column, and column {stratify_by_column} is {type(self._info.features[stratify_by_column]).__name__}.\"\n                    )\n                try:\n                    train_indices, test_indices = next(\n                        stratified_shuffle_split_generate_indices(\n                            np.asarray(self.with_format(\"numpy\")[stratify_by_column]), n_train, n_test, rng=generator\n                        )\n                    )\n                except Exception as error:\n                    if str(error) == \"Minimum class count error\":\n                        raise ValueError(\n                            f\"The least populated class in {stratify_by_column} column has only 1\"\n                            \" member, which is too few. 
The minimum\"\n                            \" number of groups for any class cannot\"\n                            \" be less than 2.\"\n                        )\n                    else:\n                        raise error\n\n            # random partition\n            else:\n                permutation = generator.permutation(len(self))\n                test_indices = permutation[:n_test]\n                train_indices = permutation[n_test : (n_test + n_train)]\n\n        train_split = self.select(\n            indices=train_indices,\n            keep_in_memory=keep_in_memory,\n            indices_cache_file_name=train_indices_cache_file_name,\n            writer_batch_size=writer_batch_size,\n            new_fingerprint=train_new_fingerprint,\n        )\n        test_split = self.select(\n            indices=test_indices,\n            keep_in_memory=keep_in_memory,\n            indices_cache_file_name=test_indices_cache_file_name,\n            writer_batch_size=writer_batch_size,\n            new_fingerprint=test_new_fingerprint,\n        )\n\n        return DatasetDict({\"train\": train_split, \"test\": test_split})\n\n    def shard(\n        self,\n        num_shards: int,\n        index: int,\n        contiguous: bool = True,\n        keep_in_memory: bool = False,\n        indices_cache_file_name: Optional[str] = None,\n        writer_batch_size: Optional[int] = 1000,\n    ) -> \"Dataset\":\n        \"\"\"Return the `index`-nth shard from dataset split into `num_shards` pieces.\n\n        This shards deterministically. `dataset.shard(n, i)` splits the dataset into contiguous chunks,\n        so it can be easily concatenated back together after processing. 
If `len(dataset) % n == l`, then the\n        first `l` shards each have length `(len(dataset) // n) + 1`, and the remaining shards have length `(len(dataset) // n)`.\n        `datasets.concatenate_datasets([dset.shard(n, i) for i in range(n)])` returns a dataset with the same order as the original.\n\n        Note: n should be less than or equal to the number of elements in the dataset `len(dataset)`.\n\n        On the other hand, `dataset.shard(n, i, contiguous=False)` contains all elements of the dataset whose index mod `n = i`.\n\n        Be sure to shard before using any randomizing operator (such as `shuffle`).\n        It is best if the shard operator is used early in the dataset pipeline.\n\n        Args:\n            num_shards (`int`):\n                How many shards to split the dataset into.\n            index (`int`):\n                Which shard to select and return.\n            contiguous (`bool`, defaults to `True`):\n                Whether to select contiguous blocks of indices for shards.\n            keep_in_memory (`bool`, defaults to `False`):\n                Keep the dataset in memory instead of writing it to a cache file.\n            indices_cache_file_name (`str`, *optional*):\n                Provide the name of a path for the cache file. 
It is used to store the\n                indices of each shard instead of the automatically generated cache file name.\n            writer_batch_size (`int`, defaults to `1000`):\n                This only concerns the indices mapping.\n                Number of indices per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds\n        Dataset({\n            features: ['text', 'label'],\n            num_rows: 1066\n        })\n        >>> ds = ds.shard(num_shards=2, index=0)\n        >>> ds\n        Dataset({\n            features: ['text', 'label'],\n            num_rows: 533\n        })\n        ```\n        \"\"\"\n        if not 0 <= index < num_shards:\n            raise ValueError(\"index should be in [0, num_shards-1]\")\n        if contiguous:\n            div = len(self) // num_shards\n            mod = len(self) % num_shards\n            start = div * index + min(index, mod)\n            end = start + div + (1 if index < mod else 0)\n            indices = range(start, end)\n        else:\n            indices = np.arange(index, len(self), num_shards)\n\n        return self.select(\n            indices=indices,\n            keep_in_memory=keep_in_memory,\n            indices_cache_file_name=indices_cache_file_name,\n            writer_batch_size=writer_batch_size,\n        )\n\n    def to_csv(\n        self,\n        path_or_buf: Union[PathLike, BinaryIO],\n        batch_size: Optional[int] = None,\n        num_proc: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n        **to_csv_kwargs,\n    ) -> int:\n   
     \"\"\"Exports the dataset to csv\n\n        Args:\n            path_or_buf (`PathLike` or `FileOrBuffer`):\n                Either a path to a file (e.g. `file.csv`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.csv`),\n                or a BinaryIO, where the dataset will be saved to in the specified format.\n            batch_size (`int`, *optional*):\n                Size of the batch to load in memory and write at once.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n            num_proc (`int`, *optional*):\n                Number of processes for multiprocessing. By default it doesn't\n                use multiprocessing. `batch_size` in this case defaults to\n                `datasets.config.DEFAULT_MAX_BATCH_SIZE` but feel free to make it 5x or 10x of the default\n                value if you have sufficient compute power.\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n\n                <Added version=\"2.19.0\"/>\n            **to_csv_kwargs (additional keyword arguments):\n                Parameters to pass to pandas's [`pandas.DataFrame.to_csv`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html).\n\n                <Changed version=\"2.10.0\">\n\n                Now, `index` defaults to `False` if not specified.\n\n                If you would like to write the index, pass `index=True` and also set a name for the index column by\n                passing `index_label`.\n\n                </Changed>\n\n        Returns:\n            `int`: The number of characters or bytes written.\n\n        Example:\n\n        ```py\n        >>> ds.to_csv(\"path/to/dataset/directory\")\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.csv import CsvDatasetWriter\n\n        return CsvDatasetWriter(\n            self,\n            path_or_buf,\n            
batch_size=batch_size,\n            num_proc=num_proc,\n            storage_options=storage_options,\n            **to_csv_kwargs,\n        ).write()\n\n    def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> Union[dict, Iterator[dict]]:\n        \"\"\"Returns the dataset as a Python dict. Can also return a generator for large datasets.\n\n        Args:\n            batch_size (`int`, *optional*): The size (number of rows) of the batches if `batched` is `True`.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n            batched (`bool`):\n                Set to `True` to return a generator that yields the dataset as batches\n                of `batch_size` rows. Defaults to `False` (returns the whole datasets once).\n\n        Returns:\n            `dict` or `Iterator[dict]`\n\n        Example:\n\n        ```py\n        >>> ds.to_dict()\n        ```\n        \"\"\"\n        return query_table(\n            table=self._data,\n            key=slice(0, len(self)),\n            indices=self._indices,\n        ).to_pydict()\n\n    def to_list(self) -> list:\n        \"\"\"Returns the dataset as a Python list.\n\n        Returns:\n            `list`\n\n        Example:\n\n        ```py\n        >>> ds.to_list()\n        ```\n        \"\"\"\n        return query_table(\n            table=self._data,\n            key=slice(0, len(self)),\n            indices=self._indices,\n        ).to_pylist()\n\n    def to_json(\n        self,\n        path_or_buf: Union[PathLike, BinaryIO],\n        batch_size: Optional[int] = None,\n        num_proc: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n        **to_json_kwargs,\n    ) -> int:\n        \"\"\"Export the dataset to JSON Lines or JSON.\n\n        The default output format is [JSON Lines](https://jsonlines.org/).\n        To export to [JSON](https://www.json.org), pass `lines=False` argument and the desired `orient`.\n\n        Args:\n            
path_or_buf (`PathLike` or `FileOrBuffer`):\n                Either a path to a file (e.g. `file.json`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.json`),\n                or a BinaryIO, where the dataset will be saved to in the specified format.\n            batch_size (`int`, *optional*):\n                Size of the batch to load in memory and write at once.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n            num_proc (`int`, *optional*):\n                Number of processes for multiprocessing. By default, it doesn't\n                use multiprocessing. `batch_size` in this case defaults to\n                `datasets.config.DEFAULT_MAX_BATCH_SIZE` but feel free to make it 5x or 10x of the default\n                value if you have sufficient compute power.\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n\n                <Added version=\"2.19.0\"/>\n            **to_json_kwargs (additional keyword arguments):\n                Parameters to pass to pandas's [`pandas.DataFrame.to_json`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html).\n                Default arguments are `lines=True` and `orient=\"records\"`.\n\n                <Changed version=\"2.11.0\">\n\n                The parameter `index` defaults to `False` if `orient` is `\"split\"` or `\"table\"`.\n\n                If you would like to write the index, pass `index=True`.\n\n                </Changed>\n\n        Returns:\n            `int`: The number of characters or bytes written.\n\n        Example:\n\n        ```py\n        >>> ds.to_json(\"path/to/dataset/directory/filename.jsonl\")\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.json import JsonDatasetWriter\n\n        return JsonDatasetWriter(\n            self,\n            path_or_buf,\n            
batch_size=batch_size,\n            num_proc=num_proc,\n            storage_options=storage_options,\n            **to_json_kwargs,\n        ).write()\n\n    def to_pandas(\n        self, batch_size: Optional[int] = None, batched: bool = False\n    ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:\n        \"\"\"Returns the dataset as a `pandas.DataFrame`. Can also return a generator for large datasets.\n\n        Args:\n            batch_size (`int`, *optional*):\n                The size (number of rows) of the batches if `batched` is `True`.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n            batched (`bool`):\n                Set to `True` to return a generator that yields the dataset as batches\n                of `batch_size` rows. Defaults to `False` (returns the whole datasets once).\n\n        Returns:\n            `pandas.DataFrame` or `Iterator[pandas.DataFrame]`\n\n        Example:\n\n        ```py\n        >>> ds.to_pandas()\n        ```\n        \"\"\"\n        if not batched:\n            return query_table(\n                table=self._data,\n                key=slice(0, len(self)),\n                indices=self._indices,\n            ).to_pandas(types_mapper=pandas_types_mapper)\n        else:\n            batch_size = batch_size if batch_size else config.DEFAULT_MAX_BATCH_SIZE\n            return (\n                query_table(\n                    table=self._data,\n                    key=slice(offset, offset + batch_size),\n                    indices=self._indices,\n                ).to_pandas(types_mapper=pandas_types_mapper)\n                for offset in range(0, len(self), batch_size)\n            )\n\n    def to_polars(\n        self,\n        batch_size: Optional[int] = None,\n        batched: bool = False,\n        schema_overrides: Optional[dict] = None,\n        rechunk: bool = True,\n    ) -> Union[\"pl.DataFrame\", Iterator[\"pl.DataFrame\"]]:\n        \"\"\"Returns the dataset as a 
`polars.DataFrame`. Can also return a generator for large datasets.\n\n        Args:\n            batch_size (`int`, *optional*):\n                The size (number of rows) of the batches if `batched` is `True`.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n            batched (`bool`):\n                Set to `True` to return a generator that yields the dataset as batches\n                of `batch_size` rows. Defaults to `False` (returns the whole datasets once).\n            schema_overrides (`dict`, *optional*):\n                Support type specification or override of one or more columns; note that\n                any dtypes inferred from the schema param will be overridden.\n            rechunk (`bool`):\n                Make sure that all data is in contiguous memory. Defaults to `True`.\n        Returns:\n            `polars.DataFrame` or `Iterator[polars.DataFrame]`\n\n        Example:\n\n        ```py\n        >>> ds.to_polars()\n        ```\n        \"\"\"\n        if config.POLARS_AVAILABLE:\n            import polars as pl\n\n            if not batched:\n                return pl.from_arrow(\n                    query_table(\n                        table=self._data,\n                        key=slice(0, len(self)),\n                        indices=self._indices if self._indices is not None else None,\n                    ),\n                    schema_overrides=schema_overrides,\n                    rechunk=rechunk,\n                )\n            else:\n                batch_size = batch_size if batch_size else config.DEFAULT_MAX_BATCH_SIZE\n                return (\n                    pl.from_arrow(\n                        query_table(\n                            table=self._data,\n                            key=slice(offset, offset + batch_size),\n                            indices=self._indices if self._indices is not None else None,\n                        ),\n                        
schema_overrides=schema_overrides,\n                        rechunk=rechunk,\n                    )\n                    for offset in range(0, len(self), batch_size)\n                )\n        else:\n            raise ValueError(\"Polars needs to be installed to be able to return Polars dataframes.\")\n\n    def to_parquet(\n        self,\n        path_or_buf: Union[PathLike, BinaryIO],\n        batch_size: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n        **parquet_writer_kwargs,\n    ) -> int:\n        \"\"\"Exports the dataset to parquet\n\n        Args:\n            path_or_buf (`PathLike` or `FileOrBuffer`):\n                Either a path to a file (e.g. `file.parquet`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.parquet`),\n                or a BinaryIO, where the dataset will be saved to in the specified format.\n            batch_size (`int`, *optional*):\n                Size of the batch to load in memory and write at once.\n                By default it aims for row groups with maximum uncompressed byte size of \"100MB\",\n                defined by `datasets.config.MAX_ROW_GROUP_SIZE`.\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n\n                <Added version=\"2.19.0\"/>\n            **parquet_writer_kwargs (additional keyword arguments):\n                Parameters to pass to PyArrow's `pyarrow.parquet.ParquetWriter`.\n\n        Returns:\n            `int`: The number of characters or bytes written.\n\n        Example:\n\n        ```py\n        >>> ds.to_parquet(\"path/to/dataset/directory\")\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.parquet import ParquetDatasetWriter\n\n        return ParquetDatasetWriter(\n            self, path_or_buf, batch_size=batch_size, storage_options=storage_options, **parquet_writer_kwargs\n        ).write()\n\n    def 
to_sql(\n        self,\n        name: str,\n        con: Union[str, \"sqlalchemy.engine.Connection\", \"sqlalchemy.engine.Engine\", \"sqlite3.Connection\"],\n        batch_size: Optional[int] = None,\n        **sql_writer_kwargs,\n    ) -> int:\n        \"\"\"Exports the dataset to a SQL database.\n\n        Args:\n            name (`str`):\n                Name of SQL table.\n            con (`str` or `sqlite3.Connection` or `sqlalchemy.engine.Connection` or `sqlalchemy.engine.Engine`):\n                A [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) or a SQLite3/SQLAlchemy connection object used to write to a database.\n            batch_size (`int`, *optional*):\n                Size of the batch to load in memory and write at once.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n            **sql_writer_kwargs (additional keyword arguments):\n                Parameters to pass to pandas's [`pandas.DataFrame.to_sql`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html).\n\n                <Changed version=\"2.11.0\">\n\n                Now, `index` defaults to `False` if not specified.\n\n                If you would like to write the index, pass `index=True` and also set a name for the index column by\n                passing `index_label`.\n\n                </Changed>\n\n        Returns:\n            `int`: The number of records written.\n\n        Example:\n\n        ```py\n        >>> # con provided as a connection URI string\n        >>> ds.to_sql(\"data\", \"sqlite:///my_own_db.sql\")\n        >>> # con provided as a sqlite3 connection object\n        >>> import sqlite3\n        >>> con = sqlite3.connect(\"my_own_db.sql\")\n        >>> with con:\n        ...     
ds.to_sql(\"data\", con)\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.sql import SqlDatasetWriter\n\n        return SqlDatasetWriter(self, name, con, batch_size=batch_size, **sql_writer_kwargs).write()\n\n    def _estimate_nbytes(self) -> int:\n        dataset_nbytes = self.data.nbytes\n\n        # Find decodable columns, because if there are any, we need to\n        # adjust the dataset size computation (needed for sharding) to account for possible external files\n        decodable_columns = [\n            k for k, v in self._info.features.items() if require_decoding(v, ignore_decode_attribute=True)\n        ]\n\n        if decodable_columns:\n            # Approximate the space needed to store the bytes from the external files by analyzing the first 1000 examples\n            extra_nbytes = 0\n\n            def extra_nbytes_visitor(array, feature):\n                nonlocal extra_nbytes\n                if isinstance(feature, (Audio, Image, Video)):\n                    for x in array.to_pylist():\n                        if x is not None and x[\"bytes\"] is None and x[\"path\"] is not None:\n                            size = xgetsize(x[\"path\"])\n                            extra_nbytes += size\n                    extra_nbytes -= array.field(\"path\").nbytes\n\n            table = self.with_format(\"arrow\")[:1000]\n            table_visitor(table, extra_nbytes_visitor)\n\n            extra_nbytes = extra_nbytes * len(self.data) // len(table)\n            dataset_nbytes = dataset_nbytes + extra_nbytes\n\n        if self._indices is not None:\n            dataset_nbytes = dataset_nbytes * len(self._indices) // len(self.data)\n        return dataset_nbytes\n\n    @staticmethod\n    def _generate_tables_from_shards(shards: list[\"Dataset\"], batch_size: int):\n        for shard_idx, shard in enumerate(shards):\n            for pa_table in shard.with_format(\"arrow\").iter(batch_size):\n                
yield shard_idx, pa_table\n\n    @staticmethod\n    def _generate_tables_from_cache_file(filename: str):\n        for batch_idx, batch in enumerate(_memory_mapped_record_batch_reader_from_file(filename)):\n            yield batch_idx, pa.Table.from_batches([batch])\n\n    def to_iterable_dataset(self, num_shards: Optional[int] = 1) -> \"IterableDataset\":\n        \"\"\"Get an [`datasets.IterableDataset`] from a map-style [`datasets.Dataset`].\n        This is equivalent to loading a dataset in streaming mode with [`datasets.load_dataset`], but much faster since the data is streamed from local files.\n\n        Contrary to map-style datasets, iterable datasets are lazy and can only be iterated over (e.g. using a for loop).\n        Since they are read sequentially in training loops, iterable datasets are much faster than map-style datasets.\n        All the transformations applied to iterable datasets like filtering or processing are done on-the-fly when you start iterating over the dataset.\n\n        Still, it is possible to shuffle an iterable dataset using [`datasets.IterableDataset.shuffle`].\n        This is a fast approximate shuffling that works best if you have multiple shards and if you specify a buffer size that is big enough.\n\n        To get the best speed performance, make sure your dataset doesn't have an indices mapping.\n        If this is the case, the data are not read contiguously, which can be slow sometimes.\n        You can use `ds = ds.flatten_indices()` to write your dataset in contiguous chunks of data and have optimal speed before switching to an iterable dataset.\n\n        Args:\n            num_shards (`int`, default to `1`):\n                Number of shards to define when instantiating the iterable dataset. 
This is especially useful for big datasets to be able to shuffle properly,\n                and also to enable fast parallel loading using a PyTorch DataLoader or in distributed setups for example.\n                Shards are defined using [`datasets.Dataset.shard`]: it simply slices the data without writing anything on disk.\n\n        Returns:\n            [`datasets.IterableDataset`]\n\n        Example:\n\n        Basic usage:\n        ```python\n        >>> ids = ds.to_iterable_dataset()\n        >>> for example in ids:\n        ...     pass\n        ```\n\n        With lazy filtering and processing:\n        ```python\n        >>> ids = ds.to_iterable_dataset()\n        >>> ids = ids.filter(filter_fn).map(process_fn)  # will filter and process on-the-fly when you start iterating over the iterable dataset\n        >>> for example in ids:\n        ...     pass\n        ```\n\n        With sharding to enable efficient shuffling:\n        ```python\n        >>> ids = ds.to_iterable_dataset(num_shards=64)  # the dataset is split into 64 shards to be iterated over\n        >>> ids = ids.shuffle(buffer_size=10_000)  # will shuffle the shards order and use a shuffle buffer for fast approximate shuffling when you start iterating\n        >>> for example in ids:\n        ...     pass\n        ```\n\n        With a PyTorch DataLoader:\n        ```python\n        >>> import torch\n        >>> ids = ds.to_iterable_dataset(num_shards=64)\n        >>> ids = ids.filter(filter_fn).map(process_fn)\n        >>> dataloader = torch.utils.data.DataLoader(ids, num_workers=4)  # will assign 64 / 4 = 16 shards to each worker to load, filter and process when you start iterating\n        >>> for example in dataloader:\n        ...     
pass\n        ```\n\n        With a PyTorch DataLoader and shuffling:\n        ```python\n        >>> import torch\n        >>> ids = ds.to_iterable_dataset(num_shards=64)\n        >>> ids = ids.shuffle(buffer_size=10_000)  # will shuffle the shards order and use a shuffle buffer when you start iterating\n        >>> dataloader = torch.utils.data.DataLoader(ids, num_workers=4)  # will assign 64 / 4 = 16 shards from the shuffled list of shards to each worker when you start iterating\n        >>> for example in dataloader:\n        ...     pass\n        ```\n\n        In a distributed setup like PyTorch DDP with a PyTorch DataLoader and shuffling:\n        ```python\n        >>> import torch\n        >>> from datasets.distributed import split_dataset_by_node\n        >>> ids = ds.to_iterable_dataset(num_shards=512)\n        >>> ids = ids.shuffle(buffer_size=10_000, seed=42)  # will shuffle the shards order and use a shuffle buffer when you start iterating\n        >>> ids = split_dataset_by_node(ids, world_size=8, rank=0)  # will keep only 512 / 8 = 64 shards from the shuffled lists of shards when you start iterating\n        >>> dataloader = torch.utils.data.DataLoader(ids, num_workers=4)  # will assign 64 / 4 = 16 shards from this node's list of shards to each worker when you start iterating\n        >>> for example in dataloader:\n        ...     pass\n        ```\n\n        With shuffling and multiple epochs:\n        ```python\n        >>> ids = ds.to_iterable_dataset(num_shards=64)\n        >>> ids = ids.shuffle(buffer_size=10_000, seed=42)  # will shuffle the shards order and use a shuffle buffer when you start iterating\n        >>> for epoch in range(n_epochs):\n        ...     ids.set_epoch(epoch)  # will use effective_seed = seed + epoch to shuffle the shards and for the shuffle buffer when you start iterating\n        ...     for example in ids:\n        ...         
pass\n        ```\n        Feel free to also use [`IterableDataset.set_epoch`] when using a PyTorch DataLoader or in distributed setups.\n        \"\"\"\n        from .iterable_dataset import ArrowExamplesIterable, IterableDataset\n\n        if self._format_type is not None:\n            if self._format_kwargs or (\n                self._format_columns is not None and set(self._format_columns) != set(self.column_names)\n            ):\n                raise NotImplementedError(\n                    \"Converting a formatted dataset with kwargs or selected columns to a formatted iterable dataset is not implemented yet. Please run `my_dataset = my_dataset.with_format(None)` before calling to_iterable_dataset\"\n                )\n        if num_shards > len(self):\n            raise ValueError(\n                f\"Unable to shard a dataset of size {len(self)} into {num_shards} shards (the number of shards exceeds the number of samples).\"\n            )\n        if self._indices is not None:\n            logger.info(\n                \"Converting an Arrow dataset to iterable but it has an indices mapping that can make it slower. 
\"\n                \"You can use `ds = ds.flatten_indices()` to write your dataset in contiguous chunks of data and have optimal speed.\"\n            )\n        shards = (\n            [copy.deepcopy(self)]\n            if num_shards == 1\n            else [\n                self.shard(num_shards=num_shards, index=shard_idx, contiguous=True) for shard_idx in range(num_shards)\n            ]\n        )\n        ex_iterable = ArrowExamplesIterable(\n            Dataset._generate_tables_from_shards,\n            kwargs={\"shards\": shards, \"batch_size\": config.DEFAULT_MAX_BATCH_SIZE},\n        )\n        ds = IterableDataset(ex_iterable, info=DatasetInfo(features=self.features))\n        if self._format_type:\n            ds = ds.with_format(self._format_type)\n        return ds\n\n    def _push_parquet_shards_to_hub_single(\n        self,\n        job_id: int,\n        num_jobs: int,\n        resolved_output_path: HfFileSystemResolvedPath,\n        data_dir: str,\n        split: str,\n        token: Optional[str],\n        create_pr: Optional[bool],\n        num_shards: int,\n        embed_external_files: bool,\n        writer_batch_size: int,\n    ):\n        div = num_shards // num_jobs\n        mod = num_shards % num_jobs\n        start = div * job_id + min(job_id, mod)\n        end = start + div + (1 if job_id < mod else 0)\n\n        index_shards = (\n            (start + i, self.shard(num_shards=end - start, index=i, contiguous=True)) for i in range(end - start)\n        )\n\n        api = HfApi(endpoint=config.HF_ENDPOINT, token=token)\n\n        additions: list[CommitOperationAdd] = []\n        new_parquet_paths: list[str] = []\n        uploaded_size = 0\n        for index, shard in index_shards:\n            if embed_external_files:\n                format = shard.format\n                shard = shard.with_format(\"arrow\")\n                shard = shard.map(\n                    embed_table_storage,\n                    batched=True,\n                   
 batch_size=writer_batch_size,\n                    keep_in_memory=True,\n                )\n                shard = shard.with_format(**format)\n            shard_path_in_repo = f\"{data_dir}/{split}-{index:05d}-of-{num_shards:05d}.parquet\"\n            tmp_file = tempfile.NamedTemporaryFile(suffix=\".parquet\", delete=False)\n            try:\n                shard.to_parquet(tmp_file, batch_size=writer_batch_size)\n                uploaded_size += tmp_file.tell()\n                tmp_file.close()\n                new_parquet_paths.append(shard_path_in_repo)\n                if (\n                    isinstance(resolved_output_path, HfFileSystemResolvedRepositoryPath)\n                    and not resolved_output_path.path_in_repo\n                ):\n                    shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=tmp_file.name)\n                    api.preupload_lfs_files(\n                        repo_id=resolved_output_path.repo_id,\n                        additions=[shard_addition],\n                        repo_type=resolved_output_path.repo_type,\n                        revision=resolved_output_path.revision,\n                        create_pr=create_pr,\n                    )\n                    additions.append(shard_addition)\n                elif isinstance(resolved_output_path, HfFileSystemResolvedBucketPath):\n                    if resolved_output_path.path:\n                        shard_path_in_repo = resolved_output_path.path + \"/\" + shard_path_in_repo\n                    api.batch_bucket_files(\n                        bucket_id=resolved_output_path.bucket_id, add=[(tmp_file.name, shard_path_in_repo)]\n                    )\n                else:\n                    raise NotImplementedError(f\"Bad HF path: {resolved_output_path}\")\n            except (Exception, KeyboardInterrupt):\n                tmp_file.close()\n                Path(tmp_file.name).unlink()\n                raise\n            
tmp_file.close()\n            Path(tmp_file.name).unlink()\n            yield job_id, False, 1\n\n        yield job_id, True, (additions, new_parquet_paths, uploaded_size)\n\n    def _push_parquet_shards_to_hub(\n        self,\n        resolved_output_path: HfFileSystemResolvedPath,\n        data_dir: str,\n        split: str,\n        token: Optional[str],\n        create_pr: Optional[bool],\n        max_shard_size: Optional[Union[int, str]],\n        num_shards: Optional[int],\n        embed_external_files: bool,\n        num_proc: Optional[int],\n    ) -> tuple[list[CommitOperationAdd], list[str], int, SplitInfo]:\n        \"\"\"Pushes the dataset shards as Parquet files to the hub.\n\n        Returns:\n            additions (`List[CommitOperation]`): list of the `CommitOperationAdd` of the uploaded shards\n            new_parquet_paths (`List[str]`): list of paths of the new files uploaded to the output path,\n                relative to output path\n            features (`features`): features of the uploaded dataset\n            split_info (`int`): info of the uploaded split, including the approximate size in bytes of\n                the uploaded dataset after uncompression\n            uploaded_size (`int`): number of uploaded bytes to the repository or bucket\n        \"\"\"\n        from .arrow_writer import get_writer_batch_size_from_data_size, get_writer_batch_size_from_features\n\n        dataset_nbytes = self._estimate_nbytes()\n        writer_batch_size = get_writer_batch_size_from_features(self.features) or get_writer_batch_size_from_data_size(\n            len(self), dataset_nbytes\n        )\n\n        # Find decodable columns, because if there are any, we need to:\n        # embed the bytes from the files in the shards\n        decodable_columns = (\n            [k for k, v in self._info.features.items() if require_decoding(v, ignore_decode_attribute=True)]\n            if embed_external_files\n            else []\n        )\n        
embed_external_files = embed_external_files and bool(decodable_columns)\n\n        if num_shards is None:\n            max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)\n            num_shards = int(dataset_nbytes / max_shard_size) + 1\n            num_shards = max(num_shards, num_proc or 1)\n\n        additions: list[CommitOperationAdd] = []\n        new_parquet_paths: list[str] = []\n        uploaded_size = 0\n\n        num_jobs = num_proc or 1\n        if num_shards <= 1:\n            logger.warning(\n                f\"Setting num_proc from {num_jobs} back to 1 for the {split} split to disable multiprocessing as it only contains one shard.\"\n            )\n            num_jobs = 1\n        elif num_shards < num_jobs:\n            logger.warning(\n                f\"Setting num_proc from {num_jobs} to {num_shards} for the {split} split as it only contains {num_shards} shards.\"\n            )\n            num_proc = num_shards\n        kwargs_iterable = [\n            {\n                \"self\": self.shard(num_shards=num_jobs, index=job_id, contiguous=True),\n                \"job_id\": job_id,\n                \"num_jobs\": num_jobs,\n                \"resolved_output_path\": resolved_output_path,\n                \"data_dir\": data_dir,\n                \"split\": split,\n                \"token\": token,\n                \"create_pr\": create_pr,\n                \"num_shards\": num_shards,\n                \"embed_external_files\": embed_external_files,\n                \"writer_batch_size\": writer_batch_size,\n            }\n            for job_id in range(num_jobs)\n        ]\n        desc = \"Uploading the dataset shards\"\n        desc += f\" (num_proc={num_proc})\" if num_proc is not None and num_proc >= 1 else \"\"\n        pbar = hf_tqdm(\n            unit=\" shards\",\n            total=num_shards,\n            desc=desc,\n        )\n        with (\n            contextlib.nullcontext()\n            if num_proc is 
None or num_proc < 1\n            else mp.get_context(\"spawn\").Pool(num_proc) as pool\n        ):\n            update_stream = (\n                Dataset._push_parquet_shards_to_hub_single(**kwargs_iterable[0])\n                if pool is None\n                else iflatmap_unordered(\n                    pool,\n                    Dataset._push_parquet_shards_to_hub_single,\n                    kwargs_iterable=kwargs_iterable,\n                )\n            )\n            for job_id, done, content in update_stream:\n                if not done:\n                    pbar.update(content)\n                else:\n                    job_additions, job_new_parquet_paths, job_uploaded_size = content\n                    additions += job_additions\n                    new_parquet_paths += job_new_parquet_paths\n                    uploaded_size += job_uploaded_size\n\n        split_info = SplitInfo(name=split, num_bytes=dataset_nbytes, num_examples=len(self))\n        return additions, new_parquet_paths, self.features, split_info, uploaded_size\n\n    def push_to_hub(\n        self,\n        repo_id: str,\n        config_name: str = \"default\",\n        set_default: Optional[bool] = None,\n        split: Optional[str] = None,\n        data_dir: Optional[str] = None,\n        commit_message: Optional[str] = None,\n        commit_description: Optional[str] = None,\n        private: Optional[bool] = None,\n        token: Optional[str] = None,\n        revision: Optional[str] = None,\n        create_pr: Optional[bool] = False,\n        max_shard_size: Optional[Union[int, str]] = None,\n        num_shards: Optional[int] = None,\n        embed_external_files: bool = True,\n        num_proc: Optional[int] = None,\n    ) -> CommitInfo:\n        \"\"\"Pushes the dataset to the hub as a Parquet dataset.\n        The dataset is pushed using HTTP requests and does not require git or git-lfs to be installed.\n\n        The resulting Parquet files are self-contained by 
default. If your dataset contains [`Image`], [`Audio`] or [`Video`]\n        data, the Parquet files will store the bytes of your images or audio files.\n        You can disable this by setting `embed_external_files` to `False`.\n\n        Args:\n            repo_id (`str`):\n                The ID of the repository to push to in the following format: `<user>/<dataset_name>` or\n                `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace\n                of the logged-in user.\n\n                It could also be a location inside a bucket, e.g. `buckets/<user_or_org>/<bucket_name>/...`\n            config_name (`str`, defaults to \"default\"):\n                The configuration name (or subset) of a dataset. Defaults to \"default\".\n            set_default (`bool`, *optional*):\n                Whether to set this configuration as the default one. Otherwise, the default configuration is the one\n                named \"default\".\n            split (`str`, *optional*):\n                The name of the split that will be given to that dataset. Defaults to `self.split`.\n            data_dir (`str`, *optional*):\n                Directory name that will contain the uploaded data files. Defaults to the `config_name` if different\n                from \"default\", else \"data\".\n\n                <Added version=\"2.17.0\"/>\n            commit_message (`str`, *optional*):\n                Message to commit while pushing. Will default to `\"Upload dataset\"`.\n            commit_description (`str`, *optional*):\n                Description of the commit that will be created.\n                Additionally, description of the PR if a PR is created (`create_pr` is True).\n\n                <Added version=\"2.16.0\"/>\n            private (`bool`, *optional*):\n                Whether to make the repo private. If `None` (default), the repo will be public unless the\n                organization's default is private. 
This value is ignored if the repo already exists.\n            token (`str`, *optional*):\n                An optional authentication token for the Hugging Face Hub. If no token is passed, will default\n                to the token saved locally when logging in with `huggingface-cli login`. Will raise an error\n                if no token is passed and the user is not logged-in.\n            revision (`str`, *optional*):\n                Branch to push the uploaded files to. Defaults to the `\"main\"` branch.\n\n                <Added version=\"2.15.0\"/>\n            create_pr (`bool`, *optional*, defaults to `False`):\n                Whether to create a PR with the uploaded files or directly commit.\n\n                <Added version=\"2.15.0\"/>\n            max_shard_size (`int` or `str`, *optional*, defaults to `\"500MB\"`):\n                The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by\n                a unit (like `\"5MB\"`).\n            num_shards (`int`, *optional*):\n                Number of shards to write. 
By default, the number of shards depends on `max_shard_size`.\n\n                <Added version=\"2.8.0\"/>\n            embed_external_files (`bool`, defaults to `True`):\n                Whether to embed file bytes in the shards.\n                In particular, this will do the following before the push for the fields of type:\n\n                - [`Audio`] and [`Image`]: remove local path information and embed file content in the Parquet files.\n            num_proc (`int`, *optional*, defaults to `None`):\n                Number of processes when preparing and uploading the dataset.\n                This is helpful if the dataset is made of many samples or media files to embed.\n                It uses the \"spawn\" context to work with hf_xet, the Rust client for fast uploads to HF.\n                Multiprocessing is disabled by default.\n\n                <Added version=\"4.0.0\"/>\n\n        Return:\n            huggingface_hub.CommitInfo\n\n        Example:\n\n        ```python\n        >>> dataset.push_to_hub(\"<organization>/<dataset_id>\")\n        >>> dataset_dict.push_to_hub(\"<organization>/<dataset_id>\", private=True)\n        >>> dataset.push_to_hub(\"<organization>/<dataset_id>\", max_shard_size=\"1GB\")\n        >>> dataset.push_to_hub(\"<organization>/<dataset_id>\", num_shards=1024)\n        ```\n\n        If your dataset has multiple splits (e.g. train/validation/test):\n\n        ```python\n        >>> train_dataset.push_to_hub(\"<organization>/<dataset_id>\", split=\"train\")\n        >>> val_dataset.push_to_hub(\"<organization>/<dataset_id>\", split=\"validation\")\n        >>> # later\n        >>> dataset = load_dataset(\"<organization>/<dataset_id>\")\n        >>> train_dataset = dataset[\"train\"]\n        >>> val_dataset = dataset[\"validation\"]\n        ```\n\n        If you want to add a new configuration (or subset) to a dataset (e.g. 
if the dataset has multiple tasks/versions/languages):\n\n        ```python\n        >>> english_dataset.push_to_hub(\"<organization>/<dataset_id>\", \"en\")\n        >>> french_dataset.push_to_hub(\"<organization>/<dataset_id>\", \"fr\")\n        >>> # later\n        >>> english_dataset = load_dataset(\"<organization>/<dataset_id>\", \"en\")\n        >>> french_dataset = load_dataset(\"<organization>/<dataset_id>\", \"fr\")\n        ```\n        \"\"\"\n        if config_name == \"data\":\n            raise ValueError(\"`config_name` cannot be 'data'. Please, choose another name for configuration.\")\n\n        if max_shard_size is not None and num_shards is not None:\n            raise ValueError(\n                \"Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both.\"\n            )\n\n        if split is None:\n            split = str(self.split) if self.split is not None else \"train\"\n\n        if not re.match(_split_re, split):\n            raise ValueError(f\"Split name should match '{_split_re}' but got '{split}'.\")\n\n        if not data_dir:\n            data_dir = config_name if config_name != \"default\" else \"data\"  # for backward compatibility\n\n        api = HfApi(endpoint=config.HF_ENDPOINT, token=token)\n        if repo_id.startswith(\"buckets/\"):\n            if BucketNotFoundError is None:\n                raise ImportError(\"Pushing datasets to buckets requires huggingface_hub>=1.6.0\")\n            _, _namespace, _bucket_name, *_path_segments = repo_id.split(\"/\")\n            try:\n                bucket_id = api.bucket_info(_namespace + \"/\" + _bucket_name).id\n            except BucketNotFoundError:\n                bucket_url = api.create_bucket(_namespace + \"/\" + _bucket_name, private=private, exist_ok=True)\n                bucket_id = bucket_url.bucket_id\n            path = \"/\".join(s for s in _path_segments if s)\n            return _push_to_bucket(\n                self,\n              
  bucket_id=bucket_id,\n                path=path,\n                config_name=config_name,\n                set_default=set_default,\n                split=split,\n                data_dir=data_dir,\n                token=token,\n                max_shard_size=max_shard_size,\n                num_shards=num_shards,\n                embed_external_files=embed_external_files,\n                num_proc=num_proc,\n            )\n        else:\n            try:\n                repo_id = api.repo_info(repo_id, repo_type=\"dataset\").id\n            except RepositoryNotFoundError:\n                repo_url = api.create_repo(\n                    repo_id,\n                    repo_type=\"dataset\",\n                    private=private,\n                    exist_ok=True,\n                )\n                repo_id = repo_url.repo_id\n\n            if revision is not None and not revision.startswith(\"refs/pr/\"):\n                # We do not call create_branch for a PR reference: 400 Bad Request\n                api.create_branch(repo_id, branch=revision, repo_type=\"dataset\", exist_ok=True)\n            return _push_to_repo(\n                self,\n                repo_id=repo_id,\n                config_name=config_name,\n                set_default=set_default,\n                split=split,\n                data_dir=data_dir,\n                commit_message=commit_message,\n                commit_description=commit_description,\n                token=token,\n                revision=revision,\n                create_pr=create_pr,\n                max_shard_size=max_shard_size,\n                num_shards=num_shards,\n                embed_external_files=embed_external_files,\n                num_proc=num_proc,\n            )\n\n    @transmit_format\n    @fingerprint_transform(inplace=False)\n    def add_column(\n        self,\n        name: str,\n        column: Union[list, np.ndarray],\n        new_fingerprint: Optional[str] = None,\n        feature: 
Optional[FeatureType] = None,\n    ):\n        \"\"\"Add column to Dataset.\n\n        <Added version=\"1.7\"/>\n\n        Args:\n            name (`str`):\n                Column name.\n            column (`list` or `np.array`):\n                Column data to be added.\n            feature (`FeatureType` or `None`, defaults to `None`):\n                Column datatype.\n\n        Returns:\n            [`Dataset`]\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> more_text = ds[\"text\"]\n        >>> ds = ds.add_column(name=\"text_2\", column=more_text)\n        >>> ds\n        Dataset({\n            features: ['text', 'label', 'text_2'],\n            num_rows: 1066\n        })\n        ```\n        \"\"\"\n\n        if feature:\n            pyarrow_schema = Features({name: feature}).arrow_schema\n        else:\n            pyarrow_schema = None\n\n        column_table = InMemoryTable.from_pydict({name: column}, schema=pyarrow_schema)\n        _check_column_names(self._data.column_names + column_table.column_names)\n        dataset = self.flatten_indices() if self._indices is not None else self\n        # Concatenate tables horizontally\n        table = concat_tables([dataset._data, column_table], axis=1)\n        # Update features\n        info = dataset.info.copy()\n        info.features.update(Features.from_arrow_schema(column_table.schema))\n        table = update_metadata_with_features(table, info.features)\n        return Dataset(table, info=info, split=self.split, indices_table=None, fingerprint=new_fingerprint)\n\n    def add_faiss_index(\n        self,\n        column: str,\n        index_name: Optional[str] = None,\n        device: Optional[int] = None,\n        string_factory: Optional[str] = None,\n        metric_type: Optional[int] = None,\n        custom_index: Optional[\"faiss.Index\"] = None,  # noqa: 
F821\n        batch_size: int = 1000,\n        train_size: Optional[int] = None,\n        faiss_verbose: bool = False,\n        dtype=np.float32,\n    ):\n        \"\"\"Add a dense index using Faiss for fast retrieval.\n        By default the index is done over the vectors of the specified column.\n        You can specify `device` if you want to run it on GPU (`device` must be the GPU index).\n        You can find more information about Faiss here:\n\n        - For [string factory](https://github.com/facebookresearch/faiss/wiki/The-index-factory)\n\n        Args:\n            column (`str`):\n                The column of the vectors to add to the index.\n            index_name (`str`, *optional*):\n                The `index_name`/identifier of the index.\n                This is the `index_name` that is used to call [`~datasets.Dataset.get_nearest_examples`] or [`~datasets.Dataset.search`].\n                By default it corresponds to `column`.\n            device (`Union[int, List[int]]`, *optional*):\n                If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.\n                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.\n            string_factory (`str`, *optional*):\n                This is passed to the index factory of Faiss to create the index.\n                Default index class is `IndexFlat`.\n            metric_type (`int`, *optional*):\n                Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.\n            custom_index (`faiss.Index`, *optional*):\n                Custom Faiss index that you already have instantiated and configured for your needs.\n            batch_size (`int`):\n                Size of the batch to use while adding vectors to the `FaissIndex`. 
Default value is `1000`.\n                <Added version=\"2.4.0\"/>\n            train_size (`int`, *optional*):\n                If the index needs a training step, specifies how many vectors will be used to train the index.\n            faiss_verbose (`bool`, defaults to `False`):\n                Enable the verbosity of the Faiss index.\n            dtype (`data-type`):\n                The dtype of the numpy arrays that are indexed.\n                Default is `np.float32`.\n\n        Example:\n\n        ```python\n        >>> ds = datasets.load_dataset('community-datasets/crime_and_punish', split='train')\n        >>> ds_with_embeddings = ds.map(lambda example: {'embeddings': embed(example['line'])})\n        >>> ds_with_embeddings.add_faiss_index(column='embeddings')\n        >>> # query\n        >>> scores, retrieved_examples = ds_with_embeddings.get_nearest_examples('embeddings', embed('my new query'), k=10)\n        >>> # save index\n        >>> ds_with_embeddings.save_faiss_index('embeddings', 'my_index.faiss')\n\n        >>> ds = datasets.load_dataset('community-datasets/crime_and_punish', split='train')\n        >>> # load index\n        >>> ds.load_faiss_index('embeddings', 'my_index.faiss')\n        >>> # query\n        >>> scores, retrieved_examples = ds.get_nearest_examples('embeddings', embed('my new query'), k=10)\n        ```\n        \"\"\"\n        with self.formatted_as(type=\"numpy\", columns=[column], dtype=dtype):\n            super().add_faiss_index(\n                column=column,\n                index_name=index_name,\n                device=device,\n                string_factory=string_factory,\n                metric_type=metric_type,\n                custom_index=custom_index,\n                batch_size=batch_size,\n                train_size=train_size,\n                faiss_verbose=faiss_verbose,\n            )\n        return self\n\n    def add_faiss_index_from_external_arrays(\n        self,\n        external_arrays: 
np.array,\n        index_name: str,\n        device: Optional[int] = None,\n        string_factory: Optional[str] = None,\n        metric_type: Optional[int] = None,\n        custom_index: Optional[\"faiss.Index\"] = None,  # noqa: F821\n        batch_size: int = 1000,\n        train_size: Optional[int] = None,\n        faiss_verbose: bool = False,\n        dtype=np.float32,\n    ):\n        \"\"\"Add a dense index using Faiss for fast retrieval.\n        The index is created using the vectors of `external_arrays`.\n        You can specify `device` if you want to run it on GPU (`device` must be the GPU index).\n        You can find more information about Faiss here:\n\n        - For [string factory](https://github.com/facebookresearch/faiss/wiki/The-index-factory)\n\n        Args:\n            external_arrays (`np.array`):\n                If you want to use arrays from outside the lib for the index, you can set `external_arrays`.\n                It will use `external_arrays` to create the Faiss index instead of the arrays in the given `column`.\n            index_name (`str`):\n                The `index_name`/identifier of the index.\n                This is the `index_name` that is used to call [`~datasets.Dataset.get_nearest_examples`] or [`~datasets.Dataset.search`].\n            device (Optional `Union[int, List[int]]`, *optional*):\n                If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.\n                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.\n            string_factory (`str`, *optional*):\n                This is passed to the index factory of Faiss to create the index.\n                Default index class is `IndexFlat`.\n            metric_type (`int`, *optional*):\n                Type of metric. 
Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.\n            custom_index (`faiss.Index`, *optional*):\n                Custom Faiss index that you already have instantiated and configured for your needs.\n            batch_size (`int`, *optional*):\n                Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.\n                <Added version=\"2.4.0\"/>\n            train_size (`int`, *optional*):\n                If the index needs a training step, specifies how many vectors will be used to train the index.\n            faiss_verbose (`bool`, defaults to False):\n                Enable the verbosity of the Faiss index.\n            dtype (`numpy.dtype`):\n                The dtype of the numpy arrays that are indexed. Default is np.float32.\n        \"\"\"\n        super().add_faiss_index_from_external_arrays(\n            external_arrays=external_arrays.astype(dtype),\n            index_name=index_name,\n            device=device,\n            string_factory=string_factory,\n            metric_type=metric_type,\n            custom_index=custom_index,\n            batch_size=batch_size,\n            train_size=train_size,\n            faiss_verbose=faiss_verbose,\n        )\n\n    def add_elasticsearch_index(\n        self,\n        column: str,\n        index_name: Optional[str] = None,\n        host: Optional[str] = None,\n        port: Optional[int] = None,\n        es_client: Optional[\"elasticsearch.Elasticsearch\"] = None,  # noqa: F821\n        es_index_name: Optional[str] = None,\n        es_index_config: Optional[dict] = None,\n    ):\n        \"\"\"Add a text index using ElasticSearch for fast retrieval. 
This is done in-place.\n\n        Args:\n            column (`str`):\n                The column of the documents to add to the index.\n            index_name (`str`, *optional*):\n                The `index_name`/identifier of the index.\n                This is the index name that is used to call [`~Dataset.get_nearest_examples`] or [`~Dataset.search`].\n                By default it corresponds to `column`.\n            host (`str`, *optional*, defaults to `localhost`):\n                Host of where ElasticSearch is running.\n            port (`int`, *optional*, defaults to `9200`):\n                Port of where ElasticSearch is running.\n            es_client (`elasticsearch.Elasticsearch`, *optional*):\n                The elasticsearch client used to create the index if host and port are `None`.\n            es_index_name (`str`, *optional*):\n                The elasticsearch index name used to create the index.\n            es_index_config (`dict`, *optional*):\n                The configuration of the elasticsearch index.\n                Default config is:\n                    ```\n                    {\n                        \"settings\": {\n                            \"number_of_shards\": 1,\n                            \"analysis\": {\"analyzer\": {\"stop_standard\": {\"type\": \"standard\", \" stopwords\": \"_english_\"}}},\n                        },\n                        \"mappings\": {\n                            \"properties\": {\n                                \"text\": {\n                                    \"type\": \"text\",\n                                    \"analyzer\": \"standard\",\n                                    \"similarity\": \"BM25\"\n                                },\n                            }\n                        },\n                    }\n                    ```\n        Example:\n\n        ```python\n        >>> es_client = elasticsearch.Elasticsearch()\n        >>> ds = 
datasets.load_dataset('community-datasets/crime_and_punish', split='train')\n        >>> ds.add_elasticsearch_index(column='line', es_client=es_client, es_index_name=\"my_es_index\")\n        >>> scores, retrieved_examples = ds.get_nearest_examples('line', 'my new query', k=10)\n        ```\n        \"\"\"\n        with self.formatted_as(type=None, columns=[column]):\n            super().add_elasticsearch_index(\n                column=column,\n                index_name=index_name,\n                host=host,\n                port=port,\n                es_client=es_client,\n                es_index_name=es_index_name,\n                es_index_config=es_index_config,\n            )\n        return self\n\n    @transmit_format\n    @fingerprint_transform(inplace=False)\n    def add_item(self, item: dict, new_fingerprint: Optional[str] = None):\n        \"\"\"Add item to Dataset.\n\n        <Added version=\"1.7\"/>\n\n        Args:\n            item (`dict`):\n                Item data to be added.\n\n        Returns:\n            [`Dataset`]\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> new_review = {'label': 0, 'text': 'this movie is the absolute worst thing I have ever seen'}\n        >>> ds = ds.add_item(new_review)\n        >>> ds[-1]\n        {'label': 0, 'text': 'this movie is the absolute worst thing I have ever seen'}\n        ```\n        \"\"\"\n        item_table = InMemoryTable.from_pydict({k: [v] for k, v in item.items()})\n        # We don't call _check_if_features_can_be_aligned here so this cast is \"unsafe\"\n        dset_features, item_features = _align_features(\n            [self._info.features, Features.from_arrow_schema(item_table.schema)]\n        )\n        # Cast to align the schemas of the tables and concatenate the tables\n        table = concat_tables(\n            [\n                
self._data.cast(dset_features.arrow_schema) if self._info.features != dset_features else self._data,\n                item_table.cast(item_features.arrow_schema),\n            ]\n        )\n        if self._indices is None:\n            indices_table = None\n        else:\n            item_indices_array = pa.array([len(self._data)], type=pa.uint64())\n            item_indices_table = InMemoryTable.from_arrays([item_indices_array], names=[\"indices\"])\n            indices_table = concat_tables([self._indices, item_indices_table])\n        info = self.info.copy()\n        info.features.update(item_features)\n        table = update_metadata_with_features(table, info.features)\n        return Dataset(\n            table,\n            info=info,\n            split=self.split,\n            indices_table=indices_table,\n            fingerprint=new_fingerprint,\n        )\n\n    def align_labels_with_mapping(self, label2id: dict, label_column: str) -> \"Dataset\":\n        \"\"\"Align the dataset's label ID and label name mapping to match an input `label2id` mapping.\n        This is useful when you want to ensure that a model's predicted labels are aligned with the dataset.\n        The alignment is done using the lowercase label names.\n\n        Args:\n            label2id (`dict`):\n                The label name to ID mapping to align the dataset with.\n            label_column (`str`):\n                The column name of labels to align on.\n\n        Example:\n\n        ```python\n        >>> # dataset with mapping {'entailment': 0, 'neutral': 1, 'contradiction': 2}\n        >>> ds = load_dataset(\"nyu-mll/glue\", \"mnli\", split=\"train\")\n        >>> # mapping to align with\n        >>> label2id = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2}\n        >>> ds_aligned = ds.align_labels_with_mapping(label2id, \"label\")\n        ```\n\n        \"\"\"\n        # Sanity checks\n        if label_column not in self._data.column_names:\n            raise 
ValueError(f\"Column ({label_column}) not in table columns ({self._data.column_names}).\")\n\n        label_feature = self._info.features[label_column]\n        if not (\n            isinstance(label_feature, ClassLabel)\n            or (isinstance(label_feature, Sequence) and isinstance(label_feature.feature, ClassLabel))\n        ):\n            raise ValueError(\n                f\"Aligning labels with a mapping is only supported for {ClassLabel.__name__} column or {Sequence.__name__} column with the inner type {ClassLabel.__name__}, and column {label_feature} is of type {type(label_feature).__name__}.\"\n            )\n\n        # Sort input mapping by ID value to ensure the label names are aligned\n        label2id = dict(sorted(label2id.items(), key=lambda item: item[1]))\n        label_names = list(label2id.keys())\n        # Some label mappings use uppercase label names so we lowercase them during alignment\n        label2id = {k.lower(): v for k, v in label2id.items()}\n        int2str_function = (\n            label_feature.int2str if isinstance(label_feature, ClassLabel) else label_feature.feature.int2str\n        )\n\n        if isinstance(label_feature, ClassLabel):\n\n            def process_label_ids(batch):\n                dset_label_names = [\n                    int2str_function(label_id).lower() if label_id is not None else None\n                    for label_id in batch[label_column]\n                ]\n                batch[label_column] = [\n                    label2id[label_name] if label_name is not None else None for label_name in dset_label_names\n                ]\n                return batch\n\n        else:\n\n            def process_label_ids(batch):\n                dset_label_names = [\n                    [int2str_function(label_id).lower() if label_id is not None else None for label_id in seq]\n                    for seq in batch[label_column]\n                ]\n                batch[label_column] = [\n                    
[label2id[label_name] if label_name is not None else None for label_name in seq]\n                    for seq in dset_label_names\n                ]\n                return batch\n\n        features = self.features\n        features[label_column] = (\n            ClassLabel(num_classes=len(label_names), names=label_names)\n            if isinstance(label_feature, ClassLabel)\n            else List(ClassLabel(num_classes=len(label_names), names=label_names))\n        )\n        return self.map(process_label_ids, features=features, batched=True, desc=\"Aligning the labels\")\n\n\ndef _push_to_repo(\n    dset: Union[\"Dataset\", \"IterableDataset\"],\n    repo_id: str,\n    config_name: str = \"default\",\n    set_default: Optional[bool] = None,\n    split: Optional[str] = None,\n    data_dir: Optional[str] = None,\n    commit_message: Optional[str] = None,\n    commit_description: Optional[str] = None,\n    token: Optional[str] = None,\n    revision: Optional[str] = None,\n    create_pr: Optional[bool] = False,\n    max_shard_size: Optional[Union[int, str]] = None,\n    num_shards: Optional[int] = None,\n    embed_external_files: bool = True,\n    num_proc: Optional[int] = None,\n) -> CommitInfo:\n    api = HfApi(endpoint=config.HF_ENDPOINT, token=token)\n    resolved_output_path = HfFileSystemResolvedRepositoryPath(\n        repo_id=repo_id, repo_type=\"dataset\", revision=revision or \"main\", path_in_repo=\"\"\n    )\n\n    additions, new_parquet_paths, features, split_info, uploaded_size = dset._push_parquet_shards_to_hub(\n        resolved_output_path=resolved_output_path,\n        data_dir=data_dir,\n        split=split,\n        token=token,\n        max_shard_size=max_shard_size,\n        num_shards=num_shards,\n        create_pr=create_pr,\n        embed_external_files=embed_external_files,\n        num_proc=num_proc,\n    )\n\n    commit_message = commit_message if commit_message is not None else \"Upload dataset\"\n    if len(additions) > 
config.UPLOADS_MAX_NUMBER_PER_COMMIT:\n        logger.info(\n            f\"Number of files to upload is larger than {config.UPLOADS_MAX_NUMBER_PER_COMMIT}. Splitting the push into multiple commits.\"\n        )\n        num_commits = math.ceil(len(additions) / config.UPLOADS_MAX_NUMBER_PER_COMMIT)\n        for i in range(0, num_commits):\n            operations = additions[\n                i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT\n            ]\n            for retry, sleep_time in enumerate(itertools.chain(range(10), itertools.repeat(30)), start=1):\n                # We need to retry if another commit happens at the same time\n                sleep_time *= 1 + random.random()\n                try:\n                    commit_info = api.create_commit(\n                        repo_id,\n                        operations=operations,\n                        commit_message=commit_message + f\" (part {i:05d}-of-{num_commits:05d})\",\n                        commit_description=commit_description,\n                        repo_type=\"dataset\",\n                        revision=revision,\n                        create_pr=create_pr,\n                    )\n                except HfHubHTTPError as err:\n                    if (\n                        err.__context__\n                        and isinstance(err.__context__, HfHubHTTPError)\n                        and err.__context__.response.status_code == 409\n                    ):\n                        # 409 is Conflict (another commit is in progress)\n                        time.sleep(sleep_time)\n                        logger.info(\n                            f\"Retrying intermediate commit for {repo_id}, {config_name} ({retry}/n with status_code {err.__context__.response.status_code})\"\n                        )\n                        continue\n                    else:\n                        raise\n                break\n            logger.info(\n     
           f\"Commit #{i + 1} completed\"\n                + (f\" (still {num_commits - i - 1} to go)\" if num_commits - i - 1 else \"\")\n                + \".\"\n            )\n        last_commit_additions = []\n    else:\n        last_commit_additions = additions\n\n    for retry, sleep_time in enumerate(itertools.chain(range(10), itertools.repeat(30)), start=1):\n        # We need to retry if there was a commit in between in case it touched the dataset card data\n        sleep_time *= 1 + random.random()\n\n        # We make sure to get info from this commit\n        parent_commit = api.repo_info(repo_id, repo_type=\"dataset\", revision=revision).sha\n        hf_path = HfFileSystemResolvedRepositoryPath(\n            repo_type=resolved_output_path.repo_type,\n            repo_id=resolved_output_path.repo_id,\n            revision=parent_commit,\n            path_in_repo=resolved_output_path.path_in_repo,\n        ).unresolve()\n        hffs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token)\n        dirfs = DirFileSystem(fs=hffs, path=hf_path)\n\n        # Check the files to delete\n        try:\n            files_to_delete = dirfs.glob(f\"{data_dir}/{split}-*\", detail=True)\n        except EntryNotFoundError:  # needed for huggingface_hub<=1.7.1\n            files_to_delete = {}\n\n        # Don't delete the new files\n        deletions = [\n            CommitOperationDelete(path_in_repo=file_to_delete)\n            for file_to_delete in files_to_delete\n            if file_to_delete not in new_parquet_paths\n        ]\n        deleted_size = sum(file_info[\"size\"] for file_info in files_to_delete.values())\n\n        # Update the dataset card\n        new_dataset_card, new_legacy_dataset_infos = _get_updated_dataset_card(\n            fs=dirfs,\n            config_name=config_name,\n            splits_info=[split_info],\n            features=features,\n            data_dir=data_dir,\n            set_default=set_default,\n            
uploaded_sizes=[uploaded_size],\n            deleted_sizes=[deleted_size],\n            remove_other_splits=False,\n        )\n        dataset_card_additions = [\n            CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(new_dataset_card).encode())\n        ]\n        if new_legacy_dataset_infos:\n            dataset_card_additions.append(\n                CommitOperationAdd(\n                    path_in_repo=config.DATASETDICT_INFOS_FILENAME,\n                    path_or_fileobj=json.dumps(new_legacy_dataset_infos).encode(\"utf-8\"),\n                )\n            )\n        operations = last_commit_additions + dataset_card_additions + deletions\n\n        try:\n            commit_info = api.create_commit(\n                repo_id,\n                operations=operations,\n                commit_message=commit_message,\n                commit_description=commit_description,\n                repo_type=\"dataset\",\n                revision=revision,\n                create_pr=create_pr,\n                parent_commit=parent_commit,\n            )\n        except HfHubHTTPError as err:\n            if (\n                err.__context__\n                and isinstance(err.__context__, HfHubHTTPError)\n                and err.__context__.response.status_code in (412, 409)\n            ):\n                # 412 is Precondition failed (parent_commit isn't satisfied)\n                # 409 is Conflict (another commit is in progress)\n                time.sleep(sleep_time)\n                logger.info(\n                    f\"Retrying commit for {repo_id}, {config_name} ({retry}/n with status_code {err.__context__.response.status_code})\"\n                )\n                continue\n            else:\n                raise\n        break\n\n    return commit_info\n\n\ndef _push_to_bucket(\n    dset: Union[\"Dataset\", \"IterableDataset\"],\n    bucket_id: str,\n    path: str,\n    config_name: str = \"default\",\n    set_default: 
Optional[bool] = None,\n    split: Optional[str] = None,\n    data_dir: Optional[str] = None,\n    token: Optional[str] = None,\n    create_pr: Optional[bool] = False,\n    max_shard_size: Optional[Union[int, str]] = None,\n    num_shards: Optional[int] = None,\n    embed_external_files: bool = True,\n    num_proc: Optional[int] = None,\n) -> None:\n    api = HfApi(endpoint=config.HF_ENDPOINT, token=token)\n    resolved_output_path = HfFileSystemResolvedBucketPath(bucket_id=bucket_id, path=path)\n    hf_path = resolved_output_path.unresolve()\n    hffs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token)\n    dirfs = DirFileSystem(fs=hffs, path=hf_path)\n\n    # Check the files to delete before uploading\n    try:\n        files_to_delete = dirfs.glob(f\"{data_dir}/{split}-*\", detail=True)\n    except EntryNotFoundError:  # needed for huggingface_hub<=1.7.1\n        files_to_delete = {}\n\n    # Upload the Parquet files\n    _, new_parquet_paths, features, split_info, uploaded_size = dset._push_parquet_shards_to_hub(\n        resolved_output_path=resolved_output_path,\n        data_dir=data_dir,\n        split=split,\n        token=token,\n        max_shard_size=max_shard_size,\n        num_shards=num_shards,\n        create_pr=create_pr,\n        embed_external_files=embed_external_files,\n        num_proc=num_proc,\n    )\n\n    # Don't delete the new files\n    new_parquet_paths = set(new_parquet_paths)\n    delete = [file_to_delete for file_to_delete in files_to_delete if file_to_delete not in new_parquet_paths]\n    deleted_size = sum(file_info[\"size\"] for file_info in files_to_delete.values())\n\n    # Update the dataset card\n    new_dataset_card, new_legacy_dataset_infos = _get_updated_dataset_card(\n        fs=dirfs,\n        config_name=config_name,\n        splits_info=[split_info],\n        features=features,\n        data_dir=data_dir,\n        set_default=set_default,\n        uploaded_sizes=[uploaded_size],\n        
deleted_sizes=[deleted_size],\n        remove_other_splits=False,\n    )\n    path_prefix = (path + \"/\") if path else \"\"\n    add = [(str(new_dataset_card).encode(), path_prefix + config.REPOCARD_FILENAME)]\n    if new_legacy_dataset_infos:\n        add.append(\n            (json.dumps(new_legacy_dataset_infos).encode(\"utf-8\"), path_prefix + config.DATASETDICT_INFOS_FILENAME)\n        )\n\n    # Upload dataset card and delete old files\n    api.batch_bucket_files(\n        bucket_id=bucket_id,\n        add=add,\n        delete=delete,\n    )\n\n\ndef _get_updated_dataset_card(\n    fs: DirFileSystem,\n    config_name: str,\n    splits_info: list[SplitInfo],\n    features: Features,\n    data_dir: str,\n    set_default: Optional[bool],\n    uploaded_sizes: list[int],\n    deleted_sizes: list[int],\n    remove_other_splits: bool,\n) -> tuple[DatasetCard, Optional[dict]]:\n    \"\"\"Update a dataset card in push_to_hub\"\"\"\n    # get the deprecated dataset_infos.json to update them\n    try:\n        legacy_dataset_info: dict = json.loads(fs.read_text(config.DATASETDICT_INFOS_FILENAME, encoding=\"utf-8\")).get(\n            config_name, None\n        )\n        repo_info = DatasetInfo.from_dict(legacy_dataset_info) if legacy_dataset_info else None\n    except FileNotFoundError:\n        legacy_dataset_info = None\n        repo_info = None\n    # get the info from the README to update them\n    try:\n        dataset_card = DatasetCard(fs.read_text(config.REPOCARD_FILENAME, newline=\"\", encoding=\"utf-8\"))\n        dataset_card_data = dataset_card.data\n        metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)\n        dataset_infos: DatasetInfosDict = DatasetInfosDict.from_dataset_card_data(dataset_card_data)\n        if dataset_infos and config_name in dataset_infos:\n            repo_info = dataset_infos[config_name]\n        else:\n            repo_info = None\n    except FileNotFoundError:\n        dataset_card = None\n        
dataset_card_data = DatasetCardData()\n        metadata_configs = MetadataConfigs()\n    # update the total info to dump from existing info\n    if repo_info is not None and not remove_other_splits:\n        logger.info(\"Updating downloaded metadata with the new split\" + (\"s.\" if len(splits_info) > 1 else \".\"))\n        for split_info, deleted_size, uploaded_size in zip(splits_info, deleted_sizes, uploaded_sizes):\n            split = split_info.name\n            if repo_info.splits and any(s != split for s in repo_info.splits):\n                if features != repo_info.features:\n                    raise ValueError(\n                        f\"Features of the new split don't match the features of the existing splits on the hub: {features} != {repo_info.features}\"\n                    )\n\n            if split in repo_info.splits:\n                repo_info.download_size -= deleted_size\n                repo_info.dataset_size -= repo_info.splits.get(split, SplitInfo()).num_bytes or 0\n\n            repo_info.download_checksums = None\n            repo_info.download_size = (repo_info.download_size or 0) + uploaded_size\n            repo_info.dataset_size = (repo_info.dataset_size or 0) + split_info.num_bytes\n            repo_info.size_in_bytes = repo_info.download_size + repo_info.dataset_size\n            repo_info.splits.pop(split, None)\n            repo_info.splits[split] = split_info\n        info_to_dump = repo_info\n    else:\n        info_to_dump = DatasetInfo(\n            config_name=config_name, features=features, splits=SplitDict(), download_size=0, dataset_size=0\n        )\n        for split_info, uploaded_size in zip(splits_info, uploaded_sizes):\n            info_to_dump.splits.add(split_info)\n            info_to_dump.download_size += uploaded_size\n            info_to_dump.dataset_size += split_info.num_bytes\n        info_to_dump.size_in_bytes = info_to_dump.download_size + info_to_dump.dataset_size\n    # create the metadata configs if 
it was uploaded with push_to_hub before metadata configs existed\n    repo_splits: list[str] = []\n    pattern = glob_pattern_to_regex(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED)\n    for file_path in fs.glob(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED.replace(\"{split}\", \"*\")):\n        split_pattern_fields = string_to_dict(file_path, pattern)\n        assert split_pattern_fields is not None\n        repo_split = split_pattern_fields[\"split\"]\n        if repo_split not in repo_splits:\n            repo_splits.append(repo_split)\n    if not metadata_configs and repo_splits:\n        default_metadata_configs_to_dump = {\n            \"data_files\": [{\"split\": split, \"path\": f\"data/{split}-*\"} for split in repo_splits]\n        }\n        MetadataConfigs({\"default\": default_metadata_configs_to_dump}).to_dataset_card_data(dataset_card_data)\n    # update the metadata configs\n    if config_name in metadata_configs:\n        metadata_config = metadata_configs[config_name]\n        if \"data_files\" in metadata_config:\n            data_files_to_dump = sanitize_patterns(metadata_config[\"data_files\"])\n        else:\n            data_files_to_dump = {}\n        # add the new splits\n        for split_info in splits_info:\n            split = split_info.name\n            data_files_to_dump[split] = [f\"{data_dir}/{split}-*\"]\n        metadata_config_to_dump = {\n            \"data_files\": [\n                {\n                    \"split\": _split,\n                    \"path\": _pattern[0] if len(_pattern) == 1 else _pattern,\n                }\n                for _split, _pattern in data_files_to_dump.items()\n            ]\n        }\n    else:\n        metadata_config_to_dump = {\n            \"data_files\": [\n                {\"split\": split_info.name, \"path\": f\"{data_dir}/{split_info.name}-*\"} for split_info in splits_info\n            ]\n        }\n    configs_to_dump = {config_name: metadata_config_to_dump}\n  
  if set_default and config_name != \"default\":\n        if metadata_configs:\n            current_default_config_name = metadata_configs.get_default_config_name()\n            if current_default_config_name == \"default\":\n                raise ValueError(\n                    \"There exists a configuration named 'default'. To set a different configuration as default, \"\n                    \"rename the 'default' one first.\"\n                )\n            if current_default_config_name:\n                _ = metadata_configs[current_default_config_name].pop(\"default\")\n                configs_to_dump[current_default_config_name] = metadata_configs[current_default_config_name]\n        metadata_config_to_dump[\"default\"] = True\n    # push to the deprecated dataset_infos.json\n    if legacy_dataset_info:\n        legacy_dataset_infos: dict = json.loads(fs.read_text(config.DATASETDICT_INFOS_FILENAME, encoding=\"utf-8\"))\n        legacy_dataset_infos[config_name] = asdict(info_to_dump)\n        new_legacy_dataset_infos = json.dumps(dataset_infos, indent=4)\n    else:\n        new_legacy_dataset_infos = None\n    # push to README\n    DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)\n    MetadataConfigs(configs_to_dump).to_dataset_card_data(dataset_card_data)\n    new_dataset_card = DatasetCard(f\"---\\n{dataset_card_data}\\n---\\n\") if dataset_card is None else dataset_card\n    return new_dataset_card, new_legacy_dataset_infos\n\n\ndef _concatenate_map_style_datasets(\n    dsets: list[Dataset],\n    info: Optional[DatasetInfo] = None,\n    split: Optional[NamedSplit] = None,\n    axis: int = 0,\n):\n    \"\"\"\n    Converts a list of :class:`Dataset` with the same schema into a single :class:`Dataset`.\n    When you concatenate on axis 0, missing data are filled with None values.\n\n    Args:\n        dsets (`List[datasets.Dataset]`): List of Datasets to concatenate.\n        info (:class:`DatasetInfo`, optional): 
Dataset information, like description, citation, etc.\n        split (:class:`NamedSplit`, optional): Name of the dataset split.\n        axis (``{0, 1}``, default ``0``, meaning over rows):\n            Axis to concatenate over, where ``0`` means over rows (vertically) and ``1`` means over columns\n            (horizontally).\n\n            *New in version 1.6.0*\n\n    Example:\n\n    ```py\n    >>> ds3 = _concatenate_map_style_datasets([ds1, ds2])\n    ```\n    \"\"\"\n    # Ignore datasets with no rows\n    if any(dset.num_rows > 0 for dset in dsets):\n        dsets = [dset for dset in dsets if dset.num_rows > 0]\n    else:\n        # Return first dataset if all datasets are empty\n        return dsets[0]\n\n    # Perform checks (and a potential cast if axis=0)\n    if axis == 0:\n        _check_if_features_can_be_aligned([dset.features for dset in dsets])\n    else:\n        if not all(dset.num_rows == dsets[0].num_rows for dset in dsets):\n            raise ValueError(\"Number of rows must match for all datasets\")\n        _check_column_names([col_name for dset in dsets for col_name in dset._data.column_names])\n\n    # Find common format or reset format\n    format = dsets[0].format\n    if any(dset.format != format for dset in dsets):\n        format = {}\n        logger.info(\"Some of the datasets have disparate format. 
Resetting the format of the concatenated dataset.\")\n\n    def apply_offset_to_indices_table(table, offset):\n        if offset == 0:\n            return table\n        else:\n            array = table[\"indices\"]\n            new_array = pc.add(array, pa.scalar(offset, type=pa.uint64()))\n            return InMemoryTable.from_arrays([new_array], names=[\"indices\"])\n\n    # Concatenate indices if they exist\n    if any(dset._indices is not None for dset in dsets):\n        if axis == 0:\n            # Datasets with no indices tables are replaced with a dataset with an indices table in memory.\n            # Applying an offset to an indices table also brings the table in memory.\n            indices_tables = []\n            for i in range(len(dsets)):\n                if dsets[i]._indices is None:\n                    dsets[i] = dsets[i]._select_with_indices_mapping(range(len(dsets[i])))\n                indices_tables.append(dsets[i]._indices)\n\n            # An offset needs to be applied to the indices before concatenating\n            offset = 0\n            for i in range(len(dsets)):\n                indices_tables[i] = apply_offset_to_indices_table(indices_tables[i], offset)\n                offset += len(dsets[i]._data)\n\n            # Concatenate indices\n            indices_tables = [t for t in indices_tables if len(t) > 0]\n            if indices_tables:\n                indices_table = concat_tables(indices_tables)\n            else:\n                indices_table = InMemoryTable.from_batches([], schema=pa.schema({\"indices\": pa.int64()}))\n        else:\n            if len(dsets) == 1:\n                indices_table = dsets[0]._indices\n            else:\n                for i in range(len(dsets)):\n                    dsets[i] = dsets[i].flatten_indices()\n                indices_table = None\n    else:\n        indices_table = None\n\n    table = concat_tables([dset._data for dset in dsets], axis=axis)\n    if axis == 0:\n        features_list = 
_align_features([dset.features for dset in dsets])\n    else:\n        features_list = [dset.features for dset in dsets]\n    table = update_metadata_with_features(table, {k: v for features in features_list for k, v in features.items()})\n\n    # Concatenate infos\n    if info is None:\n        info = DatasetInfo.from_merge([dset.info for dset in dsets])\n    fingerprint = update_fingerprint(\n        \"\".join(dset._fingerprint for dset in dsets), _concatenate_map_style_datasets, {\"info\": info, \"split\": split}\n    )\n\n    # Make final concatenated dataset\n    concatenated_dataset = Dataset(\n        table,\n        info=info,\n        split=split,\n        indices_table=indices_table,\n        fingerprint=fingerprint,\n    )\n    concatenated_dataset.set_format(**format)\n    return concatenated_dataset\n\n\ndef _interleave_map_style_datasets(\n    datasets: list[\"Dataset\"],\n    probabilities: Optional[list[float]] = None,\n    seed: Optional[int] = None,\n    info: Optional[DatasetInfo] = None,\n    split: Optional[NamedSplit] = None,\n    stopping_strategy: Literal[\n        \"first_exhausted\", \"all_exhausted\", \"all_exhausted_without_replacement\"\n    ] = \"first_exhausted\",\n    **kwargs,\n) -> \"Dataset\":\n    \"\"\"\n    Interleave several map-style datasets (sources) into a single map-style dataset.\n    The new dataset is constructed by alternating between the sources to get the examples.\n    If `probabilities = None` (default) the new dataset is constructed by cycling between each source to get the examples.\n    If `probabilities` is not `None`, the new dataset is constructed by getting examples from a random source at a time according to the provided probabilities.\n\n    Args:\n        datasets (`List[Dataset]`): list of datasets to interleave\n        probabilities (`List[float]`, optional, default None): If specified, the new dataset is constructed by sampling\n            examples from one source at a time according to these 
probabilities.\n        seed (`int`, optional, default None): The random seed used to choose a source for each example.\n        info (:class:`DatasetInfo`, optional): Dataset information, like description, citation, etc.\n        split (:class:`NamedSplit`, optional): Name of the dataset split.\n        stopping_strategy (`str`, defaults to `first_exhausted`):\n            Three strategies are proposed right now.\n            By default, `first_exhausted` is an undersampling strategy, i.e. the dataset construction is stopped as soon as one dataset has run out of samples.\n            If the strategy is `all_exhausted`, we use an oversampling strategy, i.e. the dataset construction is stopped as soon as every sample of every dataset has been added at least once.\n            When the strategy is `all_exhausted_without_replacement`, we make sure that each sample in each dataset is sampled only once.\n            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:\n            - with no probabilities, the resulting dataset will have max_length_datasets*nb_dataset samples.\n            - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.\n        **kwargs (additional keyword arguments): Keyword arguments to be passed to :meth:`datasets.Dataset.select` when selecting the indices used to interleave the datasets.\n\n    Output:\n        :class:`datasets.Dataset`\n    \"\"\"\n    if stopping_strategy not in [\"first_exhausted\", \"all_exhausted\", \"all_exhausted_without_replacement\"]:\n        raise ValueError(\n            f\"{stopping_strategy} stopping strategy in `interleave_datasets` is not implemented yet with a list of {type(datasets[0])}\"\n        )\n\n    # To interleave the datasets, we concatenate them and then we re-order the indices\n    concatenated_datasets = _concatenate_map_style_datasets(datasets, info=info, split=split)\n\n    # Let's now 
build the indices to pass to .select()\n    lengths = [len(dset) for dset in datasets]\n    offsets = np.cumsum([0] + lengths[:-1])\n\n    # if stopping_strategy is \"first_exhausted\", it is an undersampling situation whereas it is an oversampling situation if it is \"all_exhausted\"\n    oversampling = stopping_strategy == \"all_exhausted\"\n\n    if probabilities is None and stopping_strategy == \"all_exhausted_without_replacement\":\n        # Without replacement situation with cycling between each sources\n        # Example: If lengths of the datasets are [3, 4, 3]\n        # Then the resulting indices should be [0, 3, 7, 1, 4, 8, 2, 5, 9, 6]\n        # We cycle through datasets until all are exhausted, but skip exhausted datasets\n\n        # Reasoning behind the following operation: keeping the first indices of each dataset\n        # while offsetting in order to correspond to the right indices of the concatenated dataset\n        # and flattening to effectively interleave the datasets. 
Then we remove the exhausted datasets\n        # and we continue with the following indices, until all datasets are exhausted\n        chunks_boundaries = [0] + sorted(set(lengths))\n        chunks = zip(chunks_boundaries[:-1], chunks_boundaries[1:])\n        indices_chunks = []\n        for start, end in chunks:\n            indices_chunks.append((np.array(offsets).reshape(1, -1) + np.arange(start, end).reshape(-1, 1)).flatten())\n            exhausted_indices = [i for i in range(len(lengths)) if lengths[i] == end]\n            lengths = np.delete(lengths, exhausted_indices).tolist()\n            offsets = np.delete(offsets, exhausted_indices)\n        indices = np.concatenate(indices_chunks).tolist()\n\n    elif probabilities is None and not oversampling:\n        # Undersampling situation with cycling between the sources\n        # Example: If lengths of the datasets are [3, 4, 5]\n        # Then the resulting indices should be [0, 3, 7, 1, 4, 8, 2, 5, 9]\n        # Note that we only have 3 examples per dataset since the first dataset ran out of examples\n\n        # Reasoning behind the following operation: keeping the min_length first indices of each dataset\n        # while offsetting in order to correspond to the right indices of the concatenated dataset\n        # and flattening to effectively interleave the datasets\n        indices = (offsets.reshape(1, -1) + np.arange(min(lengths)).reshape(-1, 1)).flatten().tolist()\n    elif probabilities is None:\n        # Oversampling situation with cycling between the sources\n        # Example: If lengths of the datasets are [3, 4, 5]\n        # Then the resulting indices should be [0, 3, 7, 1, 4, 8, 2, 5, 9, 0, 6, 10, 1, 3, 11]\n        # Note that we have 5 examples per dataset with a rolling window since the longest dataset has 5 samples\n\n        # Reasoning behind the following operation: for each dataset indices (i.e. column) repeat the indices to have max_length indices per dataset\n        # For example, if the max_length is 5 and the i-th dataset has 3 samples, 
the i-th column will be [0,1,2,0,1]\n        indices = np.mod(np.arange(max(lengths)).reshape(-1, 1), np.array(lengths).reshape(1, -1))\n\n        # We have to keep the indices to their respective dataset offsets and to flatten to effectively interleave the datasets\n        indices = (indices + offsets).flatten().tolist()\n\n    else:\n        # boolean array indicating if at index i if the dataset_i has been fully exhausted\n        is_exhausted = np.full(len(lengths), False)\n\n        # if undersampling (\"first_exhausted\"), we stop as soon as one dataset is exhausted\n        # if oversampling (\"all_exhausted\"), we stop as soons as every dataset is exhausted, i.e as soon as every samples of every dataset has been visited at least once\n        bool_strategy_func = (\n            np.all if (oversampling or stopping_strategy == \"all_exhausted_without_replacement\") else np.any\n        )\n\n        def iter_random_indices():\n            \"\"\"Get an infinite iterator that randomly samples the index of the source to pick examples from.\"\"\"\n            rng = np.random.default_rng(seed)\n            while True:\n                yield from (int(i) for i in rng.choice(len(datasets), size=1000, p=probabilities))\n\n        current_index = [0] * len(datasets)\n        indices = []\n        for source_idx in iter_random_indices():\n            # If no oversampling, we stop as soon as a dataset has ran out of examples (np.any)\n            # Otherwise, we stop as soon as every dataset has ran out of examples (np.all)\n            if bool_strategy_func(is_exhausted):\n                # the stopping condition was reached, let's stop\n                break\n\n            # let's add the example at the current index of the `source_idx`-th dataset\n            # For without replacement sampling we additionally need to make sure the current source is not exhausted to not oversample.\n            if stopping_strategy != \"all_exhausted_without_replacement\" or not 
is_exhausted[source_idx]:\n                indices.append(current_index[source_idx] + offsets[source_idx])\n                current_index[source_idx] += 1\n\n            # we've ran out of examples for the current dataset, let's update our boolean array and bring the current_index back to 0\n            if current_index[source_idx] >= lengths[source_idx]:\n                is_exhausted[source_idx] = True\n                # We don't want to reset the iterator when stopping strategy is without replacement.\n                if stopping_strategy != \"all_exhausted_without_replacement\":\n                    current_index[source_idx] = 0\n\n    return concatenated_datasets.select(indices, **kwargs)\n\n\ndef _split_by_node_map_style_dataset(dataset: Dataset, rank: int, world_size: int) -> Dataset:\n    \"\"\"\n    Split a dataset for the node at rank `rank` in a pool of nodes of size `world_size`.\n    Each node is assigned a chunk of data, e.g. rank 0 is given the first chunk of the dataset.\n    To maximize data loading throughput, chunks are made of contiguous data on disk if possible.\n\n    Args:\n        dataset ([`Dataset`]):\n            The dataset to split by node.\n        rank (`int`):\n            Rank of the current node.\n        world_size (`int`):\n            Total number of nodes.\n\n    Returns:\n        [`Dataset`]: The dataset to be used on the node at rank `rank`.\n    \"\"\"\n    return dataset.shard(num_shards=world_size, index=rank, contiguous=True)\n\n\n# This is outside Dataset.filter as it needs to be picklable for multiprocessing\n\n\ndef get_indices_from_mask_function(\n    function: Callable,\n    batched: bool,\n    with_indices: bool,\n    with_rank: bool,\n    input_columns: Optional[Union[str, list[str]]],\n    indices_mapping: Optional[Table] = None,\n    *args,\n    **fn_kwargs,\n):\n    if batched:\n        # we extract indices and rank from args\n        *inputs, indices, rank = args\n        additional_args = ()\n        if 
with_indices:\n            additional_args += (indices,)\n        if with_rank:\n            additional_args += (rank,)\n        mask = function(*inputs, *additional_args, **fn_kwargs)\n        if isinstance(mask, (pa.Array, pa.ChunkedArray)):\n            mask = mask.to_pylist()\n    else:\n        # we get batched data (to return less data than input) but `function` only accepts one example\n        # therefore we need to call `function` on each example of the batch to get the mask\n        *inputs, indices, rank = args\n        mask = []\n        if input_columns is None:\n            # inputs only contains a batch of examples\n            batch: dict = inputs[0]\n            num_examples = len(batch[next(iter(batch.keys()))])\n            for i in range(num_examples):\n                example = {key: batch[key][i] for key in batch}\n                additional_args = ()\n                if with_indices:\n                    additional_args += (indices[i],)\n                if with_rank:\n                    additional_args += (rank,)\n                mask.append(function(example, *additional_args, **fn_kwargs))\n        else:\n            # inputs is a list of columns\n            columns: list[list] = inputs\n            num_examples = len(columns[0])\n            for i in range(num_examples):\n                input = [column[i] for column in columns]\n                additional_args = ()\n                if with_indices:\n                    additional_args += (indices[i],)\n                if with_rank:\n                    additional_args += (rank,)\n                mask.append(function(*input, *additional_args, **fn_kwargs))\n    indices_array = [i for i, to_keep in zip(indices, mask) if to_keep]\n    if indices_mapping is not None:\n        indices_array = pa.array(indices_array, type=pa.uint64())\n        indices_array = indices_mapping.column(0).take(indices_array)\n        indices_array = indices_array.to_pylist()\n    return {\"indices\": 
indices_array}\n\n\nasync def async_get_indices_from_mask_function(\n    function: Callable,\n    batched: bool,\n    with_indices: bool,\n    with_rank: bool,\n    input_columns: Optional[Union[str, list[str]]],\n    indices_mapping: Optional[Table] = None,\n    *args,\n    **fn_kwargs,\n):\n    \"\"\"same function but async\"\"\"\n    if batched:\n        # we extract indices and rank from args\n        *inputs, indices, rank = args\n        additional_args = ()\n        if with_indices:\n            additional_args += (indices,)\n        if with_rank:\n            additional_args += (rank,)\n        mask = await function(*inputs, *additional_args, **fn_kwargs)\n        if isinstance(mask, (pa.Array, pa.ChunkedArray)):\n            mask = mask.to_pylist()\n    else:\n        # we get batched data (to return less data than input) but `function` only accepts one example\n        # therefore we need to call `function` on each example of the batch to get the mask\n        *inputs, indices, rank = args\n        mask = []\n        if input_columns is None:\n            # inputs only contains a batch of examples\n            batch: dict = inputs[0]\n            num_examples = len(batch[next(iter(batch.keys()))])\n            for i in range(num_examples):\n                example = {key: batch[key][i] for key in batch}\n                additional_args = ()\n                if with_indices:\n                    additional_args += (indices[i],)\n                if with_rank:\n                    additional_args += (rank,)\n                mask.append(await function(example, *additional_args, **fn_kwargs))\n        else:\n            # inputs is a list of columns\n            columns: list[list] = inputs\n            num_examples = len(columns[0])\n            for i in range(num_examples):\n                input = [column[i] for column in columns]\n                additional_args = ()\n                if with_indices:\n                    additional_args += (indices[i],)\n  
              if with_rank:\n                    additional_args += (rank,)\n                mask.append(await function(*input, *additional_args, **fn_kwargs))\n    indices_array = [i for i, to_keep in zip(indices, mask) if to_keep]\n    if indices_mapping is not None:\n        indices_array = pa.array(indices_array, type=pa.uint64())\n        indices_array = indices_mapping.column(0).take(indices_array)\n        indices_array = indices_array.to_pylist()\n    return {\"indices\": indices_array}\n"
  },
  {
    "path": "src/datasets/arrow_reader.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"Arrow ArrowReader.\"\"\"\n\nimport copy\nimport math\nimport os\nimport re\nfrom dataclasses import dataclass\nfrom functools import partial\nfrom typing import TYPE_CHECKING, Optional, Union\n\nimport pyarrow as pa\nimport pyarrow.parquet as pq\nfrom tqdm.contrib.concurrent import thread_map\n\nfrom .download.download_config import DownloadConfig  # noqa: F401\nfrom .naming import _split_re, filenames_for_dataset_split\nfrom .table import InMemoryTable, MemoryMappedTable, Table, concat_tables\nfrom .utils import logging\nfrom .utils import tqdm as hf_tqdm\n\n\nif TYPE_CHECKING:\n    from .info import DatasetInfo  # noqa: F401\n    from .splits import Split, SplitInfo  # noqa: F401\n\n\nlogger = logging.get_logger(__name__)\n\nHF_GCP_BASE_URL = \"https://storage.googleapis.com/huggingface-nlp/cache/datasets\"\n\n_SUB_SPEC_RE = re.compile(\n    rf\"\"\"\n^\n (?P<split>{_split_re[1:-1]})\n (\\[\n    ((?P<from>-?[\\d_]+)\n     (?P<from_pct>%)?)?\n    :\n    ((?P<to>-?[\\d_]+)\n     (?P<to_pct>%)?)?\n \\])?(\\((?P<rounding>[^\\)]*)\\))?\n$\n\"\"\",  # remove ^ and $\n    re.X,\n)\n\n_ADDITION_SEP_RE = re.compile(r\"\\s*\\+\\s*\")\n\n\nclass DatasetNotOnHfGcsError(ConnectionError):\n    \"\"\"When you can't get the dataset from the Hf google cloud storage\"\"\"\n\n    pass\n\n\nclass 
MissingFilesOnHfGcsError(ConnectionError):\n    \"\"\"When some files are missing on the Hf oogle cloud storage\"\"\"\n\n    pass\n\n\n@dataclass(frozen=True)\nclass FileInstructions:\n    \"\"\"The file instructions associated with a split ReadInstruction.\n\n    Attributes:\n        num_examples: `int`, The total number of examples\n        file_instructions: List[dict(filename, skip, take)], the files information.\n            The filenames contains the relative path, not absolute.\n            skip/take indicates which example read in the file: `ds.slice(skip, take)`\n    \"\"\"\n\n    num_examples: int\n    file_instructions: list[dict]\n\n\ndef make_file_instructions(\n    name: str,\n    split_infos: list[\"SplitInfo\"],\n    instruction: Union[str, \"ReadInstruction\"],\n    filetype_suffix: Optional[str] = None,\n    prefix_path: Optional[str] = None,\n) -> FileInstructions:\n    \"\"\"Returns instructions of the split dict.\n\n    Args:\n        name (`str`): Name of the dataset.\n        split_infos (`list` of `[SplitInfo]`): Dataset splits information.\n        instruction ([`ReadInstruction`] or `str`): Reading instruction for a dataset.\n        filetype_suffix (`str`, *optional*): Suffix of dataset files, e.g. 'arrow' or 'parquet'.\n        prefix_path (`str`, *optional*): Prefix of dataset files, e.g. 
directory name.\n\n    Returns:\n        [`FileInstructions`]\n    \"\"\"\n    if not isinstance(name, str):\n        raise TypeError(f\"Expected str 'name', but got: {type(name).__name__}\")\n    elif not name:\n        raise ValueError(\"Expected non-empty str 'name'\")\n    name2len = {info.name: info.num_examples for info in split_infos}\n    name2shard_lengths = {info.name: info.shard_lengths for info in split_infos}\n    name2filenames = {\n        info.name: filenames_for_dataset_split(\n            path=prefix_path,\n            dataset_name=name,\n            split=info.name,\n            filetype_suffix=filetype_suffix,\n            shard_lengths=name2shard_lengths[info.name],\n        )\n        for info in split_infos\n    }\n    if not isinstance(instruction, ReadInstruction):\n        instruction = ReadInstruction.from_spec(instruction)\n    # Create the absolute instruction (per split)\n    absolute_instructions = instruction.to_absolute(name2len)\n\n    # For each split, return the files instruction (skip/take)\n    file_instructions = []\n    num_examples = 0\n    for abs_instr in absolute_instructions:\n        split_length = name2len[abs_instr.splitname]\n        filenames = name2filenames[abs_instr.splitname]\n        shard_lengths = name2shard_lengths[abs_instr.splitname]\n        from_ = 0 if abs_instr.from_ is None else abs_instr.from_\n        to = split_length if abs_instr.to is None else abs_instr.to\n        if shard_lengths is None:  # not sharded\n            for filename in filenames:\n                take = to - from_\n                if take == 0:\n                    continue\n                num_examples += take\n                file_instructions.append({\"filename\": filename, \"skip\": from_, \"take\": take})\n        else:  # sharded\n            index_start = 0  # Beginning (included) of moving window.\n            index_end = 0  # End (excluded) of moving window.\n            for filename, shard_length in zip(filenames, 
shard_lengths):\n                index_end += shard_length\n                if from_ < index_end and to > index_start:  # There is something to take.\n                    skip = from_ - index_start if from_ > index_start else 0\n                    take = to - index_start - skip if to < index_end else -1\n                    if take == 0:\n                        continue\n                    file_instructions.append({\"filename\": filename, \"skip\": skip, \"take\": take})\n                    num_examples += shard_length - skip if take == -1 else take\n                index_start += shard_length\n    return FileInstructions(\n        num_examples=num_examples,\n        file_instructions=file_instructions,\n    )\n\n\nclass BaseReader:\n    \"\"\"\n    Build a Dataset object out of Instruction instance(s).\n    \"\"\"\n\n    def __init__(self, path: str, info: Optional[\"DatasetInfo\"]):\n        \"\"\"Initializes ArrowReader.\n\n        Args:\n            path (str): path where tfrecords are stored.\n            info (DatasetInfo): info about the dataset.\n        \"\"\"\n        self._path: str = path\n        self._info: Optional[\"DatasetInfo\"] = info\n        self._filetype_suffix: Optional[str] = None\n\n    def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:\n        \"\"\"Returns a Dataset instance from given (filename, skip, take).\"\"\"\n        raise NotImplementedError\n\n    def _read_files(self, files, in_memory=False) -> Table:\n        \"\"\"Returns Dataset for given file instructions.\n\n        Args:\n            files: List[dict(filename, skip, take)], the files information.\n                The filenames contain the absolute path, not relative.\n                skip/take indicates which example read in the file: `ds.slice(skip, take)`\n            in_memory (bool, default False): Whether to copy the data in-memory.\n        \"\"\"\n        if len(files) == 0 or not all(isinstance(f, dict) for f in files):\n        
    raise ValueError(\"please provide valid file informations\")\n        files = copy.deepcopy(files)\n        for f in files:\n            f[\"filename\"] = os.path.join(self._path, f[\"filename\"])\n\n        pa_tables = thread_map(\n            partial(self._get_table_from_filename, in_memory=in_memory),\n            files,\n            tqdm_class=hf_tqdm,\n            desc=\"Loading dataset shards\",\n            # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached\n            disable=len(files) <= 16 or None,\n        )\n        pa_tables = [t for t in pa_tables if len(t) > 0]\n        if not pa_tables and (self._info is None or self._info.features is None):\n            raise ValueError(\n                \"Tried to read an empty table. Please specify at least info.features to create an empty table with the right type.\"\n            )\n        pa_tables = pa_tables or [InMemoryTable.from_batches([], schema=pa.schema(self._info.features.type))]\n        pa_table = concat_tables(pa_tables) if len(pa_tables) != 1 else pa_tables[0]\n        return pa_table\n\n    def get_file_instructions(self, name, instruction, split_infos):\n        \"\"\"Return list of dict {'filename': str, 'skip': int, 'take': int}\"\"\"\n        file_instructions = make_file_instructions(\n            name, split_infos, instruction, filetype_suffix=self._filetype_suffix, prefix_path=self._path\n        )\n        files = file_instructions.file_instructions\n        return files\n\n    def read(\n        self,\n        name,\n        instructions,\n        split_infos,\n        in_memory=False,\n    ):\n        \"\"\"Returns Dataset instance(s).\n\n        Args:\n            name (str): name of the dataset.\n            instructions (ReadInstruction): instructions to read.\n                Instruction can be string and will then be passed to the Instruction\n                constructor as it.\n            split_infos (list of SplitInfo 
proto): the available splits for dataset.\n            in_memory (bool, default False): Whether to copy the data in-memory.\n\n        Returns:\n             kwargs to build a single Dataset instance.\n        \"\"\"\n\n        files = self.get_file_instructions(name, instructions, split_infos)\n        if not files:\n            msg = f'Instruction \"{instructions}\" corresponds to no data!'\n            raise ValueError(msg)\n        return self.read_files(files=files, original_instructions=instructions, in_memory=in_memory)\n\n    def read_files(\n        self,\n        files: list[dict],\n        original_instructions: Union[None, \"ReadInstruction\", \"Split\"] = None,\n        in_memory=False,\n    ):\n        \"\"\"Returns single Dataset instance for the set of file instructions.\n\n        Args:\n            files: List[dict(filename, skip, take)], the files information.\n                The filenames contains the relative path, not absolute.\n                skip/take indicates which example read in the file: `ds.skip().take()`\n            original_instructions: store the original instructions used to build the dataset split in the dataset.\n            in_memory (bool, default False): Whether to copy the data in-memory.\n\n        Returns:\n            kwargs to build a Dataset instance.\n        \"\"\"\n        # Prepend path to filename\n        pa_table = self._read_files(files, in_memory=in_memory)\n        # If original_instructions is not None, convert it to a human-readable NamedSplit\n        if original_instructions is not None:\n            from .splits import Split  # noqa\n\n            split = Split(str(original_instructions))\n        else:\n            split = None\n        dataset_kwargs = {\"arrow_table\": pa_table, \"info\": self._info, \"split\": split}\n        return dataset_kwargs\n\n\nclass ArrowReader(BaseReader):\n    \"\"\"\n    Build a Dataset object out of Instruction instance(s).\n    This Reader uses either memory mapping or 
file descriptors (in-memory) on arrow files.\n    \"\"\"\n\n    def __init__(self, path: str, info: Optional[\"DatasetInfo\"]):\n        \"\"\"Initializes ArrowReader.\n\n        Args:\n            path (str): path where Arrow files are stored.\n            info (DatasetInfo): info about the dataset.\n        \"\"\"\n        super().__init__(path, info)\n        self._filetype_suffix = \"arrow\"\n\n    def _get_table_from_filename(self, filename_skip_take, in_memory=False) -> Table:\n        \"\"\"Returns a Dataset instance from given (filename, skip, take).\"\"\"\n        filename, skip, take = (\n            filename_skip_take[\"filename\"],\n            filename_skip_take[\"skip\"] if \"skip\" in filename_skip_take else None,\n            filename_skip_take[\"take\"] if \"take\" in filename_skip_take else None,\n        )\n        table = ArrowReader.read_table(filename, in_memory=in_memory)\n        if take == -1:\n            take = len(table) - skip\n        # here we don't want to slice an empty table, or it may segfault\n        if skip is not None and take is not None and not (skip == 0 and take == len(table)):\n            table = table.slice(skip, take)\n        return table\n\n    @staticmethod\n    def read_table(filename, in_memory=False) -> Table:\n        \"\"\"\n        Read table from file.\n\n        Args:\n            filename (str): File name of the table.\n            in_memory (bool, default=False): Whether to copy the data in-memory.\n\n        Returns:\n            pyarrow.Table\n        \"\"\"\n        table_cls = InMemoryTable if in_memory else MemoryMappedTable\n        return table_cls.from_file(filename)\n\n\nclass ParquetReader(BaseReader):\n    \"\"\"\n    Build a Dataset object out of Instruction instance(s).\n    This Reader uses memory mapping on parquet files.\n    \"\"\"\n\n    def __init__(self, path: str, info: Optional[\"DatasetInfo\"]):\n        \"\"\"Initializes ParquetReader.\n\n        Args:\n            path (str): path 
where tfrecords are stored.\n            info (DatasetInfo): info about the dataset.\n        \"\"\"\n        super().__init__(path, info)\n        self._filetype_suffix = \"parquet\"\n\n    def _get_table_from_filename(self, filename_skip_take, **kwargs):\n        \"\"\"Returns a Dataset instance from given (filename, skip, take).\"\"\"\n        filename, skip, take = (\n            filename_skip_take[\"filename\"],\n            filename_skip_take[\"skip\"] if \"skip\" in filename_skip_take else None,\n            filename_skip_take[\"take\"] if \"take\" in filename_skip_take else None,\n        )\n        # Parquet read_table always loads data in memory, independently of memory_map\n        pa_table = pq.read_table(filename, memory_map=True)\n        # here we don't want to slice an empty table, or it may segfault\n        if skip is not None and take is not None and not (skip == 0 and take == len(pa_table)):\n            pa_table = pa_table.slice(skip, take)\n        return pa_table\n\n\n@dataclass(frozen=True)\nclass _AbsoluteInstruction:\n    \"\"\"A machine friendly slice: defined absolute positive boundaries.\"\"\"\n\n    splitname: str\n    from_: int  # uint (starting index).\n    to: int  # uint (ending index).\n\n\n@dataclass(frozen=True)\nclass _RelativeInstruction:\n    \"\"\"Represents a single parsed slicing instruction, can use % and negatives.\"\"\"\n\n    splitname: str\n    from_: Optional[int] = None  # int (starting index) or None if no lower boundary.\n    to: Optional[int] = None  # int (ending index) or None if no upper boundary.\n    unit: Optional[str] = None\n    rounding: Optional[str] = None\n\n    def __post_init__(self):\n        if self.unit is not None and self.unit not in [\"%\", \"abs\"]:\n            raise ValueError(\"unit must be either % or abs\")\n        if self.rounding is not None and self.rounding not in [\"closest\", \"pct1_dropremainder\"]:\n            raise ValueError(\"rounding must be either closest or 
pct1_dropremainder\")\n        if self.unit != \"%\" and self.rounding is not None:\n            raise ValueError(\"It is forbidden to specify rounding if not using percent slicing.\")\n        if self.unit == \"%\" and self.from_ is not None and abs(self.from_) > 100:\n            raise ValueError(\"Percent slice boundaries must be > -100 and < 100.\")\n        if self.unit == \"%\" and self.to is not None and abs(self.to) > 100:\n            raise ValueError(\"Percent slice boundaries must be > -100 and < 100.\")\n        # Update via __dict__ due to instance being \"frozen\"\n        self.__dict__[\"rounding\"] = \"closest\" if self.rounding is None and self.unit == \"%\" else self.rounding\n\n\ndef _str_to_read_instruction(spec):\n    \"\"\"Returns ReadInstruction for given string.\"\"\"\n    res = _SUB_SPEC_RE.match(spec)\n    if not res:\n        raise ValueError(f\"Unrecognized instruction format: {spec}\")\n    unit = \"%\" if res.group(\"from_pct\") or res.group(\"to_pct\") else \"abs\"\n    return ReadInstruction(\n        split_name=res.group(\"split\"),\n        rounding=res.group(\"rounding\"),\n        from_=int(res.group(\"from\")) if res.group(\"from\") else None,\n        to=int(res.group(\"to\")) if res.group(\"to\") else None,\n        unit=unit,\n    )\n\n\ndef _pct_to_abs_pct1(boundary, num_examples):\n    # Using math.trunc here, since -99.5% should give -99%, not -100%.\n    if num_examples < 100:\n        msg = (\n            'Using \"pct1_dropremainder\" rounding on a split with less than 100 '\n            \"elements is forbidden: it always results in an empty dataset.\"\n        )\n        raise ValueError(msg)\n    return boundary * math.trunc(num_examples / 100.0)\n\n\ndef _pct_to_abs_closest(boundary, num_examples):\n    return int(round(boundary * num_examples / 100.0))\n\n\ndef _rel_to_abs_instr(rel_instr, name2len):\n    \"\"\"Returns _AbsoluteInstruction instance for given RelativeInstruction.\n\n    Args:\n        rel_instr: 
RelativeInstruction instance.\n        name2len: dict {split_name: num_examples}.\n    \"\"\"\n    pct_to_abs = _pct_to_abs_closest if rel_instr.rounding == \"closest\" else _pct_to_abs_pct1\n    split = rel_instr.splitname\n    if split not in name2len:\n        raise ValueError(f'Unknown split \"{split}\". Should be one of {list(name2len)}.')\n    num_examples = name2len[split]\n    from_ = rel_instr.from_\n    to = rel_instr.to\n    if rel_instr.unit == \"%\":\n        from_ = 0 if from_ is None else pct_to_abs(from_, num_examples)\n        to = num_examples if to is None else pct_to_abs(to, num_examples)\n    else:\n        from_ = 0 if from_ is None else from_\n        to = num_examples if to is None else to\n    if from_ < 0:\n        from_ = max(num_examples + from_, 0)\n    if to < 0:\n        to = max(num_examples + to, 0)\n    from_ = min(from_, num_examples)\n    to = min(to, num_examples)\n    return _AbsoluteInstruction(split, from_, to)\n\n\nclass ReadInstruction:\n    \"\"\"Reading instruction for a dataset.\n\n    Examples::\n\n      # The following lines are equivalent:\n      ds = datasets.load_dataset('ylecun/mnist', split='test[:33%]')\n      ds = datasets.load_dataset('ylecun/mnist', split=datasets.ReadInstruction.from_spec('test[:33%]'))\n      ds = datasets.load_dataset('ylecun/mnist', split=datasets.ReadInstruction('test', to=33, unit='%'))\n      ds = datasets.load_dataset('ylecun/mnist', split=datasets.ReadInstruction(\n          'test', from_=0, to=33, unit='%'))\n\n      # The following lines are equivalent:\n      ds = datasets.load_dataset('ylecun/mnist', split='test[:33%]+train[1:-1]')\n      ds = datasets.load_dataset('ylecun/mnist', split=datasets.ReadInstruction.from_spec(\n          'test[:33%]+train[1:-1]'))\n      ds = datasets.load_dataset('ylecun/mnist', split=(\n          datasets.ReadInstruction('test', to=33, unit='%') +\n          datasets.ReadInstruction('train', from_=1, to=-1, unit='abs')))\n\n      # The following 
lines are equivalent:\n      ds = datasets.load_dataset('ylecun/mnist', split='test[:33%](pct1_dropremainder)')\n      ds = datasets.load_dataset('ylecun/mnist', split=datasets.ReadInstruction.from_spec(\n          'test[:33%](pct1_dropremainder)'))\n      ds = datasets.load_dataset('ylecun/mnist', split=datasets.ReadInstruction(\n          'test', from_=0, to=33, unit='%', rounding=\"pct1_dropremainder\"))\n\n      # 10-fold validation:\n      tests = datasets.load_dataset(\n          'ylecun/mnist',\n          [datasets.ReadInstruction('train', from_=k, to=k+10, unit='%')\n          for k in range(0, 100, 10)])\n      trains = datasets.load_dataset(\n          'ylecun/mnist',\n          [datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%')\n          for k in range(0, 100, 10)])\n\n    \"\"\"\n\n    def _init(self, relative_instructions):\n        # Private initializer.\n        self._relative_instructions = relative_instructions\n\n    @classmethod\n    def _read_instruction_from_relative_instructions(cls, relative_instructions):\n        \"\"\"Returns ReadInstruction obj initialized with relative_instructions.\"\"\"\n        # Use __new__ to bypass __init__ used by public API and not conveniant here.\n        result = cls.__new__(cls)\n        result._init(relative_instructions)  # pylint: disable=protected-access\n        return result\n\n    def __init__(self, split_name, rounding=None, from_=None, to=None, unit=None):\n        \"\"\"Initialize ReadInstruction.\n\n        Args:\n            split_name (str): name of the split to read. Eg: 'train'.\n            rounding (str, optional): The rounding behaviour to use when percent slicing is\n                used. Ignored when slicing with absolute indices.\n                Possible values:\n                 - 'closest' (default): The specified percentages are rounded to the\n                     closest value. 
Use this if you want specified percents to be as\n                     much exact as possible.\n                 - 'pct1_dropremainder': the specified percentages are treated as\n                     multiple of 1%. Use this option if you want consistency. Eg:\n                         len(5%) == 5 * len(1%).\n                     Using this option, one might not be able to use the full set of\n                     examples, if the number of those is not a multiple of 100.\n            from_ (int):\n            to (int): alternative way of specifying slicing boundaries. If any of\n                {from_, to, unit} argument is used, slicing cannot be specified as\n                string.\n            unit (str): optional, one of:\n                '%': to set the slicing unit as percents of the split size.\n                'abs': to set the slicing unit as absolute numbers.\n        \"\"\"\n        # This constructor is not always called. See factory method\n        # `_read_instruction_from_relative_instructions`. Common init instructions\n        # MUST be placed in the _init method.\n        self._init([_RelativeInstruction(split_name, from_, to, unit, rounding)])\n\n    @classmethod\n    def from_spec(cls, spec):\n        \"\"\"Creates a `ReadInstruction` instance out of a string spec.\n\n        Args:\n            spec (`str`):\n                Split(s) + optional slice(s) to read + optional rounding\n                if percents are used as the slicing unit. 
A slice can be specified,\n                using absolute numbers (`int`) or percentages (`int`).\n\n        Examples:\n\n            ```\n            test: test split.\n            test + validation: test split + validation split.\n            test[10:]: test split, minus its first 10 records.\n            test[:10%]: first 10% records of test split.\n            test[:20%](pct1_dropremainder): first 10% records, rounded with the pct1_dropremainder rounding.\n            test[:-5%]+train[40%:60%]: first 95% of test + middle 20% of train.\n            ```\n\n        Returns:\n            ReadInstruction instance.\n        \"\"\"\n        spec = str(spec)  # Need to convert to str in case of NamedSplit instance.\n        subs = _ADDITION_SEP_RE.split(spec)\n        if not subs:\n            raise ValueError(f\"No instructions could be built out of {spec}\")\n        instruction = _str_to_read_instruction(subs[0])\n        return sum((_str_to_read_instruction(sub) for sub in subs[1:]), instruction)\n\n    def to_spec(self):\n        rel_instr_specs = []\n        for rel_instr in self._relative_instructions:\n            rel_instr_spec = rel_instr.splitname\n            if rel_instr.from_ is not None or rel_instr.to is not None:\n                from_ = rel_instr.from_\n                to = rel_instr.to\n                unit = rel_instr.unit\n                rounding = rel_instr.rounding\n                unit = unit if unit == \"%\" else \"\"\n                from_ = str(from_) + unit if from_ is not None else \"\"\n                to = str(to) + unit if to is not None else \"\"\n                slice_str = f\"[{from_}:{to}]\"\n                rounding_str = (\n                    f\"({rounding})\" if unit == \"%\" and rounding is not None and rounding != \"closest\" else \"\"\n                )\n                rel_instr_spec += slice_str + rounding_str\n            rel_instr_specs.append(rel_instr_spec)\n        return \"+\".join(rel_instr_specs)\n\n    def 
__add__(self, other):\n        \"\"\"Returns a new ReadInstruction obj, result of appending other to self.\"\"\"\n        if not isinstance(other, ReadInstruction):\n            msg = \"ReadInstruction can only be added to another ReadInstruction obj.\"\n            raise TypeError(msg)\n        self_ris = self._relative_instructions\n        other_ris = other._relative_instructions  # pylint: disable=protected-access\n        if (\n            self_ris[0].unit != \"abs\"\n            and other_ris[0].unit != \"abs\"\n            and self._relative_instructions[0].rounding != other_ris[0].rounding\n        ):\n            raise ValueError(\"It is forbidden to sum ReadInstruction instances with different rounding values.\")\n        return self._read_instruction_from_relative_instructions(self_ris + other_ris)\n\n    def __str__(self):\n        return self.to_spec()\n\n    def __repr__(self):\n        return f\"ReadInstruction({self._relative_instructions})\"\n\n    def to_absolute(self, name2len):\n        \"\"\"Translate instruction into a list of absolute instructions.\n\n        Those absolute instructions are then to be added together.\n\n        Args:\n            name2len (`dict`):\n                Associating split names to number of examples.\n\n        Returns:\n            list of _AbsoluteInstruction instances (corresponds to the + in spec).\n        \"\"\"\n        return [_rel_to_abs_instr(rel_instr, name2len) for rel_instr in self._relative_instructions]\n"
  },
  {
    "path": "src/datasets/arrow_writer.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"To write records into Parquet files.\"\"\"\n\nimport io\nimport json\nimport sys\nfrom collections.abc import Iterable\nfrom typing import Any, Literal, Optional\n\nimport fsspec\nimport numpy as np\nimport pyarrow as pa\nimport pyarrow.json as paj\nimport pyarrow.parquet as pq\nfrom fsspec.core import url_to_fs\n\nfrom . import config\nfrom .features import Audio, Features, Image, Pdf, Value, Video\nfrom .features.features import (\n    FeatureType,\n    List,\n    _ArrayXDExtensionType,\n    _visit,\n    cast_to_python_objects,\n    generate_from_arrow_type,\n    get_nested_type,\n    list_of_np_array_to_pyarrow_listarray,\n    numpy_to_pyarrow_listarray,\n    require_storage_embed,\n    to_pyarrow_listarray,\n)\nfrom .filesystems import is_remote_filesystem\nfrom .info import DatasetInfo\nfrom .table import array_cast, cast_array_to_feature, embed_table_storage, table_cast\nfrom .utils import logging\nfrom .utils.json import (\n    find_mixed_struct_types_field_paths,\n    get_json_field_path_from_pyarrow_json_error,\n    get_json_field_paths_from_feature,\n    insert_json_field_path,\n    json_encode_field,\n    json_encode_fields_in_json_lines,\n    set_json_types_in_feature,\n    ujson_dumps,\n)\nfrom .utils.py_utils import asdict, convert_file_size_to_int, first_non_null_non_empty_value\n\n\nlogger = 
logging.get_logger(__name__)\n\ntype_ = type  # keep python's type function\n\n\ndef get_arrow_writer_batch_size_from_features(features: Optional[Features]) -> Optional[int]:\n    \"\"\"\n    Get the writer_batch_size that defines the maximum record batch size in the arrow files based on configuration values.\n    The default value is 100 for image/audio datasets and 10 for videos.\n    This allows to avoid overflows in arrow buffers.\n\n    Args:\n        features (`datasets.Features` or `None`):\n            Dataset Features from `datasets`.\n    Returns:\n        writer_batch_size (`Optional[int]`):\n            Writer batch size to pass to a dataset builder.\n            If `None`, then it will use the `datasets` default, i.e. `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n    \"\"\"\n    if not features:\n        return None\n\n    batch_size = np.inf\n\n    def set_batch_size(feature: FeatureType) -> None:\n        nonlocal batch_size\n        if isinstance(feature, Image) and config.ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS is not None:\n            batch_size = min(batch_size, config.ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS)\n        elif isinstance(feature, Audio) and config.ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS is not None:\n            batch_size = min(batch_size, config.ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS)\n        elif isinstance(feature, Video) and config.ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS is not None:\n            batch_size = min(batch_size, config.ARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS)\n        elif (\n            isinstance(feature, Value)\n            and feature.dtype == \"binary\"\n            and config.ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS is not None\n        ):\n            batch_size = min(batch_size, config.ARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS)\n\n    _visit(features, set_batch_size)\n\n    return None if batch_size is np.inf else batch_size\n\n\ndef get_writer_batch_size_from_features(features: 
Optional[Features]) -> Optional[int]:\n    \"\"\"\n    Get the writer_batch_size that defines the maximum row group size in the parquet files based on configuration values.\n    By default these are not set, but it can be helpful to hard set those values in some cases.\n    This allows to optimize random access to parquet file, since accessing 1 row requires\n    to read its entire row group.\n\n    Args:\n        features (`datasets.Features` or `None`):\n            Dataset Features from `datasets`.\n    Returns:\n        writer_batch_size (`Optional[int]`):\n            Writer batch size to pass to a parquet writer.\n            If `None`, then it will use the `datasets` default, i.e. aiming for row groups of 100MB.\n    \"\"\"\n    if not features:\n        return None\n\n    batch_size = np.inf\n\n    def set_batch_size(feature: FeatureType) -> None:\n        nonlocal batch_size\n        if isinstance(feature, Image) and config.PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS is not None:\n            batch_size = min(batch_size, config.PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS)\n        elif isinstance(feature, Audio) and config.PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS is not None:\n            batch_size = min(batch_size, config.PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS)\n        elif isinstance(feature, Video) and config.PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS is not None:\n            batch_size = min(batch_size, config.PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS)\n        elif (\n            isinstance(feature, Value)\n            and feature.dtype == \"binary\"\n            and config.PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS is not None\n        ):\n            batch_size = min(batch_size, config.PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS)\n\n    _visit(features, set_batch_size)\n\n    return None if batch_size is np.inf else batch_size\n\n\ndef get_writer_batch_size_from_data_size(num_rows: int, num_bytes: int) -> int:\n    \"\"\"\n    Get the 
writer_batch_size that defines the maximum row group size in the parquet files.\n    The default in `datasets` is aiming for row groups of maximum 100MB uncompressed.\n    This allows to optimize random access to parquet file, since accessing 1 row requires\n    to read its entire row group.\n\n    This can be improved to get optimized size for querying/iterating\n    but at least it matches the dataset viewer expectations on HF.\n\n    Args:\n        num_rows (`int`):\n            Number of rows in the dataset.\n        num_bytes (`int`):\n            Number of bytes in the dataset.\n            For dataset with external files to embed (image, audio, videos), this can also be an\n            estimate from `dataset._estimate_nbytes()`.\n    Returns:\n        writer_batch_size (`Optional[int]`):\n            Writer batch size to pass to a parquet writer.\n    \"\"\"\n    return max(1, num_rows * convert_file_size_to_int(config.MAX_ROW_GROUP_SIZE) // num_bytes) if num_bytes > 0 else 1\n\n\nclass SchemaInferenceError(ValueError):\n    pass\n\n\nclass TypedSequence:\n    \"\"\"\n    This data container generalizes the typing when instantiating pyarrow arrays, tables or batches.\n\n    More specifically it adds several features:\n    - Support extension types like ``datasets.features.Array2DExtensionType``:\n        By default pyarrow arrays don't return extension arrays. 
One has to call\n        ``pa.ExtensionArray.from_storage(type, pa.array(data, type.storage_type))``\n        in order to get an extension array.\n    - Support for ``try_type`` parameter that can be used instead of ``type``:\n        When an array is transformed, we like to keep the same type as before if possible.\n        For example when calling :func:`datasets.Dataset.map`, we don't want to change the type\n        of each column by default.\n    - Better error message when a pyarrow array overflows.\n\n    Example::\n\n        from datasets.features import Array2D, Array2DExtensionType, Value\n        from datasets.arrow_writer import TypedSequence\n        import pyarrow as pa\n\n        arr = pa.array(TypedSequence([1, 2, 3], type=Value(\"int32\")))\n        assert arr.type == pa.int32()\n\n        arr = pa.array(TypedSequence([1, 2, 3], try_type=Value(\"int32\")))\n        assert arr.type == pa.int32()\n\n        arr = pa.array(TypedSequence([\"foo\", \"bar\"], try_type=Value(\"int32\")))\n        assert arr.type == pa.string()\n\n        arr = pa.array(TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), \"int64\")))\n        assert arr.type == Array2DExtensionType((1, 3), \"int64\")\n\n        table = pa.Table.from_pydict({\n            \"image\": TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), \"int64\"))\n        })\n        assert table[\"image\"].type == Array2DExtensionType((1, 3), \"int64\")\n\n    \"\"\"\n\n    def __init__(\n        self,\n        data: Iterable,\n        type: Optional[FeatureType] = None,\n        try_type: Optional[FeatureType] = None,\n        optimized_int_type: Optional[FeatureType] = None,\n        on_mixed_types: Optional[Literal[\"use_json\"]] = None,\n    ):\n        # assert type is None or try_type is None,\n        if type is not None and try_type is not None:\n            raise ValueError(\"You cannot specify both type and try_type\")\n        # set attributes\n        self.data = data\n        self.type = type\n     
   self.try_type = try_type  # is ignored if it doesn't match the data\n        self.optimized_int_type = optimized_int_type\n        self.on_mixed_types = on_mixed_types\n        # when trying a type (is ignored if data is not compatible)\n        self.trying_type = self.try_type is not None\n        self.trying_int_optimization = optimized_int_type is not None and type is None and try_type is None\n        # used to get back the inferred type after __arrow_array__() is called once\n        self._inferred_type = None\n\n    def get_inferred_type(self) -> FeatureType:\n        \"\"\"Return the inferred feature type.\n        This is done by converting the sequence to an Arrow array, and getting the corresponding\n        feature type.\n\n        Since building the Arrow array can be expensive, the value of the inferred type is cached\n        as soon as pa.array is called on the typed sequence.\n\n        Returns:\n            FeatureType: inferred feature type of the sequence.\n        \"\"\"\n        if self._inferred_type is None:\n            pa.array(self)\n        return self._inferred_type\n\n    @staticmethod\n    def _infer_custom_type_and_encode(data: Iterable) -> tuple[Iterable, Optional[FeatureType]]:\n        \"\"\"Implement type inference for custom objects like PIL.Image.Image -> Image type.\n\n        This function is only used for custom python objects that can't be directly passed to build\n        an Arrow array. In such cases is infers the feature type to use, and it encodes the data so\n        that they can be passed to an Arrow array.\n\n        Args:\n            data (Iterable): array of data to infer the type, e.g. 
a list of PIL images.\n\n        Returns:\n            Tuple[Iterable, Optional[FeatureType]]: a tuple with:\n                - the (possibly encoded) array, if the inferred feature type requires encoding\n                - the inferred feature type if the array is made of supported custom objects like\n                    PIL images, else None.\n        \"\"\"\n        if config.PIL_AVAILABLE and \"PIL\" in sys.modules:\n            import PIL.Image\n\n            non_null_idx, non_null_value = first_non_null_non_empty_value(data)\n            if isinstance(non_null_value, PIL.Image.Image):\n                return [Image().encode_example(value) if value is not None else None for value in data], Image()\n            if isinstance(non_null_value, list) and isinstance(non_null_value[0], PIL.Image.Image):\n                return [\n                    [Image().encode_example(x) for x in value] if value is not None else None for value in data\n                ], List(Image())\n        if config.PDFPLUMBER_AVAILABLE and \"pdfplumber\" in sys.modules:\n            import pdfplumber\n\n            non_null_idx, non_null_value = first_non_null_non_empty_value(data)\n            if isinstance(non_null_value, pdfplumber.pdf.PDF):\n                return [Pdf().encode_example(value) if value is not None else None for value in data], Pdf()\n            if isinstance(non_null_value, list) and isinstance(non_null_value[0], pdfplumber.pdf.PDF):\n                return [\n                    [Pdf().encode_example(x) for x in value] if value is not None else None for value in data\n                ], List(Pdf())\n        return data, None\n\n    def __arrow_array__(self, type: Optional[pa.DataType] = None):\n        out = self._arrow_array(type=type)\n        if self._inferred_type is None:\n            self._inferred_type = generate_from_arrow_type(out.type)\n        return out\n\n    def _arrow_array(self, type: Optional[pa.DataType] = None):\n        \"\"\"This function is 
called when calling pa.array(typed_sequence)\"\"\"\n\n        if type is not None:\n            raise ValueError(\"TypedSequence is supposed to be used with pa.array(typed_sequence, type=None)\")\n        del type  # make sure we don't use it\n        data = self.data\n        # automatic type inference for custom objects\n        if self.type is None and self.try_type is None:\n            data, self._inferred_type = self._infer_custom_type_and_encode(data)\n        if self._inferred_type is None:\n            type = self.try_type if self.trying_type else self.type\n        else:\n            type = self._inferred_type\n        pa_type = get_nested_type(type) if type is not None else None\n        optimized_int_pa_type = (\n            get_nested_type(self.optimized_int_type) if self.optimized_int_type is not None else None\n        )\n        trying_cast_to_python_objects = False\n        json_field_paths = []\n        try:\n            # custom pyarrow types\n            if isinstance(pa_type, _ArrayXDExtensionType):\n                storage = to_pyarrow_listarray(data, pa_type)\n                return pa.ExtensionArray.from_storage(pa_type, storage)\n\n            # efficient np array to pyarrow array\n            if isinstance(data, np.ndarray):\n                out = numpy_to_pyarrow_listarray(data)\n            elif isinstance(data, list) and data and isinstance(first_non_null_non_empty_value(data)[1], np.ndarray):\n                out = list_of_np_array_to_pyarrow_listarray(data)\n            else:\n                trying_cast_to_python_objects = True\n                examples = data\n                # find fields to json-encode\n                if self.on_mixed_types == \"use_json\" and type is None:\n                    json_field_paths = find_mixed_struct_types_field_paths(examples, allow_root=True)\n                elif type is not None:\n                    json_field_paths = get_json_field_paths_from_feature(type)\n                # json encode if 
needed\n                if json_field_paths:\n                    for json_field_path in json_field_paths:\n                        examples = [json_encode_field(examples, json_field_path) for examples in examples]\n                # to arrow array\n                out = pa.array(cast_to_python_objects(examples, only_1d_for_numpy=True))\n                # cast to json type if needed\n                if json_field_paths:\n                    pa_table = pa.Table.from_arrays([out], names=[\"obj\"])\n                    features = Features.from_arrow_schema(pa_table.schema)\n                    feature = set_json_types_in_feature(features[\"obj\"], json_field_paths)\n                    pa_table = table_cast(pa_table, Features({\"obj\": feature}).arrow_schema)\n                    out = pa_table[0]  # get the \"obj\" column\n            # use smaller integer precisions if possible\n            if self.trying_int_optimization:\n                if pa.types.is_int64(out.type):\n                    out = out.cast(optimized_int_pa_type)\n                elif pa.types.is_list(out.type):\n                    if pa.types.is_int64(out.type.value_type):\n                        out = array_cast(out, pa.list_(optimized_int_pa_type))\n                    elif pa.types.is_list(out.type.value_type) and pa.types.is_int64(out.type.value_type.value_type):\n                        out = array_cast(out, pa.list_(pa.list_(optimized_int_pa_type)))\n            # otherwise we can finally use the user's type\n            elif type is not None:\n                # We use cast_array_to_feature to support casting to custom types like Audio and Image\n                # Also, when trying type \"string\", we don't want to convert integers or floats to \"string\".\n                # We only do it if trying_type is False - since this is what the user asks for.\n                out = cast_array_to_feature(\n                    out, type, allow_primitive_to_str=not self.trying_type, 
allow_decimal_to_str=not self.trying_type\n                )\n            return out\n        except (\n            TypeError,\n            pa.lib.ArrowTypeError,\n            pa.lib.ArrowInvalid,\n            pa.lib.ArrowNotImplementedError,\n        ) as e:  # handle type errors and overflows\n            # Ignore ArrowNotImplementedError caused by trying type, otherwise re-raise\n            if not self.trying_type and isinstance(e, pa.lib.ArrowNotImplementedError):\n                raise\n\n            if self.trying_type:\n                try:  # second chance\n                    if isinstance(data, np.ndarray):\n                        return numpy_to_pyarrow_listarray(data)\n                    elif isinstance(data, list) and data and any(isinstance(value, np.ndarray) for value in data):\n                        return list_of_np_array_to_pyarrow_listarray(data)\n                    else:\n                        trying_cast_to_python_objects = True\n                        return pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))\n                except pa.lib.ArrowInvalid as e:\n                    if \"overflow\" in str(e):\n                        raise OverflowError(\n                            f\"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\\n({e})\"\n                        ) from None\n                    elif self.trying_int_optimization and \"not in range\" in str(e):\n                        optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name\n                        logger.info(\n                            f\"Failed to cast a sequence to {optimized_int_pa_type_str}. 
Falling back to int64.\"\n                        )\n                        return out\n                    elif trying_cast_to_python_objects and \"Could not convert\" in str(e):\n                        out = pa.array(\n                            cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False)\n                        )\n                        if type is not None:\n                            out = cast_array_to_feature(\n                                out, type, allow_primitive_to_str=True, allow_decimal_to_str=True\n                            )\n                        return out\n                    else:\n                        raise\n            elif \"overflow\" in str(e):\n                raise OverflowError(\n                    f\"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\\n({e})\"\n                ) from None\n            elif self.trying_int_optimization and \"not in range\" in str(e):\n                optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name\n                logger.info(f\"Failed to cast a sequence to {optimized_int_pa_type_str}. 
Falling back to int64.\")\n                return out\n            elif trying_cast_to_python_objects and (\n                \"Could not convert\" in str(e) or \"cannot mix struct and non-struct\" in str(e) or \"Expected \" in str(e)\n            ):\n                try:  # third chance\n                    out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False))\n                except (pa.ArrowInvalid, pa.ArrowTypeError) as ee:\n                    # in case of mixed types, we use the JSON Lines reader in pyarrow to locate them and set them to json fields\n                    if self.on_mixed_types == \"use_json\" and (\n                        \"Could not convert \" in str(ee)\n                        or \"cannot mix struct and non-struct\" in str(ee)\n                        or \"Expected \" in str(ee)\n                    ):\n                        # we use \"obj\" to have valid JSON Lines since data may contain lists\n                        original_batch = \"\\n\".join([ujson_dumps({\"obj\": example}) for example in data]).encode()\n                        json_field_paths = [[\"obj\"] + json_field_path for json_field_path in json_field_paths]\n                        batch = json_encode_fields_in_json_lines(original_batch, json_field_paths)\n                        pa_table = None\n                        while True:\n                            try:  # fourth chance\n                                pa_table = paj.read_json(\n                                    io.BytesIO(batch), read_options=paj.ReadOptions(use_threads=False)\n                                )\n                                break\n                            except pa.ArrowInvalid as eee:\n                                if \"JSON parse error: Column(\" in str(eee) and \") changed from\" in str(eee):\n                                    json_field_path = get_json_field_path_from_pyarrow_json_error(str(eee))\n                                    
insert_json_field_path(json_field_paths, json_field_path)\n                                    batch = json_encode_fields_in_json_lines(original_batch, json_field_paths)\n                                else:\n                                    break\n                        if pa_table is not None:\n                            features = Features.from_arrow_schema(pa_table.schema)\n                            features = set_json_types_in_feature(features, json_field_paths)\n                            pa_table = table_cast(pa_table, features.arrow_schema)\n                            out = pa_table[0]  # get the \"obj\" column\n                            return out\n                        else:\n                            raise\n                    else:\n                        raise\n                if type is not None:\n                    out = cast_array_to_feature(out, type, allow_primitive_to_str=True, allow_decimal_to_str=True)\n                return out\n            else:\n                raise\n\n\nclass OptimizedTypedSequence(TypedSequence):\n    def __init__(\n        self,\n        data,\n        type: Optional[FeatureType] = None,\n        try_type: Optional[FeatureType] = None,\n        col: Optional[str] = None,\n        optimized_int_type: Optional[FeatureType] = None,\n        on_mixed_types: Optional[Literal[\"use_json\"]] = None,\n    ):\n        optimized_int_type_by_col = {\n            \"attention_mask\": Value(\"int8\"),  # binary tensor\n            \"special_tokens_mask\": Value(\"int8\"),\n            \"input_ids\": Value(\"int32\"),  # typical vocab size: 0-50k (max ~500k, never > 1M)\n            \"token_type_ids\": Value(\n                \"int8\"\n            ),  # binary mask; some (XLNetModel) use an additional token represented by a 2\n        }\n        if type is None and try_type is None:\n            optimized_int_type = optimized_int_type_by_col.get(col, None)\n        super().__init__(\n            data, type=type, 
try_type=try_type, optimized_int_type=optimized_int_type, on_mixed_types=on_mixed_types\n        )\n\n\nclass ArrowWriter:\n    \"\"\"Shuffles and writes Examples to Arrow files.\"\"\"\n\n    def __init__(\n        self,\n        schema: Optional[pa.Schema] = None,\n        features: Optional[Features] = None,\n        path: Optional[str] = None,\n        stream: Optional[pa.NativeFile] = None,\n        fingerprint: Optional[str] = None,\n        writer_batch_size: Optional[int] = None,\n        disable_nullable: bool = False,\n        update_features: bool = False,\n        on_mixed_types: Optional[Literal[\"use_json\"]] = \"use_json\",\n        with_metadata: bool = True,\n        unit: str = \"examples\",\n        embed_local_files: bool = False,\n        storage_options: Optional[dict] = None,\n    ):\n        if path is None and stream is None:\n            raise ValueError(\"At least one of path and stream must be provided.\")\n        if features is not None:\n            self._features = features\n            self._schema = None\n        elif schema is not None:\n            self._schema: pa.Schema = schema\n            self._features = Features.from_arrow_schema(self._schema)\n        else:\n            self._features = None\n            self._schema = None\n\n        self._disable_nullable = disable_nullable\n\n        if stream is None:\n            fs, path = url_to_fs(path, **(storage_options or {}))\n            self._fs: fsspec.AbstractFileSystem = fs\n            self._path = path if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(path)\n            self.stream = self._fs.open(path, \"wb\")\n            self._closable_stream = True\n        else:\n            self._fs = None\n            self._path = None\n            self.stream = stream\n            self._closable_stream = False\n\n        self.fingerprint = fingerprint\n        self.disable_nullable = disable_nullable\n        self.writer_batch_size = (\n            
writer_batch_size\n            or get_arrow_writer_batch_size_from_features(self._features)\n            or config.DEFAULT_MAX_BATCH_SIZE\n        )\n        self.update_features = update_features\n        self.on_mixed_types = on_mixed_types\n        self.with_metadata = with_metadata\n        self.unit = unit\n        self.embed_local_files = embed_local_files\n\n        self._num_examples = 0\n        self._num_bytes = 0\n        self.current_examples: list[tuple[dict[str, Any], str]] = []\n        self.current_rows: list[pa.Table] = []\n        self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None\n        self.hkey_record = []\n\n    def __len__(self):\n        \"\"\"Return the number of written and staged examples\"\"\"\n        return self._num_examples + len(self.current_examples) + len(self.current_rows)\n\n    def __enter__(self):\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        self.close()\n\n    def close(self):\n        # Try closing if opened; if closed: pyarrow.lib.ArrowInvalid: Invalid operation on closed file\n        if self.pa_writer:  # it might be None\n            try:\n                self.pa_writer.close()\n            except Exception:  # pyarrow.lib.ArrowInvalid, OSError\n                pass\n        if self._closable_stream and not self.stream.closed:\n            self.stream.close()  # This also closes self.pa_writer if it is opened\n\n    def _build_schema(self, inferred_schema: pa.Schema):\n        schema = self.schema\n        features = self._features\n        inferred_features = Features.from_arrow_schema(inferred_schema)\n        if self._features is not None:\n            if self.update_features:  # keep original features if they match, or update them\n                fields = {field.name: field for field in self._features.type}\n                for inferred_field in inferred_features.type:\n                    name = inferred_field.name\n                    if name in fields:\n            
            if inferred_field == fields[name]:\n                            inferred_features[name] = self._features[name]\n                features = inferred_features\n                schema: pa.Schema = inferred_schema\n        else:\n            features = inferred_features\n            schema: pa.Schema = inferred_features.arrow_schema\n\n        if self.disable_nullable:\n            schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in schema)\n        if self.with_metadata:\n            schema = schema.with_metadata(self._build_metadata(DatasetInfo(features=features), self.fingerprint))\n        else:\n            schema = schema.with_metadata({})\n\n        return schema, features\n\n    def _build_writer(self, inferred_schema: pa.Schema):\n        self._schema, self._features = self._build_schema(inferred_schema)\n        self.pa_writer = pa.RecordBatchStreamWriter(self.stream, self._schema)\n\n    @property\n    def schema(self):\n        _schema = (\n            self._schema\n            if self._schema is not None\n            else (pa.schema(self._features.type) if self._features is not None else None)\n        )\n        if self._disable_nullable and _schema is not None:\n            _schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in _schema)\n        return _schema if _schema is not None else []\n\n    @staticmethod\n    def _build_metadata(info: DatasetInfo, fingerprint: Optional[str] = None) -> dict[str, str]:\n        info_keys = [\"features\"]  # we can add support for more DatasetInfo keys in the future\n        info_as_dict = asdict(info)\n        metadata = {}\n        metadata[\"info\"] = {key: info_as_dict[key] for key in info_keys}\n        if fingerprint is not None:\n            metadata[\"fingerprint\"] = fingerprint\n        return {\"huggingface\": json.dumps(metadata)}\n\n    def write_examples_on_file(self):\n        \"\"\"Write stored examples from the write-pool of 
examples. It makes a table out of the examples and writes it.\"\"\"\n        if not self.current_examples:\n            return\n        # preserve the order of the columns\n        if self.schema:\n            schema_cols = set(self.schema.names)\n            examples_cols = self.current_examples[0][0].keys()  # .keys() preserves the order (unlike set)\n            common_cols = [col for col in self.schema.names if col in examples_cols]\n            extra_cols = [col for col in examples_cols if col not in schema_cols]\n            cols = common_cols + extra_cols\n        else:\n            cols = list(self.current_examples[0][0])\n        batch_examples = {}\n        for col in cols:\n            # We use row[0][col] since current_examples contains (example, key) tuples.\n            # Moreover, examples could be Arrow arrays of 1 element.\n            # This can happen in `.map()` when we want to re-write the same Arrow data\n            if all(isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) for row in self.current_examples):\n                arrays = [row[0][col] for row in self.current_examples]\n                arrays = [\n                    chunk\n                    for array in arrays\n                    for chunk in (array.chunks if isinstance(array, pa.ChunkedArray) else [array])\n                ]\n                batch_examples[col] = pa.concat_arrays(arrays)\n            else:\n                batch_examples[col] = [\n                    row[0][col].to_pylist()[0] if isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) else row[0][col]\n                    for row in self.current_examples\n                ]\n        self.write_batch(batch_examples=batch_examples)\n        self.current_examples = []\n\n    def write_rows_on_file(self):\n        \"\"\"Write stored rows from the write-pool of rows. 
It concatenates the single-row tables and it writes the resulting table.\"\"\"\n        if not self.current_rows:\n            return\n        table = pa.concat_tables(self.current_rows)\n        self.write_table(table)\n        self.current_rows = []\n\n    def write(\n        self,\n        example: dict[str, Any],\n        writer_batch_size: Optional[int] = None,\n    ):\n        \"\"\"Add a given (Example,Key) pair to the write-pool of examples which is written to file.\n\n        Args:\n            example: the Example to add.\n        \"\"\"\n        # Store example as a tuple so as to keep the structure of `self.current_examples` uniform\n        self.current_examples.append((example, \"\"))\n\n        if writer_batch_size is None:\n            writer_batch_size = self.writer_batch_size\n        if writer_batch_size is not None and len(self.current_examples) >= writer_batch_size:\n            self.write_examples_on_file()\n\n    def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None):\n        \"\"\"Add a given single-row Table to the write-pool of rows which is written to file.\n\n        Args:\n            row: the row to add.\n        \"\"\"\n        if len(row) != 1:\n            raise ValueError(f\"Only single-row pyarrow tables are allowed but got table with {len(row)} rows.\")\n        self.current_rows.append(row)\n        if writer_batch_size is None:\n            writer_batch_size = self.writer_batch_size\n        if writer_batch_size is not None and len(self.current_rows) >= writer_batch_size:\n            self.write_rows_on_file()\n\n    def write_batch(\n        self,\n        batch_examples: dict[str, list],\n        writer_batch_size: Optional[int] = None,\n        try_original_type: Optional[bool] = True,\n    ):\n        \"\"\"Write a batch of Example to file.\n        Ignores the batch if it appears to be empty,\n        preventing a potential schema update of unknown types.\n\n        Args:\n            batch_examples: 
the batch of examples to add.\n            try_original_type: use `try_type` when instantiating OptimizedTypedSequence if `True`, otherwise `try_type = None`.\n        \"\"\"\n        if batch_examples and len(next(iter(batch_examples.values()))) == 0:\n            return\n        features = None if self.pa_writer is None and self.update_features else self._features\n        try_features = self._features if self.pa_writer is None and self.update_features else None\n        arrays = []\n        inferred_features = Features()\n        # preserve the order the columns\n        if self.schema:\n            schema_cols = set(self.schema.names)\n            batch_cols = batch_examples.keys()  # .keys() preserves the order (unlike set)\n            common_cols = [col for col in self.schema.names if col in batch_cols]\n            extra_cols = [col for col in batch_cols if col not in schema_cols]\n            cols = common_cols + extra_cols\n        else:\n            cols = list(batch_examples)\n        for col in cols:\n            col_values = batch_examples[col]\n            col_type = features[col] if features else None\n            if isinstance(col_values, (pa.Array, pa.ChunkedArray)):\n                array = cast_array_to_feature(col_values, col_type) if col_type is not None else col_values\n                arrays.append(array)\n                inferred_features[col] = generate_from_arrow_type(col_values.type)\n            else:\n                col_try_type = (\n                    try_features[col]\n                    if try_features is not None and col in try_features and try_original_type\n                    else None\n                )\n                typed_sequence = OptimizedTypedSequence(\n                    col_values, type=col_type, try_type=col_try_type, col=col, on_mixed_types=self.on_mixed_types\n                )\n                arrays.append(pa.array(typed_sequence))\n                inferred_features[col] = typed_sequence.get_inferred_type()\n 
       schema = inferred_features.arrow_schema if self.pa_writer is None else self.schema\n        pa_table = pa.Table.from_arrays(arrays, schema=schema)\n        self.write_table(pa_table, writer_batch_size)\n\n    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):\n        \"\"\"Write a Table to file.\n\n        Args:\n            example: the Table to add.\n        \"\"\"\n        if writer_batch_size is None:\n            writer_batch_size = self.writer_batch_size\n        if self.pa_writer is None:\n            self._build_writer(inferred_schema=pa_table.schema)\n        pa_table = pa_table.combine_chunks()\n        pa_table = table_cast(pa_table, self._schema)\n        if self.embed_local_files:\n            pa_table = embed_table_storage(pa_table)\n        self._num_bytes += pa_table.nbytes\n        self._num_examples += pa_table.num_rows\n        self.pa_writer.write_table(pa_table, writer_batch_size)\n\n    def finalize(self, close_stream=True):\n        self.write_rows_on_file()\n        # In case current_examples < writer_batch_size, but user uses finalize()\n        self.write_examples_on_file()\n        # If schema is known, infer features even if no examples were written\n        if self.pa_writer is None and self.schema:\n            self._build_writer(self.schema)\n        if self.pa_writer is not None:\n            self.pa_writer.close()\n            self.pa_writer = None\n            if close_stream:\n                self.stream.close()\n        else:\n            if close_stream:\n                self.stream.close()\n            raise SchemaInferenceError(\"Please pass `features` or at least one example when writing data\")\n        logger.debug(\n            f\"Done writing {self._num_examples} {self.unit} in {self._num_bytes} bytes {self._path if self._path else ''}.\"\n        )\n        return self._num_examples, self._num_bytes\n\n\nclass ParquetWriter(ArrowWriter):\n    def __init__(self, *args, 
use_content_defined_chunking=True, write_page_index=True, **kwargs):\n        super().__init__(*args, **kwargs)\n        if use_content_defined_chunking is True:\n            use_content_defined_chunking = config.DEFAULT_CDC_OPTIONS\n        self.use_content_defined_chunking = use_content_defined_chunking\n        self.write_page_index = write_page_index\n\n    def _build_writer(self, inferred_schema: pa.Schema):\n        self._schema, self._features = self._build_schema(inferred_schema)\n        self.pa_writer = pq.ParquetWriter(\n            self.stream,\n            self._schema,\n            use_content_defined_chunking=self.use_content_defined_chunking,\n            write_page_index=self.write_page_index,\n            compression={\n                col: \"none\" if require_storage_embed(feature) else \"snappy\" for col, feature in self._features.items()\n            },\n            use_dictionary=[col for col, feature in self._features.items() if not require_storage_embed(feature)],\n            column_encoding={\n                col: \"PLAIN\" for col, feature in self._features.items() if require_storage_embed(feature)\n            },\n        )\n        if self.use_content_defined_chunking is not False:\n            self.pa_writer.add_key_value_metadata(\n                {\"content_defined_chunking\": json.dumps(self.use_content_defined_chunking)}\n            )\n"
  },
  {
    "path": "src/datasets/builder.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"DatasetBuilder base class.\"\"\"\n\nimport abc\nimport contextlib\nimport copy\nimport inspect\nimport os\nimport posixpath\nimport shutil\nimport time\nimport urllib\nfrom collections.abc import Iterator, Mapping\nfrom dataclasses import dataclass\nfrom functools import partial\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, Optional, Union\nfrom unittest.mock import patch\n\nimport fsspec\nimport pyarrow as pa\nfrom fsspec.core import url_to_fs\nfrom multiprocess import Pool\nfrom tqdm.contrib.concurrent import thread_map\n\nfrom . 
import config, utils\nfrom .arrow_dataset import Dataset\nfrom .arrow_reader import (\n    ArrowReader,\n    ReadInstruction,\n)\nfrom .arrow_writer import ArrowWriter, ParquetWriter, SchemaInferenceError\nfrom .data_files import DataFilesDict, DataFilesPatternsDict, sanitize_patterns\nfrom .dataset_dict import DatasetDict, IterableDatasetDict\nfrom .download.download_config import DownloadConfig\nfrom .download.download_manager import DownloadManager, DownloadMode\nfrom .download.streaming_download_manager import StreamingDownloadManager, xjoin\nfrom .exceptions import DatasetGenerationCastError, DatasetGenerationError, FileFormatError\nfrom .features import Features\nfrom .filesystems import (\n    is_remote_filesystem,\n    rename,\n)\nfrom .fingerprint import Hasher\nfrom .info import DatasetInfo, PostProcessedInfo\nfrom .iterable_dataset import ArrowExamplesIterable, ExamplesIterable, IterableDataset\nfrom .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH, camelcase_to_snakecase\nfrom .splits import Split, SplitDict, SplitGenerator, SplitInfo\nfrom .streaming import extend_dataset_builder_for_streaming\nfrom .table import CastError\nfrom .utils import logging\nfrom .utils import tqdm as hf_tqdm\nfrom .utils._filelock import FileLock\nfrom .utils.file_utils import is_remote_url\nfrom .utils.info_utils import VerificationMode, get_size_checksum_dict, verify_checksums, verify_splits\nfrom .utils.py_utils import (\n    classproperty,\n    convert_file_size_to_int,\n    has_sufficient_disk_space,\n    iflatmap_unordered,\n    map_nested,\n    memoize,\n    size_str,\n    temporary_assignment,\n)\nfrom .utils.sharding import _number_of_shards_in_gen_kwargs, _split_gen_kwargs\nfrom .utils.track import tracked_list\n\n\nif TYPE_CHECKING:\n    from .load import DatasetModule\n\n\nlogger = logging.get_logger(__name__)\n\n\nclass InvalidConfigName(ValueError):\n    pass\n\n\n@dataclass\nclass BuilderConfig:\n    \"\"\"Base class for `DatasetBuilder` data 
configuration.\n\n    `DatasetBuilder` subclasses with data configuration options should subclass\n    `BuilderConfig` and add their own properties.\n\n    Attributes:\n        name (`str`, defaults to `default`):\n            The name of the configuration.\n        version (`Version` or `str`, defaults to `0.0.0`):\n            The version of the configuration.\n        data_dir (`str`, *optional*):\n            Path to the directory containing the source data.\n        data_files (`str` or `Sequence` or `Mapping`, *optional*):\n            Path(s) to source data file(s).\n        description (`str`, *optional*):\n            A human description of the configuration.\n    \"\"\"\n\n    name: str = \"default\"\n    version: Optional[Union[utils.Version, str]] = utils.Version(\"0.0.0\")\n    data_dir: Optional[str] = None\n    data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None\n    description: Optional[str] = None\n\n    def __post_init__(self):\n        # The config name is used to name the cache directory.\n        for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH:\n            if invalid_char in self.name:\n                raise InvalidConfigName(\n                    f\"Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{self.name}'. 
\"\n                    f\"They could create issues when creating a directory for this config on Windows filesystem.\"\n                )\n        if self.data_files is not None and not isinstance(self.data_files, (DataFilesDict, DataFilesPatternsDict)):\n            raise ValueError(f\"Expected a DataFilesDict in data_files but got {self.data_files}\")\n\n    def __eq__(self, o):\n        # we need to override the default dataclass __eq__ since it doesn't check for\n        # other attributes that the ones of the signature.\n        if set(self.__dict__.keys()) != set(o.__dict__.keys()):\n            return False\n        return all((k, getattr(self, k)) == (k, getattr(o, k)) for k in self.__dict__.keys())\n\n    def create_config_id(\n        self,\n        config_kwargs: dict,\n        custom_features: Optional[Features] = None,\n    ) -> str:\n        \"\"\"\n        The config id is used to build the cache directory.\n        By default it is equal to the config name.\n        However the name of a config is not sufficient to have a unique identifier for the dataset being generated\n        since it doesn't take into account:\n        - the config kwargs that can be used to overwrite attributes\n        - the custom features used to write the dataset\n        - the data_files for json/text/csv/pandas datasets\n\n        Therefore the config id is just the config name with an optional suffix based on these.\n        \"\"\"\n        # Possibly add a suffix to the name to handle custom features/data_files/config_kwargs\n        suffix: Optional[str] = None\n        config_kwargs_to_add_to_suffix = config_kwargs.copy()\n        # name and version are already used to build the cache directory\n        config_kwargs_to_add_to_suffix.pop(\"name\", None)\n        config_kwargs_to_add_to_suffix.pop(\"version\", None)\n        # data dir handling (when specified it points to the manually downloaded data):\n        # it was previously ignored before the introduction of 
config id because we didn't want\n        # to change the config name. Now it's fine to take it into account for the config id.\n        # config_kwargs_to_add_to_suffix.pop(\"data_dir\", None)\n        if \"data_dir\" in config_kwargs_to_add_to_suffix:\n            if config_kwargs_to_add_to_suffix[\"data_dir\"] is None:\n                config_kwargs_to_add_to_suffix.pop(\"data_dir\", None)\n            else:\n                # canonicalize the data dir to avoid two paths to the same location having different\n                # hashes\n                data_dir = config_kwargs_to_add_to_suffix[\"data_dir\"]\n                data_dir = os.path.normpath(data_dir)\n                config_kwargs_to_add_to_suffix[\"data_dir\"] = data_dir\n        if config_kwargs_to_add_to_suffix:\n            # we don't care about the order of the kwargs\n            config_kwargs_to_add_to_suffix = {\n                k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)\n            }\n            if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):\n                suffix = \",\".join(\n                    str(k) + \"=\" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()\n                )\n                if len(suffix) > 32:  # hash if too long\n                    suffix = Hasher.hash(config_kwargs_to_add_to_suffix)\n            else:\n                suffix = Hasher.hash(config_kwargs_to_add_to_suffix)\n\n        if custom_features is not None:\n            m = Hasher()\n            if suffix:\n                m.update(suffix)\n            m.update(custom_features)\n            suffix = m.hexdigest()\n\n        if suffix:\n            config_id = self.name + \"-\" + suffix\n            if len(config_id) > config.MAX_DATASET_CONFIG_ID_READABLE_LENGTH:\n                config_id = self.name + \"-\" + Hasher.hash(suffix)\n            return config_id\n        else:\n         
   return self.name\n\n    def _resolve_data_files(self, base_path: str, download_config: DownloadConfig) -> None:\n        if isinstance(self.data_files, DataFilesPatternsDict):\n            base_path = xjoin(base_path, self.data_dir) if self.data_dir else base_path\n            self.data_files = self.data_files.resolve(base_path, download_config)\n\n\nclass DatasetBuilder:\n    \"\"\"Abstract base class for all datasets.\n\n    `DatasetBuilder` has 3 key methods:\n\n        - [`DatasetBuilder.info`]: Documents the dataset, including feature\n          names, types, shapes, version, splits, citation, etc.\n        - [`DatasetBuilder.download_and_prepare`]: Downloads the source data\n          and writes it to disk.\n        - [`DatasetBuilder.as_dataset`]: Generates a [`Dataset`].\n\n    Some `DatasetBuilder`s expose multiple variants of the\n    dataset by defining a [`BuilderConfig`] subclass and accepting a\n    config object (or name) on construction. Configurable datasets expose a\n    pre-defined set of configurations in [`DatasetBuilder.builder_configs`].\n\n    Args:\n        cache_dir (`str`, *optional*):\n            Directory to cache data. Defaults to `\"~/.cache/huggingface/datasets\"`.\n        dataset_name (`str`, *optional*):\n            Name of the dataset, if different from the builder name. Useful for packaged builders\n            like csv, imagefolder, audiofolder, etc. to reflect the difference between datasets\n            that use the same packaged builder.\n        config_name (`str`, *optional*):\n            Name of the dataset configuration.\n            It affects the data generated on disk. 
Different configurations will have their own subdirectories and\n            versions.\n            If not provided, the default configuration is used (if it exists).\n\n            <Added version=\"2.3.0\">\n\n            Parameter `name` was renamed to `config_name`.\n\n            </Added>\n        hash (`str`, *optional*):\n            Hash specific to the dataset builder code. Used to update the caching directory when the\n            dataset builder code is updated (to avoid reusing old data).\n            The typical caching directory (defined in `self._relative_data_dir`) is `name/version/hash/`.\n        base_path (`str`, *optional*):\n            Base path for relative paths that are used to download files.\n            This can be a remote URL.\n        features ([`Features`], *optional*):\n            Features types to use with this dataset.\n            It can be used to change the [`Features`] types of a dataset, for example.\n        token (`str` or `bool`, *optional*):\n            String or boolean to use as Bearer token for remote files on the\n            Datasets Hub. If `True`, will get token from `\"~/.huggingface\"`.\n        repo_id (`str`, *optional*):\n            ID of the dataset repository.\n            Used to distinguish builders with the same name but not coming from the same namespace, for example \"rajpurkar/squad\"\n            and \"lhoestq/squad\" repo IDs. In the latter, the builder name would be \"lhoestq___squad\".\n        data_files (`str` or `Sequence` or `Mapping`, *optional*):\n            Path(s) to source data file(s).\n            For builders like \"csv\" or \"json\" that need the user to specify data files. They can be either\n            local or remote files. 
For convenience, you can use a `DataFilesDict`.\n        data_dir (`str`, *optional*):\n            Path to directory containing source data file(s).\n            Use only if `data_files` is not passed, in which case it is equivalent to passing\n            `os.path.join(data_dir, \"**\")` as `data_files`.\n            For builders that require manual download, it must be the path to the local directory containing the\n            manually downloaded data.\n        storage_options (`dict`, *optional*):\n            Key/value pairs to be passed on to the dataset file-system backend, if any.\n        writer_batch_size (`int`, *optional*):\n            Batch size used by the ArrowWriter.\n            It defines the number of samples that are kept in memory before writing them\n            and also the length of the arrow chunks.\n            None means that the ArrowWriter will use its default value.\n        **config_kwargs (additional keyword arguments): Keyword arguments to be passed to the corresponding builder\n            configuration class, set on the class attribute [`DatasetBuilder.BUILDER_CONFIG_CLASS`]. 
The builder\n            configuration class is [`BuilderConfig`] or a subclass of it.\n    \"\"\"\n\n    # Default version\n    VERSION = None  # Default version set in BuilderConfig\n\n    # Class for the builder config.\n    BUILDER_CONFIG_CLASS = BuilderConfig\n\n    # Named configurations that modify the data generated by download_and_prepare.\n    BUILDER_CONFIGS = []\n\n    # Optional default config name to be used when name is None\n    DEFAULT_CONFIG_NAME = None\n\n    # Default batch size used by the ArrowWriter\n    # It defines the number of samples that are kept in memory before writing them\n    # and also the length of the arrow chunks\n    # None means that the ArrowWriter will use its default value\n    DEFAULT_WRITER_BATCH_SIZE = None\n\n    def __init__(\n        self,\n        cache_dir: Optional[str] = None,\n        dataset_name: Optional[str] = None,\n        config_name: Optional[str] = None,\n        hash: Optional[str] = None,\n        base_path: Optional[str] = None,\n        info: Optional[DatasetInfo] = None,\n        features: Optional[Features] = None,\n        token: Optional[Union[bool, str]] = None,\n        repo_id: Optional[str] = None,\n        data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,\n        data_dir: Optional[str] = None,\n        storage_options: Optional[dict] = None,\n        writer_batch_size: Optional[int] = None,\n        config_id: Optional[str] = None,\n        **config_kwargs,\n    ):\n        # DatasetBuilder name\n        self.name: str = camelcase_to_snakecase(self.__module__.split(\".\")[-1])\n        self.hash: Optional[str] = hash\n        self.base_path = base_path\n        self.token = token\n        self.repo_id = repo_id\n        self.storage_options = storage_options or {}\n        self.dataset_name = camelcase_to_snakecase(dataset_name) if dataset_name else self.name\n        self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE\n\n        if data_files 
is not None and not isinstance(data_files, DataFilesDict):\n            data_files = DataFilesDict.from_patterns(\n                sanitize_patterns(data_files),\n                base_path=base_path,\n                download_config=DownloadConfig(token=token, storage_options=self.storage_options),\n            )\n\n        # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset\n        if \"features\" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None:\n            config_kwargs[\"features\"] = features\n        if data_files is not None:\n            config_kwargs[\"data_files\"] = data_files\n        if data_dir is not None:\n            config_kwargs[\"data_dir\"] = data_dir\n        self.config_kwargs = config_kwargs\n        self.config, self.config_id = self._create_builder_config(\n            config_name=config_name,\n            custom_features=features,\n            config_id=config_id,\n            **config_kwargs,\n        )\n\n        # prepare info: DatasetInfo are a standardized dataclass across all datasets\n        # Prefill datasetinfo\n        if info is None:\n            info = self._info()\n        info.builder_name = self.name\n        info.dataset_name = self.dataset_name\n        info.config_name = self.config.name\n        info.version = self.config.version\n        self.info = info\n        # update info with user specified infos\n        if features is not None:\n            self.info.features = features\n\n        # Prepare data dirs:\n        # cache_dir can be a remote bucket on GCS or S3\n        self._cache_dir_root = str(cache_dir or config.HF_DATASETS_CACHE)\n        self._cache_dir_root = (\n            self._cache_dir_root if is_remote_url(self._cache_dir_root) else os.path.expanduser(self._cache_dir_root)\n        )\n        self._cache_downloaded_dir = (\n            posixpath.join(self._cache_dir_root, 
config.DOWNLOADED_DATASETS_DIR)\n            if cache_dir\n            else str(config.DOWNLOADED_DATASETS_PATH)\n        )\n        self._cache_downloaded_dir = (\n            self._cache_downloaded_dir\n            if is_remote_url(self._cache_downloaded_dir)\n            else os.path.expanduser(self._cache_downloaded_dir)\n        )\n\n        # In case there exists a legacy cache directory\n        self._legacy_relative_data_dir = None\n\n        self._cache_dir = self._build_cache_dir()\n        if not is_remote_url(self._cache_dir_root):\n            os.makedirs(self._cache_dir_root, exist_ok=True)\n            lock_path = os.path.join(\n                self._cache_dir_root, Path(self._cache_dir).as_posix().replace(\"/\", \"_\") + \".lock\"\n            )\n            with FileLock(lock_path):\n                if os.path.exists(self._cache_dir):  # check if data exist\n                    if len(os.listdir(self._cache_dir)) > 0:\n                        if os.path.exists(os.path.join(self._cache_dir, config.DATASET_INFO_FILENAME)):\n                            logger.debug(\"Overwrite dataset info from restored data version if exists.\")\n                            self.info = DatasetInfo.from_directory(self._cache_dir)\n                    else:  # dir exists but no data, remove the empty dir as data aren't available anymore\n                        logger.warning(\n                            f\"Old caching folder {self._cache_dir} for dataset {self.dataset_name} exists but no data were found. Removing it. 
\"\n                        )\n                        os.rmdir(self._cache_dir)\n\n        # Store in the cache by default unless the user specifies a custom output_dir to download_and_prepare\n        self._output_dir = self._cache_dir\n        self._fs: fsspec.AbstractFileSystem = fsspec.filesystem(\"file\")\n\n        # Set download manager\n        self.dl_manager = None\n\n        # Set to True by \"datasets-cli test\" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value.\n        self._record_infos = False\n\n        # Set in `.download_and_prepare` once the format of the generated dataset is known\n        self._file_format = None\n\n        # Enable streaming (e.g. it patches \"open\" to work with remote files)\n        extend_dataset_builder_for_streaming(self)\n\n    def __getstate__(self):\n        return self.__dict__\n\n    def __setstate__(self, d):\n        self.__dict__ = d\n        # Re-enable streaming, since patched functions are not kept when pickling\n        extend_dataset_builder_for_streaming(self)\n\n    def _check_legacy_cache(self) -> Optional[str]:\n        \"\"\"Check for the old cache directory template {cache_dir}/{namespace}___{builder_name} from 2.13\"\"\"\n        if (\n            self.__module__.startswith(\"datasets.\")\n            and not is_remote_url(self._cache_dir_root)\n            and self.config.name == \"default\"\n        ):\n            from .packaged_modules import _PACKAGED_DATASETS_MODULES\n\n            namespace = self.repo_id.split(\"/\")[0] if self.repo_id and self.repo_id.count(\"/\") > 0 else None\n            config_name = self.repo_id.replace(\"/\", \"--\") if self.repo_id is not None else self.dataset_name\n            config_id = config_name + self.config_id[len(self.config.name) :]\n            hash = _PACKAGED_DATASETS_MODULES.get(self.name, \"missing\")[1]\n            legacy_relative_data_dir = posixpath.join(\n                self.dataset_name if 
namespace is None else f\"{namespace}___{self.dataset_name}\",\n                config_id,\n                \"0.0.0\",\n                hash,\n            )\n            legacy_cache_dir = posixpath.join(self._cache_dir_root, legacy_relative_data_dir)\n            if os.path.isdir(legacy_cache_dir):\n                return legacy_relative_data_dir\n\n    def _check_legacy_cache2(self, dataset_module: \"DatasetModule\") -> Optional[str]:\n        \"\"\"Check for the old cache directory template {cache_dir}/{namespace}___{dataset_name}/{config_name}-xxx from 2.14 and 2.15\"\"\"\n        if (\n            self.__module__.startswith(\"datasets.\")\n            and not is_remote_url(self._cache_dir_root)\n            and not (set(self.config_kwargs) - {\"data_files\", \"data_dir\"})\n        ):\n            from .packaged_modules import _PACKAGED_DATASETS_MODULES_2_15_HASHES\n            from .utils._dill import Pickler\n\n            def update_hash_with_config_parameters(hash: str, config_parameters: dict) -> str:\n                \"\"\"\n                Used to update hash of packaged modules which is used for creating unique cache directories to reflect\n                different config parameters which are passed in metadata from readme.\n                \"\"\"\n                params_to_exclude = {\"config_name\", \"version\", \"description\"}\n                params_to_add_to_hash = {\n                    param: value\n                    for param, value in sorted(config_parameters.items())\n                    if param not in params_to_exclude\n                }\n                m = Hasher()\n                m.update(hash)\n                m.update(params_to_add_to_hash)\n                return m.hexdigest()\n\n            namespace = self.repo_id.split(\"/\")[0] if self.repo_id and self.repo_id.count(\"/\") > 0 else None\n            with patch.object(Pickler, \"_legacy_no_dict_keys_sorting\", True):\n                config_id = self.config.name + \"-\" + 
Hasher.hash({\"data_files\": self.config.data_files})\n            hash = _PACKAGED_DATASETS_MODULES_2_15_HASHES.get(self.name, \"missing\")\n            if (\n                dataset_module.builder_configs_parameters.metadata_configs\n                and self.config.name in dataset_module.builder_configs_parameters.metadata_configs\n            ):\n                hash = update_hash_with_config_parameters(\n                    hash, dataset_module.builder_configs_parameters.metadata_configs[self.config.name]\n                )\n            legacy_relative_data_dir = posixpath.join(\n                self.dataset_name if namespace is None else f\"{namespace}___{self.dataset_name}\",\n                config_id,\n                \"0.0.0\",\n                hash,\n            )\n            legacy_cache_dir = posixpath.join(self._cache_dir_root, legacy_relative_data_dir)\n            if os.path.isdir(legacy_cache_dir):\n                return legacy_relative_data_dir\n\n    def _create_builder_config(\n        self, config_name=None, custom_features=None, config_id=None, **config_kwargs\n    ) -> tuple[BuilderConfig, str]:\n        \"\"\"Create and validate BuilderConfig object as well as a unique config id for this config.\n        Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None.\n        config_kwargs override the defaults kwargs in config\n        \"\"\"\n        builder_config = None\n\n        # try default config\n        if config_name is None and self.BUILDER_CONFIGS:\n            if self.DEFAULT_CONFIG_NAME is not None:\n                builder_config = self.builder_configs.get(self.DEFAULT_CONFIG_NAME)\n                logger.info(f\"No config specified, defaulting to: {self.dataset_name}/{builder_config.name}\")\n            else:\n                if len(self.BUILDER_CONFIGS) > 1:\n                    if not config_kwargs:\n                        example_of_usage = (\n                            
f\"load_dataset('{self.repo_id or self.dataset_name}', '{self.BUILDER_CONFIGS[0].name}')\"\n                        )\n                        raise ValueError(\n                            \"Config name is missing.\"\n                            f\"\\nPlease pick one among the available configs: {list(self.builder_configs.keys())}\"\n                            + f\"\\nExample of usage:\\n\\t`{example_of_usage}`\"\n                        )\n                else:\n                    builder_config = self.BUILDER_CONFIGS[0]\n                    logger.info(\n                        f\"No config specified, defaulting to the single config: {self.dataset_name}/{builder_config.name}\"\n                    )\n\n        # try to get config by name\n        if isinstance(config_name, str):\n            builder_config = self.builder_configs.get(config_name)\n            if builder_config is None and self.BUILDER_CONFIGS:\n                raise ValueError(\n                    f\"BuilderConfig '{config_name}' not found. 
Available: {list(self.builder_configs.keys())}\"\n                )\n\n        # if not using an existing config, then create a new config on the fly\n        if not builder_config:\n            if config_name is not None:\n                config_kwargs[\"name\"] = config_name\n            elif self.DEFAULT_CONFIG_NAME and not config_kwargs:\n                # Use DEFAULT_CONFIG_NAME only if no config_kwargs are passed\n                config_kwargs[\"name\"] = self.DEFAULT_CONFIG_NAME\n            if \"version\" not in config_kwargs and hasattr(self, \"VERSION\") and self.VERSION:\n                config_kwargs[\"version\"] = self.VERSION\n            builder_config = self.BUILDER_CONFIG_CLASS(**config_kwargs)\n\n        # otherwise use the config_kwargs to overwrite the attributes\n        else:\n            builder_config = copy.deepcopy(builder_config) if config_kwargs else builder_config\n            for key, value in config_kwargs.items():\n                if value is not None:\n                    if not hasattr(builder_config, key):\n                        raise ValueError(f\"BuilderConfig {builder_config} doesn't have a '{key}' key.\")\n                    setattr(builder_config, key, value)\n\n        if not builder_config.name:\n            raise ValueError(f\"BuilderConfig must have a name, got {builder_config.name}\")\n\n        # resolve data files if needed\n        builder_config._resolve_data_files(\n            base_path=self.base_path,\n            download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),\n        )\n\n        # compute the config id that is going to be used for caching\n        if config_id is None:\n            config_id = builder_config.create_config_id(\n                config_kwargs,\n                custom_features=custom_features,\n            )\n        is_custom = (config_id not in self.builder_configs) and config_id != \"default\"\n        if is_custom:\n            logger.info(f\"Using 
custom data configuration {config_id}\")\n        else:\n            if (\n                builder_config.name in self.builder_configs\n                and builder_config != self.builder_configs[builder_config.name]\n            ):\n                raise ValueError(\n                    \"Cannot name a custom BuilderConfig the same as an available \"\n                    f\"BuilderConfig. Change the name. Available BuilderConfigs: {list(self.builder_configs.keys())}\"\n                )\n            if not builder_config.version:\n                raise ValueError(f\"BuilderConfig {builder_config.name} must have a version\")\n\n        return builder_config, config_id\n\n    @classproperty\n    @classmethod\n    @memoize()\n    def builder_configs(cls) -> dict[str, BuilderConfig]:\n        \"\"\"Dictionary of pre-defined configurations for this builder class.\"\"\"\n        configs = {config.name: config for config in cls.BUILDER_CONFIGS}\n        if len(configs) != len(cls.BUILDER_CONFIGS):\n            names = [config.name for config in cls.BUILDER_CONFIGS]\n            raise ValueError(f\"Names in BUILDER_CONFIGS must not be duplicated. 
Got {names}\")\n        return configs\n\n    @property\n    def cache_dir(self):\n        return self._cache_dir\n\n    def _use_legacy_cache_dir_if_possible(self, dataset_module: \"DatasetModule\"):\n        # Check for the legacy cache directory template (datasets<3.0.0)\n        self._legacy_relative_data_dir = (\n            self._check_legacy_cache2(dataset_module) or self._check_legacy_cache() or None\n        )\n        self._cache_dir = self._build_cache_dir()\n        self._output_dir = self._cache_dir\n\n    def _relative_data_dir(self, with_version=True, with_hash=True) -> str:\n        \"\"\"Relative path of this dataset in cache_dir:\n        Will be:\n            self.dataset_name/self.config.version/self.hash/\n        or if a repo_id with a namespace has been specified:\n            self.namespace___self.dataset_name/self.config.version/self.hash/\n        If any of these element is missing or if ``with_version=False`` the corresponding subfolders are dropped.\n        \"\"\"\n        if self._legacy_relative_data_dir is not None and with_version and with_hash:\n            return self._legacy_relative_data_dir\n\n        namespace = self.repo_id.split(\"/\")[0] if self.repo_id and self.repo_id.count(\"/\") > 0 else None\n        builder_data_dir = self.dataset_name if namespace is None else f\"{namespace}___{self.dataset_name}\"\n        builder_data_dir = posixpath.join(builder_data_dir, self.config_id)\n        if with_version:\n            builder_data_dir = posixpath.join(builder_data_dir, str(self.config.version))\n        if with_hash and self.hash and isinstance(self.hash, str):\n            builder_data_dir = posixpath.join(builder_data_dir, self.hash)\n        return builder_data_dir\n\n    def _build_cache_dir(self):\n        \"\"\"Return the data directory for the current version.\"\"\"\n        builder_data_dir = posixpath.join(self._cache_dir_root, self._relative_data_dir(with_version=False))\n        version_data_dir = 
posixpath.join(self._cache_dir_root, self._relative_data_dir(with_version=True))\n\n        def _other_versions_on_disk():\n            \"\"\"Returns previous versions on disk.\"\"\"\n            if not os.path.exists(builder_data_dir):\n                return []\n\n            version_dirnames = []\n            for dir_name in os.listdir(builder_data_dir):\n                try:\n                    version_dirnames.append((utils.Version(dir_name), dir_name))\n                except ValueError:  # Invalid version (ex: incomplete data dir)\n                    pass\n            version_dirnames.sort(reverse=True)\n            return version_dirnames\n\n        # Check and warn if other versions exist\n        if not is_remote_url(builder_data_dir):\n            version_dirs = _other_versions_on_disk()\n            if version_dirs:\n                other_version = version_dirs[0][0]\n                if other_version != self.config.version:\n                    warn_msg = (\n                        f\"Found a different version {str(other_version)} of dataset {self.dataset_name} in \"\n                        f\"cache_dir {self._cache_dir_root}. Using currently defined version \"\n                        f\"{str(self.config.version)}.\"\n                    )\n                    logger.warning(warn_msg)\n\n        return version_data_dir\n\n    @abc.abstractmethod\n    def _info(self) -> DatasetInfo:\n        \"\"\"Construct the DatasetInfo object. 
See `DatasetInfo` for details.\n\n        Warning: This function is only called once and the result is cached for all\n        following .info() calls.\n\n        Returns:\n            info: (DatasetInfo) The dataset information\n        \"\"\"\n        raise NotImplementedError\n\n    @classmethod\n    def get_imported_module_dir(cls):\n        \"\"\"Return the path of the module of this class or subclass.\"\"\"\n        return os.path.dirname(inspect.getfile(inspect.getmodule(cls)))\n\n    def _rename(self, src: str, dst: str):\n        rename(self._fs, src, dst)\n\n    def download_and_prepare(\n        self,\n        output_dir: Optional[str] = None,\n        download_config: Optional[DownloadConfig] = None,\n        download_mode: Optional[Union[DownloadMode, str]] = None,\n        verification_mode: Optional[Union[VerificationMode, str]] = None,\n        dl_manager: Optional[DownloadManager] = None,\n        base_path: Optional[str] = None,\n        file_format: str = \"arrow\",\n        max_shard_size: Optional[Union[int, str]] = None,\n        num_proc: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n        **download_and_prepare_kwargs,\n    ):\n        \"\"\"Downloads and prepares dataset for reading.\n\n        Args:\n            output_dir (`str`, *optional*):\n                Output directory for the dataset.\n                Default to this builder's `cache_dir`, which is inside `~/.cache/huggingface/datasets` by default.\n\n                <Added version=\"2.5.0\"/>\n            download_config (`DownloadConfig`, *optional*):\n                Specific download configuration parameters.\n            download_mode ([`DownloadMode`] or `str`, *optional*):\n                Select the download/generate mode, default to `REUSE_DATASET_IF_EXISTS`.\n            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):\n                Verification mode determining the checks to run on the downloaded/processed 
dataset information (checksums/size/splits/...).\n\n                <Added version=\"2.9.1\"/>\n            dl_manager (`DownloadManager`, *optional*):\n                Specific `DownloadManger` to use.\n            base_path (`str`, *optional*):\n                Base path for relative paths that are used to download files. This can be a remote url.\n                If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead.\n            file_format (`str`, *optional*):\n                Format of the data files in which the dataset will be written.\n                Supported formats: \"arrow\", \"parquet\". Default to \"arrow\" format.\n                If the format is \"parquet\", then image and audio data are embedded into the Parquet files instead of pointing to local files.\n\n                <Added version=\"2.5.0\"/>\n            max_shard_size (`Union[str, int]`, *optional*):\n                Maximum number of bytes written per shard, default is \"500MB\".\n                The size is based on uncompressed data size, so in practice your shard files may be smaller than\n                `max_shard_size` thanks to Parquet compression for example.\n\n                <Added version=\"2.5.0\"/>\n            num_proc (`int`, *optional*, defaults to `None`):\n                Number of processes when downloading and generating the dataset locally.\n                Multiprocessing is disabled by default.\n\n                <Added version=\"2.7.0\"/>\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the caching file-system backend, if any.\n\n                <Added version=\"2.5.0\"/>\n            **download_and_prepare_kwargs (additional keyword arguments): Keyword arguments.\n\n        Example:\n\n        Download and prepare the dataset as Arrow files that can be loaded as a Dataset using `builder.as_dataset()`:\n\n        ```py\n        >>> from datasets import 
load_dataset_builder\n        >>> builder = load_dataset_builder(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> builder.download_and_prepare()\n        ```\n\n        Download and prepare the dataset as sharded Parquet files locally:\n\n        ```py\n        >>> from datasets import load_dataset_builder\n        >>> builder = load_dataset_builder(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> builder.download_and_prepare(\"./output_dir\", file_format=\"parquet\")\n        ```\n\n        Download and prepare the dataset as sharded Parquet files in a cloud storage:\n\n        ```py\n        >>> from datasets import load_dataset_builder\n        >>> storage_options = {\"key\": aws_access_key_id, \"secret\": aws_secret_access_key}\n        >>> builder = load_dataset_builder(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> builder.download_and_prepare(\"s3://my-bucket/my_rotten_tomatoes\", storage_options=storage_options, file_format=\"parquet\")\n        ```\n        \"\"\"\n        output_dir = output_dir if output_dir is not None else self._cache_dir\n        # output_dir can be a remote bucket on GCS or S3\n        fs, output_dir = url_to_fs(output_dir, **(storage_options or {}))\n        self._fs = fs\n        self._output_dir = output_dir if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(output_dir)\n\n        download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)\n        verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)\n        base_path = base_path if base_path is not None else self.base_path\n\n        if file_format is not None and file_format not in [\"arrow\", \"parquet\"]:\n            raise ValueError(f\"Unsupported file_format: {file_format}. 
Expected 'arrow' or 'parquet'\")\n        self._file_format = file_format\n\n        if self._fs._strip_protocol(self._output_dir) == \"\":\n            # We don't support the root directory, because it has no dirname,\n            # and we need a dirname to use a <dirname>.incomplete directory\n            # when the dataset is being written\n            raise RuntimeError(\n                f\"Unable to download and prepare the dataset at the root {self._output_dir}. \"\n                f\"Please specify a subdirectory, e.g. '{self._output_dir + self.dataset_name}'\"\n            )\n\n        if dl_manager is None:\n            if download_config is None:\n                download_config = DownloadConfig(\n                    cache_dir=self._cache_downloaded_dir,\n                    force_download=download_mode == DownloadMode.FORCE_REDOWNLOAD,\n                    force_extract=download_mode == DownloadMode.FORCE_REDOWNLOAD,\n                    use_etag=False,\n                    num_proc=num_proc,\n                    token=self.token,\n                    storage_options=self.storage_options,\n                )  # We don't use etag for data files to speed up the process\n\n            dl_manager = DownloadManager(\n                dataset_name=self.dataset_name,\n                download_config=download_config,\n                data_dir=self.config.data_dir,\n                base_path=base_path,\n                record_checksums=(self._record_infos or verification_mode == VerificationMode.ALL_CHECKS),\n            )\n\n        is_local = not is_remote_filesystem(self._fs)\n        self.dl_manager = dl_manager\n\n        # Prevent parallel local disk operations\n        if is_local:\n            # Create parent directory of the output_dir to put the lock file in there\n            Path(self._output_dir).parent.mkdir(parents=True, exist_ok=True)\n            lock_path = self._output_dir + \"_builder.lock\"\n\n        # File locking only with local paths; no 
file locking on GCS or S3\n        with FileLock(lock_path) if is_local else contextlib.nullcontext():\n            # Check if the data already exists\n            data_exists = self._fs.exists(posixpath.join(self._output_dir, config.DATASET_INFO_FILENAME))\n            if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:\n                logger.info(f\"Found cached dataset {self.dataset_name} ({self._output_dir})\")\n                # We need to update the info in case some splits were added in the meantime\n                # for example when calling load_dataset from multiple workers.\n                self.info = self._load_info()\n                self.download_post_processing_resources(dl_manager)\n                return\n\n            logger.info(f\"Generating dataset {self.dataset_name} ({self._output_dir})\")\n            if is_local:  # if cache dir is local, check for available space\n                if not has_sufficient_disk_space(\n                    self.info.size_in_bytes or 0, directory=Path(self._output_dir).parent\n                ):\n                    raise OSError(\n                        f\"Not enough disk space. 
Needed: {size_str(self.info.size_in_bytes or 0)} (download: {size_str(self.info.download_size or 0)}, generated: {size_str(self.info.dataset_size or 0)}, post-processed: {size_str(self.info.post_processing_size or 0)})\"\n                    )\n\n            @contextlib.contextmanager\n            def incomplete_dir(dirname):\n                \"\"\"Create temporary dir for dirname and rename on exit.\"\"\"\n                if not is_local:\n                    self._fs.makedirs(dirname, exist_ok=True)\n                    yield dirname\n                else:\n                    tmp_dir = dirname + \".incomplete\"\n                    os.makedirs(tmp_dir, exist_ok=True)\n                    try:\n                        yield tmp_dir\n                        if os.path.isdir(dirname):\n                            shutil.rmtree(dirname)\n                        # LocalFileSystem.mv does copy + rm, it is more efficient to simply rename a local directory\n                        shutil.move(tmp_dir, dirname)\n                    finally:\n                        if os.path.exists(tmp_dir):\n                            shutil.rmtree(tmp_dir)\n\n            # Print is intentional: we want this to always go to stdout so user has\n            # information needed to cancel download/preparation if needed.\n            # This comes right before the progress bar.\n            if self.info.size_in_bytes:\n                logger.info(\n                    f\"Downloading and preparing dataset {self.dataset_name}/{self.config.name} \"\n                    f\"(download: {size_str(self.info.download_size)}, generated: {size_str(self.info.dataset_size)}, \"\n                    f\"post-processed: {size_str(self.info.post_processing_size)}, \"\n                    f\"total: {size_str(self.info.size_in_bytes)}) to {self._output_dir}...\"\n                )\n            else:\n                _dest = self._fs._strip_protocol(self._output_dir) if is_local else self._output_dir\n        
        logger.info(f\"Downloading and preparing dataset {self.dataset_name}/{self.config.name} to {_dest}...\")\n\n            # Create a tmp dir and rename to self._output_dir on successful exit.\n            with incomplete_dir(self._output_dir) as tmp_output_dir:\n                # Temporarily assign _output_dir to tmp_data_dir to avoid having to forward\n                # it to every sub function.\n                with temporary_assignment(self, \"_output_dir\", tmp_output_dir):\n                    prepare_split_kwargs = {\"file_format\": file_format}\n                    if max_shard_size is not None:\n                        prepare_split_kwargs[\"max_shard_size\"] = max_shard_size\n                    if num_proc is not None:\n                        prepare_split_kwargs[\"num_proc\"] = num_proc\n                    self._download_and_prepare(\n                        dl_manager=dl_manager,\n                        verification_mode=verification_mode,\n                        **prepare_split_kwargs,\n                        **download_and_prepare_kwargs,\n                    )\n                    # Sync info\n                    self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())\n                    self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()\n                    if self.info.download_size is not None:\n                        self.info.size_in_bytes = self.info.dataset_size + self.info.download_size\n                    # Save info\n                    self._save_info()\n\n            # Download post processing resources\n            self.download_post_processing_resources(dl_manager)\n\n            logger.info(\n                f\"Dataset {self.dataset_name} downloaded and prepared to {self._output_dir}. 
\"\n                f\"Subsequent calls will reuse this data.\"\n            )\n\n    def _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs):\n        \"\"\"Downloads and prepares dataset for reading.\n\n        This is the internal implementation to overwrite called when user calls\n        `download_and_prepare`. It should download all required data and generate\n        the pre-processed datasets files.\n\n        Args:\n            dl_manager ([`DownloadManager`]):\n                `DownloadManager` used to download and cache data.\n            verification_mode ([`VerificationMode`]):\n                if `ALL_CHECKS`, perform all the verifications including checksums.\n                if `BASIC_CHECKS`, do not perform checksums, only perform split tests.\n                if `NO_CHECKS`, do not perform any verification.\n            prepare_split_kwargs: Additional options, such as `file_format`, `max_shard_size`\n        \"\"\"\n        # Generating data for all splits\n        split_dict = SplitDict(dataset_name=self.dataset_name)\n        split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)\n        split_generators: list[SplitGenerator] = self._split_generators(dl_manager, **split_generators_kwargs)\n\n        # Checksums verification\n        if verification_mode == VerificationMode.ALL_CHECKS and dl_manager.record_checksums:\n            verify_checksums(\n                self.info.download_checksums, dl_manager.get_recorded_sizes_checksums(), \"dataset source files\"\n            )\n\n        # Build splits\n        for split_generator in split_generators:\n            if str(split_generator.split_info.name).lower() == \"all\":\n                raise ValueError(\n                    \"`all` is a special split keyword corresponding to the \"\n                    \"union of all splits, so cannot be used as key in \"\n                    \"._split_generator().\"\n                )\n\n            
logger.info(f\"Generating {split_generator.split_info.name} split\")\n            split_dict.add(split_generator.split_info)\n\n            try:\n                # Prepare split will record examples associated to the split\n                self._prepare_split(split_generator, **prepare_split_kwargs)\n            except OSError as e:\n                raise OSError(\"Cannot find data file. \" + \"\\nOriginal error:\\n\" + str(e)) from None\n            dl_manager.manage_extracted_files()\n\n        if verification_mode == VerificationMode.BASIC_CHECKS or verification_mode == VerificationMode.ALL_CHECKS:\n            verify_splits(self.info.splits, split_dict)\n\n        # Update the info object with the splits.\n        self.info.splits = split_dict\n        self.info.download_size = dl_manager.downloaded_size\n\n    def download_post_processing_resources(self, dl_manager):\n        for split in self.info.splits or []:\n            for resource_name, resource_file_name in self._post_processing_resources(split).items():\n                if not not is_remote_filesystem(self._fs):\n                    raise NotImplementedError(f\"Post processing is not supported on filesystem {self._fs}\")\n                if os.sep in resource_file_name:\n                    raise ValueError(f\"Resources shouldn't be in a sub-directory: {resource_file_name}\")\n                resource_path = os.path.join(self._output_dir, resource_file_name)\n                if not os.path.exists(resource_path):\n                    downloaded_resource_path = self._download_post_processing_resources(\n                        split, resource_name, dl_manager\n                    )\n                    if downloaded_resource_path:\n                        logger.info(f\"Downloaded post-processing resource {resource_name} as {resource_file_name}\")\n                        shutil.move(downloaded_resource_path, resource_path)\n\n    def _load_info(self) -> DatasetInfo:\n        return 
DatasetInfo.from_directory(self._output_dir, storage_options=self._fs.storage_options)\n\n    def _save_info(self):\n        file_lock = (\n            FileLock(self._output_dir + \"_info.lock\")\n            if not is_remote_filesystem(self._fs)\n            else contextlib.nullcontext()\n        )\n        with file_lock:\n            self.info.write_to_directory(self._output_dir, storage_options=self._fs.storage_options)\n\n    def _make_split_generators_kwargs(self, prepare_split_kwargs):\n        \"\"\"Get kwargs for `self._split_generators()` from `prepare_split_kwargs`.\"\"\"\n        del prepare_split_kwargs\n        return {}\n\n    def as_dataset(\n        self,\n        split: Optional[Union[str, Split, list[str], list[Split]]] = None,\n        run_post_process=True,\n        verification_mode: Optional[Union[VerificationMode, str]] = None,\n        in_memory=False,\n    ) -> Union[Dataset, DatasetDict]:\n        \"\"\"Return a Dataset for the specified split.\n\n        Args:\n            split (`datasets.Split`):\n                Which subset of the data to return.\n            run_post_process (`bool`, defaults to `True`):\n                Whether to run post-processing dataset transforms and/or add\n                indexes.\n            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):\n                Verification mode determining the checks to run on the\n                downloaded/processed dataset information (checksums/size/splits/...).\n\n                <Added version=\"2.9.1\"/>\n            in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n\n        Returns:\n            datasets.Dataset\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset_builder\n        >>> builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')\n        >>> builder.download_and_prepare()\n        >>> ds = builder.as_dataset(split='train')\n        
>>> ds\n        Dataset({\n            features: ['text', 'label'],\n            num_rows: 8530\n        })\n        ```\n        \"\"\"\n        if self._file_format is not None and self._file_format != \"arrow\":\n            raise FileFormatError('Loading a dataset not written in the \"arrow\" format is not supported.')\n        if is_remote_filesystem(self._fs):\n            raise NotImplementedError(f\"Loading a dataset cached in a {type(self._fs).__name__} is not supported.\")\n        if not os.path.exists(self._output_dir):\n            raise FileNotFoundError(\n                f\"Dataset {self.dataset_name}: could not find data in {self._output_dir}. Please make sure to call \"\n                \"builder.download_and_prepare(), or use \"\n                \"datasets.load_dataset() before trying to access the Dataset object.\"\n            )\n\n        logger.debug(f\"Constructing Dataset for split {split or ', '.join(self.info.splits)}, from {self._output_dir}\")\n\n        # By default, return all splits\n        if split is None:\n            split = {s: s for s in self.info.splits}\n\n        verification_mode = VerificationMode(verification_mode or VerificationMode.BASIC_CHECKS)\n\n        # Create a dataset for each of the given splits\n        datasets = map_nested(\n            partial(\n                self._build_single_dataset,\n                run_post_process=run_post_process,\n                verification_mode=verification_mode,\n                in_memory=in_memory,\n            ),\n            split,\n            map_tuple=True,\n            disable_tqdm=True,\n        )\n        if isinstance(datasets, dict):\n            datasets = DatasetDict(datasets)\n        return datasets\n\n    def _build_single_dataset(\n        self,\n        split: Union[str, ReadInstruction, Split],\n        run_post_process: bool,\n        verification_mode: VerificationMode,\n        in_memory: bool = False,\n    ):\n        \"\"\"as_dataset for a single 
split.\"\"\"\n        if not isinstance(split, ReadInstruction):\n            split = str(split)\n            if split == \"all\":\n                split = \"+\".join(self.info.splits.keys())\n            split = Split(split)\n\n        # Build base dataset\n        ds = self._as_dataset(\n            split=split,\n            in_memory=in_memory,\n        )\n        if run_post_process:\n            for resource_file_name in self._post_processing_resources(split).values():\n                if os.sep in resource_file_name:\n                    raise ValueError(f\"Resources shouldn't be in a sub-directory: {resource_file_name}\")\n            resources_paths = {\n                resource_name: os.path.join(self._output_dir, resource_file_name)\n                for resource_name, resource_file_name in self._post_processing_resources(split).items()\n            }\n            post_processed = self._post_process(ds, resources_paths)\n            if post_processed is not None:\n                ds = post_processed\n                recorded_checksums = {}\n                record_checksums = False\n                for resource_name, resource_path in resources_paths.items():\n                    size_checksum = get_size_checksum_dict(resource_path)\n                    recorded_checksums[resource_name] = size_checksum\n                if verification_mode == VerificationMode.ALL_CHECKS and record_checksums:\n                    if self.info.post_processed is None or self.info.post_processed.resources_checksums is None:\n                        expected_checksums = None\n                    else:\n                        expected_checksums = self.info.post_processed.resources_checksums.get(split)\n                    verify_checksums(expected_checksums, recorded_checksums, \"post processing resources\")\n                if self.info.post_processed is None:\n                    self.info.post_processed = PostProcessedInfo()\n                if 
self.info.post_processed.resources_checksums is None:\n                    self.info.post_processed.resources_checksums = {}\n                self.info.post_processed.resources_checksums[str(split)] = recorded_checksums\n                self.info.post_processing_size = sum(\n                    checksums_dict[\"num_bytes\"]\n                    for split_checksums_dicts in self.info.post_processed.resources_checksums.values()\n                    for checksums_dict in split_checksums_dicts.values()\n                )\n                if self.info.dataset_size is not None and self.info.download_size is not None:\n                    self.info.size_in_bytes = (\n                        self.info.dataset_size + self.info.download_size + self.info.post_processing_size\n                    )\n                self._save_info()\n                ds._info.post_processed = self.info.post_processed\n                ds._info.post_processing_size = self.info.post_processing_size\n                ds._info.size_in_bytes = self.info.size_in_bytes\n                if self.info.post_processed.features is not None:\n                    if self.info.post_processed.features.type != ds.features.type:\n                        raise ValueError(\n                            f\"Post-processed features info don't match the dataset:\\nGot\\n{self.info.post_processed.features}\\nbut expected something like\\n{ds.features}\"\n                        )\n                    else:\n                        ds.info.features = self.info.post_processed.features\n\n        return ds\n\n    def _as_dataset(self, split: Union[ReadInstruction, Split] = Split.TRAIN, in_memory: bool = False) -> Dataset:\n        \"\"\"Constructs a `Dataset`.\n\n        This is the internal implementation to overwrite called when user calls\n        `as_dataset`. 
It should read the pre-processed datasets files and generate\n        the `Dataset` object.\n\n        Args:\n            split (`datasets.Split`):\n                which subset of the data to read.\n            in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n\n        Returns:\n            `Dataset`\n        \"\"\"\n        cache_dir = self._fs._strip_protocol(self._output_dir)\n        dataset_name = self.dataset_name\n        if self._check_legacy_cache():\n            dataset_name = self.name\n        dataset_kwargs = ArrowReader(cache_dir, self.info).read(\n            name=dataset_name,\n            instructions=split,\n            split_infos=self.info.splits.values(),\n            in_memory=in_memory,\n        )\n        fingerprint = self._get_dataset_fingerprint(split)\n        return Dataset(fingerprint=fingerprint, **dataset_kwargs)\n\n    def _get_dataset_fingerprint(self, split: Union[ReadInstruction, Split]) -> str:\n        \"\"\"The dataset fingerprint is the hash of the relative directory dataset_name/config_name/version/hash, as well as the split specs.\"\"\"\n        hasher = Hasher()\n        hasher.update(Path(self._relative_data_dir()).as_posix())\n        hasher.update(str(split))  # for example: train, train+test, train[:10%], test[:33%](pct1_dropremainder)\n        fingerprint = hasher.hexdigest()\n        return fingerprint\n\n    def as_streaming_dataset(\n        self,\n        split: Optional[str] = None,\n        base_path: Optional[str] = None,\n    ) -> Union[dict[str, IterableDataset], IterableDataset]:\n        if is_remote_filesystem(self._fs):\n            raise NotImplementedError(\n                f\"Loading a streaming dataset cached in a {type(self._fs).__name__} is not supported yet.\"\n            )\n\n        dl_manager = StreamingDownloadManager(\n            base_path=base_path or self.base_path,\n            download_config=DownloadConfig(token=self.token, 
storage_options=self.storage_options),\n            dataset_name=self.dataset_name,\n            data_dir=self.config.data_dir,\n        )\n        splits_generators = {sg.name: sg for sg in self._split_generators(dl_manager)}\n        # By default, return all splits\n        if split is None:\n            splits_generator = splits_generators\n        elif split in splits_generators:\n            splits_generator = splits_generators[split]\n        else:\n            raise ValueError(f\"Bad split: {split}. Available splits: {list(splits_generators)}\")\n\n        # Create a dataset for each of the given splits\n        datasets = map_nested(\n            self._as_streaming_dataset_single,\n            splits_generator,\n            map_tuple=True,\n        )\n        if isinstance(datasets, dict):\n            datasets = IterableDatasetDict(datasets)\n        return datasets\n\n    def _as_streaming_dataset_single(\n        self,\n        splits_generator,\n    ) -> IterableDataset:\n        ex_iterable = self._get_examples_iterable_for_split(splits_generator)\n        # add auth to be able to access and decode audio/image files from private repositories.\n        token_per_repo_id = {self.repo_id: self.token} if self.repo_id else {}\n        return IterableDataset(\n            ex_iterable, info=self.info, split=splits_generator.name, token_per_repo_id=token_per_repo_id\n        )\n\n    def _post_process(self, dataset: Dataset, resources_paths: Mapping[str, str]) -> Optional[Dataset]:\n        \"\"\"Run dataset transforms or add indexes\"\"\"\n        return None\n\n    def _post_processing_resources(self, split: str) -> dict[str, str]:\n        \"\"\"Mapping resource_name -> resource_file_name\"\"\"\n        return {}\n\n    def _download_post_processing_resources(\n        self, split: str, resource_name: str, dl_manager: DownloadManager\n    ) -> Optional[str]:\n        \"\"\"Download the resource using the download manager and return the downloaded 
path.\"\"\"\n        return None\n\n    @abc.abstractmethod\n    def _split_generators(self, dl_manager: Union[DownloadManager, StreamingDownloadManager]):\n        \"\"\"Specify feature dictionary generators and dataset splits.\n\n        This function returns a list of `SplitGenerator`s defining how to generate\n        data and what splits to use.\n\n        Example:\n\n            return [\n                    datasets.SplitGenerator(\n                            name=datasets.Split.TRAIN,\n                            gen_kwargs={'file': 'train_data.zip'},\n                    ),\n                    datasets.SplitGenerator(\n                            name=datasets.Split.TEST,\n                            gen_kwargs={'file': 'test_data.zip'},\n                    ),\n            ]\n\n        The above code will first call `_generate_examples(file='train_data.zip')`\n        to write the train data, then `_generate_examples(file='test_data.zip')` to\n        write the test data.\n\n        Datasets are typically split into different subsets to be used at various\n        stages of training and evaluation.\n\n        Note that for datasets without a `VALIDATION` split, you can use a\n        fraction of the `TRAIN` data for evaluation as you iterate on your model\n        so as not to overfit to the `TEST` data.\n\n        For downloads and extractions, use the given `download_manager`.\n        Note that the `DownloadManager` caches downloads, so it is fine to have each\n        generator attempt to download the source data.\n\n        A good practice is to download all data in this function, and then\n        distribute the relevant parts to each split with the `gen_kwargs` argument\n\n        Args:\n            dl_manager (`Union[DownloadManager, StreamingDownloadManager]`):\n                Download manager to download the data\n\n        Returns:\n            `list<SplitGenerator>`.\n        \"\"\"\n        raise NotImplementedError()\n\n    
@abc.abstractmethod\n    def _prepare_split(\n        self,\n        split_generator: SplitGenerator,\n        file_format: str = \"arrow\",\n        max_shard_size: Optional[Union[str, int]] = None,\n        num_proc: Optional[int] = None,\n        **kwargs,\n    ):\n        \"\"\"Generate the examples and record them on disk.\n\n        Args:\n            split_generator (`SplitGenerator`):\n                Split generator to process\n            file_format (`str`, *optional*):\n                format of the data files in which the dataset will be written.\n                Supported formats: \"arrow\", \"parquet\". Default to \"arrow\" format.\n            max_shard_size (`Union[str, int]`, *optional*):\n                Maximum number of bytes written per shard, default is \"500MB\".\n                The size is based on uncompressed data size, so in practice your shard files may be smaller than\n                `max_shard_size` thanks to Parquet compression for example.\n            num_proc (`int`, *optional*, defaults to `None`):\n                Number of processes when downloading and generating the dataset locally.\n                Multiprocessing is disabled by default.\n\n                <Added version=\"2.7.0\"/>\n            **kwargs: Additional kwargs forwarded from _download_and_prepare\n        \"\"\"\n        raise NotImplementedError()\n\n    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:\n        \"\"\"Generate the examples on the fly.\n\n        Args:\n            split_generator (`SplitGenerator`):\n                Split generator to process\n        \"\"\"\n        raise NotImplementedError()\n\n\n@dataclass\nclass Key:\n    original_shard_id: int\n    item_or_batch_id: int\n\n    def __str__(self):\n        return str((self.original_shard_id, self.item_or_batch_id))\n\n\nclass GeneratorBasedBuilder(DatasetBuilder):\n    \"\"\"Base class for datasets with data generation based on dict 
generators.\n\n    `GeneratorBasedBuilder` is a convenience class that abstracts away much\n    of the data writing and reading of `DatasetBuilder`. It expects subclasses to\n    implement generators of feature dictionaries across the dataset splits\n    (`_split_generators`). See the method docstrings for details.\n    \"\"\"\n\n    def _generate_shards(self, **kwargs) -> Iterator[Union[str, dict[str, Any]]]:\n        \"\"\"Default function generating shards paths for each `SplitGenerator`.\n\n        This function is useful to list the original shards from where the data\n        comes from and is either converted to Arrow or streamed to an IterableDataset.\n\n        This is optional and only used for certain utilities, but not in Dataset\n        nor IterableDataset. E.g. it's used to map original shard files to Parquet\n        files in the Dataset Viewer after conversion.\n\n        Args:\n            **kwargs (additional keyword arguments):\n                Arguments forwarded from the SplitGenerator.gen_kwargs\n\n        Yields:\n            shard: generally a string representing the shard path, or a dict\n                representing the shard in case of shards spanning intra or inter-files.\n        \"\"\"\n        raise NotImplementedError()\n\n    @abc.abstractmethod\n    def _generate_examples(self, **kwargs) -> Iterator[tuple[Key, dict[str, Any]]]:\n        \"\"\"Default function generating examples for each `SplitGenerator`.\n\n        This function preprocess the examples from the raw data to the preprocessed\n        dataset files.\n        This function is called once for each `SplitGenerator` defined in\n        `_split_generators`. 
The examples yielded here will be written on\n        disk.\n\n        Args:\n            **kwargs (additional keyword arguments):\n                Arguments forwarded from the SplitGenerator.gen_kwargs\n\n        Yields:\n            key: `str` or `int`, a unique deterministic example identification key.\n                * Unique: An error will be raised if two examples are yield with the\n                    same key.\n                * Deterministic: When generating the dataset twice, the same example\n                    should have the same key.\n                Good keys can be the image id, or line number if examples are extracted\n                from a text file.\n                The key will be hashed and sorted to shuffle examples deterministically,\n                such as generating the dataset multiple times keep examples in the\n                same order.\n            example: `dict<str feature_name, feature_value>`, a feature dictionary\n                ready to be encoded and written to disk. 
The example will be\n                encoded with `self.info.features.encode_example({...})`.\n        \"\"\"\n        raise NotImplementedError()\n\n    def _prepare_split(\n        self,\n        split_generator: SplitGenerator,\n        file_format=\"arrow\",\n        num_proc: Optional[int] = None,\n        max_shard_size: Optional[Union[int, str]] = None,\n    ):\n        max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)\n\n        if self.info.splits is not None:\n            split_info = self.info.splits[split_generator.name]\n        else:\n            split_info = split_generator.split_info\n\n        SUFFIX = \"-JJJJJ-SSSSS-of-NNNNN\"\n        fname = f\"{self.dataset_name}-{split_generator.name}{SUFFIX}.{file_format}\"\n        fpath = posixpath.join(self._output_dir, fname)\n\n        if num_proc and num_proc > 1:\n            num_original_shards = _number_of_shards_in_gen_kwargs(split_generator.gen_kwargs)\n            if num_original_shards <= 1:\n                logger.warning(\n                    f\"Setting num_proc from {num_proc} back to 1 for the {split_info.name} split to disable multiprocessing as it only contains one shard.\"\n                )\n                num_proc = 1\n            elif num_original_shards < num_proc:\n                logger.warning(\n                    f\"Setting num_proc from {num_proc} to {num_original_shards} for the {split_info.name} split as it only contains {num_original_shards} shards.\"\n                )\n                num_proc = num_original_shards\n\n        pbar = hf_tqdm(\n            unit=\" examples\",\n            total=split_info.num_examples,\n            desc=f\"Generating {split_info.name} split\",\n        )\n\n        _prepare_split_args = {\n            \"fpath\": fpath,\n            \"file_format\": file_format,\n            \"max_shard_size\": max_shard_size,\n            \"split_info\": split_info,\n        }\n\n        if num_proc is None or num_proc == 1:\n 
           result = None\n            gen_kwargs = split_generator.gen_kwargs\n            job_id = 0\n            with pbar:\n                for job_id, done, content in self._prepare_split_single(\n                    gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args\n                ):\n                    if done:\n                        result = content\n                    else:\n                        pbar.update(content)\n            # wrapping everything into lists for consistency with the multiprocessed code path\n            assert result is not None, \"Failed to retrieve results from prepare_split\"\n            (\n                examples_per_job,\n                bytes_per_job,\n                features_per_job,\n                shards_per_job,\n                shard_lengths_per_job,\n                original_shards_per_job,\n                original_shard_lengths_per_job,\n            ) = ([item] for item in result)\n        else:\n            kwargs_per_job = [\n                {\"gen_kwargs\": gen_kwargs, \"job_id\": job_id, **_prepare_split_args}\n                for job_id, gen_kwargs in enumerate(\n                    _split_gen_kwargs(split_generator.gen_kwargs, max_num_jobs=num_proc)\n                )\n            ]\n            num_jobs = len(kwargs_per_job)\n\n            examples_per_job = [None] * num_jobs\n            bytes_per_job = [None] * num_jobs\n            features_per_job = [None] * num_jobs\n            shards_per_job = [None] * num_jobs\n            shard_lengths_per_job = [None] * num_jobs\n            original_shards_per_job = [None] * num_jobs\n            original_shard_lengths_per_job = [None] * num_jobs\n\n            with Pool(num_proc) as pool:\n                with pbar:\n                    for job_id, done, content in iflatmap_unordered(\n                        pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job\n                    ):\n                        if done:\n                    
        # the content is the result of the job\n                            (\n                                examples_per_job[job_id],\n                                bytes_per_job[job_id],\n                                features_per_job[job_id],\n                                shards_per_job[job_id],\n                                shard_lengths_per_job[job_id],\n                                original_shards_per_job[job_id],\n                                original_shard_lengths_per_job[job_id],\n                            ) = content\n                        else:\n                            # the content is the number of examples progress update\n                            pbar.update(content)\n\n            assert None not in examples_per_job, (\n                f\"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results\"\n            )\n\n        total_shards = sum(shards_per_job)\n        total_original_shards = sum(original_shards_per_job)\n        total_num_examples = sum(examples_per_job)\n        total_num_bytes = sum(bytes_per_job)\n        features = features_per_job[0]\n\n        split_generator.split_info.num_examples = total_num_examples\n        split_generator.split_info.num_bytes = total_num_bytes\n\n        # should rename everything at the end\n        logger.debug(f\"Renaming {total_shards} shards.\")\n        if total_shards > 1:\n            # use the -SSSSS-of-NNNNN pattern\n\n            def _rename_shard(shard_and_job: tuple[int]):\n                shard_id, job_id = shard_and_job\n                global_shard_id = sum(shards_per_job[:job_id]) + shard_id\n                self._rename(\n                    fpath.replace(\"SSSSS\", f\"{shard_id:05d}\").replace(\"JJJJJ\", f\"{job_id:05d}\"),\n                    fpath.replace(\"JJJJJ-SSSSS\", f\"{global_shard_id:05d}\").replace(\"NNNNN\", f\"{total_shards:05d}\"),\n                )\n\n   
         shards_and_jobs = [\n                (shard_id, job_id)\n                for job_id, num_shards in enumerate(shards_per_job)\n                for shard_id in range(num_shards)\n            ]\n            thread_map(_rename_shard, shards_and_jobs, disable=True, max_workers=64)\n\n            split_generator.split_info.shard_lengths = [\n                shard_length for shard_lengths in shard_lengths_per_job for shard_length in shard_lengths\n            ]\n        else:\n            # don't use any pattern\n            shard_id, job_id = 0, 0\n            self._rename(\n                fpath.replace(\"SSSSS\", f\"{shard_id:05d}\").replace(\"JJJJJ\", f\"{job_id:05d}\"),\n                fpath.replace(SUFFIX, \"\"),\n            )\n\n        if total_original_shards > 1 and config.SAVE_ORIGINAL_SHARD_LENGTHS:\n            split_generator.split_info.original_shard_lengths = [\n                original_shard_length\n                for original_shard_lengths in original_shard_lengths_per_job\n                for original_shard_length in original_shard_lengths\n            ]\n\n        if self.info.features is None:\n            self.info.features = features\n\n    def _prepare_split_single(\n        self,\n        gen_kwargs: dict,\n        fpath: str,\n        file_format: str,\n        max_shard_size: int,\n        split_info: SplitInfo,\n        job_id: int,\n    ) -> Iterator[tuple[int, bool, tuple[int, int, Features, int, int, int]]]:\n        generator = self._generate_examples(**gen_kwargs)\n        writer_class = ParquetWriter if file_format == \"parquet\" else ArrowWriter\n        embed_local_files = file_format == \"parquet\"\n        shard_lengths = []\n        original_shard_lengths = []\n        total_num_examples, total_num_bytes = 0, 0\n\n        shard_id = 0\n        original_shard_id = 0\n        num_examples_progress_update = 0\n        try:\n            writer = writer_class(\n                features=self.info.features,\n                
path=fpath.replace(\"SSSSS\", f\"{shard_id:05d}\").replace(\"JJJJJ\", f\"{job_id:05d}\"),\n                writer_batch_size=self._writer_batch_size,\n                storage_options=self._fs.storage_options,\n                embed_local_files=embed_local_files,\n            )\n            try:\n                _time = time.time()\n                for key, record in generator:\n                    if isinstance(key, Key):  # old custom builders may not use Key\n                        original_shard_id = key.original_shard_id\n                    if max_shard_size is not None and writer._num_bytes > max_shard_size:\n                        num_examples, num_bytes = writer.finalize()\n                        writer.close()\n                        shard_lengths.append(num_examples)\n                        total_num_examples += num_examples\n                        total_num_bytes += num_bytes\n                        shard_id += 1\n                        writer = writer_class(\n                            features=writer._features,\n                            path=fpath.replace(\"SSSSS\", f\"{shard_id:05d}\").replace(\"JJJJJ\", f\"{job_id:05d}\"),\n                            writer_batch_size=self._writer_batch_size,\n                            storage_options=self._fs.storage_options,\n                            embed_local_files=embed_local_files,\n                        )\n                    example = self.info.features.encode_example(record) if self.info.features is not None else record\n                    writer.write(example)\n                    if len(original_shard_lengths) <= original_shard_id:\n                        original_shard_lengths.extend([0] * (1 + original_shard_id - len(original_shard_lengths)))\n                    original_shard_lengths[original_shard_id] += 1\n                    num_examples_progress_update += 1\n                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:\n                        _time = 
time.time()\n                        yield job_id, False, num_examples_progress_update\n                        num_examples_progress_update = 0\n            finally:\n                yield job_id, False, num_examples_progress_update\n                num_shards = shard_id + 1\n                num_original_shards = original_shard_id + 1\n                num_examples, num_bytes = writer.finalize()\n                writer.close()\n                shard_lengths.append(num_examples)\n                total_num_examples += num_examples\n                total_num_bytes += num_bytes\n        except Exception as e:\n            # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_examples before the first example was yielded\n            if isinstance(e, SchemaInferenceError) and e.__context__ is not None:\n                e = e.__context__\n            raise DatasetGenerationError(\"An error occurred while generating the dataset\") from e\n\n        yield (\n            job_id,\n            True,\n            (\n                total_num_examples,\n                total_num_bytes,\n                writer._features,\n                num_shards,\n                shard_lengths,\n                num_original_shards,\n                original_shard_lengths,\n            ),\n        )\n\n    def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):\n        super()._download_and_prepare(\n            dl_manager,\n            verification_mode,\n            **prepare_splits_kwargs,\n        )\n\n    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:\n        return ExamplesIterable(\n            self._generate_examples,\n            split_generator.gen_kwargs,\n            generate_more_kwargs_fn=getattr(self, \"_generate_more_gen_kwargs\", None),\n        )\n\n\nclass ArrowBasedBuilder(DatasetBuilder):\n    \"\"\"Base class for datasets with 
data generation based on Arrow loading functions (CSV/JSON/Parquet).\"\"\"\n\n    def _generate_shards(self, **kwargs) -> Iterator[Union[str, dict[str, Any]]]:\n        \"\"\"Default function generating shards paths for each `SplitGenerator`.\n\n        This function is useful to list the original shards from where the data\n        comes from and is either converted to Arrow or streamed to an IterableDataset.\n\n        This is optional and only used for certain utilities, but not in Dataset\n        nor IterableDataset. E.g. it's used to map original shard files to Parquet\n        files in the Dataset Viewer after conversion.\n\n        Args:\n            **kwargs (additional keyword arguments):\n                Arguments forwarded from the SplitGenerator.gen_kwargs\n\n        Yields:\n            shard: generally a string representing the shard path, or a dict\n                representing the shard in case of shards spanning intra or inter-files.\n        \"\"\"\n        raise NotImplementedError()\n\n    @abc.abstractmethod\n    def _generate_tables(self, **kwargs) -> Iterator[tuple[Key, pa.Table]]:\n        \"\"\"Default function generating examples for each `SplitGenerator`.\n\n        This function preprocess the examples from the raw data to the preprocessed\n        dataset files.\n        This function is called once for each `SplitGenerator` defined in\n        `_split_generators`. 
The examples yielded here will be written on\n        disk.\n\n        Args:\n            **kwargs (additional keyword arguments):\n                Arguments forwarded from the SplitGenerator.gen_kwargs\n\n        Yields:\n            key: tuple[int, int] original_shard_id and table_idx within that shard\n            example: `pyarrow.Table`, a feature table\n                ready to be encoded and written to disk.\n        \"\"\"\n        raise NotImplementedError()\n\n    def _prepare_split(\n        self,\n        split_generator: SplitGenerator,\n        file_format: str = \"arrow\",\n        num_proc: Optional[int] = None,\n        max_shard_size: Optional[Union[str, int]] = None,\n    ):\n        max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)\n\n        try:\n            split_info = self.info.splits[split_generator.name]\n        except Exception:\n            split_info = split_generator.split_info\n\n        SUFFIX = \"-JJJJJ-SSSSS-of-NNNNN\"\n        fname = f\"{self.dataset_name}-{split_generator.name}{SUFFIX}.{file_format}\"\n        fpath = posixpath.join(self._output_dir, fname)\n\n        if num_proc and num_proc > 1:\n            num_original_shards = _number_of_shards_in_gen_kwargs(split_generator.gen_kwargs)\n            if num_original_shards <= 1:\n                logger.warning(\n                    f\"Setting num_proc from {num_proc} back to 1 for the {split_info.name} split to disable multiprocessing as it only contains one shard.\"\n                )\n                num_proc = 1\n            elif num_original_shards < num_proc:\n                logger.warning(\n                    f\"Setting num_proc from {num_proc} to {num_original_shards} for the {split_info.name} split as it only contains {num_original_shards} shards.\"\n                )\n                num_proc = num_original_shards\n\n        pbar = hf_tqdm(\n            unit=\" examples\",\n            total=split_info.num_examples,\n            
desc=f\"Generating {split_info.name} split\",\n        )\n\n        _prepare_split_args = {\n            \"fpath\": fpath,\n            \"file_format\": file_format,\n            \"max_shard_size\": max_shard_size,\n        }\n\n        if num_proc is None or num_proc == 1:\n            result = None\n            gen_kwargs = split_generator.gen_kwargs\n            job_id = 0\n            with pbar:\n                for job_id, done, content in self._prepare_split_single(\n                    gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args\n                ):\n                    if done:\n                        result = content\n                    else:\n                        pbar.update(content)\n            # wrapping everything into lists for consistency with the multiprocessed code path\n            assert result is not None, \"Failed to retrieve results from prepare_split\"\n            (\n                examples_per_job,\n                bytes_per_job,\n                features_per_job,\n                shards_per_job,\n                shard_lengths_per_job,\n                original_shards_per_job,\n                original_shard_lengths_per_job,\n            ) = ([item] for item in result)\n        else:\n            kwargs_per_job = [\n                {\"gen_kwargs\": gen_kwargs, \"job_id\": job_id, **_prepare_split_args}\n                for job_id, gen_kwargs in enumerate(\n                    _split_gen_kwargs(split_generator.gen_kwargs, max_num_jobs=num_proc)\n                )\n            ]\n            num_jobs = len(kwargs_per_job)\n\n            examples_per_job = [None] * num_jobs\n            bytes_per_job = [None] * num_jobs\n            features_per_job = [None] * num_jobs\n            shards_per_job = [None] * num_jobs\n            shard_lengths_per_job = [None] * num_jobs\n            original_shards_per_job = [None] * num_jobs\n            original_shard_lengths_per_job = [None] * num_jobs\n\n            with 
Pool(num_proc) as pool:\n                with pbar:\n                    for job_id, done, content in iflatmap_unordered(\n                        pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job\n                    ):\n                        if done:\n                            # the content is the result of the job\n                            (\n                                examples_per_job[job_id],\n                                bytes_per_job[job_id],\n                                features_per_job[job_id],\n                                shards_per_job[job_id],\n                                shard_lengths_per_job[job_id],\n                                original_shards_per_job[job_id],\n                                original_shard_lengths_per_job[job_id],\n                            ) = content\n                        else:\n                            # the content is the number of examples progress update\n                            pbar.update(content)\n\n            assert None not in examples_per_job, (\n                f\"Failed to retrieve results from prepare_split: result list {examples_per_job} still contains None - at least one worker failed to return its results\"\n            )\n\n        total_shards = sum(shards_per_job)\n        total_original_shards = sum(original_shards_per_job)\n        total_num_examples = sum(examples_per_job)\n        total_num_bytes = sum(bytes_per_job)\n        features = features_per_job[0]\n\n        split_generator.split_info.num_examples = total_num_examples\n        split_generator.split_info.num_bytes = total_num_bytes\n\n        # should rename everything at the end\n        logger.debug(f\"Renaming {total_shards} shards.\")\n        if total_shards > 1:\n            # use the -SSSSS-of-NNNNN pattern\n\n            def _rename_shard(shard_id_and_job: tuple[int]):\n                shard_id, job_id = shard_id_and_job\n                global_shard_id = sum(shards_per_job[:job_id]) 
+ shard_id\n                self._rename(\n                    fpath.replace(\"SSSSS\", f\"{shard_id:05d}\").replace(\"JJJJJ\", f\"{job_id:05d}\"),\n                    fpath.replace(\"JJJJJ-SSSSS\", f\"{global_shard_id:05d}\").replace(\"NNNNN\", f\"{total_shards:05d}\"),\n                )\n\n            shard_ids_and_jobs = [\n                (shard_id, job_id)\n                for job_id, num_shards in enumerate(shards_per_job)\n                for shard_id in range(num_shards)\n            ]\n            thread_map(_rename_shard, shard_ids_and_jobs, disable=True, max_workers=64)\n\n            split_generator.split_info.shard_lengths = [\n                shard_length for shard_lengths in shard_lengths_per_job for shard_length in shard_lengths\n            ]\n        else:\n            # don't use any pattern\n            shard_id, job_id = 0, 0\n            self._rename(\n                fpath.replace(\"SSSSS\", f\"{shard_id:05d}\").replace(\"JJJJJ\", f\"{job_id:05d}\"),\n                fpath.replace(SUFFIX, \"\"),\n            )\n\n        if total_original_shards > 1 and config.SAVE_ORIGINAL_SHARD_LENGTHS:\n            split_generator.split_info.original_shard_lengths = [\n                original_shard_length\n                for original_shard_lengths in original_shard_lengths_per_job\n                for original_shard_length in original_shard_lengths\n            ]\n\n        if self.info.features is None:\n            self.info.features = features\n\n    def _prepare_split_single(\n        self, gen_kwargs: dict, fpath: str, file_format: str, max_shard_size: int, job_id: int\n    ) -> Iterator[tuple[int, bool, tuple[int, int, Features, int, int, int]]]:\n        gen_kwargs = {k: tracked_list(v) if isinstance(v, list) else v for k, v in gen_kwargs.items()}\n        generator = self._generate_tables(**gen_kwargs)\n        writer_class = ParquetWriter if file_format == \"parquet\" else ArrowWriter\n        embed_local_files = file_format == \"parquet\"\n   
     shard_lengths = []\n        original_shard_lengths = []\n        total_num_examples, total_num_bytes = 0, 0\n\n        shard_id = 0\n        original_shard_id = 0\n        num_examples_progress_update = 0\n        try:\n            writer = writer_class(\n                features=self.info.features,\n                path=fpath.replace(\"SSSSS\", f\"{shard_id:05d}\").replace(\"JJJJJ\", f\"{job_id:05d}\"),\n                writer_batch_size=self._writer_batch_size,\n                storage_options=self._fs.storage_options,\n                embed_local_files=embed_local_files,\n            )\n            try:\n                _time = time.time()\n                for key, table in generator:\n                    if isinstance(key, Key):  # old custom builders may not use Key\n                        original_shard_id = key.original_shard_id\n                    if max_shard_size is not None and writer._num_bytes > max_shard_size:\n                        num_examples, num_bytes = writer.finalize()\n                        writer.close()\n                        shard_lengths.append(num_examples)\n                        total_num_examples += num_examples\n                        total_num_bytes += num_bytes\n                        shard_id += 1\n                        writer = writer_class(\n                            features=writer._features,\n                            path=fpath.replace(\"SSSSS\", f\"{shard_id:05d}\").replace(\"JJJJJ\", f\"{job_id:05d}\"),\n                            writer_batch_size=self._writer_batch_size,\n                            storage_options=self._fs.storage_options,\n                            embed_local_files=embed_local_files,\n                        )\n                    try:\n                        writer.write_table(table)\n                    except CastError as cast_error:\n                        raise DatasetGenerationCastError.from_cast_error(\n                            cast_error=cast_error,\n                
            builder_name=self.info.builder_name,\n                            gen_kwargs=gen_kwargs,\n                            token=self.token,\n                        )\n                    if len(original_shard_lengths) == original_shard_id:\n                        original_shard_lengths.append(len(table))\n                    else:\n                        original_shard_lengths[original_shard_id] += len(table)\n                    num_examples_progress_update += len(table)\n                    if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:\n                        _time = time.time()\n                        yield job_id, False, num_examples_progress_update\n                        num_examples_progress_update = 0\n            finally:\n                yield job_id, False, num_examples_progress_update\n                num_shards = shard_id + 1\n                num_original_shards = original_shard_id + 1\n                num_examples, num_bytes = writer.finalize()\n                writer.close()\n                shard_lengths.append(num_examples)\n                total_num_examples += num_examples\n                total_num_bytes += num_bytes\n        except Exception as e:\n            # Ignore the writer's error for no examples written to the file if this error was caused by the error in _generate_examples before the first example was yielded\n            if isinstance(e, SchemaInferenceError) and e.__context__ is not None:\n                e = e.__context__\n            if isinstance(e, DatasetGenerationError):\n                raise\n            raise DatasetGenerationError(\"An error occurred while generating the dataset\") from e\n\n        yield (\n            job_id,\n            True,\n            (\n                total_num_examples,\n                total_num_bytes,\n                writer._features,\n                num_shards,\n                shard_lengths,\n                num_original_shards,\n                
original_shard_lengths,\n            ),\n        )\n\n    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:\n        return ArrowExamplesIterable(\n            self._generate_tables,\n            kwargs=split_generator.gen_kwargs,\n            generate_more_kwargs_fn=getattr(self, \"_generate_more_gen_kwargs\", None),\n        )\n\n\nclass _CountableBuilderMixin(DatasetBuilder):\n    @abc.abstractmethod\n    def _generate_num_examples(self, **kwargs) -> Iterator[int]:\n        raise NotImplementedError()\n\n    def count_examples(self, dl_manager: DownloadManager) -> dict[str, int]:\n        split_generators_kwargs = self._make_split_generators_kwargs({})\n        split_generators: list[SplitGenerator] = self._split_generators(dl_manager, **split_generators_kwargs)\n        return {split_generator.name: self._count_examples(split_generator) for split_generator in split_generators}\n\n    def _count_examples(self, split_generator: SplitGenerator) -> int:\n        max_workers = min(32, os.cpu_count() + 4)\n        return sum(\n            thread_map(\n                self._count_examples_single,\n                _split_gen_kwargs(split_generator.gen_kwargs, max_workers),\n                delay=5,\n                desc=f\"Counting rows for split={split_generator.name}\",\n            )\n        )\n\n    def _count_examples_single(self, gen_kwargs: dict[str, Any]) -> int:\n        return sum(self._generate_num_examples(**gen_kwargs))\n"
  },
  {
    "path": "src/datasets/combine.py",
    "content": "from typing import Optional, TypeVar\n\nfrom .arrow_dataset import Dataset, _concatenate_map_style_datasets, _interleave_map_style_datasets\nfrom .dataset_dict import DatasetDict, IterableDatasetDict\nfrom .info import DatasetInfo\nfrom .iterable_dataset import IterableDataset, _concatenate_iterable_datasets, _interleave_iterable_datasets\nfrom .splits import NamedSplit\nfrom .utils import logging\nfrom .utils.py_utils import Literal\n\n\nlogger = logging.get_logger(__name__)\n\n\nDatasetType = TypeVar(\"DatasetType\", Dataset, IterableDataset)\n\n\ndef interleave_datasets(\n    datasets: list[DatasetType],\n    probabilities: Optional[list[float]] = None,\n    seed: Optional[int] = None,\n    info: Optional[DatasetInfo] = None,\n    split: Optional[NamedSplit] = None,\n    stopping_strategy: Literal[\n        \"first_exhausted\", \"all_exhausted\", \"all_exhausted_without_replacement\"\n    ] = \"first_exhausted\",\n) -> DatasetType:\n    \"\"\"\n    Interleave several datasets (sources) into a single dataset.\n    The new dataset is constructed by alternating between the sources to get the examples.\n\n    You can use this function on a list of [`Dataset`] objects, or on a list of [`IterableDataset`] objects.\n\n        - If `probabilities` is `None` (default) the new dataset is constructed by cycling between each source to get the examples.\n        - If `probabilities` is not `None`, the new dataset is constructed by getting examples from a random source at a time according to the provided probabilities.\n\n    The resulting dataset ends when one of the source datasets runs out of examples except when `oversampling` is `True`,\n    in which case, the resulting dataset ends when all datasets have ran out of examples at least one time.\n\n    Note for iterable datasets:\n\n    * The resulting dataset's `num_shards` is the minimum of each dataset's `num_shards` to ensure good parallelism.\n      If some of your datasets have a very low number of 
shards, you may use [`IterableDataset.reshard`].\n    * In a distributed setup or in PyTorch DataLoader workers, the stopping strategy is applied per process.\n      Therefore the \"first_exhausted\" strategy on a sharded iterable dataset can generate fewer samples in total (up to 1 missing sample per subdataset per worker).\n\n    Args:\n        datasets (`List[Dataset]` or `List[IterableDataset]`):\n            List of datasets to interleave.\n        probabilities (`List[float]`, *optional*, defaults to `None`):\n            If specified, the new dataset is constructed by sampling\n            examples from one source at a time according to these probabilities.\n        seed (`int`, *optional*, defaults to `None`):\n            The random seed used to choose a source for each example.\n        info ([`DatasetInfo`], *optional*):\n            Dataset information, like description, citation, etc.\n            <Added version=\"2.4.0\"/>\n        split ([`NamedSplit`], *optional*):\n            Name of the dataset split.\n            <Added version=\"2.4.0\"/>\n        stopping_strategy (`str`, defaults to `first_exhausted`):\n            Three strategies are currently available: `first_exhausted`, `all_exhausted` and `all_exhausted_without_replacement`.\n            The default, `first_exhausted`, is an undersampling strategy, i.e. the dataset construction is stopped as soon as one dataset has run out of samples.\n            If the strategy is `all_exhausted`, we use an oversampling strategy, i.e. the dataset construction is stopped as soon as every sample of every dataset has been added at least once.\n            When the strategy is `all_exhausted_without_replacement`, we make sure that each sample in each dataset is sampled only once.\n            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:\n            - with no probabilities, the resulting dataset will have `max_length_datasets*nb_dataset` samples.\n            - with given probabilities, the resulting dataset will have more samples if some datasets have a really low probability of being visited.\n\n    Returns:\n        [`Dataset`] or [`IterableDataset`]: Return type depends on the input `datasets`\n        parameter. `Dataset` if the input is a list of `Dataset`, `IterableDataset` if the input is a list of\n        `IterableDataset`.\n\n    Example:\n\n        For regular datasets (map-style):\n\n        ```python\n        >>> from datasets import Dataset, interleave_datasets\n        >>> d1 = Dataset.from_dict({\"a\": [0, 1, 2]})\n        >>> d2 = Dataset.from_dict({\"a\": [10, 11, 12]})\n        >>> d3 = Dataset.from_dict({\"a\": [20, 21, 22]})\n        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy=\"all_exhausted\")\n        >>> dataset[\"a\"]\n        [10, 0, 11, 1, 2, 20, 12, 10, 0, 1, 2, 21, 0, 11, 1, 2, 0, 1, 12, 2, 10, 0, 22]\n        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)\n        >>> dataset[\"a\"]\n        [10, 0, 11, 1, 2]\n        >>> dataset = interleave_datasets([d1, d2, d3])\n        >>> dataset[\"a\"]\n        [0, 10, 20, 1, 11, 21, 2, 12, 22]\n        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy=\"all_exhausted\")\n        >>> dataset[\"a\"]\n        [0, 10, 20, 1, 11, 21, 2, 12, 22]\n        >>> d1 = Dataset.from_dict({\"a\": [0, 1, 2]})\n        >>> d2 = Dataset.from_dict({\"a\": [10, 11, 12, 13]})\n        >>> d3 = Dataset.from_dict({\"a\": [20, 21, 22, 23, 24]})\n        >>> dataset = interleave_datasets([d1, d2, d3])\n        >>> dataset[\"a\"]\n        [0, 10, 20, 1, 11, 21, 2, 12, 22]\n        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy=\"all_exhausted\")\n        >>> dataset[\"a\"]\n        [0, 10, 20, 1, 11, 21, 2, 12, 22, 0, 13, 23, 1, 10, 24]\n        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)\n        >>> dataset[\"a\"]\n        [10, 0, 11, 1, 2]\n        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy=\"all_exhausted\")\n        >>> dataset[\"a\"]\n        [10, 0, 11, 1, 2, 20, 12, 13, ..., 0, 1, 2, 0, 24]\n        ```\n\n        For datasets in streaming mode (iterable):\n\n        ```python\n        >>> from datasets import interleave_datasets, load_dataset\n        >>> d1 = load_dataset('allenai/c4', 'es', split='train', streaming=True)\n        >>> d2 = load_dataset('allenai/c4', 'fr', split='train', streaming=True)\n        >>> dataset = interleave_datasets([d1, d2])\n        >>> iterator = iter(dataset)\n        >>> next(iterator)\n        {'text': 'Comprar Zapatillas para niña en chancla con goma por...'}\n        >>> next(iterator)\n        {'text': 'Le sacre de philippe ier, 23 mai 1059 - Compte Rendu...'}\n        ```\n    \"\"\"\n    from .arrow_dataset import Dataset\n    from .iterable_dataset import IterableDataset\n\n    if not datasets:\n        raise ValueError(\"Unable to interleave an empty list of datasets.\")\n    for i, dataset in enumerate(datasets):\n        if not isinstance(dataset, (Dataset, IterableDataset)):\n            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):\n                if not dataset:\n                    raise ValueError(\n                        f\"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} \"\n                        \"is an empty dataset dictionary.\"\n                    )\n                raise ValueError(\n                    f\"Dataset at position {i} has at least one split: {list(dataset)}\\n\"\n                    f\"Please pick one to interleave with the other datasets, for example: dataset['{next(iter(dataset))}']\"\n                )\n            raise ValueError(\n                f\"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}.\"\n            )\n        if i == 0:\n            dataset_type, other_type = (\n                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)\n            )\n        elif not isinstance(dataset, dataset_type):\n            raise ValueError(\n                f\"Unable to interleave a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects.\"\n            )\n    if stopping_strategy not in [\"first_exhausted\", \"all_exhausted\", \"all_exhausted_without_replacement\"]:\n        raise ValueError(f\"{stopping_strategy} is not supported. Please enter a valid stopping_strategy.\")\n    if dataset_type is Dataset:\n        return _interleave_map_style_datasets(\n            datasets, probabilities, seed, info=info, split=split, stopping_strategy=stopping_strategy\n        )\n    else:\n        return _interleave_iterable_datasets(\n            datasets,\n            probabilities,\n            seed,\n            info=info,\n            split=split,\n            stopping_strategy=stopping_strategy,\n        )\n\n\ndef concatenate_datasets(\n    dsets: list[DatasetType],\n    info: Optional[DatasetInfo] = None,\n    split: Optional[NamedSplit] = None,\n    axis: int = 0,\n) -> DatasetType:\n    \"\"\"\n    Concatenate several datasets (sources) into a single dataset.\n\n    Use axis=0 to concatenate vertically (default), or axis=1 to concatenate horizontally.\n\n    Note for iterable datasets:\n\n    * if axis=0, the resulting dataset's `num_shards` is the sum of each dataset's `num_shards`.\n    * if axis=1, the resulting dataset has one (1) shard to not misalign data.\n\n    Args:\n        dsets (`List[datasets.Dataset]` or `List[datasets.IterableDataset]`):\n            List of Datasets to concatenate.\n        info (`DatasetInfo`, *optional*):\n            Dataset information, like description, citation, etc.\n        split (`NamedSplit`, *optional*):\n            Name of the dataset split.\n        axis (`{0, 1}`, defaults to `0`):\n            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns\n            (horizontally).\n\n            <Added version=\"1.6.0\"/>\n\n    Example:\n\n    ```py\n    >>> ds3 = concatenate_datasets([ds1, ds2])\n    ```\n    \"\"\"\n\n    if not dsets:\n        raise ValueError(\"Unable to concatenate an empty list of datasets.\")\n    for i, dataset in enumerate(dsets):\n        if not isinstance(dataset, (Dataset, IterableDataset)):\n            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):\n                if not dataset:\n                    raise ValueError(\n                        f\"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} \"\n                        \"is an empty dataset dictionary.\"\n                    )\n                raise ValueError(\n                    f\"Dataset at position {i} has at least one split: {list(dataset)}\\n\"\n                    f\"Please pick one to interleave with the other datasets, for example: dataset['{next(iter(dataset))}']\"\n                )\n            raise ValueError(\n                f\"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}.\"\n            )\n        if i == 0:\n            dataset_type, other_type = (\n                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)\n            )\n        elif not isinstance(dataset, dataset_type):\n            raise ValueError(\n                f\"Unable to interleave a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects.\"\n            )\n    if dataset_type is Dataset:\n        return _concatenate_map_style_datasets(dsets, info=info, split=split, axis=axis)\n    else:\n        return _concatenate_iterable_datasets(dsets, info=info, split=split, axis=axis)\n"
  },
  {
    "path": "src/datasets/commands/__init__.py",
    "content": "from abc import ABC, abstractmethod\nfrom argparse import ArgumentParser\n\n\nclass BaseDatasetsCLICommand(ABC):\n    @staticmethod\n    @abstractmethod\n    def register_subcommand(parser: ArgumentParser):\n        raise NotImplementedError()\n\n    @abstractmethod\n    def run(self):\n        raise NotImplementedError()\n"
  },
  {
    "path": "src/datasets/commands/datasets_cli.py",
    "content": "#!/usr/bin/env python\nfrom argparse import ArgumentParser\n\nfrom datasets.commands.delete_from_hub import DeleteFromHubCommand\nfrom datasets.commands.env import EnvironmentCommand\nfrom datasets.commands.test import TestCommand\nfrom datasets.utils.logging import set_verbosity_info\n\n\ndef parse_unknown_args(unknown_args):\n    return {key.lstrip(\"-\"): value for key, value in zip(unknown_args[::2], unknown_args[1::2])}\n\n\ndef main():\n    parser = ArgumentParser(\n        \"HuggingFace Datasets CLI tool\", usage=\"datasets-cli <command> [<args>]\", allow_abbrev=False\n    )\n    commands_parser = parser.add_subparsers(help=\"datasets-cli command helpers\")\n    set_verbosity_info()\n\n    # Register commands\n    EnvironmentCommand.register_subcommand(commands_parser)\n    TestCommand.register_subcommand(commands_parser)\n    DeleteFromHubCommand.register_subcommand(commands_parser)\n\n    # Parse args\n    args, unknown_args = parser.parse_known_args()\n    if not hasattr(args, \"func\"):\n        parser.print_help()\n        exit(1)\n    kwargs = parse_unknown_args(unknown_args)\n\n    # Run\n    service = args.func(args, **kwargs)\n    service.run()\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "src/datasets/commands/delete_from_hub.py",
    "content": "from argparse import ArgumentParser\nfrom typing import Optional\n\nfrom datasets.commands import BaseDatasetsCLICommand\nfrom datasets.hub import delete_from_hub\n\n\ndef _command_factory(args):\n    return DeleteFromHubCommand(\n        args.dataset_id,\n        args.config_name,\n        args.token,\n        args.revision,\n    )\n\n\nclass DeleteFromHubCommand(BaseDatasetsCLICommand):\n    @staticmethod\n    def register_subcommand(parser):\n        parser: ArgumentParser = parser.add_parser(\"delete_from_hub\", help=\"Delete dataset config from the Hub\")\n        parser.add_argument(\n            \"dataset_id\", help=\"source dataset ID, e.g. USERNAME/DATASET_NAME or ORGANIZATION/DATASET_NAME\"\n        )\n        parser.add_argument(\"config_name\", help=\"config name to delete\")\n        parser.add_argument(\"--token\", help=\"access token to the Hugging Face Hub\")\n        parser.add_argument(\"--revision\", help=\"source revision\")\n        parser.set_defaults(func=_command_factory)\n\n    def __init__(\n        self,\n        dataset_id: str,\n        config_name: str,\n        token: Optional[str],\n        revision: Optional[str],\n    ):\n        self._dataset_id = dataset_id\n        self._config_name = config_name\n        self._token = token\n        self._revision = revision\n\n    def run(self) -> None:\n        _ = delete_from_hub(self._dataset_id, self._config_name, revision=self._revision, token=self._token)\n"
  },
  {
    "path": "src/datasets/commands/env.py",
    "content": "import platform\nfrom argparse import ArgumentParser\n\nimport fsspec\nimport huggingface_hub\nimport pandas\nimport pyarrow\n\nfrom datasets import __version__ as version\nfrom datasets.commands import BaseDatasetsCLICommand\n\n\ndef info_command_factory(_):\n    return EnvironmentCommand()\n\n\nclass EnvironmentCommand(BaseDatasetsCLICommand):\n    @staticmethod\n    def register_subcommand(parser: ArgumentParser):\n        download_parser = parser.add_parser(\"env\", help=\"Print relevant system environment info.\")\n        download_parser.set_defaults(func=info_command_factory)\n\n    def run(self):\n        info = {\n            \"`datasets` version\": version,\n            \"Platform\": platform.platform(),\n            \"Python version\": platform.python_version(),\n            \"`huggingface_hub` version\": huggingface_hub.__version__,\n            \"PyArrow version\": pyarrow.__version__,\n            \"Pandas version\": pandas.__version__,\n            \"`fsspec` version\": fsspec.__version__,\n        }\n\n        print(\"\\nCopy-and-paste the text below in your GitHub issue.\\n\")\n        print(self.format_dict(info))\n\n        return info\n\n    @staticmethod\n    def format_dict(d):\n        return \"\\n\".join([f\"- {prop}: {val}\" for prop, val in d.items()]) + \"\\n\"\n"
  },
  {
    "path": "src/datasets/commands/test.py",
    "content": "import logging\nimport os\nfrom argparse import ArgumentParser\nfrom collections.abc import Generator\nfrom shutil import rmtree\n\nimport datasets.config\nfrom datasets.builder import DatasetBuilder\nfrom datasets.commands import BaseDatasetsCLICommand\nfrom datasets.download.download_manager import DownloadMode\nfrom datasets.info import DatasetInfosDict\nfrom datasets.load import dataset_module_factory, get_dataset_builder_class\nfrom datasets.utils.info_utils import VerificationMode\nfrom datasets.utils.logging import ERROR, get_logger\n\n\nlogger = get_logger(__name__)\n\n\ndef _test_command_factory(args):\n    return TestCommand(\n        args.dataset,\n        args.name,\n        args.cache_dir,\n        args.data_dir,\n        args.all_configs,\n        args.save_info or args.save_infos,\n        args.ignore_verifications,\n        args.force_redownload,\n        args.clear_cache,\n        args.num_proc,\n    )\n\n\nclass TestCommand(BaseDatasetsCLICommand):\n    __test__ = False  # to tell pytest it's not a test class\n\n    @staticmethod\n    def register_subcommand(parser: ArgumentParser):\n        test_parser = parser.add_parser(\"test\", help=\"Test dataset loading.\")\n        test_parser.add_argument(\"--name\", type=str, default=None, help=\"Dataset processing name\")\n        test_parser.add_argument(\n            \"--cache_dir\",\n            type=str,\n            default=None,\n            help=\"Cache directory where the datasets are stored.\",\n        )\n        test_parser.add_argument(\n            \"--data_dir\",\n            type=str,\n            default=None,\n            help=\"Can be used to specify a manual directory to get the files from.\",\n        )\n        test_parser.add_argument(\"--all_configs\", action=\"store_true\", help=\"Test all dataset configurations\")\n        test_parser.add_argument(\n            \"--save_info\", action=\"store_true\", help=\"Save the dataset infos in the dataset card 
(README.md)\"\n        )\n        test_parser.add_argument(\n            \"--ignore_verifications\",\n            action=\"store_true\",\n            help=\"Run the test without checksums and splits checks.\",\n        )\n        test_parser.add_argument(\"--force_redownload\", action=\"store_true\", help=\"Force dataset redownload\")\n        test_parser.add_argument(\n            \"--clear_cache\",\n            action=\"store_true\",\n            help=\"Remove downloaded files and cached datasets after each config test\",\n        )\n        test_parser.add_argument(\"--num_proc\", type=int, default=None, help=\"Number of processes\")\n        # aliases\n        test_parser.add_argument(\"--save_infos\", action=\"store_true\", help=\"alias to save_info\")\n        test_parser.add_argument(\"dataset\", type=str, help=\"Name of the dataset to download\")\n        test_parser.set_defaults(func=_test_command_factory)\n\n    def __init__(\n        self,\n        dataset: str,\n        name: str,\n        cache_dir: str,\n        data_dir: str,\n        all_configs: bool,\n        save_infos: bool,\n        ignore_verifications: bool,\n        force_redownload: bool,\n        clear_cache: bool,\n        num_proc: int,\n    ):\n        self._dataset = dataset\n        self._name = name\n        self._cache_dir = cache_dir\n        self._data_dir = data_dir\n        self._all_configs = all_configs\n        self._save_infos = save_infos\n        self._ignore_verifications = ignore_verifications\n        self._force_redownload = force_redownload\n        self._clear_cache = clear_cache\n        self._num_proc = num_proc\n        if clear_cache and not cache_dir:\n            print(\n                \"When --clear_cache is used, specifying a cache directory is mandatory.\\n\"\n                \"The 'download' folder of the cache directory and the dataset builder cache will be deleted after each configuration test.\\n\"\n                \"Please provide a --cache_dir that 
will be used to test the dataset.\"\n            )\n            exit(1)\n        if save_infos:\n            self._ignore_verifications = True\n\n    def run(self):\n        logging.getLogger(\"filelock\").setLevel(ERROR)\n        if self._name is not None and self._all_configs:\n            print(\"Both parameters `config` and `all_configs` can't be used at once.\")\n            exit(1)\n        path, config_name = self._dataset, self._name\n        module = dataset_module_factory(path)\n        builder_cls = get_dataset_builder_class(module)\n        n_builders = len(builder_cls.BUILDER_CONFIGS) if self._all_configs and builder_cls.BUILDER_CONFIGS else 1\n\n        def get_builders() -> Generator[DatasetBuilder, None, None]:\n            if self._all_configs and builder_cls.BUILDER_CONFIGS:\n                for i, config in enumerate(builder_cls.BUILDER_CONFIGS):\n                    if \"config_name\" in module.builder_kwargs:\n                        yield builder_cls(\n                            cache_dir=self._cache_dir,\n                            data_dir=self._data_dir,\n                            **module.builder_kwargs,\n                        )\n                    else:\n                        yield builder_cls(\n                            config_name=config.name,\n                            cache_dir=self._cache_dir,\n                            data_dir=self._data_dir,\n                            **module.builder_kwargs,\n                        )\n            else:\n                if \"config_name\" in module.builder_kwargs:\n                    yield builder_cls(cache_dir=self._cache_dir, data_dir=self._data_dir, **module.builder_kwargs)\n                else:\n                    yield builder_cls(\n                        config_name=config_name,\n                        cache_dir=self._cache_dir,\n                        data_dir=self._data_dir,\n                        **module.builder_kwargs,\n                    )\n\n        for j, 
builder in enumerate(get_builders()):\n            print(f\"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})\")\n            builder._record_infos = os.path.exists(\n                os.path.join(builder.get_imported_module_dir(), datasets.config.DATASETDICT_INFOS_FILENAME)\n            )  # record checksums only if we need to update a (deprecated) dataset_infos.json\n            builder.download_and_prepare(\n                download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS\n                if not self._force_redownload\n                else DownloadMode.FORCE_REDOWNLOAD,\n                verification_mode=VerificationMode.NO_CHECKS\n                if self._ignore_verifications\n                else VerificationMode.ALL_CHECKS,\n                num_proc=self._num_proc,\n            )\n            builder.as_dataset()\n\n            # If save_infos=True, we create the dataset card (README.md)\n            # The dataset_infos are saved in the YAML part of the README.md\n            # This is to allow the user to upload them on HF afterwards.\n            if self._save_infos:\n                save_infos_dir = os.path.basename(path) if not os.path.isdir(path) else path\n                os.makedirs(save_infos_dir, exist_ok=True)\n                DatasetInfosDict(**{builder.config.name: builder.info}).write_to_directory(save_infos_dir)\n                print(f\"Dataset card saved at {os.path.join(save_infos_dir, datasets.config.REPOCARD_FILENAME)}\")\n\n            # If clear_cache=True, the download folder and the dataset builder cache directory are deleted\n            if self._clear_cache:\n                if os.path.isdir(builder._cache_dir):\n                    logger.warning(f\"Clearing cache at {builder._cache_dir}\")\n                    rmtree(builder._cache_dir)\n                download_dir = os.path.join(self._cache_dir, datasets.config.DOWNLOADED_DATASETS_DIR)\n                if os.path.isdir(download_dir):\n                    
logger.warning(f\"Clearing cache at {download_dir}\")\n                    rmtree(download_dir)\n\n        print(\"Test successful.\")\n"
  },
  {
    "path": "src/datasets/config.py",
    "content": "import importlib\nimport importlib.metadata\nimport logging\nimport os\nimport platform\nfrom pathlib import Path\nfrom typing import Optional\n\nfrom huggingface_hub import constants\nfrom packaging import version\n\n\nlogger = logging.getLogger(__name__.split(\".\", 1)[0])  # to avoid circular import from .utils.logging\n\n# Datasets\nS3_DATASETS_BUCKET_PREFIX = \"https://s3.amazonaws.com/datasets.huggingface.co/datasets/datasets\"\nCLOUDFRONT_DATASETS_DISTRIB_PREFIX = \"https://cdn-datasets.huggingface.co/datasets/datasets\"\nREPO_DATASETS_URL = \"https://raw.githubusercontent.com/huggingface/datasets/{revision}/datasets/{path}/{name}\"\n\n# Hub\nHF_ENDPOINT = os.environ.get(\"HF_ENDPOINT\", \"https://huggingface.co\")\nHUB_DATASETS_URL = HF_ENDPOINT + \"/datasets/{repo_id}/resolve/{revision}/{path}\"\nHUB_DATASETS_HFFS_URL = \"hf://datasets/{repo_id}@{revision}/{path}\"\nHUB_DEFAULT_VERSION = \"main\"\n\nPY_VERSION = version.parse(platform.python_version())\n\n# General environment variables accepted values for booleans\nENV_VARS_TRUE_VALUES = {\"1\", \"ON\", \"YES\", \"TRUE\"}\nENV_VARS_FALSE_VALUES = {\"0\", \"OFF\", \"NO\", \"FALSE\"}\nENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({\"AUTO\"})\nENV_VARS_FALSE_AND_AUTO_VALUES = ENV_VARS_FALSE_VALUES.union({\"AUTO\"})\n\n\n# Imports\nDILL_VERSION = version.parse(importlib.metadata.version(\"dill\"))\nFSSPEC_VERSION = version.parse(importlib.metadata.version(\"fsspec\"))\nPANDAS_VERSION = version.parse(importlib.metadata.version(\"pandas\"))\nPYARROW_VERSION = version.parse(importlib.metadata.version(\"pyarrow\"))\nHF_HUB_VERSION = version.parse(importlib.metadata.version(\"huggingface_hub\"))\n\nUSE_TF = os.environ.get(\"USE_TF\", \"AUTO\").upper()\nUSE_TORCH = os.environ.get(\"USE_TORCH\", \"AUTO\").upper()\nUSE_JAX = os.environ.get(\"USE_JAX\", \"AUTO\").upper()\n\nTORCH_VERSION = \"N/A\"\nTORCH_AVAILABLE = False\n\nif USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in 
ENV_VARS_TRUE_VALUES:\n    TORCH_AVAILABLE = importlib.util.find_spec(\"torch\") is not None\n    if TORCH_AVAILABLE:\n        try:\n            TORCH_VERSION = version.parse(importlib.metadata.version(\"torch\"))\n            logger.debug(f\"PyTorch version {TORCH_VERSION} available.\")\n        except importlib.metadata.PackageNotFoundError:\n            pass\nelse:\n    logger.info(\"Disabling PyTorch because USE_TF is set\")\n\nPOLARS_VERSION = \"N/A\"\nPOLARS_AVAILABLE = importlib.util.find_spec(\"polars\") is not None\n\nif POLARS_AVAILABLE:\n    try:\n        POLARS_VERSION = version.parse(importlib.metadata.version(\"polars\"))\n        logger.debug(f\"Polars version {POLARS_VERSION} available.\")\n    except importlib.metadata.PackageNotFoundError:\n        pass\n\n\nDUCKDB_VERSION = \"N/A\"\nDUCKDB_AVAILABLE = importlib.util.find_spec(\"duckdb\") is not None\n\nif DUCKDB_AVAILABLE:\n    try:\n        DUCKDB_VERSION = version.parse(importlib.metadata.version(\"duckdb\"))\n        logger.debug(f\"Duckdb version {DUCKDB_VERSION} available.\")\n    except importlib.metadata.PackageNotFoundError:\n        pass\n\nTF_VERSION = \"N/A\"\nTF_AVAILABLE = False\n\nif USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:\n    TF_AVAILABLE = importlib.util.find_spec(\"tensorflow\") is not None\n    if TF_AVAILABLE:\n        # For the metadata, we have to look for both tensorflow and tensorflow-cpu\n        for package in [\n            \"tensorflow\",\n            \"tensorflow-cpu\",\n            \"tensorflow-gpu\",\n            \"tf-nightly\",\n            \"tf-nightly-cpu\",\n            \"tf-nightly-gpu\",\n            \"intel-tensorflow\",\n            \"tensorflow-rocm\",\n            \"tensorflow-macos\",\n        ]:\n            try:\n                TF_VERSION = version.parse(importlib.metadata.version(package))\n            except importlib.metadata.PackageNotFoundError:\n                continue\n            else:\n               
 break\n        else:\n            TF_AVAILABLE = False\n    if TF_AVAILABLE:\n        if TF_VERSION.major < 2:\n            logger.info(f\"TensorFlow found but with version {TF_VERSION}. `datasets` requires version 2 minimum.\")\n            TF_AVAILABLE = False\n        else:\n            logger.info(f\"TensorFlow version {TF_VERSION} available.\")\nelse:\n    logger.info(\"Disabling Tensorflow because USE_TORCH is set\")\n\n\nJAX_VERSION = \"N/A\"\nJAX_AVAILABLE = False\n\nif USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:\n    JAX_AVAILABLE = importlib.util.find_spec(\"jax\") is not None and importlib.util.find_spec(\"jaxlib\") is not None\n    if JAX_AVAILABLE:\n        try:\n            JAX_VERSION = version.parse(importlib.metadata.version(\"jax\"))\n            logger.info(f\"JAX version {JAX_VERSION} available.\")\n        except importlib.metadata.PackageNotFoundError:\n            pass\nelse:\n    logger.info(\"Disabling JAX because USE_JAX is set to False\")\n\n\n# Optional tools for data loading\nSQLALCHEMY_AVAILABLE = importlib.util.find_spec(\"sqlalchemy\") is not None\n\n# Optional tools for feature decoding\nPIL_AVAILABLE = importlib.util.find_spec(\"PIL\") is not None\nIS_OPUS_SUPPORTED = True\nIS_MP3_SUPPORTED = True\nTORCHCODEC_AVAILABLE = importlib.util.find_spec(\"torchcodec\") is not None\nTORCHVISION_AVAILABLE = importlib.util.find_spec(\"torchvision\") is not None\nPDFPLUMBER_AVAILABLE = importlib.util.find_spec(\"pdfplumber\") is not None\nNIBABEL_AVAILABLE = importlib.util.find_spec(\"nibabel\") is not None\n\n# Optional compression tools\nRARFILE_AVAILABLE = importlib.util.find_spec(\"rarfile\") is not None\nZSTANDARD_AVAILABLE = importlib.util.find_spec(\"zstandard\") is not None\nLZ4_AVAILABLE = importlib.util.find_spec(\"lz4\") is not None\nPY7ZR_AVAILABLE = importlib.util.find_spec(\"py7zr\") is not None\n\n# Cache location\nDEFAULT_XDG_CACHE_HOME = \"~/.cache\"\nXDG_CACHE_HOME = os.getenv(\"XDG_CACHE_HOME\", 
DEFAULT_XDG_CACHE_HOME)\nDEFAULT_HF_CACHE_HOME = os.path.join(XDG_CACHE_HOME, \"huggingface\")\nHF_CACHE_HOME = os.path.expanduser(os.getenv(\"HF_HOME\", DEFAULT_HF_CACHE_HOME))\n\nDEFAULT_HF_DATASETS_CACHE = os.path.join(HF_CACHE_HOME, \"datasets\")\nHF_DATASETS_CACHE = Path(os.getenv(\"HF_DATASETS_CACHE\", DEFAULT_HF_DATASETS_CACHE))\n\nDEFAULT_HF_MODULES_CACHE = os.path.join(HF_CACHE_HOME, \"modules\")\nHF_MODULES_CACHE = Path(os.getenv(\"HF_MODULES_CACHE\", DEFAULT_HF_MODULES_CACHE))\n\nDOWNLOADED_DATASETS_DIR = \"downloads\"\nDEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(HF_DATASETS_CACHE, DOWNLOADED_DATASETS_DIR)\nDOWNLOADED_DATASETS_PATH = Path(os.getenv(\"HF_DATASETS_DOWNLOADED_DATASETS_PATH\", DEFAULT_DOWNLOADED_DATASETS_PATH))\n\nEXTRACTED_DATASETS_DIR = \"extracted\"\nDEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR)\nEXTRACTED_DATASETS_PATH = Path(os.getenv(\"HF_DATASETS_EXTRACTED_DATASETS_PATH\", DEFAULT_EXTRACTED_DATASETS_PATH))\n\n# Cached dataset info options\nSAVE_ORIGINAL_SHARD_LENGTHS = False\n\n# Download count for the website\nHF_UPDATE_DOWNLOAD_COUNTS = (\n    os.environ.get(\"HF_UPDATE_DOWNLOAD_COUNTS\", \"AUTO\").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES\n)\n\n# For downloads and to check remote files metadata\nHF_DATASETS_MULTITHREADING_MAX_WORKERS = 16\n\n# Dataset viewer API\nUSE_PARQUET_EXPORT = True\n\n# Batch size constants. For more info, see:\n# https://github.com/apache/arrow/blob/master/docs/source/cpp/arrays.rst#size-limitations-and-recommendations)\nDEFAULT_MAX_BATCH_SIZE = 1000\n\nDEFAULT_CDC_OPTIONS = {\"min_chunk_size\": 256 * 1024, \"max_chunk_size\": 1024 * 1024, \"norm_level\": 0}\n\n# Size of the preloaded record batch in `Dataset.__iter__`\nARROW_READER_BATCH_SIZE_IN_DATASET_ITER = 10\n\n# Max uncompressed shard size in bytes (e.g. 
to shard parquet datasets in push_to_hub or download_and_prepare)\nMAX_SHARD_SIZE = \"500MB\"\n\n# Max uncompressed row group size in bytes (e.g. for parquet files in push_to_hub or download_and_prepare)\nMAX_ROW_GROUP_SIZE = \"100MB\"\n\n# Parquet configuration\nPARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS = None\nPARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = None\nPARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS = None\nPARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS = None\n\n# Arrow configuration\nARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS = 100\nARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS = 100\nARROW_RECORD_BATCH_SIZE_FOR_BINARY_DATASETS = 100\nARROW_RECORD_BATCH_SIZE_FOR_VIDEO_DATASETS = 10\n\n# Offline mode\n_offline = os.environ.get(\"HF_DATASETS_OFFLINE\")\nHF_HUB_OFFLINE = constants.HF_HUB_OFFLINE if _offline is None else _offline.upper() in ENV_VARS_TRUE_VALUES\nHF_DATASETS_OFFLINE = HF_HUB_OFFLINE  # kept for backward-compatibility\n\n# Here, `True` will disable progress bars globally without possibility of enabling it\n# programmatically. 
`False` will enable them without possibility of disabling them.\n# If environment variable is not set (None), then the user is free to enable/disable\n# them programmatically.\n# TL;DR: env variable has priority over code\n__HF_DATASETS_DISABLE_PROGRESS_BARS = os.environ.get(\"HF_DATASETS_DISABLE_PROGRESS_BARS\")\nHF_DATASETS_DISABLE_PROGRESS_BARS: Optional[bool] = (\n    __HF_DATASETS_DISABLE_PROGRESS_BARS.upper() in ENV_VARS_TRUE_VALUES\n    if __HF_DATASETS_DISABLE_PROGRESS_BARS is not None\n    else None\n)\n\n# In-memory\nDEFAULT_IN_MEMORY_MAX_SIZE = 0  # Disabled\nIN_MEMORY_MAX_SIZE = float(os.environ.get(\"HF_DATASETS_IN_MEMORY_MAX_SIZE\", DEFAULT_IN_MEMORY_MAX_SIZE))\n\n# File names\nDATASET_ARROW_FILENAME = \"dataset.arrow\"\nDATASET_INDICES_FILENAME = \"indices.arrow\"\nDATASET_STATE_JSON_FILENAME = \"state.json\"\nDATASET_INFO_FILENAME = \"dataset_info.json\"\nDATASETDICT_INFOS_FILENAME = \"dataset_infos.json\"\nLICENSE_FILENAME = \"LICENSE\"\nDATASETDICT_JSON_FILENAME = \"dataset_dict.json\"\nMETADATA_CONFIGS_FIELD = \"configs\"\nREPOCARD_FILENAME = \"README.md\"\nREPOYAML_FILENAME = \".huggingface.yaml\"\n\nMODULE_NAME_FOR_DYNAMIC_MODULES = \"datasets_modules\"\n\nMAX_DATASET_CONFIG_ID_READABLE_LENGTH = 255\n\n# Temporary cache directory prefix\nTEMP_CACHE_DIR_PREFIX = \"hf_datasets-\"\n\n# Streaming\nSTREAMING_READ_MAX_RETRIES = 20\nSTREAMING_READ_RETRY_INTERVAL = 5\nSTREAMING_READ_SERVER_UNAVAILABLE_RETRY_INTERVAL = 20\nSTREAMING_READ_RATE_LIMIT_RETRY_INTERVAL = 60\nSTREAMING_OPEN_MAX_RETRIES = 20\nSTREAMING_OPEN_RETRY_INTERVAL = 5\n\n# Datasets repositories exploration\nARCHIVES_MAX_NUMBER_FOR_MODULE_INFERENCE = 10\n\n# Async map functions\nMAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL = 1000\n\n# Progress bars\nPBAR_REFRESH_TIME_INTERVAL = 0.05  # 20 progress updates per sec\n\n# Maximum number of uploaded files per commit\nUPLOADS_MAX_NUMBER_PER_COMMIT = 50\n\n# Backward compatibility\nMAX_TABLE_NBYTES_FOR_PICKLING = 4 << 30\n"
  },
  {
    "path": "src/datasets/data_files.py",
    "content": "import os\nimport re\nfrom functools import partial\nfrom glob import has_magic\nfrom pathlib import Path, PurePath\nfrom typing import Callable, Optional, Union\n\nimport huggingface_hub\nfrom fsspec.core import url_to_fs\nfrom huggingface_hub import HfFileSystem\nfrom packaging import version\nfrom tqdm.contrib.concurrent import thread_map\n\nfrom . import config\nfrom .download import DownloadConfig\nfrom .naming import _split_re\nfrom .splits import Split\nfrom .utils import logging\nfrom .utils import tqdm as hf_tqdm\nfrom .utils.file_utils import _prepare_path_and_storage_options, is_local_path, is_relative_path, xbasename, xjoin\nfrom .utils.py_utils import string_to_dict\n\n\nSingleOriginMetadata = Union[tuple[str, str], tuple[str], tuple[()]]\n\n\nSANITIZED_DEFAULT_SPLIT = str(Split.TRAIN)\n\n\nlogger = logging.get_logger(__name__)\n\n\nclass Url(str):\n    pass\n\n\nclass EmptyDatasetError(FileNotFoundError):\n    pass\n\n\nSPLIT_PATTERN_SHARDED = \"data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*\"\n\nSPLIT_KEYWORDS = {\n    Split.TRAIN: [\"train\", \"training\"],\n    Split.VALIDATION: [\"validation\", \"valid\", \"dev\", \"val\"],\n    Split.TEST: [\"test\", \"testing\", \"eval\", \"evaluation\"],\n}\nNON_WORDS_CHARS = \"-._ 0-9\"\nif config.FSSPEC_VERSION < version.parse(\"2023.9.0\"):\n    KEYWORDS_IN_FILENAME_BASE_PATTERNS = [\"**[{sep}/]{keyword}[{sep}]*\", \"{keyword}[{sep}]*\"]\n    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [\n        \"{keyword}/**\",\n        \"{keyword}[{sep}]*/**\",\n        \"**[{sep}/]{keyword}/**\",\n        \"**[{sep}/]{keyword}[{sep}]*/**\",\n    ]\nelif config.FSSPEC_VERSION < version.parse(\"2023.12.0\"):\n    KEYWORDS_IN_FILENAME_BASE_PATTERNS = [\"**/*[{sep}/]{keyword}[{sep}]*\", \"{keyword}[{sep}]*\"]\n    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [\n        \"{keyword}/**/*\",\n        \"{keyword}[{sep}]*/**/*\",\n        \"**/*[{sep}/]{keyword}/**/*\",\n        
\"**/*[{sep}/]{keyword}[{sep}]*/**/*\",\n    ]\nelse:\n    KEYWORDS_IN_FILENAME_BASE_PATTERNS = [\"**/{keyword}[{sep}]*\", \"**/*[{sep}]{keyword}[{sep}]*\"]\n    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [\n        \"**/{keyword}/**\",\n        \"**/{keyword}[{sep}]*/**\",\n        \"**/*[{sep}]{keyword}/**\",\n        \"**/*[{sep}]{keyword}[{sep}]*/**\",\n    ]\n\nDEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]\nDEFAULT_PATTERNS_SPLIT_IN_FILENAME = {\n    split: [\n        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)\n        for keyword in SPLIT_KEYWORDS[split]\n        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS\n    ]\n    for split in DEFAULT_SPLITS\n}\nDEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {\n    split: [\n        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)\n        for keyword in SPLIT_KEYWORDS[split]\n        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS\n    ]\n    for split in DEFAULT_SPLITS\n}\n\n\nDEFAULT_PATTERNS_ALL = {\n    Split.TRAIN: [\"**\"],\n}\n\nDEFAULT_PATTERNS_LOGS = {\"logs\": [\"**/*.eval\"]}\n\nALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]\nALL_DEFAULT_PATTERNS = [\n    DEFAULT_PATTERNS_LOGS,\n    DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,\n    DEFAULT_PATTERNS_SPLIT_IN_FILENAME,\n    DEFAULT_PATTERNS_ALL,\n]\nWILDCARD_CHARACTERS = \"*[]\"\nFILES_TO_IGNORE = [\n    \"README.md\",\n    \"config.json\",\n    \"dataset_info.json\",\n    \"dataset_infos.json\",\n    \"dummy_data.zip\",\n    \"dataset_dict.json\",\n]\n\n\ndef contains_wildcards(pattern: str) -> bool:\n    return any(wildcard_character in pattern for wildcard_character in WILDCARD_CHARACTERS)\n\n\ndef sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[str], \"DataFilesList\"]]:\n    \"\"\"\n    Take the data_files patterns from the user, and format them into a dictionary.\n    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).\n    The default split is \"train\".\n\n    
Returns:\n        patterns: dictionary of split_name -> list of patterns\n    \"\"\"\n    if isinstance(patterns, dict):\n        return {str(key): value if isinstance(value, list) else [value] for key, value in patterns.items()}\n    elif isinstance(patterns, str):\n        return {SANITIZED_DEFAULT_SPLIT: [patterns]}\n    elif isinstance(patterns, list):\n        if any(isinstance(pattern, dict) for pattern in patterns):\n            for pattern in patterns:\n                if not (\n                    isinstance(pattern, dict)\n                    and len(pattern) == 2\n                    and \"split\" in pattern\n                    and isinstance(pattern.get(\"path\"), (str, list))\n                ):\n                    raise ValueError(\n                        \"Invalid format for data_files entry. \"\n                        \"Each item must be a dictionary with the structure \"\n                        \"{'split': <split_name>, 'path': <path_or_list_of_paths>}.\\n\"\n                        f\"Received: {pattern}\"\n                    )\n            splits = [pattern[\"split\"] for pattern in patterns]\n            if len(set(splits)) != len(splits):\n                raise ValueError(f\"Some splits are duplicated in data_files: {splits}\")\n            return {\n                str(pattern[\"split\"]): pattern[\"path\"] if isinstance(pattern[\"path\"], list) else [pattern[\"path\"]]\n                for pattern in patterns\n            }\n        else:\n            return {SANITIZED_DEFAULT_SPLIT: patterns}\n    else:\n        return sanitize_patterns(list(patterns))\n\n\ndef _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:\n    \"\"\"\n    When a path matches a pattern, we additionally check if it's inside a special directory\n    we ignore by default (if it starts with a double underscore).\n\n    Users can still explicitly request a filepath inside such a directory if \"__pycache__\" is\n    mentioned explicitly in 
the requested pattern.\n\n    Some examples:\n\n    base directory:\n\n        ./\n        └── __pycache__\n            └── b.txt\n\n    >>> _is_inside_unrequested_special_dir(\"__pycache__/b.txt\", \"**\")\n    True\n    >>> _is_inside_unrequested_special_dir(\"__pycache__/b.txt\", \"*/b.txt\")\n    True\n    >>> _is_inside_unrequested_special_dir(\"__pycache__/b.txt\", \"__pycache__/*\")\n    False\n    >>> _is_inside_unrequested_special_dir(\"__pycache__/b.txt\", \"__*/*\")\n    False\n    \"\"\"\n    # We just need to check if every special directories from the path is present explicitly in the pattern.\n    # Since we assume that the path matches the pattern, it's equivalent to counting that both\n    # the parent path and the parent pattern have the same number of special directories.\n    data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith(\"__\")]\n    data_dirs_to_ignore_in_pattern = [part for part in PurePath(pattern).parent.parts if part.startswith(\"__\")]\n    return len(data_dirs_to_ignore_in_path) != len(data_dirs_to_ignore_in_pattern)\n\n\ndef _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool:\n    \"\"\"\n    When a path matches a pattern, we additionally check if it's a hidden file or if it's inside\n    a hidden directory we ignore by default, i.e. 
if the file name or a parent directory name starts with a dot.\n\n    Users can still explicitly request a filepath that is hidden or is inside a hidden directory\n    if the hidden part is mentioned explicitly in the requested pattern.\n\n    Some examples:\n\n    base directory:\n\n        ./\n        └── .hidden_file.txt\n\n    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(\".hidden_file.txt\", \"**\")\n    True\n    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(\".hidden_file.txt\", \".*\")\n    False\n\n    base directory:\n\n        ./\n        └── .hidden_dir\n            └── a.txt\n\n    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(\".hidden_dir/a.txt\", \"**\")\n    True\n    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(\".hidden_dir/a.txt\", \".*/*\")\n    False\n    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(\".hidden_dir/a.txt\", \".hidden_dir/*\")\n    False\n\n    base directory:\n\n        ./\n        └── .hidden_dir\n            └── .hidden_file.txt\n\n    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(\".hidden_dir/.hidden_file.txt\", \"**\")\n    True\n    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(\".hidden_dir/.hidden_file.txt\", \".*/*\")\n    True\n    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(\".hidden_dir/.hidden_file.txt\", \".*/.*\")\n    False\n    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(\".hidden_dir/.hidden_file.txt\", \".hidden_dir/*\")\n    True\n    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(\".hidden_dir/.hidden_file.txt\", \".hidden_dir/.*\")\n    False\n    \"\"\"\n    # We just need to check if every hidden part from the path is present explicitly in the pattern.\n    # Since we assume that the path matches the pattern, it's equivalent to counting that both\n    # the path and the pattern have 
the same number of hidden parts.\n    hidden_directories_in_path = [\n        part for part in PurePath(matched_rel_path).parts if part.startswith(\".\") and not set(part) == {\".\"}\n    ]\n    hidden_directories_in_pattern = [\n        part for part in PurePath(pattern).parts if part.startswith(\".\") and not set(part) == {\".\"}\n    ]\n    return len(hidden_directories_in_path) != len(hidden_directories_in_pattern)\n\n\ndef _get_data_files_patterns(pattern_resolver: Callable[[str], list[str]]) -> dict[str, list[str]]:\n    \"\"\"\n    Get the default pattern from a directory or repository by testing all the supported patterns.\n    The first patterns to return a non-empty list of data files is returned.\n\n    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.\n    \"\"\"\n    # first check the split patterns like data/{split}-00000-of-00001.parquet\n    for split_pattern in ALL_SPLIT_PATTERNS:\n        pattern = split_pattern.replace(\"{split}\", \"*\")\n        try:\n            data_files = pattern_resolver(pattern)\n        except FileNotFoundError:\n            continue\n        if len(data_files) > 0:\n            splits: set[str] = set()\n            for p in data_files:\n                p_parts = string_to_dict(xbasename(p), xbasename(split_pattern))\n                assert p_parts is not None\n                splits.add(p_parts[\"split\"])\n\n            if any(not re.match(_split_re, split) for split in splits):\n                raise ValueError(f\"Split name should match '{_split_re}'' but got '{splits}'.\")\n            sorted_splits = [str(split) for split in DEFAULT_SPLITS if split in splits] + sorted(\n                splits - {str(split) for split in DEFAULT_SPLITS}\n            )\n            return {split: [split_pattern.format(split=split)] for split in sorted_splits}\n    # then check the default patterns based on train/valid/test splits\n    for patterns_dict in 
ALL_DEFAULT_PATTERNS:\n        non_empty_splits = []\n        for split, patterns in patterns_dict.items():\n            for pattern in patterns:\n                try:\n                    data_files = pattern_resolver(pattern)\n                except FileNotFoundError:\n                    continue\n                if len(data_files) > 0:\n                    non_empty_splits.append(split)\n                    break\n        if non_empty_splits:\n            return {split: patterns_dict[split] for split in non_empty_splits}\n    raise FileNotFoundError(f\"Couldn't resolve pattern {pattern} with resolver {pattern_resolver}\")\n\n\ndef resolve_pattern(\n    pattern: str,\n    base_path: str,\n    allowed_extensions: Optional[list[str]] = None,\n    download_config: Optional[DownloadConfig] = None,\n) -> list[str]:\n    \"\"\"\n    Resolve the paths and URLs of the data files from the pattern passed by the user.\n\n    You can use patterns to resolve multiple local files. Here are a few examples:\n    - *.csv to match all the CSV files at the first level\n    - **.csv to match all the CSV files at any level\n    - data/* to match all the files inside \"data\"\n    - data/** to match all the files inside \"data\" and its subdirectories\n\n    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to\n    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix\n    other than a forward slash /.\n\n    More generally:\n    - '*' matches any character except a forward-slash (to match just the file or directory name)\n    - '**' matches any character including a forward-slash /\n\n    Hidden files and directories (i.e. 
whose names start with a dot) are ignored, unless they are explicitly requested.\n    The same applies to special directories that start with a double underscore like \"__pycache__\".\n    You can still include one if the pattern explicitly mentions it:\n    - to include a hidden file: \"*/.hidden.txt\" or \"*/.*\"\n    - to include a hidden directory: \".hidden/*\" or \".*/*\"\n    - to include a special directory: \"__special__/*\" or \"__*/*\"\n\n    Example::\n\n        >>> from datasets.data_files import resolve_pattern\n        >>> base_path = \".\"\n        >>> resolve_pattern(\"docs/**/*.py\", base_path)\n        [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']\n\n    Args:\n        pattern (str): Unix pattern or paths or URLs of the data files to resolve.\n            The paths can be absolute or relative to base_path.\n            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.\n        base_path (str): Base path to use when resolving relative paths.\n        allowed_extensions (Optional[list], optional): White-list of file extensions to use. 
Defaults to None (all extensions).\n            For example: allowed_extensions=[\".csv\", \".json\", \".txt\", \".parquet\"]\n        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.\n    Returns:\n        List[str]: List of paths or URLs to the local or remote files that match the patterns.\n    \"\"\"\n    if is_relative_path(pattern):\n        pattern = xjoin(base_path, pattern)\n    elif is_local_path(pattern):\n        base_path = os.path.splitdrive(pattern)[0] + os.sep\n    else:\n        base_path = \"\"\n    pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config)\n    fs, fs_pattern = url_to_fs(pattern, **storage_options)\n    files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)}\n    protocol = (\n        pattern.split(\"://\")[0]\n        if \"://\" in pattern\n        else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0])\n    )\n    protocol_prefix = protocol + \"://\" if protocol != \"file\" else \"\"\n    glob_kwargs = {}\n    if protocol == \"hf\":\n        # 10 times faster glob with detail=True (ignores costly info like lastCommit)\n        glob_kwargs[\"expand_info\"] = False\n\n    # if the pattern contains hops like \"zip://csv/*.csv::data.zip\", we need to keep them after globbing\n    _, *rest_hops = pattern.split(\"::\")\n    matched_paths = []\n    for filepath, info in fs.glob(fs_pattern, detail=True, **glob_kwargs).items():\n        if not (info[\"type\"] == \"file\" or (info.get(\"islink\") and os.path.isfile(os.path.realpath(filepath)))) or (\n            xbasename(filepath) in files_to_ignore\n        ):\n            continue\n        if _is_inside_unrequested_special_dir(filepath, fs_pattern):\n            continue\n        if _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(filepath, fs_pattern):\n            continue\n        filepath = filepath if \"://\" in filepath else protocol_prefix + filepath\n    
    if rest_hops:\n            filepath = \"::\".join([filepath] + rest_hops)\n        matched_paths.append(filepath)\n    # ignore .ipynb and __pycache__, but keep /../\n    if allowed_extensions is not None:\n        out = [\n            filepath\n            for filepath in matched_paths\n            if any(\".\" + suffix in allowed_extensions for suffix in xbasename(filepath).split(\".\")[1:])\n        ]\n        if len(out) < len(matched_paths):\n            invalid_matched_files = list(set(matched_paths) - set(out))\n            logger.info(\n                f\"Some files matched the pattern '{pattern}' but don't have valid data file extensions: {invalid_matched_files}\"\n            )\n    else:\n        out = matched_paths\n    if not out:\n        error_msg = f\"Unable to find '{pattern}'\"\n        if allowed_extensions is not None:\n            error_msg += f\" with any supported extension {list(allowed_extensions)}\"\n        raise FileNotFoundError(error_msg)\n    return out\n\n\ndef get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] = None) -> dict[str, list[str]]:\n    \"\"\"\n    Get the default pattern from a directory testing all the supported patterns.\n    The first patterns to return a non-empty list of data files is returned.\n\n    Some examples of supported patterns:\n\n    Input:\n\n        my_dataset_repository/\n        ├── README.md\n        └── dataset.csv\n\n    Output:\n\n        {'train': ['**']}\n\n    Input:\n\n        my_dataset_repository/\n        ├── README.md\n        ├── train.csv\n        └── test.csv\n\n        my_dataset_repository/\n        ├── README.md\n        └── data/\n            ├── train.csv\n            └── test.csv\n\n        my_dataset_repository/\n        ├── README.md\n        ├── train_0.csv\n        ├── train_1.csv\n        ├── train_2.csv\n        ├── train_3.csv\n        ├── test_0.csv\n        └── test_1.csv\n\n    Output:\n\n        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 
0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],\n         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}\n\n    Input:\n\n        my_dataset_repository/\n        ├── README.md\n        └── data/\n            ├── train/\n            │   ├── shard_0.csv\n            │   ├── shard_1.csv\n            │   ├── shard_2.csv\n            │   └── shard_3.csv\n            └── test/\n                ├── shard_0.csv\n                └── shard_1.csv\n\n    Output:\n\n        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],\n         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}\n\n    Input:\n\n        my_dataset_repository/\n        ├── README.md\n        └── data/\n            ├── train-00000-of-00003.csv\n            ├── train-00001-of-00003.csv\n            ├── train-00002-of-00003.csv\n            ├── test-00000-of-00001.csv\n            ├── random-00000-of-00003.csv\n            ├── random-00001-of-00003.csv\n            └── random-00002-of-00003.csv\n\n    Output:\n\n        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],\n         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],\n         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}\n\n    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.\n    \"\"\"\n    resolver = partial(resolve_pattern, base_path=base_path, download_config=download_config)\n    try:\n        return _get_data_files_patterns(resolver)\n    except FileNotFoundError:\n        raise EmptyDatasetError(f\"The directory at {base_path} doesn't contain any data files\") from None\n\n\ndef _get_single_origin_metadata(\n    data_file: str,\n    
download_config: Optional[DownloadConfig] = None,\n) -> SingleOriginMetadata:\n    if data_file.startswith(config.HF_ENDPOINT):\n        fs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)\n        data_file = \"hf://\" + data_file[len(config.HF_ENDPOINT) + 1 :]\n        data_file = data_file.replace(\"/resolve/\", \"/\" if data_file.startswith(\"hf://buckets/\") else \"@\", 1)\n        fs_path = data_file\n    else:\n        data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config)\n        fs, fs_path = url_to_fs(data_file, **storage_options)\n    if isinstance(fs, HfFileSystem):\n        resolved_path = fs.resolve_path(fs_path)\n        if hasattr(resolved_path, \"revision\"):  # no revision for buckets\n            return resolved_path.repo_id, resolved_path.revision\n    info = fs.info(fs_path)\n    # s3fs uses \"ETag\", gcsfs uses \"etag\", and for local we simply check mtime\n    for key in [\"ETag\", \"etag\", \"mtime\"]:\n        if key in info:\n            return (str(info[key]),)\n    return ()\n\n\ndef _get_origin_metadata(\n    data_files: list[str],\n    download_config: Optional[DownloadConfig] = None,\n    max_workers: Optional[int] = None,\n) -> list[SingleOriginMetadata]:\n    max_workers = max_workers if max_workers is not None else config.HF_DATASETS_MULTITHREADING_MAX_WORKERS\n    if all(\"hf://\" in data_file for data_file in data_files):\n        # No need for multithreading here since the origin metadata of HF files\n        # is (repo_id, revision) and is cached after first .info() call.\n        return [\n            _get_single_origin_metadata(data_file, download_config=download_config)\n            for data_file in hf_tqdm(\n                data_files,\n                desc=\"Resolving data files\",\n                # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached\n                disable=len(data_files) <= 16 
or None,\n            )\n        ]\n    return thread_map(\n        partial(_get_single_origin_metadata, download_config=download_config),\n        data_files,\n        max_workers=max_workers,\n        tqdm_class=hf_tqdm,\n        desc=\"Resolving data files\",\n        # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached\n        disable=len(data_files) <= 16 or None,\n    )\n\n\nclass DataFilesList(list[str]):\n    \"\"\"\n    List of data files (absolute local paths or URLs).\n    It has two construction methods given the user's data files patterns:\n    - ``from_hf_repo``: resolve patterns inside a dataset repository\n    - ``from_local_or_remote``: resolve patterns from a local path\n\n    Moreover, DataFilesList has an additional attribute ``origin_metadata``.\n    It can store:\n    - the last modified time of local files\n    - ETag of remote files\n    - commit sha of a dataset repository\n\n    Thanks to this additional attribute, it is possible to hash the list\n    and get a different hash if and only if at least one file changed.\n    This is useful for caching Dataset objects that are obtained from a list of data files.\n    \"\"\"\n\n    def __init__(self, data_files: list[str], origin_metadata: list[SingleOriginMetadata]) -> None:\n        super().__init__(data_files)\n        self.origin_metadata = origin_metadata\n\n    def __add__(self, other: \"DataFilesList\") -> \"DataFilesList\":\n        return DataFilesList([*self, *other], self.origin_metadata + other.origin_metadata)\n\n    @classmethod\n    def from_hf_repo(\n        cls,\n        patterns: list[str],\n        dataset_info: huggingface_hub.hf_api.DatasetInfo,\n        base_path: Optional[str] = None,\n        allowed_extensions: Optional[list[str]] = None,\n        download_config: Optional[DownloadConfig] = None,\n    ) -> \"DataFilesList\":\n        base_path = f\"hf://datasets/{dataset_info.id}@{dataset_info.sha}/{base_path or 
''}\".rstrip(\"/\")\n        return cls.from_patterns(\n            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config\n        )\n\n    @classmethod\n    def from_local_or_remote(\n        cls,\n        patterns: list[str],\n        base_path: Optional[str] = None,\n        allowed_extensions: Optional[list[str]] = None,\n        download_config: Optional[DownloadConfig] = None,\n    ) -> \"DataFilesList\":\n        base_path = base_path if base_path is not None else Path().resolve().as_posix()\n        return cls.from_patterns(\n            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config\n        )\n\n    @classmethod\n    def from_patterns(\n        cls,\n        patterns: list[str],\n        base_path: Optional[str] = None,\n        allowed_extensions: Optional[list[str]] = None,\n        download_config: Optional[DownloadConfig] = None,\n    ) -> \"DataFilesList\":\n        base_path = base_path if base_path is not None else Path().resolve().as_posix()\n        data_files = []\n        for pattern in patterns:\n            try:\n                data_files.extend(\n                    resolve_pattern(\n                        pattern,\n                        base_path=base_path,\n                        allowed_extensions=allowed_extensions,\n                        download_config=download_config,\n                    )\n                )\n            except FileNotFoundError:\n                if not has_magic(pattern):\n                    raise\n        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)\n        return cls(data_files, origin_metadata)\n\n    def filter(\n        self, *, extensions: Optional[list[str]] = None, file_names: Optional[list[str]] = None\n    ) -> \"DataFilesList\":\n        patterns = []\n        if extensions:\n            ext_pattern = \"|\".join(re.escape(ext) for ext in extensions)\n       
     patterns.append(re.compile(f\".*({ext_pattern})(\\\\..+)?$\"))\n        if file_names:\n            fn_pattern = \"|\".join(re.escape(fn) for fn in file_names)\n            patterns.append(re.compile(rf\".*[\\/]?({fn_pattern})$\"))\n        if patterns:\n            return DataFilesList(\n                [data_file for data_file in self if any(pattern.match(data_file) for pattern in patterns)],\n                origin_metadata=self.origin_metadata,\n            )\n        else:\n            return DataFilesList(list(self), origin_metadata=self.origin_metadata)\n\n\nclass DataFilesDict(dict[str, DataFilesList]):\n    \"\"\"\n    Dict of split_name -> list of data files (absolute local paths or URLs).\n    It has two construction methods given the user's data files patterns :\n    - ``from_hf_repo``: resolve patterns inside a dataset repository\n    - ``from_local_or_remote``: resolve patterns from a local path\n\n    Moreover, each list is a DataFilesList. It is possible to hash the dictionary\n    and get a different hash if and only if at least one file changed.\n    For more info, see [`DataFilesList`].\n\n    This is useful for caching Dataset objects that are obtained from a list of data files.\n\n    Changing the order of the keys of this dictionary also doesn't change its hash.\n    \"\"\"\n\n    @classmethod\n    def from_local_or_remote(\n        cls,\n        patterns: dict[str, Union[list[str], DataFilesList]],\n        base_path: Optional[str] = None,\n        allowed_extensions: Optional[list[str]] = None,\n        download_config: Optional[DownloadConfig] = None,\n    ) -> \"DataFilesDict\":\n        out = cls()\n        for key, patterns_for_key in patterns.items():\n            out[key] = (\n                patterns_for_key\n                if isinstance(patterns_for_key, DataFilesList)\n                else DataFilesList.from_local_or_remote(\n                    patterns_for_key,\n                    base_path=base_path,\n                    
allowed_extensions=allowed_extensions,\n                    download_config=download_config,\n                )\n            )\n        return out\n\n    @classmethod\n    def from_hf_repo(\n        cls,\n        patterns: dict[str, Union[list[str], DataFilesList]],\n        dataset_info: huggingface_hub.hf_api.DatasetInfo,\n        base_path: Optional[str] = None,\n        allowed_extensions: Optional[list[str]] = None,\n        download_config: Optional[DownloadConfig] = None,\n    ) -> \"DataFilesDict\":\n        out = cls()\n        for key, patterns_for_key in patterns.items():\n            out[key] = (\n                patterns_for_key\n                if isinstance(patterns_for_key, DataFilesList)\n                else DataFilesList.from_hf_repo(\n                    patterns_for_key,\n                    dataset_info=dataset_info,\n                    base_path=base_path,\n                    allowed_extensions=allowed_extensions,\n                    download_config=download_config,\n                )\n            )\n        return out\n\n    @classmethod\n    def from_patterns(\n        cls,\n        patterns: dict[str, Union[list[str], DataFilesList]],\n        base_path: Optional[str] = None,\n        allowed_extensions: Optional[list[str]] = None,\n        download_config: Optional[DownloadConfig] = None,\n    ) -> \"DataFilesDict\":\n        out = cls()\n        for key, patterns_for_key in patterns.items():\n            out[key] = (\n                patterns_for_key\n                if isinstance(patterns_for_key, DataFilesList)\n                else DataFilesList.from_patterns(\n                    patterns_for_key,\n                    base_path=base_path,\n                    allowed_extensions=allowed_extensions,\n                    download_config=download_config,\n                )\n            )\n        return out\n\n    def filter(\n        self, *, extensions: Optional[list[str]] = None, file_names: Optional[list[str]] = None\n    ) -> 
\"DataFilesDict\":\n        out = type(self)()\n        for key, data_files_list in self.items():\n            out[key] = data_files_list.filter(extensions=extensions, file_names=file_names)\n        return out\n\n\nclass DataFilesPatternsList(list[str]):\n    \"\"\"\n    List of data files patterns (absolute local paths or URLs).\n    For each pattern there should also be a list of allowed extensions\n    to keep, or None to keep all the files for the pattern.\n    \"\"\"\n\n    def __init__(\n        self,\n        patterns: list[str],\n        allowed_extensions: list[Optional[list[str]]],\n    ):\n        super().__init__(patterns)\n        self.allowed_extensions = allowed_extensions\n\n    def __add__(self, other):\n        return DataFilesList([*self, *other], self.allowed_extensions + other.allowed_extensions)\n\n    @classmethod\n    def from_patterns(\n        cls, patterns: list[str], allowed_extensions: Optional[list[str]] = None\n    ) -> \"DataFilesPatternsList\":\n        return cls(patterns, [allowed_extensions] * len(patterns))\n\n    def resolve(\n        self,\n        base_path: str,\n        download_config: Optional[DownloadConfig] = None,\n    ) -> \"DataFilesList\":\n        base_path = base_path if base_path is not None else Path().resolve().as_posix()\n        data_files = []\n        for pattern, allowed_extensions in zip(self, self.allowed_extensions):\n            try:\n                data_files.extend(\n                    resolve_pattern(\n                        pattern,\n                        base_path=base_path,\n                        allowed_extensions=allowed_extensions,\n                        download_config=download_config,\n                    )\n                )\n            except FileNotFoundError:\n                if not has_magic(pattern):\n                    raise\n        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)\n        return DataFilesList(data_files, 
origin_metadata)\n\n    def filter_extensions(self, extensions: list[str]) -> \"DataFilesPatternsList\":\n        return DataFilesPatternsList(\n            self, [allowed_extensions + extensions for allowed_extensions in self.allowed_extensions]\n        )\n\n\nclass DataFilesPatternsDict(dict[str, DataFilesPatternsList]):\n    \"\"\"\n    Dict of split_name -> list of data files patterns (absolute local paths or URLs).\n    \"\"\"\n\n    @classmethod\n    def from_patterns(\n        cls, patterns: dict[str, list[str]], allowed_extensions: Optional[list[str]] = None\n    ) -> \"DataFilesPatternsDict\":\n        out = cls()\n        for key, patterns_for_key in patterns.items():\n            out[key] = (\n                patterns_for_key\n                if isinstance(patterns_for_key, DataFilesPatternsList)\n                else DataFilesPatternsList.from_patterns(\n                    patterns_for_key,\n                    allowed_extensions=allowed_extensions,\n                )\n            )\n        return out\n\n    def resolve(\n        self,\n        base_path: str,\n        download_config: Optional[DownloadConfig] = None,\n    ) -> \"DataFilesDict\":\n        out = DataFilesDict()\n        for key, data_files_patterns_list in self.items():\n            out[key] = data_files_patterns_list.resolve(base_path, download_config)\n        return out\n\n    def filter_extensions(self, extensions: list[str]) -> \"DataFilesPatternsDict\":\n        out = type(self)()\n        for key, data_files_patterns_list in self.items():\n            out[key] = data_files_patterns_list.filter_extensions(extensions)\n        return out\n"
  },
  {
    "path": "src/datasets/dataset_dict.py",
    "content": "import contextlib\nimport copy\nimport itertools\nimport json\nimport math\nimport posixpath\nimport random\nimport re\nimport time\nfrom collections.abc import Sequence\nfrom functools import partial\nfrom typing import Callable, Literal, Optional, Union\n\nimport fsspec\nimport numpy as np\nfrom fsspec.core import url_to_fs\nfrom fsspec.implementations.dirfs import DirFileSystem\nfrom huggingface_hub import (\n    CommitInfo,\n    CommitOperationAdd,\n    CommitOperationDelete,\n    HfApi,\n    HfFileSystem,\n    HfFileSystemResolvedPath,\n)\nfrom huggingface_hub.utils import EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError\nfrom packaging import version\n\nfrom . import config\nfrom .arrow_dataset import (\n    Dataset,\n    _get_updated_dataset_card,\n)\nfrom .features import Features\nfrom .features.features import FeatureType\nfrom .iterable_dataset import IterableDataset\nfrom .naming import _split_re\nfrom .splits import NamedSplit, Split, SplitInfo\nfrom .table import Table\nfrom .utils import logging\nfrom .utils.doc_utils import is_documented_by\nfrom .utils.typing import PathLike\n\n\nif config.HF_HUB_VERSION >= version.parse(\"1.6.0\"):\n    from huggingface_hub.errors import BucketNotFoundError\n    from huggingface_hub.hf_file_system import HfFileSystemResolvedBucketPath, HfFileSystemResolvedRepositoryPath\n\nelse:\n    BucketNotFoundError = None\n    HfFileSystemResolvedBucketPath = None\n    HfFileSystemResolvedRepositoryPath = HfFileSystemResolvedPath\n\n\nlogger = logging.get_logger(__name__)\n\n\nclass bind(partial):\n    def __call__(self, *fn_args, **fn_kwargs):\n        return self.func(*fn_args, *self.args, **fn_kwargs)\n\n\nclass DatasetDict(dict[Union[str, NamedSplit], \"Dataset\"]):\n    \"\"\"A dictionary (dict of str: datasets.Dataset) with dataset transforms methods (map, filter, etc.)\"\"\"\n\n    def _check_values_type(self):\n        for dataset in self.values():\n            if not isinstance(dataset, 
Dataset):\n                raise TypeError(f\"Values in `DatasetDict` should be of type `Dataset` but got type '{type(dataset)}'\")\n\n    def _check_values_features(self):\n        items = list(self.items())\n        for item_a, item_b in zip(items[:-1], items[1:]):\n            if item_a[1].features != item_b[1].features:\n                raise ValueError(\n                    f\"All datasets in `DatasetDict` should have the same features but features for '{item_a[0]}' and '{item_b[0]}' don't match: {item_a[1].features} != {item_b[1].features}\"\n                )\n\n    def __enter__(self):\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        # Here `del` is used to del the pyarrow tables. This properly closes the files used for memory mapped tables\n        for dataset in self.values():\n            if hasattr(dataset, \"_data\"):\n                del dataset._data\n            if hasattr(dataset, \"_indices\"):\n                del dataset._indices\n\n    def __getitem__(self, k) -> Dataset:\n        if isinstance(k, (str, NamedSplit)) or len(self) == 0:\n            return super().__getitem__(k)\n        else:\n            available_suggested_splits = [\n                split for split in (Split.TRAIN, Split.TEST, Split.VALIDATION) if split in self\n            ]\n            suggested_split = available_suggested_splits[0] if available_suggested_splits else list(self)[0]\n            raise KeyError(\n                f\"Invalid key: {k}. Please first select a split. For example: \"\n                f\"`my_dataset_dictionary['{suggested_split}'][{k}]`. 
\"\n                f\"Available splits: {sorted(self)}\"\n            )\n\n    @property\n    def data(self) -> dict[str, Table]:\n        \"\"\"The Apache Arrow tables backing each split.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.data\n        ```\n        \"\"\"\n        self._check_values_type()\n        return {k: dataset.data for k, dataset in self.items()}\n\n    @property\n    def cache_files(self) -> dict[str, dict]:\n        \"\"\"The cache files containing the Apache Arrow table backing each split.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.cache_files\n        {'test': [{'filename': '/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46/rotten_tomatoes_movie_review-test.arrow'}],\n         'train': [{'filename': '/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46/rotten_tomatoes_movie_review-train.arrow'}],\n         'validation': [{'filename': '/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46/rotten_tomatoes_movie_review-validation.arrow'}]}\n        ```\n        \"\"\"\n        self._check_values_type()\n        return {k: dataset.cache_files for k, dataset in self.items()}\n\n    @property\n    def num_columns(self) -> dict[str, int]:\n        \"\"\"Number of columns in each split of the dataset.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.num_columns\n        {'test': 2, 'train': 2, 
'validation': 2}\n        ```\n        \"\"\"\n        self._check_values_type()\n        return {k: dataset.num_columns for k, dataset in self.items()}\n\n    @property\n    def num_rows(self) -> dict[str, int]:\n        \"\"\"Number of rows in each split of the dataset.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.num_rows\n        {'test': 1066, 'train': 8530, 'validation': 1066}\n        ```\n        \"\"\"\n        self._check_values_type()\n        return {k: dataset.num_rows for k, dataset in self.items()}\n\n    @property\n    def column_names(self) -> dict[str, list[str]]:\n        \"\"\"Names of the columns in each split of the dataset.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.column_names\n        {'test': ['text', 'label'],\n         'train': ['text', 'label'],\n         'validation': ['text', 'label']}\n        ```\n        \"\"\"\n        self._check_values_type()\n        return {k: dataset.column_names for k, dataset in self.items()}\n\n    @property\n    def shape(self) -> dict[str, tuple[int]]:\n        \"\"\"Shape of each split of the dataset (number of rows, number of columns).\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.shape\n        {'test': (1066, 2), 'train': (8530, 2), 'validation': (1066, 2)}\n        ```\n        \"\"\"\n        self._check_values_type()\n        return {k: dataset.shape for k, dataset in self.items()}\n\n    def flatten(self, max_depth=16) -> \"DatasetDict\":\n        \"\"\"Flatten the Apache Arrow Table of each split (nested features are flattened).\n        Each column with a struct type is flattened into one column per 
struct field.\n        Other columns are left unchanged.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"rajpurkar/squad\")\n        >>> ds[\"train\"].features\n        {'id': Value('string'),\n         'title': Value('string'),\n         'context': Value('string'),\n         'question': Value('string'),\n         'answers.text': List(Value('string')),\n         'answers.answer_start': List(Value('int32'))}\n        >>> ds.flatten()\n        DatasetDict({\n            train: Dataset({\n                features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],\n                num_rows: 87599\n            })\n            validation: Dataset({\n                features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],\n                num_rows: 10570\n            })\n        })\n        ```\n        \"\"\"\n        self._check_values_type()\n        return DatasetDict({k: dataset.flatten(max_depth=max_depth) for k, dataset in self.items()})\n\n    def unique(self, column: str) -> dict[str, list]:\n        \"\"\"Return a list of the unique elements in a column for each split.\n\n        This is implemented in the low-level backend and as such, very fast.\n\n        Args:\n            column (`str`):\n                column name (list all the column names with [`~datasets.DatasetDict.column_names`])\n\n        Returns:\n            Dict[`str`, `list`]: Dictionary of unique elements in the given column.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.unique(\"label\")\n        {'test': [1, 0], 'train': [1, 0], 'validation': [1, 0]}\n        ```\n        \"\"\"\n        self._check_values_type()\n        return {k: dataset.unique(column) for k, dataset in self.items()}\n\n    def cleanup_cache_files(self) -> 
dict[str, int]:\n        \"\"\"Clean up all cache files in the dataset cache directory, except the currently used cache file if there is one.\n        Be careful when running this command that no other process is currently using other cache files.\n\n        Return:\n            `Dict` with the number of removed files for each split\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.cleanup_cache_files()\n        {'test': 0, 'train': 0, 'validation': 0}\n        ```\n        \"\"\"\n        self._check_values_type()\n        return {k: dataset.cleanup_cache_files() for k, dataset in self.items()}\n\n    def __repr__(self):\n        repr = \"\\n\".join([f\"{k}: {v}\" for k, v in self.items()])\n        repr = re.sub(r\"^\", \" \" * 4, repr, count=0, flags=re.M)\n        return f\"DatasetDict({{\\n{repr}\\n}})\"\n\n    def cast(self, features: Features) -> \"DatasetDict\":\n        \"\"\"\n        Cast the dataset to a new set of features.\n        The transformation is applied to all the datasets of the dataset dictionary.\n\n        Args:\n            features ([`Features`]):\n                New features to cast the dataset to.\n                The name and order of the fields in the features must match the current column names.\n                The type of the data must also be convertible from one type to the other.\n                For non-trivial conversion, e.g. 
`string` <-> `ClassLabel` you should use [`~DatasetDict.map`] to update the dataset.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset, ClassLabel, Value\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds[\"train\"].features\n        {'label': ClassLabel(names=['neg', 'pos']),\n         'text': Value('string')}\n        >>> new_features = ds[\"train\"].features.copy()\n        >>> new_features['label'] = ClassLabel(names=['bad', 'good'])\n        >>> new_features['text'] = Value('large_string')\n        >>> ds = ds.cast(new_features)\n        >>> ds[\"train\"].features\n        {'label': ClassLabel(names=['bad', 'good']),\n         'text': Value('large_string')}\n        ```\n        \"\"\"\n        self._check_values_type()\n        return DatasetDict({k: dataset.cast(features=features) for k, dataset in self.items()})\n\n    def cast_column(self, column: str, feature) -> \"DatasetDict\":\n        \"\"\"Cast column to feature for decoding.\n\n        Args:\n            column (`str`):\n                Column name.\n            feature ([`Feature`]):\n                Target feature.\n\n        Returns:\n            [`DatasetDict`]\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset, ClassLabel\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds[\"train\"].features\n        {'label': ClassLabel(names=['neg', 'pos']),\n         'text': Value('string')}\n        >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))\n        >>> ds[\"train\"].features\n        {'label': ClassLabel(names=['bad', 'good']),\n         'text': Value('string')}\n        ```\n        \"\"\"\n        self._check_values_type()\n        return DatasetDict({k: dataset.cast_column(column=column, feature=feature) for k, dataset in self.items()})\n\n    def remove_columns(self, column_names: Union[str, list[str]]) -> \"DatasetDict\":\n   
     \"\"\"\n        Remove one or several column(s) from each split in the dataset\n        and the features associated to the column(s).\n\n        The transformation is applied to all the splits of the dataset dictionary.\n\n        You can also remove a column using [`~DatasetDict.map`] with `remove_columns` but the present method\n        doesn't copy the data of the remaining columns and is thus faster.\n\n        Args:\n            column_names (`Union[str, list[str]]`):\n                Name of the column(s) to remove.\n\n        Returns:\n            [`DatasetDict`]: A copy of the dataset object without the columns to remove.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds = ds.remove_columns(\"label\")\n        DatasetDict({\n            train: Dataset({\n                features: ['text'],\n                num_rows: 8530\n            })\n            validation: Dataset({\n                features: ['text'],\n                num_rows: 1066\n            })\n            test: Dataset({\n                features: ['text'],\n                num_rows: 1066\n            })\n        })\n        ```\n        \"\"\"\n        self._check_values_type()\n        return DatasetDict({k: dataset.remove_columns(column_names=column_names) for k, dataset in self.items()})\n\n    def rename_column(self, original_column_name: str, new_column_name: str) -> \"DatasetDict\":\n        \"\"\"\n        Rename a column in the dataset and move the features associated to the original column under the new column name.\n        The transformation is applied to all the datasets of the dataset dictionary.\n\n        You can also rename a column using [`~DatasetDict.map`] with `remove_columns` but the present method:\n            - takes care of moving the original features under the new column name.\n            - doesn't copy the data to a new dataset and is 
thus much faster.\n\n        Args:\n            original_column_name (`str`):\n                Name of the column to rename.\n            new_column_name (`str`):\n                New name for the column.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds = ds.rename_column(\"label\", \"label_new\")\n        DatasetDict({\n            train: Dataset({\n                features: ['text', 'label_new'],\n                num_rows: 8530\n            })\n            validation: Dataset({\n                features: ['text', 'label_new'],\n                num_rows: 1066\n            })\n            test: Dataset({\n                features: ['text', 'label_new'],\n                num_rows: 1066\n            })\n        })\n        ```\n        \"\"\"\n        self._check_values_type()\n        return DatasetDict(\n            {\n                k: dataset.rename_column(\n                    original_column_name=original_column_name,\n                    new_column_name=new_column_name,\n                )\n                for k, dataset in self.items()\n            }\n        )\n\n    def rename_columns(self, column_mapping: dict[str, str]) -> \"DatasetDict\":\n        \"\"\"\n        Rename several columns in the dataset, and move the features associated to the original columns under\n        the new column names.\n        The transformation is applied to all the datasets of the dataset dictionary.\n\n        Args:\n            column_mapping (`Dict[str, str]`):\n                A mapping of columns to rename to their new names.\n\n        Returns:\n            [`DatasetDict`]: A copy of the dataset with renamed columns.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.rename_columns({'text': 'text_new', 'label': 
'label_new'})\n        DatasetDict({\n            train: Dataset({\n                features: ['text_new', 'label_new'],\n                num_rows: 8530\n            })\n            validation: Dataset({\n                features: ['text_new', 'label_new'],\n                num_rows: 1066\n            })\n            test: Dataset({\n                features: ['text_new', 'label_new'],\n                num_rows: 1066\n            })\n        })\n        ```\n        \"\"\"\n        self._check_values_type()\n        return DatasetDict({k: dataset.rename_columns(column_mapping=column_mapping) for k, dataset in self.items()})\n\n    def select_columns(self, column_names: Union[str, list[str]]) -> \"DatasetDict\":\n        \"\"\"Select one or several column(s) from each split in the dataset and\n        the features associated to the column(s).\n\n        The transformation is applied to all the splits of the dataset\n        dictionary.\n\n        Args:\n            column_names (`Union[str, list[str]]`):\n                Name of the column(s) to keep.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.select_columns(\"text\")\n        DatasetDict({\n            train: Dataset({\n                features: ['text'],\n                num_rows: 8530\n            })\n            validation: Dataset({\n                features: ['text'],\n                num_rows: 1066\n            })\n            test: Dataset({\n                features: ['text'],\n                num_rows: 1066\n            })\n        })\n        ```\n        \"\"\"\n        self._check_values_type()\n        return DatasetDict({k: dataset.select_columns(column_names=column_names) for k, dataset in self.items()})\n\n    def class_encode_column(self, column: str, include_nulls: bool = False) -> \"DatasetDict\":\n        \"\"\"Casts the given column as 
[`~datasets.features.ClassLabel`] and updates the tables.\n\n        Args:\n            column (`str`):\n                The name of the column to cast.\n            include_nulls (`bool`, defaults to `False`):\n                Whether to include null values in the class labels. If `True`, the null values will be encoded as the `\"None\"` class label.\n\n                <Added version=\"1.14.2\"/>\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"google/boolq\")\n        >>> ds[\"train\"].features\n        {'answer': Value('bool'),\n         'passage': Value('string'),\n         'question': Value('string')}\n        >>> ds = ds.class_encode_column(\"answer\")\n        >>> ds[\"train\"].features\n        {'answer': ClassLabel(num_classes=2, names=['False', 'True']),\n         'passage': Value('string'),\n         'question': Value('string')}\n        ```\n        \"\"\"\n        self._check_values_type()\n        return DatasetDict(\n            {k: dataset.class_encode_column(column=column, include_nulls=include_nulls) for k, dataset in self.items()}\n        )\n\n    @contextlib.contextmanager\n    def formatted_as(\n        self,\n        type: Optional[str] = None,\n        columns: Optional[list] = None,\n        output_all_columns: bool = False,\n        **format_kwargs,\n    ):\n        \"\"\"To be used in a `with` statement. 
Set `__getitem__` return format (type and columns).\n        The transformation is applied to all the datasets of the dataset dictionary.\n\n        Args:\n            type (`str`, *optional*):\n                Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`.\n                `None` means `__getitem__` returns python objects (default).\n            columns (`list[str]`, *optional*):\n                Columns to format in the output.\n                `None` means `__getitem__` returns all columns (default).\n            output_all_columns (`bool`, defaults to False):\n                Keep un-formatted columns as well in the output (as python objects).\n            **format_kwargs (additional keyword arguments):\n                Keywords arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.\n        \"\"\"\n        self._check_values_type()\n        old_format_type = {k: dataset._format_type for k, dataset in self.items()}\n        old_format_kwargs = {k: dataset._format_kwargs for k, dataset in self.items()}\n        old_format_columns = {k: dataset._format_columns for k, dataset in self.items()}\n        old_output_all_columns = {k: dataset._output_all_columns for k, dataset in self.items()}\n        try:\n            self.set_format(type, columns, output_all_columns, **format_kwargs)\n            yield\n        finally:\n            for k, dataset in self.items():\n                dataset.set_format(\n                    old_format_type[k],\n                    old_format_columns[k],\n                    old_output_all_columns[k],\n                    **old_format_kwargs[k],\n                )\n\n    def set_format(\n        self,\n        type: Optional[str] = None,\n        columns: Optional[list] = None,\n        output_all_columns: bool = False,\n        **format_kwargs,\n    ):\n        \"\"\"Set `__getitem__` return format (type and columns).\n      
  The format is set for every dataset in the dataset dictionary.\n\n        Args:\n            type (`str`, *optional*):\n                Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`.\n                `None` means `__getitem__` returns python objects (default).\n            columns (`list[str]`, *optional*):\n                Columns to format in the output.\n                `None` means `__getitem__` returns all columns (default).\n            output_all_columns (`bool`, defaults to False):\n                Keep un-formatted columns as well in the output (as python objects).\n            **format_kwargs (additional keyword arguments):\n                Keywords arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.\n\n        It is possible to call `map` after calling `set_format`. Since `map` may add new columns, then the list of formatted columns\n        gets updated. In this case, if you apply `map` on a dataset to add a new column, then this column will be formatted:\n\n            `new formatted columns = (all columns - previously unformatted columns)`\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> from transformers import AutoTokenizer\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n        >>> ds = ds.map(lambda x: tokenizer(x[\"text\"], truncation=True, padding=True), batched=True)\n        >>> ds.set_format(type=\"numpy\", columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])\n        >>> ds[\"train\"].format\n        {'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],\n         'format_kwargs': {},\n         'output_all_columns': False,\n         'type': 'numpy'}\n        ```\n        \"\"\"\n        self._check_values_type()\n        for dataset in self.values():\n            dataset.set_format(\n                type=type,\n        
        columns=columns,\n                output_all_columns=output_all_columns,\n                **format_kwargs,\n            )\n\n    def reset_format(self):\n        \"\"\"Reset `__getitem__` return format to python objects and all columns.\n        The transformation is applied to all the datasets of the dataset dictionary.\n\n        Same as `self.set_format()`\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> from transformers import AutoTokenizer\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n        >>> ds = ds.map(lambda x: tokenizer(x[\"text\"], truncation=True, padding=True), batched=True)\n        >>> ds.set_format(type=\"numpy\", columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])\n        >>> ds[\"train\"].format\n        {'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],\n         'format_kwargs': {},\n         'output_all_columns': False,\n         'type': 'numpy'}\n        >>> ds.reset_format()\n        >>> ds[\"train\"].format\n        {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],\n         'format_kwargs': {},\n         'output_all_columns': False,\n         'type': None}\n        ```\n        \"\"\"\n        self._check_values_type()\n        for dataset in self.values():\n            dataset.set_format()\n\n    def set_transform(\n        self,\n        transform: Optional[Callable],\n        columns: Optional[list] = None,\n        output_all_columns: bool = False,\n    ):\n        \"\"\"Set ``__getitem__`` return format using this transform. 
The transform is applied on-the-fly on batches when ``__getitem__`` is called.\n        The transform is set for every dataset in the dataset dictionary.\n        As :func:`datasets.Dataset.set_format`, this can be reset using :func:`datasets.Dataset.reset_format`.\n\n        Args:\n            transform (`Callable`, optional): user-defined formatting transform, replaces the format defined by :func:`datasets.Dataset.set_format`\n                A formatting function is a callable that takes a batch (as a dict) as input and returns a batch.\n                This function is applied right before returning the objects in ``__getitem__``.\n            columns (`list[str]`, optional): columns to format in the output\n                If specified, then the input batch of the transform only contains those columns.\n            output_all_columns (`bool`, default to False): keep un-formatted columns as well in the output (as python objects)\n                If set to True, then the other un-formatted columns are kept with the output of the transform.\n\n        \"\"\"\n        self._check_values_type()\n        for dataset in self.values():\n            dataset.set_format(\n                \"custom\",\n                columns=columns,\n                output_all_columns=output_all_columns,\n                transform=transform,\n            )\n\n    def with_format(\n        self,\n        type: Optional[str] = None,\n        columns: Optional[list] = None,\n        output_all_columns: bool = False,\n        **format_kwargs,\n    ) -> \"DatasetDict\":\n        \"\"\"Set `__getitem__` return format (type and columns). 
The data formatting is applied on-the-fly.\n        The format `type` (for example \"numpy\") is used to format batches when using `__getitem__`.\n        The format is set for every dataset in the dataset dictionary.\n\n        It's also possible to use custom transforms for formatting using [`~datasets.Dataset.with_transform`].\n\n        Contrary to [`~datasets.DatasetDict.set_format`], `with_format` returns a new [`DatasetDict`] object with new [`Dataset`] objects.\n\n        Args:\n            type (`str`, *optional*):\n                Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`.\n                `None` means `__getitem__` returns python objects (default).\n            columns (`list[str]`, *optional*):\n                Columns to format in the output.\n                `None` means `__getitem__` returns all columns (default).\n            output_all_columns (`bool`, defaults to `False`):\n                Keep un-formatted columns as well in the output (as python objects).\n            **format_kwargs (additional keyword arguments):\n                Keywords arguments passed to the convert function like `np.array`, `torch.tensor` or `tensorflow.ragged.constant`.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> from transformers import AutoTokenizer\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n        >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True)\n        >>> ds[\"train\"].format\n        {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],\n         'format_kwargs': {},\n         'output_all_columns': False,\n         'type': None}\n        >>> ds = ds.with_format(\"torch\")\n        >>> ds[\"train\"].format\n        {'columns': ['text', 'label', 'input_ids', 
'token_type_ids', 'attention_mask'],\n         'format_kwargs': {},\n         'output_all_columns': False,\n         'type': 'torch'}\n        >>> ds[\"train\"][0]\n        {'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',\n         'label': tensor(1),\n         'input_ids': tensor([  101, 18027, 16310, 16001,  1103,  9321,   178, 11604,  7235,  6617,\n                1742,  2165,  2820,  1206,  6588, 22572, 12937,  1811,  2153,  1105,\n                1147, 12890, 19587,  6463,  1105, 15026,  1482,   119,   102,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0]),\n         'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),\n         'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n                1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}\n        ```\n        \"\"\"\n        dataset = copy.deepcopy(self)\n        dataset.set_format(\n            type=type,\n            columns=columns,\n            
output_all_columns=output_all_columns,\n            **format_kwargs,\n        )\n        return dataset\n\n    def with_transform(\n        self,\n        transform: Optional[Callable],\n        columns: Optional[list] = None,\n        output_all_columns: bool = False,\n    ) -> \"DatasetDict\":\n        \"\"\"Set `__getitem__` return format using this transform. The transform is applied on-the-fly on batches when `__getitem__` is called.\n        The transform is set for every dataset in the dataset dictionary\n\n        As [`~datasets.Dataset.set_format`], this can be reset using [`~datasets.Dataset.reset_format`].\n\n        Contrary to [`~datasets.DatasetDict.set_transform`], `with_transform` returns a new [`DatasetDict`] object with new [`Dataset`] objects.\n\n        Args:\n            transform (`Callable`, *optional*):\n                User-defined formatting transform, replaces the format defined by [`~datasets.Dataset.set_format`].\n                A formatting function is a callable that takes a batch (as a dict) as input and returns a batch.\n                This function is applied right before returning the objects in `__getitem__`.\n            columns (`list[str]`, *optional*):\n                Columns to format in the output.\n                If specified, then the input batch of the transform only contains those columns.\n            output_all_columns (`bool`, defaults to False):\n                Keep un-formatted columns as well in the output (as python objects).\n                If set to `True`, then the other un-formatted columns are kept with the output of the transform.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> from transformers import AutoTokenizer\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n        >>> def encode(example):\n        ...     
return tokenizer(example['text'], truncation=True, padding=True, return_tensors=\"pt\")\n        >>> ds = ds.with_transform(encode)\n        >>> ds[\"train\"][0]\n        {'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n         1, 1, 1, 1, 1, 1, 1, 1, 1]),\n         'input_ids': tensor([  101,  1103,  2067,  1110, 17348,  1106,  1129,  1103,  6880,  1432,\n                112,   188,  1207,   107, 14255,  1389,   107,  1105,  1115,  1119,\n                112,   188,  1280,  1106,  1294,   170, 24194,  1256,  3407,  1190,\n                170, 11791,  5253,   188,  1732,  7200, 10947, 12606,  2895,   117,\n                179,  7766,   118,   172, 15554,  1181,  3498,  6961,  3263,  1137,\n                188,  1566,  7912, 14516,  6997,   119,   102]),\n         'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0])}\n        ```\n        \"\"\"\n        dataset = copy.deepcopy(self)\n        dataset.set_transform(transform=transform, columns=columns, output_all_columns=output_all_columns)\n        return dataset\n\n    def map(\n        self,\n        function: Optional[Callable] = None,\n        with_indices: bool = False,\n        with_rank: bool = False,\n        with_split: bool = False,\n        input_columns: Optional[Union[str, list[str]]] = None,\n        batched: bool = False,\n        batch_size: Optional[int] = 1000,\n        drop_last_batch: bool = False,\n        remove_columns: Optional[Union[str, list[str]]] = None,\n        keep_in_memory: bool = False,\n        load_from_cache_file: Optional[bool] = None,\n        cache_file_names: Optional[dict[str, Optional[str]]] = None,\n        writer_batch_size: Optional[int] = 1000,\n        features: 
Optional[Features] = None,\n        disable_nullable: bool = False,\n        fn_kwargs: Optional[dict] = None,\n        num_proc: Optional[int] = None,\n        desc: Optional[str] = None,\n        try_original_type: Optional[bool] = True,\n        on_mixed_types: Optional[Literal[\"use_json\"]] = None,\n    ) -> \"DatasetDict\":\n        \"\"\"\n        Apply a function to all the examples in the table (individually or in batches) and update the table.\n        If your function returns a column that already exists, then it overwrites it.\n        The transformation is applied to all the datasets of the dataset dictionary.\n\n        You can specify whether the function should be batched or not with the `batched` parameter:\n\n        - If batched is `False`, then the function takes 1 example in and should return 1 example.\n          An example is a dictionary, e.g. `{\"text\": \"Hello there !\"}`.\n        - If batched is `True` and `batch_size` is 1, then the function takes a batch of 1 example as input and can return a batch with 1 or more examples.\n          A batch is a dictionary, e.g. a batch of 1 example is `{\"text\": [\"Hello there !\"]}`.\n        - If batched is `True` and `batch_size` is `n > 1`, then the function takes a batch of `n` examples as input and can return a batch with `n` examples, or with an arbitrary number of examples.\n          Note that the last batch may have less than `n` examples.\n          A batch is a dictionary, e.g. 
a batch of `n` examples is `{\"text\": [\"Hello there !\"] * n}`.\n\n        If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls.\n        It is recommended to use an `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.\n\n        Args:\n            function (`callable`): with one of the following signatures:\n                - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False`\n                - `function(example: Dict[str, Any], indices: int) -> Dict[str, Any]` if `batched=False` and `with_indices=True`\n                - `function(batch: Dict[str, list]) -> Dict[str, list]` if `batched=True` and `with_indices=False`\n                - `function(batch: Dict[str, list], indices: list[int]) -> Dict[str, list]` if `batched=True` and `with_indices=True`\n\n                For advanced usage, the function can also return a `pyarrow.Table`.\n                If the function is asynchronous, then `map` will run your function in parallel.\n                Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged.\n                If no function is provided, default to identity function: `lambda x: x`.\n            with_indices (`bool`, defaults to `False`):\n                Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx): ...`.\n            with_rank (`bool`, defaults to `False`):\n                Provide process rank to `function`. Note that in this case the\n                signature of `function` should be `def function(example[, idx], rank): ...`.\n            with_split (`bool`, defaults to `False`):\n                Provide process split to `function`. 
Note that in this case the\n                signature of `function` should be `def function(example[, idx], split): ...`.\n            input_columns (`[Union[str, list[str]]]`, *optional*, defaults to `None`):\n                The columns to be passed into `function` as\n                positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.\n            batched (`bool`, defaults to `False`):\n                Provide batch of examples to `function`.\n            batch_size (`int`, *optional*, defaults to `1000`):\n                Number of examples per batch provided to `function` if `batched=True`,\n                `batch_size <= 0` or `batch_size == None` then provide the full dataset as a single batch to `function`.\n            drop_last_batch (`bool`, defaults to `False`):\n                Whether a last batch smaller than the batch_size should be\n                dropped instead of being processed by the function.\n            remove_columns (`[Union[str, list[str]]]`, *optional*, defaults to `None`):\n                Remove a selection of columns while doing the mapping.\n                Columns will be removed before updating the examples with the output of `function`, i.e. if `function` is adding\n                columns with names in `remove_columns`, these columns will be kept.\n            keep_in_memory (`bool`, defaults to `False`):\n                Keep the dataset in memory instead of writing it to a cache file.\n            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):\n                If a cache file storing the current computation from `function`\n                can be identified, use it instead of recomputing.\n            cache_file_names (`[Dict[str, str]]`, *optional*, defaults to `None`):\n                Provide the name of a path for the cache file. 
It is used to store the\n                results of the computation instead of the automatically generated cache file name.\n                You have to provide one `cache_file_name` per dataset in the dataset dictionary.\n            writer_batch_size (`int`, default `1000`):\n                Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.\n            features (`[datasets.Features]`, *optional*, defaults to `None`):\n                Use a specific [`Features`] to store the cache file\n                instead of the automatically generated one.\n            disable_nullable (`bool`, defaults to `False`):\n                Disallow null values in the table.\n            fn_kwargs (`Dict`, *optional*, defaults to `None`):\n                Keyword arguments to be passed to `function`\n            num_proc (`int`, *optional*, defaults to `None`):\n                 The number of processes to use for multiprocessing.\n                - If `None` or `0`, no multiprocessing is used and the operation runs in the main process.\n                - If greater than `1`, one or multiple worker processes are used to process data in parallel.\n                 Note: The function passed to `map()` must be picklable for multiprocessing to work correctly\n                 (i.e., prefer functions defined at the top level of a module, not inside another function or class).\n            desc (`str`, *optional*, defaults to `None`):\n                Meaningful description to be displayed alongside with the progress bar while mapping examples.\n            try_original_type (`Optional[bool]`, defaults to `True`):\n                Try to keep the types of the original columns (e.g. 
int32 -> int32).\n                Set to False if you want to always infer new types.\n            on_mixed_types (`Literal[\"use_json\"]`, *optional*, defaults to `None`):\n                If \"use_json\", use the Json() type for mixed-types fields,\n                i.e. unstructured fields that contain data without a predefined schema.\n                In this case, a field with mixed type is set to Json().\n\n                This allows loading lists with a mix of strings/integers/floats\n                for example, or dictionaries with arbitrary value types.\n\n                <Added version=\"4.7.0\"/>\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> def add_prefix(example):\n        ...     example[\"text\"] = \"Review: \" + example[\"text\"]\n        ...     return example\n        >>> ds = ds.map(add_prefix)\n        >>> ds[\"train\"][0:3][\"text\"]\n        ['Review: the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',\n         'Review: the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . 
tolkien\\'s middle-earth .',\n         'Review: effective but too-tepid biopic']\n\n        # process a batch of examples\n        >>> ds = ds.map(lambda example: tokenizer(example[\"text\"]), batched=True)\n        # set number of processors\n        >>> ds = ds.map(add_prefix, num_proc=4)\n        ```\n        \"\"\"\n        self._check_values_type()\n        if cache_file_names is None:\n            cache_file_names = dict.fromkeys(self)\n\n        dataset_dict = {}\n        for split, dataset in self.items():\n            if with_split:\n                function = bind(function, split)\n\n            dataset_dict[split] = dataset.map(\n                function=function,\n                with_indices=with_indices,\n                with_rank=with_rank,\n                input_columns=input_columns,\n                batched=batched,\n                batch_size=batch_size,\n                drop_last_batch=drop_last_batch,\n                remove_columns=remove_columns,\n                keep_in_memory=keep_in_memory,\n                load_from_cache_file=load_from_cache_file,\n                cache_file_name=cache_file_names[split],\n                writer_batch_size=writer_batch_size,\n                features=features,\n                disable_nullable=disable_nullable,\n                fn_kwargs=fn_kwargs,\n                num_proc=num_proc,\n                desc=desc,\n                try_original_type=try_original_type,\n                on_mixed_types=on_mixed_types,\n            )\n\n            if with_split:\n                function = function.func\n\n        return DatasetDict(dataset_dict)\n\n    def filter(\n        self,\n        function: Optional[Callable] = None,\n        with_indices: bool = False,\n        with_rank: bool = False,\n        input_columns: Optional[Union[str, list[str]]] = None,\n        batched: bool = False,\n        batch_size: Optional[int] = 1000,\n        keep_in_memory: bool = False,\n        load_from_cache_file: 
Optional[bool] = None,\n        cache_file_names: Optional[dict[str, Optional[str]]] = None,\n        writer_batch_size: Optional[int] = 1000,\n        fn_kwargs: Optional[dict] = None,\n        num_proc: Optional[int] = None,\n        desc: Optional[str] = None,\n    ) -> \"DatasetDict\":\n        \"\"\"Apply a filter function to all the elements in the table in batches\n        and update the table so that the dataset only includes examples according to the filter function.\n        The transformation is applied to all the datasets of the dataset dictionary.\n\n        Args:\n            function (`Callable`): Callable with one of the following signatures:\n\n                - `function(example: Dict[str, Any]) -> bool` if `batched=False` and `with_indices=False` and `with_rank=False`\n                - `function(example: Dict[str, Any], *extra_args) -> bool` if `batched=False` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)\n                - `function(batch: Dict[str, list]) -> list[bool]` if `batched=True` and `with_indices=False` and `with_rank=False`\n                - `function(batch: Dict[str, list], *extra_args) -> list[bool]` if `batched=True` and `with_indices=True` and/or `with_rank=True` (one extra arg for each)\n\n                If no function is provided, defaults to an always `True` function: `lambda x: True`.\n            with_indices (`bool`, defaults to `False`):\n                Provide example indices to `function`. Note that in this case the\n                signature of `function` should be `def function(example, idx[, rank]): ...`.\n            with_rank (`bool`, defaults to `False`):\n                Provide process rank to `function`. 
Note that in this case the\n                signature of `function` should be `def function(example[, idx], rank): ...`.\n            input_columns (`[Union[str, list[str]]]`, *optional*, defaults to `None`):\n                The columns to be passed into `function` as\n                positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.\n            batched (`bool`, defaults to `False`):\n                Provide batch of examples to `function`.\n            batch_size (`int`, *optional*, defaults to `1000`):\n                Number of examples per batch provided to `function` if `batched=True`\n                `batch_size <= 0` or `batch_size == None` then provide the full dataset as a single batch to `function`.\n            keep_in_memory (`bool`, defaults to `False`):\n                Keep the dataset in memory instead of writing it to a cache file.\n            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):\n                If a cache file storing the current computation from `function`\n                can be identified, use it instead of recomputing.\n            cache_file_names (`[Dict[str, str]]`, *optional*, defaults to `None`):\n                Provide the name of a path for the cache file. 
It is used to store the\n                results of the computation instead of the automatically generated cache file name.\n                You have to provide one `cache_file_name` per dataset in the dataset dictionary.\n            writer_batch_size (`int`, defaults to `1000`):\n                Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.\n            fn_kwargs (`Dict`, *optional*, defaults to `None`):\n                Keyword arguments to be passed to `function`\n            num_proc (`int`, *optional*, defaults to `None`):\n                 The number of processes to use for multiprocessing.\n                - If `None` or `0`, no multiprocessing is used and the operation runs in the main process.\n                - If greater than `1`, one or multiple worker processes are used to process data in parallel.\n                 Note: The function passed to `map()` must be picklable for multiprocessing to work correctly\n                 (i.e., prefer functions defined at the top level of a module, not inside another function or class).\n            desc (`str`, *optional*, defaults to `None`):\n                Meaningful description to be displayed alongside with the progress bar while filtering examples.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.filter(lambda x: x[\"label\"] == 1)\n        DatasetDict({\n            train: Dataset({\n                features: ['text', 'label'],\n                num_rows: 4265\n            })\n            validation: Dataset({\n                features: ['text', 'label'],\n                num_rows: 533\n            })\n            test: 
Dataset({\n                features: ['text', 'label'],\n                num_rows: 533\n            })\n        })\n        ```\n        \"\"\"\n        self._check_values_type()\n        if cache_file_names is None:\n            cache_file_names = dict.fromkeys(self)\n        return DatasetDict(\n            {\n                k: dataset.filter(\n                    function=function,\n                    with_indices=with_indices,\n                    with_rank=with_rank,\n                    input_columns=input_columns,\n                    batched=batched,\n                    batch_size=batch_size,\n                    keep_in_memory=keep_in_memory,\n                    load_from_cache_file=load_from_cache_file,\n                    cache_file_name=cache_file_names[k],\n                    writer_batch_size=writer_batch_size,\n                    fn_kwargs=fn_kwargs,\n                    num_proc=num_proc,\n                    desc=desc,\n                )\n                for k, dataset in self.items()\n            }\n        )\n\n    def flatten_indices(\n        self,\n        keep_in_memory: bool = False,\n        cache_file_names: Optional[dict[str, Optional[str]]] = None,\n        writer_batch_size: Optional[int] = 1000,\n        features: Optional[Features] = None,\n        disable_nullable: bool = False,\n        num_proc: Optional[int] = None,\n        new_fingerprint: Optional[str] = None,\n    ) -> \"DatasetDict\":\n        \"\"\"Create and cache a new Dataset by flattening the indices mapping.\n\n        Args:\n            keep_in_memory (`bool`, defaults to `False`):\n                Keep the dataset in memory instead of writing it to a cache file.\n            cache_file_names (`Dict[str, str]`, *optional*, default `None`):\n                Provide the name of a path for the cache file. 
It is used to store the\n                results of the computation instead of the automatically generated cache file name.\n                You have to provide one `cache_file_name` per dataset in the dataset dictionary.\n            writer_batch_size (`int`, defaults to `1000`):\n                Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.\n            features (`Optional[datasets.Features]`, defaults to `None`):\n                Use a specific [`Features`] to store the cache file\n                instead of the automatically generated one.\n            disable_nullable (`bool`, defaults to `False`):\n                Allow null values in the table.\n            num_proc (`int`, optional, default `None`):\n                Max number of processes when generating cache. 
Already cached shards are loaded sequentially\n            new_fingerprint (`str`, *optional*, defaults to `None`):\n                The new fingerprint of the dataset after transform.\n                If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments\n        \"\"\"\n        self._check_values_type()\n        if cache_file_names is None:\n            cache_file_names = dict.fromkeys(self)\n        return DatasetDict(\n            {\n                k: dataset.flatten_indices(\n                    keep_in_memory=keep_in_memory,\n                    cache_file_name=cache_file_names[k],\n                    writer_batch_size=writer_batch_size,\n                    features=features,\n                    disable_nullable=disable_nullable,\n                    num_proc=num_proc,\n                    new_fingerprint=new_fingerprint,\n                )\n                for k, dataset in self.items()\n            }\n        )\n\n    def sort(\n        self,\n        column_names: Union[str, Sequence[str]],\n        reverse: Union[bool, Sequence[bool]] = False,\n        null_placement: str = \"at_end\",\n        keep_in_memory: bool = False,\n        load_from_cache_file: Optional[bool] = None,\n        indices_cache_file_names: Optional[dict[str, Optional[str]]] = None,\n        writer_batch_size: Optional[int] = 1000,\n    ) -> \"DatasetDict\":\n        \"\"\"Create a new dataset sorted according to a single or multiple columns.\n\n        Args:\n            column_names (`Union[str, Sequence[str]]`):\n                Column name(s) to sort by.\n            reverse (`Union[bool, Sequence[bool]]`, defaults to `False`):\n                If `True`, sort by descending order rather than ascending. If a single bool is provided,\n                the value is applied to the sorting of all column names. 
Otherwise a list of bools with the\n                same length and order as column_names must be provided.\n            null_placement (`str`, defaults to `at_end`):\n                Put `None` values at the beginning if `at_start` or `first` or at the end if `at_end` or `last`\n            keep_in_memory (`bool`, defaults to `False`):\n                Keep the sorted indices in memory instead of writing it to a cache file.\n            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):\n                If a cache file storing the sorted indices\n                can be identified, use it instead of recomputing.\n            indices_cache_file_names (`[Dict[str, str]]`, *optional*, defaults to `None`):\n                Provide the name of a path for the cache file. It is used to store the\n                indices mapping instead of the automatically generated cache file name.\n                You have to provide one `cache_file_name` per dataset in the dataset dictionary.\n            writer_batch_size (`int`, defaults to `1000`):\n                Number of rows per write operation for the cache file writer.\n                Higher value gives smaller cache files, lower value consume less temporary memory.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes')\n        >>> ds['train']['label'][:10]\n        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n        >>> sorted_ds = ds.sort('label')\n        >>> sorted_ds['train']['label'][:10]\n        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n        >>> another_sorted_ds = ds.sort(['label', 'text'], reverse=[True, False])\n        >>> another_sorted_ds['train']['label'][:10]\n        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n        ```\n        \"\"\"\n        self._check_values_type()\n        if indices_cache_file_names is None:\n            indices_cache_file_names = dict.fromkeys(self)\n        return 
DatasetDict(\n            {\n                k: dataset.sort(\n                    column_names=column_names,\n                    reverse=reverse,\n                    null_placement=null_placement,\n                    keep_in_memory=keep_in_memory,\n                    load_from_cache_file=load_from_cache_file,\n                    indices_cache_file_name=indices_cache_file_names[k],\n                    writer_batch_size=writer_batch_size,\n                )\n                for k, dataset in self.items()\n            }\n        )\n\n    def shuffle(\n        self,\n        seeds: Optional[Union[int, dict[str, Optional[int]]]] = None,\n        seed: Optional[int] = None,\n        generators: Optional[dict[str, np.random.Generator]] = None,\n        keep_in_memory: bool = False,\n        load_from_cache_file: Optional[bool] = None,\n        indices_cache_file_names: Optional[dict[str, Optional[str]]] = None,\n        writer_batch_size: Optional[int] = 1000,\n    ) -> \"DatasetDict\":\n        \"\"\"Create a new Dataset where the rows are shuffled.\n\n        The transformation is applied to all the datasets of the dataset dictionary.\n\n        Currently shuffling uses numpy random generators.\n        You can either supply a NumPy BitGenerator to use, or a seed to initiate NumPy's default random generator (PCG64).\n\n        Args:\n            seeds (`Dict[str, int]` or `int`, *optional*):\n                A seed to initialize the default BitGenerator if `generator=None`.\n                If `None`, then fresh, unpredictable entropy will be pulled from the OS.\n                If an `int` or `array_like[ints]` is passed, then it will be passed to SeedSequence to derive the initial BitGenerator state.\n                You can provide one `seed` per dataset in the dataset dictionary.\n            seed (`int`, *optional*):\n                A seed to initialize the default BitGenerator if `generator=None`. 
Alias for seeds (a `ValueError` is raised if both are provided).\n            generators (`Dict[str, np.random.Generator]`, *optional*):\n                Numpy random Generator to use to compute the permutation of the dataset rows.\n                If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy).\n                You have to provide one `generator` per dataset in the dataset dictionary.\n            keep_in_memory (`bool`, defaults to `False`):\n                Keep the dataset in memory instead of writing it to a cache file.\n            load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled):\n                If a cache file storing the current computation from `function`\n                can be identified, use it instead of recomputing.\n            indices_cache_file_names (`Dict[str, str]`, *optional*):\n                Provide the name of a path for the cache file. It is used to store the\n                indices mappings instead of the automatically generated cache file name.\n                You have to provide one `cache_file_name` per dataset in the dataset dictionary.\n            writer_batch_size (`int`, defaults to `1000`):\n                Number of rows per write operation for the cache file writer.\n                This value is a good trade-off between memory usage during the processing, and processing speed.\n                Higher value makes the processing do fewer lookups, lower value consume less temporary memory while running `map`.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds[\"train\"][\"label\"][:10]\n        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n\n        # set a seed\n        >>> shuffled_ds = ds.shuffle(seed=42)\n        >>> shuffled_ds[\"train\"][\"label\"][:10]\n        [0, 1, 0, 1, 0, 0, 0, 0, 0, 0]\n        ```\n        
\"\"\"\n        self._check_values_type()\n        if seed is not None and seeds is not None:\n            raise ValueError(\"Please specify seed or seeds, but not both\")\n        seeds = seed if seed is not None else seeds\n        if seeds is None:\n            seeds = dict.fromkeys(self)\n        elif not isinstance(seeds, dict):\n            seeds = dict.fromkeys(self, seeds)\n        if generators is None:\n            generators = dict.fromkeys(self)\n        if indices_cache_file_names is None:\n            indices_cache_file_names = dict.fromkeys(self)\n        return DatasetDict(\n            {\n                k: dataset.shuffle(\n                    seed=seeds[k],\n                    generator=generators[k],\n                    keep_in_memory=keep_in_memory,\n                    load_from_cache_file=load_from_cache_file,\n                    indices_cache_file_name=indices_cache_file_names[k],\n                    writer_batch_size=writer_batch_size,\n                )\n                for k, dataset in self.items()\n            }\n        )\n\n    def save_to_disk(\n        self,\n        dataset_dict_path: PathLike,\n        max_shard_size: Optional[Union[str, int]] = None,\n        num_shards: Optional[dict[str, int]] = None,\n        num_proc: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n    ):\n        \"\"\"\n        Saves a dataset dict to a filesystem using `fsspec.spec.AbstractFileSystem`.\n\n        For [`Image`], [`Audio`] and [`Video`] data:\n\n        All the Image(), Audio() and Video() data are stored in the arrow files.\n        If you want to store paths or urls, please use the Value(\"string\") type.\n\n        Args:\n            dataset_dict_path (`path-like`):\n                Path (e.g. `dataset/train`) or remote URI (e.g. 
`s3://my-bucket/dataset/train`)\n                of the dataset dict directory where the dataset dict will be saved to.\n            max_shard_size (`int` or `str`, *optional*, defaults to `\"500MB\"`):\n                The maximum size of the dataset shards to be saved to the filesystem. If expressed as a string, needs to be digits followed by a unit\n                (like `\"50MB\"`).\n            num_shards (`Dict[str, int]`, *optional*):\n                Number of shards to write. By default the number of shards depends on `max_shard_size` and `num_proc`.\n                You need to provide the number of shards for each dataset in the dataset dictionary.\n                Use a dictionary to define a different num_shards for each split.\n\n                <Added version=\"2.8.0\"/>\n            num_proc (`int`, *optional*, default `None`):\n                Number of processes when downloading and generating the dataset locally.\n                Multiprocessing is disabled by default.\n\n                <Added version=\"2.8.0\"/>\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n\n                <Added version=\"2.8.0\"/>\n\n        Example:\n\n        ```python\n        >>> dataset_dict.save_to_disk(\"path/to/dataset/directory\")\n        >>> dataset_dict.save_to_disk(\"path/to/dataset/directory\", max_shard_size=\"1GB\")\n        >>> dataset_dict.save_to_disk(\"path/to/dataset/directory\", num_shards={\"train\": 1024, \"test\": 8})\n        ```\n        \"\"\"\n        fs: fsspec.AbstractFileSystem\n        fs, _ = url_to_fs(dataset_dict_path, **(storage_options or {}))\n\n        if num_shards is None:\n            num_shards = dict.fromkeys(self)\n        elif not isinstance(num_shards, dict):\n            raise ValueError(\n                \"Please provide one `num_shards` per dataset in the dataset dictionary, e.g. 
{{'train': 128, 'test': 4}}\"\n            )\n\n        fs.makedirs(dataset_dict_path, exist_ok=True)\n\n        with fs.open(\n            posixpath.join(dataset_dict_path, config.DATASETDICT_JSON_FILENAME),\n            \"w\",\n            encoding=\"utf-8\",\n        ) as f:\n            json.dump({\"splits\": list(self)}, f)\n        for k, dataset in self.items():\n            dataset.save_to_disk(\n                posixpath.join(dataset_dict_path, k),\n                num_shards=num_shards.get(k),\n                max_shard_size=max_shard_size,\n                num_proc=num_proc,\n                storage_options=storage_options,\n            )\n\n    @staticmethod\n    def load_from_disk(\n        dataset_dict_path: PathLike,\n        keep_in_memory: Optional[bool] = None,\n        storage_options: Optional[dict] = None,\n    ) -> \"DatasetDict\":\n        \"\"\"\n        Load a dataset that was previously saved using [`save_to_disk`] from a filesystem using `fsspec.spec.AbstractFileSystem`.\n\n        Args:\n            dataset_dict_path (`path-like`):\n                Path (e.g. `\"dataset/train\"`) or remote URI (e.g. `\"s3://my-bucket/dataset/train\"`)\n                of the dataset dict directory where the dataset dict will be loaded from.\n            keep_in_memory (`bool`, defaults to `None`):\n                Whether to copy the dataset in-memory. If `None`, the\n                dataset will not be copied in-memory unless explicitly enabled by setting\n                `datasets.config.IN_MEMORY_MAX_SIZE` to nonzero. 
See more details in the\n                [improve performance](../cache#improve-performance) section.\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n\n                <Added version=\"2.8.0\"/>\n\n        Returns:\n            [`DatasetDict`]\n\n        Example:\n\n        ```py\n        >>> ds = load_from_disk('path/to/dataset/directory')\n        ```\n        \"\"\"\n        fs: fsspec.AbstractFileSystem\n        fs, dataset_dict_path = url_to_fs(dataset_dict_path, **(storage_options or {}))\n\n        dataset_dict_json_path = posixpath.join(dataset_dict_path, config.DATASETDICT_JSON_FILENAME)\n        dataset_state_json_path = posixpath.join(dataset_dict_path, config.DATASET_STATE_JSON_FILENAME)\n        dataset_info_path = posixpath.join(dataset_dict_path, config.DATASET_INFO_FILENAME)\n        if not fs.isfile(dataset_dict_json_path):\n            if fs.isfile(dataset_info_path) and fs.isfile(dataset_state_json_path):\n                raise FileNotFoundError(\n                    f\"No such file: '{dataset_dict_json_path}'. Expected to load a `DatasetDict` object, but got a `Dataset`. Please use either `datasets.load_from_disk` or `Dataset.load_from_disk` instead.\"\n                )\n            raise FileNotFoundError(\n                f\"No such file: '{dataset_dict_json_path}'. 
Expected to load a `DatasetDict` object, but provided path is not a `DatasetDict`.\"\n            )\n\n        with fs.open(dataset_dict_json_path, \"r\", encoding=\"utf-8\") as f:\n            splits = json.load(f)[\"splits\"]\n\n        dataset_dict = DatasetDict()\n        for k in splits:\n            dataset_dict_split_path = posixpath.join(fs.unstrip_protocol(dataset_dict_path), k)\n            dataset_dict[k] = Dataset.load_from_disk(\n                dataset_dict_split_path,\n                keep_in_memory=keep_in_memory,\n                storage_options=storage_options,\n            )\n        return dataset_dict\n\n    @staticmethod\n    def from_csv(\n        path_or_paths: dict[str, PathLike],\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        **kwargs,\n    ) -> \"DatasetDict\":\n        \"\"\"Create [`DatasetDict`] from CSV file(s).\n\n        Args:\n            path_or_paths (`dict` of path-like):\n                Path(s) of the CSV file(s).\n            features ([`Features`], *optional*):\n                Dataset features.\n            cache_dir (str, *optional*, defaults to `\"~/.cache/huggingface/datasets\"`):\n                Directory to cache data.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`pandas.read_csv`].\n\n        Returns:\n            [`DatasetDict`]\n\n        Example:\n\n        ```py\n        >>> from datasets import DatasetDict\n        >>> ds = DatasetDict.from_csv({'train': 'path/to/dataset.csv'})\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.csv import CsvDatasetReader\n\n        return CsvDatasetReader(\n            path_or_paths,\n            features=features,\n            cache_dir=cache_dir,\n            
keep_in_memory=keep_in_memory,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_json(\n        path_or_paths: dict[str, PathLike],\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        **kwargs,\n    ) -> \"DatasetDict\":\n        \"\"\"Create [`DatasetDict`] from JSON Lines file(s).\n\n        Args:\n            path_or_paths (`dict` of path-like):\n                Path(s) of the JSON Lines file(s).\n            features ([`Features`], *optional*):\n                Dataset features.\n            cache_dir (`str`, *optional*, defaults to `\"~/.cache/huggingface/datasets\"`):\n                Directory to cache data.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`JsonConfig`].\n\n        Returns:\n            [`DatasetDict`]\n\n        Example:\n\n        ```py\n        >>> from datasets import DatasetDict\n        >>> ds = DatasetDict.from_json({'train': 'path/to/dataset.json'})\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.json import JsonDatasetReader\n\n        return JsonDatasetReader(\n            path_or_paths,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_parquet(\n        path_or_paths: dict[str, PathLike],\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        columns: Optional[list[str]] = None,\n        **kwargs,\n    ) -> \"DatasetDict\":\n        \"\"\"Create [`DatasetDict`] from Parquet file(s).\n\n        Args:\n            path_or_paths (`dict` of path-like):\n                Path(s) of the Parquet file(s).\n     
       features ([`Features`], *optional*):\n                Dataset features.\n            cache_dir (`str`, *optional*, defaults to `\"~/.cache/huggingface/datasets\"`):\n                Directory to cache data.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            columns (`list[str]`, *optional*):\n                If not `None`, only these columns will be read from the file.\n                A column name may be a prefix of a nested field, e.g. 'a' will select\n                'a.b', 'a.c', and 'a.d.e'.\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`ParquetConfig`].\n\n        Returns:\n            [`DatasetDict`]\n\n        Example:\n\n        ```py\n        >>> from datasets import DatasetDict\n        >>> ds = DatasetDict.from_parquet({'train': 'path/to/dataset/parquet'})\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.parquet import ParquetDatasetReader\n\n        return ParquetDatasetReader(\n            path_or_paths,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            columns=columns,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_text(\n        path_or_paths: dict[str, PathLike],\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        **kwargs,\n    ) -> \"DatasetDict\":\n        \"\"\"Create [`DatasetDict`] from text file(s).\n\n        Args:\n            path_or_paths (`dict` of path-like):\n                Path(s) of the text file(s).\n            features ([`Features`], *optional*):\n                Dataset features.\n            cache_dir (`str`, *optional*, defaults to `\"~/.cache/huggingface/datasets\"`):\n                Directory to cache data.\n            keep_in_memory (`bool`, defaults to 
`False`):\n                Whether to copy the data in-memory.\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`TextConfig`].\n\n        Returns:\n            [`DatasetDict`]\n\n        Example:\n\n        ```py\n        >>> from datasets import DatasetDict\n        >>> ds = DatasetDict.from_text({'train': 'path/to/dataset.txt'})\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.text import TextDatasetReader\n\n        return TextDatasetReader(\n            path_or_paths,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            **kwargs,\n        ).read()\n\n    @is_documented_by(Dataset.align_labels_with_mapping)\n    def align_labels_with_mapping(self, label2id: dict, label_column: str) -> \"DatasetDict\":\n        self._check_values_type()\n        return DatasetDict(\n            {\n                k: dataset.align_labels_with_mapping(label2id=label2id, label_column=label_column)\n                for k, dataset in self.items()\n            }\n        )\n\n    def push_to_hub(\n        self,\n        repo_id: str,\n        config_name: str = \"default\",\n        set_default: Optional[bool] = None,\n        data_dir: Optional[str] = None,\n        commit_message: Optional[str] = None,\n        commit_description: Optional[str] = None,\n        private: Optional[bool] = None,\n        token: Optional[str] = None,\n        revision: Optional[str] = None,\n        create_pr: Optional[bool] = False,\n        max_shard_size: Optional[Union[int, str]] = None,\n        num_shards: Optional[dict[str, int]] = None,\n        embed_external_files: bool = True,\n        num_proc: Optional[int] = None,\n    ) -> Optional[CommitInfo]:\n        \"\"\"Pushes the [`DatasetDict`] to the hub as a Parquet dataset.\n        The [`DatasetDict`] is pushed using HTTP requests and does not need to have either git 
or git-lfs installed.\n\n        Each dataset split will be pushed independently. The pushed dataset will keep the original split names.\n\n        The resulting Parquet files are self-contained by default: if your dataset contains [`Image`] or [`Audio`]\n        data, the Parquet files will store the bytes of your images or audio files.\n        You can disable this by setting `embed_external_files` to False.\n\n        Args:\n            repo_id (`str`):\n                The ID of the repository to push to in the following format: `<user>/<dataset_name>` or\n                `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace\n                of the logged-in user.\n\n                It could also be a location inside a bucket, e.g. `buckets/<user_or_org>/<bucket_name>/...`\n            config_name (`str`):\n                Configuration name of a dataset. Defaults to \"default\".\n            set_default (`bool`, *optional*):\n                Whether to set this configuration as the default one. Otherwise, the default configuration is the one\n                named \"default\".\n            data_dir (`str`, *optional*):\n                Directory name that will contain the uploaded data files. Defaults to the `config_name` if different\n                from \"default\", else \"data\".\n\n                <Added version=\"2.17.0\"/>\n            commit_message (`str`, *optional*):\n                Message to commit while pushing. Will default to `\"Upload dataset\"`.\n            commit_description (`str`, *optional*):\n                Description of the commit that will be created.\n                Additionally, description of the PR if a PR is created (`create_pr` is True).\n\n                <Added version=\"2.16.0\"/>\n            private (`bool`, *optional*):\n                Whether to make the repo private. If `None` (default), the repo will be public unless the\n                organization's default is private. 
This value is ignored if the repo already exists.\n            token (`str`, *optional*):\n                An optional authentication token for the Hugging Face Hub. If no token is passed, will default\n                to the token saved locally when logging in with `huggingface-cli login`. Will raise an error\n                if no token is passed and the user is not logged-in.\n            revision (`str`, *optional*):\n                Branch to push the uploaded files to. Defaults to the `\"main\"` branch.\n\n                <Added version=\"2.15.0\"/>\n            create_pr (`bool`, *optional*, defaults to `False`):\n                Whether to create a PR with the uploaded files or directly commit.\n\n                <Added version=\"2.15.0\"/>\n            max_shard_size (`int` or `str`, *optional*, defaults to `\"500MB\"`):\n                The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a unit\n                (like `\"500MB\"` or `\"1GB\"`).\n            num_shards (`Dict[str, int]`, *optional*):\n                Number of shards to write. 
By default, the number of shards depends on `max_shard_size`.\n                Use a dictionary to define a different num_shards for each split.\n\n                <Added version=\"2.8.0\"/>\n            embed_external_files (`bool`, defaults to `True`):\n                Whether to embed file bytes in the shards.\n                In particular, this will do the following before the push for the fields of type:\n\n                - [`Audio`] and [`Image`] removes local path information and embed file content in the Parquet files.\n            num_proc (`int`, *optional*, defaults to `None`):\n                Number of processes when preparing and uploading the dataset.\n                This is helpful if the dataset is made of many samples or media files to embed.\n                It uses \"spawn\" context to work with hf_xet, the rust client for fast uploads to HF.\n                Multiprocessing is disabled by default.\n\n                <Added version=\"4.0.0\"/>\n\n        Return:\n            huggingface_hub.CommitInfo\n\n        Example:\n\n        ```python\n        >>> dataset_dict.push_to_hub(\"<organization>/<dataset_id>\")\n        >>> dataset_dict.push_to_hub(\"<organization>/<dataset_id>\", private=True)\n        >>> dataset_dict.push_to_hub(\"<organization>/<dataset_id>\", max_shard_size=\"1GB\")\n        >>> dataset_dict.push_to_hub(\"<organization>/<dataset_id>\", num_shards={\"train\": 1024, \"test\": 8})\n        ```\n\n        If you want to add a new configuration (or subset) to a dataset (e.g. 
if the dataset has multiple tasks/versions/languages):\n\n        ```python\n        >>> english_dataset.push_to_hub(\"<organization>/<dataset_id>\", \"en\")\n        >>> french_dataset.push_to_hub(\"<organization>/<dataset_id>\", \"fr\")\n        >>> # later\n        >>> english_dataset = load_dataset(\"<organization>/<dataset_id>\", \"en\")\n        >>> french_dataset = load_dataset(\"<organization>/<dataset_id>\", \"fr\")\n        ```\n        \"\"\"\n        if num_shards is None:\n            num_shards = dict.fromkeys(self)\n        elif not isinstance(num_shards, dict):\n            raise ValueError(\n                \"Please provide one `num_shards` per dataset in the dataset dictionary, e.g. {{'train': 128, 'test': 4}}\"\n            )\n\n        self._check_values_type()\n        self._check_values_features()\n        if config_name == \"data\":\n            raise ValueError(\"`config_name` cannot be 'data'. Please, choose another name for configuration.\")\n\n        if max_shard_size is not None and num_shards is not None and any(x is not None for x in num_shards.values()):\n            raise ValueError(\n                \"Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both.\"\n            )\n\n        for split in self:\n            if not re.match(_split_re, split):\n                raise ValueError(f\"Split name should match '{_split_re}' but got '{split}'.\")\n\n        if not data_dir:\n            data_dir = config_name if config_name != \"default\" else \"data\"  # for backward compatibility\n\n        api = HfApi(endpoint=config.HF_ENDPOINT, token=token)\n        if repo_id.startswith(\"buckets/\"):\n            if BucketNotFoundError is None:\n                raise ImportError(\"Pushing datasets to buckets requires huggingface_hub>=1.6.0\")\n            _, _namespace, _bucket_name, *_path_segments = repo_id.split(\"/\")\n            try:\n                bucket_id = api.bucket_info(_namespace + \"/\" + 
_bucket_name).id\n            except BucketNotFoundError:\n                bucket_url = api.create_bucket(_namespace + \"/\" + _bucket_name, private=private, exist_ok=True)\n                bucket_id = bucket_url.bucket_id\n            path = \"/\".join(s for s in _path_segments if s)\n            return _push_to_bucket(\n                self,\n                bucket_id=bucket_id,\n                path=path,\n                config_name=config_name,\n                set_default=set_default,\n                data_dir=data_dir,\n                token=token,\n                max_shard_size=max_shard_size,\n                num_shards=num_shards,\n                embed_external_files=embed_external_files,\n                num_proc=num_proc,\n            )\n        else:\n            try:\n                repo_id = api.repo_info(repo_id, repo_type=\"dataset\").id\n            except RepositoryNotFoundError:\n                repo_url = api.create_repo(\n                    repo_id,\n                    repo_type=\"dataset\",\n                    private=private,\n                    exist_ok=True,\n                )\n                repo_id = repo_url.repo_id\n\n            if revision is not None and not revision.startswith(\"refs/pr/\"):\n                # We do not call create_branch for a PR reference: 400 Bad Request\n                api.create_branch(repo_id, branch=revision, repo_type=\"dataset\", exist_ok=True)\n            return _push_to_repo(\n                self,\n                repo_id=repo_id,\n                config_name=config_name,\n                set_default=set_default,\n                data_dir=data_dir,\n                commit_message=commit_message,\n                commit_description=commit_description,\n                token=token,\n                revision=revision,\n                create_pr=create_pr,\n                max_shard_size=max_shard_size,\n                num_shards=num_shards,\n                
embed_external_files=embed_external_files,\n                num_proc=num_proc,\n            )\n\n\nclass IterableDatasetDict(dict[Union[str, NamedSplit], IterableDataset]):\n    def _check_values_type(self):\n        for dataset in self.values():\n            if not isinstance(dataset, IterableDataset):\n                raise TypeError(f\"Values in `DatasetDict` should be of type `Dataset` but got type '{type(dataset)}'\")\n\n    def _check_values_features(self):\n        items = [(key, dataset._resolve_features()) for key, dataset in self.items()]\n        for item_a, item_b in zip(items[:-1], items[1:]):\n            if item_a[1].features != item_b[1].features:\n                raise ValueError(\n                    f\"All datasets in `DatasetDict` should have the same features but features for '{item_a[0]}' and '{item_b[0]}' don't match: {item_a[1].features} != {item_b[1].features}\"\n                )\n\n    def __repr__(self):\n        repr = \"\\n\".join([f\"{k}: {v}\" for k, v in self.items()])\n        repr = re.sub(r\"^\", \" \" * 4, repr, count=0, flags=re.M)\n        return f\"IterableDatasetDict({{\\n{repr}\\n}})\"\n\n    @property\n    def num_columns(self) -> dict[str, Optional[int]]:\n        \"\"\"Number of columns in each split of the dataset.\n        This can contain None values if some splits have unknown features (e.g. after a map() operation).\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.num_columns\n        {'test': 2, 'train': 2, 'validation': 2}\n        ```\n        \"\"\"\n        self._check_values_type()\n        return {k: dataset.num_columns for k, dataset in self.items()}\n\n    @property\n    def column_names(self) -> dict[str, Optional[list[str]]]:\n        \"\"\"Names of the columns in each split of the dataset.\n        This can contain None values if some splits have unknown features (e.g. 
after a map() operation).\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\")\n        >>> ds.column_names\n        {'test': ['text', 'label'],\n         'train': ['text', 'label'],\n         'validation': ['text', 'label']}\n        ```\n        \"\"\"\n        self._check_values_type()\n        return {k: dataset.column_names for k, dataset in self.items()}\n\n    def with_format(\n        self,\n        type: Optional[str] = None,\n    ) -> \"IterableDatasetDict\":\n        \"\"\"\n        Return a dataset with the specified format.\n\n        Args:\n\n            type (`str`, *optional*):\n                Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`.\n                `None` means it returns python objects (default).\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> from transformers import AutoTokenizer\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\", streaming=True)\n        >>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n        >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True)\n        >>> ds = ds.with_format(\"torch\")\n        >>> next(iter(ds))\n        {'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',\n         'label': tensor(1),\n         'input_ids': tensor([  101, 18027, 16310, 16001,  1103,  9321,   178, 11604,  7235,  6617,\n                1742,  2165,  2820,  1206,  6588, 22572, 12937,  1811,  2153,  1105,\n                1147, 12890, 19587,  6463,  1105, 15026,  1482,   119,   102,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,    
 0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0]),\n         'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),\n         'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n                1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}\n        ```\n        \"\"\"\n        return IterableDatasetDict({k: dataset.with_format(type=type) for k, dataset in self.items()})\n\n    def map(\n        self,\n        function: Optional[Callable] = None,\n        with_indices: bool = False,\n        with_split: bool = False,\n        input_columns: Optional[Union[str, list[str]]] = None,\n        batched: bool = False,\n        batch_size: int = 1000,\n        drop_last_batch: bool = False,\n        remove_columns: Optional[Union[str, list[str]]] = None,\n        fn_kwargs: Optional[dict] = None,\n    ) -> \"IterableDatasetDict\":\n        \"\"\"\n        Apply a function to all the examples in the iterable dataset (individually or in batches) and update them.\n        If your function returns a column that already exists, then it overwrites it.\n        The function is applied on-the-fly on the examples when iterating over the dataset.\n        The transformation is applied to all the 
datasets of the dataset dictionary.\n\n        You can specify whether the function should be batched or not with the `batched` parameter:\n\n        - If batched is `False`, then the function takes 1 example in and should return 1 example.\n          An example is a dictionary, e.g. `{\"text\": \"Hello there !\"}`.\n        - If batched is `True` and `batch_size` is 1, then the function takes a batch of 1 example as input and can return a batch with 1 or more examples.\n          A batch is a dictionary, e.g. a batch of 1 example is `{\"text\": [\"Hello there !\"]}`.\n        - If batched is `True` and `batch_size` is `n` > 1, then the function takes a batch of `n` examples as input and can return a batch with `n` examples, or with an arbitrary number of examples.\n          Note that the last batch may have less than `n` examples.\n          A batch is a dictionary, e.g. a batch of `n` examples is `{\"text\": [\"Hello there !\"] * n}`.\n\n        If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls.\n        It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.\n\n        Args:\n            function (`Callable`, *optional*, defaults to `None`):\n                Function applied on-the-fly on the examples when you iterate on the dataset.\n                It must have one of the following signatures:\n\n                - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False`\n                - `function(example: Dict[str, Any], idx: int) -> Dict[str, Any]` if `batched=False` and `with_indices=True`\n                - `function(batch: Dict[str, list]) -> Dict[str, list]` if `batched=True` and `with_indices=False`\n                - `function(batch: Dict[str, list], indices: list[int]) -> Dict[str, list]` if `batched=True` and `with_indices=True`\n\n                
For advanced usage, the function can also return a `pyarrow.Table`.\n                If the function is asynchronous, then `map` will run your function in parallel.\n                Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged.\n                If no function is provided, default to identity function: `lambda x: x`.\n            with_indices (`bool`, defaults to `False`):\n                Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx[, rank]): ...`.\n            input_columns (`[Union[str, list[str]]]`, *optional*, defaults to `None`):\n                The columns to be passed into `function`\n                as positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.\n            batched (`bool`, defaults to `False`):\n                Provide batch of examples to `function`.\n            batch_size (`int`, *optional*, defaults to `1000`):\n                Number of examples per batch provided to `function` if `batched=True`.\n            drop_last_batch (`bool`, defaults to `False`):\n                Whether a last batch smaller than the `batch_size` should be\n                dropped instead of being processed by the function.\n            remove_columns (`[list[str]]`, *optional*, defaults to `None`):\n                Remove a selection of columns while doing the mapping.\n                Columns will be removed before updating the examples with the output of `function`, i.e. 
if `function` is adding\n                columns with names in `remove_columns`, these columns will be kept.\n            fn_kwargs (`Dict`, *optional*, defaults to `None`):\n                Keyword arguments to be passed to `function`\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", streaming=True)\n        >>> def add_prefix(example):\n        ...     example[\"text\"] = \"Review: \" + example[\"text\"]\n        ...     return example\n        >>> ds = ds.map(add_prefix)\n        >>> next(iter(ds[\"train\"]))\n        {'label': 1,\n         'text': 'Review: the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}\n        ```\n        \"\"\"\n\n        dataset_dict = {}\n        for split, dataset in self.items():\n            if with_split:\n                function = bind(function, split)\n\n            dataset_dict[split] = dataset.map(\n                function=function,\n                with_indices=with_indices,\n                input_columns=input_columns,\n                batched=batched,\n                batch_size=batch_size,\n                drop_last_batch=drop_last_batch,\n                remove_columns=remove_columns,\n                fn_kwargs=fn_kwargs,\n            )\n\n            if with_split:\n                function = function.func\n\n        return IterableDatasetDict(dataset_dict)\n\n    def filter(\n        self,\n        function: Optional[Callable] = None,\n        with_indices=False,\n        input_columns: Optional[Union[str, list[str]]] = None,\n        batched: bool = False,\n        batch_size: Optional[int] = 1000,\n        fn_kwargs: Optional[dict] = None,\n    ) -> \"IterableDatasetDict\":\n        \"\"\"Apply a filter function to all the elements so that the dataset only includes examples 
according to the filter function.\n        The filtering is done on-the-fly when iterating over the dataset.\n        The filtering is applied to all the datasets of the dataset dictionary.\n\n        Args:\n            function (`Callable`):\n                Callable with one of the following signatures:\n\n                - `function(example: Dict[str, Any]) -> bool` if `with_indices=False, batched=False`\n                - `function(example: Dict[str, Any], indices: int) -> bool` if `with_indices=True, batched=False`\n                - `function(example: Dict[str, list]) -> list[bool]` if `with_indices=False, batched=True`\n                - `function(example: Dict[str, list], indices: list[int]) -> list[bool]` if `with_indices=True, batched=True`\n\n                If no function is provided, defaults to an always True function: `lambda x: True`.\n            with_indices (`bool`, defaults to `False`):\n                Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx): ...`.\n            input_columns (`str` or `list[str]`, *optional*):\n                The columns to be passed into `function` as\n                positional arguments. 
If `None`, a dict mapping to all formatted columns is passed as one argument.\n            batched (`bool`, defaults to `False`):\n                Provide batch of examples to `function`\n            batch_size (`int`, *optional*, defaults to `1000`):\n                Number of examples per batch provided to `function` if `batched=True`.\n            fn_kwargs (`Dict`, *optional*, defaults to `None`):\n                Keyword arguments to be passed to `function`\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", streaming=True)\n        >>> ds = ds.filter(lambda x: x[\"label\"] == 0)\n        >>> list(ds[\"train\"].take(3))\n        [{'label': 0, 'text': 'Review: simplistic , silly and tedious .'},\n         {'label': 0,\n         'text': \"Review: it's so laddish and juvenile , only teenage boys could possibly find it funny .\"},\n         {'label': 0,\n         'text': 'Review: exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable .'}]\n        ```\n        \"\"\"\n        return IterableDatasetDict(\n            {\n                k: dataset.filter(\n                    function=function,\n                    with_indices=with_indices,\n                    input_columns=input_columns,\n                    batched=batched,\n                    batch_size=batch_size,\n                    fn_kwargs=fn_kwargs,\n                )\n                for k, dataset in self.items()\n            }\n        )\n\n    def shuffle(\n        self,\n        seed=None,\n        generator: Optional[np.random.Generator] = None,\n        buffer_size: int = 1000,\n    ) -> \"IterableDatasetDict\":\n        \"\"\"\n        Randomly shuffles the elements of this dataset.\n        The shuffling is applied to all the datasets of the dataset dictionary.\n\n        This dataset fills a buffer with 
buffer_size elements, then randomly samples elements from this buffer,\n        replacing the selected elements with new elements. For perfect shuffling, a buffer size greater than or\n        equal to the full size of the dataset is required.\n\n        For instance, if your dataset contains 10,000 elements but `buffer_size` is set to 1000, then `shuffle` will\n        initially select a random element from only the first 1000 elements in the buffer. Once an element is\n        selected, its space in the buffer is replaced by the next (i.e. 1,001-st) element,\n        maintaining the 1000 element buffer.\n\n        If the dataset is made of several shards, it also does `shuffle` the order of the shards.\n        However if the order has been fixed by using [`~datasets.IterableDataset.skip`] or [`~datasets.IterableDataset.take`]\n        then the order of the shards is kept unchanged.\n\n        Args:\n            seed (`int`, *optional*, defaults to `None`):\n                Random seed that will be used to shuffle the dataset.\n                It is used to sample from the shuffle buffer and also to shuffle the data shards.\n            generator (`numpy.random.Generator`, *optional*):\n                Numpy random Generator to use to compute the permutation of the dataset rows.\n                If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy).\n            buffer_size (`int`, defaults to `1000`):\n                Size of the buffer.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", streaming=True)\n        >>> list(ds[\"train\"].take(3))\n        [{'label': 1,\n         'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},\n         {'label': 1,\n         
'text': 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . tolkien\\'s middle-earth .'},\n         {'label': 1, 'text': 'effective but too-tepid biopic'}]\n        >>> ds = ds.shuffle(seed=42)\n        >>> list(ds[\"train\"].take(3))\n        [{'label': 1,\n         'text': \"a sports movie with action that's exciting on the field and a story you care about off it .\"},\n         {'label': 1,\n         'text': 'at its best , the good girl is a refreshingly adult take on adultery . . .'},\n         {'label': 1,\n         'text': \"sam jones became a very lucky filmmaker the day wilco got dropped from their record label , proving that one man's ruin may be another's fortune .\"}]\n        ```\n        \"\"\"\n        return IterableDatasetDict(\n            {\n                k: dataset.shuffle(seed=seed, generator=generator, buffer_size=buffer_size)\n                for k, dataset in self.items()\n            }\n        )\n\n    def rename_column(self, original_column_name: str, new_column_name: str) -> \"IterableDatasetDict\":\n        \"\"\"\n        Rename a column in the dataset, and move the features associated to the original column under the new column\n        name.\n        The renaming is applied to all the datasets of the dataset dictionary.\n\n        Args:\n            original_column_name (`str`):\n                Name of the column to rename.\n            new_column_name (`str`):\n                New name for the column.\n\n        Returns:\n            [`IterableDatasetDict`]: A copy of the dataset with a renamed column.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", streaming=True)\n        >>> ds = ds.rename_column(\"text\", \"movie_review\")\n        >>> next(iter(ds[\"train\"]))\n 
       {'label': 1,\n         'movie_review': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}\n        ```\n        \"\"\"\n        return IterableDatasetDict(\n            {\n                k: dataset.rename_column(\n                    original_column_name=original_column_name,\n                    new_column_name=new_column_name,\n                )\n                for k, dataset in self.items()\n            }\n        )\n\n    def rename_columns(self, column_mapping: dict[str, str]) -> \"IterableDatasetDict\":\n        \"\"\"\n        Rename several columns in the dataset, and move the features associated to the original columns under\n        the new column names.\n        The renaming is applied to all the datasets of the dataset dictionary.\n\n        Args:\n            column_mapping (`Dict[str, str]`):\n                A mapping of columns to rename to their new names.\n\n        Returns:\n            [`IterableDatasetDict`]: A copy of the dataset with renamed columns\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", streaming=True)\n        >>> ds = ds.rename_columns({\"text\": \"movie_review\", \"label\": \"rating\"})\n        >>> next(iter(ds[\"train\"]))\n        {'movie_review': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',\n         'rating': 1}\n        ```\n        \"\"\"\n        return IterableDatasetDict(\n            {k: dataset.rename_columns(column_mapping=column_mapping) for k, dataset in self.items()}\n        )\n\n    def remove_columns(self, column_names: Union[str, list[str]]) -> \"IterableDatasetDict\":\n        \"\"\"\n        Remove one or several 
column(s) in the dataset and the features associated to them.\n        The removal is done on-the-fly on the examples when iterating over the dataset.\n        The removal is applied to all the datasets of the dataset dictionary.\n\n\n        Args:\n            column_names (`Union[str, list[str]]`):\n                Name of the column(s) to remove.\n\n        Returns:\n            [`IterableDatasetDict`]: A copy of the dataset object without the columns to remove.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", streaming=True)\n        >>> ds = ds.remove_columns(\"label\")\n        >>> next(iter(ds[\"train\"]))\n        {'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}\n        ```\n        \"\"\"\n        return IterableDatasetDict({k: dataset.remove_columns(column_names) for k, dataset in self.items()})\n\n    def select_columns(self, column_names: Union[str, list[str]]) -> \"IterableDatasetDict\":\n        \"\"\"Select one or several column(s) in the dataset and the features\n        associated to them. The selection is done on-the-fly on the examples\n        when iterating over the dataset. 
The selection is applied to all the\n        datasets of the dataset dictionary.\n\n\n        Args:\n            column_names (`Union[str, list[str]]`):\n                Name of the column(s) to keep.\n\n        Returns:\n            [`IterableDatasetDict`]: A copy of the dataset object with only selected columns.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", streaming=True)\n        >>> ds = ds.select_columns(\"text\")\n        >>> next(iter(ds[\"train\"]))\n        {'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}\n        ```\n        \"\"\"\n        return IterableDatasetDict({k: dataset.select_columns(column_names) for k, dataset in self.items()})\n\n    def cast_column(self, column: str, feature: FeatureType) -> \"IterableDatasetDict\":\n        \"\"\"Cast column to feature for decoding.\n        The type casting is applied to all the datasets of the dataset dictionary.\n\n        Args:\n            column (`str`):\n                Column name.\n            feature ([`Feature`]):\n                Target feature.\n\n        Returns:\n            [`IterableDatasetDict`]\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset, ClassLabel\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", streaming=True)\n        >>> ds[\"train\"].features\n        {'label': ClassLabel(names=['neg', 'pos']),\n         'text': Value('string')}\n        >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))\n        >>> ds[\"train\"].features\n        {'label': ClassLabel(names=['bad', 'good']),\n         'text': Value('string')}\n        ```\n        \"\"\"\n        return IterableDatasetDict(\n            {k: dataset.cast_column(column=column, feature=feature) for 
k, dataset in self.items()}\n        )\n\n    def cast(\n        self,\n        features: Features,\n    ) -> \"IterableDatasetDict\":\n        \"\"\"\n        Cast the dataset to a new set of features.\n        The type casting is applied to all the datasets of the dataset dictionary.\n\n        Args:\n            features (`Features`):\n                New features to cast the dataset to.\n                The name of the fields in the features must match the current column names.\n                The type of the data must also be convertible from one type to the other.\n                For non-trivial conversion, e.g. `string` <-> `ClassLabel` you should use [`map`] to update the Dataset.\n\n        Returns:\n            [`IterableDatasetDict`]: A copy of the dataset with casted features.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", streaming=True)\n        >>> ds[\"train\"].features\n        {'label': ClassLabel(names=['neg', 'pos']),\n         'text': Value('string')}\n        >>> new_features = ds[\"train\"].features.copy()\n        >>> new_features['label'] = ClassLabel(names=['bad', 'good'])\n        >>> new_features['text'] = Value('large_string')\n        >>> ds = ds.cast(new_features)\n        >>> ds[\"train\"].features\n        {'label': ClassLabel(names=['bad', 'good']),\n         'text': Value('large_string')}\n        ```\n        \"\"\"\n        return IterableDatasetDict({k: dataset.cast(features=features) for k, dataset in self.items()})\n\n    def push_to_hub(\n        self,\n        repo_id: str,\n        config_name: str = \"default\",\n        set_default: Optional[bool] = None,\n        data_dir: Optional[str] = None,\n        commit_message: Optional[str] = None,\n        commit_description: Optional[str] = None,\n        private: Optional[bool] = None,\n        token: Optional[str] = None,\n        revision: Optional[str] = 
None,\n        create_pr: Optional[bool] = False,\n        max_shard_size: Optional[Union[int, str]] = None,\n        num_shards: Optional[dict[str, int]] = None,\n        embed_external_files: bool = True,\n        num_proc: Optional[int] = None,\n    ) -> CommitInfo:\n        \"\"\"Pushes the [`IterableDatasetDict`] to the hub as a Parquet dataset.\n        The [`IterableDatasetDict`] is pushed using HTTP requests and does not need to have neither git or git-lfs installed.\n\n        Each dataset split will be pushed independently. The pushed dataset will keep the original split names.\n\n        The resulting Parquet files are self-contained by default: if your dataset contains [`Image`] or [`Audio`]\n        data, the Parquet files will store the bytes of your images or audio files.\n        You can disable this by setting `embed_external_files` to False.\n\n        Args:\n            repo_id (`str`):\n                The ID of the repository to push to in the following format: `<user>/<dataset_name>` or\n                `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace\n                of the logged-in user.\n\n                It could also be a location inside a bucket, e.g. `buckets/<user_or_org>/<bucket_name>/...`\n            config_name (`str`):\n                Configuration name of a dataset. Defaults to \"default\".\n            set_default (`bool`, *optional*):\n                Whether to set this configuration as the default one. Otherwise, the default configuration is the one\n                named \"default\".\n            data_dir (`str`, *optional*):\n                Directory name that will contain the uploaded data files. Defaults to the `config_name` if different\n                from \"default\", else \"data\".\n\n                <Added version=\"2.17.0\"/>\n            commit_message (`str`, *optional*):\n                Message to commit while pushing. 
Will default to `\"Upload dataset\"`.\n            commit_description (`str`, *optional*):\n                Description of the commit that will be created.\n                Additionally, description of the PR if a PR is created (`create_pr` is True).\n\n                <Added version=\"2.16.0\"/>\n            private (`bool`, *optional*):\n                Whether to make the repo private. If `None` (default), the repo will be public unless the\n                organization's default is private. This value is ignored if the repo already exists.\n            token (`str`, *optional*):\n                An optional authentication token for the Hugging Face Hub. If no token is passed, will default\n                to the token saved locally when logging in with `huggingface-cli login`. Will raise an error\n                if no token is passed and the user is not logged-in.\n            revision (`str`, *optional*):\n                Branch to push the uploaded files to. Defaults to the `\"main\"` branch.\n            create_pr (`bool`, *optional*, defaults to `False`):\n                Whether to create a PR with the uploaded files or directly commit.\n            max_shard_size (`int` or `str`, *optional*):\n                Optional maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed\n                by a unit (like `\"500MB\"` or `\"1GB\"`). If not provided, each split keeps its dataset-native shard count.\n            num_shards (`Dict[str, int]`, *optional*):\n                Number of shards to write. 
Use a dictionary to define a different `num_shards` for each split.\n                If `max_shard_size` is provided and a split's `num_shards` is not, then the number of shards for that split is estimated\n                from `max_shard_size`.\n            embed_external_files (`bool`, defaults to `True`):\n                Whether to embed file bytes in the shards.\n                In particular, this will do the following before the push for the fields of type:\n\n                - [`Audio`] and [`Image`] removes local path information and embed file content in the Parquet files.\n            num_proc (`int`, *optional*, defaults to `None`):\n                Number of processes when preparing and uploading the dataset.\n                This is helpful if the dataset is made of many samples or media files to embed.\n                It uses \"spawn\" context to work with hf_xet, the rust client for fast uploads to HF.\n                Multiprocessing is disabled by default.\n\n                <Added version=\"4.0.0\"/>\n\n        Return:\n            huggingface_hub.CommitInfo\n\n        Example:\n\n        ```python\n        >>> dataset_dict.push_to_hub(\"<organization>/<dataset_id>\")\n        >>> dataset_dict.push_to_hub(\"<organization>/<dataset_id>\", private=True)\n        >>> dataset_dict.push_to_hub(\"<organization>/<dataset_id>\", max_shard_size=\"1GB\")\n        >>> dataset_dict.push_to_hub(\"<organization>/<dataset_id>\", num_shards={\"train\": 1024, \"test\": 8})\n        ```\n\n        If you want to add a new configuration (or subset) to a dataset (e.g. 
if the dataset has multiple tasks/versions/languages):\n\n        ```python\n        >>> english_dataset.push_to_hub(\"<organization>/<dataset_id>\", \"en\")\n        >>> french_dataset.push_to_hub(\"<organization>/<dataset_id>\", \"fr\")\n        >>> # later\n        >>> english_dataset = load_dataset(\"<organization>/<dataset_id>\", \"en\")\n        >>> french_dataset = load_dataset(\"<organization>/<dataset_id>\", \"fr\")\n        ```\n        \"\"\"\n        # Check that the user doesn't specify both the number of shards and the max shard size, since these are two different ways to specify the same thing\n        if max_shard_size is not None and num_shards is not None:\n            raise ValueError(\n                \"Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both.\"\n            )\n\n        if num_shards is None:\n            num_shards = dict.fromkeys(self)\n        elif not isinstance(num_shards, dict):\n            raise ValueError(\n                \"Please provide one `num_shards` per dataset in the dataset dictionary, e.g. {{'train': 128, 'test': 4}}\"\n            )\n\n        self._check_values_type()\n        self._check_values_features()\n        if config_name == \"data\":\n            raise ValueError(\"`config_name` cannot be 'data'. 
Please, choose another name for configuration.\")\n\n        # if max_shard_size is not None and num_shards is not None and any(x is not None for x in num_shards.values()):\n        #     raise ValueError(\n        #         \"Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both.\"\n        #     )\n\n        for split in self:\n            if not re.match(_split_re, split):\n                raise ValueError(f\"Split name should match '{_split_re}' but got '{split}'.\")\n\n        if not data_dir:\n            data_dir = config_name if config_name != \"default\" else \"data\"  # for backward compatibility\n\n        api = HfApi(endpoint=config.HF_ENDPOINT, token=token)\n        if repo_id.startswith(\"buckets/\"):\n            if BucketNotFoundError is None:\n                raise ImportError(\"Pushing datasets to buckets requires huggingface_hub>=1.6.0\")\n            _, _namespace, _bucket_name, *_path_segments = repo_id.split(\"/\")\n            try:\n                bucket_id = api.bucket_info(_namespace + \"/\" + _bucket_name).id\n            except BucketNotFoundError:\n                bucket_url = api.create_bucket(_namespace + \"/\" + _bucket_name, private=private, exist_ok=True)\n                bucket_id = bucket_url.bucket_id\n            path = \"/\".join(s for s in _path_segments if s)\n            return _push_to_bucket(\n                self,\n                bucket_id=bucket_id,\n                path=path,\n                config_name=config_name,\n                set_default=set_default,\n                data_dir=data_dir,\n                token=token,\n                max_shard_size=max_shard_size,\n                num_shards=num_shards.get(split),\n                embed_external_files=embed_external_files,\n                num_proc=num_proc,\n            )\n        else:\n            try:\n                repo_id = api.repo_info(repo_id, repo_type=\"dataset\").id\n            except RepositoryNotFoundError:\n    
            repo_url = api.create_repo(\n                    repo_id,\n                    repo_type=\"dataset\",\n                    private=private,\n                    exist_ok=True,\n                )\n                repo_id = repo_url.repo_id\n\n            if revision is not None and not revision.startswith(\"refs/pr/\"):\n                # We do not call create_branch for a PR reference: 400 Bad Request\n                api.create_branch(repo_id, branch=revision, repo_type=\"dataset\", exist_ok=True)\n            return _push_to_repo(\n                self,\n                repo_id=repo_id,\n                config_name=config_name,\n                set_default=set_default,\n                data_dir=data_dir,\n                commit_message=commit_message,\n                commit_description=commit_description,\n                token=token,\n                create_pr=create_pr,\n                revision=revision,\n                max_shard_size=max_shard_size,\n                num_shards=num_shards,\n                embed_external_files=embed_external_files,\n                num_proc=num_proc,\n            )\n\n\ndef _push_to_repo(\n    dset_dict: Union[\"DatasetDict\", \"IterableDatasetDict\"],\n    repo_id: str,\n    config_name: str = \"default\",\n    set_default: Optional[bool] = None,\n    data_dir: Optional[str] = None,\n    commit_message: Optional[str] = None,\n    commit_description: Optional[str] = None,\n    token: Optional[str] = None,\n    revision: Optional[str] = None,\n    create_pr: Optional[bool] = False,\n    max_shard_size: Optional[Union[int, str]] = None,\n    num_shards: Optional[dict[str, Optional[int]]] = None,\n    embed_external_files: bool = True,\n    num_proc: Optional[int] = None,\n) -> CommitInfo:\n    api = HfApi(endpoint=config.HF_ENDPOINT, token=token)\n    resolved_output_path = HfFileSystemResolvedRepositoryPath(\n        repo_id=repo_id, repo_type=\"dataset\", revision=revision or \"main\", path_in_repo=\"\"\n    
)\n\n    additions: list[CommitOperationAdd] = []\n    new_parquet_paths: list[str] = []\n    splits_info: list[SplitInfo] = []\n    uploaded_sizes: list[int] = []\n\n    for split in dset_dict:\n        split_additions, split_new_parquet_paths, features, split_info, uploaded_size = dset_dict[\n            split\n        ]._push_parquet_shards_to_hub(\n            resolved_output_path=resolved_output_path,\n            data_dir=data_dir,\n            split=split,\n            token=token,\n            max_shard_size=max_shard_size,\n            num_shards=(num_shards or {}).get(split),\n            create_pr=create_pr,\n            embed_external_files=embed_external_files,\n            num_proc=num_proc,\n        )\n        additions += split_additions\n        new_parquet_paths += split_new_parquet_paths\n        splits_info.append(split_info)\n        uploaded_sizes.append(uploaded_size)\n\n    commit_message = commit_message if commit_message is not None else \"Upload dataset\"\n    if len(additions) > config.UPLOADS_MAX_NUMBER_PER_COMMIT:\n        logger.info(\n            f\"Number of files to upload is larger than {config.UPLOADS_MAX_NUMBER_PER_COMMIT}. 
Splitting the push into multiple commits.\"\n        )\n        num_commits = math.ceil(len(additions) / config.UPLOADS_MAX_NUMBER_PER_COMMIT)\n        for i in range(0, num_commits):\n            operations = additions[\n                i * config.UPLOADS_MAX_NUMBER_PER_COMMIT : (i + 1) * config.UPLOADS_MAX_NUMBER_PER_COMMIT\n            ]\n            for retry, sleep_time in enumerate(itertools.chain(range(10), itertools.repeat(30)), start=1):\n                # We need to retry if another commit happens at the same time\n                sleep_time *= 1 + random.random()\n                try:\n                    commit_info = api.create_commit(\n                        repo_id,\n                        operations=operations,\n                        commit_message=commit_message + f\" (part {i:05d}-of-{num_commits:05d})\",\n                        commit_description=commit_description,\n                        repo_type=\"dataset\",\n                        revision=revision,\n                        create_pr=create_pr,\n                    )\n                except HfHubHTTPError as err:\n                    if (\n                        err.__context__\n                        and isinstance(err.__context__, HfHubHTTPError)\n                        and err.__context__.response.status_code == 409\n                    ):\n                        # 409 is Conflict (another commit is in progress)\n                        time.sleep(sleep_time)\n                        logger.info(\n                            f\"Retrying intermediate commit for {repo_id}, {config_name} ({retry}/n with status_code {err.__context__.response.status_code})\"\n                        )\n                        continue\n                    else:\n                        raise\n                break\n            logger.info(\n                f\"Commit #{i + 1} completed\"\n                + (f\" (still {num_commits - i - 1} to go)\" if num_commits - i - 1 else \"\")\n                + 
\".\"\n            )\n        last_commit_additions = []\n    else:\n        last_commit_additions = additions\n\n    for retry, sleep_time in enumerate(itertools.chain(range(10), itertools.repeat(30)), start=1):\n        # We need to retry if there was a commit in between in case it touched the dataset card data\n        sleep_time *= 1 + random.random()\n\n        # We make sure to get info from this commit\n        parent_commit = api.repo_info(repo_id, repo_type=\"dataset\", revision=revision).sha\n        hf_path = HfFileSystemResolvedRepositoryPath(\n            repo_type=resolved_output_path.repo_type,\n            repo_id=resolved_output_path.repo_id,\n            revision=parent_commit,\n            path_in_repo=resolved_output_path.path_in_repo,\n        ).unresolve()\n        hffs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token)\n        dirfs = DirFileSystem(fs=hffs, path=hf_path)\n\n        # Check the files to delete\n        try:\n            files_to_delete = list(dirfs.glob(f\"{data_dir}/*\"))\n        except EntryNotFoundError:  # needed for huggingface_hub<=1.7.1\n            files_to_delete = []\n\n        # Don't delete the new files\n        deletions = [\n            CommitOperationDelete(path_in_repo=file_to_delete)\n            for file_to_delete in files_to_delete\n            if file_to_delete not in new_parquet_paths\n        ]\n\n        # Update the dataset card\n        new_dataset_card, new_legacy_dataset_infos = _get_updated_dataset_card(\n            fs=dirfs,\n            config_name=config_name,\n            splits_info=splits_info,\n            features=features,\n            data_dir=data_dir,\n            set_default=set_default,\n            uploaded_sizes=uploaded_sizes,\n            deleted_sizes=[],\n            remove_other_splits=True,\n        )\n        dataset_card_additions = [\n            CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(new_dataset_card).encode())\n        ]\n   
     if new_legacy_dataset_infos:\n            dataset_card_additions.append(\n                CommitOperationAdd(\n                    path_in_repo=config.DATASETDICT_INFOS_FILENAME,\n                    path_or_fileobj=json.dumps(new_legacy_dataset_infos).encode(\"utf-8\"),\n                )\n            )\n        operations = last_commit_additions + dataset_card_additions + deletions\n\n        try:\n            commit_info = api.create_commit(\n                repo_id,\n                operations=operations,\n                commit_message=commit_message,\n                commit_description=commit_description,\n                repo_type=\"dataset\",\n                revision=revision,\n                create_pr=create_pr,\n                parent_commit=parent_commit,\n            )\n        except HfHubHTTPError as err:\n            if (\n                err.__context__\n                and isinstance(err.__context__, HfHubHTTPError)\n                and err.__context__.response.status_code in (412, 409)\n            ):\n                # 412 is Precondition failed (parent_commit isn't satisfied)\n                # 409 is Conflict (another commit is in progress)\n                time.sleep(sleep_time)\n                logger.info(\n                    f\"Retrying commit for {repo_id}, {config_name} ({retry}/n with status_code {err.__context__.response.status_code})\"\n                )\n                continue\n            else:\n                raise\n        break\n\n    return commit_info\n\n\ndef _push_to_bucket(\n    dset_dict: Union[\"DatasetDict\", \"IterableDatasetDict\"],\n    bucket_id: str,\n    path: str,\n    config_name: str = \"default\",\n    set_default: Optional[bool] = None,\n    data_dir: Optional[str] = None,\n    token: Optional[str] = None,\n    max_shard_size: Optional[Union[int, str]] = None,\n    num_shards: Optional[dict[str, Optional[int]]] = None,\n    embed_external_files: bool = True,\n    num_proc: Optional[int] = None,\n) -> 
None:\n    api = HfApi(endpoint=config.HF_ENDPOINT, token=token)\n    resolved_output_path = HfFileSystemResolvedBucketPath(bucket_id=bucket_id, path=path)\n    hf_path = resolved_output_path.unresolve()\n    hffs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token)\n    dirfs = DirFileSystem(fs=hffs, path=hf_path)\n\n    additions: list[CommitOperationAdd] = []\n    new_parquet_paths: list[str] = []\n    splits_info: list[SplitInfo] = []\n    uploaded_sizes: list[int] = []\n\n    # Check the files to delete before uploading\n    try:\n        files_to_delete = list(dirfs.glob(f\"{data_dir}/*\"))\n    except EntryNotFoundError:  # needed for huggingface_hub<=1.7.1\n        files_to_delete = []\n\n    for split in dset_dict:\n        # Upload the Parquet files\n        split_additions, split_new_parquet_paths, features, split_info, uploaded_size = dset_dict[\n            split\n        ]._push_parquet_shards_to_hub(\n            resolved_output_path=resolved_output_path,\n            data_dir=data_dir,\n            split=split,\n            token=token,\n            max_shard_size=max_shard_size,\n            num_shards=(num_shards or {}).get(split),\n            create_pr=False,\n            embed_external_files=embed_external_files,\n            num_proc=num_proc,\n        )\n        additions += split_additions\n        new_parquet_paths += split_new_parquet_paths\n        splits_info.append(split_info)\n        uploaded_sizes.append(uploaded_size)\n\n    # Don't delete the new files\n    new_parquet_paths = set(new_parquet_paths)\n    delete = [file_to_delete for file_to_delete in files_to_delete if file_to_delete not in new_parquet_paths]\n\n    # Update the dataset card\n    new_dataset_card, new_legacy_dataset_infos = _get_updated_dataset_card(\n        fs=dirfs,\n        config_name=config_name,\n        splits_info=splits_info,\n        features=features,\n        data_dir=data_dir,\n        set_default=set_default,\n        
uploaded_sizes=uploaded_sizes,\n        deleted_sizes=[],\n        remove_other_splits=True,\n    )\n    path_prefix = (path + \"/\") if path else \"\"\n    add = [(str(new_dataset_card).encode(), path_prefix + config.REPOCARD_FILENAME)]\n    if new_legacy_dataset_infos:\n        add.append(\n            (json.dumps(new_legacy_dataset_infos).encode(\"utf-8\"), path_prefix + config.DATASETDICT_INFOS_FILENAME)\n        )\n\n    # Upload dataset card and delete old files\n    api.batch_bucket_files(\n        bucket_id=bucket_id,\n        add=add,\n        delete=delete,\n    )\n"
  },
  {
    "path": "src/datasets/distributed.py",
    "content": "from typing import TypeVar\n\nfrom .arrow_dataset import Dataset, _split_by_node_map_style_dataset\nfrom .iterable_dataset import IterableDataset, _split_by_node_iterable_dataset\n\n\nDatasetType = TypeVar(\"DatasetType\", Dataset, IterableDataset)\n\n\ndef split_dataset_by_node(dataset: DatasetType, rank: int, world_size: int) -> DatasetType:\n    \"\"\"\n    Split a dataset for the node at rank `rank` in a pool of nodes of size `world_size`.\n\n    For map-style datasets:\n\n    Each node is assigned a chunk of data, e.g. rank 0 is given the first chunk of the dataset.\n    To maximize data loading throughput, chunks are made of contiguous data on disk if possible.\n\n    For iterable datasets:\n\n    If the dataset has a number of shards that is a multiple of `world_size` (i.e. if `dataset.num_shards % world_size == 0`),\n    then the shards are evenly assigned across the nodes, which is the most optimized.\n    Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples.\n\n    > [!WARNING]\n    > If you shuffle your iterable dataset in a distributed setup, make sure to set a fixed `seed` in [`IterableDataset.shuffle`]\n    so the same shuffled list of shards is used on every node to know which shards the node should skip.\n\n    Args:\n        dataset ([`Dataset`] or [`IterableDataset`]):\n            The dataset to split by node.\n        rank (`int`):\n            Rank of the current node.\n        world_size (`int`):\n            Total number of nodes.\n\n    Returns:\n        [`Dataset`] or [`IterableDataset`]: The dataset to be used on the node at rank `rank`.\n    \"\"\"\n    if isinstance(dataset, Dataset):\n        return _split_by_node_map_style_dataset(dataset, rank=rank, world_size=world_size)\n    else:\n        return _split_by_node_iterable_dataset(dataset, rank=rank, world_size=world_size)\n"
  },
  {
    "path": "src/datasets/download/__init__.py",
    "content": "__all__ = [\n    \"DownloadConfig\",\n    \"DownloadManager\",\n    \"DownloadMode\",\n    \"StreamingDownloadManager\",\n]\n\nfrom .download_config import DownloadConfig\nfrom .download_manager import DownloadManager, DownloadMode\nfrom .streaming_download_manager import StreamingDownloadManager\n"
  },
  {
    "path": "src/datasets/download/download_config.py",
    "content": "import copy\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import Any, Optional, Union\n\nfrom .. import config\n\n\n@dataclass\nclass DownloadConfig:\n    \"\"\"Configuration for our cached path manager.\n\n    Attributes:\n        cache_dir (`str` or `Path`, *optional*):\n            Specify a cache directory to save the file to (overwrite the\n            default cache dir).\n        force_download (`bool`, defaults to `False`):\n            If `True`, re-download the file even if it's already cached in\n            the cache dir.\n        resume_download (`bool`, defaults to `False`):\n            If `True`, resume the download if an incompletely received file is\n            found.\n        proxies (`dict`, *optional*):\n        user_agent (`str`, *optional*):\n            Optional string or dict that will be appended to the user-agent on remote\n            requests.\n        extract_compressed_file (`bool`, defaults to `False`):\n            If `True` and the path point to a zip or tar file,\n            extract the compressed file in a folder along the archive.\n        force_extract (`bool`, defaults to `False`):\n            If `True` when `extract_compressed_file` is `True` and the archive\n            was already extracted, re-extract the archive and override the folder where it was extracted.\n        delete_extracted (`bool`, defaults to `False`):\n            Whether to delete (or keep) the extracted files.\n        extract_on_the_fly (`bool`, defaults to `False`):\n            If `True`, extract compressed files while they are being read.\n        use_etag (`bool`, defaults to `True`):\n            Whether to use the ETag HTTP response header to validate the cached files.\n        num_proc (`int`, *optional*):\n            The number of processes to launch to download the files in parallel.\n        max_retries (`int`, default to `1`):\n            The number of times to retry an HTTP request if it 
fails.\n        token (`str` or `bool`, *optional*):\n            Optional string or boolean to use as Bearer token\n            for remote files on the Datasets Hub. If `True`, or not specified, will get token from `~/.huggingface`.\n        storage_options (`dict`, *optional*):\n            Key/value pairs to be passed on to the dataset file-system backend, if any.\n        download_desc (`str`, *optional*):\n            A description to be displayed alongside with the progress bar while downloading the files.\n        disable_tqdm (`bool`, defaults to `False`):\n            Whether to disable the individual files download progress bar\n    \"\"\"\n\n    cache_dir: Optional[Union[str, Path]] = None\n    force_download: bool = False\n    resume_download: bool = False\n    local_files_only: bool = False\n    proxies: Optional[dict] = None\n    user_agent: Optional[str] = None\n    extract_compressed_file: bool = False\n    force_extract: bool = False\n    delete_extracted: bool = False\n    extract_on_the_fly: bool = False\n    use_etag: bool = True\n    num_proc: Optional[int] = None\n    max_retries: int = 1\n    token: Optional[Union[str, bool]] = None\n    storage_options: dict[str, Any] = field(default_factory=dict)\n    download_desc: Optional[str] = None\n    disable_tqdm: bool = False\n\n    def copy(self) -> \"DownloadConfig\":\n        return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})\n\n    def __setattr__(self, name, value):\n        if name == \"token\" and getattr(self, \"storage_options\", None) is not None:\n            if \"hf\" not in self.storage_options:\n                self.storage_options[\"hf\"] = {\"endpoint\": config.HF_ENDPOINT, \"token\": value}\n            elif getattr(self.storage_options[\"hf\"], \"token\", None) is None:\n                self.storage_options[\"hf\"][\"token\"] = value\n        super().__setattr__(name, value)\n"
  },
  {
    "path": "src/datasets/download/download_manager.py",
    "content": "# Copyright 2020 The TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"Download manager interface.\"\"\"\n\nimport enum\nimport io\nimport multiprocessing\nimport os\nfrom datetime import datetime\nfrom functools import partial\nfrom typing import Optional, Union\n\nimport fsspec\nfrom fsspec.core import url_to_fs\nfrom tqdm.contrib.concurrent import thread_map\n\nfrom .. import config\nfrom ..utils import tqdm as hf_tqdm\nfrom ..utils.file_utils import (\n    ArchiveIterable,\n    FilesIterable,\n    cached_path,\n    is_relative_path,\n    stack_multiprocessing_download_progress_bars,\n    url_or_path_join,\n)\nfrom ..utils.info_utils import get_size_checksum_dict\nfrom ..utils.logging import get_logger, tqdm\nfrom ..utils.py_utils import NestedDataStructure, map_nested\nfrom ..utils.track import tracked_str\nfrom .download_config import DownloadConfig\n\n\nlogger = get_logger(__name__)\n\n\nclass DownloadMode(enum.Enum):\n    \"\"\"`Enum` for how to treat pre-existing downloads and data.\n\n    The default mode is `REUSE_DATASET_IF_EXISTS`, which will reuse both\n    raw downloads and the prepared dataset if they exist.\n\n    The generations modes:\n\n    |                                     | Downloads | Dataset |\n    |-------------------------------------|-----------|---------|\n    | `REUSE_DATASET_IF_EXISTS` (default) | Reuse     | Reuse   |\n    | `REUSE_CACHE_IF_EXISTS`          
   | Reuse     | Fresh   |\n    | `FORCE_REDOWNLOAD`                  | Fresh     | Fresh   |\n\n    \"\"\"\n\n    REUSE_DATASET_IF_EXISTS = \"reuse_dataset_if_exists\"\n    REUSE_CACHE_IF_EXISTS = \"reuse_cache_if_exists\"\n    FORCE_REDOWNLOAD = \"force_redownload\"\n\n\nclass DownloadManager:\n    is_streaming = False\n\n    def __init__(\n        self,\n        dataset_name: Optional[str] = None,\n        data_dir: Optional[str] = None,\n        download_config: Optional[DownloadConfig] = None,\n        base_path: Optional[str] = None,\n        record_checksums=True,\n    ):\n        \"\"\"Download manager constructor.\n\n        Args:\n            data_dir:\n                can be used to specify a manual directory to get the files from.\n            dataset_name (`str`):\n                name of dataset this instance will be used for. If\n                provided, downloads will contain which datasets they were used for.\n            download_config (`DownloadConfig`):\n                to specify the cache directory and other\n                download options\n            base_path (`str`):\n                base path that is used when relative paths are used to\n                download files. This can be a remote url.\n            record_checksums (`bool`, defaults to `True`):\n                Whether to record the checksums of the downloaded files. 
If None, the value is inferred from the builder.\n        \"\"\"\n        self._dataset_name = dataset_name\n        self._data_dir = data_dir\n        self._base_path = base_path or os.path.abspath(\".\")\n        # To record what is being used: {url: {num_bytes: int, checksum: str}}\n        self._recorded_sizes_checksums: dict[str, dict[str, Optional[Union[int, str]]]] = {}\n        self.record_checksums = record_checksums\n        self.download_config = download_config or DownloadConfig()\n        self.downloaded_paths = {}\n        self.extracted_paths = {}\n\n    @property\n    def manual_dir(self):\n        return self._data_dir\n\n    @property\n    def downloaded_size(self):\n        \"\"\"Returns the total size of downloaded files.\"\"\"\n        return sum(checksums_dict[\"num_bytes\"] for checksums_dict in self._recorded_sizes_checksums.values())\n\n    def _record_sizes_checksums(self, url_or_urls: NestedDataStructure, downloaded_path_or_paths: NestedDataStructure):\n        \"\"\"Record size/checksum of downloaded files.\"\"\"\n        delay = 5\n        for url, path in hf_tqdm(\n            list(zip(url_or_urls.flatten(), downloaded_path_or_paths.flatten())),\n            delay=delay,\n            desc=\"Computing checksums\",\n        ):\n            # call str to support PathLike objects\n            self._recorded_sizes_checksums[str(url)] = get_size_checksum_dict(\n                path, record_checksum=self.record_checksums\n            )\n\n    def download(self, url_or_urls):\n        \"\"\"Download given URL(s).\n\n        By default, only one process is used for download. Pass customized `download_config.num_proc` to change this behavior.\n\n        Args:\n            url_or_urls (`str` or `list` or `dict`):\n                URL or `list` or `dict` of URLs to download. 
Each URL is a `str`.\n\n        Returns:\n            `str` or `list` or `dict`:\n                The downloaded paths matching the given input `url_or_urls`.\n\n        Example:\n\n        ```py\n        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')\n        ```\n        \"\"\"\n        download_config = self.download_config.copy()\n        download_config.extract_compressed_file = False\n        if download_config.download_desc is None:\n            download_config.download_desc = \"Downloading data\"\n\n        download_func = partial(self._download_batched, download_config=download_config)\n\n        start_time = datetime.now()\n        with stack_multiprocessing_download_progress_bars():\n            downloaded_path_or_paths = map_nested(\n                download_func,\n                url_or_urls,\n                map_tuple=True,\n                num_proc=download_config.num_proc,\n                desc=\"Downloading data files\",\n                batched=True,\n                batch_size=-1,\n            )\n        duration = datetime.now() - start_time\n        logger.info(f\"Downloading took {duration.total_seconds() // 60} min\")\n        url_or_urls = NestedDataStructure(url_or_urls)\n        downloaded_path_or_paths = NestedDataStructure(downloaded_path_or_paths)\n        self.downloaded_paths.update(dict(zip(url_or_urls.flatten(), downloaded_path_or_paths.flatten())))\n\n        start_time = datetime.now()\n        self._record_sizes_checksums(url_or_urls, downloaded_path_or_paths)\n        duration = datetime.now() - start_time\n        logger.info(f\"Checksum Computation took {duration.total_seconds() // 60} min\")\n\n        return downloaded_path_or_paths.data\n\n    def _download_batched(\n        self,\n        url_or_filenames: list[str],\n        download_config: DownloadConfig,\n    ) -> list[str]:\n        if len(url_or_filenames) >= 16:\n            
download_config = download_config.copy()\n            download_config.disable_tqdm = True\n            download_func = partial(self._download_single, download_config=download_config)\n\n            fs: fsspec.AbstractFileSystem\n            path = str(url_or_filenames[0])\n            if is_relative_path(path):\n                # append the relative path to the base_path\n                path = url_or_path_join(self._base_path, path)\n            fs, path = url_to_fs(path, **download_config.storage_options)\n            size = 0\n            try:\n                size = fs.info(path).get(\"size\", 0)\n            except Exception:\n                pass\n            max_workers = (\n                config.HF_DATASETS_MULTITHREADING_MAX_WORKERS if size < (20 << 20) else 1\n            )  # enable multithreading if files are small\n\n            return thread_map(\n                download_func,\n                url_or_filenames,\n                desc=download_config.download_desc or \"Downloading\",\n                unit=\"files\",\n                position=multiprocessing.current_process()._identity[-1]  # contains the ranks of subprocesses\n                if os.environ.get(\"HF_DATASETS_STACK_MULTIPROCESSING_DOWNLOAD_PROGRESS_BARS\") == \"1\"\n                and multiprocessing.current_process()._identity\n                else None,\n                max_workers=max_workers,\n                tqdm_class=tqdm,\n            )\n        else:\n            return [\n                self._download_single(url_or_filename, download_config=download_config)\n                for url_or_filename in url_or_filenames\n            ]\n\n    def _download_single(self, url_or_filename: str, download_config: DownloadConfig) -> str:\n        url_or_filename = str(url_or_filename)\n        if is_relative_path(url_or_filename):\n            # append the relative path to the base_path\n            url_or_filename = url_or_path_join(self._base_path, url_or_filename)\n        out = 
cached_path(url_or_filename, download_config=download_config)\n        out = tracked_str(out)\n        out.set_origin(url_or_filename)\n        return out\n\n    def iter_archive(self, path_or_buf: Union[str, io.BufferedReader]):\n        \"\"\"Iterate over files within an archive.\n\n        Args:\n            path_or_buf (`str` or `io.BufferedReader`):\n                Archive path or archive binary file object.\n\n        Yields:\n            `tuple[str, io.BufferedReader]`:\n                2-tuple (path_within_archive, file_object).\n                File object is opened in binary mode.\n\n        Example:\n\n        ```py\n        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')\n        >>> files = dl_manager.iter_archive(archive)\n        ```\n        \"\"\"\n\n        if hasattr(path_or_buf, \"read\"):\n            return ArchiveIterable.from_buf(path_or_buf)\n        else:\n            return ArchiveIterable.from_urlpath(path_or_buf)\n\n    def iter_files(self, paths: Union[str, list[str]]):\n        \"\"\"Iterate over file paths.\n\n        Args:\n            paths (`str` or `list` of `str`):\n                Root paths.\n\n        Yields:\n            `str`: File path.\n\n        Example:\n\n        ```py\n        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/main/data/train.zip')\n        >>> files = dl_manager.iter_files(files)\n        ```\n        \"\"\"\n        return FilesIterable.from_urlpaths(paths)\n\n    def extract(self, path_or_paths):\n        \"\"\"Extract given path(s).\n\n        Args:\n            path_or_paths (path or `list` or `dict`):\n                Path of file to extract. 
Each path is a `str`.\n\n        Returns:\n            extracted_path(s): `str`, The extracted paths matching the given input\n            path_or_paths.\n\n        Example:\n\n        ```py\n        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')\n        >>> extracted_files = dl_manager.extract(downloaded_files)\n        ```\n        \"\"\"\n        download_config = self.download_config.copy()\n        download_config.extract_compressed_file = True\n        extract_func = partial(self._download_single, download_config=download_config)\n        extracted_paths = map_nested(\n            extract_func,\n            path_or_paths,\n            num_proc=download_config.num_proc,\n            desc=\"Extracting data files\",\n        )\n        path_or_paths = NestedDataStructure(path_or_paths)\n        extracted_paths = NestedDataStructure(extracted_paths)\n        self.extracted_paths.update(dict(zip(path_or_paths.flatten(), extracted_paths.flatten())))\n        return extracted_paths.data\n\n    def download_and_extract(self, url_or_urls):\n        \"\"\"Download and extract given `url_or_urls`.\n\n        Is roughly equivalent to:\n\n        ```\n        extracted_paths = dl_manager.extract(dl_manager.download(url_or_urls))\n        ```\n\n        Args:\n            url_or_urls (`str` or `list` or `dict`):\n                URL or `list` or `dict` of URLs to download and extract. 
Each URL is a `str`.\n\n        Returns:\n            extracted_path(s): `str`, extracted paths of given URL(s).\n        \"\"\"\n        return self.extract(self.download(url_or_urls))\n\n    def get_recorded_sizes_checksums(self):\n        return self._recorded_sizes_checksums.copy()\n\n    def delete_extracted_files(self):\n        paths_to_delete = set(self.extracted_paths.values()) - set(self.downloaded_paths.values())\n        for key, path in list(self.extracted_paths.items()):\n            if path in paths_to_delete and os.path.isfile(path):\n                os.remove(path)\n                del self.extracted_paths[key]\n\n    def manage_extracted_files(self):\n        if self.download_config.delete_extracted:\n            self.delete_extracted_files()\n"
  },
  {
    "path": "src/datasets/download/streaming_download_manager.py",
    "content": "import io\nimport os\nfrom collections.abc import Iterable\nfrom typing import Optional, Union\n\nfrom ..utils.file_utils import (  # noqa: F401 # backward compatibility\n    SINGLE_FILE_COMPRESSION_PROTOCOLS,\n    ArchiveIterable,\n    FilesIterable,\n    _get_extraction_protocol,\n    _get_path_extension,\n    _prepare_path_and_storage_options,\n    is_relative_path,\n    url_or_path_join,\n    xbasename,\n    xdirname,\n    xet_parse,\n    xexists,\n    xgetsize,\n    xglob,\n    xgzip_open,\n    xisdir,\n    xisfile,\n    xjoin,\n    xlistdir,\n    xnumpy_load,\n    xopen,\n    xpandas_read_csv,\n    xpandas_read_excel,\n    xPath,\n    xpyarrow_parquet_read_table,\n    xrelpath,\n    xsio_loadmat,\n    xsplit,\n    xsplitext,\n    xwalk,\n    xxml_dom_minidom_parse,\n)\nfrom ..utils.logging import get_logger\nfrom ..utils.py_utils import map_nested\nfrom .download_config import DownloadConfig\n\n\nlogger = get_logger(__name__)\n\n\nclass StreamingDownloadManager:\n    \"\"\"\n    Download manager that uses the \"::\" separator to navigate through (possibly remote) compressed archives.\n    Contrary to the regular `DownloadManager`, the `download` and `extract` methods don't actually download nor extract\n    data, but they rather return the path or url that could be opened using the `xopen` function which extends the\n    built-in `open` function to stream data from remote files.\n    \"\"\"\n\n    is_streaming = True\n\n    def __init__(\n        self,\n        dataset_name: Optional[str] = None,\n        data_dir: Optional[str] = None,\n        download_config: Optional[DownloadConfig] = None,\n        base_path: Optional[str] = None,\n    ):\n        self._dataset_name = dataset_name\n        self._data_dir = data_dir\n        self._base_path = base_path or os.path.abspath(\".\")\n        self.download_config = download_config or DownloadConfig()\n        self.downloaded_size = None\n        self.record_checksums = False\n\n    @property\n   
 def manual_dir(self):\n        return self._data_dir\n\n    def download(self, url_or_urls):\n        \"\"\"Normalize URL(s) of files to stream data from.\n        This is the lazy version of `DownloadManager.download` for streaming.\n\n        Args:\n            url_or_urls (`str` or `list` or `dict`):\n                URL(s) of files to stream data from. Each url is a `str`.\n\n        Returns:\n            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input url_or_urls.\n\n        Example:\n\n        ```py\n        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')\n        ```\n        \"\"\"\n        url_or_urls = map_nested(self._download_single, url_or_urls, map_tuple=True)\n        return url_or_urls\n\n    def _download_single(self, urlpath: str) -> str:\n        urlpath = str(urlpath)\n        if is_relative_path(urlpath):\n            # append the relative path to the base_path\n            urlpath = url_or_path_join(self._base_path, urlpath)\n        return urlpath\n\n    def extract(self, url_or_urls):\n        \"\"\"Add extraction protocol for given url(s) for streaming.\n\n        This is the lazy version of `DownloadManager.extract` for streaming.\n\n        Args:\n            url_or_urls (`str` or `list` or `dict`):\n                URL(s) of files to stream data from. 
Each url is a `str`.\n\n        Returns:\n            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.\n\n        Example:\n\n        ```py\n        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')\n        >>> extracted_files = dl_manager.extract(downloaded_files)\n        ```\n        \"\"\"\n        urlpaths = map_nested(self._extract, url_or_urls, map_tuple=True)\n        return urlpaths\n\n    def _extract(self, urlpath: str) -> str:\n        urlpath = str(urlpath)\n        # get inner file: zip://train-00000.json.gz::https://foo.bar/data.zip -> zip://train-00000.json.gz\n        protocol = _get_extraction_protocol(urlpath, download_config=self.download_config)\n        path = urlpath.split(\"::\")[0]\n        extension = _get_path_extension(path)\n        if extension in [\"tgz\", \"tar\"] or path.endswith((\".tar.gz\", \".tar.bz2\", \".tar.xz\")):\n            raise NotImplementedError(\n                f\"Extraction protocol for TAR archives like '{urlpath}' is not implemented in streaming mode. 
\"\n                f\"Please use `dl_manager.iter_archive` instead.\\n\\n\"\n                f\"Example usage:\\n\\n\"\n                f\"\\turl = dl_manager.download(url)\\n\"\n                f\"\\ttar_archive_iterator = dl_manager.iter_archive(url)\\n\\n\"\n                f\"\\tfor filename, file in tar_archive_iterator:\\n\"\n                f\"\\t\\t...\"\n            )\n        if protocol is None:\n            # no extraction\n            return urlpath\n        elif protocol in SINGLE_FILE_COMPRESSION_PROTOCOLS:\n            # there is one single file which is the uncompressed file\n            inner_file = os.path.basename(urlpath.split(\"::\")[0])\n            inner_file = inner_file[: inner_file.rindex(\".\")] if \".\" in inner_file else inner_file\n            return f\"{protocol}://{inner_file}::{urlpath}\"\n        else:\n            return f\"{protocol}://::{urlpath}\"\n\n    def download_and_extract(self, url_or_urls):\n        \"\"\"Prepare given `url_or_urls` for streaming (add extraction protocol).\n\n        This is the lazy version of `DownloadManager.download_and_extract` for streaming.\n\n        Is equivalent to:\n\n        ```\n        urls = dl_manager.extract(dl_manager.download(url_or_urls))\n        ```\n\n        Args:\n            url_or_urls (`str` or `list` or `dict`):\n                URL(s) to stream from data from. 
Each url is a `str`.\n\n        Returns:\n            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.\n        \"\"\"\n        return self.extract(self.download(url_or_urls))\n\n    def iter_archive(self, urlpath_or_buf: Union[str, io.BufferedReader]) -> Iterable[tuple]:\n        \"\"\"Iterate over files within an archive.\n\n        Args:\n            urlpath_or_buf (`str` or `io.BufferedReader`):\n                Archive path or archive binary file object.\n\n        Yields:\n            `tuple[str, io.BufferedReader]`:\n                2-tuple (path_within_archive, file_object).\n                File object is opened in binary mode.\n\n        Example:\n\n        ```py\n        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')\n        >>> files = dl_manager.iter_archive(archive)\n        ```\n        \"\"\"\n\n        if hasattr(urlpath_or_buf, \"read\"):\n            return ArchiveIterable.from_buf(urlpath_or_buf)\n        else:\n            return ArchiveIterable.from_urlpath(urlpath_or_buf, download_config=self.download_config)\n\n    def iter_files(self, urlpaths: Union[str, list[str]]) -> Iterable[str]:\n        \"\"\"Iterate over files.\n\n        Args:\n            urlpaths (`str` or `list` of `str`):\n                Root paths.\n\n        Yields:\n            str: File URL path.\n\n        Example:\n\n        ```py\n        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/main/data/train.zip')\n        >>> files = dl_manager.iter_files(files)\n        ```\n        \"\"\"\n        return FilesIterable.from_urlpaths(urlpaths, download_config=self.download_config)\n\n    def manage_extracted_files(self):\n        pass\n\n    def get_recorded_sizes_checksums(self):\n        pass\n"
  },
  {
    "path": "src/datasets/exceptions.py",
    "content": "# SPDX-License-Identifier: Apache-2.0\n# Copyright 2023 The HuggingFace Authors.\nfrom typing import Any, Optional, Union\n\nfrom huggingface_hub import HfFileSystem\n\nfrom . import config\nfrom .table import CastError\nfrom .utils.track import TrackedIterableFromGenerator, tracked_list, tracked_str\n\n\nclass DatasetsError(Exception):\n    \"\"\"Base class for exceptions in this library.\"\"\"\n\n\nclass DefunctDatasetError(DatasetsError):\n    \"\"\"The dataset has been defunct.\"\"\"\n\n\nclass FileNotFoundDatasetsError(DatasetsError, FileNotFoundError):\n    \"\"\"FileNotFoundError raised by this library.\"\"\"\n\n\nclass DataFilesNotFoundError(FileNotFoundDatasetsError):\n    \"\"\"No (supported) data files found.\"\"\"\n\n\nclass DatasetNotFoundError(FileNotFoundDatasetsError):\n    \"\"\"Dataset not found.\n\n    Raised when trying to access:\n    - a missing dataset, or\n    - a private/gated dataset and the user is not authenticated.\n    \"\"\"\n\n\nclass DatasetBuildError(DatasetsError):\n    pass\n\n\nclass ManualDownloadError(DatasetBuildError):\n    pass\n\n\nclass FileFormatError(DatasetBuildError):\n    pass\n\n\nclass DatasetGenerationError(DatasetBuildError):\n    pass\n\n\nclass DatasetGenerationCastError(DatasetGenerationError):\n    @classmethod\n    def from_cast_error(\n        cls,\n        cast_error: CastError,\n        builder_name: str,\n        gen_kwargs: dict[str, Any],\n        token: Optional[Union[bool, str]],\n    ) -> \"DatasetGenerationCastError\":\n        explanation_message = (\n            f\"\\n\\nAll the data files must have the same columns, but at some point {cast_error.details()}\"\n        )\n        formatted_tracked_gen_kwargs: list[str] = []\n        for gen_kwarg in gen_kwargs.values():\n            if not isinstance(gen_kwarg, (tracked_str, tracked_list, TrackedIterableFromGenerator)):\n                continue\n            while (\n                isinstance(gen_kwarg, (tracked_list, 
TrackedIterableFromGenerator)) and gen_kwarg.last_item is not None\n            ):\n                gen_kwarg = gen_kwarg.last_item\n            if isinstance(gen_kwarg, tracked_str):\n                gen_kwarg = gen_kwarg.get_origin()\n            if isinstance(gen_kwarg, str) and gen_kwarg.startswith(\"hf://\"):\n                resolved_path = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token).resolve_path(gen_kwarg)\n                gen_kwarg = \"hf://\" + resolved_path.unresolve()\n                if \"@\" + resolved_path.revision in gen_kwarg:\n                    gen_kwarg = (\n                        gen_kwarg.replace(\"@\" + resolved_path.revision, \"\", 1)\n                        + f\" (at revision {resolved_path.revision})\"\n                    )\n            formatted_tracked_gen_kwargs.append(str(gen_kwarg))\n        if formatted_tracked_gen_kwargs:\n            explanation_message += f\"\\n\\nThis happened while the {builder_name} dataset builder was generating data using\\n\\n{', '.join(formatted_tracked_gen_kwargs)}\"\n        help_message = \"\\n\\nPlease either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)\"\n        return cls(\"An error occurred while generating the dataset\" + explanation_message + help_message)\n\n\nclass ChecksumVerificationError(DatasetsError):\n    \"\"\"Error raised during checksums verifications of downloaded files.\"\"\"\n\n\nclass UnexpectedDownloadedFileError(ChecksumVerificationError):\n    \"\"\"Some downloaded files were not expected.\"\"\"\n\n\nclass ExpectedMoreDownloadedFilesError(ChecksumVerificationError):\n    \"\"\"Some files were supposed to be downloaded but were not.\"\"\"\n\n\nclass NonMatchingChecksumError(ChecksumVerificationError):\n    \"\"\"The downloaded file checksum don't match the expected checksum.\"\"\"\n\n\nclass 
SplitsVerificationError(DatasetsError):\n    \"\"\"Error raised during splits verifications.\"\"\"\n\n\nclass UnexpectedSplitsError(SplitsVerificationError):\n    \"\"\"The expected splits of the downloaded file are missing.\"\"\"\n\n\nclass ExpectedMoreSplitsError(SplitsVerificationError):\n    \"\"\"Some recorded splits are missing.\"\"\"\n\n\nclass NonMatchingSplitsSizesError(SplitsVerificationError):\n    \"\"\"The splits sizes don't match the expected splits sizes.\"\"\"\n"
  },
  {
    "path": "src/datasets/features/__init__.py",
    "content": "__all__ = [\n    \"Audio\",\n    \"Array2D\",\n    \"Array3D\",\n    \"Array4D\",\n    \"Array5D\",\n    \"ClassLabel\",\n    \"Features\",\n    \"Json\",\n    \"LargeList\",\n    \"List\",\n    \"Sequence\",\n    \"Value\",\n    \"Image\",\n    \"Translation\",\n    \"TranslationVariableLanguages\",\n    \"Video\",\n    \"Pdf\",\n    \"Nifti\",\n]\nfrom .audio import Audio\nfrom .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, Json, LargeList, List, Sequence, Value\nfrom .image import Image\nfrom .nifti import Nifti\nfrom .pdf import Pdf\nfrom .translation import Translation, TranslationVariableLanguages\nfrom .video import Video\n"
  },
  {
    "path": "src/datasets/features/_torchcodec.py",
    "content": "import numpy as np\nfrom torchcodec.decoders import AudioDecoder as _AudioDecoder\n\n\nclass AudioDecoder(_AudioDecoder):\n    def __getitem__(self, key: str):\n        if key == \"array\":\n            y = self.get_all_samples().data.cpu().numpy()\n            return np.mean(y, axis=tuple(range(y.ndim - 1))) if y.ndim > 1 else y\n        elif key == \"sampling_rate\":\n            return self.get_samples_played_in_range(0, 0).sample_rate\n        elif hasattr(super(), \"__getitem__\"):\n            return super().__getitem__(key)\n        else:\n            raise TypeError(\"'torchcodec.decoders.AudioDecoder' object is not subscriptable\")\n"
  },
  {
    "path": "src/datasets/features/audio.py",
    "content": "import os\nfrom dataclasses import dataclass, field\nfrom io import BytesIO\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, ClassVar, Optional, Union\n\nimport numpy as np\nimport pyarrow as pa\n\nfrom .. import config\nfrom ..download.download_config import DownloadConfig\nfrom ..table import array_cast\nfrom ..utils.file_utils import is_local_path, xopen\nfrom ..utils.py_utils import no_op_if_value_is_null, string_to_dict\n\n\nif TYPE_CHECKING:\n    from torchcodec.decoders import AudioDecoder\n\n    from .features import FeatureType\n\n\n@dataclass\nclass Audio:\n    \"\"\"Audio [`Feature`] to extract audio data from an audio file.\n\n    Input: The Audio feature accepts as input:\n    - A `str`: Absolute path to the audio file (i.e. random access is allowed).\n    - A `pathlib.Path`: path to the audio file (i.e. random access is allowed).\n    - A `dict` with the keys:\n\n        - `path`: String with relative path of the audio file to the archive file.\n        - `bytes`: Bytes content of the audio file.\n\n      This is useful for parquet or webdataset files which embed audio files.\n\n    - A `dict` with the keys:\n\n        - `array`: Array containing the audio sample\n        - `sampling_rate`: Integer corresponding to the sampling rate of the audio sample.\n\n    - A `torchcodec.decoders.AudioDecoder`: torchcodec audio decoder object.\n\n    Output: The Audio features output data as `torchcodec.decoders.AudioDecoder` objects, with additional keys:\n\n    - `array`: Array containing the audio sample\n    - `sampling_rate`: Integer corresponding to the sampling rate of the audio sample.\n\n    Args:\n        sampling_rate (`int`, *optional*):\n            Target sampling rate. If `None`, the native sampling rate is used.\n        num_channels (`int`, *optional*):\n             The desired number of channels of the samples. 
By default, the number of channels of the source is used.\n             Audio decoding will return samples with shape (num_channels, num_samples)\n             Currently `None` (number of channels of the source, default), `1` (mono) or `2` (stereo) channels are supported.\n             The `num_channels` argument is passed to `torchcodec.decoders.AudioDecoder`.\n\n             <Added version=\"4.4.0\"/>\n        decode (`bool`, defaults to `True`):\n            Whether to decode the audio data. If `False`,\n            returns the underlying dictionary in the format `{\"path\": audio_path, \"bytes\": audio_bytes}`.\n        stream_index (`int`, *optional*):\n            The streaming index to use from the file. If `None` defaults to the \"best\" index.\n\n    Example:\n\n    ```py\n    >>> from datasets import load_dataset, Audio\n    >>> ds = load_dataset(\"PolyAI/minds14\", name=\"en-US\", split=\"train\")\n    >>> ds = ds.cast_column(\"audio\", Audio(sampling_rate=44100, num_channels=2))\n    >>> ds[0][\"audio\"]\n    <datasets.features._torchcodec.AudioDecoder object at 0x11642b6a0>\n    >>> audio = ds[0][\"audio\"]\n    >>> audio.get_samples_played_in_range(0, 10)\n    AudioSamples:\n        data (shape): torch.Size([2, 110592])\n        pts_seconds: 0.0\n        duration_seconds: 2.507755102040816\n        sample_rate: 44100\n    ```\n    \"\"\"\n\n    sampling_rate: Optional[int] = None\n    decode: bool = True\n    num_channels: Optional[int] = None\n    stream_index: Optional[int] = None\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    dtype: ClassVar[str] = \"dict\"\n    pa_type: ClassVar[Any] = pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})\n    _type: str = field(default=\"Audio\", init=False, repr=False)\n\n    def __call__(self):\n        return self.pa_type\n\n    def encode_example(self, value: Union[str, bytes, bytearray, dict, \"AudioDecoder\"]) -> dict:\n        \"\"\"Encode example into a 
format for Arrow.\n\n        Args:\n            value (`str`, `bytes`,`bytearray`,`dict`, `AudioDecoder`):\n                Data passed as input to Audio feature.\n\n        Returns:\n            `dict`\n        \"\"\"\n        try:\n            import torch\n            from torchcodec.encoders import AudioEncoder  # needed to write audio files\n        except ImportError as err:\n            raise ImportError(\"To support encoding audio data, please install 'torchcodec'.\") from err\n\n        if value is None:\n            raise ValueError(\"value must be provided\")\n\n        if config.TORCHCODEC_AVAILABLE:\n            from torchcodec.decoders import AudioDecoder\n\n        else:\n            AudioDecoder = None\n\n        if isinstance(value, str):\n            return {\"bytes\": None, \"path\": value}\n        elif isinstance(value, Path):\n            return {\"bytes\": None, \"path\": str(value.absolute())}\n        elif isinstance(value, (bytes, bytearray)):\n            return {\"bytes\": value, \"path\": None}\n        elif AudioDecoder is not None and isinstance(value, AudioDecoder):\n            return encode_torchcodec_audio(value)\n        elif \"array\" in value:\n            # convert the audio array to wav bytes\n            buffer = BytesIO()\n            AudioEncoder(\n                torch.from_numpy(value[\"array\"].astype(np.float32)), sample_rate=value[\"sampling_rate\"]\n            ).to_file_like(buffer, format=\"wav\", num_channels=self.num_channels)\n            return {\"bytes\": buffer.getvalue(), \"path\": None}\n        elif value.get(\"path\") is not None and os.path.isfile(value[\"path\"]):\n            # we set \"bytes\": None to not duplicate the data if they're already available locally\n            if value[\"path\"].endswith(\"pcm\"):\n                # \"PCM\" only has raw audio bytes\n                if value.get(\"sampling_rate\") is None:\n                    # At least, If you want to convert \"PCM-byte\" to 
\"WAV-byte\", you have to know sampling rate\n                    raise KeyError(\"To use PCM files, please specify a 'sampling_rate' in Audio object\")\n                if value.get(\"bytes\"):\n                    # If we already had PCM-byte, we don`t have to make \"read file, make bytes\" (just use it!)\n                    bytes_value = np.frombuffer(value[\"bytes\"], dtype=np.int16).astype(np.float32) / 32767\n                else:\n                    bytes_value = np.memmap(value[\"path\"], dtype=\"h\", mode=\"r\").astype(np.float32) / 32767\n\n                buffer = BytesIO()\n                AudioEncoder(torch.from_numpy(bytes_value), sample_rate=value[\"sampling_rate\"]).to_file_like(\n                    buffer, format=\"wav\", num_channels=self.num_channels\n                )\n                return {\"bytes\": buffer.getvalue(), \"path\": None}\n            else:\n                return {\"bytes\": None, \"path\": value.get(\"path\")}\n        elif value.get(\"bytes\") is not None or value.get(\"path\") is not None:\n            # store the audio bytes, and path is used to infer the audio format using the file extension\n            return {\"bytes\": value.get(\"bytes\"), \"path\": value.get(\"path\")}\n        else:\n            raise ValueError(\n                f\"An audio sample should have one of 'path' or 'bytes' but they are missing or None in {value}.\"\n            )\n\n    def decode_example(\n        self, value: dict, token_per_repo_id: Optional[dict[str, Union[str, bool, None]]] = None\n    ) -> \"AudioDecoder\":\n        \"\"\"Decode example audio file into audio data.\n\n        Args:\n            value (`dict`):\n                A dictionary with keys:\n\n                - `path`: String with relative audio file path.\n                - `bytes`: Bytes of the audio file.\n            token_per_repo_id (`dict`, *optional*):\n                To access and decode\n                audio files from private repositories on the Hub, you can 
pass\n                a dictionary repo_id (`str`) -> token (`bool` or `str`)\n\n        Returns:\n            `torchcodec.decoders.AudioDecoder`\n        \"\"\"\n        if config.TORCHCODEC_AVAILABLE:\n            from ._torchcodec import AudioDecoder\n        else:\n            raise ImportError(\"To support decoding audio data, please install 'torchcodec'.\")\n\n        if not self.decode:\n            raise RuntimeError(\"Decoding is disabled for this feature. Please use Audio(decode=True) instead.\")\n\n        path, bytes = (value[\"path\"], value[\"bytes\"]) if value[\"bytes\"] is not None else (value[\"path\"], None)\n        if path is None and bytes is None:\n            raise ValueError(f\"An audio sample should have one of 'path' or 'bytes' but both are None in {value}.\")\n\n        if bytes is None and is_local_path(path):\n            audio = AudioDecoder(\n                path, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=self.num_channels\n            )\n\n        elif bytes is None:\n            token_per_repo_id = token_per_repo_id or {}\n            source_url = path.split(\"::\")[-1]\n            pattern = (\n                config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL\n            )\n            source_url_fields = string_to_dict(source_url, pattern)\n            token = token_per_repo_id.get(source_url_fields[\"repo_id\"]) if source_url_fields is not None else None\n\n            download_config = DownloadConfig(token=token)\n            f = xopen(path, \"rb\", download_config=download_config)\n            audio = AudioDecoder(\n                f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=self.num_channels\n            )\n\n        else:\n            audio = AudioDecoder(\n                bytes, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=self.num_channels\n            )\n        
audio._hf_encoded = {\"path\": path, \"bytes\": bytes}\n        audio.metadata.path = path\n        return audio\n\n    def flatten(self) -> Union[\"FeatureType\", dict[str, \"FeatureType\"]]:\n        \"\"\"If in the decodable state, raise an error, otherwise flatten the feature into a dictionary.\"\"\"\n        from .features import Value\n\n        if self.decode:\n            raise ValueError(\"Cannot flatten a decoded Audio feature.\")\n        return {\n            \"bytes\": Value(\"binary\"),\n            \"path\": Value(\"string\"),\n        }\n\n    def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray]) -> pa.StructArray:\n        \"\"\"Cast an Arrow array to the Audio arrow storage type.\n        The Arrow types that can be converted to the Audio pyarrow storage type are:\n\n        - `pa.string()` - it must contain the \"path\" data\n        - `pa.binary()` - it must contain the audio bytes\n        - `pa.struct({\"bytes\": pa.binary()})`\n        - `pa.struct({\"path\": pa.string()})`\n        - `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`  - order doesn't matter\n\n        Args:\n            storage (`Union[pa.StringArray, pa.StructArray]`):\n                PyArrow array to cast.\n\n        Returns:\n            `pa.StructArray`: Array in the Audio arrow storage type, that is\n                `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`\n        \"\"\"\n        if pa.types.is_string(storage.type):\n            bytes_array = pa.array([None] * len(storage), type=pa.binary())\n            storage = pa.StructArray.from_arrays([bytes_array, storage], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_large_binary(storage.type):\n            storage = array_cast(\n                storage, pa.binary()\n            )  # this can fail in case of big audios, paths should be used instead\n            path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = 
pa.StructArray.from_arrays([storage, path_array], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_binary(storage.type):\n            path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays([storage, path_array], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_struct(storage.type) and storage.type.get_all_field_indices(\"array\"):\n            storage = pa.array(\n                [Audio().encode_example(x) if x is not None else None for x in storage.to_numpy(zero_copy_only=False)]\n            )\n        elif pa.types.is_struct(storage.type):\n            if storage.type.get_field_index(\"bytes\") >= 0:\n                bytes_array = storage.field(\"bytes\")\n            else:\n                bytes_array = pa.array([None] * len(storage), type=pa.binary())\n            if storage.type.get_field_index(\"path\") >= 0:\n                path_array = storage.field(\"path\")\n            else:\n                path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays([bytes_array, path_array], [\"bytes\", \"path\"], mask=storage.is_null())\n        return array_cast(storage, self.pa_type)\n\n    def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.StructArray:\n        \"\"\"Embed audio files into the Arrow array.\n\n        Args:\n            storage (`pa.StructArray`):\n                PyArrow array to embed.\n\n        Returns:\n            `pa.StructArray`: Array in the Audio arrow storage type, that is\n                `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`.\n        \"\"\"\n        if token_per_repo_id is None:\n            token_per_repo_id = {}\n\n        @no_op_if_value_is_null\n        def path_to_bytes(path):\n            source_url = path.split(\"::\")[-1]\n            pattern = (\n                config.HUB_DATASETS_URL if 
source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL\n            )\n            source_url_fields = string_to_dict(source_url, pattern)\n            token = token_per_repo_id.get(source_url_fields[\"repo_id\"]) if source_url_fields is not None else None\n            download_config = DownloadConfig(token=token)\n            with xopen(path, \"rb\", download_config=download_config) as f:\n                return f.read()\n\n        bytes_array = pa.array(\n            [\n                (path_to_bytes(x[\"path\"]) if x[\"bytes\"] is None else x[\"bytes\"]) if x is not None else None\n                for x in storage.to_pylist()\n            ],\n            type=pa.binary(),\n        )\n        path_array = pa.array(\n            [os.path.basename(path) if path is not None else None for path in storage.field(\"path\").to_pylist()],\n            type=pa.string(),\n        )\n        storage = pa.StructArray.from_arrays([bytes_array, path_array], [\"bytes\", \"path\"], mask=bytes_array.is_null())\n        return array_cast(storage, self.pa_type)\n\n\ndef encode_torchcodec_audio(audio: \"AudioDecoder\") -> dict:\n    if hasattr(audio, \"_hf_encoded\"):\n        return audio._hf_encoded\n    else:\n        try:\n            from torchcodec.encoders import AudioEncoder  # needed to write audio files\n        except ImportError as err:\n            raise ImportError(\"To support encoding audio data, please install 'torchcodec'.\") from err\n\n        samples = audio.get_all_samples()\n        buffer = BytesIO()\n        num_channels = samples.data.shape[0]\n        AudioEncoder(samples.data.cpu(), sample_rate=samples.sample_rate).to_file_like(\n            buffer, format=\"wav\", num_channels=num_channels\n        )\n        return {\"bytes\": buffer.getvalue(), \"path\": None}\n"
  },
  {
    "path": "src/datasets/features/features.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"This class handle features definition in datasets and some utilities to display table type.\"\"\"\n\nimport copy\nimport json\nimport re\nimport sys\nfrom collections.abc import Iterable, Mapping\nfrom collections.abc import Sequence as SequenceABC\nfrom collections.abc import Sequence as Sequence_\nfrom dataclasses import InitVar, dataclass, field, fields\nfrom functools import reduce, wraps\nfrom operator import mul\nfrom typing import Any, Callable, ClassVar, Literal, Optional, Union\n\nimport numpy as np\nimport pandas as pd\nimport pyarrow as pa\nimport pyarrow.compute as pc\nimport pyarrow.types\nfrom pandas.api.extensions import ExtensionArray as PandasExtensionArray\nfrom pandas.api.extensions import ExtensionDtype as PandasExtensionDtype\n\nfrom .. 
import config\nfrom ..naming import camelcase_to_snakecase, snakecase_to_camelcase\nfrom ..table import array_cast\nfrom ..utils import experimental, logging\nfrom ..utils.json import ujson_dumps, ujson_loads\nfrom ..utils.py_utils import asdict, first_non_null_value, zip_dict\nfrom .audio import Audio\nfrom .image import Image, encode_pil_image\nfrom .nifti import Nifti, encode_nibabel_image\nfrom .pdf import Pdf, encode_pdfplumber_pdf\nfrom .translation import Translation, TranslationVariableLanguages\nfrom .video import Video\n\n\nlogger = logging.get_logger(__name__)\n\n\ndef _arrow_to_datasets_dtype(arrow_type: pa.DataType) -> str:\n    \"\"\"\n    _arrow_to_datasets_dtype takes a pyarrow.DataType and converts it to a datasets string dtype.\n    In effect, `dt == string_to_arrow(_arrow_to_datasets_dtype(dt))`\n    \"\"\"\n    if pyarrow.types.is_null(arrow_type):\n        return \"null\"\n    elif pyarrow.types.is_boolean(arrow_type):\n        return \"bool\"\n    elif pyarrow.types.is_int8(arrow_type):\n        return \"int8\"\n    elif pyarrow.types.is_int16(arrow_type):\n        return \"int16\"\n    elif pyarrow.types.is_int32(arrow_type):\n        return \"int32\"\n    elif pyarrow.types.is_int64(arrow_type):\n        return \"int64\"\n    elif pyarrow.types.is_uint8(arrow_type):\n        return \"uint8\"\n    elif pyarrow.types.is_uint16(arrow_type):\n        return \"uint16\"\n    elif pyarrow.types.is_uint32(arrow_type):\n        return \"uint32\"\n    elif pyarrow.types.is_uint64(arrow_type):\n        return \"uint64\"\n    elif pyarrow.types.is_float16(arrow_type):\n        return \"float16\"  # pyarrow dtype is \"halffloat\"\n    elif pyarrow.types.is_float32(arrow_type):\n        return \"float32\"  # pyarrow dtype is \"float\"\n    elif pyarrow.types.is_float64(arrow_type):\n        return \"float64\"  # pyarrow dtype is \"double\"\n    elif pyarrow.types.is_time32(arrow_type):\n        return 
f\"time32[{pa.type_for_alias(str(arrow_type)).unit}]\"\n    elif pyarrow.types.is_time64(arrow_type):\n        return f\"time64[{pa.type_for_alias(str(arrow_type)).unit}]\"\n    elif pyarrow.types.is_timestamp(arrow_type):\n        if arrow_type.tz is None:\n            return f\"timestamp[{arrow_type.unit}]\"\n        elif arrow_type.tz:\n            return f\"timestamp[{arrow_type.unit}, tz={arrow_type.tz}]\"\n        else:\n            raise ValueError(f\"Unexpected timestamp object {arrow_type}.\")\n    elif pyarrow.types.is_date32(arrow_type):\n        return \"date32\"  # pyarrow dtype is \"date32[day]\"\n    elif pyarrow.types.is_date64(arrow_type):\n        return \"date64\"  # pyarrow dtype is \"date64[ms]\"\n    elif pyarrow.types.is_duration(arrow_type):\n        return f\"duration[{arrow_type.unit}]\"\n    elif pyarrow.types.is_decimal128(arrow_type):\n        return f\"decimal128({arrow_type.precision}, {arrow_type.scale})\"\n    elif pyarrow.types.is_decimal256(arrow_type):\n        return f\"decimal256({arrow_type.precision}, {arrow_type.scale})\"\n    elif pyarrow.types.is_binary(arrow_type):\n        return \"binary\"\n    elif pyarrow.types.is_large_binary(arrow_type):\n        return \"large_binary\"\n    elif pyarrow.types.is_binary_view(arrow_type):\n        return \"binary_view\"\n    elif pyarrow.types.is_string(arrow_type):\n        return \"string\"\n    elif pyarrow.types.is_large_string(arrow_type):\n        return \"large_string\"\n    elif pyarrow.types.is_string_view(arrow_type):\n        return \"string_view\"\n    elif pyarrow.types.is_dictionary(arrow_type):\n        return _arrow_to_datasets_dtype(arrow_type.value_type)\n    else:\n        raise ValueError(f\"Arrow type {arrow_type} does not have a datasets dtype equivalent.\")\n\n\ndef string_to_arrow(datasets_dtype: str) -> pa.DataType:\n    \"\"\"\n    string_to_arrow takes a datasets string dtype and converts it to a pyarrow.DataType.\n\n    In effect, `dt == 
string_to_arrow(_arrow_to_datasets_dtype(dt))`\n\n    This is necessary because the datasets.Value() primitive type is constructed using a string dtype\n\n    Value(dtype=str)\n\n    But Features.type (via `get_nested_type()`) expects to resolve Features into a pyarrow Schema,\n        which means that each Value() must be able to resolve into a corresponding pyarrow.DataType, which is the\n        purpose of this function.\n    \"\"\"\n    if datasets_dtype == \"json\":\n        raise ValueError(\"'json' is not a valid dtype, use the Json() feature instead\")\n\n    def _dtype_error_msg(dtype, pa_dtype, examples=None, urls=None):\n        msg = f\"{dtype} is not a validly formatted string representation of the pyarrow {pa_dtype} type.\"\n        if examples:\n            examples = \", \".join(examples[:-1]) + \" or \" + examples[-1] if len(examples) > 1 else examples[0]\n            msg += f\"\\nValid examples include: {examples}.\"\n        if urls:\n            urls = \", \".join(urls[:-1]) + \" and \" + urls[-1] if len(urls) > 1 else urls[0]\n            msg += f\"\\nFor more information, see: {urls}.\"\n        return msg\n\n    if datasets_dtype in pa.__dict__:\n        return pa.__dict__[datasets_dtype]()\n\n    if (datasets_dtype + \"_\") in pa.__dict__:\n        return pa.__dict__[datasets_dtype + \"_\"]()\n\n    timestamp_matches = re.search(r\"^timestamp\\[(.*)\\]$\", datasets_dtype)\n    if timestamp_matches:\n        timestamp_internals = timestamp_matches.group(1)\n        internals_matches = re.search(r\"^(s|ms|us|ns),\\s*tz=([a-zA-Z0-9/_+\\-:]*)$\", timestamp_internals)\n        if timestamp_internals in [\"s\", \"ms\", \"us\", \"ns\"]:\n            return pa.timestamp(timestamp_internals)\n        elif internals_matches:\n            return pa.timestamp(internals_matches.group(1), internals_matches.group(2))\n        else:\n            raise ValueError(\n                _dtype_error_msg(\n                    datasets_dtype,\n                    
\"timestamp\",\n                    examples=[\"timestamp[us]\", \"timestamp[us, tz=America/New_York]\"],\n                    urls=[\"https://arrow.apache.org/docs/python/generated/pyarrow.timestamp.html\"],\n                )\n            )\n\n    duration_matches = re.search(r\"^duration\\[(.*)\\]$\", datasets_dtype)\n    if duration_matches:\n        duration_internals = duration_matches.group(1)\n        if duration_internals in [\"s\", \"ms\", \"us\", \"ns\"]:\n            return pa.duration(duration_internals)\n        else:\n            raise ValueError(\n                _dtype_error_msg(\n                    datasets_dtype,\n                    \"duration\",\n                    examples=[\"duration[s]\", \"duration[us]\"],\n                    urls=[\"https://arrow.apache.org/docs/python/generated/pyarrow.duration.html\"],\n                )\n            )\n\n    time_matches = re.search(r\"^time(.*)\\[(.*)\\]$\", datasets_dtype)\n    if time_matches:\n        time_internals_bits = time_matches.group(1)\n        if time_internals_bits == \"32\":\n            time_internals_unit = time_matches.group(2)\n            if time_internals_unit in [\"s\", \"ms\"]:\n                return pa.time32(time_internals_unit)\n            else:\n                raise ValueError(\n                    f\"{time_internals_unit} is not a valid unit for the pyarrow time32 type. Supported units: s (second) and ms (millisecond).\"\n                )\n        elif time_internals_bits == \"64\":\n            time_internals_unit = time_matches.group(2)\n            if time_internals_unit in [\"us\", \"ns\"]:\n                return pa.time64(time_internals_unit)\n            else:\n                raise ValueError(\n                    f\"{time_internals_unit} is not a valid unit for the pyarrow time64 type. 
Supported units: us (microsecond) and ns (nanosecond).\"\n                )\n        else:\n            raise ValueError(\n                _dtype_error_msg(\n                    datasets_dtype,\n                    \"time\",\n                    examples=[\"time32[s]\", \"time64[us]\"],\n                    urls=[\n                        \"https://arrow.apache.org/docs/python/generated/pyarrow.time32.html\",\n                        \"https://arrow.apache.org/docs/python/generated/pyarrow.time64.html\",\n                    ],\n                )\n            )\n\n    decimal_matches = re.search(r\"^decimal(.*)\\((.*)\\)$\", datasets_dtype)\n    if decimal_matches:\n        decimal_internals_bits = decimal_matches.group(1)\n        if decimal_internals_bits == \"128\":\n            decimal_internals_precision_and_scale = re.search(r\"^(\\d+),\\s*(-?\\d+)$\", decimal_matches.group(2))\n            if decimal_internals_precision_and_scale:\n                precision = decimal_internals_precision_and_scale.group(1)\n                scale = decimal_internals_precision_and_scale.group(2)\n                return pa.decimal128(int(precision), int(scale))\n            else:\n                raise ValueError(\n                    _dtype_error_msg(\n                        datasets_dtype,\n                        \"decimal128\",\n                        examples=[\"decimal128(10, 2)\", \"decimal128(4, -2)\"],\n                        urls=[\"https://arrow.apache.org/docs/python/generated/pyarrow.decimal128.html\"],\n                    )\n                )\n        elif decimal_internals_bits == \"256\":\n            decimal_internals_precision_and_scale = re.search(r\"^(\\d+),\\s*(-?\\d+)$\", decimal_matches.group(2))\n            if decimal_internals_precision_and_scale:\n                precision = decimal_internals_precision_and_scale.group(1)\n                scale = decimal_internals_precision_and_scale.group(2)\n                return pa.decimal256(int(precision), 
int(scale))\n            else:\n                raise ValueError(\n                    _dtype_error_msg(\n                        datasets_dtype,\n                        \"decimal256\",\n                        examples=[\"decimal256(30, 2)\", \"decimal256(38, -4)\"],\n                        urls=[\"https://arrow.apache.org/docs/python/generated/pyarrow.decimal256.html\"],\n                    )\n                )\n        else:\n            raise ValueError(\n                _dtype_error_msg(\n                    datasets_dtype,\n                    \"decimal\",\n                    examples=[\"decimal128(12, 3)\", \"decimal256(40, 6)\"],\n                    urls=[\n                        \"https://arrow.apache.org/docs/python/generated/pyarrow.decimal128.html\",\n                        \"https://arrow.apache.org/docs/python/generated/pyarrow.decimal256.html\",\n                    ],\n                )\n            )\n\n    raise ValueError(\n        f\"Neither {datasets_dtype} nor {datasets_dtype + '_'} seems to be a pyarrow data type. 
\"\n        f\"Please make sure to use a correct data type, see: \"\n        f\"https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions\"\n    )\n\n\ndef _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool, optimize_list_casting: bool) -> tuple[Any, bool]:\n    \"\"\"\n    Cast pytorch/tensorflow/pandas objects to python numpy array/lists.\n    It works recursively.\n\n    If `optimize_list_casting` is True, to avoid iterating over possibly long lists, it first checks (recursively) if the first element that is not None or empty (if it is a sequence) has to be casted.\n    If the first element needs to be casted, then all the elements of the list will be casted, otherwise they'll stay the same.\n    This trick allows to cast objects that contain tokenizers outputs without iterating over every single token for example.\n\n    Args:\n        obj: the object (nested struct) to cast.\n        only_1d_for_numpy (bool): whether to keep the full multi-dim tensors as multi-dim numpy arrays, or convert them to\n            nested lists of 1-dimensional numpy arrays. 
This can be useful to keep only 1-d arrays to instantiate Arrow arrays.\n            Indeed Arrow only supports converting 1-dimensional array values.\n        optimize_list_casting (bool): whether to optimize list casting by checking the first non-null element to see if it needs to be casted\n            and if it doesn't, not checking the rest of the list elements.\n\n    Returns:\n        casted_obj: the casted object\n        has_changed (bool): True if the object has been changed, False if it is identical\n    \"\"\"\n\n    if config.TF_AVAILABLE and \"tensorflow\" in sys.modules:\n        import tensorflow as tf\n\n    if config.TORCH_AVAILABLE and \"torch\" in sys.modules:\n        import torch\n\n    if config.JAX_AVAILABLE and \"jax\" in sys.modules:\n        import jax.numpy as jnp\n\n    if config.PIL_AVAILABLE and \"PIL\" in sys.modules:\n        import PIL.Image\n\n    if config.PDFPLUMBER_AVAILABLE and \"pdfplumber\" in sys.modules:\n        import pdfplumber\n\n    if config.NIBABEL_AVAILABLE and \"nibabel\" in sys.modules:\n        import nibabel as nib\n\n    if config.TORCHCODEC_AVAILABLE and \"torchcodec\" in sys.modules:\n        from torchcodec.decoders import AudioDecoder, VideoDecoder\n\n    if isinstance(obj, np.ndarray):\n        if obj.ndim == 0:\n            return obj[()], True\n        elif not only_1d_for_numpy or obj.ndim == 1:\n            return obj, False\n        else:\n            return (\n                [\n                    _cast_to_python_objects(\n                        x, only_1d_for_numpy=only_1d_for_numpy, optimize_list_casting=optimize_list_casting\n                    )[0]\n                    for x in obj\n                ],\n                True,\n            )\n    elif config.TORCH_AVAILABLE and \"torch\" in sys.modules and isinstance(obj, torch.Tensor):\n        if obj.dtype == torch.bfloat16:\n            return _cast_to_python_objects(\n                obj.detach().to(torch.float).cpu().numpy(),\n               
 only_1d_for_numpy=only_1d_for_numpy,\n                optimize_list_casting=optimize_list_casting,\n            )[0], True\n        if obj.ndim == 0:\n            return obj.detach().cpu().numpy()[()], True\n        elif not only_1d_for_numpy or obj.ndim == 1:\n            return obj.detach().cpu().numpy(), True\n        else:\n            return (\n                [\n                    _cast_to_python_objects(\n                        x, only_1d_for_numpy=only_1d_for_numpy, optimize_list_casting=optimize_list_casting\n                    )[0]\n                    for x in obj.detach().cpu().numpy()\n                ],\n                True,\n            )\n    elif config.TF_AVAILABLE and \"tensorflow\" in sys.modules and isinstance(obj, tf.Tensor):\n        if obj.ndim == 0:\n            return obj.numpy()[()], True\n        elif not only_1d_for_numpy or obj.ndim == 1:\n            return obj.numpy(), True\n        else:\n            return (\n                [\n                    _cast_to_python_objects(\n                        x, only_1d_for_numpy=only_1d_for_numpy, optimize_list_casting=optimize_list_casting\n                    )[0]\n                    for x in obj.numpy()\n                ],\n                True,\n            )\n    elif config.JAX_AVAILABLE and \"jax\" in sys.modules and isinstance(obj, jnp.ndarray):\n        if obj.ndim == 0:\n            return np.asarray(obj)[()], True\n        elif not only_1d_for_numpy or obj.ndim == 1:\n            return np.asarray(obj), True\n        else:\n            return (\n                [\n                    _cast_to_python_objects(\n                        x, only_1d_for_numpy=only_1d_for_numpy, optimize_list_casting=optimize_list_casting\n                    )[0]\n                    for x in np.asarray(obj)\n                ],\n                True,\n            )\n    elif config.PIL_AVAILABLE and \"PIL\" in sys.modules and isinstance(obj, PIL.Image.Image):\n        return encode_pil_image(obj), 
True\n    elif config.PDFPLUMBER_AVAILABLE and \"pdfplumber\" in sys.modules and isinstance(obj, pdfplumber.pdf.PDF):\n        return encode_pdfplumber_pdf(obj), True\n    elif config.NIBABEL_AVAILABLE and \"nibabel\" in sys.modules and isinstance(obj, nib.analyze.AnalyzeImage):\n        return encode_nibabel_image(obj, force_bytes=True), True\n    elif isinstance(obj, pd.Series):\n        return (\n            _cast_to_python_objects(\n                obj.tolist(), only_1d_for_numpy=only_1d_for_numpy, optimize_list_casting=optimize_list_casting\n            )[0],\n            True,\n        )\n    elif isinstance(obj, pd.DataFrame):\n        return (\n            {\n                key: _cast_to_python_objects(\n                    value, only_1d_for_numpy=only_1d_for_numpy, optimize_list_casting=optimize_list_casting\n                )[0]\n                for key, value in obj.to_dict(\"series\").items()\n            },\n            True,\n        )\n    elif isinstance(obj, pd.Timestamp):\n        return obj.to_pydatetime(), True\n    elif isinstance(obj, pd.Timedelta):\n        return obj.to_pytimedelta(), True\n    elif isinstance(obj, Mapping):\n        has_changed = not isinstance(obj, dict)\n        output = {}\n        for k, v in obj.items():\n            casted_v, has_changed_v = _cast_to_python_objects(\n                v, only_1d_for_numpy=only_1d_for_numpy, optimize_list_casting=optimize_list_casting\n            )\n            has_changed |= has_changed_v\n            output[k] = casted_v\n        return output if has_changed else obj, has_changed\n    elif hasattr(obj, \"__array__\"):\n        if np.isscalar(obj):\n            return obj, False\n        else:\n            return (\n                _cast_to_python_objects(\n                    obj.__array__(), only_1d_for_numpy=only_1d_for_numpy, optimize_list_casting=optimize_list_casting\n                )[0],\n                True,\n            )\n    elif isinstance(obj, (list, tuple)):\n        
if len(obj) > 0:\n            for first_elmt in obj:\n                if _check_non_null_non_empty_recursive(first_elmt):\n                    break\n            casted_first_elmt, has_changed_first_elmt = _cast_to_python_objects(\n                first_elmt, only_1d_for_numpy=only_1d_for_numpy, optimize_list_casting=optimize_list_casting\n            )\n            if has_changed_first_elmt or not optimize_list_casting:\n                return (\n                    [\n                        _cast_to_python_objects(\n                            elmt, only_1d_for_numpy=only_1d_for_numpy, optimize_list_casting=optimize_list_casting\n                        )[0]\n                        for elmt in obj\n                    ],\n                    True,\n                )\n            else:\n                if isinstance(obj, (list, tuple)):\n                    return obj, False\n                else:\n                    return list(obj), True\n        else:\n            return obj, False\n    elif config.TORCHCODEC_AVAILABLE and \"torchcodec\" in sys.modules and isinstance(obj, VideoDecoder):\n        v = Video()\n        return v.encode_example(obj), True\n    elif config.TORCHCODEC_AVAILABLE and \"torchcodec\" in sys.modules and isinstance(obj, AudioDecoder):\n        a = Audio()\n        return a.encode_example(obj), True\n    else:\n        return obj, False\n\n\ndef cast_to_python_objects(obj: Any, only_1d_for_numpy=False, optimize_list_casting=True) -> Any:\n    \"\"\"\n    Cast numpy/pytorch/tensorflow/pandas objects to python lists.\n    It works recursively.\n\n    If `optimize_list_casting` is True, To avoid iterating over possibly long lists, it first checks (recursively) if the first element that is not None or empty (if it is a sequence) has to be casted.\n    If the first element needs to be casted, then all the elements of the list will be casted, otherwise they'll stay the same.\n    This trick allows to cast objects that contain tokenizers outputs 
without iterating over every single token for example.\n\n    Args:\n        obj: the object (nested struct) to cast\n        only_1d_for_numpy (bool, default ``False``): whether to keep the full multi-dim tensors as multi-dim numpy arrays, or convert them to\n            nested lists of 1-dimensional numpy arrays. This can be useful to keep only 1-d arrays to instantiate Arrow arrays.\n            Indeed Arrow only supports converting 1-dimensional array values.\n        optimize_list_casting (bool, default ``True``): whether to optimize list casting by checking the first non-null element to see if it needs to be casted\n            and if it doesn't, not checking the rest of the list elements.\n\n    Returns:\n        casted_obj: the casted object\n    \"\"\"\n    return _cast_to_python_objects(\n        obj, only_1d_for_numpy=only_1d_for_numpy, optimize_list_casting=optimize_list_casting\n    )[0]\n\n\n@dataclass(repr=False)\nclass Value:\n    \"\"\"\n    Scalar feature value of a particular data type.\n\n    The possible dtypes of `Value` are as follows:\n    - `null`\n    - `bool`\n    - `int8`\n    - `int16`\n    - `int32`\n    - `int64`\n    - `uint8`\n    - `uint16`\n    - `uint32`\n    - `uint64`\n    - `float16`\n    - `float32` (alias float)\n    - `float64` (alias double)\n    - `time32[(s|ms)]`\n    - `time64[(us|ns)]`\n    - `timestamp[(s|ms|us|ns)]`\n    - `timestamp[(s|ms|us|ns), tz=(tzstring)]`\n    - `date32`\n    - `date64`\n    - `duration[(s|ms|us|ns)]`\n    - `decimal128(precision, scale)`\n    - `decimal256(precision, scale)`\n    - `binary`\n    - `large_binary`\n    - `binary_view`\n    - `string`\n    - `large_string`\n    - `string_view`\n\n    Args:\n        dtype (`str`):\n            Name of the data type.\n\n    Example:\n\n    ```py\n    >>> from datasets import Features\n    >>> features = Features({'stars': Value('int32')})\n    >>> features\n    {'stars': Value('int32')}\n    ```\n    \"\"\"\n\n    dtype: str\n    id: Optional[str] 
= field(default=None, repr=False)\n    # Automatically constructed\n    pa_type: ClassVar[Any] = None\n    _type: str = field(default=\"Value\", init=False, repr=False)\n\n    def __post_init__(self):\n        if self.dtype == \"double\":  # fix inferred type\n            self.dtype = \"float64\"\n        if self.dtype == \"float\":  # fix inferred type\n            self.dtype = \"float32\"\n        self.pa_type = string_to_arrow(self.dtype)\n\n    def __call__(self):\n        return self.pa_type\n\n    def encode_example(self, value):\n        if pa.types.is_boolean(self.pa_type):\n            return bool(value)\n        elif pa.types.is_integer(self.pa_type):\n            return int(value)\n        elif pa.types.is_floating(self.pa_type):\n            return float(value)\n        elif pa.types.is_string(self.pa_type):\n            return str(value)\n        elif pa.types.is_large_string(self.pa_type):\n            return str(value)\n        elif pa.types.is_string_view(self.pa_type):\n            return str(value)\n        else:\n            return value\n\n    def __repr__(self):\n        return f\"{type(self).__name__}('{self.dtype}')\"\n\n\nclass _ArrayXD:\n    def __post_init__(self):\n        self.shape = tuple(self.shape)\n\n    def __call__(self):\n        pa_type = globals()[self.__class__.__name__ + \"ExtensionType\"](self.shape, self.dtype)\n        return pa_type\n\n    def encode_example(self, value):\n        return value\n\n\n@dataclass\nclass Array2D(_ArrayXD):\n    \"\"\"Create a two-dimensional array.\n\n    Args:\n        shape (`tuple`):\n            Size of each dimension.\n        dtype (`str`):\n            Name of the data type.\n\n    Example:\n\n    ```py\n    >>> from datasets import Features\n    >>> features = Features({'x': Array2D(shape=(1, 3), dtype='int32')})\n    ```\n    \"\"\"\n\n    shape: tuple\n    dtype: str\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    _type: str = 
field(default=\"Array2D\", init=False, repr=False)\n\n\n@dataclass\nclass Array3D(_ArrayXD):\n    \"\"\"Create a three-dimensional array.\n\n    Args:\n        shape (`tuple`):\n            Size of each dimension.\n        dtype (`str`):\n            Name of the data type.\n\n    Example:\n\n    ```py\n    >>> from datasets import Features\n    >>> features = Features({'x': Array3D(shape=(1, 2, 3), dtype='int32')})\n    ```\n    \"\"\"\n\n    shape: tuple\n    dtype: str\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    _type: str = field(default=\"Array3D\", init=False, repr=False)\n\n\n@dataclass\nclass Array4D(_ArrayXD):\n    \"\"\"Create a four-dimensional array.\n\n    Args:\n        shape (`tuple`):\n            Size of each dimension.\n        dtype (`str`):\n            Name of the data type.\n\n    Example:\n\n    ```py\n    >>> from datasets import Features\n    >>> features = Features({'x': Array4D(shape=(1, 2, 2, 3), dtype='int32')})\n    ```\n    \"\"\"\n\n    shape: tuple\n    dtype: str\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    _type: str = field(default=\"Array4D\", init=False, repr=False)\n\n\n@dataclass\nclass Array5D(_ArrayXD):\n    \"\"\"Create a five-dimensional array.\n\n    Args:\n        shape (`tuple`):\n            Size of each dimension.\n        dtype (`str`):\n            Name of the data type.\n\n    Example:\n\n    ```py\n    >>> from datasets import Features\n    >>> features = Features({'x': Array5D(shape=(1, 2, 2, 3, 3), dtype='int32')})\n    ```\n    \"\"\"\n\n    shape: tuple\n    dtype: str\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    _type: str = field(default=\"Array5D\", init=False, repr=False)\n\n\nclass _ArrayXDExtensionType(pa.ExtensionType):\n    ndims: Optional[int] = None\n\n    def __init__(self, shape: tuple, dtype: str):\n        if self.ndims is None or self.ndims <= 
1:\n            raise ValueError(\"You must instantiate an array type with a value for dim that is > 1\")\n        if len(shape) != self.ndims:\n            raise ValueError(f\"shape={shape} and ndims={self.ndims} don't match\")\n        for dim in range(1, self.ndims):\n            if shape[dim] is None:\n                raise ValueError(f\"Support only dynamic size on first dimension. Got: {shape}\")\n        self.shape = tuple(shape)\n        self.value_type = dtype\n        self.storage_dtype = self._generate_dtype(self.value_type)\n        pa.ExtensionType.__init__(self, self.storage_dtype, f\"{self.__class__.__module__}.{self.__class__.__name__}\")\n\n    def __arrow_ext_serialize__(self):\n        return json.dumps((self.shape, self.value_type)).encode()\n\n    @classmethod\n    def __arrow_ext_deserialize__(cls, storage_type, serialized):\n        args = json.loads(serialized)\n        return cls(*args)\n\n    # This was added to pa.ExtensionType in pyarrow >= 13.0.0\n    def __reduce__(self):\n        return self.__arrow_ext_deserialize__, (self.storage_type, self.__arrow_ext_serialize__())\n\n    def __hash__(self):\n        return hash((self.__class__, self.shape, self.value_type))\n\n    def __arrow_ext_class__(self):\n        return ArrayExtensionArray\n\n    def _generate_dtype(self, dtype):\n        dtype = string_to_arrow(dtype)\n        for d in reversed(self.shape):\n            dtype = pa.list_(dtype)\n            # Don't specify the size of the list, since fixed length list arrays have issues\n            # being validated after slicing in pyarrow 0.17.1\n        return dtype\n\n    def to_pandas_dtype(self):\n        return PandasArrayExtensionDtype(self.value_type)\n\n\nclass Array2DExtensionType(_ArrayXDExtensionType):\n    ndims = 2\n\n\nclass Array3DExtensionType(_ArrayXDExtensionType):\n    ndims = 3\n\n\nclass Array4DExtensionType(_ArrayXDExtensionType):\n    ndims = 4\n\n\nclass Array5DExtensionType(_ArrayXDExtensionType):\n    ndims = 
5\n\n\n# Register the extension types for deserialization\npa.register_extension_type(Array2DExtensionType((1, 2), \"int64\"))\npa.register_extension_type(Array3DExtensionType((1, 2, 3), \"int64\"))\npa.register_extension_type(Array4DExtensionType((1, 2, 3, 4), \"int64\"))\npa.register_extension_type(Array5DExtensionType((1, 2, 3, 4, 5), \"int64\"))\n\n\ndef _is_zero_copy_only(pa_type: pa.DataType, unnest: bool = False) -> bool:\n    \"\"\"\n    When converting a pyarrow array to a numpy array, we must know whether this could be done in zero-copy or not.\n    This function returns the value of the ``zero_copy_only`` parameter to pass to ``.to_numpy()``, given the type of the pyarrow array.\n\n    # zero copy is available for all primitive types except booleans and temporal types (date, time, timestamp or duration)\n    # primitive types are types for which the physical representation in arrow and in numpy\n    # https://github.com/wesm/arrow/blob/c07b9b48cf3e0bbbab493992a492ae47e5b04cad/python/pyarrow/types.pxi#L821\n    # see https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.to_numpy\n    # and https://issues.apache.org/jira/browse/ARROW-2871?jql=text%20~%20%22boolean%20to_numpy%22\n    \"\"\"\n\n    def _unnest_pa_type(pa_type: pa.DataType) -> pa.DataType:\n        if pa.types.is_list(pa_type):\n            return _unnest_pa_type(pa_type.value_type)\n        return pa_type\n\n    if unnest:\n        pa_type = _unnest_pa_type(pa_type)\n    return pa.types.is_primitive(pa_type) and not (pa.types.is_boolean(pa_type) or pa.types.is_temporal(pa_type))\n\n\nclass ArrayExtensionArray(pa.ExtensionArray):\n    def __array__(self):\n        zero_copy_only = _is_zero_copy_only(self.storage.type, unnest=True)\n        return self.to_numpy(zero_copy_only=zero_copy_only)\n\n    def __getitem__(self, i):\n        return self.storage[i]\n\n    def to_numpy(self, zero_copy_only=True):\n        storage: pa.ListArray = self.storage\n        null_mask = 
storage.is_null().to_numpy(zero_copy_only=False)\n\n        if self.type.shape[0] is not None:\n            size = 1\n            null_indices = np.arange(len(storage))[null_mask] - np.arange(np.sum(null_mask))\n\n            for i in range(self.type.ndims):\n                size *= self.type.shape[i]\n                storage = storage.flatten()\n            numpy_arr = storage.to_numpy(zero_copy_only=zero_copy_only)\n            numpy_arr = numpy_arr.reshape(len(self) - len(null_indices), *self.type.shape)\n\n            if len(null_indices):\n                numpy_arr = np.insert(numpy_arr.astype(np.float64), null_indices, np.nan, axis=0)\n\n        else:\n            shape = self.type.shape\n            ndims = self.type.ndims\n            arrays = []\n            first_dim_offsets = np.array([off.as_py() for off in storage.offsets])\n            for i, is_null in enumerate(null_mask):\n                if is_null:\n                    arrays.append(np.nan)\n                else:\n                    storage_el = storage[i : i + 1]\n                    first_dim = first_dim_offsets[i + 1] - first_dim_offsets[i]\n                    # flatten storage\n                    for _ in range(ndims):\n                        storage_el = storage_el.flatten()\n\n                    numpy_arr = storage_el.to_numpy(zero_copy_only=zero_copy_only)\n                    arrays.append(numpy_arr.reshape(first_dim, *shape[1:]))\n\n            if len(np.unique(np.diff(first_dim_offsets))) > 1:\n                # ragged\n                numpy_arr = np.empty(len(arrays), dtype=object)\n                numpy_arr[:] = arrays\n            else:\n                numpy_arr = np.array(arrays)\n\n        return numpy_arr\n\n    def to_pylist(self, maps_as_pydicts: Optional[Literal[\"lossy\", \"strict\"]] = None):\n        zero_copy_only = _is_zero_copy_only(self.storage.type, unnest=True)\n        numpy_arr = self.to_numpy(zero_copy_only=zero_copy_only)\n        if self.type.shape[0] is 
None and numpy_arr.dtype == object:\n            return [arr.tolist() for arr in numpy_arr.tolist()]\n        else:\n            return numpy_arr.tolist()\n\n\nclass PandasArrayExtensionDtype(PandasExtensionDtype):\n    _metadata = \"value_type\"\n\n    def __init__(self, value_type: Union[\"PandasArrayExtensionDtype\", np.dtype]):\n        self._value_type = value_type\n\n    def __from_arrow__(self, array: Union[pa.Array, pa.ChunkedArray]):\n        if isinstance(array, pa.ChunkedArray):\n            array = array.type.wrap_array(pa.concat_arrays([chunk.storage for chunk in array.chunks]))\n        zero_copy_only = _is_zero_copy_only(array.storage.type, unnest=True)\n        numpy_arr = array.to_numpy(zero_copy_only=zero_copy_only)\n        return PandasArrayExtensionArray(numpy_arr)\n\n    @classmethod\n    def construct_array_type(cls):\n        return PandasArrayExtensionArray\n\n    @property\n    def type(self) -> type:\n        return np.ndarray\n\n    @property\n    def kind(self) -> str:\n        return \"O\"\n\n    @property\n    def name(self) -> str:\n        return f\"array[{self.value_type}]\"\n\n    @property\n    def value_type(self) -> np.dtype:\n        return self._value_type\n\n\nclass PandasArrayExtensionArray(PandasExtensionArray):\n    def __init__(self, data: np.ndarray, copy: bool = False):\n        self._data = data if not copy else np.array(data)\n        self._dtype = PandasArrayExtensionDtype(data.dtype)\n\n    def __array__(self, dtype=None):\n        \"\"\"\n        Convert to NumPy Array.\n        Note that Pandas expects a 1D array when dtype is set to object.\n        But for other dtypes, the returned shape is the same as the one of ``data``.\n\n        More info about pandas 1D requirement for PandasExtensionArray here:\n        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.api.extensions.ExtensionArray.html#pandas.api.extensions.ExtensionArray\n\n        \"\"\"\n        if dtype == np.dtype(object):\n        
    out = np.empty(len(self._data), dtype=object)\n            for i in range(len(self._data)):\n                out[i] = self._data[i]\n            return out\n        if dtype is None:\n            return self._data\n        else:\n            return self._data.astype(dtype)\n\n    def copy(self, deep: bool = False) -> \"PandasArrayExtensionArray\":\n        return PandasArrayExtensionArray(self._data, copy=True)\n\n    @classmethod\n    def _from_sequence(\n        cls, scalars, dtype: Optional[PandasArrayExtensionDtype] = None, copy: bool = False\n    ) -> \"PandasArrayExtensionArray\":\n        if len(scalars) > 1 and all(\n            isinstance(x, np.ndarray) and x.shape == scalars[0].shape and x.dtype == scalars[0].dtype for x in scalars\n        ):\n            data = np.array(scalars, dtype=dtype if dtype is None else dtype.value_type, copy=copy)\n        else:\n            data = np.empty(len(scalars), dtype=object)\n            data[:] = scalars\n        return cls(data, copy=copy)\n\n    @classmethod\n    def _concat_same_type(cls, to_concat: Sequence_[\"PandasArrayExtensionArray\"]) -> \"PandasArrayExtensionArray\":\n        if len(to_concat) > 1 and all(\n            va._data.shape == to_concat[0]._data.shape and va._data.dtype == to_concat[0]._data.dtype\n            for va in to_concat\n        ):\n            data = np.vstack([va._data for va in to_concat])\n        else:\n            data = np.empty(len(to_concat), dtype=object)\n            data[:] = [va._data for va in to_concat]\n        return cls(data, copy=False)\n\n    @property\n    def dtype(self) -> PandasArrayExtensionDtype:\n        return self._dtype\n\n    @property\n    def nbytes(self) -> int:\n        return self._data.nbytes\n\n    def isna(self) -> np.ndarray:\n        return np.array([pd.isna(arr).any() for arr in self._data])\n\n    def __setitem__(self, key: Union[int, slice, np.ndarray], value: Any) -> None:\n        raise NotImplementedError()\n\n    def __getitem__(self, 
item: Union[int, slice, np.ndarray]) -> Union[np.ndarray, \"PandasArrayExtensionArray\"]:\n        if isinstance(item, int):\n            return self._data[item]\n        return PandasArrayExtensionArray(self._data[item], copy=False)\n\n    def take(\n        self, indices: Sequence_[int], allow_fill: bool = False, fill_value: bool = None\n    ) -> \"PandasArrayExtensionArray\":\n        indices: np.ndarray = np.asarray(indices, dtype=int)\n        if allow_fill:\n            fill_value = (\n                self.dtype.na_value if fill_value is None else np.asarray(fill_value, dtype=self.dtype.value_type)\n            )\n            mask = indices == -1\n            if (indices < -1).any():\n                raise ValueError(\"Invalid value in `indices`, must be all >= -1 for `allow_fill` is True\")\n            elif len(self) > 0:\n                pass\n            elif not np.all(mask):\n                raise IndexError(\"Invalid take for empty PandasArrayExtensionArray, must be all -1.\")\n            else:\n                data = np.array([fill_value] * len(indices), dtype=self.dtype.value_type)\n                return PandasArrayExtensionArray(data, copy=False)\n        took = self._data.take(indices, axis=0)\n        if allow_fill and mask.any():\n            took[mask] = [fill_value] * np.sum(mask)\n        return PandasArrayExtensionArray(took, copy=False)\n\n    def __len__(self) -> int:\n        return len(self._data)\n\n    def __eq__(self, other) -> np.ndarray:\n        if not isinstance(other, PandasArrayExtensionArray):\n            raise NotImplementedError(f\"Invalid type to compare to: {type(other)}\")\n        return (self._data == other._data).all()\n\n\ndef pandas_types_mapper(dtype):\n    if isinstance(dtype, _ArrayXDExtensionType):\n        return PandasArrayExtensionDtype(dtype.value_type)\n\n\n@dataclass\nclass ClassLabel:\n    \"\"\"Feature type for integer class labels.\n\n    There are 3 ways to define a `ClassLabel`, which correspond to 
the 3 arguments:\n\n     * `num_classes`: Create 0 to (num_classes-1) labels.\n     * `names`: List of label strings.\n     * `names_file`: File containing the list of labels.\n\n    Under the hood the labels are stored as integers.\n    You can use negative integers to represent unknown/missing labels.\n\n    Args:\n        num_classes (`int`, *optional*):\n            Number of classes. All labels must be < `num_classes`.\n        names (`list` of `str`, *optional*):\n            String names for the integer classes.\n            The order in which the names are provided is kept.\n        names_file (`str`, *optional*):\n            Path to a file with names for the integer classes, one per line.\n\n    Example:\n\n    ```py\n    >>> from datasets import Features, ClassLabel\n    >>> features = Features({'label': ClassLabel(num_classes=3, names=['bad', 'ok', 'good'])})\n    >>> features\n    {'label': ClassLabel(names=['bad', 'ok', 'good'])}\n    ```\n    \"\"\"\n\n    num_classes: InitVar[Optional[int]] = None  # Pseudo-field: ignored by asdict/fields when converting to/from dict\n    names: list[str] = None\n    names_file: InitVar[Optional[str]] = None  # Pseudo-field: ignored by asdict/fields when converting to/from dict\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    dtype: ClassVar[str] = \"int64\"\n    pa_type: ClassVar[Any] = pa.int64()\n    _str2int: ClassVar[dict[str, int]] = None\n    _int2str: ClassVar[dict[int, int]] = None\n    _type: str = field(default=\"ClassLabel\", init=False, repr=False)\n\n    def __post_init__(self, num_classes, names_file):\n        self.num_classes = num_classes\n        self.names_file = names_file\n        if self.names_file is not None and self.names is not None:\n            raise ValueError(\"Please provide either names or names_file but not both.\")\n        # Set self.names\n        if self.names is None:\n            if self.names_file is not None:\n                
self.names = self._load_names_from_file(self.names_file)\n            elif self.num_classes is not None:\n                self.names = [str(i) for i in range(self.num_classes)]\n            else:\n                raise ValueError(\"Please provide either num_classes, names or names_file.\")\n        elif not isinstance(self.names, SequenceABC):\n            raise TypeError(f\"Please provide names as a list, is {type(self.names)}\")\n        # Set self.num_classes\n        if self.num_classes is None:\n            self.num_classes = len(self.names)\n        elif self.num_classes != len(self.names):\n            raise ValueError(\n                \"ClassLabel number of names do not match the defined num_classes. \"\n                f\"Got {len(self.names)} names VS {self.num_classes} num_classes\"\n            )\n        # Prepare mappings\n        self._int2str = [str(name) for name in self.names]\n        self._str2int = {name: i for i, name in enumerate(self._int2str)}\n        if len(self._int2str) != len(self._str2int):\n            raise ValueError(\"Some label names are duplicated. 
Each label name should be unique.\")\n\n    def __call__(self):\n        return self.pa_type\n\n    def str2int(self, values: Union[str, Iterable]) -> Union[int, Iterable]:\n        \"\"\"Conversion class name `string` => `integer`.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n        >>> ds.features[\"label\"].str2int('neg')\n        0\n        ```\n        \"\"\"\n        if not isinstance(values, str) and not isinstance(values, Iterable):\n            raise ValueError(\n                f\"Values {values} should be a string or an Iterable (list, numpy array, pytorch, tensorflow tensors)\"\n            )\n        return_list = True\n        if isinstance(values, str):\n            values = [values]\n            return_list = False\n\n        output = [self._strval2int(value) for value in values]\n        return output if return_list else output[0]\n\n    def _strval2int(self, value: str) -> int:\n        failed_parse = False\n        value = str(value)\n        # first attempt - raw string value\n        int_value = self._str2int.get(value)\n        if int_value is None:\n            # second attempt - strip whitespace\n            int_value = self._str2int.get(value.strip())\n            if int_value is None:\n                # third attempt - convert str to int\n                try:\n                    int_value = int(value)\n                except ValueError:\n                    failed_parse = True\n                else:\n                    if int_value < -1 or int_value >= self.num_classes:\n                        failed_parse = True\n        if failed_parse:\n            raise ValueError(f\"Invalid string class label {value}\")\n        return int_value\n\n    def int2str(self, values: Union[int, Iterable]) -> Union[str, Iterable]:\n        \"\"\"Conversion `integer` => class name `string`.\n\n        Regarding 
unknown/missing labels: passing negative integers raises `ValueError`.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n        >>> ds.features[\"label\"].int2str(0)\n        'neg'\n        ```\n        \"\"\"\n        if not isinstance(values, int) and not isinstance(values, Iterable):\n            raise ValueError(\n                f\"Values {values} should be an integer or an Iterable (list, numpy array, pytorch, tensorflow tensors)\"\n            )\n        return_list = True\n        if isinstance(values, int):\n            values = [values]\n            return_list = False\n\n        for v in values:\n            if not 0 <= v < self.num_classes:\n                raise ValueError(f\"Invalid integer class label {v:d}\")\n\n        output = [self._int2str[int(v)] for v in values]\n        return output if return_list else output[0]\n\n    def encode_example(self, example_data):\n        if self.num_classes is None:\n            raise ValueError(\n                \"Trying to use ClassLabel feature with undefined number of class. 
\"\n                \"Please set ClassLabel.names or num_classes.\"\n            )\n\n        # If a string is given, convert to associated integer\n        if isinstance(example_data, str):\n            example_data = self.str2int(example_data)\n\n        # Allowing -1 to mean no label.\n        if not -1 <= example_data < self.num_classes:\n            raise ValueError(f\"Class label {example_data:d} greater than configured num_classes {self.num_classes}\")\n        return example_data\n\n    def cast_storage(self, storage: Union[pa.StringArray, pa.IntegerArray]) -> pa.Int64Array:\n        \"\"\"Cast an Arrow array to the `ClassLabel` arrow storage type.\n        The Arrow types that can be converted to the `ClassLabel` pyarrow storage type are:\n\n        - `pa.string()`\n        - `pa.int()`\n\n        Args:\n            storage (`Union[pa.StringArray, pa.IntegerArray]`):\n                PyArrow array to cast.\n\n        Returns:\n            `pa.Int64Array`: Array in the `ClassLabel` arrow storage type.\n        \"\"\"\n        if isinstance(storage, pa.IntegerArray) and len(storage) > 0:\n            min_max = pc.min_max(storage).as_py()\n            if min_max[\"max\"] is not None and min_max[\"max\"] >= self.num_classes:\n                raise ValueError(\n                    f\"Class label {min_max['max']} greater than configured num_classes {self.num_classes}\"\n                )\n        elif isinstance(storage, pa.StringArray):\n            storage = pa.array(\n                [self._strval2int(label) if label is not None else None for label in storage.to_pylist()]\n            )\n        return array_cast(storage, self.pa_type)\n\n    @staticmethod\n    def _load_names_from_file(names_filepath):\n        with open(names_filepath, encoding=\"utf-8\") as f:\n            return [name.strip() for name in f.read().split(\"\\n\") if name.strip()]  # Filter empty names\n\n\n@dataclass\nclass Json:\n    \"\"\"Feature type for JSON objects.\n\n    Under the 
hood the objects are stored as JSON-encoded strings.\n\n    Example:\n\n    ```py\n    >>> from datasets import Features, Json\n    >>> features = Features({'json': Json()})\n    >>> features\n    {'json': Json()}\n    ```\n\n    ```py\n    >>> from datasets import Dataset, Features, Json, List\n    >>> features = Features({\"a\": List(Json())})\n    >>> ds = Dataset.from_dict({\"a\": [[{\"b\": 0}, {\"c\": 0}]]}, features=features)\n    >>> # OR\n    >>> ds = Dataset.from_dict({\"a\": [[{\"b\": 0}, {\"c\": 0}]]}, on_mixed_types=\"use_json\")\n    >>> ds.features\n    {'a': List(Json())}\n    >>> ds[0]\n    {'a': [{'b': 0}, {'c': 0}]}\n    >>> def f(x):\n    ...     for y in x[\"a\"]:\n    ...         y[\"d\"] = \"foo\"\n    ...     return x\n    >>> ds = ds.map(f)\n    >>> ds.features\n    >>> ds[0]\n    {'a': [{'b': 0, 'd': 'foo'}, {'c': 0, 'd': 'foo'}]}\n    ```\n    \"\"\"\n\n    decode: bool = True\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    pa_type: ClassVar[Any] = pa.json_()\n    _type: str = field(default=\"Json\", init=False, repr=False)\n\n    def __call__(self):\n        return self.pa_type\n\n    def encode_example(self, example_data):\n        if not isinstance(example_data, str):\n            example_data = ujson_dumps(example_data)\n        else:\n            try:\n                ujson_loads(example_data)\n            except Exception:\n                example_data = ujson_dumps(example_data)\n        return example_data\n\n    def decode_example(self, example_data, token_per_repo_id: Optional[dict[str, Union[str, bool, None]]] = None):\n        if not self.decode:\n            raise RuntimeError(\"Decoding is disabled for this feature. 
Please use Json(decode=True) instead.\")\n        return ujson_loads(example_data)\n\n    def cast_storage(self, storage: Union[pa.Array]) -> pa.JsonArray:\n        \"\"\"Cast an Arrow array to the `Json` arrow storage type.\n\n        Args:\n            storage (`Union[pa.StringArray, pa.IntegerArray]`):\n                PyArrow array to cast.\n\n        Returns:\n            `pa.JsonArray`: Array in the `Json` arrow storage type.\n        \"\"\"\n        if isinstance(storage, pa.JsonArray):\n            return storage\n        elif isinstance(storage, (pa.StringArray)):\n            items = storage[:5].to_pylist()\n            try:\n                for item in items:\n                    ujson_loads(item)\n            except Exception:\n                storage = pa.array([ujson_dumps(x) for x in storage.to_pylist()], pa.json_())\n        else:\n            storage = pa.array([ujson_dumps(x) for x in storage.to_pylist()], pa.json_())\n        return array_cast(storage, self.pa_type)\n\n\nclass Sequence:\n    \"\"\"\n    A `Sequence` is a utility that automatically converts internal dictionary feature into a dictionary of\n    lists. This behavior is implemented to have a compatibility layer with the TensorFlow Datasets library but may be\n    un-wanted in some cases. 
If you don't want this behavior, you can use a [`List`] or a [`LargeList`]\n    instead of the [`Sequence`].\n\n    Args:\n        feature ([`FeatureType`]):\n            Child feature data type of each item within the large list.\n        length (optional `int`, default to -1):\n            Length of the list if it is fixed.\n            Defaults to -1 which means an arbitrary length.\n\n        Returns:\n            [`List`] of the specified feature, except `dict` of sub-features\n            which are converted to `dict` of lists of sub-features for compatibility with TFDS.\n\n    \"\"\"\n\n    def __new__(cls, feature=None, length=-1, **kwargs):\n        # useful to still get isinstance(Sequence(Value(\"int64\")), Sequence)\n        if (\n            cls is Sequence\n            and isinstance(feature, dict)\n            and any(not isinstance(subfeature, List) for subfeature in feature.values())\n        ):\n            out = {key: List(value, length=length, **kwargs) for key, value in feature.items()}\n        else:\n            out = super().__new__(List)\n        return out\n\n\n@dataclass(repr=False)\nclass List(Sequence):\n    \"\"\"Feature type for large list data composed of child feature data type.\n\n    It is backed by `pyarrow.ListType`, which uses 32-bit offsets or a fixed length.\n\n    Args:\n        feature ([`FeatureType`]):\n            Child feature data type of each item within the large list.\n        length (optional `int`, default to -1):\n            Length of the list if it is fixed.\n            Defaults to -1 which means an arbitrary length.\n    \"\"\"\n\n    feature: Any\n    length: int = -1\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    pa_type: ClassVar[Any] = None\n    _type: str = field(default=\"List\", init=False, repr=False)\n\n    def __repr__(self):\n        if self.length != -1:\n            return f\"{type(self).__name__}({self.feature}, length={self.length})\"\n        
else:\n            return f\"{type(self).__name__}({self.feature})\"\n\n\n@dataclass(repr=False)\nclass LargeList:\n    \"\"\"Feature type for large list data composed of child feature data type.\n\n    It is backed by `pyarrow.LargeListType`, which is like `pyarrow.ListType` but with 64-bit rather than 32-bit offsets.\n\n    Args:\n        feature ([`FeatureType`]):\n            Child feature data type of each item within the large list.\n    \"\"\"\n\n    feature: Any\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    pa_type: ClassVar[Any] = None\n    _type: str = field(default=\"LargeList\", init=False, repr=False)\n\n    def __repr__(self):\n        return f\"{type(self).__name__}({self.feature})\"\n\n\nFeatureType = Union[\n    dict,\n    list,\n    tuple,\n    Value,\n    ClassLabel,\n    Translation,\n    TranslationVariableLanguages,\n    LargeList,\n    List,\n    Array2D,\n    Array3D,\n    Array4D,\n    Array5D,\n    Audio,\n    Image,\n    Video,\n    Pdf,\n    Nifti,\n]\n\n\ndef _check_non_null_non_empty_recursive(obj, schema: Optional[FeatureType] = None) -> bool:\n    \"\"\"\n    Check if the object is not None.\n    If the object is a list or a tuple, recursively check the first element of the sequence and stop if at any point the first element is not a sequence or is an empty sequence.\n    \"\"\"\n    if obj is None:\n        return False\n    elif isinstance(obj, (list, tuple)) and (schema is None or isinstance(schema, (list, tuple, LargeList, List))):\n        if len(obj) > 0:\n            if schema is None:\n                pass\n            elif isinstance(schema, (list, tuple)):\n                schema = schema[0]\n            else:\n                schema = schema.feature\n            return _check_non_null_non_empty_recursive(obj[0], schema)\n        else:\n            return False\n    else:\n        return True\n\n\ndef get_nested_type(schema: FeatureType) -> pa.DataType:\n    \"\"\"\n    
get_nested_type() converts a datasets.FeatureType into a pyarrow.DataType, and acts as the inverse of\n        generate_from_arrow_type().\n\n    It performs double-duty as the implementation of Features.type and handles the conversion of\n        datasets.Feature->pa.struct\n    \"\"\"\n    # Nested structures: we allow dict, list/tuples, sequences\n    if isinstance(schema, Features):\n        return pa.struct(\n            {key: get_nested_type(schema[key]) for key in schema}\n        )  # Features is subclass of dict, and dict order is deterministic since Python 3.6\n    elif isinstance(schema, dict):\n        return pa.struct(\n            {key: get_nested_type(schema[key]) for key in schema}\n        )  # however don't sort on struct types since the order matters\n    elif isinstance(schema, (list, tuple)):\n        if len(schema) != 1:\n            raise ValueError(\"When defining list feature, you should just provide one example of the inner type\")\n        value_type = get_nested_type(schema[0])\n        return pa.list_(value_type)\n    elif isinstance(schema, LargeList):\n        value_type = get_nested_type(schema.feature)\n        return pa.large_list(value_type)\n    elif isinstance(schema, List):\n        value_type = get_nested_type(schema.feature)\n        return pa.list_(value_type, schema.length)\n\n    # Other objects are callable which returns their data type (ClassLabel, Array2D, Translation, Arrow datatype creation methods)\n    return schema()\n\n\ndef encode_nested_example(schema, obj, level=0):\n    \"\"\"Encode a nested example.\n    This is used since some features (in particular ClassLabel) have some logic during encoding.\n\n    To avoid iterating over possibly long lists, it first checks (recursively) if the first element that is not None or empty (if it is a sequence) has to be encoded.\n    If the first element needs to be encoded, then all the elements of the list will be encoded, otherwise they'll stay the same.\n    \"\"\"\n    # 
Nested structures: we allow dict, list/tuples, sequences\n    if isinstance(schema, dict):\n        if level == 0 and obj is None:\n            raise ValueError(\"Got None but expected a dictionary instead\")\n        return (\n            {k: encode_nested_example(schema[k], obj.get(k), level=level + 1) for k in schema}\n            if obj is not None\n            else None\n        )\n    elif isinstance(schema, (LargeList, List)):\n        if obj is None:\n            return None\n        else:\n            if len(obj) > 0:\n                sub_schema = schema.feature\n                for first_elmt in obj:\n                    if _check_non_null_non_empty_recursive(first_elmt, sub_schema):\n                        break\n                try:\n                    changed = bool(encode_nested_example(sub_schema, first_elmt, level=level + 1) != first_elmt)\n                except ValueError:  # can happen when comparing arrays\n                    changed = False\n                if changed:\n                    return [encode_nested_example(sub_schema, o, level=level + 1) for o in obj]\n            return list(obj)\n    # Object with special encoding:\n    # ClassLabel will convert from string to int, TranslationVariableLanguages does some checks\n    elif hasattr(schema, \"encode_example\"):\n        return schema.encode_example(obj) if obj is not None else None\n    # Other object should be directly convertible to a native Arrow type (like Translation and Translation)\n    return obj\n\n\ndef decode_nested_example(schema, obj, token_per_repo_id: Optional[dict[str, Union[str, bool, None]]] = None):\n    \"\"\"Decode a nested example.\n    This is used since some features (in particular Audio and Image) have some logic during decoding.\n\n    To avoid iterating over possibly long lists, it first checks (recursively) if the first element that is not None or empty (if it is a sequence) has to be decoded.\n    If the first element needs to be decoded, then all the 
elements of the list will be decoded, otherwise they'll stay the same.\n    \"\"\"\n    # Nested structures: we allow dict, list/tuples, sequences\n    if isinstance(schema, dict):\n        return (\n            {k: decode_nested_example(sub_schema, sub_obj) for k, (sub_schema, sub_obj) in zip_dict(schema, obj)}\n            if obj is not None\n            else None\n        )\n    elif isinstance(schema, (list, tuple)):\n        sub_schema = schema[0]\n        if obj is None:\n            return None\n        else:\n            if len(obj) > 0:\n                for first_elmt in obj:\n                    if _check_non_null_non_empty_recursive(first_elmt, sub_schema):\n                        break\n                if decode_nested_example(sub_schema, first_elmt) != first_elmt:\n                    return [decode_nested_example(sub_schema, o) for o in obj]\n            return list(obj)\n    elif isinstance(schema, (LargeList, List)):\n        if obj is None:\n            return None\n        else:\n            sub_schema = schema.feature\n            if isinstance(sub_schema, dict):\n                return [decode_nested_example(sub_schema, o) for o in obj]\n            if len(obj) > 0:\n                for first_elmt in obj:\n                    if _check_non_null_non_empty_recursive(first_elmt, sub_schema):\n                        break\n                if decode_nested_example(sub_schema, first_elmt) != first_elmt:\n                    return [decode_nested_example(sub_schema, o) for o in obj]\n            return list(obj)\n    # Object with special decoding:\n    elif hasattr(schema, \"decode_example\") and getattr(schema, \"decode\", True):\n        # we pass the token to read and decode files from private repositories in streaming mode\n        return schema.decode_example(obj, token_per_repo_id=token_per_repo_id) if obj is not None else None\n    return obj\n\n\n_FEATURE_TYPES: dict[str, FeatureType] = {\n    Value.__name__: Value,\n    ClassLabel.__name__: 
ClassLabel,\n    Translation.__name__: Translation,\n    TranslationVariableLanguages.__name__: TranslationVariableLanguages,\n    LargeList.__name__: LargeList,\n    List.__name__: List,\n    Array2D.__name__: Array2D,\n    Array3D.__name__: Array3D,\n    Array4D.__name__: Array4D,\n    Array5D.__name__: Array5D,\n    Audio.__name__: Audio,\n    Image.__name__: Image,\n    Video.__name__: Video,\n    Pdf.__name__: Pdf,\n    Nifti.__name__: Nifti,\n    Json.__name__: Json,\n}\n\n\n@experimental\ndef register_feature(\n    feature_cls: type,\n    feature_type: str,\n):\n    \"\"\"\n    Register a Feature object using a name and class.\n    This function must be used on a Feature class.\n    \"\"\"\n    if feature_type in _FEATURE_TYPES:\n        logger.warning(\n            f\"Overwriting feature type '{feature_type}' ({_FEATURE_TYPES[feature_type].__name__} -> {feature_cls.__name__})\"\n        )\n    _FEATURE_TYPES[feature_type] = feature_cls\n\n\ndef generate_from_dict(obj: Any):\n    \"\"\"Regenerate the nested feature object from a deserialized dict.\n    We use the '_type' fields to get the dataclass name to load.\n\n    generate_from_dict is the recursive helper for Features.from_dict, and allows for a convenient constructor syntax\n    to define features from deserialized JSON dictionaries. This function is used in particular when deserializing\n    a :class:`DatasetInfo` that was dumped to a JSON object. 
This acts as an analogue to\n    :meth:`Features.from_arrow_schema` and handles the recursive field-by-field instantiation, but doesn't require any\n    mapping to/from pyarrow, except for the fact that it takes advantage of the mapping of pyarrow primitive dtypes\n    that :class:`Value` automatically performs.\n    \"\"\"\n    # Nested structures: we allow dict, list/tuples, sequences\n    if isinstance(obj, list):\n        return [generate_from_dict(value) for value in obj]\n    # Otherwise we have a dict or a dataclass\n    if \"_type\" not in obj or isinstance(obj[\"_type\"], dict):\n        return {key: generate_from_dict(value) for key, value in obj.items()}\n    obj = dict(obj)\n    _type = obj.pop(\"_type\")\n    class_type = _FEATURE_TYPES.get(_type, None) or globals().get(_type, None)\n\n    if class_type is None:\n        raise ValueError(f\"Feature type '{_type}' not found. Available feature types: {list(_FEATURE_TYPES.keys())}\")\n\n    if class_type == LargeList:\n        feature = obj.pop(\"feature\")\n        return LargeList(generate_from_dict(feature), **obj)\n    if class_type == List:\n        feature = obj.pop(\"feature\")\n        return List(generate_from_dict(feature), **obj)\n    if class_type == Sequence:  # backward compatibility, this translates to a List or a dict\n        feature = obj.pop(\"feature\")\n        return Sequence(feature=generate_from_dict(feature), **obj)\n\n    field_names = {f.name for f in fields(class_type)}\n    return class_type(**{k: v for k, v in obj.items() if k in field_names})\n\n\ndef generate_from_arrow_type(pa_type: pa.DataType) -> FeatureType:\n    \"\"\"\n    generate_from_arrow_type accepts an arrow DataType and returns a datasets FeatureType to be used as the type for\n        a single field.\n\n    This is the high-level arrow->datasets type conversion and is inverted by get_nested_type().\n\n    This operates at the individual *field* level, whereas Features.from_arrow_schema() operates at the\n      
  full schema level and holds the methods that represent the bijection from Features<->pyarrow.Schema\n    \"\"\"\n    if isinstance(pa_type, pa.StructType):\n        return {field.name: generate_from_arrow_type(field.type) for field in pa_type}\n    elif isinstance(pa_type, pa.FixedSizeListType):\n        return List(generate_from_arrow_type(pa_type.value_type), length=pa_type.list_size)\n    elif isinstance(pa_type, pa.ListType):\n        return List(generate_from_arrow_type(pa_type.value_type))\n    elif isinstance(pa_type, pa.LargeListType):\n        return LargeList(generate_from_arrow_type(pa_type.value_type))\n    elif isinstance(pa_type, _ArrayXDExtensionType):\n        array_feature = [None, None, Array2D, Array3D, Array4D, Array5D][pa_type.ndims]\n        return array_feature(shape=pa_type.shape, dtype=pa_type.value_type)\n    elif isinstance(pa_type, pa.JsonType):\n        return Json()\n    elif isinstance(pa_type, pa.DataType):\n        return Value(dtype=_arrow_to_datasets_dtype(pa_type))\n    else:\n        raise ValueError(f\"Cannot convert {pa_type} to a Feature type.\")\n\n\ndef numpy_to_pyarrow_listarray(arr: np.ndarray, type: pa.DataType = None) -> pa.ListArray:\n    \"\"\"Build a PyArrow ListArray from a multidimensional NumPy array\"\"\"\n    arr = np.array(arr)\n    values = pa.array(arr.flatten(), type=type)\n    for i in range(arr.ndim - 1):\n        n_offsets = reduce(mul, arr.shape[: arr.ndim - i - 1], 1)\n        step_offsets = arr.shape[arr.ndim - i - 1]\n        offsets = pa.array(np.arange(n_offsets + 1) * step_offsets, type=pa.int32())\n        values = pa.ListArray.from_arrays(offsets, values)\n    return values\n\n\ndef list_of_pa_arrays_to_pyarrow_listarray(l_arr: list[Optional[pa.Array]]) -> pa.ListArray:\n    null_mask = np.array([arr is None for arr in l_arr])\n    null_indices = np.arange(len(null_mask))[null_mask] - np.arange(np.sum(null_mask))\n    l_arr = [arr for arr in l_arr if arr is not None]\n    offsets = np.cumsum(\n 
       [0] + [len(arr) for arr in l_arr], dtype=object\n    )  # convert to dtype object to allow None insertion\n    offsets = np.insert(offsets, null_indices, None)\n    offsets = pa.array(offsets, type=pa.int32())\n    values = pa.concat_arrays(l_arr)\n    return pa.ListArray.from_arrays(offsets, values)\n\n\ndef list_of_np_array_to_pyarrow_listarray(l_arr: list[np.ndarray], type: pa.DataType = None) -> pa.ListArray:\n    \"\"\"Build a PyArrow ListArray from a possibly nested list of NumPy arrays\"\"\"\n    if len(l_arr) > 0:\n        return list_of_pa_arrays_to_pyarrow_listarray(\n            [numpy_to_pyarrow_listarray(arr, type=type) if arr is not None else None for arr in l_arr]\n        )\n    else:\n        return pa.array([], type=type)\n\n\ndef contains_any_np_array(data: Any):\n    \"\"\"Return `True` if data is a NumPy ndarray or (recursively) if first non-null value in list is a NumPy ndarray.\n\n    Args:\n        data (Any): Data.\n\n    Returns:\n        bool\n    \"\"\"\n    if isinstance(data, np.ndarray):\n        return True\n    elif isinstance(data, list):\n        return contains_any_np_array(first_non_null_value(data)[1])\n    else:\n        return False\n\n\ndef any_np_array_to_pyarrow_listarray(data: Union[np.ndarray, list], type: pa.DataType = None) -> pa.ListArray:\n    \"\"\"Convert to PyArrow ListArray either a NumPy ndarray or (recursively) a list that may contain any NumPy ndarray.\n\n    Args:\n        data (Union[np.ndarray, List]): Data.\n        type (pa.DataType): Explicit PyArrow DataType passed to coerce the ListArray data type.\n\n    Returns:\n        pa.ListArray\n    \"\"\"\n    if isinstance(data, np.ndarray):\n        return numpy_to_pyarrow_listarray(data, type=type)\n    elif isinstance(data, list):\n        return list_of_pa_arrays_to_pyarrow_listarray([any_np_array_to_pyarrow_listarray(i, type=type) for i in data])\n\n\ndef to_pyarrow_listarray(data: Any, pa_type: _ArrayXDExtensionType) -> pa.Array:\n    
\"\"\"Convert to PyArrow ListArray.\n\n    Args:\n        data (Any): List, iterable, np.ndarray or pd.Series.\n        pa_type (_ArrayXDExtensionType): Any of the ArrayNDExtensionType.\n\n    Returns:\n        pyarrow.Array\n    \"\"\"\n    if contains_any_np_array(data):\n        return any_np_array_to_pyarrow_listarray(data, type=pa_type.value_type)\n    else:\n        return pa.array(data, pa_type.storage_dtype)\n\n\ndef _visit(feature: FeatureType, func: Callable[[FeatureType], Optional[FeatureType]]) -> FeatureType:\n    \"\"\"Visit a (possibly nested) feature.\n\n    Args:\n        feature (FeatureType): the feature type to be checked\n    Returns:\n        visited feature (FeatureType)\n    \"\"\"\n    if isinstance(feature, Features):\n        out = func(Features({k: _visit(f, func) for k, f in feature.items()}))\n    elif isinstance(feature, dict):\n        out = func({k: _visit(f, func) for k, f in feature.items()})\n    elif isinstance(feature, LargeList):\n        out = func(LargeList(_visit(feature.feature, func)))\n    elif isinstance(feature, List):\n        out = func(List(_visit(feature.feature, func), length=feature.length))\n    else:\n        out = func(feature)\n    return feature if out is None else out\n\n\n_VisitPath = list[Union[str, Literal[0]]]\n\n\ndef _visit_with_path(\n    feature: FeatureType, func: Callable[[FeatureType, _VisitPath], Optional[FeatureType]], visit_path: _VisitPath = []\n) -> FeatureType:\n    \"\"\"Visit a (possibly nested) feature with its path in the Feature object.\n\n    A path in a nested feature object is the list of keys that need to be\n    sequentially accessed to get to the sub-feature.\n\n    For example:\n    - [\"foo\"] corresponds to the column \"foo\"\n    - [\"foo\", 0] corresponds to the sub-feature of the lists in \"foo\"\n    - [\"foo\", \"bar\"] corresponds to the sub-feature of the dicts in \"foo\" with key \"bar\"\n\n    Args:\n        feature (`FeatureType`): the feature type to be checked.\n\n 
   Returns:\n        `FeatureType`: the visited feature.\n    \"\"\"\n    if isinstance(feature, Features):\n        out = func(Features({k: _visit_with_path(f, func, visit_path + [k]) for k, f in feature.items()}), visit_path)\n    elif isinstance(feature, dict):\n        out = func({k: _visit_with_path(f, func, visit_path + [k]) for k, f in feature.items()}, visit_path)\n    elif isinstance(feature, List):\n        out = func(List(_visit_with_path(feature.feature, func, visit_path + [0]), length=feature.length), visit_path)\n    elif isinstance(feature, LargeList):\n        out = func(LargeList(_visit_with_path(feature.feature, func, visit_path + [0])), visit_path)\n    else:\n        out = func(feature, visit_path)\n    return feature if out is None else out\n\n\ndef require_decoding(feature: FeatureType, ignore_decode_attribute: bool = False) -> bool:\n    \"\"\"Check if a (possibly nested) feature requires decoding.\n\n    Args:\n        feature (FeatureType): the feature type to be checked\n        ignore_decode_attribute (:obj:`bool`, default ``False``): Whether to ignore the current value\n            of the `decode` attribute of the decodable feature types.\n    Returns:\n        :obj:`bool`\n    \"\"\"\n    if isinstance(feature, dict):\n        return any(require_decoding(f) for f in feature.values())\n    elif isinstance(feature, (list, tuple)):\n        return require_decoding(feature[0])\n    elif isinstance(feature, LargeList):\n        return require_decoding(feature.feature)\n    elif isinstance(feature, List):\n        return require_decoding(feature.feature)\n    else:\n        return hasattr(feature, \"decode_example\") and (\n            getattr(feature, \"decode\", True) if not ignore_decode_attribute else True\n        )\n\n\ndef require_storage_cast(feature: FeatureType) -> bool:\n    \"\"\"Check if a (possibly nested) feature requires storage casting.\n\n    Args:\n        feature (FeatureType): the feature type to be checked\n    
Returns:\n        :obj:`bool`\n    \"\"\"\n    if isinstance(feature, dict):\n        return any(require_storage_cast(f) for f in feature.values())\n    elif isinstance(feature, LargeList):\n        return require_storage_cast(feature.feature)\n    elif isinstance(feature, List):\n        return require_storage_cast(feature.feature)\n    else:\n        return hasattr(feature, \"cast_storage\")\n\n\ndef require_storage_embed(feature: FeatureType) -> bool:\n    \"\"\"Check if a (possibly nested) feature requires embedding data into storage.\n\n    Args:\n        feature (FeatureType): the feature type to be checked\n    Returns:\n        :obj:`bool`\n    \"\"\"\n    if isinstance(feature, dict):\n        return any(require_storage_cast(f) for f in feature.values())\n    elif isinstance(feature, LargeList):\n        return require_storage_cast(feature.feature)\n    elif isinstance(feature, List):\n        return require_storage_cast(feature.feature)\n    else:\n        return hasattr(feature, \"embed_storage\")\n\n\ndef keep_features_dicts_synced(func):\n    \"\"\"\n    Wrapper to keep the secondary dictionary, which tracks whether keys are decodable, of the :class:`datasets.Features` object\n    in sync with the main dictionary.\n    \"\"\"\n\n    @wraps(func)\n    def wrapper(*args, **kwargs):\n        if args:\n            self: \"Features\" = args[0]\n            args = args[1:]\n        else:\n            self: \"Features\" = kwargs.pop(\"self\")\n        out = func(self, *args, **kwargs)\n        assert hasattr(self, \"_column_requires_decoding\")\n        self._column_requires_decoding = {col: require_decoding(feature) for col, feature in self.items()}\n        return out\n\n    wrapper._decorator_name_ = \"_keep_dicts_synced\"\n    return wrapper\n\n\nclass Features(dict):\n    \"\"\"A special dictionary that defines the internal structure of a dataset.\n\n    Instantiated with a dictionary of type `dict[str, FieldType]`, where keys are the desired column 
names,\n    and values are the type of that column.\n\n    `FieldType` can be one of the following:\n        - [`Value`] feature specifies a single data type value, e.g. `int64` or `string`.\n        - [`ClassLabel`] feature specifies a predefined set of classes which can have labels associated to them and\n          will be stored as integers in the dataset.\n        - Python `dict` specifies a composite feature containing a mapping of sub-fields to sub-features.\n          It's possible to have nested fields of nested fields in an arbitrary manner.\n        - [`List`] or [`LargeList`] specifies a composite feature containing a sequence of\n          sub-features, all of the same feature type.\n        - [`Array2D`], [`Array3D`], [`Array4D`] or [`Array5D`] feature for multidimensional arrays.\n        - [`Audio`] feature to store the absolute path to an audio file or a dictionary with the relative path\n          to an audio file (\"path\" key) and its bytes content (\"bytes\" key).\n          This feature loads the audio lazily with a decoder.\n        - [`Image`] feature to store the absolute path to an image file, an `np.ndarray` object, a `PIL.Image.Image` object\n          or a dictionary with the relative path to an image file (\"path\" key) and its bytes content (\"bytes\" key).\n          This feature extracts the image data.\n        - [`Video`] feature to store the absolute path to a video file, a `torchcodec.decoders.VideoDecoder` object\n          or a dictionary with the relative path to a video file (\"path\" key) and its bytes content (\"bytes\" key).\n          This feature loads the video lazily with a decoder.\n        - [`Pdf`] feature to store the absolute path to a PDF file, a `pdfplumber.pdf.PDF` object\n          or a dictionary with the relative path to a PDF file (\"path\" key) and its bytes content (\"bytes\" key).\n          This feature loads the PDF lazily with a PDF reader.\n        - [`Nifti`] feature to store the absolute path to a 
NIfTI neuroimaging file, a `nibabel.Nifti1Image` object\n          or a dictionary with the relative path to a NIfTI file (\"path\" key) and its bytes content (\"bytes\" key).\n          This feature loads the NIfTI file lazily with nibabel.\n        - [`Translation`] or [`TranslationVariableLanguages`] feature specific to Machine Translation.\n        - [`Json`] feature to store unstructured data, e.g. containing mixed/arbitrary types. Under the hood\n    \"\"\"\n\n    def __init__(*args, **kwargs):\n        # self not in the signature to allow passing self as a kwarg\n        if not args:\n            raise TypeError(\"descriptor '__init__' of 'Features' object needs an argument\")\n        self, *args = args\n        super(Features, self).__init__(*args, **kwargs)\n        # keep track of columns which require decoding\n        self._column_requires_decoding: dict[str, bool] = {\n            col: require_decoding(feature) for col, feature in self.items()\n        }\n\n        # backward compatibility with datasets<4 : [feature] -> List(feature)\n        def _check_old_list(feature):\n            if isinstance(feature, list):\n                return List(_visit(feature[0], _check_old_list))\n            return feature\n\n        for column_name, feature in self.items():\n            self[column_name] = _visit(feature, _check_old_list)\n\n    __setitem__ = keep_features_dicts_synced(dict.__setitem__)\n    __delitem__ = keep_features_dicts_synced(dict.__delitem__)\n    update = keep_features_dicts_synced(dict.update)\n    setdefault = keep_features_dicts_synced(dict.setdefault)\n    pop = keep_features_dicts_synced(dict.pop)\n    popitem = keep_features_dicts_synced(dict.popitem)\n    clear = keep_features_dicts_synced(dict.clear)\n\n    def __reduce__(self):\n        return Features, (dict(self),)\n\n    @property\n    def type(self):\n        \"\"\"\n        Features field types.\n\n        Returns:\n            :obj:`pyarrow.DataType`\n        \"\"\"\n        
return get_nested_type(self)\n\n    @property\n    def arrow_schema(self):\n        \"\"\"\n        Features schema.\n\n        Returns:\n            :obj:`pyarrow.Schema`\n        \"\"\"\n        hf_metadata = {\"info\": {\"features\": self.to_dict()}}\n        return pa.schema(self.type).with_metadata({\"huggingface\": json.dumps(hf_metadata)})\n\n    @classmethod\n    def from_arrow_schema(cls, pa_schema: pa.Schema) -> \"Features\":\n        \"\"\"\n        Construct [`Features`] from Arrow Schema.\n        It also checks the schema metadata for Hugging Face Datasets features.\n        Non-nullable fields are not supported and set to nullable.\n\n        Also, pa.dictionary is not supported and it uses its underlying type instead.\n        Therefore datasets convert DictionaryArray objects to their actual values.\n\n        Args:\n            pa_schema (`pyarrow.Schema`):\n                Arrow Schema.\n\n        Returns:\n            [`Features`]\n        \"\"\"\n        # try to load features from the arrow schema metadata\n        metadata_features = Features()\n        if pa_schema.metadata is not None and b\"huggingface\" in pa_schema.metadata:\n            metadata = json.loads(pa_schema.metadata[b\"huggingface\"].decode())\n            if \"info\" in metadata and \"features\" in metadata[\"info\"] and metadata[\"info\"][\"features\"] is not None:\n                metadata_features = Features.from_dict(metadata[\"info\"][\"features\"])\n        metadata_features_schema = metadata_features.arrow_schema\n        obj = {\n            field.name: (\n                metadata_features[field.name]\n                if field.name in metadata_features and metadata_features_schema.field(field.name) == field\n                else generate_from_arrow_type(field.type)\n            )\n            for field in pa_schema\n        }\n        return cls(**obj)\n\n    @classmethod\n    def from_dict(cls, dic) -> \"Features\":\n        \"\"\"\n        Construct [`Features`] 
from dict.\n\n        Regenerate the nested feature object from a deserialized dict.\n        We use the `_type` key to infer the dataclass name of the feature `FieldType`.\n\n        It allows for a convenient constructor syntax\n        to define features from deserialized JSON dictionaries. This function is used in particular when deserializing\n        a [`DatasetInfo`] that was dumped to a JSON object. This acts as an analogue to\n        [`Features.from_arrow_schema`] and handles the recursive field-by-field instantiation, but doesn't require\n        any mapping to/from pyarrow, except for the fact that it takes advantage of the mapping of pyarrow primitive\n        dtypes that [`Value`] automatically performs.\n\n        Args:\n            dic (`dict[str, Any]`):\n                Python dictionary.\n\n        Returns:\n            `Features`\n\n        Example::\n            >>> Features.from_dict({'_type': {'dtype': 'string', 'id': None, '_type': 'Value'}})\n            {'_type': Value('string')}\n        \"\"\"\n        obj = generate_from_dict(dic)\n        return cls(**obj)\n\n    def to_dict(self):\n        return asdict(self)\n\n    def _to_yaml_list(self) -> list:\n        # we compute the YAML list from the dict representation that is used for JSON dump\n        yaml_data = self.to_dict()\n\n        def simplify(feature: dict) -> dict:\n            if not isinstance(feature, dict):\n                raise TypeError(f\"Expected a dict but got a {type(feature)}: {feature}\")\n\n            for list_type in [\"large_list\", \"list\", \"sequence\"]:\n                #\n                # list_type:                ->              list_type: int32\n                #   dtype: int32            ->\n                #\n                if isinstance(feature.get(list_type), dict) and list(feature[list_type]) == [\"dtype\"]:\n                    feature[list_type] = feature[list_type][\"dtype\"]\n\n                #\n                # list_type:                ->   
           list_type:\n                #   struct:                 ->              - name: foo\n                #   - name: foo             ->                dtype: int32\n                #     dtype: int32          ->\n                #\n                if isinstance(feature.get(list_type), dict) and list(feature[list_type]) == [\"struct\"]:\n                    feature[list_type] = feature[list_type][\"struct\"]\n\n            #\n            # class_label:              ->              class_label:\n            #   names:                  ->                names:\n            #   - negative              ->                  '0': negative\n            #   - positive              ->                  '1': positive\n            #\n            if isinstance(feature.get(\"class_label\"), dict) and isinstance(feature[\"class_label\"].get(\"names\"), list):\n                # server-side requirement: keys must be strings\n                feature[\"class_label\"][\"names\"] = {\n                    str(label_id): label_name for label_id, label_name in enumerate(feature[\"class_label\"][\"names\"])\n                }\n            return feature\n\n        def to_yaml_inner(obj: Union[dict, list]) -> dict:\n            if isinstance(obj, dict):\n                _type = obj.pop(\"_type\", None)\n                if _type == \"LargeList\":\n                    _feature = obj.pop(\"feature\")\n                    return simplify({\"large_list\": to_yaml_inner(_feature), **obj})\n                elif _type == \"List\":\n                    _feature = obj.pop(\"feature\")\n                    return simplify({\"list\": to_yaml_inner(_feature), **obj})\n                elif _type == \"Value\":\n                    return obj\n                elif _type and not obj:\n                    return {\"dtype\": camelcase_to_snakecase(_type)}\n                elif _type:\n                    return {\"dtype\": simplify({camelcase_to_snakecase(_type): obj})}\n                else:\n          
          return {\"struct\": [{\"name\": name, **to_yaml_inner(_feature)} for name, _feature in obj.items()]}\n            elif isinstance(obj, list):\n                return simplify({\"list\": simplify(to_yaml_inner(obj[0]))})\n            elif isinstance(obj, tuple):\n                return to_yaml_inner(list(obj))\n            else:\n                raise TypeError(f\"Expected a dict or a list but got {type(obj)}: {obj}\")\n\n        def to_yaml_types(obj: dict) -> dict:\n            if isinstance(obj, dict):\n                return {k: to_yaml_types(v) for k, v in obj.items()}\n            elif isinstance(obj, list):\n                return [to_yaml_types(v) for v in obj]\n            elif isinstance(obj, tuple):\n                return to_yaml_types(list(obj))\n            else:\n                return obj\n\n        return to_yaml_types(to_yaml_inner(yaml_data)[\"struct\"])\n\n    @classmethod\n    def _from_yaml_list(cls, yaml_data: list) -> \"Features\":\n        yaml_data = copy.deepcopy(yaml_data)\n\n        # we convert the list obtained from YAML data into the dict representation that is used for JSON dump\n\n        def unsimplify(feature: dict) -> dict:\n            if not isinstance(feature, dict):\n                raise TypeError(f\"Expected a dict but got a {type(feature)}: {feature}\")\n\n            for list_type in [\"large_list\", \"list\", \"sequence\"]:\n                #\n                # list_type: int32          ->              list_type:\n                #                           ->                dtype: int32\n                #\n                if isinstance(feature.get(list_type), str):\n                    feature[list_type] = {\"dtype\": feature[list_type]}\n\n            #\n            # class_label:              ->              class_label:\n            #   names:                  ->                names:\n            #     '0': negative              ->               - negative\n            #     '1': positive              ->   
            - positive\n            #\n            if isinstance(feature.get(\"class_label\"), dict) and isinstance(feature[\"class_label\"].get(\"names\"), dict):\n                label_ids = sorted(feature[\"class_label\"][\"names\"], key=int)\n                if label_ids and [int(label_id) for label_id in label_ids] != list(range(int(label_ids[-1]) + 1)):\n                    raise ValueError(\n                        f\"ClassLabel expected a value for all label ids [0:{int(label_ids[-1]) + 1}] but some ids are missing.\"\n                    )\n                feature[\"class_label\"][\"names\"] = [feature[\"class_label\"][\"names\"][label_id] for label_id in label_ids]\n            return feature\n\n        def from_yaml_inner(obj: Union[dict, list]) -> Union[dict, list]:\n            if isinstance(obj, dict):\n                if not obj:\n                    return {}\n                _type = next(iter(obj))\n                if _type == \"large_list\":\n                    _feature = from_yaml_inner(unsimplify(obj).pop(_type))\n                    return {\"feature\": _feature, **obj, \"_type\": \"LargeList\"}\n                if _type == \"sequence\":  # backward compatibility\n                    if isinstance(obj[_type], list):\n                        _feature = from_yaml_inner(unsimplify(obj).pop(_type))\n                        return {\n                            name: {\"feature\": _subfeature, **obj, \"_type\": \"List\"}\n                            for name, _subfeature in _feature.items()\n                        }\n                    else:\n                        _feature = from_yaml_inner(unsimplify(obj).pop(_type))\n                        return {\"feature\": _feature, **obj, \"_type\": \"List\"}\n                if _type == \"list\":\n                    _feature = from_yaml_inner(unsimplify(obj).pop(_type))\n                    return {\"feature\": _feature, **obj, \"_type\": \"List\"}\n                if _type == \"struct\":\n            
        return from_yaml_inner(obj[\"struct\"])\n                elif _type == \"dtype\":\n                    if isinstance(obj[\"dtype\"], str):\n                        # e.g. int32, float64, string, audio, image\n                        try:\n                            Value(obj[\"dtype\"])\n                            return {**obj, \"_type\": \"Value\"}\n                        except ValueError:\n                            # e.g. Audio, Image, ArrayXD\n                            return {\"_type\": snakecase_to_camelcase(obj[\"dtype\"])}\n                    else:\n                        return from_yaml_inner(obj[\"dtype\"])\n                else:\n                    return {\"_type\": snakecase_to_camelcase(_type), **unsimplify(obj)[_type]}\n            elif isinstance(obj, list):\n                names = [_feature.pop(\"name\") for _feature in obj]\n                return {name: from_yaml_inner(_feature) for name, _feature in zip(names, obj)}\n            else:\n                raise TypeError(f\"Expected a dict or a list but got {type(obj)}: {obj}\")\n\n        return cls.from_dict(from_yaml_inner(yaml_data))\n\n    def encode_example(self, example):\n        \"\"\"\n        Encode example into a format for Arrow.\n\n        Args:\n            example (`dict[str, Any]`):\n                Data in a Dataset row.\n\n        Returns:\n            `dict[str, Any]`\n        \"\"\"\n        example = cast_to_python_objects(example)\n        return encode_nested_example(self, example)\n\n    def encode_column(self, column, column_name: str):\n        \"\"\"\n        Encode column into a format for Arrow.\n\n        Args:\n            column (`list[Any]`):\n                Data in a Dataset column.\n            column_name (`str`):\n                Dataset column name.\n\n        Returns:\n            `list[Any]`\n        \"\"\"\n        column = cast_to_python_objects(column)\n        return [encode_nested_example(self[column_name], obj, level=1) for obj in 
column]\n\n    def encode_batch(self, batch):\n        \"\"\"\n        Encode batch into a format for Arrow.\n\n        Args:\n            batch (`dict[str, list[Any]]`):\n                Data in a Dataset batch.\n\n        Returns:\n            `dict[str, list[Any]]`\n        \"\"\"\n        encoded_batch = {}\n        if set(batch) != set(self):\n            raise ValueError(f\"Column mismatch between batch {set(batch)} and features {set(self)}\")\n        for key, column in batch.items():\n            column = cast_to_python_objects(column)\n            encoded_batch[key] = [encode_nested_example(self[key], obj, level=1) for obj in column]\n        return encoded_batch\n\n    def decode_example(self, example: dict, token_per_repo_id: Optional[dict[str, Union[str, bool, None]]] = None):\n        \"\"\"Decode example with custom feature decoding.\n\n        Args:\n            example (`dict[str, Any]`):\n                Dataset row data.\n            token_per_repo_id (`dict`, *optional*):\n                To access and decode audio or image files from private repositories on the Hub, you can pass\n                a dictionary `repo_id (str) -> token (bool or str)`.\n\n        Returns:\n            `dict[str, Any]`\n        \"\"\"\n\n        return {\n            column_name: decode_nested_example(feature, value, token_per_repo_id=token_per_repo_id)\n            if self._column_requires_decoding[column_name]\n            else value\n            for column_name, (feature, value) in zip_dict(\n                {key: value for key, value in self.items() if key in example}, example\n            )\n        }\n\n    def decode_column(\n        self, column: list, column_name: str, token_per_repo_id: Optional[dict[str, Union[str, bool, None]]] = None\n    ):\n        \"\"\"Decode column with custom feature decoding.\n\n        Args:\n            column (`list[Any]`):\n                Dataset column data.\n            column_name (`str`):\n                Dataset column 
name.\n\n        Returns:\n            `list[Any]`\n        \"\"\"\n        return (\n            [\n                decode_nested_example(self[column_name], value, token_per_repo_id=token_per_repo_id)\n                if value is not None\n                else None\n                for value in column\n            ]\n            if self._column_requires_decoding[column_name]\n            else column\n        )\n\n    def decode_batch(self, batch: dict, token_per_repo_id: Optional[dict[str, Union[str, bool, None]]] = None):\n        \"\"\"Decode batch with custom feature decoding.\n\n        Args:\n            batch (`dict[str, list[Any]]`):\n                Dataset batch data.\n            token_per_repo_id (`dict`, *optional*):\n                To access and decode audio or image files from private repositories on the Hub, you can pass\n                a dictionary repo_id (str) -> token (bool or str)\n\n        Returns:\n            `dict[str, list[Any]]`\n        \"\"\"\n        decoded_batch = {}\n        for column_name, column in batch.items():\n            decoded_batch[column_name] = (\n                [\n                    decode_nested_example(self[column_name], value, token_per_repo_id=token_per_repo_id)\n                    if value is not None\n                    else None\n                    for value in column\n                ]\n                if self._column_requires_decoding[column_name]\n                else column\n            )\n        return decoded_batch\n\n    def copy(self) -> \"Features\":\n        \"\"\"\n        Make a deep copy of [`Features`].\n\n        Returns:\n            [`Features`]\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\")\n        >>> copy_of_features = ds.features.copy()\n        >>> copy_of_features\n        {'label': ClassLabel(names=['neg', 'pos']),\n         'text': 
Value('string')}\n        ```\n        \"\"\"\n        return copy.deepcopy(self)\n\n    def reorder_fields_as(self, other: \"Features\") -> \"Features\":\n        \"\"\"\n        Reorder Features fields to match the field order of other [`Features`].\n\n        The order of the fields is important since it matters for the underlying arrow data.\n        Re-ordering the fields allows the underlying arrow data types to match.\n\n        Args:\n            other ([`Features`]):\n                The other [`Features`] to align with.\n\n        Returns:\n            [`Features`]\n\n        Example::\n\n            >>> from datasets import Features, List, Value\n            >>> # let's say we have two features with a different order of nested fields (for a and b for example)\n            >>> f1 = Features({\"root\": {\"a\": Value(\"string\"), \"b\": Value(\"string\")}})\n            >>> f2 = Features({\"root\": {\"b\": Value(\"string\"), \"a\": Value(\"string\")}})\n            >>> assert f1.type != f2.type\n            >>> # re-ordering keeps the base structure (here a dict is defined at the root level), but makes the fields order match\n            >>> f1.reorder_fields_as(f2)\n            {'root': {'b': Value('string'), 'a': Value('string')}}\n            >>> assert f1.reorder_fields_as(f2).type == f2.type\n        \"\"\"\n\n        def recursive_reorder(source, target, stack=\"\"):\n            stack_position = \" at \" + stack[1:] if stack else \"\"\n            if isinstance(source, dict):\n                if not isinstance(target, dict):\n                    raise ValueError(f\"Type mismatch: between {source} and {target}\" + stack_position)\n                if sorted(source) != sorted(target):\n                    message = (\n                        f\"Keys mismatch: between {source} (source) and {target} (target).\\n\"\n                        f\"{source.keys() - target.keys()} are missing from target \"\n                        f\"and {target.keys() - 
source.keys()} are missing from source\" + stack_position\n                    )\n                    raise ValueError(message)\n                return {key: recursive_reorder(source[key], target[key], stack + f\".{key}\") for key in target}\n            elif isinstance(source, List):\n                if not isinstance(target, List):\n                    raise ValueError(f\"Type mismatch: between {source} and {target}\" + stack_position)\n                return List(recursive_reorder(source.feature, target.feature, stack + \".<list>\"), length=source.length)\n            elif isinstance(source, LargeList):\n                if not isinstance(target, LargeList):\n                    raise ValueError(f\"Type mismatch: between {source} and {target}\" + stack_position)\n                return LargeList(recursive_reorder(source.feature, target.feature, stack + \".<list>\"))\n            else:\n                return source\n\n        return Features(recursive_reorder(self, other))\n\n    def flatten(self, max_depth=16) -> \"Features\":\n        \"\"\"Flatten the features. Every dictionary column is removed and is replaced by\n        all the subfields it contains. 
The new fields are named by concatenating the\n        name of the original column and the subfield name like this: `<original>.<subfield>`.\n\n        If a column contains nested dictionaries, then all the lower-level subfields names are\n        also concatenated to form new columns: `<original>.<subfield>.<subsubfield>`, etc.\n\n        Returns:\n            [`Features`]:\n                The flattened features.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"rajpurkar/squad\", split=\"train\")\n        >>> ds.features.flatten()\n        {'answers.answer_start': List(Value('int32'), id=None),\n         'answers.text': List(Value('string'), id=None),\n         'context': Value('string'),\n         'id': Value('string'),\n         'question': Value('string'),\n         'title': Value('string')}\n        ```\n        \"\"\"\n        for depth in range(1, max_depth):\n            no_change = True\n            flattened = self.copy()\n            for column_name, subfeature in self.items():\n                if isinstance(subfeature, dict):\n                    no_change = False\n                    flattened.update({f\"{column_name}.{k}\": v for k, v in subfeature.items()})\n                    del flattened[column_name]\n                elif hasattr(subfeature, \"flatten\") and subfeature.flatten() != subfeature:\n                    no_change = False\n                    flattened.update({f\"{column_name}.{k}\": v for k, v in subfeature.flatten().items()})\n                    del flattened[column_name]\n            self = flattened\n            if no_change:\n                break\n        return self\n\n\ndef _is_null_feature(feature) -> bool:\n    \"\"\"Recursively check if a feature represents a null type.\n\n    This handles not only top-level ``Value(\"null\")`` but also nested null types\n    such as ``List(Value(\"null\"))``, ``LargeList(Value(\"null\"))``, and\n    
``Sequence(Value(\"null\"))``, which can arise when a shard contains only\n    empty lists during multi-process ``Dataset.map()``.\n    \"\"\"\n    if isinstance(feature, Value) and feature.dtype == \"null\":\n        return True\n    if isinstance(feature, (Sequence, LargeList)) and hasattr(feature, \"feature\"):\n        return _is_null_feature(feature.feature)\n    return False\n\n\ndef _align_features(features_list: list[Features]) -> list[Features]:\n    \"\"\"Align dictionaries of features so that the keys that are found in multiple dictionaries share the same feature.\"\"\"\n    name2feature = {}\n    for features in features_list:\n        for k, v in features.items():\n            if k in name2feature and isinstance(v, dict):\n                # Recursively align features.\n                name2feature[k] = _align_features([name2feature[k], v])[0]\n            elif k not in name2feature or _is_null_feature(name2feature[k]):\n                name2feature[k] = v\n\n    return [Features({k: name2feature[k] for k in features.keys()}) for features in features_list]\n\n\ndef _check_if_features_can_be_aligned(features_list: list[Features]):\n    \"\"\"Check if the dictionaries of features can be aligned.\n\n    Two dictionaries of features can be aligned if the keys they share have the same type or some of them is of\n    type ``Value(\"null\")`` (or a container wrapping ``Value(\"null\")``, such as ``List(Value(\"null\"))``).\n    \"\"\"\n    name2feature = {}\n    for features in features_list:\n        for k, v in features.items():\n            if k not in name2feature or _is_null_feature(name2feature[k]):\n                name2feature[k] = v\n\n    for features in features_list:\n        for k, v in features.items():\n            if isinstance(v, dict) and isinstance(name2feature[k], dict):\n                # Deep checks for structure.\n                _check_if_features_can_be_aligned([name2feature[k], v])\n            elif not _is_null_feature(v) and 
name2feature[k] != v:\n                raise ValueError(\n                    f'The features can\\'t be aligned because the key {k} of features {features} has unexpected type - {v} (expected either {name2feature[k]} or Value(\"null\").'\n                )\n\n\ndef _fix_for_backward_compatible_features(feature: Any) -> FeatureType:\n    def _fix_old_list(feature):\n        if isinstance(feature, list):\n            return List(_fix_for_backward_compatible_features(feature[0]))\n        return feature\n\n    return _visit(feature, _fix_old_list)\n"
  },
  {
    "path": "src/datasets/features/image.py",
    "content": "import os\nimport sys\nimport warnings\nfrom dataclasses import dataclass, field\nfrom io import BytesIO\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, ClassVar, Optional, Union\n\nimport numpy as np\nimport pyarrow as pa\n\nfrom .. import config\nfrom ..download.download_config import DownloadConfig\nfrom ..table import array_cast\nfrom ..utils.file_utils import is_local_path, xopen\nfrom ..utils.py_utils import first_non_null_value, no_op_if_value_is_null, string_to_dict\n\n\nif TYPE_CHECKING:\n    import PIL.Image\n\n    from .features import FeatureType\n\n\n_IMAGE_COMPRESSION_FORMATS: Optional[list[str]] = None\n_NATIVE_BYTEORDER = \"<\" if sys.byteorder == \"little\" else \">\"\n# Origin: https://github.com/python-pillow/Pillow/blob/698951e19e19972aeed56df686868f1329981c12/src/PIL/Image.py#L3126 minus \"|i1\" which values are not preserved correctly when saving and loading an image\n_VALID_IMAGE_ARRAY_DTPYES = [\n    np.dtype(\"|b1\"),\n    np.dtype(\"|u1\"),\n    np.dtype(\"<u2\"),\n    np.dtype(\">u2\"),\n    np.dtype(\"<i2\"),\n    np.dtype(\">i2\"),\n    np.dtype(\"<u4\"),\n    np.dtype(\">u4\"),\n    np.dtype(\"<i4\"),\n    np.dtype(\">i4\"),\n    np.dtype(\"<f4\"),\n    np.dtype(\">f4\"),\n    np.dtype(\"<f8\"),\n    np.dtype(\">f8\"),\n]\n\n\n@dataclass\nclass Image:\n    \"\"\"Image [`Feature`] to read image data from an image file.\n\n    Input: The Image feature accepts as input:\n    - A `str`: Absolute path to the image file (i.e. random access is allowed).\n    - A `pathlib.Path`: path to the image file (i.e. 
random access is allowed).\n    - A `dict` with the keys:\n\n        - `path`: String with relative path of the image file to the archive file.\n        - `bytes`: Bytes of the image file.\n\n      This is useful for parquet or webdataset files which embed image files.\n\n    - An `np.ndarray`: NumPy array representing an image.\n    - A `PIL.Image.Image`: PIL image object.\n\n    Output: The Image features output data as `PIL.Image.Image` objects.\n\n    Args:\n        mode (`str`, *optional*):\n            The mode to convert the image to. If `None`, the native mode of the image is used.\n        decode (`bool`, defaults to `True`):\n            Whether to decode the image data. If `False`,\n            returns the underlying dictionary in the format `{\"path\": image_path, \"bytes\": image_bytes}`.\n\n    Examples:\n\n    ```py\n    >>> from datasets import load_dataset, Image\n    >>> ds = load_dataset(\"AI-Lab-Makerere/beans\", split=\"train\")\n    >>> ds.features[\"image\"]\n    Image(decode=True, id=None)\n    >>> ds[0][\"image\"]\n    <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x500 at 0x15E52E7F0>\n    >>> ds = ds.cast_column('image', Image(decode=False))\n    >>> ds[0][\"image\"]\n    {'bytes': None,\n     'path': '/root/.cache/huggingface/datasets/downloads/extracted/b0a21163f78769a2cf11f58dfc767fb458fc7cea5c05dccc0144a2c0f0bc1292/train/healthy/healthy_train.85.jpg'}\n    ```\n    \"\"\"\n\n    mode: Optional[str] = None\n    decode: bool = True\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    dtype: ClassVar[str] = \"PIL.Image.Image\"\n    pa_type: ClassVar[Any] = pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})\n    _type: str = field(default=\"Image\", init=False, repr=False)\n\n    def __call__(self):\n        return self.pa_type\n\n    def encode_example(self, value: Union[str, bytes, bytearray, dict, np.ndarray, \"PIL.Image.Image\"]) -> dict:\n        \"\"\"Encode example into a format for Arrow.\n\n     
   Args:\n            value (`str`, `np.ndarray`, `PIL.Image.Image` or `dict`):\n                Data passed as input to Image feature.\n\n        Returns:\n            `dict` with \"path\" and \"bytes\" fields\n        \"\"\"\n        if config.PIL_AVAILABLE:\n            import PIL.Image\n        else:\n            raise ImportError(\"To support encoding images, please install 'Pillow'.\")\n\n        if isinstance(value, list):\n            value = np.array(value)\n\n        if isinstance(value, str):\n            return {\"path\": value, \"bytes\": None}\n        elif isinstance(value, Path):\n            return {\"path\": str(value.absolute()), \"bytes\": None}\n        elif isinstance(value, (bytes, bytearray)):\n            return {\"path\": None, \"bytes\": value}\n        elif isinstance(value, np.ndarray):\n            # convert the image array to PNG/TIFF bytes\n            return encode_np_array(value)\n        elif isinstance(value, PIL.Image.Image):\n            # convert the PIL image to bytes (default format is PNG/TIFF)\n            return encode_pil_image(value)\n        elif value.get(\"path\") is not None and os.path.isfile(value[\"path\"]):\n            # we set \"bytes\": None to not duplicate the data if they're already available locally\n            return {\"bytes\": None, \"path\": value.get(\"path\")}\n        elif value.get(\"bytes\") is not None or value.get(\"path\") is not None:\n            # store the image bytes, and path is used to infer the image format using the file extension\n            return {\"bytes\": value.get(\"bytes\"), \"path\": value.get(\"path\")}\n        else:\n            raise ValueError(\n                f\"An image sample should have one of 'path' or 'bytes' but they are missing or None in {value}.\"\n            )\n\n    def decode_example(self, value: dict, token_per_repo_id=None) -> \"PIL.Image.Image\":\n        \"\"\"Decode example image file into image data.\n\n        Args:\n            value (`str` or 
`dict`):\n                A string with the absolute image file path, a dictionary with\n                keys:\n\n                - `path`: String with absolute or relative image file path.\n                - `bytes`: The bytes of the image file.\n            token_per_repo_id (`dict`, *optional*):\n                To access and decode\n                image files from private repositories on the Hub, you can pass\n                a dictionary repo_id (`str`) -> token (`bool` or `str`).\n\n        Returns:\n            `PIL.Image.Image`\n        \"\"\"\n        if not self.decode:\n            raise RuntimeError(\"Decoding is disabled for this feature. Please use Image(decode=True) instead.\")\n\n        if config.PIL_AVAILABLE:\n            import PIL.Image\n            import PIL.ImageOps\n        else:\n            raise ImportError(\"To support decoding images, please install 'Pillow'.\")\n\n        if token_per_repo_id is None:\n            token_per_repo_id = {}\n\n        path, bytes_ = value[\"path\"], value[\"bytes\"]\n        if bytes_ is None:\n            if path is None:\n                raise ValueError(f\"An image should have one of 'path' or 'bytes' but both are None in {value}.\")\n            else:\n                if is_local_path(path):\n                    image = PIL.Image.open(path)\n                else:\n                    source_url = path.split(\"::\")[-1]\n                    pattern = (\n                        config.HUB_DATASETS_URL\n                        if source_url.startswith(config.HF_ENDPOINT)\n                        else config.HUB_DATASETS_HFFS_URL\n                    )\n                    source_url_fields = string_to_dict(source_url, pattern)\n                    token = (\n                        token_per_repo_id.get(source_url_fields[\"repo_id\"]) if source_url_fields is not None else None\n                    )\n                    download_config = DownloadConfig(token=token)\n                    with xopen(path, 
\"rb\", download_config=download_config) as f:\n                        bytes_ = BytesIO(f.read())\n                    image = PIL.Image.open(bytes_)\n        else:\n            image = PIL.Image.open(BytesIO(bytes_))\n        image.load()  # to avoid \"Too many open files\" errors\n        if image.getexif().get(PIL.Image.ExifTags.Base.Orientation) is not None:\n            image = PIL.ImageOps.exif_transpose(image)\n        if self.mode and self.mode != image.mode:\n            image = image.convert(self.mode)\n        return image\n\n    def flatten(self) -> Union[\"FeatureType\", dict[str, \"FeatureType\"]]:\n        \"\"\"If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.\"\"\"\n        from .features import Value\n\n        return (\n            self\n            if self.decode\n            else {\n                \"bytes\": Value(\"binary\"),\n                \"path\": Value(\"string\"),\n            }\n        )\n\n    def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArray]) -> pa.StructArray:\n        \"\"\"Cast an Arrow array to the Image arrow storage type.\n        The Arrow types that can be converted to the Image pyarrow storage type are:\n\n        - `pa.string()` - it must contain the \"path\" data\n        - `pa.large_string()` - it must contain the \"path\" data (will be cast to string if possible)\n        - `pa.binary()` - it must contain the image bytes\n        - `pa.struct({\"bytes\": pa.binary()})`\n        - `pa.struct({\"path\": pa.string()})`\n        - `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`  - order doesn't matter\n        - `pa.list(*)` - it must contain the image array data\n\n        Args:\n            storage (`Union[pa.StringArray, pa.StructArray, pa.ListArray]`):\n                PyArrow array to cast.\n\n        Returns:\n            `pa.StructArray`: Array in the Image arrow storage type, that is\n                
`pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`.\n        \"\"\"\n        if pa.types.is_large_string(storage.type):\n            try:\n                storage = storage.cast(pa.string())\n            except pa.ArrowInvalid as e:\n                raise ValueError(\n                    f\"Failed to cast large_string to string for Image feature. \"\n                    f\"This can happen if string values exceed 2GB. \"\n                    f\"Original error: {e}\"\n                ) from e\n        if pa.types.is_string(storage.type):\n            bytes_array = pa.array([None] * len(storage), type=pa.binary())\n            storage = pa.StructArray.from_arrays([bytes_array, storage], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_large_binary(storage.type):\n            storage = array_cast(\n                storage, pa.binary()\n            )  # this can fail in case of big images, paths should be used instead\n            path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays([storage, path_array], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_binary(storage.type):\n            path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays([storage, path_array], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_struct(storage.type):\n            if storage.type.get_field_index(\"bytes\") >= 0:\n                bytes_array = storage.field(\"bytes\")\n            else:\n                bytes_array = pa.array([None] * len(storage), type=pa.binary())\n            if storage.type.get_field_index(\"path\") >= 0:\n                path_array = storage.field(\"path\")\n            else:\n                path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays([bytes_array, path_array], [\"bytes\", \"path\"], 
mask=storage.is_null())\n        elif pa.types.is_list(storage.type):\n            bytes_array = pa.array(\n                [encode_np_array(np.array(arr))[\"bytes\"] if arr is not None else None for arr in storage.to_pylist()],\n                type=pa.binary(),\n            )\n            path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays(\n                [bytes_array, path_array], [\"bytes\", \"path\"], mask=bytes_array.is_null()\n            )\n        return array_cast(storage, self.pa_type)\n\n    def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.StructArray:\n        \"\"\"Embed image files into the Arrow array.\n\n        Args:\n            storage (`pa.StructArray`):\n                PyArrow array to embed.\n\n        Returns:\n            `pa.StructArray`: Array in the Image arrow storage type, that is\n                `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`.\n        \"\"\"\n        if token_per_repo_id is None:\n            token_per_repo_id = {}\n\n        @no_op_if_value_is_null\n        def path_to_bytes(path):\n            source_url = path.split(\"::\")[-1]\n            pattern = (\n                config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL\n            )\n            source_url_fields = string_to_dict(source_url, pattern)\n            token = token_per_repo_id.get(source_url_fields[\"repo_id\"]) if source_url_fields is not None else None\n            download_config = DownloadConfig(token=token)\n            with xopen(path, \"rb\", download_config=download_config) as f:\n                return f.read()\n\n        bytes_array = pa.array(\n            [\n                (path_to_bytes(x[\"path\"]) if x[\"bytes\"] is None else x[\"bytes\"]) if x is not None else None\n                for x in storage.to_pylist()\n            ],\n            type=pa.binary(),\n        )\n        
path_array = pa.array(\n            [os.path.basename(path) if path is not None else None for path in storage.field(\"path\").to_pylist()],\n            type=pa.string(),\n        )\n        storage = pa.StructArray.from_arrays([bytes_array, path_array], [\"bytes\", \"path\"], mask=bytes_array.is_null())\n        return array_cast(storage, self.pa_type)\n\n\ndef list_image_compression_formats() -> list[str]:\n    if config.PIL_AVAILABLE:\n        import PIL.Image\n    else:\n        raise ImportError(\"To support encoding images, please install 'Pillow'.\")\n\n    global _IMAGE_COMPRESSION_FORMATS\n    if _IMAGE_COMPRESSION_FORMATS is None:\n        PIL.Image.init()\n        _IMAGE_COMPRESSION_FORMATS = list(set(PIL.Image.OPEN.keys()) & set(PIL.Image.SAVE.keys()))\n    return _IMAGE_COMPRESSION_FORMATS\n\n\ndef image_to_bytes(image: \"PIL.Image.Image\") -> bytes:\n    \"\"\"Convert a PIL Image object to bytes using native compression if possible, otherwise use PNG/TIFF compression.\"\"\"\n    buffer = BytesIO()\n    if image.format in list_image_compression_formats():\n        format = image.format\n    else:\n        format = \"PNG\" if image.mode in [\"1\", \"L\", \"LA\", \"RGB\", \"RGBA\"] else \"TIFF\"\n    image.save(buffer, format=format)\n    return buffer.getvalue()\n\n\ndef encode_pil_image(image: \"PIL.Image.Image\") -> dict:\n    if hasattr(image, \"filename\") and image.filename != \"\":\n        return {\"path\": image.filename, \"bytes\": None}\n    else:\n        return {\"path\": None, \"bytes\": image_to_bytes(image)}\n\n\ndef encode_np_array(array: np.ndarray) -> dict:\n    if config.PIL_AVAILABLE:\n        import PIL.Image\n    else:\n        raise ImportError(\"To support encoding images, please install 'Pillow'.\")\n\n    dtype = array.dtype\n    dtype_byteorder = dtype.byteorder if dtype.byteorder != \"=\" else _NATIVE_BYTEORDER\n    dtype_kind = dtype.kind\n    dtype_itemsize = dtype.itemsize\n\n    dest_dtype = None\n\n    # Multi-channel 
array case (only np.dtype(\"|u1\") is allowed)\n    if array.shape[2:]:\n        if dtype_kind not in [\"u\", \"i\"]:\n            raise TypeError(\n                f\"Unsupported array dtype {dtype} for image encoding. Only {dest_dtype} is supported for multi-channel arrays.\"\n            )\n        dest_dtype = np.dtype(\"|u1\")\n        if dtype != dest_dtype:\n            warnings.warn(f\"Downcasting array dtype {dtype} to {dest_dtype} to be compatible with 'Pillow'\")\n    # Exact match\n    elif dtype in _VALID_IMAGE_ARRAY_DTPYES:\n        dest_dtype = dtype\n    else:  # Downcast the type within the kind (np.can_cast(from_type, to_type, casting=\"same_kind\") doesn't behave as expected, so do it manually)\n        while dtype_itemsize >= 1:\n            dtype_str = dtype_byteorder + dtype_kind + str(dtype_itemsize)\n            if np.dtype(dtype_str) in _VALID_IMAGE_ARRAY_DTPYES:\n                dest_dtype = np.dtype(dtype_str)\n                warnings.warn(f\"Downcasting array dtype {dtype} to {dest_dtype} to be compatible with 'Pillow'\")\n                break\n            else:\n                dtype_itemsize //= 2\n        if dest_dtype is None:\n            raise TypeError(\n                f\"Cannot downcast dtype {dtype} to a valid image dtype. 
Valid image dtypes: {_VALID_IMAGE_ARRAY_DTPYES}\"\n            )\n\n    image = PIL.Image.fromarray(array.astype(dest_dtype))\n    return {\"path\": None, \"bytes\": image_to_bytes(image)}\n\n\ndef objects_to_list_of_image_dicts(\n    objs: Union[list[str], list[dict], list[np.ndarray], list[\"PIL.Image.Image\"]],\n) -> list[dict]:\n    \"\"\"Encode a list of objects into a format suitable for creating an extension array of type `ImageExtensionType`.\"\"\"\n    if config.PIL_AVAILABLE:\n        import PIL.Image\n    else:\n        raise ImportError(\"To support encoding images, please install 'Pillow'.\")\n\n    if objs:\n        _, obj = first_non_null_value(objs)\n        if isinstance(obj, str):\n            return [{\"path\": obj, \"bytes\": None} if obj is not None else None for obj in objs]\n        if isinstance(obj, np.ndarray):\n            obj_to_image_dict_func = no_op_if_value_is_null(encode_np_array)\n            return [obj_to_image_dict_func(obj) for obj in objs]\n        elif isinstance(obj, PIL.Image.Image):\n            obj_to_image_dict_func = no_op_if_value_is_null(encode_pil_image)\n            return [obj_to_image_dict_func(obj) for obj in objs]\n        else:\n            return objs\n    else:\n        return objs\n"
  },
  {
    "path": "src/datasets/features/nifti.py",
    "content": "import os\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union\n\nimport pyarrow as pa\n\nfrom .. import config\nfrom ..download.download_config import DownloadConfig\nfrom ..table import array_cast\nfrom ..utils.file_utils import is_local_path, xopen\nfrom ..utils.py_utils import no_op_if_value_is_null, string_to_dict\n\n\nif TYPE_CHECKING:\n    import nibabel as nib\n\n    from .features import FeatureType\n\nif config.NIBABEL_AVAILABLE:\n    import nibabel as nib\n\n    class Nifti1ImageWrapper(nib.nifti1.Nifti1Image):\n        \"\"\"\n        A wrapper around nibabel's Nifti1Image to customize its representation.\n        \"\"\"\n\n        def __init__(self, nifti_image: nib.nifti1.Nifti1Image):\n            super().__init__(\n                dataobj=nifti_image.dataobj,\n                affine=nifti_image.affine,\n                header=nifti_image.header,\n                extra=nifti_image.extra,\n                file_map=nifti_image.file_map,\n                dtype=nifti_image.get_data_dtype(),\n            )\n            self.nifti_image = nifti_image\n\n        def _repr_html_(self):\n            from ipyniivue import NiiVue, ShowRender, SliceType, Volume\n            from IPython.display import display\n\n            bytes_ = self.nifti_image.to_bytes()\n            nv = NiiVue()\n            nv.set_slice_type(SliceType.MULTIPLANAR)\n            nv.opts.multiplanar_show_render = ShowRender.ALWAYS\n            nv.opts.show_3d_crosshair = True\n            nv.opts.multiplanar_force_render = True\n            name = None\n            if hasattr(self.nifti_image, \"file_map\"):\n                if (\n                    \"image\" in self.nifti_image.file_map\n                    and getattr(self.nifti_image.file_map[\"image\"], \"filename\", None) is not None\n                ):\n                    name = self.nifti_image.file_map[\"image\"].filename\n      
      if name is None:\n                name = \"volume.nii.gz\"\n            volume = Volume(name=name, data=bytes_)\n            nv.load_volumes([volume])\n            display(nv)\n\n\n@dataclass\nclass Nifti:\n    \"\"\"\n    **Experimental.**\n    Nifti [`Feature`] to read NIfTI neuroimaging files.\n\n    Input: The Nifti feature accepts as input:\n    - A `str`: Absolute path to the NIfTI file (i.e. random access is allowed).\n    - A `pathlib.Path`: path to the NIfTI file (i.e. random access is allowed).\n    - A `dict` with the keys:\n        - `path`: String with relative path of the NIfTI file in a dataset repository.\n        - `bytes`: Bytes of the NIfTI file.\n      This is useful for archived files with sequential access.\n\n    - A `nibabel` image object (e.g., `nibabel.nifti1.Nifti1Image`).\n\n    Args:\n        decode (`bool`, defaults to `True`):\n            Whether to decode the NIfTI data. If `False`,\n            returns the underlying dictionary in the format `{\"path\": nifti_path, \"bytes\": nifti_bytes}`. `decode=False` is not supported when decoding examples.\n\n    Examples:\n\n    ```py\n    >>> from datasets import Dataset, Nifti\n    >>> ds = Dataset.from_dict({\"nifti\": [\"path/to/file.nii.gz\"]}).cast_column(\"nifti\", Nifti())\n    >>> ds.features[\"nifti\"]\n    Nifti(decode=True, id=None)\n    >>> ds[0][\"nifti\"]\n    <nibabel.nifti1.Nifti1Image object at 0x7f8a1c2d8f40>\n    >>> ds = ds.cast_column(\"nifti\", Nifti(decode=False))\n    >>> ds[0][\"nifti\"]\n    {'bytes': None,\n    'path': 'path/to/file.nii.gz'}\n    ```\n    \"\"\"\n\n    decode: bool = True\n    id: Optional[str] = field(default=None, repr=False)\n\n    # Automatically constructed\n    dtype: ClassVar[str] = \"nibabel.nifti1.Nifti1Image\"\n    pa_type: ClassVar[Any] = pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})\n    _type: str = field(default=\"Nifti\", init=False, repr=False)\n\n    def __call__(self):\n        return self.pa_type\n\n    def encode_example(self, value: Union[str, bytes, bytearray, dict, 
\"nib.Nifti1Image\"]) -> dict:\n        \"\"\"Encode example into a format for Arrow.\n\n        Args:\n            value (`str`, `bytes`, `nibabel.Nifti1Image` or `dict`):\n                Data passed as input to Nifti feature.\n\n        Returns:\n            `dict` with \"path\" and \"bytes\" fields\n        \"\"\"\n        if config.NIBABEL_AVAILABLE:\n            import nibabel as nib\n        else:\n            nib = None\n\n        if isinstance(value, str):\n            return {\"path\": value, \"bytes\": None}\n        elif isinstance(value, Path):\n            return {\"path\": str(value.absolute()), \"bytes\": None}\n        elif isinstance(value, (bytes, bytearray)):\n            return {\"path\": None, \"bytes\": value}\n        elif nib is not None and isinstance(value, nib.spatialimages.SpatialImage):\n            # nibabel image object - try to get path or convert to bytes\n            return encode_nibabel_image(value)\n        elif isinstance(value, dict):\n            if value.get(\"path\") is not None and os.path.isfile(value[\"path\"]):\n                # we set \"bytes\": None to not duplicate the data if they're already available locally\n                return {\"bytes\": None, \"path\": value.get(\"path\")}\n            elif value.get(\"bytes\") is not None or value.get(\"path\") is not None:\n                # store the nifti bytes, and path is used to infer the format using the file extension\n                return {\"bytes\": value.get(\"bytes\"), \"path\": value.get(\"path\")}\n            else:\n                raise ValueError(\n                    f\"A nifti sample should have one of 'path' or 'bytes' but they are missing or None in {value}.\"\n                )\n        else:\n            raise ValueError(\n                f\"A nifti sample should be a string, bytes, Path, nibabel image, or dict, but got {type(value)}.\"\n            )\n\n    def decode_example(self, value: dict, token_per_repo_id=None) -> \"Nifti1ImageWrapper\":\n 
       \"\"\"Decode example NIfTI file into nibabel image object.\n\n        Args:\n            value (`str` or `dict`):\n                A string with the absolute NIfTI file path, a dictionary with\n                keys:\n\n                - `path`: String with absolute or relative NIfTI file path.\n                - `bytes`: The bytes of the NIfTI file.\n\n            token_per_repo_id (`dict`, *optional*):\n                To access and decode NIfTI files from private repositories on\n                the Hub, you can pass a dictionary\n                repo_id (`str`) -> token (`bool` or `str`).\n\n        Returns:\n            `nibabel.Nifti1Image` objects\n        \"\"\"\n        if config.NIBABEL_AVAILABLE:\n            import nibabel as nib\n        else:\n            raise ImportError(\"To support decoding NIfTI files, please install 'nibabel'.\")\n\n        if token_per_repo_id is None:\n            token_per_repo_id = {}\n\n        path, bytes_ = value[\"path\"], value[\"bytes\"]\n        if bytes_ is None:\n            if path is None:\n                raise ValueError(f\"A nifti should have one of 'path' or 'bytes' but both are None in {value}.\")\n            else:\n                # gzipped files have the structure: 'gzip://T1.nii::<local_path>'\n                if path.startswith(\"gzip://\") and is_local_path(path.split(\"::\")[-1]):\n                    path = path.split(\"::\")[-1]\n                if is_local_path(path):\n                    nifti = nib.load(path)\n                else:\n                    source_url = path.split(\"::\")[-1]\n                    pattern = (\n                        config.HUB_DATASETS_URL\n                        if source_url.startswith(config.HF_ENDPOINT)\n                        else config.HUB_DATASETS_HFFS_URL\n                    )\n                    source_url_fields = string_to_dict(source_url, pattern)\n                    token = (\n                        
token_per_repo_id.get(source_url_fields[\"repo_id\"]) if source_url_fields is not None else None\n                    )\n                    download_config = DownloadConfig(token=token)\n                    with xopen(path, \"rb\", download_config=download_config) as f:\n                        nifti = nib.load(f)\n        else:\n            import gzip\n\n            if (\n                bytes_[:2] == b\"\\x1f\\x8b\"\n            ):  # gzip magic number, see https://stackoverflow.com/a/76055284/9534390 or \"Magic number\" on https://en.wikipedia.org/wiki/Gzip\n                bytes_ = gzip.decompress(bytes_)\n\n            nifti = nib.Nifti1Image.from_bytes(bytes_)\n\n        return Nifti1ImageWrapper(nifti)\n\n    def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.StructArray:\n        \"\"\"Embed NifTI files into the Arrow array.\n\n        Args:\n            storage (`pa.StructArray`):\n                PyArrow array to embed.\n\n        Returns:\n            `pa.StructArray`: Array in the NifTI arrow storage type, that is\n                `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`.\n        \"\"\"\n        if token_per_repo_id is None:\n            token_per_repo_id = {}\n\n        @no_op_if_value_is_null\n        def path_to_bytes(path):\n            source_url = path.split(\"::\")[-1]\n            pattern = (\n                config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL\n            )\n            source_url_fields = string_to_dict(source_url, pattern)\n            token = token_per_repo_id.get(source_url_fields[\"repo_id\"]) if source_url_fields is not None else None\n            download_config = DownloadConfig(token=token)\n            with xopen(path, \"rb\", download_config=download_config) as f:\n                return f.read()\n\n        bytes_array = pa.array(\n            [\n                (path_to_bytes(x[\"path\"]) if x[\"bytes\"] is None else 
x[\"bytes\"]) if x is not None else None\n                for x in storage.to_pylist()\n            ],\n            type=pa.binary(),\n        )\n        path_array = pa.array(\n            [os.path.basename(path) if path is not None else None for path in storage.field(\"path\").to_pylist()],\n            type=pa.string(),\n        )\n        storage = pa.StructArray.from_arrays([bytes_array, path_array], [\"bytes\", \"path\"], mask=bytes_array.is_null())\n        return array_cast(storage, self.pa_type)\n\n    def flatten(self) -> Union[\"FeatureType\", Dict[str, \"FeatureType\"]]:\n        \"\"\"If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.\"\"\"\n        from .features import Value\n\n        return (\n            self\n            if self.decode\n            else {\n                \"bytes\": Value(\"binary\"),\n                \"path\": Value(\"string\"),\n            }\n        )\n\n    def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.BinaryArray]) -> pa.StructArray:\n        \"\"\"Cast an Arrow array to the Nifti arrow storage type.\n        The Arrow types that can be converted to the Nifti pyarrow storage type are:\n\n        - `pa.string()` - it must contain the \"path\" data\n        - `pa.binary()` - it must contain the NIfTI bytes\n        - `pa.struct({\"bytes\": pa.binary()})`\n        - `pa.struct({\"path\": pa.string()})`\n        - `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`  - order doesn't matter\n\n        Args:\n            storage (`Union[pa.StringArray, pa.StructArray, pa.BinaryArray]`):\n                PyArrow array to cast.\n\n        Returns:\n            `pa.StructArray`: Array in the Nifti arrow storage type, that is\n                `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`.\n        \"\"\"\n        if pa.types.is_string(storage.type):\n            bytes_array = pa.array([None] * len(storage), type=pa.binary())\n        
    storage = pa.StructArray.from_arrays([bytes_array, storage], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_binary(storage.type):\n            path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays([storage, path_array], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_struct(storage.type):\n            if storage.type.get_field_index(\"bytes\") >= 0:\n                bytes_array = storage.field(\"bytes\")\n            else:\n                bytes_array = pa.array([None] * len(storage), type=pa.binary())\n            if storage.type.get_field_index(\"path\") >= 0:\n                path_array = storage.field(\"path\")\n            else:\n                path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays([bytes_array, path_array], [\"bytes\", \"path\"], mask=storage.is_null())\n        return array_cast(storage, self.pa_type)\n\n\ndef encode_nibabel_image(img: \"nib.Nifti1Image\", force_bytes: bool = False) -> dict[str, Optional[Union[str, bytes]]]:\n    \"\"\"\n    Encode a nibabel image object into a dictionary.\n\n    If the image has an associated file path, returns the path. Otherwise, serializes\n    the image content into bytes.\n\n    Args:\n        img: A nibabel image object (e.g., Nifti1Image).\n        force_bytes: If `True`, always serialize to bytes even if a file path exists. Needed to upload bytes properly.\n\n    Returns:\n        dict: A dictionary with \"path\" or \"bytes\" field.\n    \"\"\"\n    if hasattr(img, \"file_map\") and img.file_map is not None and not force_bytes:\n        filename = img.file_map[\"image\"].filename\n        return {\"path\": filename, \"bytes\": None}\n\n    bytes_data = img.to_bytes()\n    return {\"path\": None, \"bytes\": bytes_data}\n"
  },
  {
    "path": "src/datasets/features/pdf.py",
    "content": "import os\nfrom dataclasses import dataclass, field\nfrom io import BytesIO\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union\n\nimport pyarrow as pa\n\nfrom .. import config\nfrom ..download.download_config import DownloadConfig\nfrom ..table import array_cast\nfrom ..utils.file_utils import is_local_path, xopen\nfrom ..utils.py_utils import no_op_if_value_is_null, string_to_dict\n\n\nif TYPE_CHECKING:\n    import pdfplumber\n\n    from .features import FeatureType\n\n\ndef pdf_to_bytes(pdf: \"pdfplumber.pdf.PDF\") -> bytes:\n    \"\"\"Convert a pdfplumber.pdf.PDF object to bytes.\"\"\"\n    with BytesIO() as buffer:\n        for page in pdf.pages:\n            buffer.write(page.pdf.stream)\n        return buffer.getvalue()\n\n\n@dataclass\nclass Pdf:\n    \"\"\"\n    **Experimental.**\n    Pdf [`Feature`] to read pdf documents from a pdf file.\n\n    Input: The Pdf feature accepts as input:\n    - A `str`: Absolute path to the pdf file (i.e. random access is allowed).\n    - A `pathlib.Path`: path to the pdf file (i.e. random access is allowed).\n    - A `dict` with the keys:\n        - `path`: String with relative path of the pdf file in a dataset repository.\n        - `bytes`: Bytes of the pdf file.\n      This is useful for archived files with sequential access.\n\n    - A `pdfplumber.pdf.PDF`: pdfplumber pdf object.\n\n    Args:\n        decode (`bool`, defaults to `True`):\n            Whether to decode the pdf data. 
If `False`,\n            returns the underlying dictionary in the format `{\"path\": pdf_path, \"bytes\": pdf_bytes}`.\n\n    Examples:\n\n    ```py\n    >>> from datasets import Dataset, Pdf\n    >>> ds = Dataset.from_dict({\"pdf\": [\"path/to/pdf/file.pdf\"]}).cast_column(\"pdf\", Pdf())\n    >>> ds.features[\"pdf\"]\n    Pdf(decode=True, id=None)\n    >>> ds[0][\"pdf\"]\n    <pdfplumber.pdf.PDF object at 0x7f8a1c2d8f40>\n    >>> ds = ds.cast_column(\"pdf\", Pdf(decode=False))\n    >>> ds[0][\"pdf\"]\n    {'bytes': None,\n    'path': 'path/to/pdf/file.pdf'}\n    ```\n    \"\"\"\n\n    decode: bool = True\n    id: Optional[str] = field(default=None, repr=False)\n\n    # Automatically constructed\n    dtype: ClassVar[str] = \"pdfplumber.pdf.PDF\"\n    pa_type: ClassVar[Any] = pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})\n    _type: str = field(default=\"Pdf\", init=False, repr=False)\n\n    def __call__(self):\n        return self.pa_type\n\n    def encode_example(self, value: Union[str, bytes, bytearray, dict, \"pdfplumber.pdf.PDF\"]) -> dict:\n        \"\"\"Encode example into a format for Arrow.\n\n        Args:\n            value (`str`, `bytes`, `pdfplumber.pdf.PDF` or `dict`):\n                Data passed as input to Pdf feature.\n\n        Returns:\n            `dict` with \"path\" and \"bytes\" fields\n        \"\"\"\n        if config.PDFPLUMBER_AVAILABLE:\n            import pdfplumber\n        else:\n            pdfplumber = None\n\n        if isinstance(value, str):\n            return {\"path\": value, \"bytes\": None}\n        elif isinstance(value, Path):\n            return {\"path\": str(value.absolute()), \"bytes\": None}\n        elif isinstance(value, (bytes, bytearray)):\n            return {\"path\": None, \"bytes\": value}\n        elif pdfplumber is not None and isinstance(value, pdfplumber.pdf.PDF):\n            # convert the pdfplumber.pdf.PDF to bytes\n            return encode_pdfplumber_pdf(value)\n        elif 
value.get(\"path\") is not None and os.path.isfile(value[\"path\"]):\n            # we set \"bytes\": None to not duplicate the data if they're already available locally\n            return {\"bytes\": None, \"path\": value.get(\"path\")}\n        elif value.get(\"bytes\") is not None or value.get(\"path\") is not None:\n            # store the pdf bytes, and path is used to infer the pdf format using the file extension\n            return {\"bytes\": value.get(\"bytes\"), \"path\": value.get(\"path\")}\n        else:\n            raise ValueError(\n                f\"A pdf sample should have one of 'path' or 'bytes' but they are missing or None in {value}.\"\n            )\n\n    def decode_example(self, value: dict, token_per_repo_id=None) -> \"pdfplumber.pdf.PDF\":\n        \"\"\"Decode example pdf file into pdf data.\n\n        Args:\n            value (`str` or `dict`):\n                A string with the absolute pdf file path, a dictionary with\n                keys:\n\n                - `path`: String with absolute or relative pdf file path.\n                - `bytes`: The bytes of the pdf file.\n\n            token_per_repo_id (`dict`, *optional*):\n                To access and decode pdf files from private repositories on\n                the Hub, you can pass a dictionary\n                repo_id (`str`) -> token (`bool` or `str`).\n\n        Returns:\n            `pdfplumber.pdf.PDF`\n        \"\"\"\n        if not self.decode:\n            raise RuntimeError(\"Decoding is disabled for this feature. 
Please use Pdf(decode=True) instead.\")\n\n        if config.PDFPLUMBER_AVAILABLE:\n            import pdfplumber\n        else:\n            raise ImportError(\"To support decoding pdfs, please install 'pdfplumber'.\")\n\n        if token_per_repo_id is None:\n            token_per_repo_id = {}\n\n        path, bytes_ = value[\"path\"], value[\"bytes\"]\n        if bytes_ is None:\n            if path is None:\n                raise ValueError(f\"A pdf should have one of 'path' or 'bytes' but both are None in {value}.\")\n            else:\n                if is_local_path(path):\n                    pdf = pdfplumber.open(path)\n                else:\n                    source_url = path.split(\"::\")[-1]\n                    pattern = (\n                        config.HUB_DATASETS_URL\n                        if source_url.startswith(config.HF_ENDPOINT)\n                        else config.HUB_DATASETS_HFFS_URL\n                    )\n                    try:\n                        repo_id = string_to_dict(source_url, pattern)[\"repo_id\"]\n                        token = token_per_repo_id.get(repo_id)\n                    except ValueError:\n                        token = None\n                    download_config = DownloadConfig(token=token)\n                    f = xopen(path, \"rb\", download_config=download_config)\n                    return pdfplumber.open(f)\n        else:\n            with pdfplumber.open(BytesIO(bytes_)) as p:\n                pdf = p\n\n        return pdf\n\n    def flatten(self) -> Union[\"FeatureType\", Dict[str, \"FeatureType\"]]:\n        \"\"\"If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.\"\"\"\n        from .features import Value\n\n        return (\n            self\n            if self.decode\n            else {\n                \"bytes\": Value(\"binary\"),\n                \"path\": Value(\"string\"),\n            }\n        )\n\n    def cast_storage(self, storage: 
Union[pa.StringArray, pa.StructArray, pa.ListArray]) -> pa.StructArray:\n        \"\"\"Cast an Arrow array to the Pdf arrow storage type.\n        The Arrow types that can be converted to the Pdf pyarrow storage type are:\n\n        - `pa.string()` - it must contain the \"path\" data\n        - `pa.binary()` - it must contain the pdf bytes\n        - `pa.struct({\"bytes\": pa.binary()})`\n        - `pa.struct({\"path\": pa.string()})`\n        - `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`  - order doesn't matter\n        - `pa.list(*)` - it must contain the pdf array data\n\n        Args:\n            storage (`Union[pa.StringArray, pa.StructArray, pa.ListArray]`):\n                PyArrow array to cast.\n\n        Returns:\n            `pa.StructArray`: Array in the Pdf arrow storage type, that is\n                `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`.\n        \"\"\"\n        if pa.types.is_string(storage.type):\n            bytes_array = pa.array([None] * len(storage), type=pa.binary())\n            storage = pa.StructArray.from_arrays([bytes_array, storage], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_binary(storage.type):\n            path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays([storage, path_array], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_struct(storage.type):\n            if storage.type.get_field_index(\"bytes\") >= 0:\n                bytes_array = storage.field(\"bytes\")\n            else:\n                bytes_array = pa.array([None] * len(storage), type=pa.binary())\n            if storage.type.get_field_index(\"path\") >= 0:\n                path_array = storage.field(\"path\")\n            else:\n                path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays([bytes_array, path_array], [\"bytes\", \"path\"], 
mask=storage.is_null())\n        return array_cast(storage, self.pa_type)\n\n    def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.StructArray:\n        \"\"\"Embed PDF files into the Arrow array.\n\n        Args:\n            storage (`pa.StructArray`):\n                PyArrow array to embed.\n\n        Returns:\n            `pa.StructArray`: Array in the PDF arrow storage type, that is\n                `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`.\n        \"\"\"\n        if token_per_repo_id is None:\n            token_per_repo_id = {}\n\n        @no_op_if_value_is_null\n        def path_to_bytes(path):\n            source_url = path.split(\"::\")[-1]\n            pattern = (\n                config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL\n            )\n            source_url_fields = string_to_dict(source_url, pattern)\n            token = token_per_repo_id.get(source_url_fields[\"repo_id\"]) if source_url_fields is not None else None\n            download_config = DownloadConfig(token=token)\n            with xopen(path, \"rb\", download_config=download_config) as f:\n                return f.read()\n\n        bytes_array = pa.array(\n            [\n                (path_to_bytes(x[\"path\"]) if x[\"bytes\"] is None else x[\"bytes\"]) if x is not None else None\n                for x in storage.to_pylist()\n            ],\n            type=pa.binary(),\n        )\n        path_array = pa.array(\n            [os.path.basename(path) if path is not None else None for path in storage.field(\"path\").to_pylist()],\n            type=pa.string(),\n        )\n        storage = pa.StructArray.from_arrays([bytes_array, path_array], [\"bytes\", \"path\"], mask=bytes_array.is_null())\n        return array_cast(storage, self.pa_type)\n\n\ndef encode_pdfplumber_pdf(pdf: \"pdfplumber.pdf.PDF\") -> dict:\n    \"\"\"\n    Encode a pdfplumber.pdf.PDF object into a 
dictionary.\n\n    If the PDF has an associated file path, returns the path. Otherwise, serializes\n    the PDF content into bytes.\n\n    Args:\n        pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.\n\n    Returns:\n        dict: A dictionary with \"path\" or \"bytes\" field.\n    \"\"\"\n    if hasattr(pdf, \"stream\") and hasattr(pdf.stream, \"name\") and pdf.stream.name:\n        # Return the path if the PDF has an associated file path\n        return {\"path\": pdf.stream.name, \"bytes\": None}\n    else:\n        # Convert the PDF to bytes if no path is available\n        return {\"path\": None, \"bytes\": pdf_to_bytes(pdf)}\n"
  },
  {
    "path": "src/datasets/features/translation.py",
    "content": "from dataclasses import dataclass, field\nfrom typing import TYPE_CHECKING, Any, ClassVar, Optional, Union\n\nimport pyarrow as pa\n\n\nif TYPE_CHECKING:\n    from .features import FeatureType\n\n\n@dataclass\nclass Translation:\n    \"\"\"`Feature` for translations with fixed languages per example.\n    Here for compatibility with tfds.\n\n    Args:\n        languages (`dict`):\n            A dictionary for each example mapping string language codes to string translations.\n\n    Example:\n\n    ```python\n    >>> # At construction time:\n    >>> datasets.features.Translation(languages=['en', 'fr', 'de'])\n    >>> # During data generation:\n    >>> yield {\n    ...         'en': 'the cat',\n    ...         'fr': 'le chat',\n    ...         'de': 'die katze'\n    ... }\n    ```\n    \"\"\"\n\n    languages: list[str]\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    dtype: ClassVar[str] = \"dict\"\n    pa_type: ClassVar[Any] = None\n    _type: str = field(default=\"Translation\", init=False, repr=False)\n\n    def __call__(self):\n        return pa.struct({lang: pa.string() for lang in sorted(self.languages)})\n\n    def flatten(self) -> Union[\"FeatureType\", dict[str, \"FeatureType\"]]:\n        \"\"\"Flatten the Translation feature into a dictionary.\"\"\"\n        from .features import Value\n\n        return {k: Value(\"string\") for k in sorted(self.languages)}\n\n\n@dataclass\nclass TranslationVariableLanguages:\n    \"\"\"`Feature` for translations with variable languages per example.\n    Here for compatibility with tfds.\n\n    Args:\n        languages (`dict`):\n            A dictionary for each example mapping string language codes to one or more string translations.\n            The languages present may vary from example to example.\n\n    Returns:\n        - `language` or `translation` (variable-length 1D `tf.Tensor` of `tf.string`):\n            Language codes sorted in ascending order or 
plain text translations, sorted to align with language codes.\n\n    Example:\n\n    ```python\n    >>> # At construction time:\n    >>> datasets.features.TranslationVariableLanguages(languages=['en', 'fr', 'de'])\n    >>> # During data generation:\n    >>> yield {\n    ...         'en': 'the cat',\n    ...         'fr': ['le chat', 'la chatte,']\n    ...         'de': 'die katze'\n    ... }\n    >>> # Tensor returned :\n    >>> {\n    ...         'language': ['en', 'de', 'fr', 'fr'],\n    ...         'translation': ['the cat', 'die katze', 'la chatte', 'le chat'],\n    ... }\n    ```\n    \"\"\"\n\n    languages: Optional[list] = None\n    num_languages: Optional[int] = None\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    dtype: ClassVar[str] = \"dict\"\n    pa_type: ClassVar[Any] = None\n    _type: str = field(default=\"TranslationVariableLanguages\", init=False, repr=False)\n\n    def __post_init__(self):\n        self.languages = sorted(set(self.languages)) if self.languages else None\n        self.num_languages = len(self.languages) if self.languages else None\n\n    def __call__(self):\n        return pa.struct({\"language\": pa.list_(pa.string()), \"translation\": pa.list_(pa.string())})\n\n    def encode_example(self, translation_dict):\n        lang_set = set(self.languages)\n        if set(translation_dict) == {\"language\", \"translation\"}:\n            return translation_dict\n        elif self.languages and set(translation_dict) - lang_set:\n            raise ValueError(\n                f\"Some languages in example ({', '.join(sorted(set(translation_dict) - lang_set))}) are not in valid set ({', '.join(lang_set)}).\"\n            )\n\n        # Convert dictionary into tuples, splitting out cases where there are\n        # multiple translations for a single language.\n        translation_tuples = []\n        for lang, text in translation_dict.items():\n            if isinstance(text, str):\n               
 translation_tuples.append((lang, text))\n            else:\n                translation_tuples.extend([(lang, el) for el in text])\n\n        # Ensure translations are in ascending order by language code.\n        languages, translations = zip(*sorted(translation_tuples))\n\n        return {\"language\": languages, \"translation\": translations}\n\n    def flatten(self) -> Union[\"FeatureType\", dict[str, \"FeatureType\"]]:\n        \"\"\"Flatten the TranslationVariableLanguages feature into a dictionary.\"\"\"\n        from .features import List, Value\n\n        return {\n            \"language\": List(Value(\"string\")),\n            \"translation\": List(Value(\"string\")),\n        }\n"
  },
  {
    "path": "src/datasets/features/video.py",
    "content": "import os\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, TypedDict, Union\n\nimport numpy as np\nimport pyarrow as pa\n\nfrom .. import config\nfrom ..download.download_config import DownloadConfig\nfrom ..table import array_cast\nfrom ..utils.file_utils import is_local_path, xopen\nfrom ..utils.py_utils import no_op_if_value_is_null, string_to_dict\n\n\nif TYPE_CHECKING:\n    import torch\n    from torchcodec.decoders import VideoDecoder\n\n    from .features import FeatureType\n\n\nclass Example(TypedDict):\n    path: Optional[str]\n    bytes: Optional[bytes]\n\n\n@dataclass\nclass Video:\n    \"\"\"\n    Video [`Feature`] to read video data from a video file.\n\n    Input: The Video feature accepts as input:\n    - A `str`: Absolute path to the video file (i.e. random access is allowed).\n    - A `pathlib.Path`: path to the video file (i.e. random access is allowed).\n    - A `dict` with the keys:\n\n        - `path`: String with relative path of the video file in a dataset repository.\n        - `bytes`: Bytes of the video file.\n\n      This is useful for parquet or webdataset files which embed video files.\n\n    - A `torchcodec.decoders.VideoDecoder`: torchcodec video decoder object.\n\n    Output: The Video features output data as `torchcodec.decoders.VideoDecoder` objects.\n\n    Args:\n        decode (`bool`, defaults to `True`):\n            Whether to decode the video data. If `False`,\n            returns the underlying dictionary in the format `{\"path\": video_path, \"bytes\": video_bytes}`.\n        stream_index (`int`, *optional*):\n            The streaming index to use from the file. 
If `None` defaults to the \"best\" index.\n        dimension_order (`str`, defaults to `NCHW`):\n            The dimension order of the decoded frames.\n            where N is the batch size, C is the number of channels,\n            H is the height, and W is the width of the frames.\n        num_ffmpeg_threads (`int`, defaults to `1`):\n            The number of threads to use for decoding the video. (Recommended to keep this at 1)\n        device (`str` or `torch.device`, defaults to `cpu`):\n            The device to use for decoding the video.\n        seek_mode (`str`, defaults to `exact`):\n            Determines if frame access will be “exact” or “approximate”.\n            Exact guarantees that requesting frame i will always return frame i, but doing so requires an initial scan of the file.\n            Approximate is faster as it avoids scanning the file, but less accurate as it uses the file's metadata to calculate where i probably is.\n            read more [here](https://docs.pytorch.org/torchcodec/stable/generated_examples/approximate_mode.html#sphx-glr-generated-examples-approximate-mode-py)\n\n    Examples:\n\n    ```py\n    >>> from datasets import Dataset, Video\n    >>> ds = Dataset.from_dict({\"video\":[\"path/to/Screen Recording.mov\"]}).cast_column(\"video\", Video())\n    >>> ds.features[\"video\"]\n    Video(decode=True, id=None)\n    >>> ds[0][\"video\"]\n    <torchcodec.decoders._video_decoder.VideoDecoder object at 0x14a61e080>\n    >>> video = ds[0][\"video\"]\n    >>> video.get_frames_in_range(0, 10)\n    FrameBatch:\n    data (shape): torch.Size([10, 3, 50, 66])\n    pts_seconds: tensor([0.4333, 0.4333, 0.4333, 0.4333, 0.4333, 0.4333, 0.4333, 0.4333, 0.4333,\n            0.4333], dtype=torch.float64)\n    duration_seconds: tensor([0.0167, 0.0167, 0.0167, 0.0167, 0.0167, 0.0167, 0.0167, 0.0167, 0.0167,\n            0.0167], dtype=torch.float64)\n    >>> ds.cast_column('video', Video(decode=False))[0][\"video\"]\n    {'bytes': None,\n     
'path': 'path/to/Screen Recording.mov'}\n    ```\n    \"\"\"\n\n    decode: bool = True\n    stream_index: Optional[int] = None\n    dimension_order: Literal[\"NCHW\", \"NHWC\"] = \"NCHW\"\n    num_ffmpeg_threads: int = 1\n    device: Optional[Union[str, \"torch.device\"]] = \"cpu\"\n    seek_mode: Literal[\"exact\", \"approximate\"] = \"exact\"\n    id: Optional[str] = field(default=None, repr=False)\n    # Automatically constructed\n    dtype: ClassVar[str] = \"torchcodec.decoders.VideoDecoder\"\n    pa_type: ClassVar[Any] = pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})\n    _type: str = field(default=\"Video\", init=False, repr=False)\n\n    def __call__(self):\n        return self.pa_type\n\n    def encode_example(self, value: Union[str, bytes, bytearray, Example, np.ndarray, \"VideoDecoder\"]) -> Example:\n        \"\"\"Encode example into a format for Arrow.\n\n        Args:\n            value (`str`, `np.ndarray`, `bytes`, `bytearray`, `VideoDecoder` or `dict`):\n                Data passed as input to Video feature.\n\n        Returns:\n            `dict` with \"path\" and \"bytes\" fields\n        \"\"\"\n        if value is None:\n            raise ValueError(\"value must be provided\")\n\n        if config.TORCHCODEC_AVAILABLE:\n            from torchcodec.decoders import VideoDecoder\n        else:\n            VideoDecoder = None\n\n        if isinstance(value, list):\n            value = np.array(value)\n\n        if isinstance(value, str):\n            return {\"path\": value, \"bytes\": None}\n        elif isinstance(value, Path):\n            return {\"path\": str(value.absolute()), \"bytes\": None}\n        elif isinstance(value, (bytes, bytearray)):\n            return {\"path\": None, \"bytes\": value}\n        elif isinstance(value, np.ndarray):\n            # convert the video array to bytes\n            return encode_np_array(value)\n        elif VideoDecoder is not None and isinstance(value, VideoDecoder):\n            # convert 
the torchcodec video decoder to bytes\n            return encode_torchcodec_video(value)\n        elif isinstance(value, dict):\n            path, bytes_ = value.get(\"path\"), value.get(\"bytes\")\n            if path is not None and os.path.isfile(path):\n                # we set \"bytes\": None to not duplicate the data if they're already available locally\n                return {\"bytes\": None, \"path\": path}\n            elif bytes_ is not None or path is not None:\n                # store the video bytes, and path is used to infer the video format using the file extension\n                return {\"bytes\": bytes_, \"path\": path}\n            else:\n                raise ValueError(\n                    f\"A video sample should have one of 'path' or 'bytes' but they are missing or None in {value}.\"\n                )\n        else:\n            raise TypeError(f\"Unsupported encode_example type: {type(value)}\")\n\n    def decode_example(\n        self,\n        value: Union[str, Example],\n        token_per_repo_id: Optional[dict[str, Union[bool, str]]] = None,\n    ) -> \"VideoDecoder\":\n        \"\"\"Decode example video file into video data.\n\n        Args:\n            value (`str` or `dict`):\n                A string with the absolute video file path, a dictionary with\n                keys:\n\n                - `path`: String with absolute or relative video file path.\n                - `bytes`: The bytes of the video file.\n            token_per_repo_id (`dict`, *optional*):\n                To access and decode\n                video files from private repositories on the Hub, you can pass\n                a dictionary repo_id (`str`) -> token (`bool` or `str`).\n\n        Returns:\n            `torchcodec.decoders.VideoDecoder`\n        \"\"\"\n        if not self.decode:\n            raise RuntimeError(\"Decoding is disabled for this feature. 
Please use Video(decode=True) instead.\")\n\n        if config.TORCHCODEC_AVAILABLE:\n            from torchcodec.decoders import VideoDecoder\n\n        else:\n            raise ImportError(\"To support decoding videos, please install 'torchcodec'.\")\n\n        if token_per_repo_id is None:\n            token_per_repo_id = {}\n\n        if isinstance(value, str):\n            path, bytes_ = value, None\n        else:\n            path, bytes_ = value[\"path\"], value[\"bytes\"]\n\n        if bytes_ is None:\n            if path is None:\n                raise ValueError(f\"A video should have one of 'path' or 'bytes' but both are None in {value}.\")\n            elif is_local_path(path):\n                video = VideoDecoder(\n                    path,\n                    stream_index=self.stream_index,\n                    dimension_order=self.dimension_order,\n                    num_ffmpeg_threads=self.num_ffmpeg_threads,\n                    device=self.device,\n                    seek_mode=self.seek_mode,\n                )\n            else:\n                video = hf_video_reader(\n                    path,\n                    token_per_repo_id=token_per_repo_id,\n                    dimension_order=self.dimension_order,\n                    num_ffmpeg_threads=self.num_ffmpeg_threads,\n                    device=self.device,\n                    seek_mode=self.seek_mode,\n                )\n        else:\n            video = VideoDecoder(\n                bytes_,\n                stream_index=self.stream_index,\n                dimension_order=self.dimension_order,\n                num_ffmpeg_threads=self.num_ffmpeg_threads,\n                device=self.device,\n                seek_mode=self.seek_mode,\n            )\n        video._hf_encoded = {\"path\": path, \"bytes\": bytes_}\n        video.metadata.path = path\n        return video\n\n    def flatten(self) -> Union[\"FeatureType\", dict[str, \"FeatureType\"]]:\n        \"\"\"If in the decodable 
state, return the feature itself, otherwise flatten the feature into a dictionary.\"\"\"\n        from .features import Value\n\n        return (\n            self\n            if self.decode\n            else {\n                \"bytes\": Value(\"binary\"),\n                \"path\": Value(\"string\"),\n            }\n        )\n\n    def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArray]) -> pa.StructArray:\n        \"\"\"Cast an Arrow array to the Video arrow storage type.\n        The Arrow types that can be converted to the Video pyarrow storage type are:\n\n        - `pa.string()` - it must contain the \"path\" data\n        - `pa.binary()` - it must contain the video bytes\n        - `pa.struct({\"bytes\": pa.binary()})`\n        - `pa.struct({\"path\": pa.string()})`\n        - `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`  - order doesn't matter\n        - `pa.list(*)` - it must contain the video array data\n\n        Args:\n            storage (`Union[pa.StringArray, pa.StructArray, pa.ListArray]`):\n                PyArrow array to cast.\n\n        Returns:\n            `pa.StructArray`: Array in the Video arrow storage type, that is\n                `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`.\n        \"\"\"\n        if pa.types.is_string(storage.type):\n            bytes_array = pa.array([None] * len(storage), type=pa.binary())\n            storage = pa.StructArray.from_arrays([bytes_array, storage], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_large_binary(storage.type):\n            storage = array_cast(\n                storage, pa.binary()\n            )  # this can fail in case of big videos, paths should be used instead\n            path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays([storage, path_array], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif 
pa.types.is_binary(storage.type):\n            path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays([storage, path_array], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_struct(storage.type):\n            if storage.type.get_field_index(\"bytes\") >= 0:\n                bytes_array = storage.field(\"bytes\")\n            else:\n                bytes_array = pa.array([None] * len(storage), type=pa.binary())\n            if storage.type.get_field_index(\"path\") >= 0:\n                path_array = storage.field(\"path\")\n            else:\n                path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays([bytes_array, path_array], [\"bytes\", \"path\"], mask=storage.is_null())\n        elif pa.types.is_list(storage.type):\n            bytes_array = pa.array(\n                [encode_np_array(np.array(arr))[\"bytes\"] if arr is not None else None for arr in storage.to_pylist()],\n                type=pa.binary(),\n            )\n            path_array = pa.array([None] * len(storage), type=pa.string())\n            storage = pa.StructArray.from_arrays(\n                [bytes_array, path_array], [\"bytes\", \"path\"], mask=bytes_array.is_null()\n            )\n        return array_cast(storage, self.pa_type)\n\n    def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.StructArray:\n        \"\"\"Embed video files into the Arrow array.\n\n        Args:\n            storage (`pa.StructArray`):\n                PyArrow array to embed.\n\n        Returns:\n            `pa.StructArray`: Array in the Video arrow storage type, that is\n                `pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})`.\n        \"\"\"\n        if token_per_repo_id is None:\n            token_per_repo_id = {}\n\n        @no_op_if_value_is_null\n        def path_to_bytes(path):\n            source_url = 
path.split(\"::\")[-1]\n            pattern = (\n                config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL\n            )\n            source_url_fields = string_to_dict(source_url, pattern)\n            token = token_per_repo_id.get(source_url_fields[\"repo_id\"]) if source_url_fields is not None else None\n            download_config = DownloadConfig(token=token)\n            with xopen(path, \"rb\", download_config=download_config) as f:\n                return f.read()\n\n        bytes_array = pa.array(\n            [\n                (path_to_bytes(x[\"path\"]) if x[\"bytes\"] is None else x[\"bytes\"]) if x is not None else None\n                for x in storage.to_pylist()\n            ],\n            type=pa.binary(),\n        )\n        path_array = pa.array(\n            [os.path.basename(path) if path is not None else None for path in storage.field(\"path\").to_pylist()],\n            type=pa.string(),\n        )\n        storage = pa.StructArray.from_arrays([bytes_array, path_array], [\"bytes\", \"path\"], mask=bytes_array.is_null())\n        return array_cast(storage, self.pa_type)\n\n\ndef video_to_bytes(video: \"VideoDecoder\") -> bytes:\n    \"\"\"Convert a torchcodec Video object to bytes using native compression if possible\"\"\"\n    raise NotImplementedError()\n\n\ndef encode_torchcodec_video(video: \"VideoDecoder\") -> Example:\n    if hasattr(video, \"_hf_encoded\"):\n        return video._hf_encoded\n    else:\n        raise NotImplementedError(\n            \"Encoding a VideoDecoder that doesn't come from datasets.Video.decode() is not implemented\"\n        )\n\n\ndef encode_np_array(array: np.ndarray) -> Example:\n    raise NotImplementedError()\n\n\n# No monkey patch needed!\n# 1. store the encoded video data {\"path\": ..., \"bytes\": ...} in `video._hf_encoded``\n# 2. 
add support for hf:// files\n\n\ndef hf_video_reader(\n    path: str,\n    token_per_repo_id: Optional[dict[str, Union[bool, str]]] = None,\n    stream: str = \"video\",\n    dimension_order: Literal[\"NCHW\", \"NHWC\"] = \"NCHW\",\n    num_ffmpeg_threads: int = 1,\n    device: Optional[Union[str, \"torch.device\"]] = \"cpu\",\n    seek_mode: Literal[\"exact\", \"approximate\"] = \"exact\",\n) -> \"VideoDecoder\":\n    from torchcodec.decoders import VideoDecoder\n\n    # Load the file from HF\n    if token_per_repo_id is None:\n        token_per_repo_id = {}\n    source_url = path.split(\"::\")[-1]\n    pattern = config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL\n    source_url_fields = string_to_dict(source_url, pattern)\n    token = token_per_repo_id.get(source_url_fields[\"repo_id\"]) if source_url_fields is not None else None\n    download_config = DownloadConfig(token=token)\n    f = xopen(path, \"rb\", download_config=download_config)\n\n    # Instantiate the VideoDecoder\n    stream_id = 0 if len(stream.split(\":\")) == 1 else int(stream.split(\":\")[1])\n    vd = VideoDecoder(\n        f,\n        stream_index=stream_id,\n        dimension_order=dimension_order,\n        num_ffmpeg_threads=num_ffmpeg_threads,\n        device=device,\n        seek_mode=seek_mode,\n    )\n    return vd\n"
  },
  {
    "path": "src/datasets/filesystems/__init__.py",
    "content": "import importlib\nimport shutil\nimport warnings\nfrom typing import List\n\nimport fsspec\nimport fsspec.asyn\nfrom fsspec.implementations.local import LocalFileSystem\n\nfrom . import compression\n\n\nCOMPRESSION_FILESYSTEMS: list[compression.BaseCompressedFileFileSystem] = [\n    compression.Bz2FileSystem,\n    compression.GzipFileSystem,\n    compression.Lz4FileSystem,\n    compression.XzFileSystem,\n    compression.ZstdFileSystem,\n]\n\n# Register custom filesystems\nfor fs_class in COMPRESSION_FILESYSTEMS:\n    if fs_class.protocol in fsspec.registry and fsspec.registry[fs_class.protocol] is not fs_class:\n        warnings.warn(f\"A filesystem protocol was already set for {fs_class.protocol} and will be overwritten.\")\n    fsspec.register_implementation(fs_class.protocol, fs_class, clobber=True)\n\n\ndef is_remote_filesystem(fs: fsspec.AbstractFileSystem) -> bool:\n    \"\"\"\n    Checks if `fs` is a remote filesystem.\n\n    Args:\n        fs (`fsspec.spec.AbstractFileSystem`):\n            An abstract super-class for pythonic file-systems, e.g. `fsspec.filesystem(\\'file\\')` or `s3fs.S3FileSystem`.\n    \"\"\"\n    return not isinstance(fs, LocalFileSystem)\n\n\ndef rename(fs: fsspec.AbstractFileSystem, src: str, dst: str):\n    \"\"\"\n    Renames the file `src` in `fs` to `dst`.\n    \"\"\"\n    if not is_remote_filesystem(fs):\n        # LocalFileSystem.mv does copy + rm, it is more efficient to simply move a local directory\n        shutil.move(fs._strip_protocol(src), fs._strip_protocol(dst))\n    else:\n        fs.mv(src, dst, recursive=True)\n"
  },
  {
    "path": "src/datasets/filesystems/compression.py",
    "content": "import os\nfrom functools import partial\nfrom typing import Optional\n\nimport fsspec\nfrom fsspec.archive import AbstractArchiveFileSystem\n\n\nclass BaseCompressedFileFileSystem(AbstractArchiveFileSystem):\n    \"\"\"Read contents of compressed file as a filesystem with one file inside.\"\"\"\n\n    root_marker = \"\"\n    protocol: str = (\n        None  # protocol passed in prefix to the url. ex: \"gzip\", for gzip://file.txt::http://foo.bar/file.txt.gz\n    )\n    compression: str = None  # compression type in fsspec. ex: \"gzip\"\n    extensions: list[str] = None  # extensions of the filename to strip. ex: \".gz\" to get file.txt from file.txt.gz\n\n    def __init__(\n        self, fo: str = \"\", target_protocol: Optional[str] = None, target_options: Optional[dict] = None, **kwargs\n    ):\n        \"\"\"\n        The compressed file system can be instantiated from any compressed file.\n        It reads the contents of compressed file as a filesystem with one file inside, as if it was an archive.\n\n        The single file inside the filesystem is named after the compresssed file,\n        without the compression extension at the end of the filename.\n\n        Args:\n            fo (:obj:``str``): Path to compressed file. 
Will fetch file using ``fsspec.open()``\n            mode (:obj:``str``): Currently, only 'rb' accepted\n            target_protocol(:obj:``str``, optional): To override the FS protocol inferred from a URL.\n            target_options (:obj:``dict``, optional): Kwargs passed when instantiating the target FS.\n        \"\"\"\n        super().__init__(self, **kwargs)\n        self.fo = fo.__fspath__() if hasattr(fo, \"__fspath__\") else fo\n        # always open as \"rb\" since fsspec can then use the TextIOWrapper to make it work for \"r\" mode\n        self._open_with_fsspec = partial(\n            fsspec.open,\n            self.fo,\n            mode=\"rb\",\n            protocol=target_protocol,\n            compression=self.compression,\n            client_kwargs={\n                \"requote_redirect_url\": False,  # see https://github.com/huggingface/datasets/pull/5459\n                \"trust_env\": True,  # Enable reading proxy env variables.\n                **(target_options or {}).pop(\"client_kwargs\", {}),  # To avoid issues if it was already passed.\n            },\n            **(target_options or {}),\n        )\n        self.compressed_name = os.path.basename(self.fo.split(\"::\")[0])\n        self.uncompressed_name = (\n            self.compressed_name[: self.compressed_name.rindex(\".\")]\n            if \".\" in self.compressed_name\n            else self.compressed_name\n        )\n        self.dir_cache = None\n\n    @classmethod\n    def _strip_protocol(cls, path):\n        # compressed file paths are always relative to the archive root\n        return super()._strip_protocol(path).lstrip(\"/\")\n\n    def _get_dirs(self):\n        if self.dir_cache is None:\n            f = {**self._open_with_fsspec().fs.info(self.fo), \"name\": self.uncompressed_name}\n            self.dir_cache = {f[\"name\"]: f}\n\n    def cat(self, path: str):\n        with self._open_with_fsspec().open() as f:\n            return f.read()\n\n    def _open(\n        self,\n 
       path: str,\n        mode: str = \"rb\",\n        block_size=None,\n        autocommit=True,\n        cache_options=None,\n        **kwargs,\n    ):\n        path = self._strip_protocol(path)\n        if mode != \"rb\":\n            raise ValueError(f\"Tried to read with mode {mode} on file {self.fo} opened with mode 'rb'\")\n        return self._open_with_fsspec().open()\n\n\nclass Bz2FileSystem(BaseCompressedFileFileSystem):\n    \"\"\"Read contents of BZ2 file as a filesystem with one file inside.\"\"\"\n\n    protocol = \"bz2\"\n    compression = \"bz2\"\n    extensions = [\".bz2\"]\n\n\nclass GzipFileSystem(BaseCompressedFileFileSystem):\n    \"\"\"Read contents of GZIP file as a filesystem with one file inside.\"\"\"\n\n    protocol = \"gzip\"\n    compression = \"gzip\"\n    extensions = [\".gz\", \".gzip\"]\n\n\nclass Lz4FileSystem(BaseCompressedFileFileSystem):\n    \"\"\"Read contents of LZ4 file as a filesystem with one file inside.\"\"\"\n\n    protocol = \"lz4\"\n    compression = \"lz4\"\n    extensions = [\".lz4\"]\n\n\nclass XzFileSystem(BaseCompressedFileFileSystem):\n    \"\"\"Read contents of .xz (LZMA) file as a filesystem with one file inside.\"\"\"\n\n    protocol = \"xz\"\n    compression = \"xz\"\n    extensions = [\".xz\"]\n\n\nclass ZstdFileSystem(BaseCompressedFileFileSystem):\n    \"\"\"\n    Read contents of .zstd file as a filesystem with one file inside.\n    \"\"\"\n\n    protocol = \"zstd\"\n    compression = \"zstd\"\n    extensions = [\".zst\", \".zstd\"]\n"
  },
  {
    "path": "src/datasets/fingerprint.py",
    "content": "import inspect\nimport os\nimport random\nimport shutil\nimport tempfile\nimport weakref\nfrom functools import wraps\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, Callable, Optional, Union\n\nimport numpy as np\nimport xxhash\n\nfrom . import config\nfrom .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH\nfrom .utils._dill import dumps\nfrom .utils.logging import get_logger\n\n\nif TYPE_CHECKING:\n    from .arrow_dataset import Dataset\n\n\nlogger = get_logger(__name__)\n\n\n# Fingerprinting allows to have one deterministic fingerprint per dataset state.\n# A dataset fingerprint is updated after each transform.\n# Re-running the same transforms on a dataset in a different session results in the same fingerprint.\n# This is possible thanks to a custom hashing function that works with most python objects.\n\n# Fingerprinting is the main mechanism that enables caching.\n# The caching mechanism allows to reload an existing cache file if it's already been computed.\n\n\n#################\n# Caching\n#################\n\n_CACHING_ENABLED = True\n_TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional[\"_TempCacheDir\"] = None\n_DATASETS_WITH_TABLE_IN_TEMP_DIR: Optional[weakref.WeakSet] = None\n\n\nclass _TempCacheDir:\n    \"\"\"\n    A temporary directory for storing cached Arrow files with a cleanup that frees references to the Arrow files\n    before deleting the directory itself to avoid permission errors on Windows.\n    \"\"\"\n\n    def __init__(self):\n        # Check if TMPDIR is set and handle the case where it doesn't exist\n        tmpdir = os.environ.get(\"TMPDIR\") or os.environ.get(\"TEMP\") or os.environ.get(\"TMP\")\n        # Normalize the path to handle any path resolution issues\n        if tmpdir:\n            tmpdir = os.path.normpath(tmpdir)\n            if not os.path.exists(tmpdir):\n                # Auto-create the directory if it doesn't exist\n                # This prevents tempfile from silently falling back to /tmp\n  
              try:\n                    os.makedirs(tmpdir, exist_ok=True)\n                    logger.info(f\"Created TMPDIR directory: {tmpdir}\")\n                except OSError as e:\n                    raise OSError(\n                        f\"TMPDIR is set to '{tmpdir}' but the directory does not exist and could not be created: {e}. \"\n                        \"Please create it manually or unset TMPDIR to fall back to the default temporary directory.\"\n                    ) from e\n            # If tmpdir exists, verify it's actually a directory and writable\n            elif not os.path.isdir(tmpdir):\n                raise OSError(\n                    f\"TMPDIR is set to '{tmpdir}' but it is not a directory. \"\n                    \"Please point TMPDIR to a writable directory or unset it to fall back to the default temporary directory.\"\n                )\n\n        # Explicitly pass the directory to mkdtemp to ensure TMPDIR is respected\n        # This works even if tempfile.gettempdir() was already called and cached\n        # Pass dir=None if tmpdir is None to use default temp directory\n        self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX, dir=tmpdir)\n        self._finalizer = weakref.finalize(self, self._cleanup)\n\n    def _cleanup(self):\n        for dset in get_datasets_with_cache_file_in_temp_dir():\n            dset.__del__()\n        if os.path.exists(self.name):\n            try:\n                shutil.rmtree(self.name)\n            except Exception as e:\n                raise OSError(\n                    f\"An error occurred while trying to delete temporary cache directory {self.name}. 
Please delete it manually.\"\n                ) from e\n\n    def cleanup(self):\n        if self._finalizer.detach():\n            self._cleanup()\n\n\ndef maybe_register_dataset_for_temp_dir_deletion(dataset):\n    \"\"\"\n    This function registers the datasets that have cache files in _TEMP_DIR_FOR_TEMP_CACHE_FILES in order\n    to properly delete them before deleting the temporary directory.\n    The temporary directory _TEMP_DIR_FOR_TEMP_CACHE_FILES is used when caching is disabled.\n    \"\"\"\n    if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None:\n        return\n\n    global _DATASETS_WITH_TABLE_IN_TEMP_DIR\n    if _DATASETS_WITH_TABLE_IN_TEMP_DIR is None:\n        _DATASETS_WITH_TABLE_IN_TEMP_DIR = weakref.WeakSet()\n    if any(\n        Path(_TEMP_DIR_FOR_TEMP_CACHE_FILES.name) in Path(cache_file[\"filename\"]).parents\n        for cache_file in dataset.cache_files\n    ):\n        _DATASETS_WITH_TABLE_IN_TEMP_DIR.add(dataset)\n\n\ndef get_datasets_with_cache_file_in_temp_dir():\n    return list(_DATASETS_WITH_TABLE_IN_TEMP_DIR) if _DATASETS_WITH_TABLE_IN_TEMP_DIR is not None else []\n\n\ndef enable_caching():\n    \"\"\"\n    When applying transforms on a dataset, the data are stored in cache files.\n    The caching mechanism allows to reload an existing cache file if it's already been computed.\n\n    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated\n    after each transform.\n\n    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.\n    More precisely, if the caching is disabled:\n    - cache files are always recreated\n    - cache files are written to a temporary directory that is deleted when session closes\n    - cache files are named using a random hash instead of the dataset fingerprint\n    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes\n    - caching doesn't affect 
[`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use\n    the `download_mode` parameter in [`~datasets.load_dataset`].\n    \"\"\"\n    global _CACHING_ENABLED\n    _CACHING_ENABLED = True\n\n\ndef disable_caching():\n    \"\"\"\n    When applying transforms on a dataset, the data are stored in cache files.\n    The caching mechanism allows to reload an existing cache file if it's already been computed.\n\n    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated\n    after each transform.\n\n    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.\n    More precisely, if the caching is disabled:\n    - cache files are always recreated\n    - cache files are written to a temporary directory that is deleted when session closes\n    - cache files are named using a random hash instead of the dataset fingerprint\n    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes\n    - caching doesn't affect [`~datasets.load_dataset`]. 
If you want to regenerate a dataset from scratch you should use\n    the `download_mode` parameter in [`~datasets.load_dataset`].\n    \"\"\"\n    global _CACHING_ENABLED\n    _CACHING_ENABLED = False\n\n\ndef is_caching_enabled() -> bool:\n    \"\"\"\n    When applying transforms on a dataset, the data are stored in cache files.\n    The caching mechanism allows to reload an existing cache file if it's already been computed.\n\n    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated\n    after each transform.\n\n    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.\n    More precisely, if the caching is disabled:\n    - cache files are always recreated\n    - cache files are written to a temporary directory that is deleted when session closes\n    - cache files are named using a random hash instead of the dataset fingerprint\n    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset or it will be deleted when session closes\n    - caching doesn't affect [`~datasets.load_dataset`]. 
If you want to regenerate a dataset from scratch you should use\n    the `download_mode` parameter in [`~datasets.load_dataset`].\n    \"\"\"\n    global _CACHING_ENABLED\n    return bool(_CACHING_ENABLED)\n\n\ndef get_temporary_cache_files_directory() -> str:\n    \"\"\"Return a directory that is deleted when session closes.\"\"\"\n    global _TEMP_DIR_FOR_TEMP_CACHE_FILES\n    if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None:\n        _TEMP_DIR_FOR_TEMP_CACHE_FILES = _TempCacheDir()\n    return _TEMP_DIR_FOR_TEMP_CACHE_FILES.name\n\n\n#################\n# Hashing\n#################\n\n\nclass Hasher:\n    \"\"\"Hasher that accepts python objects as inputs.\"\"\"\n\n    dispatch: dict = {}\n\n    def __init__(self):\n        self.m = xxhash.xxh64()\n\n    @classmethod\n    def hash_bytes(cls, value: Union[bytes, list[bytes]]) -> str:\n        value = [value] if isinstance(value, bytes) else value\n        m = xxhash.xxh64()\n        for x in value:\n            m.update(x)\n        return m.hexdigest()\n\n    @classmethod\n    def hash(cls, value: Any) -> str:\n        return cls.hash_bytes(dumps(value))\n\n    def update(self, value: Any) -> None:\n        header_for_update = f\"=={type(value)}==\"\n        value_for_update = self.hash(value)\n        self.m.update(header_for_update.encode(\"utf8\"))\n        self.m.update(value_for_update.encode(\"utf-8\"))\n\n    def hexdigest(self) -> str:\n        return self.m.hexdigest()\n\n\n#################\n# Fingerprinting\n#################\n\nfingerprint_rng = random.Random()\n# we show a warning only once when fingerprinting fails to avoid spam\nfingerprint_warnings: dict[str, bool] = {}\n\n\ndef generate_fingerprint(dataset: \"Dataset\") -> str:\n    state = dataset.__dict__\n    hasher = Hasher()\n    for key in sorted(state):\n        if key == \"_fingerprint\":\n            continue\n        hasher.update(key)\n        hasher.update(state[key])\n    # hash data files last modification timestamps as well\n    for 
cache_file in dataset.cache_files:\n        hasher.update(os.path.getmtime(cache_file[\"filename\"]))\n    return hasher.hexdigest()\n\n\ndef generate_random_fingerprint(nbits: int = 64) -> str:\n    return f\"{fingerprint_rng.getrandbits(nbits):0{nbits // 4}x}\"\n\n\ndef update_fingerprint(fingerprint, transform, transform_args):\n    global fingerprint_warnings\n    hasher = Hasher()\n    hasher.update(fingerprint)\n    try:\n        hasher.update(transform)\n    except:  # noqa various errors might raise here from pickle or dill\n        if _CACHING_ENABLED:\n            if not fingerprint_warnings.get(\"update_fingerprint_transform_hash_failed\", False):\n                logger.warning(\n                    f\"Transform {transform} couldn't be hashed properly, a random hash was used instead. \"\n                    \"Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. \"\n                    \"If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. \"\n                    \"This warning is only shown once. Subsequent hashing failures won't be shown.\"\n                )\n                fingerprint_warnings[\"update_fingerprint_transform_hash_failed\"] = True\n            else:\n                logger.info(f\"Transform {transform} couldn't be hashed properly, a random hash was used instead.\")\n        else:\n            logger.info(\n                f\"Transform {transform} couldn't be hashed properly, a random hash was used instead. 
This doesn't affect caching since it's disabled.\"\n            )\n\n        return generate_random_fingerprint()\n    for key in sorted(transform_args):\n        hasher.update(key)\n        try:\n            hasher.update(transform_args[key])\n        except:  # noqa various errors might raise here from pickle or dill\n            if _CACHING_ENABLED:\n                if not fingerprint_warnings.get(\"update_fingerprint_transform_hash_failed\", False):\n                    logger.warning(\n                        f\"Parameter '{key}'={transform_args[key]} of the transform {transform} couldn't be hashed properly, a random hash was used instead. \"\n                        \"Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. \"\n                        \"If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. \"\n                        \"This warning is only shown once. Subsequent hashing failures won't be shown.\"\n                    )\n                    fingerprint_warnings[\"update_fingerprint_transform_hash_failed\"] = True\n                else:\n                    logger.info(\n                        f\"Parameter '{key}'={transform_args[key]} of the transform {transform} couldn't be hashed properly, a random hash was used instead.\"\n                    )\n            else:\n                logger.info(\n                    f\"Parameter '{key}'={transform_args[key]} of the transform {transform} couldn't be hashed properly, a random hash was used instead. 
This doesn't affect caching since it's disabled.\"\n                )\n            return generate_random_fingerprint()\n    return hasher.hexdigest()\n\n\ndef validate_fingerprint(fingerprint: str, max_length=64):\n    \"\"\"\n    Make sure the fingerprint is a non-empty string that is not longer that max_length=64 by default,\n    so that the fingerprint can be used to name cache files without issues.\n    \"\"\"\n    if not isinstance(fingerprint, str) or not fingerprint:\n        raise ValueError(f\"Invalid fingerprint '{fingerprint}': it should be a non-empty string.\")\n    for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH:\n        if invalid_char in fingerprint:\n            raise ValueError(\n                f\"Invalid fingerprint. Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{fingerprint}'. \"\n                f\"They could create issues when creating cache files.\"\n            )\n    if len(fingerprint) > max_length:\n        raise ValueError(\n            f\"Invalid fingerprint. 
Maximum lenth is {max_length} but '{fingerprint}' has length {len(fingerprint)}.\"\n            \"It could create issues when creating cache files.\"\n        )\n\n\ndef format_transform_for_fingerprint(func: Callable, version: Optional[str] = None) -> str:\n    \"\"\"\n    Format a transform to the format that will be used to update the fingerprint.\n    \"\"\"\n    transform = f\"{func.__module__}.{func.__qualname__}\"\n    if version is not None:\n        transform += f\"@{version}\"\n    return transform\n\n\ndef format_kwargs_for_fingerprint(\n    func: Callable,\n    args: tuple,\n    kwargs: dict[str, Any],\n    use_kwargs: Optional[list[str]] = None,\n    ignore_kwargs: Optional[list[str]] = None,\n    randomized_function: bool = False,\n) -> dict[str, Any]:\n    \"\"\"\n    Format the kwargs of a transform to the format that will be used to update the fingerprint.\n    \"\"\"\n    kwargs_for_fingerprint = kwargs.copy()\n    if args:\n        params = [p.name for p in inspect.signature(func).parameters.values() if p != p.VAR_KEYWORD]\n        args = args[1:]  # assume the first argument is the dataset\n        params = params[1:]\n        kwargs_for_fingerprint.update(zip(params, args))\n    else:\n        del kwargs_for_fingerprint[\n            next(iter(inspect.signature(func).parameters))\n        ]  # assume the first key is the dataset\n\n    # keep the right kwargs to be hashed to generate the fingerprint\n\n    if use_kwargs:\n        kwargs_for_fingerprint = {k: v for k, v in kwargs_for_fingerprint.items() if k in use_kwargs}\n    if ignore_kwargs:\n        kwargs_for_fingerprint = {k: v for k, v in kwargs_for_fingerprint.items() if k not in ignore_kwargs}\n    if randomized_function:  # randomized functions have `seed` and `generator` parameters\n        if kwargs_for_fingerprint.get(\"seed\") is None and kwargs_for_fingerprint.get(\"generator\") is None:\n            _, seed, pos, *_ = np.random.get_state()\n            seed = seed[pos] if pos < 
624 else seed[0]\n            kwargs_for_fingerprint[\"generator\"] = np.random.default_rng(seed)\n\n    # remove kwargs that are the default values\n\n    default_values = {\n        p.name: p.default for p in inspect.signature(func).parameters.values() if p.default != inspect._empty\n    }\n    for default_varname, default_value in default_values.items():\n        if default_varname in kwargs_for_fingerprint and kwargs_for_fingerprint[default_varname] == default_value:\n            kwargs_for_fingerprint.pop(default_varname)\n    return kwargs_for_fingerprint\n\n\ndef fingerprint_transform(\n    inplace: bool,\n    use_kwargs: Optional[list[str]] = None,\n    ignore_kwargs: Optional[list[str]] = None,\n    fingerprint_names: Optional[list[str]] = None,\n    randomized_function: bool = False,\n    version: Optional[str] = None,\n):\n    \"\"\"\n    Wrapper for dataset transforms to update the dataset fingerprint using ``update_fingerprint``\n    Args:\n        inplace (:obj:`bool`):  If inplace is True, the fingerprint of the dataset is updated inplace.\n            Otherwise, a parameter \"new_fingerprint\" is passed to the wrapped method that should take care of\n            setting the fingerprint of the returned Dataset.\n        use_kwargs (:obj:`List[str]`, optional): optional white list of argument names to take into account\n            to update the fingerprint to the wrapped method that should take care of\n            setting the fingerprint of the returned Dataset. By default all the arguments are used.\n        ignore_kwargs (:obj:`List[str]`, optional): optional black list of argument names to take into account\n            to update the fingerprint. 
Note that ignore_kwargs prevails on use_kwargs.\n        fingerprint_names (:obj:`List[str]`, optional, defaults to [\"new_fingerprint\"]):\n            If the dataset transform is not inplace and returns a DatasetDict, then it can require\n            several fingerprints (one per dataset in the DatasetDict). By specifying fingerprint_names,\n            one fingerprint named after each element of fingerprint_names is going to be passed.\n        randomized_function (:obj:`bool`, defaults to False): If the dataset transform is random and has\n            optional parameters \"seed\" and \"generator\", then you can set randomized_function to True.\n            This way, even if users set \"seed\" and \"generator\" to None, then the fingerprint is\n            going to be randomly generated depending on numpy's current state. In this case, the\n            generator is set to np.random.default_rng(np.random.get_state()[1][0]).\n        version (:obj:`str`, optional): version of the transform. The version is taken into account when\n            computing the fingerprint. If a dataset transform changes (or at least if the output data\n            that are cached changes), then one should increase the version. 
If the version stays the\n            same, then old cached data could be reused that are not compatible with the new transform.\n            It should be in the format \"MAJOR.MINOR.PATCH\".\n    \"\"\"\n\n    if use_kwargs is not None and not isinstance(use_kwargs, list):\n        raise ValueError(f\"use_kwargs is supposed to be a list, not {type(use_kwargs)}\")\n\n    if ignore_kwargs is not None and not isinstance(ignore_kwargs, list):\n        raise ValueError(f\"ignore_kwargs is supposed to be a list, not {type(use_kwargs)}\")\n\n    if inplace and fingerprint_names:\n        raise ValueError(\"fingerprint_names are only used when inplace is False\")\n\n    fingerprint_names = fingerprint_names if fingerprint_names is not None else [\"new_fingerprint\"]\n\n    def _fingerprint(func):\n        if not inplace and not all(name in func.__code__.co_varnames for name in fingerprint_names):\n            raise ValueError(f\"function {func} is missing parameters {fingerprint_names} in signature\")\n\n        if randomized_function:  # randomized function have seed and generator parameters\n            if \"seed\" not in func.__code__.co_varnames:\n                raise ValueError(f\"'seed' must be in {func}'s signature\")\n            if \"generator\" not in func.__code__.co_varnames:\n                raise ValueError(f\"'generator' must be in {func}'s signature\")\n        # this call has to be outside the wrapper or since __qualname__ changes in multiprocessing\n        transform = format_transform_for_fingerprint(func, version=version)\n\n        @wraps(func)\n        def wrapper(*args, **kwargs):\n            kwargs_for_fingerprint = format_kwargs_for_fingerprint(\n                func,\n                args,\n                kwargs,\n                use_kwargs=use_kwargs,\n                ignore_kwargs=ignore_kwargs,\n                randomized_function=randomized_function,\n            )\n\n            if args:\n                dataset: Dataset = args[0]\n       
         args = args[1:]\n            else:\n                dataset: Dataset = kwargs.pop(next(iter(inspect.signature(func).parameters)))\n\n            # compute new_fingerprint and add it to the args of not in-place transforms\n            if inplace:\n                new_fingerprint = update_fingerprint(dataset._fingerprint, transform, kwargs_for_fingerprint)\n            else:\n                for fingerprint_name in fingerprint_names:  # transforms like `train_test_split` have several hashes\n                    if kwargs.get(fingerprint_name) is None:\n                        kwargs_for_fingerprint[\"fingerprint_name\"] = fingerprint_name\n                        kwargs[fingerprint_name] = update_fingerprint(\n                            dataset._fingerprint, transform, kwargs_for_fingerprint\n                        )\n                    else:\n                        validate_fingerprint(kwargs[fingerprint_name])\n\n            # Call actual function\n\n            out = func(dataset, *args, **kwargs)\n\n            # Update fingerprint of in-place transforms + update in-place history of transforms\n\n            if inplace:  # update after calling func so that the fingerprint doesn't change if the function fails\n                dataset._fingerprint = new_fingerprint\n\n            return out\n\n        wrapper._decorator_name_ = \"fingerprint\"\n        return wrapper\n\n    return _fingerprint\n"
  },
  {
    "path": "src/datasets/formatting/__init__.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Dict, List, Optional, Type\n\nfrom .. import config\nfrom ..utils import logging\nfrom .formatting import (\n    ArrowFormatter,\n    CustomFormatter,\n    Formatter,\n    PandasFormatter,\n    PythonFormatter,\n    TableFormatter,\n    TensorFormatter,\n    format_table,\n    query_table,\n)\nfrom .np_formatter import NumpyFormatter\n\n\nlogger = logging.get_logger(__name__)\n\n_FORMAT_TYPES: dict[Optional[str], type[Formatter]] = {}\n_FORMAT_TYPES_ALIASES: dict[Optional[str], str] = {}\n_FORMAT_TYPES_ALIASES_UNAVAILABLE: dict[Optional[str], Exception] = {}\n\n\ndef _register_formatter(\n    formatter_cls: type,\n    format_type: Optional[str],\n    aliases: Optional[list[str]] = None,\n):\n    \"\"\"\n    Register a Formatter object using a name and optional aliases.\n    This function must be used on a Formatter class.\n    \"\"\"\n    aliases = aliases if aliases is not None else []\n    if format_type in _FORMAT_TYPES:\n        logger.warning(\n            f\"Overwriting format type '{format_type}' ({_FORMAT_TYPES[format_type].__name__} -> {formatter_cls.__name__})\"\n        )\n    _FORMAT_TYPES[format_type] = formatter_cls\n    for alias in set(aliases + [format_type]):\n        if alias in _FORMAT_TYPES_ALIASES:\n            logger.warning(\n                f\"Overwriting format type 
alias '{alias}' ({_FORMAT_TYPES_ALIASES[alias]} -> {format_type})\"\n            )\n        _FORMAT_TYPES_ALIASES[alias] = format_type\n\n\ndef _register_unavailable_formatter(\n    unavailable_error: Exception, format_type: Optional[str], aliases: Optional[list[str]] = None\n):\n    \"\"\"\n    Register an unavailable Formatter object using a name and optional aliases.\n    This function must be used on an Exception object that is raised when trying to get the unavailable formatter.\n    \"\"\"\n    aliases = aliases if aliases is not None else []\n    for alias in set(aliases + [format_type]):\n        _FORMAT_TYPES_ALIASES_UNAVAILABLE[alias] = unavailable_error\n\n\n# Here we define all the available formatting functions that can be used by `Dataset.set_format`\n_register_formatter(PythonFormatter, None, aliases=[\"python\"])\n_register_formatter(ArrowFormatter, \"arrow\", aliases=[\"pa\", \"pyarrow\"])\n_register_formatter(NumpyFormatter, \"numpy\", aliases=[\"np\"])\n_register_formatter(PandasFormatter, \"pandas\", aliases=[\"pd\"])\n_register_formatter(CustomFormatter, \"custom\")\n\nif config.POLARS_AVAILABLE:\n    from .polars_formatter import PolarsFormatter\n\n    _register_formatter(PolarsFormatter, \"polars\", aliases=[\"pl\"])\nelse:\n    _polars_error = ValueError(\"Polars needs to be installed to be able to return Polars dataframes.\")\n    _register_unavailable_formatter(_polars_error, \"polars\", aliases=[\"pl\"])\n\nif config.TORCH_AVAILABLE:\n    from .torch_formatter import TorchFormatter\n\n    _register_formatter(TorchFormatter, \"torch\", aliases=[\"pt\", \"pytorch\"])\nelse:\n    _torch_error = ValueError(\"PyTorch needs to be installed to be able to return PyTorch tensors.\")\n    _register_unavailable_formatter(_torch_error, \"torch\", aliases=[\"pt\", \"pytorch\"])\n\nif config.TF_AVAILABLE:\n    from .tf_formatter import TFFormatter\n\n    _register_formatter(TFFormatter, \"tensorflow\", aliases=[\"tf\"])\nelse:\n    _tf_error = 
ValueError(\"Tensorflow needs to be installed to be able to return Tensorflow tensors.\")\n    _register_unavailable_formatter(_tf_error, \"tensorflow\", aliases=[\"tf\"])\n\nif config.JAX_AVAILABLE:\n    from .jax_formatter import JaxFormatter\n\n    _register_formatter(JaxFormatter, \"jax\", aliases=[])\nelse:\n    _jax_error = ValueError(\"JAX needs to be installed to be able to return JAX arrays.\")\n    _register_unavailable_formatter(_jax_error, \"jax\", aliases=[])\n\n\ndef get_format_type_from_alias(format_type: Optional[str]) -> Optional[str]:\n    \"\"\"If the given format type is a known alias, then return its main type name. Otherwise return the type with no change.\"\"\"\n    if format_type in _FORMAT_TYPES_ALIASES:\n        return _FORMAT_TYPES_ALIASES[format_type]\n    else:\n        return format_type\n\n\ndef get_formatter(format_type: Optional[str], **format_kwargs) -> Formatter:\n    \"\"\"\n    Factory function to get a Formatter given its type name and keyword arguments.\n    A formatter is an object that extracts and formats data from pyarrow table.\n    It defines the formatting for rows, columns and batches.\n    If the formatter for a given type name doesn't exist or is not available, an error is raised.\n    \"\"\"\n    format_type = get_format_type_from_alias(format_type)\n    if format_type in _FORMAT_TYPES:\n        return _FORMAT_TYPES[format_type](**format_kwargs)\n    if format_type in _FORMAT_TYPES_ALIASES_UNAVAILABLE:\n        raise _FORMAT_TYPES_ALIASES_UNAVAILABLE[format_type]\n    else:\n        raise ValueError(f\"Format type should be one of {list(_FORMAT_TYPES.keys())}, but got '{format_type}'\")\n"
  },
  {
    "path": "src/datasets/formatting/formatting.py",
    "content": "# Copyright 2020 The HuggingFace Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numbers\nimport operator\nfrom collections.abc import Iterable, Mapping, MutableMapping\nfrom functools import partial\n\n# Lint as: python3\nfrom typing import Any, Callable, Generic, Optional, TypeVar, Union\n\nimport numpy as np\nimport pandas as pd\nimport pyarrow as pa\n\nfrom ..features import Features\nfrom ..features.features import _ArrayXDExtensionType, _is_zero_copy_only, decode_nested_example, pandas_types_mapper\nfrom ..table import Table\nfrom ..utils.py_utils import no_op_if_value_is_null\n\n\nT = TypeVar(\"T\")\n\nRowFormat = TypeVar(\"RowFormat\")\nColumnFormat = TypeVar(\"ColumnFormat\")\nBatchFormat = TypeVar(\"BatchFormat\")\n\n\ndef _is_range_contiguous(key: range) -> bool:\n    return key.step == 1 and key.stop >= key.start\n\n\ndef _raise_bad_key_type(key: Any):\n    raise TypeError(\n        f\"Wrong key type: '{key}' of type '{type(key)}'. 
Expected one of int, slice, range, str or Iterable.\"\n    )\n\n\ndef _query_table_with_indices_mapping(\n    table: Table, key: Union[int, slice, range, str, Iterable], indices: Table\n) -> pa.Table:\n    \"\"\"\n    Query a pyarrow Table to extract the subtable that correspond to the given key.\n    The :obj:`indices` parameter corresponds to the indices mapping in case we cant to take into\n    account a shuffling or an indices selection for example.\n    The indices table must contain one column named \"indices\" of type uint64.\n    \"\"\"\n    if isinstance(key, int):\n        key = indices.fast_slice(key % indices.num_rows, 1).column(0)[0].as_py()\n        return _query_table(table, key)\n    if isinstance(key, slice):\n        key = range(*key.indices(indices.num_rows))\n    if isinstance(key, range):\n        if _is_range_contiguous(key) and key.start >= 0:\n            return _query_table(\n                table, [i.as_py() for i in indices.fast_slice(key.start, key.stop - key.start).column(0)]\n            )\n        else:\n            pass  # treat as an iterable\n    if isinstance(key, str):\n        table = table.select([key])\n        return _query_table(table, indices.column(0).to_pylist())\n    if isinstance(key, Iterable):\n        return _query_table(table, [indices.fast_slice(i, 1).column(0)[0].as_py() for i in key])\n\n    _raise_bad_key_type(key)\n\n\ndef _query_table(table: Table, key: Union[int, slice, range, str, Iterable]) -> pa.Table:\n    \"\"\"\n    Query a pyarrow Table to extract the subtable that correspond to the given key.\n    \"\"\"\n    if isinstance(key, int):\n        return table.fast_slice(key % table.num_rows, 1)\n    if isinstance(key, slice):\n        key = range(*key.indices(table.num_rows))\n    if isinstance(key, range):\n        if _is_range_contiguous(key) and key.start >= 0:\n            return table.fast_slice(key.start, key.stop - key.start)\n        else:\n            pass  # treat as an iterable\n    if 
isinstance(key, str):\n        return table.table.drop([column for column in table.column_names if column != key])\n    if isinstance(key, Iterable):\n        key = np.fromiter(key, np.int64)\n        if len(key) == 0:\n            return table.table.slice(0, 0)\n        # don't use pyarrow.Table.take even for pyarrow >=1.0 (see https://issues.apache.org/jira/browse/ARROW-9773)\n        return table.fast_gather(key % table.num_rows)\n\n    _raise_bad_key_type(key)\n\n\ndef _is_array_with_nulls(pa_array: pa.Array) -> bool:\n    return pa_array.null_count > 0\n\n\nclass BaseArrowExtractor(Generic[RowFormat, ColumnFormat, BatchFormat]):\n    \"\"\"\n    Arrow extractor are used to extract data from pyarrow tables.\n    It makes it possible to extract rows, columns and batches.\n    These three extractions types have to be implemented.\n    \"\"\"\n\n    def extract_row(self, pa_table: pa.Table) -> RowFormat:\n        raise NotImplementedError\n\n    def extract_column(self, pa_table: pa.Table) -> ColumnFormat:\n        raise NotImplementedError\n\n    def extract_batch(self, pa_table: pa.Table) -> BatchFormat:\n        raise NotImplementedError\n\n\ndef _unnest(py_dict: dict[str, list[T]]) -> dict[str, T]:\n    \"\"\"Return the first element of a batch (dict) as a row (dict)\"\"\"\n    return {key: array[0] for key, array in py_dict.items()}\n\n\nclass SimpleArrowExtractor(BaseArrowExtractor[pa.Table, pa.Array, pa.Table]):\n    def extract_row(self, pa_table: pa.Table) -> pa.Table:\n        return pa_table\n\n    def extract_column(self, pa_table: pa.Table) -> pa.Array:\n        return pa_table.column(0)\n\n    def extract_batch(self, pa_table: pa.Table) -> pa.Table:\n        return pa_table\n\n\nclass PythonArrowExtractor(BaseArrowExtractor[dict, list, dict]):\n    def extract_row(self, pa_table: pa.Table) -> dict:\n        return _unnest(pa_table.to_pydict())\n\n    def extract_column(self, pa_table: pa.Table) -> list:\n        return 
pa_table.column(0).to_pylist()\n\n    def extract_batch(self, pa_table: pa.Table) -> dict:\n        return pa_table.to_pydict()\n\n\nclass NumpyArrowExtractor(BaseArrowExtractor[dict, np.ndarray, dict]):\n    def __init__(self, **np_array_kwargs):\n        self.np_array_kwargs = np_array_kwargs\n\n    def extract_row(self, pa_table: pa.Table) -> dict:\n        return _unnest(self.extract_batch(pa_table))\n\n    def extract_column(self, pa_table: pa.Table) -> np.ndarray:\n        return self._arrow_array_to_numpy(pa_table[pa_table.column_names[0]])\n\n    def extract_batch(self, pa_table: pa.Table) -> dict:\n        return {col: self._arrow_array_to_numpy(pa_table[col]) for col in pa_table.column_names}\n\n    def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:\n        if isinstance(pa_array, pa.ChunkedArray):\n            if isinstance(pa_array.type, _ArrayXDExtensionType):\n                # don't call to_pylist() to preserve dtype of the fixed-size array\n                zero_copy_only = _is_zero_copy_only(pa_array.type.storage_dtype, unnest=True)\n                array: list = [\n                    row for chunk in pa_array.chunks for row in chunk.to_numpy(zero_copy_only=zero_copy_only)\n                ]\n            else:\n                zero_copy_only = _is_zero_copy_only(pa_array.type) and all(\n                    not _is_array_with_nulls(chunk) for chunk in pa_array.chunks\n                )\n                array: list = [\n                    row for chunk in pa_array.chunks for row in chunk.to_numpy(zero_copy_only=zero_copy_only)\n                ]\n        else:\n            if isinstance(pa_array.type, _ArrayXDExtensionType):\n                # don't call to_pylist() to preserve dtype of the fixed-size array\n                zero_copy_only = _is_zero_copy_only(pa_array.type.storage_dtype, unnest=True)\n                array: list = pa_array.to_numpy(zero_copy_only=zero_copy_only)\n            else:\n                zero_copy_only = 
_is_zero_copy_only(pa_array.type) and not _is_array_with_nulls(pa_array)\n                array: list = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()\n\n        if len(array) > 0:\n            if any(\n                (isinstance(x, np.ndarray) and (x.dtype == object or x.shape != array[0].shape))\n                or (isinstance(x, float) and np.isnan(x))\n                for x in array\n            ):\n                if np.lib.NumpyVersion(np.__version__) >= \"2.0.0b1\":\n                    return np.asarray(array, dtype=object)\n                return np.array(array, copy=False, dtype=object)\n        if np.lib.NumpyVersion(np.__version__) >= \"2.0.0b1\":\n            return np.asarray(array)\n        else:\n            return np.array(array, copy=False)\n\n\nclass PandasArrowExtractor(BaseArrowExtractor[pd.DataFrame, pd.Series, pd.DataFrame]):\n    def extract_row(self, pa_table: pa.Table) -> pd.DataFrame:\n        return pa_table.slice(length=1).to_pandas(types_mapper=pandas_types_mapper)\n\n    def extract_column(self, pa_table: pa.Table) -> pd.Series:\n        return pa_table.select([0]).to_pandas(types_mapper=pandas_types_mapper)[pa_table.column_names[0]]\n\n    def extract_batch(self, pa_table: pa.Table) -> pd.DataFrame:\n        return pa_table.to_pandas(types_mapper=pandas_types_mapper)\n\n\nclass PythonFeaturesDecoder:\n    def __init__(\n        self, features: Optional[Features], token_per_repo_id: Optional[dict[str, Union[str, bool, None]]] = None\n    ):\n        self.features = features\n        self.token_per_repo_id = token_per_repo_id\n\n    def decode_row(self, row: dict) -> dict:\n        return self.features.decode_example(row, token_per_repo_id=self.token_per_repo_id) if self.features else row\n\n    def decode_column(self, column: list, column_name: str) -> list:\n        return (\n            self.features.decode_column(column, column_name, token_per_repo_id=self.token_per_repo_id)\n            if self.features\n            
else column\n        )\n\n    def decode_batch(self, batch: dict) -> dict:\n        return self.features.decode_batch(batch, token_per_repo_id=self.token_per_repo_id) if self.features else batch\n\n\nclass PandasFeaturesDecoder:\n    def __init__(self, features: Optional[Features]):\n        self.features = features\n\n    def decode_row(self, row: pd.DataFrame) -> pd.DataFrame:\n        decode = (\n            {\n                column_name: no_op_if_value_is_null(partial(decode_nested_example, feature))\n                for column_name, feature in self.features.items()\n                if self.features._column_requires_decoding[column_name]\n            }\n            if self.features\n            else {}\n        )\n        if decode:\n            row[list(decode.keys())] = row.transform(decode)\n        return row\n\n    def decode_column(self, column: pd.Series, column_name: str) -> pd.Series:\n        decode = (\n            no_op_if_value_is_null(partial(decode_nested_example, self.features[column_name]))\n            if self.features and column_name in self.features and self.features._column_requires_decoding[column_name]\n            else None\n        )\n        if decode:\n            column = column.transform(decode)\n        return column\n\n    def decode_batch(self, batch: pd.DataFrame) -> pd.DataFrame:\n        return self.decode_row(batch)\n\n\nclass LazyDict(MutableMapping):\n    \"\"\"A dictionary backed by Arrow data. 
The values are formatted on-the-fly when accessing the dictionary.\"\"\"\n\n    def __init__(self, pa_table: pa.Table, formatter: \"Formatter\"):\n        self.pa_table = pa_table\n        self.formatter = formatter\n\n        self.data = dict.fromkeys(pa_table.column_names)\n        self.keys_to_format = set(self.data.keys())\n\n    def __len__(self):\n        return len(self.data)\n\n    def __getitem__(self, key):\n        value = self.data[key]\n        if key in self.keys_to_format:\n            value = self.format(key)\n            self.data[key] = value\n            self.keys_to_format.remove(key)\n        return value\n\n    def __setitem__(self, key, value):\n        if key in self.keys_to_format:\n            self.keys_to_format.remove(key)\n        self.data[key] = value\n\n    def __delitem__(self, key) -> None:\n        if key in self.keys_to_format:\n            self.keys_to_format.remove(key)\n        del self.data[key]\n\n    def __iter__(self):\n        return iter(self.data)\n\n    def __contains__(self, key):\n        return key in self.data\n\n    def __repr__(self):\n        self._format_all()\n        return repr(self.data)\n\n    def __or__(self, other):\n        if isinstance(other, LazyDict):\n            inst = self.copy()\n            other = other.copy()\n            other._format_all()\n            inst.keys_to_format -= other.data.keys()\n            inst.data = inst.data | other.data\n            return inst\n        if isinstance(other, dict):\n            inst = self.copy()\n            inst.keys_to_format -= other.keys()\n            inst.data = inst.data | other\n            return inst\n        return NotImplemented\n\n    def __ror__(self, other):\n        if isinstance(other, LazyDict):\n            inst = self.copy()\n            other = other.copy()\n            other._format_all()\n            inst.keys_to_format -= other.data.keys()\n            inst.data = other.data | inst.data\n            return inst\n        if 
isinstance(other, dict):\n            inst = self.copy()\n            inst.keys_to_format -= other.keys()\n            inst.data = other | inst.data\n            return inst\n        return NotImplemented\n\n    def __ior__(self, other):\n        if isinstance(other, LazyDict):\n            other = other.copy()\n            other._format_all()\n            self.keys_to_format -= other.data.keys()\n            self.data |= other.data\n        else:\n            self.keys_to_format -= other.keys()\n            self.data |= other\n        return self\n\n    def __copy__(self):\n        # Identical to `UserDict.__copy__`\n        inst = self.__class__.__new__(self.__class__)\n        inst.__dict__.update(self.__dict__)\n        # Create a copy and avoid triggering descriptors\n        inst.__dict__[\"data\"] = self.__dict__[\"data\"].copy()\n        inst.__dict__[\"keys_to_format\"] = self.__dict__[\"keys_to_format\"].copy()\n        return inst\n\n    def copy(self):\n        import copy\n\n        return copy.copy(self)\n\n    @classmethod\n    def fromkeys(cls, iterable, value=None):\n        raise NotImplementedError\n\n    def format(self, key):\n        raise NotImplementedError\n\n    def _format_all(self):\n        for key in self.keys_to_format:\n            self.data[key] = self.format(key)\n        self.keys_to_format.clear()\n\n\nclass LazyRow(LazyDict):\n    def format(self, key):\n        return self.formatter.format_column(self.pa_table.select([key]))[0]\n\n\nclass LazyBatch(LazyDict):\n    def format(self, key):\n        return self.formatter.format_column(self.pa_table.select([key]))\n\n\nclass Formatter(Generic[RowFormat, ColumnFormat, BatchFormat]):\n    \"\"\"\n    A formatter is an object that extracts and formats data from pyarrow tables.\n    It defines the formatting for rows, columns and batches.\n    \"\"\"\n\n    simple_arrow_extractor = SimpleArrowExtractor\n    python_arrow_extractor = PythonArrowExtractor\n    numpy_arrow_extractor = 
NumpyArrowExtractor\n    pandas_arrow_extractor = PandasArrowExtractor\n\n    def __init__(\n        self,\n        features: Optional[Features] = None,\n        token_per_repo_id: Optional[dict[str, Union[str, bool, None]]] = None,\n    ):\n        self.features = features\n        self.token_per_repo_id = token_per_repo_id\n        self.python_features_decoder = PythonFeaturesDecoder(self.features, self.token_per_repo_id)\n        self.pandas_features_decoder = PandasFeaturesDecoder(self.features)\n\n    def __call__(self, pa_table: pa.Table, query_type: str) -> Union[RowFormat, ColumnFormat, BatchFormat]:\n        if query_type == \"row\":\n            return self.format_row(pa_table)\n        elif query_type == \"column\":\n            return self.format_column(pa_table)\n        elif query_type == \"batch\":\n            return self.format_batch(pa_table)\n\n    def format_row(self, pa_table: pa.Table) -> RowFormat:\n        raise NotImplementedError\n\n    def format_column(self, pa_table: pa.Table) -> ColumnFormat:\n        raise NotImplementedError\n\n    def format_batch(self, pa_table: pa.Table) -> BatchFormat:\n        raise NotImplementedError\n\n\nclass TensorFormatter(Formatter[RowFormat, ColumnFormat, BatchFormat]):\n    def recursive_tensorize(self, data_struct: dict):\n        raise NotImplementedError\n\n\nclass TableFormatter(Formatter[RowFormat, ColumnFormat, BatchFormat]):\n    table_type: str\n    column_type: str\n\n\nclass ArrowFormatter(TableFormatter[pa.Table, pa.Array, pa.Table]):\n    table_type = \"arrow table\"\n    column_type = \"arrow array\"\n\n    def format_row(self, pa_table: pa.Table) -> pa.Table:\n        return self.simple_arrow_extractor().extract_row(pa_table)\n\n    def format_column(self, pa_table: pa.Table) -> pa.Array:\n        return self.simple_arrow_extractor().extract_column(pa_table)\n\n    def format_batch(self, pa_table: pa.Table) -> pa.Table:\n        return 
self.simple_arrow_extractor().extract_batch(pa_table)\n\n\nclass PythonFormatter(Formatter[Mapping, list, Mapping]):\n    def __init__(self, features=None, lazy=False, token_per_repo_id=None):\n        super().__init__(features, token_per_repo_id)\n        self.lazy = lazy\n\n    def format_row(self, pa_table: pa.Table) -> Mapping:\n        if self.lazy:\n            return LazyRow(pa_table, self)\n        row = self.python_arrow_extractor().extract_row(pa_table)\n        row = self.python_features_decoder.decode_row(row)\n        return row\n\n    def format_column(self, pa_table: pa.Table) -> list:\n        column = self.python_arrow_extractor().extract_column(pa_table)\n        column = self.python_features_decoder.decode_column(column, pa_table.column_names[0])\n        return column\n\n    def format_batch(self, pa_table: pa.Table) -> Mapping:\n        if self.lazy:\n            return LazyBatch(pa_table, self)\n        batch = self.python_arrow_extractor().extract_batch(pa_table)\n        batch = self.python_features_decoder.decode_batch(batch)\n        return batch\n\n\nclass PandasFormatter(TableFormatter[pd.DataFrame, pd.Series, pd.DataFrame]):\n    table_type = \"pandas dataframe\"\n    column_type = \"pandas series\"\n\n    def format_row(self, pa_table: pa.Table) -> pd.DataFrame:\n        row = self.pandas_arrow_extractor().extract_row(pa_table)\n        row = self.pandas_features_decoder.decode_row(row)\n        return row\n\n    def format_column(self, pa_table: pa.Table) -> pd.Series:\n        column = self.pandas_arrow_extractor().extract_column(pa_table)\n        column = self.pandas_features_decoder.decode_column(column, pa_table.column_names[0])\n        return column\n\n    def format_batch(self, pa_table: pa.Table) -> pd.DataFrame:\n        row = self.pandas_arrow_extractor().extract_batch(pa_table)\n        row = self.pandas_features_decoder.decode_batch(row)\n        return row\n\n\nclass CustomFormatter(Formatter[dict, ColumnFormat, 
dict]):\n    \"\"\"\n    A user-defined custom formatter function defined by a ``transform``.\n    The transform must take as input a batch of data extracted for an arrow table using the python extractor,\n    and return a batch.\n    If the output batch is not a dict, then output_all_columns won't work.\n    If the output batch has several fields, then querying a single column won't work since we don't know which field\n    to return.\n    \"\"\"\n\n    def __init__(self, transform: Callable[[dict], dict], features=None, token_per_repo_id=None, **kwargs):\n        super().__init__(features=features, token_per_repo_id=token_per_repo_id)\n        self.transform = transform\n\n    def format_row(self, pa_table: pa.Table) -> dict:\n        formatted_batch = self.format_batch(pa_table)\n        try:\n            return _unnest(formatted_batch)\n        except Exception as exc:\n            raise TypeError(\n                f\"Custom formatting function must return a dict of sequences to be able to pick a row, but got {formatted_batch}\"\n            ) from exc\n\n    def format_column(self, pa_table: pa.Table) -> ColumnFormat:\n        formatted_batch = self.format_batch(pa_table)\n        if hasattr(formatted_batch, \"keys\"):\n            if len(formatted_batch.keys()) > 1:\n                raise TypeError(\n                    \"Tried to query a column but the custom formatting function returns too many columns. 
\"\n                    f\"Only one column was expected but got columns {list(formatted_batch.keys())}.\"\n                )\n        else:\n            raise TypeError(\n                f\"Custom formatting function must return a dict to be able to pick a row, but got {formatted_batch}\"\n            )\n        try:\n            return formatted_batch[pa_table.column_names[0]]\n        except Exception as exc:\n            raise TypeError(\n                f\"Custom formatting function must return a dict to be able to pick a row, but got {formatted_batch}\"\n            ) from exc\n\n    def format_batch(self, pa_table: pa.Table) -> dict:\n        batch = self.python_arrow_extractor().extract_batch(pa_table)\n        batch = self.python_features_decoder.decode_batch(batch)\n        return self.transform(batch)\n\n\ndef _check_valid_column_key(key: str, columns: list[str]) -> None:\n    if key not in columns:\n        raise KeyError(f\"Column {key} not in the dataset. Current columns in the dataset: {columns}\")\n\n\ndef _check_valid_index_key(key: Union[int, slice, range, Iterable], size: int) -> None:\n    if isinstance(key, int):\n        if (key < 0 and key + size < 0) or (key >= size):\n            raise IndexError(f\"Invalid key: {key} is out of bounds for size {size}\")\n        return\n    elif isinstance(key, slice):\n        pass\n    elif isinstance(key, range):\n        if len(key) > 0:\n            _check_valid_index_key(max(key), size=size)\n            _check_valid_index_key(min(key), size=size)\n    elif isinstance(key, Iterable):\n        if len(key) > 0:\n            _check_valid_index_key(int(max(key)), size=size)\n            _check_valid_index_key(int(min(key)), size=size)\n    else:\n        _raise_bad_key_type(key)\n\n\ndef key_to_query_type(key: Union[int, slice, range, str, Iterable]) -> str:\n    if isinstance(key, numbers.Integral):\n        return \"row\"\n    elif isinstance(key, str):\n        return \"column\"\n    elif 
isinstance(key, (slice, range, Iterable)):\n        return \"batch\"\n    _raise_bad_key_type(key)\n\n\ndef query_table(\n    table: Table,\n    key: Union[int, slice, range, str, Iterable],\n    indices: Optional[Table] = None,\n) -> pa.Table:\n    \"\"\"\n    Query a Table to extract the subtable that correspond to the given key.\n\n    Args:\n        table (``datasets.table.Table``): The input Table to query from\n        key (``Union[int, slice, range, str, Iterable]``): The key can be of different types:\n            - an integer i: the subtable containing only the i-th row\n            - a slice [i:j:k]: the subtable containing the rows that correspond to this slice\n            - a range(i, j, k): the subtable containing the rows that correspond to this range\n            - a string c: the subtable containing all the rows but only the column c\n            - an iterable l: the subtable that is the concatenation of all the i-th rows for all i in the iterable\n        indices (Optional ``datasets.table.Table``): If not None, it is used to re-map the given key to the table rows.\n            The indices table must contain one column named \"indices\" of type uint64.\n            This is used in case of shuffling or rows selection.\n\n\n    Returns:\n        ``pyarrow.Table``: the result of the query on the input table\n    \"\"\"\n    # Check if key is valid\n    if not isinstance(key, (int, slice, range, str, Iterable)):\n        try:\n            key = operator.index(key)\n        except TypeError:\n            _raise_bad_key_type(key)\n    if isinstance(key, str):\n        _check_valid_column_key(key, table.column_names)\n    else:\n        size = indices.num_rows if indices is not None else table.num_rows\n        _check_valid_index_key(key, size)\n    # Query the main table\n    if indices is None:\n        pa_subtable = _query_table(table, key)\n    else:\n        pa_subtable = _query_table_with_indices_mapping(table, key, indices=indices)\n    return 
pa_subtable\n\n\ndef format_table(\n    table: Table,\n    key: Union[int, slice, range, str, Iterable],\n    formatter: Formatter,\n    format_columns: Optional[list] = None,\n    output_all_columns=False,\n):\n    \"\"\"\n    Format a Table depending on the key that was used and a Formatter object.\n\n    Args:\n        table (``datasets.table.Table``): The input Table to format\n        key (``Union[int, slice, range, str, Iterable]``): Depending on the key that was used, the formatter formats\n            the table as either a row, a column or a batch.\n        formatter (``datasets.formatting.formatting.Formatter``): Any subclass of a Formatter such as\n            PythonFormatter, NumpyFormatter, etc.\n        format_columns (:obj:`List[str]`, optional): if not None, it defines the columns that will be formatted using the\n            given formatter. Other columns are discarded (unless ``output_all_columns`` is True)\n        output_all_columns (:obj:`bool`, defaults to False). If True, the formatted output is completed using the columns\n            that are not in the ``format_columns`` list. 
For these columns, the PythonFormatter is used.\n\n\n    Returns:\n        A row, column or batch formatted object defined by the Formatter:\n        - the PythonFormatter returns a dictionary for a row or a batch, and a list for a column.\n        - the NumpyFormatter returns a dictionary for a row or a batch, and a np.array for a column.\n        - the PandasFormatter returns a pd.DataFrame for a row or a batch, and a pd.Series for a column.\n        - the TorchFormatter returns a dictionary for a row or a batch, and a torch.Tensor for a column.\n        - the TFFormatter returns a dictionary for a row or a batch, and a tf.Tensor for a column.\n    \"\"\"\n    if isinstance(table, Table):\n        pa_table = table.table\n    else:\n        pa_table = table\n    query_type = key_to_query_type(key)\n    python_formatter = PythonFormatter(features=formatter.features)\n    if format_columns is None:\n        return formatter(pa_table, query_type=query_type)\n    elif query_type == \"column\":\n        if key in format_columns:\n            return formatter(pa_table, query_type)\n        else:\n            return python_formatter(pa_table, query_type=query_type)\n    else:\n        pa_table_to_format = pa_table.drop(col for col in pa_table.column_names if col not in format_columns)\n        formatted_output = formatter(pa_table_to_format, query_type=query_type)\n        if output_all_columns:\n            if isinstance(formatted_output, MutableMapping):\n                pa_table_with_remaining_columns = pa_table.drop(\n                    col for col in pa_table.column_names if col in format_columns\n                )\n                remaining_columns_dict = python_formatter(pa_table_with_remaining_columns, query_type=query_type)\n                formatted_output.update(remaining_columns_dict)\n            else:\n                raise TypeError(\n                    f\"Custom formatting function must return a dict to work with output_all_columns=True, but got 
{formatted_output}\"\n                )\n        return formatted_output\n"
  },
  {
    "path": "src/datasets/formatting/jax_formatter.py",
    "content": "# Copyright 2021 The HuggingFace Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\nimport sys\nfrom collections.abc import Mapping\nfrom typing import TYPE_CHECKING, Optional\n\nimport numpy as np\nimport pyarrow as pa\n\nfrom .. import config\nfrom ..utils.logging import get_logger\nfrom ..utils.py_utils import map_nested\nfrom .formatting import TensorFormatter\n\n\nif TYPE_CHECKING:\n    import jax\n    import jaxlib\n\nlogger = get_logger()\n\nDEVICE_MAPPING: Optional[dict] = None\n\n\nclass JaxFormatter(TensorFormatter[Mapping, \"jax.Array\", Mapping]):\n    def __init__(self, features=None, device=None, token_per_repo_id=None, **jnp_array_kwargs):\n        super().__init__(features=features, token_per_repo_id=token_per_repo_id)\n        import jax\n        from jaxlib.xla_client import Device\n\n        if isinstance(device, Device):\n            raise ValueError(\n                f\"Expected {device} to be a `str` not {type(device)}, as `jaxlib.xla_extension.Device` \"\n                \"is not serializable neither with `pickle` nor with `dill`. 
Instead you can surround \"\n                \"the device with `str()` to get its string identifier that will be internally mapped \"\n                \"to the actual `jaxlib.xla_extension.Device`.\"\n            )\n        self.device = device if isinstance(device, str) else str(jax.devices()[0])\n        # using global variable since `jaxlib.xla_extension.Device` is not serializable neither\n        # with `pickle` nor with `dill`, so we need to use a global variable instead\n        global DEVICE_MAPPING\n        if DEVICE_MAPPING is None:\n            DEVICE_MAPPING = self._map_devices_to_str()\n        if self.device not in list(DEVICE_MAPPING.keys()):\n            logger.warning(\n                f\"Device with string identifier {self.device} not listed among the available \"\n                f\"devices: {list(DEVICE_MAPPING.keys())}, so falling back to the default \"\n                f\"device: {str(jax.devices()[0])}.\"\n            )\n            self.device = str(jax.devices()[0])\n        self.jnp_array_kwargs = jnp_array_kwargs\n\n    @staticmethod\n    def _map_devices_to_str() -> dict[str, \"jaxlib.xla_extension.Device\"]:\n        import jax\n\n        return {str(device): device for device in jax.devices()}\n\n    def _consolidate(self, column):\n        import jax\n        import jax.numpy as jnp\n\n        if isinstance(column, list) and column:\n            if all(\n                isinstance(x, jax.Array) and x.shape == column[0].shape and x.dtype == column[0].dtype for x in column\n            ):\n                return jnp.stack(column, axis=0)\n        return column\n\n    def _tensorize(self, value):\n        import jax\n        import jax.numpy as jnp\n\n        if isinstance(value, (str, bytes, type(None))):\n            return value\n        elif isinstance(value, (np.character, np.ndarray)) and np.issubdtype(value.dtype, np.character):\n            return value.tolist()\n\n        default_dtype = {}\n\n        if isinstance(value, 
(np.number, np.ndarray)) and np.issubdtype(value.dtype, np.integer):\n            # the default int precision depends on the jax config\n            # see https://jax.readthedocs.io/en/latest/notebooks/Common_Gotchas_in_JAX.html#double-64bit-precision\n            if jax.config.jax_enable_x64:\n                default_dtype = {\"dtype\": jnp.int64}\n            else:\n                default_dtype = {\"dtype\": jnp.int32}\n        elif isinstance(value, (np.number, np.ndarray)) and np.issubdtype(value.dtype, np.floating):\n            default_dtype = {\"dtype\": jnp.float32}\n\n        if config.PIL_AVAILABLE and \"PIL\" in sys.modules:\n            import PIL.Image\n\n            if isinstance(value, PIL.Image.Image):\n                value = np.asarray(value)\n        if config.TORCHVISION_AVAILABLE and \"torchvision\" in sys.modules:\n            from torchvision.io import VideoReader\n\n            if isinstance(value, VideoReader):\n                return value  # TODO(QL): set output to jax arrays ?\n        if config.TORCHCODEC_AVAILABLE and \"torchcodec\" in sys.modules:\n            from torchcodec.decoders import AudioDecoder, VideoDecoder\n\n            if isinstance(value, (VideoDecoder, AudioDecoder)):\n                return value  # TODO(QL): set output to jax arrays ?\n\n        # using global variable since `jaxlib.xla_extension.Device` is not serializable neither\n        # with `pickle` nor with `dill`, so we need to use a global variable instead\n        global DEVICE_MAPPING\n        if DEVICE_MAPPING is None:\n            DEVICE_MAPPING = self._map_devices_to_str()\n\n        with jax.default_device(DEVICE_MAPPING[self.device]):\n            # calling jnp.array on a np.ndarray does copy the data\n            # see https://github.com/google/jax/issues/4486\n            return jnp.array(value, **{**default_dtype, **self.jnp_array_kwargs})\n\n    def _recursive_tensorize(self, data_struct):\n        import jax\n\n        # support for torch, tf, 
jax etc.\n        if config.TORCH_AVAILABLE and \"torch\" in sys.modules:\n            import torch\n\n            if isinstance(data_struct, torch.Tensor):\n                return self._tensorize(data_struct.detach().cpu().numpy()[()])\n        if hasattr(data_struct, \"__array__\") and not isinstance(data_struct, jax.Array):\n            data_struct = data_struct.__array__()\n        # support for nested types like struct of list of struct\n        if isinstance(data_struct, np.ndarray):\n            if data_struct.dtype == object:  # jax arrays cannot be instantied from an array of objects\n                return self._consolidate([self.recursive_tensorize(substruct) for substruct in data_struct])\n        elif isinstance(data_struct, (list, tuple)):\n            return self._consolidate([self.recursive_tensorize(substruct) for substruct in data_struct])\n        return self._tensorize(data_struct)\n\n    def recursive_tensorize(self, data_struct: dict):\n        return map_nested(self._recursive_tensorize, data_struct, map_list=False)\n\n    def format_row(self, pa_table: pa.Table) -> Mapping:\n        row = self.numpy_arrow_extractor().extract_row(pa_table)\n        row = self.python_features_decoder.decode_row(row)\n        return self.recursive_tensorize(row)\n\n    def format_column(self, pa_table: pa.Table) -> \"jax.Array\":\n        column = self.numpy_arrow_extractor().extract_column(pa_table)\n        column = self.python_features_decoder.decode_column(column, pa_table.column_names[0])\n        column = self.recursive_tensorize(column)\n        column = self._consolidate(column)\n        return column\n\n    def format_batch(self, pa_table: pa.Table) -> Mapping:\n        batch = self.numpy_arrow_extractor().extract_batch(pa_table)\n        batch = self.python_features_decoder.decode_batch(batch)\n        batch = self.recursive_tensorize(batch)\n        for column_name in batch:\n            batch[column_name] = self._consolidate(batch[column_name])\n    
    return batch\n"
  },
  {
    "path": "src/datasets/formatting/np_formatter.py",
    "content": "# Copyright 2020 The HuggingFace Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nfrom collections.abc import Mapping\n\nimport numpy as np\nimport pyarrow as pa\n\nfrom .. import config\nfrom ..utils.py_utils import map_nested\nfrom .formatting import TensorFormatter\n\n\nclass NumpyFormatter(TensorFormatter[Mapping, np.ndarray, Mapping]):\n    def __init__(self, features=None, token_per_repo_id=None, **np_array_kwargs):\n        super().__init__(features=features, token_per_repo_id=token_per_repo_id)\n        self.np_array_kwargs = np_array_kwargs\n\n    def _consolidate(self, column):\n        if isinstance(column, list):\n            if column and all(\n                isinstance(x, np.ndarray) and x.shape == column[0].shape and x.dtype == column[0].dtype for x in column\n            ):\n                return np.stack(column)\n            else:\n                # don't use np.array(column, dtype=object)\n                # since it fails in certain cases\n                # see https://stackoverflow.com/q/51005699\n                out = np.empty(len(column), dtype=object)\n                out[:] = column\n                return out\n        return column\n\n    def _tensorize(self, value):\n        if isinstance(value, (str, bytes, type(None))):\n            return value\n        elif isinstance(value, (np.character, np.ndarray)) and np.issubdtype(value.dtype, np.character):\n            return value\n        elif 
isinstance(value, np.number):\n            return value\n\n        default_dtype = {}\n\n        if isinstance(value, np.ndarray) and np.issubdtype(value.dtype, np.integer):\n            default_dtype = {\"dtype\": np.int64}\n        elif isinstance(value, np.ndarray) and np.issubdtype(value.dtype, np.floating):\n            default_dtype = {\"dtype\": np.float32}\n\n        if config.PIL_AVAILABLE and \"PIL\" in sys.modules:\n            import PIL.Image\n\n            if isinstance(value, PIL.Image.Image):\n                return np.asarray(value, **self.np_array_kwargs)\n        if config.TORCHVISION_AVAILABLE and \"torchvision\" in sys.modules:\n            from torchvision.io import VideoReader\n\n            if isinstance(value, VideoReader):\n                return value  # TODO(QL): set output to np arrays ?\n        if config.TORCHCODEC_AVAILABLE and \"torchcodec\" in sys.modules:\n            from torchcodec.decoders import AudioDecoder, VideoDecoder\n\n            if isinstance(value, (VideoDecoder, AudioDecoder)):\n                return value  # TODO(QL): set output to np arrays ?\n\n        return np.asarray(value, **{**default_dtype, **self.np_array_kwargs})\n\n    def _recursive_tensorize(self, data_struct):\n        # support for torch, tf, jax etc.\n        if config.TORCH_AVAILABLE and \"torch\" in sys.modules:\n            import torch\n\n            if isinstance(data_struct, torch.Tensor):\n                return self._tensorize(data_struct.detach().cpu().numpy()[()])\n        if hasattr(data_struct, \"__array__\") and not isinstance(data_struct, (np.ndarray, np.character, np.number)):\n            data_struct = data_struct.__array__()\n        # support for nested types like struct of list of struct\n        if isinstance(data_struct, np.ndarray):\n            if data_struct.dtype == object:\n                return self._consolidate([self.recursive_tensorize(substruct) for substruct in data_struct])\n        if isinstance(data_struct, (list, 
tuple)):\n            return self._consolidate([self.recursive_tensorize(substruct) for substruct in data_struct])\n        return self._tensorize(data_struct)\n\n    def recursive_tensorize(self, data_struct: dict):\n        return map_nested(self._recursive_tensorize, data_struct, map_list=False)\n\n    def format_row(self, pa_table: pa.Table) -> Mapping:\n        row = self.numpy_arrow_extractor().extract_row(pa_table)\n        row = self.python_features_decoder.decode_row(row)\n        return self.recursive_tensorize(row)\n\n    def format_column(self, pa_table: pa.Table) -> np.ndarray:\n        column = self.numpy_arrow_extractor().extract_column(pa_table)\n        column = self.python_features_decoder.decode_column(column, pa_table.column_names[0])\n        column = self.recursive_tensorize(column)\n        column = self._consolidate(column)\n        return column\n\n    def format_batch(self, pa_table: pa.Table) -> Mapping:\n        batch = self.numpy_arrow_extractor().extract_batch(pa_table)\n        batch = self.python_features_decoder.decode_batch(batch)\n        batch = self.recursive_tensorize(batch)\n        for column_name in batch:\n            batch[column_name] = self._consolidate(batch[column_name])\n        return batch\n"
  },
  {
    "path": "src/datasets/formatting/polars_formatter.py",
    "content": "# Copyright 2020 The HuggingFace Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport sys\nfrom functools import partial\nfrom typing import TYPE_CHECKING, Optional\n\nimport pyarrow as pa\n\nfrom .. import config\nfrom ..features import Features\nfrom ..features.features import decode_nested_example\nfrom ..utils.py_utils import no_op_if_value_is_null\nfrom .formatting import BaseArrowExtractor, TableFormatter\n\n\nif TYPE_CHECKING:\n    import polars as pl\n\n\nclass PolarsArrowExtractor(BaseArrowExtractor[\"pl.DataFrame\", \"pl.Series\", \"pl.DataFrame\"]):\n    def extract_row(self, pa_table: pa.Table) -> \"pl.DataFrame\":\n        if config.POLARS_AVAILABLE:\n            if \"polars\" not in sys.modules:\n                import polars\n            else:\n                polars = sys.modules[\"polars\"]\n\n            return polars.from_arrow(pa_table.slice(length=1))\n        else:\n            raise ValueError(\"Polars needs to be installed to be able to return Polars dataframes.\")\n\n    def extract_column(self, pa_table: pa.Table) -> \"pl.Series\":\n        if config.POLARS_AVAILABLE:\n            if \"polars\" not in sys.modules:\n                import polars\n            else:\n                polars = sys.modules[\"polars\"]\n\n            return polars.from_arrow(pa_table.select([0]))[pa_table.column_names[0]]\n        else:\n            raise ValueError(\"Polars needs to be installed to be able to return Polars 
dataframes.\")\n\n    def extract_batch(self, pa_table: pa.Table) -> \"pl.DataFrame\":\n        if config.POLARS_AVAILABLE:\n            if \"polars\" not in sys.modules:\n                import polars\n            else:\n                polars = sys.modules[\"polars\"]\n\n            return polars.from_arrow(pa_table)\n        else:\n            raise ValueError(\"Polars needs to be installed to be able to return Polars dataframes.\")\n\n\nclass PolarsFeaturesDecoder:\n    def __init__(self, features: Optional[Features]):\n        self.features = features\n        import polars as pl  # noqa: F401 - import pl at initialization\n\n    def decode_row(self, row: \"pl.DataFrame\") -> \"pl.DataFrame\":\n        decode = (\n            {\n                column_name: no_op_if_value_is_null(partial(decode_nested_example, feature))\n                for column_name, feature in self.features.items()\n                if self.features._column_requires_decoding[column_name]\n            }\n            if self.features\n            else {}\n        )\n        if decode:\n            row[list(decode.keys())] = row.map_rows(decode)\n        return row\n\n    def decode_column(self, column: \"pl.Series\", column_name: str) -> \"pl.Series\":\n        decode = (\n            no_op_if_value_is_null(partial(decode_nested_example, self.features[column_name]))\n            if self.features and column_name in self.features and self.features._column_requires_decoding[column_name]\n            else None\n        )\n        if decode:\n            column = column.map_elements(decode)\n        return column\n\n    def decode_batch(self, batch: \"pl.DataFrame\") -> \"pl.DataFrame\":\n        return self.decode_row(batch)\n\n\nclass PolarsFormatter(TableFormatter[\"pl.DataFrame\", \"pl.Series\", \"pl.DataFrame\"]):\n    table_type = \"polars dataframe\"\n    column_type = \"polars series\"\n\n    def __init__(self, features=None, **np_array_kwargs):\n        
super().__init__(features=features)\n        self.np_array_kwargs = np_array_kwargs\n        self.polars_arrow_extractor = PolarsArrowExtractor\n        self.polars_features_decoder = PolarsFeaturesDecoder(features)\n        import polars as pl  # noqa: F401 - import pl at initialization\n\n    def format_row(self, pa_table: pa.Table) -> \"pl.DataFrame\":\n        row = self.polars_arrow_extractor().extract_row(pa_table)\n        row = self.polars_features_decoder.decode_row(row)\n        return row\n\n    def format_column(self, pa_table: pa.Table) -> \"pl.Series\":\n        column = self.polars_arrow_extractor().extract_column(pa_table)\n        column = self.polars_features_decoder.decode_column(column, pa_table.column_names[0])\n        return column\n\n    def format_batch(self, pa_table: pa.Table) -> \"pl.DataFrame\":\n        row = self.polars_arrow_extractor().extract_batch(pa_table)\n        row = self.polars_features_decoder.decode_batch(row)\n        return row\n"
  },
  {
    "path": "src/datasets/formatting/tf_formatter.py",
    "content": "# Copyright 2020 The HuggingFace Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\nimport sys\nfrom collections.abc import Mapping\nfrom typing import TYPE_CHECKING\n\nimport numpy as np\nimport pyarrow as pa\n\nfrom .. import config\nfrom ..utils.py_utils import map_nested\nfrom .formatting import TensorFormatter\n\n\nif TYPE_CHECKING:\n    import tensorflow as tf\n\n\nclass TFFormatter(TensorFormatter[Mapping, \"tf.Tensor\", Mapping]):\n    def __init__(self, features=None, token_per_repo_id=None, **tf_tensor_kwargs):\n        super().__init__(features=features, token_per_repo_id=token_per_repo_id)\n        self.tf_tensor_kwargs = tf_tensor_kwargs\n        import tensorflow as tf  # noqa: F401 - import tf at initialization\n\n    def _consolidate(self, column):\n        import tensorflow as tf\n\n        if isinstance(column, list) and column:\n            if all(\n                isinstance(x, tf.Tensor) and x.shape == column[0].shape and x.dtype == column[0].dtype for x in column\n            ):\n                return tf.stack(column)\n            elif all(\n                isinstance(x, (tf.Tensor, tf.RaggedTensor)) and x.ndim == 1 and x.dtype == column[0].dtype\n                for x in column\n            ):\n                # only rag 1-D tensors, otherwise some dimensions become ragged even though they were consolidated\n                return tf.ragged.stack(column)\n\n        return column\n\n    def 
_tensorize(self, value):\n        import tensorflow as tf\n\n        if value is None:\n            return value\n\n        default_dtype = {}\n\n        if isinstance(value, (np.number, np.ndarray)) and np.issubdtype(value.dtype, np.integer):\n            default_dtype = {\"dtype\": tf.int64}\n        elif isinstance(value, (np.number, np.ndarray)) and np.issubdtype(value.dtype, np.floating):\n            default_dtype = {\"dtype\": tf.float32}\n\n        if config.PIL_AVAILABLE and \"PIL\" in sys.modules:\n            import PIL.Image\n\n            if isinstance(value, PIL.Image.Image):\n                value = np.asarray(value)\n        if config.TORCHVISION_AVAILABLE and \"torchvision\" in sys.modules:\n            from torchvision.io import VideoReader\n\n            if isinstance(value, VideoReader):\n                return value  # TODO(QL): set output to tf tensors ?\n        if config.TORCHCODEC_AVAILABLE and \"torchcodec\" in sys.modules:\n            from torchcodec.decoders import AudioDecoder, VideoDecoder\n\n            if isinstance(value, (VideoDecoder, AudioDecoder)):\n                return value  # TODO(QL): set output to jax arrays ?\n\n        return tf.convert_to_tensor(value, **{**default_dtype, **self.tf_tensor_kwargs})\n\n    def _recursive_tensorize(self, data_struct):\n        import tensorflow as tf\n\n        # support for torch, tf, jax etc.\n        if config.TORCH_AVAILABLE and \"torch\" in sys.modules:\n            import torch\n\n            if isinstance(data_struct, torch.Tensor):\n                return self._tensorize(data_struct.detach().cpu().numpy()[()])\n        if hasattr(data_struct, \"__array__\") and not isinstance(data_struct, tf.Tensor):\n            data_struct = data_struct.__array__()\n        # support for nested types like struct of list of struct\n        if isinstance(data_struct, np.ndarray):\n            if data_struct.dtype == object:  # tf tensors cannot be instantied from an array of objects\n             
   return self._consolidate([self.recursive_tensorize(substruct) for substruct in data_struct])\n        elif isinstance(data_struct, (list, tuple)):\n            return self._consolidate([self.recursive_tensorize(substruct) for substruct in data_struct])\n        return self._tensorize(data_struct)\n\n    def recursive_tensorize(self, data_struct: dict):\n        return map_nested(self._recursive_tensorize, data_struct, map_list=False)\n\n    def format_row(self, pa_table: pa.Table) -> Mapping:\n        row = self.numpy_arrow_extractor().extract_row(pa_table)\n        row = self.python_features_decoder.decode_row(row)\n        return self.recursive_tensorize(row)\n\n    def format_column(self, pa_table: pa.Table) -> \"tf.Tensor\":\n        column = self.numpy_arrow_extractor().extract_column(pa_table)\n        column = self.python_features_decoder.decode_column(column, pa_table.column_names[0])\n        column = self.recursive_tensorize(column)\n        column = self._consolidate(column)\n        return column\n\n    def format_batch(self, pa_table: pa.Table) -> Mapping:\n        batch = self.numpy_arrow_extractor().extract_batch(pa_table)\n        batch = self.python_features_decoder.decode_batch(batch)\n        batch = self.recursive_tensorize(batch)\n        for column_name in batch:\n            batch[column_name] = self._consolidate(batch[column_name])\n        return batch\n"
  },
  {
    "path": "src/datasets/formatting/torch_formatter.py",
    "content": "# Copyright 2020 The HuggingFace Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\nimport sys\nfrom collections.abc import Mapping\nfrom typing import TYPE_CHECKING\n\nimport numpy as np\nimport pyarrow as pa\n\nfrom .. import config\nfrom ..utils.py_utils import map_nested\nfrom .formatting import TensorFormatter\n\n\nif TYPE_CHECKING:\n    import torch\n\n\nclass TorchFormatter(TensorFormatter[Mapping, \"torch.Tensor\", Mapping]):\n    def __init__(self, features=None, token_per_repo_id=None, **torch_tensor_kwargs):\n        super().__init__(features=features, token_per_repo_id=token_per_repo_id)\n        self.torch_tensor_kwargs = torch_tensor_kwargs\n        import torch  # noqa import torch at initialization\n\n    def _consolidate(self, column):\n        import torch\n\n        if isinstance(column, list) and column:\n            if all(\n                isinstance(x, torch.Tensor) and x.shape == column[0].shape and x.dtype == column[0].dtype\n                for x in column\n            ):\n                return torch.stack(column)\n        return column\n\n    def _tensorize(self, value):\n        import torch\n\n        if isinstance(value, (str, bytes, type(None))):\n            return value\n        elif isinstance(value, (np.character, np.ndarray)) and np.issubdtype(value.dtype, np.character):\n            return value.tolist()\n\n        default_dtype = {}\n\n        if isinstance(value, (np.number, 
np.ndarray)) and np.issubdtype(value.dtype, np.integer):\n            default_dtype = {\"dtype\": torch.int64}\n\n            # Convert dtype to np.int64 if it's either np.uint16 or np.uint32 to ensure compatibility.\n            # np.uint64 is excluded from this conversion as there is no compatible PyTorch dtype that can handle it without loss.\n            if value.dtype in [np.uint16, np.uint32]:\n                value = value.astype(np.int64)\n\n        elif isinstance(value, (np.number, np.ndarray)) and np.issubdtype(value.dtype, np.floating):\n            default_dtype = {\"dtype\": torch.float32}\n\n        if config.PIL_AVAILABLE and \"PIL\" in sys.modules:\n            import PIL.Image\n\n            if isinstance(value, PIL.Image.Image):\n                value = np.asarray(value)\n                if value.ndim == 2:\n                    value = value[:, :, np.newaxis]\n\n                value = value.transpose((2, 0, 1))\n        if config.TORCHVISION_AVAILABLE and \"torchvision\" in sys.modules:\n            from torchvision.io import VideoReader\n\n            if isinstance(value, VideoReader):\n                return value  # TODO(QL): set output to torch tensors ?\n        if config.TORCHCODEC_AVAILABLE and \"torchcodec\" in sys.modules:\n            from torchcodec.decoders import AudioDecoder, VideoDecoder\n\n            if isinstance(value, (VideoDecoder, AudioDecoder)):\n                return value  # TODO(QL): set output to jax arrays ?\n\n        return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})\n\n    def _recursive_tensorize(self, data_struct):\n        import torch\n\n        # support for torch, tf, jax etc.\n        if hasattr(data_struct, \"__array__\") and not isinstance(data_struct, torch.Tensor):\n            data_struct = data_struct.__array__()\n        # support for nested types like struct of list of struct\n        if isinstance(data_struct, np.ndarray):\n            if data_struct.dtype == object:  # 
torch tensors cannot be instantiated from an array of objects\n                return self._consolidate([self.recursive_tensorize(substruct) for substruct in data_struct])\n        elif isinstance(data_struct, (list, tuple)):\n            return self._consolidate([self.recursive_tensorize(substruct) for substruct in data_struct])\n        return self._tensorize(data_struct)\n\n    def recursive_tensorize(self, data_struct: dict):\n        return map_nested(self._recursive_tensorize, data_struct, map_list=False)\n\n    def format_row(self, pa_table: pa.Table) -> Mapping:\n        row = self.numpy_arrow_extractor().extract_row(pa_table)\n        row = self.python_features_decoder.decode_row(row)\n        return self.recursive_tensorize(row)\n\n    def format_column(self, pa_table: pa.Table) -> \"torch.Tensor\":\n        column = self.numpy_arrow_extractor().extract_column(pa_table)\n        column = self.python_features_decoder.decode_column(column, pa_table.column_names[0])\n        column = self.recursive_tensorize(column)\n        column = self._consolidate(column)\n        return column\n\n    def format_batch(self, pa_table: pa.Table) -> Mapping:\n        batch = self.numpy_arrow_extractor().extract_batch(pa_table)\n        batch = self.python_features_decoder.decode_batch(batch)\n        batch = self.recursive_tensorize(batch)\n        for column_name in batch:\n            batch[column_name] = self._consolidate(batch[column_name])\n        return batch\n"
  },
  {
    "path": "src/datasets/hub.py",
    "content": "from itertools import chain\nfrom typing import Optional, Union\n\nfrom huggingface_hub import (\n    CommitInfo,\n    CommitOperationAdd,\n    CommitOperationDelete,\n    DatasetCard,\n    DatasetCardData,\n    HfApi,\n    HfFileSystem,\n)\n\nimport datasets.config\nfrom datasets.info import DatasetInfosDict\nfrom datasets.load import load_dataset_builder\nfrom datasets.utils.metadata import MetadataConfigs\n\n\ndef delete_from_hub(\n    repo_id: str,\n    config_name: str,\n    revision: Optional[str] = None,\n    token: Optional[Union[bool, str]] = None,\n) -> CommitInfo:\n    \"\"\"Delete a dataset configuration from a [data-only dataset](repository_structure) on the Hub.\n\n    Args:\n        repo_id (`str`): ID of the Hub dataset repository, in the following format: `<user>/<dataset_name>` or\n            `<org>/<dataset_name>`.\n        config_name (`str`): Name of the dataset configuration.\n        revision (`str`, *optional*): Branch to delete the configuration from. 
Defaults to the `\"main\"` branch.\n        token (`bool` or `str`, *optional*): Authentication token for the Hugging Face Hub.\n\n    Returns:\n        `huggingface_hub.CommitInfo`\n    \"\"\"\n    operations = []\n    # data_files\n    fs = HfFileSystem(endpoint=datasets.config.HF_ENDPOINT, token=token)\n    builder = load_dataset_builder(repo_id, config_name, revision=revision, token=token)\n    for data_file in chain(*builder.config.data_files.values()):\n        data_file_resolved_path = fs.resolve_path(data_file)\n        if data_file_resolved_path.repo_id == repo_id:\n            operations.append(CommitOperationDelete(path_in_repo=data_file_resolved_path.path_in_repo))\n    # README.md\n    dataset_card = DatasetCard.load(repo_id)\n    # config_names\n    if dataset_card.data.get(\"config_names\", None) and config_name in dataset_card.data[\"config_names\"]:\n        dataset_card.data[\"config_names\"].remove(config_name)\n    # metadata_configs\n    metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card.data)\n    if metadata_configs:\n        _ = metadata_configs.pop(config_name, None)\n        dataset_card_data = DatasetCardData()\n        metadata_configs.to_dataset_card_data(dataset_card_data)\n        if datasets.config.METADATA_CONFIGS_FIELD in dataset_card_data:\n            dataset_card.data[datasets.config.METADATA_CONFIGS_FIELD] = dataset_card_data[\n                datasets.config.METADATA_CONFIGS_FIELD\n            ]\n        else:\n            _ = dataset_card.data.pop(datasets.config.METADATA_CONFIGS_FIELD, None)\n    # dataset_info\n    dataset_infos: DatasetInfosDict = DatasetInfosDict.from_dataset_card_data(dataset_card.data)\n    if dataset_infos:\n        _ = dataset_infos.pop(config_name, None)\n        dataset_card_data = DatasetCardData()\n        dataset_infos.to_dataset_card_data(dataset_card_data)\n        if \"dataset_info\" in dataset_card_data:\n            dataset_card.data[\"dataset_info\"] = 
dataset_card_data[\"dataset_info\"]\n        else:\n            _ = dataset_card.data.pop(\"dataset_info\", None)\n    # Commit\n    operations.append(\n        CommitOperationAdd(path_in_repo=datasets.config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())\n    )\n    api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)\n    commit_info = api.create_commit(\n        repo_id,\n        operations=operations,\n        commit_message=f\"Delete '{config_name}' config\",\n        commit_description=f\"Delete '{config_name}' config.\",\n        token=token,\n        repo_type=\"dataset\",\n        revision=revision,\n        create_pr=True,\n    )\n    print(f\"You can find your PR to delete the dataset config at: {commit_info.pr_url}\")\n    return commit_info\n\n\ndef _delete_files(dataset_id, revision=None, token=None):\n    hf_api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)\n    repo_files = hf_api.list_repo_files(\n        dataset_id,\n        repo_type=\"dataset\",\n    )\n    if repo_files:\n        legacy_json_file = []\n        data_files = []\n        for filename in repo_files:\n            if filename in {\".gitattributes\", \"README.md\"}:\n                continue\n            elif filename == \"dataset_infos.json\":\n                legacy_json_file.append(filename)\n            else:\n                data_files.append(filename)\n        if legacy_json_file:\n            hf_api.delete_file(\n                \"dataset_infos.json\",\n                dataset_id,\n                repo_type=\"dataset\",\n                revision=revision,\n                commit_message=\"Delete legacy dataset_infos.json\",\n            )\n        if data_files:\n            for filename in data_files:\n                hf_api.delete_file(\n                    filename,\n                    dataset_id,\n                    repo_type=\"dataset\",\n                    revision=revision,\n                    commit_message=\"Delete data 
file\",\n                )\n"
  },
  {
    "path": "src/datasets/info.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"DatasetInfo record information we know about a dataset.\n\nThis includes things that we know about the dataset statically, i.e.:\n - description\n - canonical location\n - does it have validation and tests splits\n - size\n - etc.\n\nThis also includes the things that can and should be computed once we've\nprocessed the dataset as well:\n - number of examples (in each split)\n - etc.\n\"\"\"\n\nimport copy\nimport dataclasses\nimport json\nimport os\nimport posixpath\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import ClassVar, Optional, Union\n\nimport fsspec\nfrom fsspec.core import url_to_fs\nfrom huggingface_hub import DatasetCard, DatasetCardData\n\nfrom . 
import config\nfrom .features import Features\nfrom .splits import SplitDict\nfrom .utils import Version\nfrom .utils.logging import get_logger\nfrom .utils.py_utils import asdict, unique_values\n\n\nlogger = get_logger(__name__)\n\n\n@dataclass\nclass SupervisedKeysData:\n    input: str = \"\"\n    output: str = \"\"\n\n\n@dataclass\nclass DownloadChecksumsEntryData:\n    key: str = \"\"\n    value: str = \"\"\n\n\nclass MissingCachedSizesConfigError(Exception):\n    \"\"\"The expected cached sizes of the download file are missing.\"\"\"\n\n\nclass NonMatchingCachedSizesError(Exception):\n    \"\"\"The prepared split doesn't have expected sizes.\"\"\"\n\n\n@dataclass\nclass PostProcessedInfo:\n    features: Optional[Features] = None\n    resources_checksums: Optional[dict] = None\n\n    def __post_init__(self):\n        # Convert back to the correct classes when we reload from dict\n        if self.features is not None and not isinstance(self.features, Features):\n            self.features = Features.from_dict(self.features)\n\n    @classmethod\n    def from_dict(cls, post_processed_info_dict: dict) -> \"PostProcessedInfo\":\n        field_names = {f.name for f in dataclasses.fields(cls)}\n        return cls(**{k: v for k, v in post_processed_info_dict.items() if k in field_names})\n\n\n@dataclass\nclass DatasetInfo:\n    \"\"\"Information about a dataset.\n\n    `DatasetInfo` documents datasets, including its name, version, and features.\n    See the constructor arguments and properties for a full list.\n\n    Not all fields are known on construction and may be updated later.\n\n    Attributes:\n        description (`str`):\n            A description of the dataset.\n        citation (`str`):\n            A BibTeX citation of the dataset.\n        homepage (`str`):\n            A URL to the official homepage for the dataset.\n        license (`str`):\n            The dataset's license. 
It can be the name of the license or a paragraph containing the terms of the license.\n        features ([`Features`], *optional*):\n            The features used to specify the dataset's column types.\n        post_processed (`PostProcessedInfo`, *optional*):\n            Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.\n        supervised_keys (`SupervisedKeysData`, *optional*):\n            Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).\n        builder_name (`str`, *optional*):\n            The name of the `GeneratorBasedBuilder` subclass used to create the dataset. It is also the snake_case version of the dataset builder class name.\n        config_name (`str`, *optional*):\n            The name of the configuration derived from [`BuilderConfig`].\n        version (`str` or [`Version`], *optional*):\n            The version of the dataset.\n        splits (`dict`, *optional*):\n            The mapping between split name and metadata.\n        download_checksums (`dict`, *optional*):\n            The mapping between the URL to download the dataset's checksums and corresponding metadata.\n        download_size (`int`, *optional*):\n            The size of the files to download to generate the dataset, in bytes.\n        post_processing_size (`int`, *optional*):\n            Size of the dataset in bytes after post-processing, if any.\n        dataset_size (`int`, *optional*):\n            The combined size in bytes of the Arrow tables for all splits.\n        size_in_bytes (`int`, *optional*):\n            The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).\n        **config_kwargs (additional keyword arguments):\n            Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`].\n    \"\"\"\n\n    # Set in the dataset 
builders\n    description: str = dataclasses.field(default_factory=str)\n    citation: str = dataclasses.field(default_factory=str)\n    homepage: str = dataclasses.field(default_factory=str)\n    license: str = dataclasses.field(default_factory=str)\n    features: Optional[Features] = None\n    post_processed: Optional[PostProcessedInfo] = None\n    supervised_keys: Optional[SupervisedKeysData] = None\n\n    # Set later by the builder\n    builder_name: Optional[str] = None\n    dataset_name: Optional[str] = None  # for packaged builders, to be different from builder_name\n    config_name: Optional[str] = None\n    version: Optional[Union[str, Version]] = None\n    # Set later by `download_and_prepare`\n    splits: Optional[SplitDict] = None\n    download_checksums: Optional[dict] = None\n    download_size: Optional[int] = None\n    post_processing_size: Optional[int] = None\n    dataset_size: Optional[int] = None\n    size_in_bytes: Optional[int] = None\n\n    _INCLUDED_INFO_IN_YAML: ClassVar[list[str]] = [\n        \"config_name\",\n        \"download_size\",\n        \"dataset_size\",\n        \"features\",\n        \"splits\",\n    ]\n\n    def __post_init__(self):\n        # Convert back to the correct classes when we reload from dict\n        if self.features is not None and not isinstance(self.features, Features):\n            self.features = Features.from_dict(self.features)\n        if self.post_processed is not None and not isinstance(self.post_processed, PostProcessedInfo):\n            self.post_processed = PostProcessedInfo.from_dict(self.post_processed)\n        if self.version is not None and not isinstance(self.version, Version):\n            if isinstance(self.version, str):\n                self.version = Version(self.version)\n            else:\n                self.version = Version.from_dict(self.version)\n        if self.splits is not None and not isinstance(self.splits, SplitDict):\n            self.splits = 
SplitDict.from_split_dict(self.splits)\n        if self.supervised_keys is not None and not isinstance(self.supervised_keys, SupervisedKeysData):\n            if isinstance(self.supervised_keys, (tuple, list)):\n                self.supervised_keys = SupervisedKeysData(*self.supervised_keys)\n            else:\n                self.supervised_keys = SupervisedKeysData(**self.supervised_keys)\n\n    def write_to_directory(self, dataset_info_dir, pretty_print=False, storage_options: Optional[dict] = None):\n        \"\"\"Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.\n\n        Args:\n            dataset_info_dir (`str`):\n                Destination directory.\n            pretty_print (`bool`, defaults to `False`):\n                If `True`, the JSON will be pretty-printed with the indent level of 4.\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n\n                <Added version=\"2.9.0\"/>\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.info.write_to_directory(\"/path/to/directory/\")\n        ```\n        \"\"\"\n        fs: fsspec.AbstractFileSystem\n        fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))\n        with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), \"wb\") as f:\n            self._dump_info(f, pretty_print=pretty_print)\n        if self.license:\n            with fs.open(posixpath.join(dataset_info_dir, config.LICENSE_FILENAME), \"wb\") as f:\n                self._dump_license(f)\n\n    def _dump_info(self, file, pretty_print=False):\n        \"\"\"Dump info in `file` file-like object open in bytes mode (to support remote files)\"\"\"\n        file.write(json.dumps(asdict(self), indent=4 if pretty_print else 
None).encode(\"utf-8\"))\n\n    def _dump_license(self, file):\n        \"\"\"Dump license in `file` file-like object open in bytes mode (to support remote files)\"\"\"\n        file.write(self.license.encode(\"utf-8\"))\n\n    @classmethod\n    def from_merge(cls, dataset_infos: list[\"DatasetInfo\"]):\n        dataset_infos = [dset_info.copy() for dset_info in dataset_infos if dset_info is not None]\n\n        if len(dataset_infos) > 0 and all(dataset_infos[0] == dset_info for dset_info in dataset_infos):\n            # if all dataset_infos are equal we don't need to merge. Just return the first.\n            return dataset_infos[0]\n\n        description = \"\\n\\n\".join(unique_values(info.description for info in dataset_infos)).strip()\n        citation = \"\\n\\n\".join(unique_values(info.citation for info in dataset_infos)).strip()\n        homepage = \"\\n\\n\".join(unique_values(info.homepage for info in dataset_infos)).strip()\n        license = \"\\n\\n\".join(unique_values(info.license for info in dataset_infos)).strip()\n        features = None\n        supervised_keys = None\n\n        return cls(\n            description=description,\n            citation=citation,\n            homepage=homepage,\n            license=license,\n            features=features,\n            supervised_keys=supervised_keys,\n        )\n\n    @classmethod\n    def from_directory(cls, dataset_info_dir: str, storage_options: Optional[dict] = None) -> \"DatasetInfo\":\n        \"\"\"Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.\n\n        This function updates all the dynamically generated fields (num_examples,\n        hash, time of creation,...) of the [`DatasetInfo`].\n\n        This will overwrite all previous metadata.\n\n        Args:\n            dataset_info_dir (`str`):\n                The directory containing the metadata file. 
This\n                should be the root directory of a specific dataset version.\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n\n                <Added version=\"2.9.0\"/>\n\n        Example:\n\n        ```py\n        >>> from datasets import DatasetInfo\n        >>> ds_info = DatasetInfo.from_directory(\"/path/to/directory/\")\n        ```\n        \"\"\"\n        fs: fsspec.AbstractFileSystem\n        fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))\n        logger.debug(f\"Loading Dataset info from {dataset_info_dir}\")\n        if not dataset_info_dir:\n            raise ValueError(\"Calling DatasetInfo.from_directory() with undefined dataset_info_dir.\")\n        with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), \"r\", encoding=\"utf-8\") as f:\n            dataset_info_dict = json.load(f)\n        return cls.from_dict(dataset_info_dict)\n\n    @classmethod\n    def from_dict(cls, dataset_info_dict: dict) -> \"DatasetInfo\":\n        field_names = {f.name for f in dataclasses.fields(cls)}\n        return cls(**{k: v for k, v in dataset_info_dict.items() if k in field_names})\n\n    def update(self, other_dataset_info: \"DatasetInfo\", ignore_none=True):\n        self_dict = self.__dict__\n        self_dict.update(\n            **{\n                k: copy.deepcopy(v)\n                for k, v in other_dataset_info.__dict__.items()\n                if (v is not None or not ignore_none)\n            }\n        )\n\n    def copy(self) -> \"DatasetInfo\":\n        return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})\n\n    def _to_yaml_dict(self) -> dict:\n        yaml_dict = {}\n        dataset_info_dict = asdict(self)\n        for key in dataset_info_dict:\n            if key in self._INCLUDED_INFO_IN_YAML:\n                value = getattr(self, key)\n                if hasattr(value, 
\"_to_yaml_list\"):  # Features, SplitDict\n                    yaml_dict[key] = value._to_yaml_list()\n                elif hasattr(value, \"_to_yaml_string\"):  # Version\n                    yaml_dict[key] = value._to_yaml_string()\n                else:\n                    yaml_dict[key] = value\n        return yaml_dict\n\n    @classmethod\n    def _from_yaml_dict(cls, yaml_data: dict) -> \"DatasetInfo\":\n        yaml_data = copy.deepcopy(yaml_data)\n        if yaml_data.get(\"features\") is not None:\n            yaml_data[\"features\"] = Features._from_yaml_list(yaml_data[\"features\"])\n        if yaml_data.get(\"splits\") is not None:\n            yaml_data[\"splits\"] = SplitDict._from_yaml_list(yaml_data[\"splits\"])\n        field_names = {f.name for f in dataclasses.fields(cls)}\n        return cls(**{k: v for k, v in yaml_data.items() if k in field_names})\n\n\nclass DatasetInfosDict(dict[str, DatasetInfo]):\n    def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:\n        total_dataset_infos = {}\n        dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)\n        dataset_readme_path = os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)\n        if not overwrite:\n            total_dataset_infos = self.from_directory(dataset_infos_dir)\n        total_dataset_infos.update(self)\n        if os.path.exists(dataset_infos_path):\n            # for backward compatibility, let's update the JSON file if it exists\n            with open(dataset_infos_path, \"w\", encoding=\"utf-8\") as f:\n                dataset_infos_dict = {\n                    config_name: asdict(dset_info) for config_name, dset_info in total_dataset_infos.items()\n                }\n                json.dump(dataset_infos_dict, f, indent=4 if pretty_print else None)\n        # Dump the infos in the YAML part of the README.md file\n        if os.path.exists(dataset_readme_path):\n            
dataset_card = DatasetCard.load(dataset_readme_path)\n            dataset_card_data = dataset_card.data\n        else:\n            dataset_card = None\n            dataset_card_data = DatasetCardData()\n        if total_dataset_infos:\n            total_dataset_infos.to_dataset_card_data(dataset_card_data)\n            dataset_card = (\n                DatasetCard(\"---\\n\" + str(dataset_card_data) + \"\\n---\\n\") if dataset_card is None else dataset_card\n            )\n            dataset_card.save(Path(dataset_readme_path))\n\n    @classmethod\n    def from_directory(cls, dataset_infos_dir) -> \"DatasetInfosDict\":\n        logger.debug(f\"Loading Dataset Infos from {dataset_infos_dir}\")\n        # Load the info from the YAML part of README.md\n        if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):\n            dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data\n            if \"dataset_info\" in dataset_card_data:\n                return cls.from_dataset_card_data(dataset_card_data)\n        if os.path.exists(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)):\n            # this is just to have backward compatibility with dataset_infos.json files\n            with open(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME), encoding=\"utf-8\") as f:\n                return cls(\n                    {\n                        config_name: DatasetInfo.from_dict(dataset_info_dict)\n                        for config_name, dataset_info_dict in json.load(f).items()\n                    }\n                )\n        else:\n            return cls()\n\n    @classmethod\n    def from_dataset_card_data(cls, dataset_card_data: DatasetCardData) -> \"DatasetInfosDict\":\n        if isinstance(dataset_card_data.get(\"dataset_info\"), (list, dict)):\n            if isinstance(dataset_card_data[\"dataset_info\"], list):\n                return cls(\n                    
{\n                        dataset_info_yaml_dict.get(\"config_name\", \"default\"): DatasetInfo._from_yaml_dict(\n                            dataset_info_yaml_dict\n                        )\n                        for dataset_info_yaml_dict in dataset_card_data[\"dataset_info\"]\n                    }\n                )\n            else:\n                dataset_info = DatasetInfo._from_yaml_dict(dataset_card_data[\"dataset_info\"])\n                dataset_info.config_name = dataset_card_data[\"dataset_info\"].get(\"config_name\", \"default\")\n                return cls({dataset_info.config_name: dataset_info})\n        else:\n            return cls()\n\n    def to_dataset_card_data(self, dataset_card_data: DatasetCardData) -> None:\n        if self:\n            # first get existing metadata info\n            if \"dataset_info\" in dataset_card_data and isinstance(dataset_card_data[\"dataset_info\"], dict):\n                dataset_metadata_infos = {\n                    dataset_card_data[\"dataset_info\"].get(\"config_name\", \"default\"): dataset_card_data[\"dataset_info\"]\n                }\n            elif \"dataset_info\" in dataset_card_data and isinstance(dataset_card_data[\"dataset_info\"], list):\n                dataset_metadata_infos = {\n                    config_metadata[\"config_name\"]: config_metadata\n                    for config_metadata in dataset_card_data[\"dataset_info\"]\n                }\n            else:\n                dataset_metadata_infos = {}\n            # update/rewrite existing metadata info with the one to dump\n            total_dataset_infos = {\n                **dataset_metadata_infos,\n                **{config_name: dset_info._to_yaml_dict() for config_name, dset_info in self.items()},\n            }\n            # the config_name from the dataset_infos_dict takes over the config_name of the DatasetInfo\n            for config_name, dset_info_yaml_dict in total_dataset_infos.items():\n                
dset_info_yaml_dict[\"config_name\"] = config_name\n            if len(total_dataset_infos) == 1:\n                # use a struct instead of a list of configurations, since there's only one\n                dataset_card_data[\"dataset_info\"] = next(iter(total_dataset_infos.values()))\n                config_name = dataset_card_data[\"dataset_info\"].pop(\"config_name\", None)\n                if config_name != \"default\":\n                    # if config_name is not \"default\" preserve it and put at the first position\n                    dataset_card_data[\"dataset_info\"] = {\n                        \"config_name\": config_name,\n                        **dataset_card_data[\"dataset_info\"],\n                    }\n            else:\n                dataset_card_data[\"dataset_info\"] = []\n                for config_name, dataset_info_yaml_dict in sorted(total_dataset_infos.items()):\n                    # add the config_name field in first position\n                    dataset_info_yaml_dict.pop(\"config_name\", None)\n                    dataset_info_yaml_dict = {\"config_name\": config_name, **dataset_info_yaml_dict}\n                    dataset_card_data[\"dataset_info\"].append(dataset_info_yaml_dict)\n"
  },
  {
    "path": "src/datasets/inspect.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"List and inspect datasets.\"\"\"\n\nimport os\nfrom collections.abc import Mapping, Sequence\nfrom typing import Optional, Union\n\nfrom .download.download_config import DownloadConfig\nfrom .download.download_manager import DownloadMode\nfrom .download.streaming_download_manager import StreamingDownloadManager\nfrom .info import DatasetInfo\nfrom .load import (\n    dataset_module_factory,\n    get_dataset_builder_class,\n    load_dataset_builder,\n)\nfrom .utils.logging import get_logger\nfrom .utils.version import Version\n\n\nlogger = get_logger(__name__)\n\n\nclass SplitsNotFoundError(ValueError):\n    pass\n\n\ndef get_dataset_infos(\n    path: str,\n    data_files: Optional[Union[dict, list, str]] = None,\n    download_config: Optional[DownloadConfig] = None,\n    download_mode: Optional[Union[DownloadMode, str]] = None,\n    revision: Optional[Union[str, Version]] = None,\n    token: Optional[Union[bool, str]] = None,\n    **config_kwargs,\n):\n    \"\"\"Get the meta information about a dataset, returned as a dict mapping config name to DatasetInfoDict.\n\n    Args:\n        path (`str`): path to the dataset repository. Can be either:\n\n            - a local path to the dataset directory containing the data files,\n                e.g. 
`'./dataset/squad'`\n            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),\n                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`\n        revision (`Union[str, datasets.Version]`, *optional*):\n            If specified, the dataset module will be loaded from the datasets repository at this version.\n            By default:\n            - it is set to the local version of the lib.\n            - it will also try to load it from the main branch if it's not available at the local version of the lib.\n            Specifying a version that is different from your local version of the lib might cause compatibility issues.\n        download_config ([`DownloadConfig`], *optional*):\n            Specific download configuration parameters.\n        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):\n            Download/generate mode.\n        data_files (`Union[Dict, List, str]`, *optional*):\n            Defining the data_files of the dataset configuration.\n        token (`str` or `bool`, *optional*):\n            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.\n            If `True`, or not specified, will get token from `\"~/.huggingface\"`.\n        **config_kwargs (additional keyword arguments):\n            Optional attributes for builder class which will override the attributes if supplied.\n\n    Example:\n\n    ```py\n    >>> from datasets import get_dataset_infos\n    >>> get_dataset_infos('cornell-movie-review-data/rotten_tomatoes')\n    {'default': DatasetInfo(description=\"Movie Review Dataset.\\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\\nsentences from Rotten Tomatoes movie reviews...), ...}\n    ```\n    \"\"\"\n    config_names = get_dataset_config_names(\n        path=path,\n        revision=revision,\n        download_config=download_config,\n 
       download_mode=download_mode,\n        data_files=data_files,\n        token=token,\n    )\n    return {\n        config_name: get_dataset_config_info(\n            path=path,\n            config_name=config_name,\n            data_files=data_files,\n            download_config=download_config,\n            download_mode=download_mode,\n            revision=revision,\n            token=token,\n            **config_kwargs,\n        )\n        for config_name in config_names\n    }\n\n\ndef get_dataset_config_names(\n    path: str,\n    revision: Optional[Union[str, Version]] = None,\n    download_config: Optional[DownloadConfig] = None,\n    download_mode: Optional[Union[DownloadMode, str]] = None,\n    data_files: Optional[Union[dict, list, str]] = None,\n    **download_kwargs,\n):\n    \"\"\"Get the list of available config names for a particular dataset.\n\n    Args:\n        path (`str`): path to the dataset repository. Can be either:\n\n            - a local path to the dataset directory containing the data files,\n                e.g. `'./dataset/squad'`\n            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),\n                e.g. 
`'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`\n        revision (`Union[str, datasets.Version]`, *optional*):\n            If specified, the dataset module will be loaded from the datasets repository at this version.\n            By default:\n            - it is set to the local version of the lib.\n            - it will also try to load it from the main branch if it's not available at the local version of the lib.\n            Specifying a version that is different from your local version of the lib might cause compatibility issues.\n        download_config ([`DownloadConfig`], *optional*):\n            Specific download configuration parameters.\n        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):\n            Download/generate mode.\n        data_files (`Union[Dict, List, str]`, *optional*):\n            Defining the data_files of the dataset configuration.\n        **download_kwargs (additional keyword arguments):\n            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,\n            for example `token`.\n\n    Example:\n\n    ```py\n    >>> from datasets import get_dataset_config_names\n    >>> get_dataset_config_names(\"nyu-mll/glue\")\n    ['cola',\n     'sst2',\n     'mrpc',\n     'qqp',\n     'stsb',\n     'mnli',\n     'mnli_mismatched',\n     'mnli_matched',\n     'qnli',\n     'rte',\n     'wnli',\n     'ax']\n    ```\n    \"\"\"\n    dataset_module = dataset_module_factory(\n        path,\n        revision=revision,\n        download_config=download_config,\n        download_mode=download_mode,\n        data_files=data_files,\n        **download_kwargs,\n    )\n    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path))\n    return list(builder_cls.builder_configs.keys()) or [\n        dataset_module.builder_kwargs.get(\"config_name\", builder_cls.DEFAULT_CONFIG_NAME or \"default\")\n    ]\n\n\ndef 
get_dataset_default_config_name(\n    path: str,\n    revision: Optional[Union[str, Version]] = None,\n    download_config: Optional[DownloadConfig] = None,\n    download_mode: Optional[Union[DownloadMode, str]] = None,\n    data_files: Optional[Union[dict, list, str]] = None,\n    **download_kwargs,\n) -> Optional[str]:\n    \"\"\"Get the default config name for a particular dataset.\n    Can return None only if the dataset has multiple configurations and no default configuration.\n\n    Args:\n        path (`str`): path to the dataset repository. Can be either:\n\n            - a local path to the dataset directory containing the data files,\n                e.g. `'./dataset/squad'`\n            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),\n                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`\n        revision (`Union[str, datasets.Version]`, *optional*):\n            If specified, the dataset module will be loaded from the datasets repository at this version.\n            By default:\n            - it is set to the local version of the lib.\n            - it will also try to load it from the main branch if it's not available at the local version of the lib.\n            Specifying a version that is different from your local version of the lib might cause compatibility issues.\n        download_config ([`DownloadConfig`], *optional*):\n            Specific download configuration parameters.\n        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):\n            Download/generate mode.\n        data_files (`Union[Dict, List, str]`, *optional*):\n            Defining the data_files of the dataset configuration.\n        **download_kwargs (additional keyword arguments):\n            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,\n            for example 
`token`.\n\n    Returns:\n        Optional[str]: the default config name if there is one\n\n    Example:\n\n    ```py\n    >>> from datasets import get_dataset_default_config_name\n    >>> get_dataset_default_config_name(\"openbookqa\")\n    'main'\n    ```\n    \"\"\"\n    dataset_module = dataset_module_factory(\n        path,\n        revision=revision,\n        download_config=download_config,\n        download_mode=download_mode,\n        data_files=data_files,\n        **download_kwargs,\n    )\n    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path))\n    builder_configs = list(builder_cls.builder_configs.keys())\n    if builder_configs:\n        default_config_name = builder_configs[0] if len(builder_configs) == 1 else None\n    else:\n        default_config_name = \"default\"\n    return builder_cls.DEFAULT_CONFIG_NAME or default_config_name\n\n\ndef get_dataset_config_info(\n    path: str,\n    config_name: Optional[str] = None,\n    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,\n    download_config: Optional[DownloadConfig] = None,\n    download_mode: Optional[Union[DownloadMode, str]] = None,\n    revision: Optional[Union[str, Version]] = None,\n    token: Optional[Union[bool, str]] = None,\n    **config_kwargs,\n) -> DatasetInfo:\n    \"\"\"Get the meta information (DatasetInfo) about a dataset for a particular config\n\n    Args:\n        path (`str`): path to the dataset repository. Can be either:\n\n            - a local path to the dataset directory containing the data files,\n                e.g. `'./dataset/squad'`\n            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),\n                e.g. 
`'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`\n        config_name (:obj:`str`, optional): Defining the name of the dataset configuration.\n        data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).\n        download_config (:class:`~download.DownloadConfig`, optional): Specific download configuration parameters.\n        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.\n        revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load.\n            As datasets have their own git repository on the Datasets Hub, the default version \"main\" corresponds to their \"main\" branch.\n            You can specify a different version than the default \"main\" by using a commit SHA or a git tag of the dataset repository.\n        token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.\n            If True, or not specified, will get token from `\"~/.huggingface\"`.\n        **config_kwargs (additional keyword arguments): optional attributes for builder class which will override the attributes if supplied.\n\n    \"\"\"\n    builder = load_dataset_builder(\n        path,\n        name=config_name,\n        data_files=data_files,\n        download_config=download_config,\n        download_mode=download_mode,\n        revision=revision,\n        token=token,\n        **config_kwargs,\n    )\n    info = builder.info\n    if info.splits is None:\n        download_config = download_config.copy() if download_config else DownloadConfig()\n        if token is not None:\n            download_config.token = token\n        try:\n            info.splits = {\n                split_generator.name: {\"name\": split_generator.name, \"dataset_name\": path}\n                for split_generator in builder._split_generators(\n                    
StreamingDownloadManager(base_path=builder.base_path, download_config=download_config)\n                )\n            }\n        except Exception as err:\n            raise SplitsNotFoundError(\"The split names could not be parsed from the dataset config.\") from err\n    return info\n\n\ndef get_dataset_split_names(\n    path: str,\n    config_name: Optional[str] = None,\n    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,\n    download_config: Optional[DownloadConfig] = None,\n    download_mode: Optional[Union[DownloadMode, str]] = None,\n    revision: Optional[Union[str, Version]] = None,\n    token: Optional[Union[bool, str]] = None,\n    **config_kwargs,\n):\n    \"\"\"Get the list of available splits for a particular config and dataset.\n\n    Args:\n        path (`str`): path to the dataset repository. Can be either:\n\n            - a local path to the dataset directory containing the data files,\n                e.g. `'./dataset/squad'`\n            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),\n                e.g. 
`'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`\n        config_name (`str`, *optional*):\n            Defining the name of the dataset configuration.\n        data_files (`str` or `Sequence` or `Mapping`, *optional*):\n            Path(s) to source data file(s).\n        download_config ([`DownloadConfig`], *optional*):\n            Specific download configuration parameters.\n        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):\n            Download/generate mode.\n        revision ([`Version`] or `str`, *optional*):\n            Version of the dataset to load.\n            As datasets have their own git repository on the Datasets Hub, the default version \"main\" corresponds to their \"main\" branch.\n            You can specify a different version than the default \"main\" by using a commit SHA or a git tag of the dataset repository.\n        token (`str` or `bool`, *optional*):\n            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.\n            If `True`, or not specified, will get token from `\"~/.huggingface\"`.\n        **config_kwargs (additional keyword arguments):\n            Optional attributes for builder class which will override the attributes if supplied.\n\n    Example:\n\n    ```py\n    >>> from datasets import get_dataset_split_names\n    >>> get_dataset_split_names('cornell-movie-review-data/rotten_tomatoes')\n    ['train', 'validation', 'test']\n    ```\n    \"\"\"\n    info = get_dataset_config_info(\n        path,\n        config_name=config_name,\n        data_files=data_files,\n        download_config=download_config,\n        download_mode=download_mode,\n        revision=revision,\n        token=token,\n        **config_kwargs,\n    )\n    return list(info.splits.keys())\n"
  },
  {
    "path": "src/datasets/io/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/io/abc.py",
    "content": "from abc import ABC, abstractmethod\nfrom typing import Optional, Union\n\nfrom .. import Dataset, DatasetDict, Features, IterableDataset, IterableDatasetDict, NamedSplit\nfrom ..utils.typing import NestedDataStructureLike, PathLike\n\n\nclass AbstractDatasetReader(ABC):\n    def __init__(\n        self,\n        path_or_paths: Optional[NestedDataStructureLike[PathLike]] = None,\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        streaming: bool = False,\n        num_proc: Optional[int] = None,\n        **kwargs,\n    ):\n        self.path_or_paths = path_or_paths\n        self.split = split if split or isinstance(path_or_paths, dict) else \"train\"\n        self.features = features\n        self.cache_dir = cache_dir\n        self.keep_in_memory = keep_in_memory\n        self.streaming = streaming\n        self.num_proc = num_proc\n        self.kwargs = kwargs\n\n    @abstractmethod\n    def read(self) -> Union[Dataset, DatasetDict, IterableDataset, IterableDatasetDict]:\n        pass\n\n\nclass AbstractDatasetInputStream(ABC):\n    def __init__(\n        self,\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        streaming: bool = False,\n        num_proc: Optional[int] = None,\n        **kwargs,\n    ):\n        self.features = features\n        self.cache_dir = cache_dir\n        self.keep_in_memory = keep_in_memory\n        self.streaming = streaming\n        self.num_proc = num_proc\n        self.kwargs = kwargs\n\n    @abstractmethod\n    def read(self) -> Union[Dataset, IterableDataset]:\n        pass\n"
  },
  {
    "path": "src/datasets/io/csv.py",
    "content": "import multiprocessing\nimport os\nfrom typing import BinaryIO, Optional, Union\n\nimport fsspec\n\nfrom .. import Dataset, Features, NamedSplit, config\nfrom ..formatting import query_table\nfrom ..packaged_modules.csv.csv import Csv\nfrom ..utils import tqdm as hf_tqdm\nfrom ..utils.typing import NestedDataStructureLike, PathLike\nfrom .abc import AbstractDatasetReader\n\n\nclass CsvDatasetReader(AbstractDatasetReader):\n    def __init__(\n        self,\n        path_or_paths: NestedDataStructureLike[PathLike],\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        streaming: bool = False,\n        num_proc: Optional[int] = None,\n        **kwargs,\n    ):\n        super().__init__(\n            path_or_paths,\n            split=split,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            streaming=streaming,\n            num_proc=num_proc,\n            **kwargs,\n        )\n        path_or_paths = path_or_paths if isinstance(path_or_paths, dict) else {self.split: path_or_paths}\n        self.builder = Csv(\n            cache_dir=cache_dir,\n            data_files=path_or_paths,\n            features=features,\n            **kwargs,\n        )\n\n    def read(self):\n        # Build iterable dataset\n        if self.streaming:\n            dataset = self.builder.as_streaming_dataset(split=self.split)\n        # Build regular (map-style) dataset\n        else:\n            download_config = None\n            download_mode = None\n            verification_mode = None\n            base_path = None\n\n            self.builder.download_and_prepare(\n                download_config=download_config,\n                download_mode=download_mode,\n                verification_mode=verification_mode,\n                base_path=base_path,\n                
num_proc=self.num_proc,\n            )\n            dataset = self.builder.as_dataset(\n                split=self.split, verification_mode=verification_mode, in_memory=self.keep_in_memory\n            )\n        return dataset\n\n\nclass CsvDatasetWriter:\n    def __init__(\n        self,\n        dataset: Dataset,\n        path_or_buf: Union[PathLike, BinaryIO],\n        batch_size: Optional[int] = None,\n        num_proc: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n        **to_csv_kwargs,\n    ):\n        if num_proc is not None and num_proc <= 0:\n            raise ValueError(f\"num_proc {num_proc} must be an integer > 0.\")\n\n        self.dataset = dataset\n        self.path_or_buf = path_or_buf\n        self.batch_size = batch_size if batch_size else config.DEFAULT_MAX_BATCH_SIZE\n        self.num_proc = num_proc\n        self.encoding = \"utf-8\"\n        self.storage_options = storage_options or {}\n        self.to_csv_kwargs = to_csv_kwargs\n\n    def write(self) -> int:\n        _ = self.to_csv_kwargs.pop(\"path_or_buf\", None)\n        header = self.to_csv_kwargs.pop(\"header\", True)\n        index = self.to_csv_kwargs.pop(\"index\", False)\n\n        if isinstance(self.path_or_buf, (str, bytes, os.PathLike)):\n            with fsspec.open(self.path_or_buf, \"wb\", **(self.storage_options or {})) as buffer:\n                written = self._write(file_obj=buffer, header=header, index=index, **self.to_csv_kwargs)\n        else:\n            written = self._write(file_obj=self.path_or_buf, header=header, index=index, **self.to_csv_kwargs)\n        return written\n\n    def _batch_csv(self, args):\n        offset, header, index, to_csv_kwargs = args\n\n        batch = query_table(\n            table=self.dataset.data,\n            key=slice(offset, offset + self.batch_size),\n            indices=self.dataset._indices,\n        )\n        csv_str = batch.to_pandas().to_csv(\n            path_or_buf=None, header=header if (offset 
== 0) else False, index=index, **to_csv_kwargs\n        )\n        return csv_str.encode(self.encoding)\n\n    def _write(self, file_obj: BinaryIO, header, index, **to_csv_kwargs) -> int:\n        \"\"\"Writes the pyarrow table as CSV to a binary file handle.\n\n        Caller is responsible for opening and closing the handle.\n        \"\"\"\n        written = 0\n\n        if self.num_proc is None or self.num_proc == 1:\n            for offset in hf_tqdm(\n                range(0, len(self.dataset), self.batch_size),\n                unit=\"ba\",\n                desc=\"Creating CSV from Arrow format\",\n            ):\n                csv_str = self._batch_csv((offset, header, index, to_csv_kwargs))\n                written += file_obj.write(csv_str)\n\n        else:\n            num_rows, batch_size = len(self.dataset), self.batch_size\n            with multiprocessing.Pool(self.num_proc) as pool:\n                for csv_str in hf_tqdm(\n                    pool.imap(\n                        self._batch_csv,\n                        [(offset, header, index, to_csv_kwargs) for offset in range(0, num_rows, batch_size)],\n                    ),\n                    total=(num_rows // batch_size) + 1 if num_rows % batch_size else num_rows // batch_size,\n                    unit=\"ba\",\n                    desc=\"Creating CSV from Arrow format\",\n                ):\n                    written += file_obj.write(csv_str)\n\n        return written\n"
  },
  {
    "path": "src/datasets/io/generator.py",
    "content": "from typing import Callable, Optional\n\nfrom .. import Features, NamedSplit, Split\nfrom ..packaged_modules.generator.generator import Generator\nfrom .abc import AbstractDatasetInputStream\n\n\nclass GeneratorDatasetInputStream(AbstractDatasetInputStream):\n    def __init__(\n        self,\n        generator: Callable,\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        streaming: bool = False,\n        gen_kwargs: Optional[dict] = None,\n        num_proc: Optional[int] = None,\n        split: NamedSplit = Split.TRAIN,\n        fingerprint: Optional[str] = None,\n        **kwargs,\n    ):\n        super().__init__(\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            streaming=streaming,\n            num_proc=num_proc,\n            **kwargs,\n        )\n        self.builder = Generator(\n            cache_dir=cache_dir,\n            features=features,\n            generator=generator,\n            gen_kwargs=gen_kwargs,\n            split=split,\n            config_id=\"default-fingerprint=\" + fingerprint if fingerprint else None,\n            **kwargs,\n        )\n        self.fingerprint = fingerprint\n\n    def read(self):\n        # Build iterable dataset\n        if self.streaming:\n            dataset = self.builder.as_streaming_dataset(split=self.builder.config.split)\n        # Build regular (map-style) dataset\n        else:\n            download_config = None\n            download_mode = None\n            verification_mode = None\n            base_path = None\n\n            self.builder.download_and_prepare(\n                download_config=download_config,\n                download_mode=download_mode,\n                verification_mode=verification_mode,\n                base_path=base_path,\n                num_proc=self.num_proc,\n            )\n            dataset = 
self.builder.as_dataset(\n                split=self.builder.config.split, verification_mode=verification_mode, in_memory=self.keep_in_memory\n            )\n            if self.fingerprint:\n                dataset._fingerprint = self.fingerprint\n        return dataset\n"
  },
  {
    "path": "src/datasets/io/json.py",
    "content": "import multiprocessing\nimport os\nfrom typing import BinaryIO, Optional, Union\n\nimport fsspec\n\nfrom .. import Dataset, Features, NamedSplit, config\nfrom ..formatting import query_table\nfrom ..packaged_modules.json.json import Json\nfrom ..utils import tqdm as hf_tqdm\nfrom ..utils.typing import NestedDataStructureLike, PathLike\nfrom .abc import AbstractDatasetReader\n\n\nclass JsonDatasetReader(AbstractDatasetReader):\n    def __init__(\n        self,\n        path_or_paths: NestedDataStructureLike[PathLike],\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        streaming: bool = False,\n        field: Optional[str] = None,\n        num_proc: Optional[int] = None,\n        **kwargs,\n    ):\n        super().__init__(\n            path_or_paths,\n            split=split,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            streaming=streaming,\n            num_proc=num_proc,\n            **kwargs,\n        )\n        self.field = field\n        path_or_paths = path_or_paths if isinstance(path_or_paths, dict) else {self.split: path_or_paths}\n        self.builder = Json(\n            cache_dir=cache_dir,\n            data_files=path_or_paths,\n            features=features,\n            field=field,\n            **kwargs,\n        )\n\n    def read(self):\n        # Build iterable dataset\n        if self.streaming:\n            dataset = self.builder.as_streaming_dataset(split=self.split)\n        # Build regular (map-style) dataset\n        else:\n            download_config = None\n            download_mode = None\n            verification_mode = None\n            base_path = None\n\n            self.builder.download_and_prepare(\n                download_config=download_config,\n                download_mode=download_mode,\n                
verification_mode=verification_mode,\n                base_path=base_path,\n                num_proc=self.num_proc,\n            )\n            dataset = self.builder.as_dataset(\n                split=self.split, verification_mode=verification_mode, in_memory=self.keep_in_memory\n            )\n        return dataset\n\n\nclass JsonDatasetWriter:\n    def __init__(\n        self,\n        dataset: Dataset,\n        path_or_buf: Union[PathLike, BinaryIO],\n        batch_size: Optional[int] = None,\n        num_proc: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n        **to_json_kwargs,\n    ):\n        if num_proc is not None and num_proc <= 0:\n            raise ValueError(f\"num_proc {num_proc} must be an integer > 0.\")\n\n        self.dataset = dataset\n        self.path_or_buf = path_or_buf\n        self.batch_size = batch_size if batch_size else config.DEFAULT_MAX_BATCH_SIZE\n        self.num_proc = num_proc\n        self.encoding = \"utf-8\"\n        self.storage_options = storage_options or {}\n        self.to_json_kwargs = to_json_kwargs\n\n    def write(self) -> int:\n        _ = self.to_json_kwargs.pop(\"path_or_buf\", None)\n        orient = self.to_json_kwargs.pop(\"orient\", \"records\")\n        lines = self.to_json_kwargs.pop(\"lines\", True if orient == \"records\" else False)\n        if \"index\" not in self.to_json_kwargs and orient in [\"split\", \"table\"]:\n            self.to_json_kwargs[\"index\"] = False\n\n        # Determine the default compression value based on self.path_or_buf type\n        default_compression = \"infer\" if isinstance(self.path_or_buf, (str, bytes, os.PathLike)) else None\n        compression = self.to_json_kwargs.pop(\"compression\", default_compression)\n\n        if compression not in [None, \"infer\", \"gzip\", \"bz2\", \"xz\"]:\n            raise NotImplementedError(f\"`datasets` currently does not support {compression} compression\")\n\n        if not lines and self.batch_size < 
self.dataset.num_rows:\n            raise NotImplementedError(\n                \"Output JSON will not be formatted correctly when lines = False and batch_size < number of rows in the dataset. Use pandas.DataFrame.to_json() instead.\"\n            )\n\n        if isinstance(self.path_or_buf, (str, bytes, os.PathLike)):\n            with fsspec.open(\n                self.path_or_buf, \"wb\", compression=compression, **(self.storage_options or {})\n            ) as buffer:\n                written = self._write(file_obj=buffer, orient=orient, lines=lines, **self.to_json_kwargs)\n        else:\n            if compression:\n                raise NotImplementedError(\n                    f\"The compression parameter is not supported when writing to a buffer, but compression={compression}\"\n                    \" was passed. Please provide a local path instead.\"\n                )\n            written = self._write(file_obj=self.path_or_buf, orient=orient, lines=lines, **self.to_json_kwargs)\n        return written\n\n    def _batch_json(self, args):\n        offset, orient, lines, to_json_kwargs = args\n\n        batch = query_table(\n            table=self.dataset.data,\n            key=slice(offset, offset + self.batch_size),\n            indices=self.dataset._indices,\n        )\n        json_str = batch.to_pandas().to_json(path_or_buf=None, orient=orient, lines=lines, **to_json_kwargs)\n        if not json_str.endswith(\"\\n\"):\n            json_str += \"\\n\"\n        return json_str.encode(self.encoding)\n\n    def _write(\n        self,\n        file_obj: BinaryIO,\n        orient,\n        lines,\n        **to_json_kwargs,\n    ) -> int:\n        \"\"\"Writes the pyarrow table as JSON lines to a binary file handle.\n\n        Caller is responsible for opening and closing the handle.\n        \"\"\"\n        written = 0\n\n        if self.num_proc is None or self.num_proc == 1:\n            for offset in hf_tqdm(\n                range(0, len(self.dataset), 
self.batch_size),\n                unit=\"ba\",\n                desc=\"Creating json from Arrow format\",\n            ):\n                json_str = self._batch_json((offset, orient, lines, to_json_kwargs))\n                written += file_obj.write(json_str)\n        else:\n            num_rows, batch_size = len(self.dataset), self.batch_size\n            with multiprocessing.Pool(self.num_proc) as pool:\n                for json_str in hf_tqdm(\n                    pool.imap(\n                        self._batch_json,\n                        [(offset, orient, lines, to_json_kwargs) for offset in range(0, num_rows, batch_size)],\n                    ),\n                    total=(num_rows // batch_size) + 1 if num_rows % batch_size else num_rows // batch_size,\n                    unit=\"ba\",\n                    desc=\"Creating json from Arrow format\",\n                ):\n                    written += file_obj.write(json_str)\n\n        return written\n"
  },
  {
    "path": "src/datasets/io/parquet.py",
    "content": "import json\nimport os\nfrom typing import BinaryIO, Optional, Union\n\nimport fsspec\nimport pyarrow.parquet as pq\n\nfrom .. import Dataset, Features, NamedSplit, config\nfrom ..arrow_writer import get_writer_batch_size_from_data_size, get_writer_batch_size_from_features\nfrom ..features.features import require_storage_embed\nfrom ..formatting import query_table\nfrom ..packaged_modules import _PACKAGED_DATASETS_MODULES\nfrom ..packaged_modules.parquet.parquet import Parquet\nfrom ..utils import tqdm as hf_tqdm\nfrom ..utils.typing import NestedDataStructureLike, PathLike\nfrom .abc import AbstractDatasetReader\n\n\nclass ParquetDatasetReader(AbstractDatasetReader):\n    def __init__(\n        self,\n        path_or_paths: NestedDataStructureLike[PathLike],\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        streaming: bool = False,\n        num_proc: Optional[int] = None,\n        **kwargs,\n    ):\n        super().__init__(\n            path_or_paths,\n            split=split,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            streaming=streaming,\n            num_proc=num_proc,\n            **kwargs,\n        )\n        path_or_paths = path_or_paths if isinstance(path_or_paths, dict) else {self.split: path_or_paths}\n        hash = _PACKAGED_DATASETS_MODULES[\"parquet\"][1]\n        self.builder = Parquet(\n            cache_dir=cache_dir,\n            data_files=path_or_paths,\n            features=features,\n            hash=hash,\n            **kwargs,\n        )\n\n    def read(self):\n        # Build iterable dataset\n        if self.streaming:\n            dataset = self.builder.as_streaming_dataset(split=self.split)\n        # Build regular (map-style) dataset\n        else:\n            download_config = None\n            download_mode = None\n      
      verification_mode = None\n            base_path = None\n\n            self.builder.download_and_prepare(\n                download_config=download_config,\n                download_mode=download_mode,\n                verification_mode=verification_mode,\n                base_path=base_path,\n                num_proc=self.num_proc,\n            )\n            dataset = self.builder.as_dataset(\n                split=self.split, verification_mode=verification_mode, in_memory=self.keep_in_memory\n            )\n        return dataset\n\n\nclass ParquetDatasetWriter:\n    def __init__(\n        self,\n        dataset: Dataset,\n        path_or_buf: Union[PathLike, BinaryIO],\n        batch_size: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n        use_content_defined_chunking: Union[bool, dict] = True,\n        write_page_index: bool = True,\n        **parquet_writer_kwargs,\n    ):\n        self.dataset = dataset\n        self.path_or_buf = path_or_buf\n        self.batch_size = (\n            batch_size\n            or get_writer_batch_size_from_features(dataset.features)\n            or get_writer_batch_size_from_data_size(len(dataset), dataset._estimate_nbytes())\n        )\n        self.storage_options = storage_options or {}\n        self.parquet_writer_kwargs = parquet_writer_kwargs\n        if use_content_defined_chunking is True:\n            use_content_defined_chunking = config.DEFAULT_CDC_OPTIONS\n        self.use_content_defined_chunking = use_content_defined_chunking\n        self.write_page_index = write_page_index\n\n    def write(self) -> int:\n        if isinstance(self.path_or_buf, (str, bytes, os.PathLike)):\n            with fsspec.open(self.path_or_buf, \"wb\", **(self.storage_options or {})) as buffer:\n                written = self._write(\n                    file_obj=buffer,\n                    batch_size=self.batch_size,\n                    **self.parquet_writer_kwargs,\n                )\n        else:\n  
          written = self._write(\n                file_obj=self.path_or_buf,\n                batch_size=self.batch_size,\n                **self.parquet_writer_kwargs,\n            )\n        return written\n\n    def _write(self, file_obj: BinaryIO, batch_size: int, **parquet_writer_kwargs) -> int:\n        \"\"\"Writes the pyarrow table as Parquet to a binary file handle.\n\n        Caller is responsible for opening and closing the handle.\n        \"\"\"\n        written = 0\n        _ = parquet_writer_kwargs.pop(\"path_or_buf\", None)\n        schema = self.dataset.features.arrow_schema\n\n        writer = pq.ParquetWriter(\n            file_obj,\n            schema=schema,\n            use_content_defined_chunking=self.use_content_defined_chunking,\n            write_page_index=self.write_page_index,\n            compression={\n                col: \"none\" if require_storage_embed(feature) else \"snappy\"\n                for col, feature in self.dataset.features.items()\n            },\n            use_dictionary=[\n                col for col, feature in self.dataset.features.items() if not require_storage_embed(feature)\n            ],\n            column_encoding={\n                col: \"PLAIN\" for col, feature in self.dataset.features.items() if require_storage_embed(feature)\n            },\n            **parquet_writer_kwargs,\n        )\n\n        for offset in hf_tqdm(\n            range(0, len(self.dataset), batch_size),\n            unit=\"ba\",\n            desc=\"Creating parquet from Arrow format\",\n        ):\n            batch = query_table(\n                table=self.dataset._data,\n                key=slice(offset, offset + batch_size),\n                indices=self.dataset._indices,\n            )\n            writer.write_table(batch)\n            written += batch.nbytes\n\n        # TODO(kszucs): we may want to persist multiple parameters\n        if self.use_content_defined_chunking is not False:\n            
writer.add_key_value_metadata({\"content_defined_chunking\": json.dumps(self.use_content_defined_chunking)})\n\n        writer.close()\n        return written\n"
  },
  {
    "path": "src/datasets/io/spark.py",
    "content": "from typing import Optional\n\nimport pyspark\n\nfrom .. import Features, NamedSplit\nfrom ..download import DownloadMode\nfrom ..packaged_modules.spark.spark import Spark\nfrom .abc import AbstractDatasetReader\n\n\nclass SparkDatasetReader(AbstractDatasetReader):\n    \"\"\"A dataset reader that reads from a Spark DataFrame.\n\n    When caching, cache materialization is parallelized over Spark; an NFS that is accessible to the driver must be\n    provided. Streaming is not currently supported.\n    \"\"\"\n\n    def __init__(\n        self,\n        df: pyspark.sql.DataFrame,\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        streaming: bool = True,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        working_dir: str = None,\n        load_from_cache_file: bool = True,\n        file_format: str = \"arrow\",\n        **kwargs,\n    ):\n        super().__init__(\n            split=split,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            streaming=streaming,\n            **kwargs,\n        )\n        self._load_from_cache_file = load_from_cache_file\n        self._file_format = file_format\n        self.builder = Spark(\n            df=df,\n            features=features,\n            cache_dir=cache_dir,\n            working_dir=working_dir,\n            **kwargs,\n        )\n\n    def read(self):\n        if self.streaming:\n            return self.builder.as_streaming_dataset(split=self.split)\n        download_mode = None if self._load_from_cache_file else DownloadMode.FORCE_REDOWNLOAD\n        self.builder.download_and_prepare(\n            download_mode=download_mode,\n            file_format=self._file_format,\n        )\n        return self.builder.as_dataset(split=self.split)\n"
  },
  {
    "path": "src/datasets/io/sql.py",
    "content": "import multiprocessing\nfrom typing import TYPE_CHECKING, Optional, Union\n\nfrom .. import Dataset, Features, config\nfrom ..formatting import query_table\nfrom ..packaged_modules.sql.sql import Sql\nfrom ..utils import tqdm as hf_tqdm\nfrom .abc import AbstractDatasetInputStream\n\n\nif TYPE_CHECKING:\n    import sqlite3\n\n    import sqlalchemy\n\n\nclass SqlDatasetReader(AbstractDatasetInputStream):\n    def __init__(\n        self,\n        sql: Union[str, \"sqlalchemy.sql.Selectable\"],\n        con: Union[str, \"sqlalchemy.engine.Connection\", \"sqlalchemy.engine.Engine\", \"sqlite3.Connection\"],\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        **kwargs,\n    ):\n        super().__init__(features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs)\n        self.builder = Sql(\n            cache_dir=cache_dir,\n            features=features,\n            sql=sql,\n            con=con,\n            **kwargs,\n        )\n\n    def read(self):\n        download_config = None\n        download_mode = None\n        verification_mode = None\n        base_path = None\n\n        self.builder.download_and_prepare(\n            download_config=download_config,\n            download_mode=download_mode,\n            verification_mode=verification_mode,\n            base_path=base_path,\n        )\n\n        # Build dataset for splits\n        dataset = self.builder.as_dataset(\n            split=\"train\", verification_mode=verification_mode, in_memory=self.keep_in_memory\n        )\n        return dataset\n\n\nclass SqlDatasetWriter:\n    def __init__(\n        self,\n        dataset: Dataset,\n        name: str,\n        con: Union[str, \"sqlalchemy.engine.Connection\", \"sqlalchemy.engine.Engine\", \"sqlite3.Connection\"],\n        batch_size: Optional[int] = None,\n        num_proc: Optional[int] = None,\n        **to_sql_kwargs,\n    ):\n        if 
num_proc is not None and num_proc <= 0:\n            raise ValueError(f\"num_proc {num_proc} must be an integer > 0.\")\n\n        self.dataset = dataset\n        self.name = name\n        self.con = con\n        self.batch_size = batch_size if batch_size else config.DEFAULT_MAX_BATCH_SIZE\n        self.num_proc = num_proc\n        self.to_sql_kwargs = to_sql_kwargs\n\n    def write(self) -> int:\n        _ = self.to_sql_kwargs.pop(\"sql\", None)\n        _ = self.to_sql_kwargs.pop(\"con\", None)\n        index = self.to_sql_kwargs.pop(\"index\", False)\n\n        written = self._write(index=index, **self.to_sql_kwargs)\n        return written\n\n    def _batch_sql(self, args):\n        offset, index, to_sql_kwargs = args\n        to_sql_kwargs = {**to_sql_kwargs, \"if_exists\": \"append\"} if offset > 0 else to_sql_kwargs\n        batch = query_table(\n            table=self.dataset.data,\n            key=slice(offset, offset + self.batch_size),\n            indices=self.dataset._indices,\n        )\n        df = batch.to_pandas()\n        num_rows = df.to_sql(self.name, self.con, index=index, **to_sql_kwargs)\n        return num_rows or len(df)\n\n    def _write(self, index, **to_sql_kwargs) -> int:\n        \"\"\"Writes the pyarrow table as SQL to a database.\n\n        Caller is responsible for opening and closing the SQL connection.\n        \"\"\"\n        written = 0\n\n        if self.num_proc is None or self.num_proc == 1:\n            for offset in hf_tqdm(\n                range(0, len(self.dataset), self.batch_size),\n                unit=\"ba\",\n                desc=\"Creating SQL from Arrow format\",\n            ):\n                written += self._batch_sql((offset, index, to_sql_kwargs))\n        else:\n            num_rows, batch_size = len(self.dataset), self.batch_size\n            with multiprocessing.Pool(self.num_proc) as pool:\n                for num_rows in hf_tqdm(\n                    pool.imap(\n                        
self._batch_sql,\n                        [(offset, index, to_sql_kwargs) for offset in range(0, num_rows, batch_size)],\n                    ),\n                    total=(num_rows // batch_size) + 1 if num_rows % batch_size else num_rows // batch_size,\n                    unit=\"ba\",\n                    desc=\"Creating SQL from Arrow format\",\n                ):\n                    written += num_rows\n\n        return written\n"
  },
  {
    "path": "src/datasets/io/text.py",
    "content": "from typing import Optional\n\nfrom .. import Features, NamedSplit\nfrom ..packaged_modules.text.text import Text\nfrom ..utils.typing import NestedDataStructureLike, PathLike\nfrom .abc import AbstractDatasetReader\n\n\nclass TextDatasetReader(AbstractDatasetReader):\n    def __init__(\n        self,\n        path_or_paths: NestedDataStructureLike[PathLike],\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        cache_dir: str = None,\n        keep_in_memory: bool = False,\n        streaming: bool = False,\n        num_proc: Optional[int] = None,\n        **kwargs,\n    ):\n        super().__init__(\n            path_or_paths,\n            split=split,\n            features=features,\n            cache_dir=cache_dir,\n            keep_in_memory=keep_in_memory,\n            streaming=streaming,\n            num_proc=num_proc,\n            **kwargs,\n        )\n        path_or_paths = path_or_paths if isinstance(path_or_paths, dict) else {self.split: path_or_paths}\n        self.builder = Text(\n            cache_dir=cache_dir,\n            data_files=path_or_paths,\n            features=features,\n            **kwargs,\n        )\n\n    def read(self):\n        # Build iterable dataset\n        if self.streaming:\n            dataset = self.builder.as_streaming_dataset(split=self.split)\n        # Build regular (map-style) dataset\n        else:\n            download_config = None\n            download_mode = None\n            verification_mode = None\n            base_path = None\n\n            self.builder.download_and_prepare(\n                download_config=download_config,\n                download_mode=download_mode,\n                verification_mode=verification_mode,\n                base_path=base_path,\n                num_proc=self.num_proc,\n            )\n            dataset = self.builder.as_dataset(\n                split=self.split, verification_mode=verification_mode, 
in_memory=self.keep_in_memory\n            )\n        return dataset\n"
  },
  {
    "path": "src/datasets/iterable_dataset.py",
    "content": "import asyncio\nimport contextlib\nimport copy\nimport inspect\nimport itertools\nimport multiprocessing.pool\nimport re\nimport sys\nimport tempfile\nfrom collections import Counter\nfrom collections.abc import Iterable, Iterator\nfrom copy import deepcopy\nfrom dataclasses import dataclass\nfrom functools import partial\nfrom itertools import cycle, islice\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, BinaryIO, Callable, Optional, Union\n\nimport fsspec.asyn\nimport multiprocess as mp\nimport numpy as np\nimport pandas as pd\nimport pyarrow as pa\nimport pyarrow.dataset as pds\nimport pyarrow.parquet as pq\nfrom huggingface_hub import (\n    CommitInfo,\n    CommitOperationAdd,\n    HfApi,\n    HfFileSystem,\n    HfFileSystemResolvedPath,\n)\nfrom huggingface_hub.utils import RepositoryNotFoundError\nfrom packaging import version\n\nfrom . import config\nfrom .arrow_dataset import Dataset, DatasetInfoMixin, _push_to_bucket, _push_to_repo\nfrom .features import Features\nfrom .features.features import (\n    FeatureType,\n    List,\n    Value,\n    _align_features,\n    _check_if_features_can_be_aligned,\n    _fix_for_backward_compatible_features,\n    _visit,\n    cast_to_python_objects,\n    require_decoding,\n)\nfrom .formatting import (\n    ArrowFormatter,\n    PythonFormatter,\n    TableFormatter,\n    TensorFormatter,\n    get_format_type_from_alias,\n    get_formatter,\n)\nfrom .info import DatasetInfo\nfrom .naming import _split_re\nfrom .splits import NamedSplit, Split, SplitInfo\nfrom .table import cast_table_to_features, embed_table_storage, read_schema_from_file, table_cast\nfrom .utils import tqdm as hf_tqdm\nfrom .utils.logging import get_logger\nfrom .utils.py_utils import (\n    Literal,\n    convert_file_size_to_int,\n    iflatmap_unordered,\n)\nfrom .utils.sharding import _merge_gen_kwargs, _number_of_shards_in_gen_kwargs, _shuffle_gen_kwargs, _split_gen_kwargs\nfrom .utils.typing import PathLike\n\n\nif 
config.HF_HUB_VERSION >= version.parse(\"1.6.0\"):\n    from huggingface_hub.errors import BucketNotFoundError\n    from huggingface_hub.hf_file_system import HfFileSystemResolvedBucketPath, HfFileSystemResolvedRepositoryPath\n\nelse:\n    BucketNotFoundError = None\n    HfFileSystemResolvedBucketPath = None\n    HfFileSystemResolvedRepositoryPath = HfFileSystemResolvedPath\n\nif TYPE_CHECKING:\n    import sqlite3\n\n    import polars as pl\n    import sqlalchemy\n    import torch\n\n    from .builder import Key as BuilderKey\n\nlogger = get_logger(__name__)\n\nKey = Union[int, str, tuple[int, int], \"BuilderKey\"]\n\n\ndef identity_func(x):\n    return x\n\n\ndef _rename_columns_fn(example: dict, column_mapping: dict[str, str]):\n    if any(col not in example for col in column_mapping):\n        raise ValueError(\n            f\"Error when renaming {list(column_mapping)} to {list(column_mapping.values())}: columns {set(column_mapping) - set(example)} are not in the dataset.\"\n        )\n    if any(col in example for col in column_mapping.values()):\n        raise ValueError(\n            f\"Error when renaming {list(column_mapping)} to {list(column_mapping.values())}: columns {set(example) - set(column_mapping.values())} are already in the dataset.\"\n        )\n    return {\n        new_column_name: example[original_column_name]\n        for original_column_name, new_column_name in column_mapping.items()\n    }\n\n\ndef add_column_fn(example: dict, idx: int, name: str, column: list[dict]):\n    if name in example:\n        raise ValueError(f\"Error when adding {name}: column {name} is already in the dataset.\")\n    return {name: column[idx]}\n\n\ndef _infer_features_from_batch(batch: dict[str, list], try_features: Optional[Features] = None) -> Features:\n    pa_table = pa.Table.from_pydict(batch)\n    if try_features is not None:\n        try:\n            pa_table = table_cast(pa_table, pa.schema(try_features.type))\n        except (TypeError, pa.ArrowInvalid, 
pa.ArrowNotImplementedError):\n            pass\n    return Features.from_arrow_schema(pa_table.schema)\n\n\ndef _examples_to_batch(examples: list[dict[str, Any]]) -> dict[str, list]:\n    # we order the columns by order of appearance\n    # to do so, we use a dict as an ordered set\n    cols = {col: None for example in examples for col in example}\n    # when an example is missing a column, we set the value to None with .get()\n    arrays = [[example.get(col) for example in examples] for col in cols]\n    return dict(zip(cols, arrays))\n\n\ndef _batch_to_examples(batch: dict[str, list]) -> Iterator[dict[str, Any]]:\n    \"\"\"Convert a batch (dict of examples) to examples list\"\"\"\n    n_examples = 0 if len(batch) == 0 else len(batch[next(iter(batch))])\n    for i in range(n_examples):\n        yield {col: array[i] for col, array in batch.items()}\n\n\ndef _convert_to_arrow(\n    iterable: Iterable[tuple[Key, dict]],\n    batch_size: int,\n    drop_last_batch: bool = False,\n) -> Iterator[tuple[Key, pa.Table]]:\n    \"\"\"Convert and group examples in Arrow tables of size `batch_size`.\n\n    Args:\n        iterable (`Iterable[Tuple[Key, dict]]`):\n            An examples iterable containing tuples (example_key, example) of type (int/str, dict)\n        batch_size (`Optional[int]`):\n            Size of each sub-table to yield. 
If None or <= 0, yields the full table.\n        drop_last_batch (`bool`, defaults to `False`):\n            Drop the last batch if it is smaller than `batch_size`.\n    \"\"\"\n    if batch_size is None or batch_size <= 0:\n        yield (\n            \"all\",\n            pa.Table.from_pylist(cast_to_python_objects([example for _, example in iterable], only_1d_for_numpy=True)),\n        )\n        return\n    iterator = iter(iterable)\n    for key, example in iterator:\n        iterator_batch = islice(iterator, batch_size - 1)\n        key_examples_list = [(key, example)] + list(iterator_batch)\n        if len(key_examples_list) < batch_size and drop_last_batch:\n            return\n        keys, examples = zip(*key_examples_list)\n        new_key = \"_\".join(str(key) for key in keys)\n        yield new_key, pa.Table.from_pylist(cast_to_python_objects(examples, only_1d_for_numpy=True))\n\n\ndef shift_ex_examples_rngs(ex_iterable: \"_BaseExamplesIterable\", value: int) -> \"_BaseExamplesIterable\":\n    \"\"\"We need to go through the ex_iterables recursively, create a new seed and return a new iterable, then set it to the containing ex_iterable.\"\"\"\n\n    def set_seed_recursively(ex_iterable):\n        if hasattr(ex_iterable, \"shift_rngs\"):\n            ex_iterable = ex_iterable.shift_rngs(value)\n        if hasattr(ex_iterable, \"ex_iterable\"):\n            ex_iterable.ex_iterable = set_seed_recursively(ex_iterable.ex_iterable)\n        if hasattr(ex_iterable, \"ex_iterables\"):\n            ex_iterable.ex_iterables = [set_seed_recursively(ei) for ei in ex_iterable.ex_iterables]\n        return ex_iterable\n\n    return set_seed_recursively(ex_iterable)\n\n\nclass _BaseExamplesIterable:\n    \"\"\"Base class for the examples iterable used by an IterableDataset\"\"\"\n\n    def __init__(self) -> None:\n        self._state_dict: Optional[Union[list, dict]] = None\n\n    def __iter__(self) -> Iterator[tuple[Key, dict]]:\n        \"\"\"An examples iterable 
should yield tuples (example_key, example) of type (int/str, dict)\"\"\"\n        raise NotImplementedError(f\"{type(self)} doesn't implement __iter__ yet\")\n\n    @property\n    def iter_arrow(self) -> Optional[Callable[[], Iterator[tuple[Key, pa.Table]]]]:\n        return None\n\n    @property\n    def is_typed(self) -> bool:\n        return False\n\n    @property\n    def features(self) -> Optional[Features]:\n        return None\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"_BaseExamplesIterable\":\n        \"\"\"\n        Either shuffle the shards/sources of the dataset, or propagate the shuffling to the underlying iterable.\n        If the order of the shards must stay fixed (when using .skip or .take for example), then this method returns self.\n        \"\"\"\n        raise NotImplementedError(f\"{type(self)} doesn't implement shuffle_data_sources yet\")\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"_BaseExamplesIterable\":\n        \"\"\"Either keep only the requested shard, or propagate the request to the underlying iterable.\"\"\"\n        raise NotImplementedError(f\"{type(self)} doesn't implement shard_data_sources yet\")\n\n    def reshard_data_sources(self) -> \"_BaseExamplesIterable\":\n        \"\"\"\n        Either reshard the shards/sources of the dataset, i.e. 
further split the current shards into more shards,\n        or propagate the resharding to the underlying iterable.\n        If the examples iterable can't be further resharded, then this method returns self.\n        \"\"\"\n        raise NotImplementedError(f\"{type(self)} doesn't implement reshard_data_sources yet\")\n\n    def split_shard_indices_by_worker(self, num_shards: int, index: int, contiguous=True) -> list[int]:\n        if contiguous:\n            div = self.num_shards // num_shards\n            mod = self.num_shards % num_shards\n            start = div * index + min(index, mod)\n            end = start + div + (1 if index < mod else 0)\n            return list(range(start, end))\n        else:\n            return list(range(index, self.num_shards, num_shards))\n\n    @property\n    def num_shards(self) -> int:\n        raise NotImplementedError(f\"{type(self)} doesn't implement num_shards yet\")\n\n    def _init_state_dict(self) -> dict:\n        raise NotImplementedError(f\"{type(self)} doesn't implement _init_state_dict yet\")\n\n    def load_state_dict(self, state_dict: dict) -> dict:\n        def _inner_load_state_dict(state, new_state):\n            if new_state is not None and isinstance(state, dict):\n                for key in new_state:\n                    state[key] = _inner_load_state_dict(state[key], new_state[key])\n                return state\n            elif new_state is not None and isinstance(state, list):\n                for i in range(len(state)):\n                    state[i] = _inner_load_state_dict(state[i], new_state[i])\n                return state\n            return new_state\n\n        return _inner_load_state_dict(self._state_dict, state_dict)\n\n    def state_dict(self) -> dict:\n        if self._state_dict:\n            return copy.deepcopy(self._state_dict)\n        raise RuntimeError(\"State dict is not initialized, please call ex_iterable._init_state_dict() first.\")\n\n\nclass 
ExamplesIterable(_BaseExamplesIterable):\n    def __init__(\n        self,\n        generate_examples_fn: Callable[..., Iterator[tuple[Key, dict]]],\n        kwargs: dict,\n        generate_more_kwargs_fn: Optional[Callable[..., Iterator[dict]]] = None,\n    ):\n        super().__init__()\n        self.generate_examples_fn = generate_examples_fn\n        self.kwargs = kwargs\n\n        # for resharding\n        self.generate_more_kwargs_fn = generate_more_kwargs_fn\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = {\"shard_idx\": 0, \"shard_example_idx\": 0, \"type\": self.__class__.__name__}\n        return self._state_dict\n\n    def __iter__(self):\n        shard_idx_start = self._state_dict[\"shard_idx\"] if self._state_dict else 0\n        for gen_kwargs in islice(_split_gen_kwargs(self.kwargs, max_num_jobs=self.num_shards), shard_idx_start, None):\n            shard_example_idx_start = self._state_dict[\"shard_example_idx\"] if self._state_dict else 0\n            for key_example in islice(self.generate_examples_fn(**gen_kwargs), shard_example_idx_start, None):\n                if self._state_dict:\n                    self._state_dict[\"shard_example_idx\"] += 1\n                yield key_example\n            if self._state_dict:\n                self._state_dict[\"shard_idx\"] += 1\n                self._state_dict[\"shard_example_idx\"] = 0\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"ExamplesIterable\":\n        return ExamplesIterable(\n            self.generate_examples_fn,\n            _shuffle_gen_kwargs(copy.deepcopy(generator), self.kwargs),\n            self.generate_more_kwargs_fn,\n        )\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"ExamplesIterable\":\n        \"\"\"Keep only the requested shard.\"\"\"\n        gen_kwargs_list = _split_gen_kwargs(self.kwargs, max_num_jobs=self.num_shards)\n        shard_indices = 
self.split_shard_indices_by_worker(num_shards, index, contiguous=contiguous)\n        requested_gen_kwargs = _merge_gen_kwargs([gen_kwargs_list[i] for i in shard_indices])\n        return ExamplesIterable(self.generate_examples_fn, requested_gen_kwargs, self.generate_more_kwargs_fn)\n\n    def reshard_data_sources(self) -> \"ExamplesIterable\":\n        \"\"\"Split shars into more shards if possible.\"\"\"\n        if not self.generate_more_kwargs_fn:\n            return ExamplesIterable(self.generate_examples_fn, self.kwargs, self.generate_more_kwargs_fn)\n        gen_kwargs_list = _split_gen_kwargs(self.kwargs, max_num_jobs=self.num_shards)\n        new_gen_kwargs = _merge_gen_kwargs(\n            [\n                new_gen_kwargs\n                for gen_kwargs in gen_kwargs_list\n                for new_gen_kwargs in self.generate_more_kwargs_fn(**gen_kwargs)\n            ]\n        )\n        return ExamplesIterable(self.generate_examples_fn, new_gen_kwargs, self.generate_more_kwargs_fn)\n\n    @property\n    def num_shards(self) -> int:\n        return _number_of_shards_in_gen_kwargs(self.kwargs)\n\n\nclass ArrowExamplesIterable(_BaseExamplesIterable):\n    def __init__(\n        self,\n        generate_tables_fn: Callable[..., Iterator[tuple[Key, pa.Table]]],\n        kwargs: dict,\n        generate_more_kwargs_fn: Optional[Callable[..., Iterator[dict]]] = None,\n    ):\n        super().__init__()\n        self.generate_tables_fn = generate_tables_fn\n        self.kwargs = kwargs\n\n        # for resharding\n        self.generate_more_kwargs_fn = generate_more_kwargs_fn\n\n    @property\n    def iter_arrow(self):\n        return self._iter_arrow\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = {\"shard_idx\": 0, \"shard_example_idx\": 0, \"type\": self.__class__.__name__}\n        return self._state_dict\n\n    def __iter__(self):\n        formatter = PythonFormatter()\n        shard_idx_start = self._state_dict[\"shard_idx\"] if 
self._state_dict else 0\n        for gen_kwags in islice(_split_gen_kwargs(self.kwargs, max_num_jobs=self.num_shards), shard_idx_start, None):\n            shard_example_idx_start = self._state_dict[\"shard_example_idx\"] if self._state_dict else 0\n            shard_example_idx = 0\n            for key, pa_table in self.generate_tables_fn(**gen_kwags):\n                if shard_example_idx + len(pa_table) <= shard_example_idx_start:\n                    shard_example_idx += len(pa_table)\n                    continue\n                for pa_subtable in pa_table.to_reader(max_chunksize=config.ARROW_READER_BATCH_SIZE_IN_DATASET_ITER):\n                    formatted_batch = formatter.format_batch(pa_subtable)\n                    for example in _batch_to_examples(formatted_batch):\n                        if shard_example_idx >= shard_example_idx_start:\n                            if self._state_dict:\n                                self._state_dict[\"shard_example_idx\"] += 1\n                            yield key, example\n                        shard_example_idx += 1\n            if self._state_dict:\n                self._state_dict[\"shard_idx\"] += 1\n                self._state_dict[\"shard_example_idx\"] = 0\n\n    def _iter_arrow(self):\n        shard_idx_start = self._state_dict[\"shard_idx\"] if self._state_dict else 0\n        for gen_kwags in islice(_split_gen_kwargs(self.kwargs, max_num_jobs=self.num_shards), shard_idx_start, None):\n            shard_example_idx_start = self._state_dict[\"shard_example_idx\"] if self._state_dict else 0\n            shard_example_idx = 0\n            for key, pa_table in self.generate_tables_fn(**gen_kwags):\n                shard_example_idx += len(pa_table)\n                if shard_example_idx <= shard_example_idx_start:\n                    continue\n                if self._state_dict:\n                    self._state_dict[\"shard_example_idx\"] += len(pa_table)\n                yield key, pa_table\n            
if self._state_dict:\n                self._state_dict[\"shard_idx\"] += 1\n                self._state_dict[\"shard_example_idx\"] = 0\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"ArrowExamplesIterable\":\n        return ArrowExamplesIterable(\n            self.generate_tables_fn, _shuffle_gen_kwargs(copy.deepcopy(generator), self.kwargs), generator\n        )\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"ArrowExamplesIterable\":\n        \"\"\"Keep only the requested shard.\"\"\"\n        gen_kwargs_list = _split_gen_kwargs(self.kwargs, max_num_jobs=self.num_shards)\n        shard_indices = self.split_shard_indices_by_worker(num_shards, index, contiguous=contiguous)\n        requested_gen_kwargs = _merge_gen_kwargs([gen_kwargs_list[i] for i in shard_indices])\n        return ArrowExamplesIterable(self.generate_tables_fn, requested_gen_kwargs)\n\n    def reshard_data_sources(self) -> \"ArrowExamplesIterable\":\n        \"\"\"Split shars into more shards if possible.\"\"\"\n        if not self.generate_more_kwargs_fn:\n            return ArrowExamplesIterable(self.generate_tables_fn, self.kwargs, self.generate_more_kwargs_fn)\n        gen_kwargs_list = _split_gen_kwargs(self.kwargs, max_num_jobs=self.num_shards)\n        new_gen_kwargs = _merge_gen_kwargs(\n            [\n                new_gen_kwargs\n                for gen_kwargs in gen_kwargs_list\n                for new_gen_kwargs in self.generate_more_kwargs_fn(**gen_kwargs)\n            ]\n        )\n        return ArrowExamplesIterable(self.generate_tables_fn, new_gen_kwargs, self.generate_more_kwargs_fn)\n\n    @property\n    def num_shards(self) -> int:\n        return _number_of_shards_in_gen_kwargs(self.kwargs)\n\n\nclass RebatchedArrowExamplesIterable(_BaseExamplesIterable):\n    def __init__(\n        self,\n        ex_iterable: _BaseExamplesIterable,\n        batch_size: Optional[int],\n        drop_last_batch: bool = False,\n   
     force_convert_to_arrow: bool = False,\n    ):\n        super().__init__()\n        self.ex_iterable = ex_iterable\n        self.batch_size = batch_size\n        self.drop_last_batch = drop_last_batch\n        self.force_convert_to_arrow = force_convert_to_arrow\n\n    @property\n    def iter_arrow(self):\n        return self._iter_arrow if self.ex_iterable.iter_arrow or self.force_convert_to_arrow else None\n\n    @property\n    def is_typed(self):\n        return self.ex_iterable.is_typed\n\n    @property\n    def features(self):\n        return self.ex_iterable.features\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = {\n            \"examples_iterable\": self.ex_iterable._init_state_dict(),\n            \"previous_state\": None,\n            \"batch_idx\": 0,\n            \"num_chunks_since_previous_state\": 0,\n            \"cropped_chunk_length\": 0,\n            \"type\": self.__class__.__name__,\n        }\n        return self._state_dict\n\n    def __iter__(self):\n        yield from self.ex_iterable\n\n    def _iter_arrow(self) -> Iterator[tuple[Key, pa.Table]]:\n        \"\"\"Iterate over sub-tables of size `batch_size`.\"\"\"\n        if self._state_dict and self._state_dict[\"previous_state\"]:\n            self.ex_iterable.load_state_dict(self._state_dict[\"previous_state\"])\n        if self.ex_iterable.iter_arrow:\n            iterator = self.ex_iterable.iter_arrow()\n        elif self.force_convert_to_arrow:\n            iterator = _convert_to_arrow(self.ex_iterable, batch_size=1)\n        else:\n            raise RuntimeError(\n                \"_iter_arrow is not available in RebatchedArrowExamplesIterable, use an examples iterable that implements _iter_arrow() or pass force_convert_to_arrow=True\"\n            )\n        if self.batch_size is None or self.batch_size <= 0:\n            if self._state_dict and self._state_dict[\"batch_idx\"] > 0:\n                return\n            all_pa_table = 
pa.concat_tables([pa_table for _, pa_table in iterator])\n            if self._state_dict:\n                self._state_dict[\"batch_idx\"] = 1\n            yield \"all\", all_pa_table\n            return\n        keys_buffer = []\n        chunks_buffer = []\n        chunks_buffer_size = 0\n        num_chunks_to_skip = self._state_dict[\"num_chunks_since_previous_state\"] if self._state_dict else 0\n        chunk_length_to_crop = self._state_dict[\"cropped_chunk_length\"] if self._state_dict else 0\n        if self._state_dict:\n            previous_state = self.ex_iterable.state_dict()\n            self._state_dict[\"previous_state\"] = previous_state\n        for key, pa_table in iterator:\n            for num_chunks_since_previous_state, chunk in enumerate(pa_table.to_reader(max_chunksize=self.batch_size)):\n                if num_chunks_to_skip > 1:\n                    num_chunks_to_skip -= 1\n                    continue\n                elif num_chunks_to_skip == 1 and chunk_length_to_crop == 0:\n                    num_chunks_to_skip -= 1\n                    continue\n                elif num_chunks_to_skip == 1 and chunk_length_to_crop > 0:\n                    chunk = chunk.slice(chunk_length_to_crop, len(chunk) - chunk_length_to_crop)\n                    num_chunks_to_skip = 0\n                    chunk_length_to_crop = 0\n                if len(chunk) == 0:\n                    continue\n\n                if chunks_buffer_size + len(chunk) < self.batch_size:\n                    keys_buffer.append(key)\n                    chunks_buffer.append(chunk)\n                    chunks_buffer_size += len(chunk)\n                    continue\n                elif chunks_buffer_size + len(chunk) == self.batch_size:\n                    keys_buffer.append(key)\n                    chunks_buffer.append(chunk)\n                    new_key = \"_\".join(str(_key) for _key in keys_buffer)\n                    if self._state_dict:\n                        
self._state_dict[\"batch_idx\"] += 1\n                        self._state_dict[\"num_chunks_since_previous_state\"] += len(chunks_buffer)\n                        self._state_dict[\"cropped_chunk_length\"] = 0\n                    yield new_key, pa.Table.from_batches(chunks_buffer)\n                    keys_buffer = []\n                    chunks_buffer = []\n                    chunks_buffer_size = 0\n                    if self._state_dict:\n                        self._state_dict[\"previous_state\"] = previous_state\n                        self._state_dict[\"num_chunks_since_previous_state\"] = num_chunks_since_previous_state + 1\n                else:\n                    cropped_chunk_length = self.batch_size - chunks_buffer_size\n                    keys_buffer.append(f\"{key}[:{cropped_chunk_length}]\")\n                    chunks_buffer.append(chunk.slice(0, cropped_chunk_length))\n                    new_key = \"_\".join(str(_key) for _key in keys_buffer)\n                    if self._state_dict:\n                        self._state_dict[\"batch_idx\"] += 1\n                        self._state_dict[\"num_chunks_since_previous_state\"] += len(chunks_buffer)\n                        self._state_dict[\"cropped_chunk_length\"] = cropped_chunk_length\n                    yield new_key, pa.Table.from_batches(chunks_buffer)\n                    keys_buffer = [f\"{key}[{cropped_chunk_length}:]\"]\n                    chunks_buffer = [chunk.slice(cropped_chunk_length, len(chunk) - cropped_chunk_length)]\n                    chunks_buffer_size = len(chunk) - cropped_chunk_length\n                    if self._state_dict:\n                        self._state_dict[\"previous_state\"] = previous_state\n                        self._state_dict[\"num_chunks_since_previous_state\"] = num_chunks_since_previous_state\n            if self._state_dict:\n                previous_state = self.ex_iterable.state_dict()\n        if not self.drop_last_batch and chunks_buffer:\n    
        new_key = \"_\".join(str(_key) for _key in keys_buffer)\n            if self._state_dict:\n                self._state_dict[\"previous_state\"] = previous_state\n                self._state_dict[\"batch_idx\"] += 1\n                self._state_dict[\"num_chunks_since_previous_state\"] = 0\n                self._state_dict[\"cropped_chunk_length\"] = 0\n            yield new_key, pa.Table.from_batches(chunks_buffer)\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"RebatchedArrowExamplesIterable\":\n        return RebatchedArrowExamplesIterable(\n            self.ex_iterable.shuffle_data_sources(generator),\n            self.batch_size,\n            self.drop_last_batch,\n            self.force_convert_to_arrow,\n        )\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"RebatchedArrowExamplesIterable\":\n        return RebatchedArrowExamplesIterable(\n            self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous),\n            self.batch_size,\n            self.drop_last_batch,\n            self.force_convert_to_arrow,\n        )\n\n    def reshard_data_sources(self) -> \"RebatchedArrowExamplesIterable\":\n        return RebatchedArrowExamplesIterable(\n            self.ex_iterable.reshard_data_sources(), self.batch_size, self.drop_last_batch, self.force_convert_to_arrow\n        )\n\n    @property\n    def num_shards(self) -> int:\n        return self.ex_iterable.num_shards\n\n\nclass SelectColumnsIterable(_BaseExamplesIterable):\n    def __init__(self, ex_iterable: _BaseExamplesIterable, column_names: list[str]):\n        super().__init__()\n        self.ex_iterable = ex_iterable\n        self.column_names = column_names\n\n    @property\n    def iter_arrow(self):\n        if self.ex_iterable.iter_arrow:\n            return self._iter_arrow\n\n    @property\n    def is_typed(self):\n        return self.ex_iterable.is_typed\n\n    @property\n    def features(self):\n  
      return self.ex_iterable.features\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = self.ex_iterable._init_state_dict()\n        return self._state_dict\n\n    def __iter__(self):\n        for idx, row in self.ex_iterable:\n            yield idx, {c: row[c] for c in self.column_names}\n\n    def _iter_arrow(self) -> Iterator[tuple[Key, pa.Table]]:\n        for idx, pa_table in self.ex_iterable.iter_arrow():\n            if len(pa_table) > 0:  # empty tables have no schema\n                yield idx, pa_table.select(self.column_names)\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"SelectColumnsIterable\":\n        return SelectColumnsIterable(self.ex_iterable.shuffle_data_sources(generator), self.column_names)\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"SelectColumnsIterable\":\n        return SelectColumnsIterable(\n            self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous), self.column_names\n        )\n\n    def reshard_data_sources(self) -> \"SelectColumnsIterable\":\n        return SelectColumnsIterable(self.ex_iterable.reshard_data_sources(), self.column_names)\n\n    @property\n    def num_shards(self) -> int:\n        return self.ex_iterable.num_shards\n\n\nclass StepExamplesIterable(_BaseExamplesIterable):\n    def __init__(self, ex_iterable: _BaseExamplesIterable, step: int, offset: int):\n        super().__init__()\n        self.ex_iterable = ex_iterable\n        self.step = step\n        self.offset = offset\n\n    @property\n    def iter_arrow(self):\n        return self._iter_arrow if self.ex_iterable.iter_arrow else None\n\n    @property\n    def is_typed(self):\n        return self.ex_iterable.is_typed\n\n    @property\n    def features(self):\n        return self.ex_iterable.features\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = {\n            \"examples_iterable\": 
self.ex_iterable._init_state_dict(),\n            \"stepped\": 0,\n            \"type\": self.__class__.__name__,\n        }\n        return self._state_dict\n\n    def __iter__(self):\n        ex_iterator = iter(self.ex_iterable)\n        while True:\n            batch = list(islice(ex_iterator, self.step))\n            if len(batch) > self.offset:\n                yield batch[self.offset]\n            else:\n                break\n\n    def _iter_arrow(self):\n        stepped = self._state_dict[\"stepped\"] if self._state_dict else 0\n        for key, pa_table in self.ex_iterable.iter_arrow():\n            stepped_pa_table = pa_table.take(\n                pa.array(range((self.offset - stepped) % self.step, len(pa_table), self.step), type=pa.int64())\n            )\n            stepped = (stepped + len(pa_table)) % self.step\n            if self._state_dict:\n                self._state_dict[\"stepped\"] = stepped\n            yield key, stepped_pa_table\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"StepExamplesIterable\":\n        return StepExamplesIterable(\n            self.ex_iterable.shuffle_data_sources(generator), step=self.step, offset=self.offset\n        )\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"StepExamplesIterable\":\n        return StepExamplesIterable(\n            self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous),\n            step=self.step,\n            offset=self.offset,\n        )\n\n    def reshard_data_sources(self) -> \"StepExamplesIterable\":\n        return StepExamplesIterable(\n            self.ex_iterable.reshard_data_sources(),\n            step=self.step,\n            offset=self.offset,\n        )\n\n    @property\n    def num_shards(self) -> int:\n        return self.ex_iterable.num_shards\n\n\nclass CyclingMultiSourcesExamplesIterable(_BaseExamplesIterable):\n    def __init__(\n        self,\n        ex_iterables: 
list[_BaseExamplesIterable],\n        stopping_strategy: Literal[\n            \"first_exhausted\", \"all_exhausted\", \"all_exhausted_without_replacement\"\n        ] = \"first_exhausted\",\n    ):\n        super().__init__()\n        self.ex_iterables = ex_iterables\n        self.stopping_strategy = stopping_strategy\n\n        # if undersampling (\"first_exhausted\"), we stop as soon as one dataset is exhausted\n        # if oversampling (\"all_exhausted\"), we stop as soons as every dataset is exhausted, i.e as soon as every samples of every dataset has been visited at least once\n        # if sampling without replacement (\"all_exhausted_without_replacement\"), we stop once all samples of every dataset has been visited exactly once.\n        self.bool_strategy_func = (\n            np.all if (stopping_strategy in (\"all_exhausted\", \"all_exhausted_without_replacement\")) else np.any\n        )\n\n    @property\n    def is_typed(self):\n        return self.ex_iterables[0].is_typed\n\n    @property\n    def features(self):\n        return self.ex_iterables[0].features\n\n    @property\n    def iter_arrow(self):\n        # iterate on arrow tables if all ex_iterables can iterate\n        return self._iter_arrow if all(ex_iterable.iter_arrow for ex_iterable in self.ex_iterables) else None\n\n    def _get_indices_iterator(self):\n        # this is an infinite iterator to keep track of which iterator we want to pick examples from\n        ex_iterable_idx = self._state_dict[\"ex_iterable_idx\"] if self._state_dict else 0\n        for next_ex_iterable_idx in islice(cycle(range(len(self.ex_iterables))), ex_iterable_idx + 1, None):\n            if self._state_dict:\n                self._state_dict[\"ex_iterable_idx\"] = next_ex_iterable_idx\n            yield ex_iterable_idx\n            ex_iterable_idx = next_ex_iterable_idx\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = {\n            \"ex_iterable_idx\": 0,\n            \"ex_iterables\": 
[ex_iterable._init_state_dict() for ex_iterable in self.ex_iterables],\n            \"previous_states\": [None] * len(self.ex_iterables),\n            \"is_exhausted\": [False] * len(self.ex_iterables),\n            \"type\": self.__class__.__name__,\n        }\n        return self._state_dict\n\n    def _iter_arrow(self):\n        # we use this to buffer one example of each iterator to know if an iterator is exhausted\n        nexts = [None] * len(self.ex_iterables)\n        # because of that, we need to rewind 1 example when reloading the state dict\n        if self._state_dict:\n            for i in range(len(self.ex_iterables)):\n                if self._state_dict[\"previous_states\"][i] is not None:\n                    self.ex_iterables[i].load_state_dict(self._state_dict[\"previous_states\"][i])\n        iterators = [ex_iterable.iter_arrow() for ex_iterable in self.ex_iterables]\n\n        indices_iterator = self._get_indices_iterator()\n\n        is_exhausted = (\n            np.array(self._state_dict[\"is_exhausted\"]) if self._state_dict else np.full(len(self.ex_iterables), False)\n        )\n        for i in indices_iterator:\n            # if the stopping criteria is met, break the main for loop\n            if self.bool_strategy_func(is_exhausted):\n                break\n            # Skip exhausted iterators if we sample without replacement\n            if is_exhausted[i] and self.stopping_strategy in [\"all_exhausted_without_replacement\"]:\n                continue\n            # let's pick one example from the iterator at index i\n            if nexts[i] is None:\n                nexts[i] = next(iterators[i], False)\n            result = nexts[i]\n            if self._state_dict:\n                self._state_dict[\"previous_states\"][i] = deepcopy(self._state_dict[\"ex_iterables\"][i])\n            nexts[i] = next(iterators[i], False)\n\n            # the iterator is exhausted\n            if nexts[i] is False:\n                is_exhausted[i] = 
True\n                if self._state_dict:\n                    self._state_dict[\"is_exhausted\"][i] = True\n                # we reset it in case the stopping crtieria isn't met yet and we sample with replacement\n                if self.stopping_strategy not in [\"all_exhausted_without_replacement\"]:\n                    nexts[i] = None\n                    if self._state_dict:\n                        self._state_dict[\"ex_iterables\"][i] = self.ex_iterables[i]._init_state_dict()\n                        self._state_dict[\"previous_states\"][i] = None\n                    iterators[i] = self.ex_iterables[i]._iter_arrow()\n\n            if result is not False:\n                yield result\n\n    def __iter__(self):\n        # we use this to buffer one example of each iterator to know if an iterator is exhausted\n        nexts = [None] * len(self.ex_iterables)\n        # because of that, we need to rewind 1 example when reloading the state dict\n        if self._state_dict:\n            for i in range(len(self.ex_iterables)):\n                if self._state_dict[\"previous_states\"][i] is not None:\n                    self.ex_iterables[i].load_state_dict(self._state_dict[\"previous_states\"][i])\n        iterators = [iter(ex_iterable) for ex_iterable in self.ex_iterables]\n\n        indices_iterator = self._get_indices_iterator()\n\n        is_exhausted = (\n            np.array(self._state_dict[\"is_exhausted\"]) if self._state_dict else np.full(len(self.ex_iterables), False)\n        )\n        for i in indices_iterator:\n            # if the stopping criteria is met, break the main for loop\n            if self.bool_strategy_func(is_exhausted):\n                break\n            # let's pick one example from the iterator at index i\n            if is_exhausted[i] and self.stopping_strategy in [\"all_exhausted_without_replacement\"]:\n                continue\n            if nexts[i] is None:\n                nexts[i] = next(iterators[i], False)\n           
 result = nexts[i]\n            if self._state_dict:\n                self._state_dict[\"previous_states\"][i] = deepcopy(self._state_dict[\"ex_iterables\"][i])\n            nexts[i] = next(iterators[i], False)\n\n            # the iterator is exhausted\n            if nexts[i] is False:\n                is_exhausted[i] = True\n                if self._state_dict:\n                    self._state_dict[\"is_exhausted\"][i] = True\n                # we reset it in case the stopping crtieria isn't met yet\n                if self.stopping_strategy not in [\"all_exhausted_without_replacement\"]:\n                    nexts[i] = None\n                    if self._state_dict:\n                        self._state_dict[\"ex_iterables\"][i] = self.ex_iterables[i]._init_state_dict()\n                        self._state_dict[\"previous_states\"][i] = None\n                    iterators[i] = iter(self.ex_iterables[i])\n            if result is not False:\n                yield result\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"CyclingMultiSourcesExamplesIterable\":\n        \"\"\"Shuffle each underlying examples iterable.\"\"\"\n        ex_iterables = [ex_iterable.shuffle_data_sources(generator) for ex_iterable in self.ex_iterables]\n        return CyclingMultiSourcesExamplesIterable(ex_iterables, self.stopping_strategy)\n\n    @property\n    def num_shards(self) -> int:\n        return min(ex_iterable.num_shards for ex_iterable in self.ex_iterables) if self.ex_iterables else 0\n\n    def shard_data_sources(\n        self, num_shards: int, index: int, contiguous=True\n    ) -> \"CyclingMultiSourcesExamplesIterable\":\n        \"\"\"Either keep only the requested shard, or propagate the request to the underlying iterable.\"\"\"\n        if num_shards < self.num_shards:\n            return CyclingMultiSourcesExamplesIterable(\n                [\n                    iterable.shard_data_sources(num_shards, index, contiguous=contiguous)\n               
     for iterable in self.ex_iterables\n                ],\n                stopping_strategy=self.stopping_strategy,\n            )\n        elif index < self.num_shards:\n            return CyclingMultiSourcesExamplesIterable(\n                [\n                    iterable.shard_data_sources(self.num_shards, index, contiguous=contiguous)\n                    for iterable in self.ex_iterables\n                ],\n                stopping_strategy=self.stopping_strategy,\n            )\n        else:\n            return CyclingMultiSourcesExamplesIterable(\n                [],\n                stopping_strategy=self.stopping_strategy,\n            )\n\n    def reshard_data_sources(self) -> \"CyclingMultiSourcesExamplesIterable\":\n        return CyclingMultiSourcesExamplesIterable(\n            [iterable.reshard_data_sources() for iterable in self.ex_iterables],\n            stopping_strategy=self.stopping_strategy,\n        )\n\n\nclass VerticallyConcatenatedMultiSourcesExamplesIterable(_BaseExamplesIterable):\n    \"\"\"\n    VerticallyConcatenatedMultiSourcesExamplesIterable simply chains the input iterables.\n    It doesn't require the examples iterables to always yield the same columns.\n    Instead, this is handled by the `IterableDataset` class or `FormattedExamplesIterable`.\n\n    For information, `IterableDataset` merges the features of all the datasets to concatenate into one.\n    We use `IterableDataset._resolve_features` to obtain the features of all the datasets to concatenate.\n\n    Then for each example, `IterableDataset` and `FormattedExamplesIterable` automatically fill missing columns with None.\n    This is done with `_apply_feature_types_on_example`.\n    \"\"\"\n\n    def __init__(self, ex_iterables: list[_BaseExamplesIterable]):\n        super().__init__()\n        self.ex_iterables = ex_iterables\n\n    @property\n    def is_typed(self):\n        return self.ex_iterables[0].is_typed\n\n    @property\n    def features(self):\n        
return self.ex_iterables[0].features\n\n    @property\n    def iter_arrow(self):\n        if all(ex_iterable.iter_arrow is not None for ex_iterable in self.ex_iterables):\n            return self._iter_arrow\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = {\n            \"ex_iterable_idx\": 0,\n            \"ex_iterables\": [ex_iterable._init_state_dict() for ex_iterable in self.ex_iterables],\n            \"type\": self.__class__.__name__,\n        }\n        return self._state_dict\n\n    def __iter__(self):\n        ex_iterable_idx_start = self._state_dict[\"ex_iterable_idx\"] if self._state_dict else 0\n        for ex_iterable in islice(self.ex_iterables, ex_iterable_idx_start, None):\n            yield from ex_iterable\n            if self._state_dict:\n                self._state_dict[\"ex_iterable_idx\"] += 1\n\n    def _iter_arrow(self):\n        ex_iterable_idx_start = self._state_dict[\"ex_iterable_idx\"] if self._state_dict else 0\n        for ex_iterable in islice(self.ex_iterables, ex_iterable_idx_start, None):\n            yield from ex_iterable.iter_arrow()\n            if self._state_dict:\n                self._state_dict[\"ex_iterable_idx\"] += 1\n\n    def shuffle_data_sources(\n        self, generator: np.random.Generator\n    ) -> \"VerticallyConcatenatedMultiSourcesExamplesIterable\":\n        \"\"\"Shuffle all shards.\"\"\"\n        rng = deepcopy(generator)\n        single_shard_ex_iterables = [\n            ex_iterable.shard_data_sources(num_shards=ex_iterable.num_shards, index=index)\n            for ex_iterable in self.ex_iterables\n            for index in range(ex_iterable.num_shards)\n        ]\n        rng.shuffle(single_shard_ex_iterables)\n        return VerticallyConcatenatedMultiSourcesExamplesIterable(single_shard_ex_iterables)\n\n    @property\n    def num_shards(self) -> int:\n        return sum(ex_iterable.num_shards for ex_iterable in self.ex_iterables)\n\n    def shard_data_sources(\n        self, 
num_shards: int, index: int, contiguous=True\n    ) -> \"VerticallyConcatenatedMultiSourcesExamplesIterable\":\n        \"\"\"Keep only the requested shard\"\"\"\n        single_shard_ex_iterables = [\n            ex_iterable.shard_data_sources(num_shards=ex_iterable.num_shards, index=index)\n            for ex_iterable in self.ex_iterables\n            for index in range(ex_iterable.num_shards)\n        ]\n        shard_indices = self.split_shard_indices_by_worker(num_shards, index, contiguous=contiguous)\n        return VerticallyConcatenatedMultiSourcesExamplesIterable(\n            [single_shard_ex_iterables[i] for i in shard_indices]\n        )\n\n    def reshard_data_sources(self) -> \"VerticallyConcatenatedMultiSourcesExamplesIterable\":\n        return VerticallyConcatenatedMultiSourcesExamplesIterable(\n            [iterable.reshard_data_sources() for iterable in self.ex_iterables]\n        )\n\n\ndef _check_column_names(column_names: list[str]):\n    \"\"\"Check the column names to make sure they don't contain duplicates.\"\"\"\n    counter = Counter(column_names)\n    if not all(count == 1 for count in counter.values()):\n        duplicated_columns = [col for col in counter if counter[col] > 1]\n        raise ValueError(\n            f\"The examples iterables can't have duplicated columns but columns {duplicated_columns} are duplicated.\"\n        )\n\n\nclass HorizontallyConcatenatedMultiSourcesExamplesIterable(_BaseExamplesIterable):\n    \"\"\"\n    HorizontallyConcatenatedMultiSourcesExamplesIterable merges examples together for the input list of iterables.\n    It also checks that there are no duplicate columns (otherwise we don't know which one to keep).\n    This check is done once when yielding the first example.\n\n    However it doesn't fill missing columns with None.\n    Instead, this is handled by the `IterableDataset` class or `FormattedExamplesIterable`.\n\n    For information, `IterableDataset` merges the features of all the datasets to 
concatenate into one.\n    We use `IterableDataset._resolve_features` to obtain the features of all the datasets to concatenate.\n\n    Then for each example, `IterableDataset` and `FormattedExamplesIterable` automatically fill missing columns with None.\n    This is done with `_apply_feature_types_on_example`.\n    \"\"\"\n\n    def __init__(self, ex_iterables: list[_BaseExamplesIterable]):\n        super().__init__()\n        self.ex_iterables = ex_iterables\n\n    @property\n    def iter_arrow(self):\n        return (\n            self._iter_arrow\n            if all(\n                isinstance(ex_iterable, RebatchedArrowExamplesIterable) and ex_iterable.ex_iterable.iter_arrow\n                for ex_iterable in self.ex_iterables\n            )\n            or (len(self.ex_iterables) < 2 and all(ex_iterable.iter_arrow for ex_iterable in self.ex_iterables))\n            else None\n        )\n\n    @property\n    def is_typed(self):\n        return self.ex_iterables[0].is_typed\n\n    @property\n    def features(self):\n        return self.ex_iterables[0].features\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = {\n            \"ex_iterables\": [ex_iterable._init_state_dict() for ex_iterable in self.ex_iterables],\n            \"type\": self.__class__.__name__,\n        }\n        return self._state_dict\n\n    def __iter__(self):\n        ex_iterators = [iter(ex_iterable) for ex_iterable in self.ex_iterables]\n        for i in itertools.count():\n            keys = []\n            examples = []\n            for ex_iterator in list(ex_iterators):\n                try:\n                    key, example = next(ex_iterator)\n                    keys.append(key)\n                    examples.append(example)\n                except StopIteration:\n                    ex_iterators.remove(ex_iterator)\n            if ex_iterators:\n                if i == 0:\n                    _check_column_names([column_name for example in examples for 
column_name in example])\n                new_example = {}\n                for example in examples:\n                    new_example.update(example)\n                new_key = \"_\".join(str(key) for key in keys)\n                yield new_key, new_example\n            else:\n                break\n\n    def _iter_arrow(self):\n        pa_table_iterators = [iter(ex_iterable.iter_arrow()) for ex_iterable in self.ex_iterables]\n        for i in itertools.count():\n            keys = []\n            pa_tables = []\n            for pa_table_iterator in list(pa_table_iterators):\n                try:\n                    key, pa_table = next(pa_table_iterator)\n                    keys.append(key)\n                    pa_tables.append(pa_table)\n                except StopIteration:\n                    pa_table_iterators.remove(pa_table_iterator)\n            if pa_table_iterators:\n                if i == 0:\n                    _check_column_names(\n                        [column_name for pa_table in pa_tables for column_name in pa_table.column_names]\n                    )\n                for j, table in enumerate(pa_tables):\n                    if j == 0:\n                        new_pa_table = table\n                    else:\n                        for name, col in zip(table.column_names, table.columns):\n                            new_pa_table = pa_table.append_column(name, col)\n                new_key = \"_\".join(str(key) for key in keys)\n                yield new_key, new_pa_table\n            else:\n                break\n\n    def shuffle_data_sources(\n        self, generator: np.random.Generator\n    ) -> \"HorizontallyConcatenatedMultiSourcesExamplesIterable\":\n        \"\"\"Doesn't shuffle the wrapped examples iterable since it would break the alignment between them.\"\"\"\n        return self\n\n    @property\n    def num_shards(self) -> int:\n        return 1\n\n    def shard_data_sources(\n        self, num_shards: int, index: int, 
contiguous=True\n    ) -> \"HorizontallyConcatenatedMultiSourcesExamplesIterable\":\n        \"\"\"Doesn't shard the wrapped examples iterable since it would break the alignment between them.\"\"\"\n        return self\n\n    def reshard_data_sources(self) -> \"HorizontallyConcatenatedMultiSourcesExamplesIterable\":\n        \"\"\"Doesn't reshard the wrapped examples iterable since it would break the alignment between them.\"\"\"\n        return self\n\n\nclass RandomlyCyclingMultiSourcesExamplesIterable(CyclingMultiSourcesExamplesIterable):\n    def __init__(\n        self,\n        ex_iterables: list[_BaseExamplesIterable],\n        generator: np.random.Generator,\n        probabilities: Optional[list[float]] = None,\n        stopping_strategy: Literal[\n            \"first_exhausted\", \"all_exhausted\", \"all_exhausted_without_replacement\"\n        ] = \"first_exhausted\",\n    ):\n        super().__init__(ex_iterables, stopping_strategy)\n        self.generator = deepcopy(generator)\n        self.probabilities = probabilities\n\n    def shift_rngs(self, value: int) -> \"_BaseExamplesIterable\":\n        rng = deepcopy(self.generator)\n        new_seed = rng.integers(0, 1 << 63) - value\n        return RandomlyCyclingMultiSourcesExamplesIterable(\n            ex_iterables=self.ex_iterables,\n            generator=np.random.default_rng(seed=new_seed),\n            probabilities=self.probabilities,\n            stopping_strategy=self.stopping_strategy,\n        )\n\n    @property\n    def is_typed(self):\n        return self.ex_iterables[0].is_typed\n\n    @property\n    def features(self):\n        return self.ex_iterables[0].features\n\n    def _get_indices_iterator(self):\n        rng = deepcopy(self.generator)\n        num_sources = len(self.ex_iterables)\n        random_batch_size = 1000\n        # this is an infinite iterator that randomly samples the index of the source to pick examples from\n        index_offset = 
self._state_dict[\"bit_generator_index_offset\"] if self._state_dict else 0\n        if self._state_dict:\n            rng.bit_generator.state = self._state_dict[\"bit_generator_state\"]\n        if self.probabilities is None:\n            while True:\n                for i in islice(rng.integers(0, num_sources, size=random_batch_size), index_offset, None):\n                    index_offset = (index_offset + 1) % random_batch_size\n                    if self._state_dict:\n                        self._state_dict[\"bit_generator_index_offset\"] = index_offset\n                        if index_offset == 0:\n                            self._state_dict[\"bit_generator_state\"] = rng.bit_generator.state\n                    yield int(i)\n        else:\n            while True:\n                for i in islice(\n                    rng.choice(num_sources, size=random_batch_size, p=self.probabilities), index_offset, None\n                ):\n                    index_offset = (index_offset + 1) % random_batch_size\n                    if self._state_dict:\n                        self._state_dict[\"bit_generator_index_offset\"] = index_offset\n                        if index_offset == 0:\n                            self._state_dict[\"bit_generator_state\"] = rng.bit_generator.state\n                    yield int(i)\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = {\n            \"bit_generator_state\": self.generator.bit_generator.state,\n            \"bit_generator_index_offset\": 0,\n            \"ex_iterables\": [ex_iterable._init_state_dict() for ex_iterable in self.ex_iterables],\n            \"previous_states\": [None] * len(self.ex_iterables),\n            \"is_exhausted\": [False] * len(self.ex_iterables),\n            \"type\": self.__class__.__name__,\n        }\n        return self._state_dict\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"RandomlyCyclingMultiSourcesExamplesIterable\":\n        \"\"\"Shuffle 
the data sources of each wrapped examples iterable.\"\"\"\n        ex_iterables = [ex_iterable.shuffle_data_sources(generator) for ex_iterable in self.ex_iterables]\n        return RandomlyCyclingMultiSourcesExamplesIterable(\n            ex_iterables,\n            generator=generator,\n            probabilities=self.probabilities,\n            stopping_strategy=self.stopping_strategy,\n        )\n\n    def shard_data_sources(\n        self, num_shards: int, index: int, contiguous=True\n    ) -> \"RandomlyCyclingMultiSourcesExamplesIterable\":\n        \"\"\"Either keep only the requested shard, or propagate the request to the underlying iterable.\"\"\"\n        if num_shards < self.num_shards:\n            return RandomlyCyclingMultiSourcesExamplesIterable(\n                [\n                    iterable.shard_data_sources(num_shards, index, contiguous=contiguous)\n                    for iterable in self.ex_iterables\n                ],\n                self.generator,\n                self.probabilities,\n                self.stopping_strategy,\n            )\n        elif index < self.num_shards:\n            return RandomlyCyclingMultiSourcesExamplesIterable(\n                [\n                    iterable.shard_data_sources(self.num_shards, index, contiguous=contiguous)\n                    for iterable in self.ex_iterables\n                ],\n                self.generator,\n                self.probabilities,\n                self.stopping_strategy,\n            )\n        else:\n            return RandomlyCyclingMultiSourcesExamplesIterable(\n                [],\n                self.generator,\n                self.probabilities,\n                self.stopping_strategy,\n            )\n\n    def reshard_data_sources(self) -> \"RandomlyCyclingMultiSourcesExamplesIterable\":\n        \"\"\"Either keep only the requested shard, or propagate the request to the underlying iterable.\"\"\"\n        return RandomlyCyclingMultiSourcesExamplesIterable(\n         
   [iterable.reshard_data_sources() for iterable in self.ex_iterables],\n            self.generator,\n            self.probabilities,\n            self.stopping_strategy,\n        )\n\n\ndef _table_output_to_arrow(output) -> pa.Table:\n    if isinstance(output, pa.Table):\n        return output\n    if isinstance(output, (pd.DataFrame, pd.Series)):\n        return pa.Table.from_pandas(output)\n    if config.POLARS_AVAILABLE and \"polars\" in sys.modules:\n        import polars as pl\n\n        if isinstance(output, (pl.DataFrame, pl.Series)):\n            return output.to_arrow()\n    return output\n\n\nclass MappedExamplesIterable(_BaseExamplesIterable):\n    def __init__(\n        self,\n        ex_iterable: _BaseExamplesIterable,\n        function: Callable,\n        with_indices: bool = False,\n        input_columns: Optional[list[str]] = None,\n        batched: bool = False,\n        batch_size: Optional[int] = 1000,\n        drop_last_batch: bool = False,\n        remove_columns: Optional[list[str]] = None,\n        fn_kwargs: Optional[dict] = None,\n        formatting: Optional[\"FormattingConfig\"] = None,\n        features: Optional[Features] = None,\n        max_num_running_async_map_functions_in_parallel: Optional[int] = None,\n    ):\n        super().__init__()\n        self.ex_iterable = ex_iterable\n        self.function = function\n        self.batched = batched\n        self.batch_size = batch_size\n        self.drop_last_batch = drop_last_batch\n        self.remove_columns = remove_columns\n        self.with_indices = with_indices\n        self.input_columns = input_columns\n        self.fn_kwargs = fn_kwargs or {}\n        self.formatting = formatting  # required for iter_arrow\n        self._features = features\n        self.max_num_running_async_map_functions_in_parallel = (\n            max_num_running_async_map_functions_in_parallel or config.MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL\n        )\n        # sanity checks\n        if 
formatting and formatting.is_table:\n            # batch_size should match for iter_arrow\n            if not isinstance(ex_iterable, RebatchedArrowExamplesIterable):\n                raise ValueError(\n                    f\"The {formatting.format_type.capitalize()}-formatted {type(self).__name__} has underlying iterable \"\n                    f\"that is a {type(ex_iterable).__name__} instead of a RebatchedArrowExamplesIterable.\"\n                )\n            elif not ex_iterable.iter_arrow:\n                raise ValueError(\n                    f\"The {formatting.format_type.capitalize()}-formatted {type(self).__name__} has underlying iterable \"\n                    f\"that is a {type(ex_iterable).__name__} but doesnt' implement iter_arrow(), a possible fix could be \"\n                    \"to use RebatchedArrowExamplesIterable(..., force_convert_to_arrow=True).\"\n                )\n            elif ex_iterable.batch_size != (batch_size if batched else 1):\n                raise ValueError(\n                    f\"The {formatting.format_type.capitalize()}-formatted {type(self).__name__} has batch_size={batch_size if batched else 1} which is \"\n                    f\"different from {ex_iterable.batch_size=} from its underlying iterable.\"\n                )\n        # to enable graceful ends\n        self._owned_loops_and_tasks: list[tuple[asyncio.AbstractEventLoop, list[asyncio.Task]]] = []\n\n    @property\n    def iter_arrow(self):\n        if self.formatting and self.formatting.is_table:\n            return self._iter_arrow\n\n    @property\n    def is_typed(self):\n        return self.features is not None  # user has extracted features\n\n    @property\n    def features(self):\n        return self._features\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = {\n            \"examples_iterable\": self.ex_iterable._init_state_dict(),\n            \"previous_state\": None,\n            \"num_examples_since_previous_state\": 0,\n       
     \"previous_state_example_idx\": 0,\n            \"type\": self.__class__.__name__,\n        }\n        return self._state_dict\n\n    def __iter__(self):\n        if self.formatting and self.formatting.is_table:\n            formatter = PythonFormatter()\n            for key, pa_table in self._iter_arrow(max_chunksize=1):\n                yield key, formatter.format_row(pa_table)\n        else:\n            yield from self._iter()\n\n    def _iter(self):\n        current_idx = self._state_dict[\"previous_state_example_idx\"] if self._state_dict else 0\n        if self._state_dict and self._state_dict[\"previous_state\"]:\n            self.ex_iterable.load_state_dict(self._state_dict[\"previous_state\"])\n            num_examples_to_skip = self._state_dict[\"num_examples_since_previous_state\"]\n        else:\n            num_examples_to_skip = 0\n        iterator = iter(self.ex_iterable)\n\n        # We use the same logic as in Dataset.map, but with less features/formatting\n        # since they're handled by FormattedExamplesIterable\n\n        if self.formatting:\n            formatter = get_formatter(self.formatting.format_type)\n            format_dict = formatter.recursive_tensorize if isinstance(formatter, TensorFormatter) else None\n        else:\n            format_dict = None\n\n        def iter_batched_inputs():\n            nonlocal current_idx\n            for key, example in iterator:\n                # If `batched`, first build the batch, if `batch_size` is None or <=0, then the batch is the whole dataset\n                iterator_batch = (\n                    iterator\n                    if self.batch_size is None or self.batch_size <= 0\n                    else islice(iterator, self.batch_size - 1)\n                )\n                key_examples_list = [(key, example)] + list(iterator_batch)\n                keys, examples = zip(*key_examples_list)\n                # the new key is the concatenation of the examples keys from the batch\n     
           key = \"_\".join(str(key) for key in keys)\n                if (\n                    self.drop_last_batch\n                    and self.batch_size is not None\n                    and self.batch_size > 0\n                    and len(examples) < self.batch_size\n                ):  # ignore last batch\n                    return\n                batch = _examples_to_batch(examples)\n                # we need to format here in case we need to stack tensors together\n                batch = format_dict(batch) if format_dict else batch\n                indices = [current_idx + i for i in range(len(key_examples_list))]\n                current_idx += len(indices)\n                yield indices, (key, batch)\n\n        def iter_inputs():\n            nonlocal current_idx\n            for key, example in iterator:\n                # If not batched, we can apply the transform and yield the example directly\n                # first copy the example, since we might drop some keys\n                example = dict(example)\n                # no need to do formatting here\n                current_idx += 1\n                yield current_idx - 1, (key, example)\n\n        def validate_function_output(processed_inputs):\n            if self.batched and processed_inputs:\n                first_col = next(iter(processed_inputs))\n                bad_cols = [\n                    col for col in processed_inputs if len(processed_inputs[col]) != len(processed_inputs[first_col])\n                ]\n                if bad_cols:\n                    raise ValueError(\n                        f\"Column lengths mismatch: columns {bad_cols} have length {[len(processed_inputs[col]) for col in bad_cols]} \"\n                        f\"while {first_col} has length {len(processed_inputs[first_col])}.\"\n                    )\n\n        def prepare_inputs(key_example, indices):\n            key, example = key_example\n            fn_args = [example] if self.input_columns is None else 
[example[col] for col in self.input_columns]\n            additional_args = ()\n            if self.with_indices:\n                fn_args += (indices,)\n            inputs = dict(example)\n            return inputs, fn_args, additional_args, self.fn_kwargs\n\n        def prepare_outputs(key_example, inputs, processed_inputs):\n            validate_function_output(processed_inputs)\n            # this logic mimics the one in Dataset.map\n            if self.remove_columns:\n                for c in self.remove_columns:\n                    if c in inputs:\n                        del inputs[c]\n                    if processed_inputs is key_example[1] and c in processed_inputs:\n                        del processed_inputs[c]\n            transformed_inputs = {**inputs, **processed_inputs}\n            # no need to do features decoding here\n            return transformed_inputs\n\n        def apply_function(key_example, indices):\n            \"\"\"Utility to apply the function on a selection of columns.\"\"\"\n            inputs, fn_args, additional_args, fn_kwargs = prepare_inputs(key_example, indices)\n            processed_inputs = self.function(*fn_args, *additional_args, **fn_kwargs)\n            return prepare_outputs(key_example, inputs, processed_inputs)\n\n        async def async_apply_function(key_example, indices):\n            \"\"\"Utility to apply the function on a selection of columns. 
Same code but async\"\"\"\n            inputs, fn_args, additional_args, fn_kwargs = prepare_inputs(key_example, indices)\n            processed_inputs = await self.function(*fn_args, *additional_args, **fn_kwargs)\n            return prepare_outputs(key_example, inputs, processed_inputs)\n\n        tasks: list[asyncio.Task] = []\n        if inspect.iscoroutinefunction(self.function):\n            try:\n                loop = asyncio.get_running_loop()\n            except RuntimeError:\n                loop = asyncio.new_event_loop()\n            self._owned_loops_and_tasks.append((loop, tasks))\n        else:\n            loop = None\n\n        def iter_outputs():\n            nonlocal tasks, loop\n            inputs_iterator = iter_batched_inputs() if self.batched else iter_inputs()\n            if inspect.iscoroutinefunction(self.function):\n                if self._state_dict:\n                    previous_state = self.ex_iterable.state_dict()\n                    self._state_dict[\"previous_state\"] = previous_state\n                    previous_state_task = None\n                    previous_state_example_idx = self._state_dict[\"previous_state_example_idx\"]\n                indices: Union[list[int], list[list[int]]] = []\n                for i, key_example in inputs_iterator:\n                    indices.append(i)\n                    tasks.append(loop.create_task(async_apply_function(key_example, i)))\n                    # keep the total active tasks under a certain number\n                    if len(tasks) >= self.max_num_running_async_map_functions_in_parallel:\n                        done, pending = loop.run_until_complete(\n                            asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)\n                        )\n                        while tasks and len(pending) >= self.max_num_running_async_map_functions_in_parallel:\n                            done, pending = loop.run_until_complete(\n                                
asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)\n                            )\n                    if len(tasks) >= 10 * self.max_num_running_async_map_functions_in_parallel:\n                        loop.run_until_complete(tasks[0])\n                    # yield finished tasks\n                    while tasks and tasks[0].done():\n                        i, task = indices.pop(0), tasks.pop(0)\n                        yield i, task.result()\n                        if self._state_dict and task is previous_state_task:\n                            self._state_dict[\"previous_state\"] = previous_state\n                            self._state_dict[\"num_examples_since_previous_state\"] = 0\n                            self._state_dict[\"previous_state_example_idx\"] = previous_state_example_idx\n                            previous_state, previous_state_task = None, None\n                    # checkpoint\n                    if self._state_dict and previous_state_task is None and tasks:\n                        previous_state = self.ex_iterable.state_dict()\n                        previous_state_task = tasks[-1]\n                        previous_state_example_idx = current_idx\n                while tasks:\n                    yield indices[0], loop.run_until_complete(tasks[0])\n                    indices.pop(0), tasks.pop(0)\n            else:\n                if self._state_dict:\n                    if self.batched:\n                        self._state_dict[\"previous_state\"] = self.ex_iterable.state_dict()\n                        self._state_dict[\"num_examples_since_previous_state\"] = 0\n                        self._state_dict[\"previous_state_example_idx\"] = current_idx\n                for i, key_example in inputs_iterator:\n                    if self._state_dict:\n                        if not self.batched:\n                            self._state_dict[\"previous_state_example_idx\"] = current_idx\n                    yield i, 
apply_function(key_example, i)\n                    if self._state_dict:\n                        if self.batched:\n                            self._state_dict[\"previous_state\"] = self.ex_iterable.state_dict()\n                            self._state_dict[\"num_examples_since_previous_state\"] = 0\n                            self._state_dict[\"previous_state_example_idx\"] = current_idx\n\n        try:\n            outputs = iter_outputs()\n            if self.batched:\n                outputs = (\n                    (key, transformed_example)\n                    for key, transformed_batch in outputs\n                    for transformed_example in _batch_to_examples(transformed_batch)\n                )\n            for key, transformed_example in outputs:\n                if self._state_dict and self._state_dict[\"previous_state\"] is not None:\n                    self._state_dict[\"num_examples_since_previous_state\"] += 1\n                if num_examples_to_skip > 0:\n                    num_examples_to_skip -= 1\n                    continue\n                yield key, transformed_example\n        except (Exception, KeyboardInterrupt):\n            if loop:\n                logger.debug(f\"Canceling {len(tasks)} async tasks.\")\n                for task in tasks:\n                    task.cancel(msg=\"KeyboardInterrupt\")\n                try:\n                    loop.run_until_complete(asyncio.gather(*tasks))\n                except (asyncio.CancelledError, ValueError):\n                    logger.debug(\"Tasks canceled.\")\n            raise\n\n    def _iter_arrow(self, max_chunksize: Optional[int] = None) -> Iterator[tuple[Key, pa.Table]]:\n        formatter: TableFormatter = get_formatter(self.formatting.format_type) if self.formatting else ArrowFormatter()\n        if self.ex_iterable.iter_arrow:\n            iterator = self.ex_iterable.iter_arrow()\n        else:\n            iterator = _convert_to_arrow(\n                self.ex_iterable,\n       
         batch_size=self.batch_size if self.batched else 1,\n                drop_last_batch=self.drop_last_batch,\n            )\n        if self._state_dict and self._state_dict[\"previous_state\"]:\n            self.ex_iterable.load_state_dict(self._state_dict[\"previous_state\"])\n            num_examples_to_skip = self._state_dict[\"num_examples_since_previous_state\"]\n        else:\n            num_examples_to_skip = 0\n        if self._state_dict and max_chunksize is not None:\n            self._state_dict[\"previous_state\"] = self.ex_iterable.state_dict()\n            self._state_dict[\"num_examples_since_previous_state\"] = 0\n        current_idx = self._state_dict[\"previous_state_example_idx\"] if self._state_dict else 0\n        for key, pa_table in iterator:\n            if (\n                self.batched\n                and self.batch_size is not None\n                and len(pa_table) < self.batch_size\n                and self.drop_last_batch\n            ):\n                return\n            # first build the batch\n            function_args = (\n                [formatter.format_batch(pa_table)]\n                if self.input_columns is None\n                else [pa_table[col] for col in self.input_columns]\n            )\n            if self.with_indices:\n                if self.batched:\n                    function_args.append([current_idx + i for i in range(len(pa_table))])\n                else:\n                    function_args.append(current_idx)\n            # then apply the transform\n            output = self.function(*function_args, **self.fn_kwargs)\n            output_table = _table_output_to_arrow(output)\n            if not isinstance(output_table, pa.Table):\n                raise TypeError(\n                    f\"Provided `function` which is applied to {formatter.table_type} returns a variable of type \"\n                    f\"{type(output)}. 
Make sure provided `function` returns a {formatter.table_type} to update the dataset.\"\n                )\n            # we don't need to merge results for consistency with Dataset.map which merges iif both input and output are dicts\n            # then remove the unwanted columns\n            if self.remove_columns:\n                for column in self.remove_columns:\n                    if column in output_table.column_names:\n                        output_table = output_table.remove_column(output_table.column_names.index(column))\n            # return output\n            if max_chunksize is None:\n                current_idx += len(pa_table)\n                if self._state_dict:\n                    self._state_dict[\"previous_state_example_idx\"] += len(pa_table)\n                yield key, output_table\n            else:\n                for i, pa_subtable in enumerate(output_table.to_reader(max_chunksize=max_chunksize)):\n                    current_idx += 1\n                    if self._state_dict:\n                        self._state_dict[\"num_examples_since_previous_state\"] += 1\n                    if num_examples_to_skip > 0:\n                        num_examples_to_skip -= 1\n                        continue\n                    yield f\"{key}_{i}\", pa_subtable\n                if self._state_dict:\n                    self._state_dict[\"previous_state\"] = self.ex_iterable.state_dict()\n                    self._state_dict[\"num_examples_since_previous_state\"] = 0\n                    self._state_dict[\"previous_state_example_idx\"] += len(pa_table)\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"MappedExamplesIterable\":\n        \"\"\"Shuffle the wrapped examples iterable.\"\"\"\n        return MappedExamplesIterable(\n            self.ex_iterable.shuffle_data_sources(generator),\n            function=self.function,\n            with_indices=self.with_indices,\n            input_columns=self.input_columns,\n           
 batched=self.batched,\n            batch_size=self.batch_size,\n            drop_last_batch=self.drop_last_batch,\n            remove_columns=self.remove_columns,\n            fn_kwargs=self.fn_kwargs,\n            formatting=self.formatting,\n            features=self.features,\n            max_num_running_async_map_functions_in_parallel=self.max_num_running_async_map_functions_in_parallel,\n        )\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"MappedExamplesIterable\":\n        \"\"\"Keep only the requested shard.\"\"\"\n        return MappedExamplesIterable(\n            self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous),\n            function=self.function,\n            with_indices=self.with_indices,\n            input_columns=self.input_columns,\n            batched=self.batched,\n            batch_size=self.batch_size,\n            drop_last_batch=self.drop_last_batch,\n            remove_columns=self.remove_columns,\n            fn_kwargs=self.fn_kwargs,\n            formatting=self.formatting,\n            features=self.features,\n            max_num_running_async_map_functions_in_parallel=self.max_num_running_async_map_functions_in_parallel,\n        )\n\n    def reshard_data_sources(self) -> \"MappedExamplesIterable\":\n        return MappedExamplesIterable(\n            self.ex_iterable.reshard_data_sources(),\n            function=self.function,\n            with_indices=self.with_indices,\n            input_columns=self.input_columns,\n            batched=self.batched,\n            batch_size=self.batch_size,\n            drop_last_batch=self.drop_last_batch,\n            remove_columns=self.remove_columns,\n            fn_kwargs=self.fn_kwargs,\n            formatting=self.formatting,\n            features=self.features,\n            max_num_running_async_map_functions_in_parallel=self.max_num_running_async_map_functions_in_parallel,\n        )\n\n    @property\n    def 
num_shards(self) -> int:\n        return self.ex_iterable.num_shards\n\n\ndef _add_mask(\n    input: Union[dict, pa.Table],\n    mask: Union[bool, list, pa.Array, pa.ChunkedArray, pa.BooleanScalar],\n    mask_column_name: str,\n):\n    if isinstance(input, pa.Table):\n        if not isinstance(mask, (list, pa.Array, pa.ChunkedArray)):\n            mask = pa.array([mask], type=pa.bool_())\n        return input.append_column(mask_column_name, mask)\n    else:\n        return {mask_column_name: mask}\n\n\ndef add_mask(mask_function: Callable, input: Union[dict, pa.Table], *args, mask_column_name: str, **kwargs):\n    mask = mask_function(input, *args, **kwargs)\n    return _add_mask(input, mask, mask_column_name)\n\n\nasync def async_add_mask(\n    mask_function: Callable, input: Union[dict, pa.Table], *args, mask_column_name: str, **kwargs\n):\n    mask = await mask_function(input, *args, **kwargs)\n    return _add_mask(input, mask, mask_column_name)\n\n\nclass FilteredExamplesIterable(MappedExamplesIterable):\n    mask_column_name = \"===MASK===\"\n\n    def __init__(\n        self,\n        ex_iterable: _BaseExamplesIterable,\n        function: Callable,\n        with_indices: bool = False,\n        input_columns: Optional[list[str]] = None,\n        batched: bool = False,\n        batch_size: Optional[int] = 1000,\n        fn_kwargs: Optional[dict] = None,\n        formatting: Optional[\"FormattingConfig\"] = None,\n    ):\n        self.mask_function = function\n        if ex_iterable.is_typed:\n            features = Features({**ex_iterable.features, self.mask_column_name: Value(\"bool\")})\n        else:\n            features = None\n        super().__init__(\n            ex_iterable=ex_iterable,\n            function=partial(\n                async_add_mask if inspect.iscoroutinefunction(function) else add_mask,\n                function,\n                mask_column_name=self.mask_column_name,\n            ),\n            with_indices=with_indices,\n           
 input_columns=input_columns,\n            batched=batched,\n            batch_size=batch_size,\n            fn_kwargs=fn_kwargs,\n            formatting=formatting,\n            features=features,\n        )\n\n    def _iter(self):\n        for key, example in super()._iter():\n            example = dict(example)\n            if example.pop(self.mask_column_name):\n                yield key, example\n\n    def _iter_arrow(self, max_chunksize: Optional[int] = None):\n        for key, pa_table in super()._iter_arrow(max_chunksize=max_chunksize):\n            mask = pa_table[self.mask_column_name]\n            yield key, pa_table.drop(self.mask_column_name).filter(mask)\n\n    def shuffle_data_sources(self, seed: Optional[int]) -> \"FilteredExamplesIterable\":\n        \"\"\"Shuffle the wrapped examples iterable.\"\"\"\n        return FilteredExamplesIterable(\n            self.ex_iterable.shuffle_data_sources(seed),\n            function=self.mask_function,\n            with_indices=self.with_indices,\n            input_columns=self.input_columns,\n            batched=self.batched,\n            batch_size=self.batch_size,\n            fn_kwargs=self.fn_kwargs,\n            formatting=self.formatting,\n        )\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"FilteredExamplesIterable\":\n        \"\"\"Keep only the requested shard.\"\"\"\n        return FilteredExamplesIterable(\n            self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous),\n            function=self.mask_function,\n            with_indices=self.with_indices,\n            input_columns=self.input_columns,\n            batched=self.batched,\n            batch_size=self.batch_size,\n            fn_kwargs=self.fn_kwargs,\n            formatting=self.formatting,\n        )\n\n    def reshard_data_sources(self) -> \"FilteredExamplesIterable\":\n        return FilteredExamplesIterable(\n            
self.ex_iterable.reshard_data_sources(),\n            function=self.mask_function,\n            with_indices=self.with_indices,\n            input_columns=self.input_columns,\n            batched=self.batched,\n            batch_size=self.batch_size,\n            fn_kwargs=self.fn_kwargs,\n            formatting=self.formatting,\n        )\n\n    @property\n    def num_shards(self) -> int:\n        return self.ex_iterable.num_shards\n\n\nclass BufferShuffledExamplesIterable(_BaseExamplesIterable):\n    def __init__(self, ex_iterable: _BaseExamplesIterable, buffer_size: int, generator: np.random.Generator):\n        super().__init__()\n        self.ex_iterable = ex_iterable\n        self.buffer_size = buffer_size\n        self.generator = generator\n\n    def shift_rngs(self, value: int) -> \"_BaseExamplesIterable\":\n        rng = deepcopy(self.generator)\n        new_seed = rng.integers(0, 1 << 63) - value\n        return BufferShuffledExamplesIterable(\n            ex_iterable=self.ex_iterable,\n            buffer_size=self.buffer_size,\n            generator=np.random.default_rng(seed=new_seed),\n        )\n\n    @property\n    def is_typed(self):\n        return self.ex_iterable.is_typed\n\n    @property\n    def features(self):\n        return self.ex_iterable.features\n\n    @property\n    def iter_arrow(self):\n        return self._iter_arrow if self.ex_iterable.iter_arrow else None\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = self.ex_iterable._init_state_dict()\n        self._original_state_dict = self.state_dict()\n        return self._state_dict\n\n    def load_state_dict(self, state_dict: dict) -> dict:\n        if self._state_dict:\n            if state_dict != self._original_state_dict:\n                logger.warning(\n                    \"Loading a state dict of a shuffle buffer of a dataset without the buffer content.\"\n                    \"The shuffle buffer will be refilled before starting to yield new examples.\"\n    
            )\n        return super().load_state_dict(state_dict)\n\n    @staticmethod\n    def _iter_random_indices(rng: np.random.Generator, buffer_size: int, random_batch_size=1000) -> Iterator[int]:\n        while True:\n            yield from (int(i) for i in rng.integers(0, buffer_size, size=random_batch_size))\n\n    def __iter__(self):\n        buffer_size = self.buffer_size\n        rng = deepcopy(self.generator)\n        indices_iterator = self._iter_random_indices(rng, buffer_size)\n        # this is the shuffle buffer that we keep in memory\n        mem_buffer = []\n        for x in self.ex_iterable:\n            if len(mem_buffer) == buffer_size:  # if the buffer is full, pick an example from it\n                i = next(indices_iterator)\n                yield mem_buffer[i]\n                mem_buffer[i] = x  # replace the picked example by a new one\n            else:  # otherwise, keep filling the buffer\n                mem_buffer.append(x)\n        # when we run out of examples, we shuffle the remaining examples in the buffer and yield them\n        rng.shuffle(mem_buffer)\n        yield from mem_buffer\n\n    def _iter_arrow(self):\n        buffer_size = self.buffer_size\n        rng = deepcopy(self.generator)\n        indices_iterator = self._iter_random_indices(rng, buffer_size)\n        # this is the shuffle buffer that we keep in memory\n        mem_buffer = []\n        for key, pa_table in self.ex_iterable.iter_arrow():\n            if len(mem_buffer) == buffer_size:  # if the buffer is full, pick an example from it\n                i = next(indices_iterator)\n                yield mem_buffer[i]\n                mem_buffer[i] = (key, pa_table)  # replace the picked example by a new one\n            else:  # otherwise, keep filling the buffer\n                mem_buffer.append((key, pa_table))\n        # when we run out of examples, we shuffle the remaining examples in the buffer and yield them\n        rng.shuffle(mem_buffer)\n        
yield from mem_buffer\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"BufferShuffledExamplesIterable\":\n        \"\"\"Shuffle the wrapped examples iterable as well as the shuffling buffer.\"\"\"\n        return BufferShuffledExamplesIterable(\n            self.ex_iterable.shuffle_data_sources(generator), buffer_size=self.buffer_size, generator=self.generator\n        )\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"BufferShuffledExamplesIterable\":\n        \"\"\"Keep only the requested shard.\"\"\"\n        return BufferShuffledExamplesIterable(\n            self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous),\n            buffer_size=self.buffer_size,\n            generator=self.generator,\n        )\n\n    def reshard_data_sources(self) -> \"BufferShuffledExamplesIterable\":\n        return BufferShuffledExamplesIterable(\n            self.ex_iterable.reshard_data_sources(),\n            buffer_size=self.buffer_size,\n            generator=self.generator,\n        )\n\n    @property\n    def num_shards(self) -> int:\n        return self.ex_iterable.num_shards\n\n\nclass SkipExamplesIterable(_BaseExamplesIterable):\n    def __init__(\n        self,\n        ex_iterable: _BaseExamplesIterable,\n        n: int,\n        block_sources_order_when_shuffling: bool = True,\n        split_when_sharding: bool = True,\n    ):\n        super().__init__()\n        self.ex_iterable = ex_iterable\n        self.n = n\n        self.block_sources_order_when_shuffling = block_sources_order_when_shuffling\n        self.split_when_sharding = split_when_sharding\n\n    @property\n    def iter_arrow(self):\n        return self._iter_arrow if self.ex_iterable.iter_arrow else None\n\n    @property\n    def is_typed(self):\n        return self.ex_iterable.is_typed\n\n    @property\n    def features(self):\n        return self.ex_iterable.features\n\n    def _init_state_dict(self) -> dict:\n      
  self._state_dict = {\n            \"skipped\": 0,\n            \"examples_iterable\": self.ex_iterable._init_state_dict(),\n            \"type\": self.__class__.__name__,\n        }\n        return self._state_dict\n\n    def __iter__(self):\n        skipped = self._state_dict[\"skipped\"] if self._state_dict else 0\n        for key_example in self.ex_iterable:\n            if skipped + 1 <= self.n:\n                skipped += 1\n                if self._state_dict:\n                    self._state_dict[\"skipped\"] = skipped\n            else:\n                yield key_example\n\n    def _iter_arrow(self):\n        skipped = self._state_dict[\"skipped\"] if self._state_dict else 0\n        for key, pa_table in self.ex_iterable.iter_arrow():\n            if len(pa_table) == 0:\n                continue\n            elif skipped + len(pa_table) <= self.n:\n                skipped += len(pa_table)\n                if self._state_dict:\n                    self._state_dict[\"skipped\"] = skipped\n            elif skipped + 1 <= self.n:\n                offset = self.n - skipped\n                skipped = self.n\n                if self._state_dict:\n                    self._state_dict[\"skipped\"] = skipped\n                yield key, pa_table.slice(offset, len(pa_table) - offset)\n            else:\n                yield key, pa_table\n\n    @staticmethod\n    def split_number(num, n):\n        quotient = num // n\n        remainder = num % n\n        result = [quotient] * n\n        for i in range(remainder):\n            result[i] += 1\n        return result\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"SkipExamplesIterable\":\n        \"\"\"May not shuffle the wrapped examples iterable since it would skip examples from other shards instead.\"\"\"\n        if self.block_sources_order_when_shuffling:\n            return self\n        else:\n            return SkipExamplesIterable(\n                
self.ex_iterable.shuffle_data_sources(generator),\n                n=self.n,\n                block_sources_order_when_shuffling=self.block_sources_order_when_shuffling,\n                split_when_sharding=self.split_when_sharding,\n            )\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"SkipExamplesIterable\":\n        \"\"\"Keep only the requested shard.\"\"\"\n        if self.split_when_sharding:\n            return SkipExamplesIterable(\n                self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous),\n                n=self.split_number(self.n, num_shards)[index],\n                block_sources_order_when_shuffling=self.block_sources_order_when_shuffling,\n                split_when_sharding=self.split_when_sharding,\n            )\n        else:\n            return self\n\n    def reshard_data_sources(self) -> \"SkipExamplesIterable\":\n        return SkipExamplesIterable(\n            self.ex_iterable.reshard_data_sources(),\n            n=self.n,\n            block_sources_order_when_shuffling=self.block_sources_order_when_shuffling,\n            split_when_sharding=self.split_when_sharding,\n        )\n\n    @property\n    def num_shards(self) -> int:\n        return self.ex_iterable.num_shards\n\n\nclass RepeatExamplesIterable(_BaseExamplesIterable):\n    \"\"\"\n    Iterable that repeats the underlying iterable a given number of times.\n    \"\"\"\n\n    def __init__(\n        self,\n        ex_iterable: _BaseExamplesIterable,\n        num_times: Optional[int],\n    ):\n        super().__init__()\n        self.ex_iterable = ex_iterable\n        self.num_times = num_times\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = {\n            \"repeat_index\": 0,\n            \"examples_iterable\": self.ex_iterable._init_state_dict(),\n            \"type\": self.__class__.__name__,\n        }\n        return self._state_dict\n\n    def __iter__(self):\n        
repeat_index = self._state_dict[\"repeat_index\"] if self._state_dict else 0\n        while True:\n            if self.num_times is not None and repeat_index >= max(self.num_times, 0):\n                break\n            yield from self.ex_iterable\n            repeat_index += 1\n            if self._state_dict:\n                self._state_dict[\"repeat_index\"] = repeat_index\n                self._state_dict[\"examples_iterable\"] = self.ex_iterable._init_state_dict()\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"RepeatExamplesIterable\":\n        \"\"\"Shuffle the underlying iterable, then repeat.\"\"\"\n        return RepeatExamplesIterable(self.ex_iterable.shuffle_data_sources(generator), num_times=self.num_times)\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"RepeatExamplesIterable\":\n        \"\"\"Shard, then repeat shards.\"\"\"\n        return RepeatExamplesIterable(\n            self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous),\n            num_times=self.num_times,\n        )\n\n    def reshard_data_sources(self) -> \"RepeatExamplesIterable\":\n        return RepeatExamplesIterable(\n            self.ex_iterable.reshard_data_sources(),\n            num_times=self.num_times,\n        )\n\n    @property\n    def num_shards(self) -> int:\n        return self.ex_iterable.num_shards\n\n\nclass TakeExamplesIterable(_BaseExamplesIterable):\n    def __init__(\n        self,\n        ex_iterable: _BaseExamplesIterable,\n        n: int,\n        block_sources_order_when_shuffling: bool = True,\n        split_when_sharding: bool = True,\n    ):\n        super().__init__()\n        self.ex_iterable = ex_iterable\n        self.n = n\n        self.block_sources_order_when_shuffling = block_sources_order_when_shuffling\n        self.split_when_sharding = split_when_sharding\n\n    @property\n    def iter_arrow(self):\n        return self._iter_arrow if 
self.ex_iterable.iter_arrow else None\n\n    @property\n    def is_typed(self):\n        return self.ex_iterable.is_typed\n\n    @property\n    def features(self):\n        return self.ex_iterable.features\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = {\n            \"taken\": 0,\n            \"examples_iterable\": self.ex_iterable._init_state_dict(),\n            \"type\": self.__class__.__name__,\n        }\n        return self._state_dict\n\n    def __iter__(self):\n        taken = self._state_dict[\"taken\"] if self._state_dict else 0\n        if taken >= self.n:\n            return\n        for key_example in self.ex_iterable:\n            if taken + 1 <= self.n:\n                taken += 1\n                if self._state_dict:\n                    self._state_dict[\"taken\"] = taken\n                yield key_example\n            else:\n                break\n\n    def _iter_arrow(self):\n        taken = self._state_dict[\"taken\"] if self._state_dict else 0\n        if taken >= self.n:\n            return\n        for key, pa_table in self.ex_iterable.iter_arrow():\n            if len(pa_table) == 0:\n                continue\n            elif taken + len(pa_table) <= self.n:\n                taken += len(pa_table)\n                if self._state_dict:\n                    self._state_dict[\"taken\"] = taken\n                yield key, pa_table\n            elif taken + 1 <= self.n:\n                length = self.n - taken\n                taken = self.n\n                if self._state_dict:\n                    self._state_dict[\"taken\"] = taken\n                yield key, pa_table.slice(0, length)\n            else:\n                break\n\n    @staticmethod\n    def split_number(num, n):\n        quotient = num // n\n        remainder = num % n\n        result = [quotient] * n\n        for i in range(remainder):\n            result[i] += 1\n        return result\n\n    def shuffle_data_sources(self, generator: 
np.random.Generator) -> \"TakeExamplesIterable\":\n        \"\"\"May not shuffle the wrapped examples iterable since it would take examples from other shards instead.\"\"\"\n        if self.block_sources_order_when_shuffling:\n            return self\n        else:\n            return TakeExamplesIterable(\n                self.ex_iterable.shuffle_data_sources(generator),\n                n=self.n,\n                block_sources_order_when_shuffling=self.block_sources_order_when_shuffling,\n                split_when_sharding=self.split_when_sharding,\n            )\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"TakeExamplesIterable\":\n        \"\"\"Keep only the requested shard.\"\"\"\n        if self.split_when_sharding:\n            return TakeExamplesIterable(\n                self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous),\n                n=self.split_number(self.n, num_shards)[index],\n                block_sources_order_when_shuffling=self.block_sources_order_when_shuffling,\n                split_when_sharding=self.split_when_sharding,\n            )\n        else:\n            return TakeExamplesIterable(\n                self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous),\n                n=self.n,\n                block_sources_order_when_shuffling=self.block_sources_order_when_shuffling,\n                split_when_sharding=self.split_when_sharding,\n            )\n\n    def reshard_data_sources(self) -> \"TakeExamplesIterable\":\n        return TakeExamplesIterable(\n            self.ex_iterable.reshard_data_sources(),\n            n=self.n,\n            block_sources_order_when_shuffling=self.block_sources_order_when_shuffling,\n            split_when_sharding=self.split_when_sharding,\n        )\n\n    @property\n    def num_shards(self) -> int:\n        return self.ex_iterable.num_shards\n\n\ndef _apply_feature_types_on_example(\n    example: dict, 
features: Features, token_per_repo_id: dict[str, Union[str, bool, None]]\n) -> dict:\n    example = dict(example)\n    # add missing columns\n    for column_name in features:\n        if column_name not in example:\n            example[column_name] = None\n    # we encode the example for ClassLabel feature types for example\n    encoded_example = features.encode_example(example)\n    # Decode example for Audio feature, e.g.\n    decoded_example = features.decode_example(encoded_example, token_per_repo_id=token_per_repo_id)\n    return decoded_example\n\n\n@dataclass\nclass FormattingConfig:\n    format_type: Optional[str]\n\n    @property\n    def is_table(self) -> bool:\n        return isinstance(get_formatter(self.format_type), TableFormatter)\n\n    @property\n    def is_tensor(self) -> bool:\n        return isinstance(get_formatter(self.format_type), TensorFormatter)\n\n\nclass FormattedExamplesIterable(_BaseExamplesIterable):\n    def __init__(\n        self,\n        ex_iterable: _BaseExamplesIterable,\n        formatting: Optional[FormattingConfig],\n        features: Optional[Features],\n        token_per_repo_id: dict[str, Union[str, bool, None]],\n        force_convert_to_python: bool = False,\n    ):\n        super().__init__()\n        self.ex_iterable = ex_iterable\n        self._features = features\n        self.formatting = formatting\n        self.token_per_repo_id = token_per_repo_id\n        self.force_convert_to_python = force_convert_to_python\n\n    @property\n    def iter_arrow(self):\n        if self.ex_iterable.iter_arrow and not self.force_convert_to_python:\n            return self._iter_arrow\n\n    @property\n    def is_typed(self):\n        return self.ex_iterable.is_typed or self._features is not None\n\n    @property\n    def features(self):\n        return self._features\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = self.ex_iterable._init_state_dict()\n        return self._state_dict\n\n    def 
__iter__(self):\n        if not self.formatting or self.formatting.is_table:\n            formatter = PythonFormatter(\n                features=self._features if not self.ex_iterable.is_typed else None,\n                token_per_repo_id=self.token_per_repo_id,\n            )\n        else:\n            formatter = get_formatter(\n                self.formatting.format_type,\n                features=self._features if not self.ex_iterable.is_typed else None,\n                token_per_repo_id=self.token_per_repo_id,\n            )\n\n        # It's ok to use _iter_arrow here without fancy state_dict logic since it's\n        # used with RebatchedArrowExamplesIterable with the right batch_size to\n        # never lose examples\n        if self.ex_iterable.iter_arrow:\n            # feature casting (inc column addition) handled within self._iter_arrow()\n            for key, pa_table in self._iter_arrow():\n                batch = formatter.format_batch(pa_table)\n                for example in _batch_to_examples(batch):\n                    yield key, example\n        else:\n            format_dict = (\n                formatter.recursive_tensorize\n                if isinstance(formatter, TensorFormatter)\n                else None  # cast in case features is None\n            )\n            for key, example in self.ex_iterable:\n                # don't apply feature types if already applied by ex_iterable (e.g. 
in case of chained with_format)\n                if self.features and not self.ex_iterable.is_typed:\n                    example = _apply_feature_types_on_example(\n                        example, self.features, token_per_repo_id=self.token_per_repo_id\n                    )\n                if format_dict:\n                    example = format_dict(example)\n                yield key, example\n\n    def _iter_arrow(self) -> Iterator[tuple[Key, pa.Table]]:\n        if not self.features:\n            yield from self.ex_iterable._iter_arrow()\n            return\n        for key, pa_table in self.ex_iterable._iter_arrow():\n            columns = set(pa_table.column_names)\n            schema = self.features.arrow_schema\n            # add missing columns\n            for column_name in self.features:\n                if column_name not in columns:\n                    col = pa.NullArray.from_buffers(pa.null(), len(pa_table), [None])\n                    pa_table = pa_table.append_column(column_name, col)\n            if pa_table.schema != schema:\n                pa_table = cast_table_to_features(pa_table, self.features)\n            yield key, pa_table\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"FormattedExamplesIterable\":\n        \"\"\"Shuffle the wrapped examples iterable.\"\"\"\n        return FormattedExamplesIterable(\n            self.ex_iterable.shuffle_data_sources(generator),\n            features=self.features,\n            token_per_repo_id=self.token_per_repo_id,\n            formatting=self.formatting,\n            force_convert_to_python=self.force_convert_to_python,\n        )\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"FormattedExamplesIterable\":\n        \"\"\"Keep only the requested shard.\"\"\"\n        return FormattedExamplesIterable(\n            self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous),\n            features=self.features,\n      
      token_per_repo_id=self.token_per_repo_id,\n            formatting=self.formatting,\n            force_convert_to_python=self.force_convert_to_python,\n        )\n\n    def reshard_data_sources(self) -> \"FormattedExamplesIterable\":\n        return FormattedExamplesIterable(\n            self.ex_iterable.reshard_data_sources(),\n            features=self.features,\n            token_per_repo_id=self.token_per_repo_id,\n            formatting=self.formatting,\n            force_convert_to_python=self.force_convert_to_python,\n        )\n\n    @property\n    def num_shards(self) -> int:\n        return self.ex_iterable.num_shards\n\n\n@dataclass\nclass DistributedConfig:\n    rank: int\n    world_size: int\n\n\ndef _maybe_add_torch_iterable_dataset_parent_class(cls):\n    \"\"\"Add torch.utils.data.IterableDataset as a parent class if 'torch' is available\"\"\"\n    if config.TORCH_AVAILABLE:\n        import torch.utils.data\n\n        if torch.utils.data.IterableDataset not in cls.__bases__:\n            cls.__bases__ += (torch.utils.data.IterableDataset,)\n\n\ndef _maybe_share_with_torch_persistent_workers(value: Union[int, \"torch.Tensor\"]) -> Union[int, \"torch.Tensor\"]:\n    if config.TORCH_AVAILABLE:\n        import torch\n\n        if isinstance(value, torch.Tensor):\n            return value.share_memory_()\n        else:\n            return torch.tensor(value).share_memory_()\n    else:\n        return value\n\n\nclass IterableColumn:\n    \"\"\"\n    An iterable for a specific column of an [`IterableDataset`].\n\n    Example:\n\n    Iterate on the texts of the \"text\" column of a dataset:\n\n    ```python\n    for text in dataset[\"text\"]:\n        ...\n    ```\n\n    It also works with nested columns:\n\n    ```python\n    for source in dataset[\"metadata\"][\"source\"]:\n        ...\n    ```\n    \"\"\"\n\n    def __init__(self, source: Union[\"IterableDataset\", \"IterableColumn\"], column_name: str):\n        self.source = source\n        
self.column_name = column_name\n\n    def __iter__(self) -> Iterator[Any]:\n        for example in self.source:\n            yield example[self.column_name]\n\n    def __getitem__(self, column_name: str) -> \"IterableColumn\":\n        return IterableColumn(self, column_name)\n\n\nclass IterableDataset(DatasetInfoMixin):\n    \"\"\"A Dataset backed by an iterable.\"\"\"\n\n    def __init__(\n        self,\n        ex_iterable: _BaseExamplesIterable,\n        info: Optional[DatasetInfo] = None,\n        split: Optional[NamedSplit] = None,\n        formatting: Optional[FormattingConfig] = None,\n        distributed: Optional[DistributedConfig] = None,\n        token_per_repo_id: Optional[dict[str, Union[str, bool, None]]] = None,\n    ):\n        info = info.copy() if info is not None else DatasetInfo()\n        DatasetInfoMixin.__init__(self, info=info, split=split)\n\n        self._ex_iterable = copy.copy(ex_iterable)\n        self._formatting = formatting\n        self._distributed = distributed\n        self._token_per_repo_id: dict[str, Union[str, bool, None]] = token_per_repo_id or {}\n        self._epoch: Union[int, \"torch.Tensor\"] = _maybe_share_with_torch_persistent_workers(0)\n        self._starting_state_dict: Optional[dict] = None\n        self.__hffs_cache = HfFileSystem._cache  # keep the cache on pickling (e.g. for dataloader workers)\n        self._prepare_ex_iterable_for_iteration()  # set state_dict\n        _maybe_add_torch_iterable_dataset_parent_class(self.__class__)  # subclass of torch IterableDataset\n\n    @property\n    def num_columns(self) -> Optional[int]:\n        \"\"\"Number of columns in the dataset.\n        This can be None if the dataset has unknown features (e.g. 
after a map() operation).\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\")\n        >>> ds.num_columns\n        2\n        ```\n        \"\"\"\n        return None if self.features is None else len(self.features)\n\n    @property\n    def column_names(self) -> Optional[list[str]]:\n        \"\"\"Names of the columns in the dataset.\n        This can be None if the dataset has unknown features (e.g. after a map() operation).\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\", streaming=True)\n        >>> ds.column_names\n        ['text', 'label']\n        ```\n        \"\"\"\n        return None if self.features is None else list(self.features)\n\n    def state_dict(self) -> dict:\n        \"\"\"Get the current state_dict of the dataset.\n        It corresponds to the state at the latest example it yielded.\n\n        Resuming returns exactly where the checkpoint was saved except in two cases:\n\n        1. examples from shuffle buffers are lost when resuming and the buffers are refilled with new data\n        2. combinations of `.with_format(arrow)` and batched `.map()` may skip one batch.\n\n        Returns:\n            `dict`\n\n        Example:\n\n        ```py\n        >>> from datasets import Dataset, concatenate_datasets\n        >>> ds = Dataset.from_dict({\"a\": range(6)}).to_iterable_dataset(num_shards=3)\n        >>> for idx, example in enumerate(ds):\n        ...     print(example)\n        ...     if idx == 2:\n        ...         state_dict = ds.state_dict()\n        ...         print(\"checkpoint\")\n        ...         break\n        >>> ds.load_state_dict(state_dict)\n        >>> print(f\"restart from checkpoint\")\n        >>> for example in ds:\n        ...     
print(example)\n        ```\n\n        which returns:\n        ```\n        {'a': 0}\n        {'a': 1}\n        {'a': 2}\n        checkpoint\n        restart from checkpoint\n        {'a': 3}\n        {'a': 4}\n        {'a': 5}\n        ```\n\n        ```py\n        >>> from torchdata.stateful_dataloader import StatefulDataLoader\n        >>> ds = load_dataset(\"deepmind/code_contests\", streaming=True, split=\"train\")\n        >>> dataloader = StatefulDataLoader(ds, batch_size=32, num_workers=4)\n        >>> # checkpoint\n        >>> state_dict = dataloader.state_dict()  # uses ds.state_dict() under the hood\n        >>> # resume from checkpoint\n        >>> dataloader.load_state_dict(state_dict)  # uses ds.load_state_dict() under the hood\n        ```\n        \"\"\"\n        return copy.deepcopy(self._state_dict)\n\n    def load_state_dict(self, state_dict: dict) -> None:\n        \"\"\"Load the state_dict of the dataset.\n        The iteration will restart at the next example from when the state was saved.\n\n        Resuming returns exactly where the checkpoint was saved except in two cases:\n\n        1. examples from shuffle buffers are lost when resuming and the buffers are refilled with new data\n        2. combinations of `.with_format(arrow)` and batched `.map()` may skip one batch.\n\n        Example:\n\n        ```py\n        >>> from datasets import Dataset, concatenate_datasets\n        >>> ds = Dataset.from_dict({\"a\": range(6)}).to_iterable_dataset(num_shards=3)\n        >>> for idx, example in enumerate(ds):\n        ...     print(example)\n        ...     if idx == 2:\n        ...         state_dict = ds.state_dict()\n        ...         print(\"checkpoint\")\n        ...         break\n        >>> ds.load_state_dict(state_dict)\n        >>> print(f\"restart from checkpoint\")\n        >>> for example in ds:\n        ...     
print(example)\n        ```\n\n        which returns:\n        ```\n        {'a': 0}\n        {'a': 1}\n        {'a': 2}\n        checkpoint\n        restart from checkpoint\n        {'a': 3}\n        {'a': 4}\n        {'a': 5}\n        ```\n\n        ```py\n        >>> from torchdata.stateful_dataloader import StatefulDataLoader\n        >>> ds = load_dataset(\"deepmind/code_contests\", streaming=True, split=\"train\")\n        >>> dataloader = StatefulDataLoader(ds, batch_size=32, num_workers=4)\n        >>> # checkpoint\n        >>> state_dict = dataloader.state_dict()  # uses ds.state_dict() under the hood\n        >>> # resume from checkpoint\n        >>> dataloader.load_state_dict(state_dict)  # uses ds.load_state_dict() under the hood\n        ```\n        \"\"\"\n        self._starting_state_dict = state_dict\n\n    def __repr__(self):\n        return f\"IterableDataset({{\\n    features: {list(self._info.features.keys()) if self._info.features is not None else 'Unknown'},\\n    num_shards: {self.num_shards}\\n}})\"\n\n    def __getstate__(self):\n        return self.__dict__\n\n    def __setstate__(self, d):\n        self.__dict__ = d\n        # Re-add torch shared memory, since shared memory is not always kept when pickling\n        self._epoch = _maybe_share_with_torch_persistent_workers(self._epoch)\n        # Re-add the cache to keep on pickling (e.g. 
for dataloader workers)\n        self.__hffs_cache = HfFileSystem._cache\n        # Re-add torch iterable dataset as a parent class, since dynamically added parent classes are not kept when pickling\n        _maybe_add_torch_iterable_dataset_parent_class(self.__class__)\n\n    def _head(self, n=5):\n        return next(iter(self.iter(batch_size=n)))\n\n    @property\n    def epoch(self) -> int:\n        return int(self._epoch)\n\n    @property\n    def num_shards(self) -> int:\n        if self._distributed and self._ex_iterable.num_shards % self._distributed.world_size == 0:\n            return self._ex_iterable.num_shards // self._distributed.world_size\n        return self._ex_iterable.num_shards\n\n    @property\n    def n_shards(self) -> int:  # backward compatibility\n        return self.num_shards\n\n    def _iter_pytorch(self):\n        ex_iterable = self._prepare_ex_iterable_for_iteration()\n        # Fix for fsspec when using multiprocess to avoid hanging in the ML training loop. (only required for fsspec >= 0.9.0)\n        # See https://github.com/fsspec/gcsfs/issues/379\n        fsspec.asyn.reset_lock()\n        # check if there aren't too many workers\n        import torch.utils.data\n\n        worker_info = torch.utils.data.get_worker_info()\n        if self._is_main_process() and ex_iterable.num_shards < worker_info.num_workers:\n            logger.warning(\n                f\"Too many dataloader workers: {worker_info.num_workers} (max is dataset.num_shards={ex_iterable.num_shards}). \"\n                f\"Stopping {worker_info.num_workers - ex_iterable.num_shards} dataloader workers.\"\n            )\n            logger.info(\n                f\"To parallelize data loading, we give each process some shards (or data sources) to process. \"\n                f\"Therefore it's unnecessary to have a number of workers greater than dataset.num_shards={ex_iterable.num_shards}. 
\"\n                f\"To enable more parallelism, please split the dataset in more files than {ex_iterable.num_shards} or try `dataset = dataset.reshard()` which may increase `num_shards` depending on the dataset file format.\"\n            )\n        # split workload\n        _log_prefix = f\"node#{self._distributed.rank} \" if self._distributed else \"\"\n        shards_indices = ex_iterable.split_shard_indices_by_worker(\n            num_shards=worker_info.num_workers, index=worker_info.id, contiguous=False\n        )\n        if shards_indices:\n            logger.debug(\n                f\"{_log_prefix}dataloader worker#{worker_info.id}, ': Starting to iterate over {len(shards_indices)}/{ex_iterable.num_shards} shards.\"\n            )\n            ex_iterable = ex_iterable.shard_data_sources(\n                num_shards=worker_info.num_workers, index=worker_info.id, contiguous=False\n            )\n            ex_iterable = shift_ex_examples_rngs(ex_iterable=ex_iterable, value=worker_info.id)\n            self._state_dict = {\n                \"examples_iterable\": ex_iterable._init_state_dict(),\n                \"epoch\": self.epoch,\n            }\n            if self._starting_state_dict and self.epoch == self._starting_state_dict[\"epoch\"]:\n                ex_iterable.load_state_dict(self._starting_state_dict[\"examples_iterable\"])\n\n            if self._formatting and (ex_iterable.iter_arrow or self._formatting.is_table):\n                formatter = get_formatter(self._formatting.format_type, features=self.features)\n                for key, pa_table in ex_iterable.iter_arrow():\n                    yield formatter.format_row(pa_table)\n                return\n            else:\n                for key, example in ex_iterable:\n                    # no need to format thanks to FormattedExamplesIterable\n                    yield example\n            logger.debug(\n                f\"{_log_prefix}dataloader worker#{worker_info.id}, ': Finished 
iterating over {len(shards_indices)}/{ex_iterable.num_shards} shards.\"\n            )\n        else:\n            logger.debug(\n                f\"{_log_prefix}dataloader worker#{worker_info.id}, ': Stopping... Number of dataset shards < num_workers ({ex_iterable.num_shards}<{worker_info.num_workers}).\"\n            )\n\n    def _is_main_process(self):\n        if self._distributed and self._distributed.rank > 0:\n            return False\n        if \"torch\" in sys.modules:\n            import torch.utils.data\n\n            worker_info = torch.utils.data.get_worker_info()\n            if worker_info is not None and worker_info.id > 0:\n                return False\n        return True\n\n    def _prepare_ex_iterable_for_iteration(\n        self, batch_size: int = 1, drop_last_batch: bool = False\n    ) -> _BaseExamplesIterable:\n        ex_iterable = self._ex_iterable\n\n        if self.epoch:\n            ex_iterable = ex_iterable.shuffle_data_sources(np.random.default_rng(self.epoch))\n            ex_iterable = shift_ex_examples_rngs(ex_iterable, self.epoch)\n\n        if self._distributed:\n            rank = self._distributed.rank\n            world_size = self._distributed.world_size\n            if ex_iterable.num_shards % world_size == 0:\n                if self._is_main_process():\n                    num_shards_per_node = ex_iterable.num_shards // world_size\n                    plural = \"s\" if num_shards_per_node > 1 else \"\"\n                    logger.info(\n                        f\"Assigning {num_shards_per_node} shard{plural} (or data source{plural}) of the dataset to each node.\"\n                    )\n                ex_iterable = ex_iterable.shard_data_sources(num_shards=world_size, index=rank, contiguous=False)\n            else:\n                if self._is_main_process():\n                    logger.info(\n                        f\"Assigning 1 out of {world_size} examples of the dataset to each node. 
The others are skipped during the iteration.\"\n                    )\n                    logger.info(\n                        f\"It is more optimized to distribute the dataset shards (or data sources) across nodes. \"\n                        f\"You can do that by using a dataset with number of shards that is a factor of world_size={world_size}. \"\n                        f\"The current dataset has {ex_iterable.num_shards} which is not a factor of {world_size}\"\n                    )\n                ex_iterable = StepExamplesIterable(ex_iterable, step=world_size, offset=rank)\n\n        if ex_iterable.iter_arrow:\n            ex_iterable = RebatchedArrowExamplesIterable(\n                ex_iterable, batch_size=batch_size, drop_last_batch=drop_last_batch\n            )\n        elif self._formatting and self._formatting.is_table:\n            ex_iterable = RebatchedArrowExamplesIterable(\n                ex_iterable, batch_size=batch_size, drop_last_batch=drop_last_batch, force_convert_to_arrow=True\n            )\n\n        if self._formatting or (self.features and ex_iterable.features != self.features):\n            ex_iterable = FormattedExamplesIterable(\n                ex_iterable,\n                formatting=self._formatting,\n                features=self.features,\n                token_per_repo_id=self._token_per_repo_id,\n            )\n\n        self._state_dict = {\n            \"examples_iterable\": ex_iterable._init_state_dict(),\n            \"epoch\": self.epoch,\n        }\n        if self._starting_state_dict and self.epoch == self._starting_state_dict[\"epoch\"]:\n            ex_iterable.load_state_dict(self._starting_state_dict[\"examples_iterable\"])\n        return ex_iterable\n\n    def __iter__(self):\n        if \"torch\" in sys.modules:\n            import torch.utils.data\n\n            worker_info = torch.utils.data.get_worker_info()\n            if isinstance(self, torch.utils.data.IterableDataset) and worker_info is not None:\n  
              # We're a torch.utils.data.IterableDataset in a PyTorch worker process\n                yield from self._iter_pytorch()\n                return\n\n        ex_iterable = self._prepare_ex_iterable_for_iteration()\n        if self._formatting and (ex_iterable.iter_arrow or self._formatting.is_table):\n            formatter = get_formatter(self._formatting.format_type, features=self.features)\n            for key, pa_table in ex_iterable.iter_arrow():\n                yield formatter.format_row(pa_table)\n            return\n\n        for key, example in ex_iterable:\n            # no need to format thanks to FormattedExamplesIterable\n            yield example\n\n    def iter(self, batch_size: int, drop_last_batch: bool = False):\n        \"\"\"Iterate through the batches of size `batch_size`.\n\n        Args:\n            batch_size (:obj:`int`): size of each batch to yield.\n            drop_last_batch (:obj:`bool`, default `False`): Whether a last batch smaller than the batch_size should be\n                dropped\n        \"\"\"\n\n        if self._formatting:\n            formatter = get_formatter(self._formatting.format_type, features=self.features)\n            format_dict = formatter.recursive_tensorize if isinstance(formatter, TensorFormatter) else None\n        else:\n            format_dict = None\n\n        ex_iterable = self._prepare_ex_iterable_for_iteration(batch_size=batch_size, drop_last_batch=drop_last_batch)\n        if self._formatting and (ex_iterable.iter_arrow or self._formatting.is_table):\n            for key, pa_table in ex_iterable.iter_arrow():\n                yield formatter.format_batch(pa_table)\n            return\n\n        iterator = iter(ex_iterable)\n        for key, example in iterator:\n            # If batched, first build the batch\n            examples = [example] + [example for key, example in islice(iterator, batch_size - 1)]\n            if drop_last_batch and len(examples) < batch_size:  # ignore last 
batch\n                return\n            batch = _examples_to_batch(examples)\n            # we need to format here in case we need to stack tensors together\n            yield format_dict(batch) if format_dict else batch\n\n    def __getitem__(self, column_name: str) -> IterableColumn:\n        return IterableColumn(self, column_name)\n\n    @staticmethod\n    def from_generator(\n        generator: Callable,\n        features: Optional[Features] = None,\n        gen_kwargs: Optional[dict] = None,\n        split: NamedSplit = Split.TRAIN,\n    ) -> \"IterableDataset\":\n        \"\"\"Create an Iterable Dataset from a generator.\n\n        Args:\n            generator (`Callable`):\n                A generator function that `yields` examples.\n            features (`Features`, *optional*):\n                Dataset features.\n            gen_kwargs(`dict`, *optional*):\n                Keyword arguments to be passed to the `generator` callable.\n                You can define a sharded iterable dataset by passing the list of shards in `gen_kwargs`.\n                This can be used to improve shuffling and when iterating over the dataset with multiple workers.\n            split ([`NamedSplit`], defaults to `Split.TRAIN`):\n                Split name to be assigned to the dataset.\n\n                <Added version=\"2.21.0\"/>\n        Returns:\n            [`IterableDataset`]\n\n        Example:\n\n        ```py\n        >>> def gen():\n        ...     yield {\"text\": \"Good\", \"label\": 0}\n        ...     yield {\"text\": \"Bad\", \"label\": 1}\n        ...\n        >>> ds = IterableDataset.from_generator(gen)\n        ```\n\n        ```py\n        >>> def gen(shards):\n        ...     for shard in shards:\n        ...         with open(shard) as f:\n        ...             for line in f:\n        ...                 
yield {\"line\": line}\n        ...\n        >>> shards = [f\"data{i}.txt\" for i in range(32)]\n        >>> ds = IterableDataset.from_generator(gen, gen_kwargs={\"shards\": shards})\n        >>> ds = ds.shuffle(seed=42, buffer_size=10_000)  # shuffles the shards order + uses a shuffle buffer\n        >>> from torch.utils.data import DataLoader\n        >>> dataloader = DataLoader(ds.with_format(\"torch\"), num_workers=4)  # give each worker a subset of 32/4=8 shards\n        ```\n        \"\"\"\n        from .io.generator import GeneratorDatasetInputStream\n\n        return GeneratorDatasetInputStream(\n            generator=generator, features=features, gen_kwargs=gen_kwargs, streaming=True, split=split\n        ).read()\n\n    @staticmethod\n    def from_spark(\n        df: \"pyspark.sql.DataFrame\",\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        **kwargs,\n    ) -> \"IterableDataset\":\n        \"\"\"Create an IterableDataset from Spark DataFrame. 
The dataset is streamed to the driver in batches.\n\n        Args:\n            df (`pyspark.sql.DataFrame`):\n                The DataFrame containing the desired data.\n            split (`NamedSplit`, *optional*):\n                Split name to be assigned to the dataset.\n            features (`Features`, *optional*):\n                Dataset features.\n\n        Returns:\n            [`IterableDataset`]\n\n        Example:\n\n        ```py\n        >>> df = spark.createDataFrame(\n        >>>     data=[[1, \"Elia\"], [2, \"Teo\"], [3, \"Fang\"]],\n        >>>     columns=[\"id\", \"name\"],\n        >>> )\n        >>> ds = IterableDataset.from_spark(df)\n        ```\n        \"\"\"\n        from .io.spark import SparkDatasetReader\n\n        if sys.platform == \"win32\":\n            raise OSError(\"IterableDataset.from_spark is not currently supported on Windows\")\n\n        return SparkDatasetReader(\n            df,\n            split=split,\n            features=features,\n            streaming=True,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_file(filename: str) -> \"IterableDataset\":\n        \"\"\"Instantiate a IterableDataset from Arrow table at filename.\n\n        Args:\n            filename (`str`):\n                File name of the dataset.\n\n        Returns:\n            [`IterableDataset`]\n        \"\"\"\n        pa_table_schema = read_schema_from_file(filename)\n        inferred_features = Features.from_arrow_schema(pa_table_schema)\n        ex_iterable = ArrowExamplesIterable(Dataset._generate_tables_from_cache_file, kwargs={\"filename\": filename})\n        return IterableDataset(ex_iterable=ex_iterable, info=DatasetInfo(features=inferred_features))\n\n    @classmethod\n    def from_pandas(\n        cls,\n        df: pd.DataFrame,\n        features: Optional[Features] = None,\n        info: Optional[DatasetInfo] = None,\n        split: Optional[NamedSplit] = None,\n        preserve_index: Optional[bool] = 
None,\n        num_shards: Optional[int] = 1,\n    ) -> \"IterableDataset\":\n        \"\"\"\n        Convert `pandas.DataFrame` to a `pyarrow.Table` to create an [`IterableDataset`].\n\n        The column types in the resulting Arrow Table are inferred from the dtypes of the `pandas.Series` in the\n        DataFrame. In the case of non-object Series, the NumPy dtype is translated to its Arrow equivalent. In the\n        case of `object`, we need to guess the datatype by looking at the Python objects in this Series.\n\n        Be aware that Series of the `object` dtype don't carry enough information to always lead to a meaningful Arrow\n        type. In the case that we cannot infer a type, e.g. because the DataFrame is of length 0 or the Series only\n        contains `None/nan` objects, the type is set to `null`. This behavior can be avoided by constructing explicit\n        features and passing it to this function.\n\n        Important: a dataset created with from_pandas() lives in memory.\n        This may change in the future, but in the meantime if you\n        want to reduce memory usage you should write it on disk\n        and reload using e.g. 
to_parquet / from_parquet.\n\n        Args:\n            df (`pandas.DataFrame`):\n                Dataframe that contains the dataset.\n            features ([`Features`], *optional*):\n                Dataset features.\n            info (`DatasetInfo`, *optional*):\n                Dataset information, like description, citation, etc.\n            split (`NamedSplit`, *optional*):\n                Name of the dataset split.\n            preserve_index (`bool`, *optional*):\n                Whether to store the index as an additional column in the resulting Dataset.\n                The default of `None` will store the index as a column, except for `RangeIndex` which is stored as metadata only.\n                Use `preserve_index=True` to force it to be stored as a column.\n            num_shards (`int`, default to `1`):\n                Number of shards to define when instantiating the iterable dataset. This is especially useful for big datasets to be able to shuffle properly,\n                and also to enable fast parallel loading using a PyTorch DataLoader or in distributed setups for example.\n\n        Returns:\n            [`IterableDataset`]\n\n        Example:\n\n        ```py\n        >>> ds = IterableDataset.from_pandas(df)\n        ```\n        \"\"\"\n        return Dataset.from_pandas(\n            df,\n            features=features,\n            info=info,\n            split=split,\n            preserve_index=preserve_index,\n        ).to_iterable_dataset(num_shards=num_shards)\n\n    @classmethod\n    def from_polars(\n        cls,\n        df: Union[\"pl.DataFrame\", \"pl.LazyFrame\"],\n        features: Optional[Features] = None,\n        info: Optional[DatasetInfo] = None,\n        split: Optional[NamedSplit] = None,\n    ) -> \"IterableDataset\":\n        \"\"\"\n        Create an IterableDataset from a polars DataFrame or LazyFrame.\n\n        Iterating over the dataset is mostly zero copy.\n        Under the hood, the dataset iterates over 
the polars DataFrame batches/slices.\n\n        Data types that do copy:\n            * CategoricalType\n\n        Args:\n            df (`polars.DataFrame`): DataFrame to convert to Arrow Table\n            features (`Features`, optional): Dataset features.\n            info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.\n            split (`NamedSplit`, optional): Name of the dataset split.\n\n        Returns:\n            [`IterableDataset`]\n\n        Examples:\n        ```py\n        >>> ds = IterableDataset.from_polars(df)\n        ```\n        \"\"\"\n        import polars as pl\n\n        if info is not None and features is not None and info.features != features:\n            raise ValueError(\n                f\"Features specified in `features` and `info.features` can't be different:\\n{features}\\n{info.features}\"\n            )\n        features = features if features is not None else info.features if info is not None else None\n        if features is not None:\n            features = _fix_for_backward_compatible_features(features)\n        if info is None:\n            info = DatasetInfo()\n        info.features = features or Features.from_arrow_schema(\n            (df.collect_schema() if isinstance(df, pl.LazyFrame) else df.schema).to_arrow()\n        )\n        return IterableDataset(\n            ArrowExamplesIterable(_generate_tables_from_polars, kwargs={\"df\": df}),\n            info=info,\n            split=split,\n        )\n\n    @classmethod\n    def from_dict(\n        cls,\n        mapping: dict,\n        features: Optional[Features] = None,\n        info: Optional[DatasetInfo] = None,\n        split: Optional[NamedSplit] = None,\n        num_shards: Optional[int] = 1,\n    ) -> \"IterableDataset\":\n        \"\"\"\n        Convert `dict` to a `pyarrow.Table` to create an [`IterableDataset`].\n\n        Important: a dataset created with from_dict() lives in memory.\n        This may change in the future, 
but in the meantime if you\n        want to reduce memory usage you should write it back on disk\n        and reload using e.g. to_parquet / from_parquet.\n\n        Args:\n            mapping (`Mapping`):\n                Mapping of strings to Arrays or Python lists.\n            features ([`Features`], *optional*):\n                Dataset features.\n            info (`DatasetInfo`, *optional*):\n                Dataset information, like description, citation, etc.\n            split (`NamedSplit`, *optional*):\n                Name of the dataset split.\n            num_shards (`int`, default to `1`):\n                Number of shards to define when instantiating the iterable dataset. This is especially useful for big datasets to be able to shuffle properly,\n                and also to enable fast parallel loading using a PyTorch DataLoader or in distributed setups for example.\n\n        Returns:\n            [`IterableDataset`]\n        \"\"\"\n        return Dataset.from_dict(mapping, features=features, info=info, split=split).to_iterable_dataset(\n            num_shards=num_shards\n        )\n\n    @classmethod\n    def from_list(\n        cls,\n        mapping: list[dict],\n        features: Optional[Features] = None,\n        info: Optional[DatasetInfo] = None,\n        split: Optional[NamedSplit] = None,\n        num_shards: Optional[int] = 1,\n    ) -> \"IterableDataset\":\n        \"\"\"\n        Convert a list of dicts to a `pyarrow.Table` to create an [`IterableDataset`]`.\n\n        Note that the keys of the first entry will be used to determine the dataset columns,\n        regardless of what is passed to features.\n\n        Important: a dataset created with from_list() lives in memory.\n        This may change in the future, but in the meantime if you\n        want to reduce memory usage you should write it back on disk\n        and reload using e.g. 
from_parquet / to_parquet.\n\n        Args:\n            mapping (`List[dict]`): A list of mappings of strings to row values.\n            features (`Features`, optional): Dataset features.\n            info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.\n            split (`NamedSplit`, optional): Name of the dataset split.\n            num_shards (`int`, default to `1`):\n                Number of shards to define when instantiating the iterable dataset. This is especially useful for big datasets to be able to shuffle properly,\n                and also to enable fast parallel loading using a PyTorch DataLoader or in distributed setups for example.\n\n        Returns:\n            [`IterableDataset`]\n        \"\"\"\n        return Dataset.from_list(\n            mapping,\n            features=features,\n            info=info,\n            split=split,\n        ).to_iterable_dataset(num_shards=num_shards)\n\n    @staticmethod\n    def from_csv(\n        path_or_paths: Union[PathLike, list[PathLike]],\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        keep_in_memory: bool = False,\n        **kwargs,\n    ) -> \"IterableDataset\":\n        \"\"\"Create an IterableDataset from CSV file(s).\n\n        Args:\n            path_or_paths (`path-like` or list of `path-like`):\n                Path(s) of the CSV file(s).\n            split ([`NamedSplit`], *optional*):\n                Split name to be assigned to the dataset.\n            features ([`Features`], *optional*):\n                Dataset features.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`pandas.read_csv`].\n\n        Returns:\n            [`IterableDataset`]\n\n        Example:\n\n        ```py\n        >>> ds = 
IterableDataset.from_csv('path/to/dataset.csv')\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.csv import CsvDatasetReader\n\n        return CsvDatasetReader(\n            path_or_paths,\n            split=split,\n            features=features,\n            keep_in_memory=keep_in_memory,\n            streaming=True,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_json(\n        path_or_paths: Union[PathLike, list[PathLike]],\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        keep_in_memory: bool = False,\n        field: Optional[str] = None,\n        **kwargs,\n    ) -> \"IterableDataset\":\n        \"\"\"Create an IterableDataset from JSON or JSON Lines file(s).\n\n        Args:\n            path_or_paths (`path-like` or list of `path-like`):\n                Path(s) of the JSON or JSON Lines file(s).\n            split ([`NamedSplit`], *optional*):\n                Split name to be assigned to the dataset.\n            features ([`Features`], *optional*):\n                 Dataset features.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            field (`str`, *optional*):\n                Field name of the JSON file where the dataset is contained in.\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`JsonConfig`].\n\n        Returns:\n            [`IterableDataset`]\n\n        Example:\n\n        ```py\n        >>> ds = IterableDataset.from_json('path/to/dataset.json')\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.json import JsonDatasetReader\n\n        return JsonDatasetReader(\n            path_or_paths,\n            split=split,\n            features=features,\n            keep_in_memory=keep_in_memory,\n            field=field,\n            
streaming=True,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_parquet(\n        path_or_paths: Union[PathLike, list[PathLike]],\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        keep_in_memory: bool = False,\n        columns: Optional[list[str]] = None,\n        filters: Optional[Union[pds.Expression, list[tuple], list[list[tuple]]]] = None,\n        fragment_scan_options: Optional[pds.ParquetFragmentScanOptions] = None,\n        on_bad_files: Literal[\"error\", \"warn\", \"skip\"] = \"error\",\n        **kwargs,\n    ) -> \"IterableDataset\":\n        \"\"\"Create an IterableDataset from Parquet file(s).\n\n        Args:\n            path_or_paths (`path-like` or list of `path-like`):\n                Path(s) of the Parquet file(s).\n            split (`NamedSplit`, *optional*):\n                Split name to be assigned to the dataset.\n            features (`Features`, *optional*):\n                Dataset features.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            columns (`List[str]`, *optional*):\n                If not `None`, only these columns will be read from the file.\n                A column name may be a prefix of a nested field, e.g. 'a' will select\n                'a.b', 'a.c', and 'a.d.e'.\n            filters (`Union[pyarrow.dataset.Expression, list[tuple], list[list[tuple]]]`, *optional*):\n                Return only the rows matching the filter.\n                If possible the predicate will be pushed down to exploit the partition information\n                or internal metadata found in the data source, e.g. 
Parquet statistics.\n                Otherwise filters the loaded RecordBatches before yielding them.\n            fragment_scan_options (`pyarrow.dataset.ParquetFragmentScanOptions`, *optional*)\n                Scan-specific options for Parquet fragments.\n                This is especially useful to configure buffering and caching.\n\n                <Added version=\"4.2.0\"/>\n            on_bad_files (`Literal[\"error\", \"warn\", \"skip\"]`, *optional*, defaults to \"error\")\n                Specify what to do upon encountering a bad file (a file that can't be read). Allowed values are :\n                * 'error', raise an Exception when a bad file is encountered.\n                * 'warn', raise a warning when a bad file is encountered and skip that file.\n                * 'skip', skip bad files without raising or warning when they are encountered.\n\n                <Added version=\"4.2.0\"/>\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`ParquetConfig`].\n\n        Returns:\n            [`IterableDataset`]\n\n        Example:\n\n        ```py\n        >>> ds = IterableDataset.from_parquet('path/to/dataset.parquet')\n        ```\n\n        Load a subset of columns:\n\n        ```python\n        >>> ds = IterableDataset.from_parquet('path/to/dataset.parquet', columns=[\"col_0\", \"col_1\"])\n        ```\n\n        Efficiently filter data, possibly skipping entire files or row groups:\n\n        ```python\n        >>> filters = [(\"col_0\", \"==\", 0)]\n        >>> ds = IterableDataset.from_parquet(parquet_files_list, filters=filters)\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.parquet import ParquetDatasetReader\n\n        return ParquetDatasetReader(\n            path_or_paths,\n            split=split,\n            features=features,\n            keep_in_memory=keep_in_memory,\n            columns=columns,\n            streaming=True,\n  
          filters=filters,\n            fragment_scan_options=fragment_scan_options,\n            on_bad_files=on_bad_files,\n            **kwargs,\n        ).read()\n\n    @staticmethod\n    def from_text(\n        path_or_paths: Union[PathLike, list[PathLike]],\n        split: Optional[NamedSplit] = None,\n        features: Optional[Features] = None,\n        keep_in_memory: bool = False,\n        keep_linebreaks: bool = False,\n        sample_by: Literal[\"line\", \"paragraph\", \"document\"] = \"line\",\n        **kwargs,\n    ) -> \"IterableDataset\":\n        \"\"\"Create an IterableDataset from text file(s).\n\n        Args:\n            path_or_paths (`path-like` or list of `path-like`):\n                Path(s) of the text file(s).\n            split (`NamedSplit`, *optional*):\n                Split name to be assigned to the dataset.\n            features (`Features`, *optional*):\n                Dataset features.\n            keep_in_memory (`bool`, defaults to `False`):\n                Whether to copy the data in-memory.\n            keep_linebreaks (`bool`, defaults to False):\n                Whether to keep line breaks.\n            sample_by (`Literal[\"line\", \"paragraph\", \"document\"]`, defaults to \"line\"):\n                Whether to load data per line, paragraph or document.\n                By default one row in the dataset = one line.\n            **kwargs (additional keyword arguments):\n                Keyword arguments to be passed to [`TextConfig`].\n\n        Returns:\n            [`IterableDataset`]\n\n        Example:\n\n        ```py\n        >>> ds = IterableDataset.from_text('path/to/dataset.txt')\n        ```\n        \"\"\"\n        # Dynamic import to avoid circular dependency\n        from .io.text import TextDatasetReader\n\n        return TextDatasetReader(\n            path_or_paths,\n            split=split,\n            features=features,\n            keep_in_memory=keep_in_memory,\n            streaming=True,\n    
        keep_linebreaks=keep_linebreaks,\n            sample_by=sample_by,\n            **kwargs,\n        ).read()\n\n    def with_format(\n        self,\n        type: Optional[str] = None,\n    ) -> \"IterableDataset\":\n        \"\"\"\n        Return a dataset with the specified format.\n\n        Args:\n\n            type (`str`, *optional*):\n                Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'jax', 'arrow', 'pandas', 'polars']`.\n                `None` means it returns python objects (default).\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> from transformers import AutoTokenizer\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"validation\", streaming=True)\n        >>> tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n        >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True)\n        >>> ds = ds.with_format(\"torch\")\n        >>> next(iter(ds))\n        {'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',\n         'label': tensor(1),\n         'input_ids': tensor([  101, 18027, 16310, 16001,  1103,  9321,   178, 11604,  7235,  6617,\n                1742,  2165,  2820,  1206,  6588, 22572, 12937,  1811,  2153,  1105,\n                1147, 12890, 19587,  6463,  1105, 15026,  1482,   119,   102,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n                    0,     0,     0,     
0]),\n         'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),\n         'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n                1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}\n        ```\n        \"\"\"\n        type = get_format_type_from_alias(type)\n        # TODO(QL): add format_kwargs\n        # TODO(QL): add format_columns and return_all_columns\n        # TODO(QL): add pandas format\n        return IterableDataset(\n            ex_iterable=self._ex_iterable,\n            info=self._info.copy(),\n            split=self._split,\n            formatting=FormattingConfig(format_type=type),\n            distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def map(\n        self,\n        function: Optional[Callable] = None,\n        with_indices: bool = False,\n        input_columns: Optional[Union[str, list[str]]] = None,\n        batched: bool = False,\n        batch_size: Optional[int] = 1000,\n        drop_last_batch: bool = False,\n        remove_columns: Optional[Union[str, list[str]]] = None,\n        features: Optional[Features] = None,\n        fn_kwargs: Optional[dict] = None,\n    ) -> \"IterableDataset\":\n        \"\"\"\n        Apply a function to all the examples in the iterable dataset (individually or in batches) and update them.\n        If your function returns a column that already exists, then it overwrites it.\n        The function is applied on-the-fly on the examples when 
iterating over the dataset.\n\n        You can specify whether the function should be batched or not with the `batched` parameter:\n\n        - If batched is `False`, then the function takes 1 example in and should return 1 example.\n          An example is a dictionary, e.g. `{\"text\": \"Hello there !\"}`.\n        - If batched is `True` and `batch_size` is 1, then the function takes a batch of 1 example as input and can return a batch with 1 or more examples.\n          A batch is a dictionary, e.g. a batch of 1 example is {\"text\": [\"Hello there !\"]}.\n        - If batched is `True` and `batch_size` is `n` > 1, then the function takes a batch of `n` examples as input and can return a batch with `n` examples, or with an arbitrary number of examples.\n          Note that the last batch may have less than `n` examples.\n          A batch is a dictionary, e.g. a batch of `n` examples is `{\"text\": [\"Hello there !\"] * n}`.\n\n        If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls.\n        It is recommended to use an `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.\n\n        Args:\n            function (`Callable`, *optional*, defaults to `None`):\n                Function applied on-the-fly on the examples when you iterate on the dataset.\n                It must have one of the following signatures:\n\n                - `function(example: Dict[str, Any]) -> Dict[str, Any]` if `batched=False` and `with_indices=False`\n                - `function(example: Dict[str, Any], idx: int) -> Dict[str, Any]` if `batched=False` and `with_indices=True`\n                - `function(batch: Dict[str, List]) -> Dict[str, List]` if `batched=True` and `with_indices=False`\n                - `function(batch: Dict[str, List], indices: List[int]) -> Dict[str, List]` if `batched=True` and `with_indices=True`\n\n                For 
advanced usage, the function can also return a `pyarrow.Table`.\n                If the function is asynchronous, then `map` will run your function in parallel.\n                Moreover if your function returns nothing (`None`), then `map` will run your function and return the dataset unchanged.\n                If no function is provided, default to identity function: `lambda x: x`.\n            with_indices (`bool`, defaults to `False`):\n                Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx[, rank]): ...`.\n            input_columns (`Optional[Union[str, List[str]]]`, defaults to `None`):\n                The columns to be passed into `function`\n                as positional arguments. If `None`, a dict mapping to all formatted columns is passed as one argument.\n            batched (`bool`, defaults to `False`):\n                Provide batch of examples to `function`.\n            batch_size (`int`, *optional*, defaults to `1000`):\n                Number of examples per batch provided to `function` if `batched=True`.\n                `batch_size <= 0` or `batch_size == None` then provide the full dataset as a single batch to `function`.\n            drop_last_batch (`bool`, defaults to `False`):\n                Whether a last batch smaller than the batch_size should be\n                dropped instead of being processed by the function.\n            remove_columns (`[List[str]]`, *optional*, defaults to `None`):\n                Remove a selection of columns while doing the mapping.\n                Columns will be removed before updating the examples with the output of `function`, i.e. 
if `function` is adding\n                columns with names in `remove_columns`, these columns will be kept.\n            features (`[Features]`, *optional*, defaults to `None`):\n                Feature types of the resulting dataset.\n            fn_kwargs (`Dict`, *optional*, default `None`):\n                Keyword arguments to be passed to `function`.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\", streaming=True)\n        >>> def add_prefix(example):\n        ...     example[\"text\"] = \"Review: \" + example[\"text\"]\n        ...     return example\n        >>> ds = ds.map(add_prefix)\n        >>> list(ds.take(3))\n        [{'label': 1,\n         'text': 'Review: the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},\n         {'label': 1,\n         'text': 'Review: the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . 
tolkien\\'s middle-earth .'},\n         {'label': 1, 'text': 'Review: effective but too-tepid biopic'}]\n        ```\n        \"\"\"\n        if isinstance(input_columns, str):\n            input_columns = [input_columns]\n        if isinstance(remove_columns, str):\n            remove_columns = [remove_columns]\n        if function is None:\n            function = identity_func\n        if fn_kwargs is None:\n            fn_kwargs = {}\n        if features is not None:\n            features = _fix_for_backward_compatible_features(features)\n\n        ex_iterable = self._ex_iterable\n        # no need to apply features if ex_iterable is typed and if there was no cast_column()\n        input_features = (\n            None\n            if (ex_iterable.is_typed and (self._info.features is None or self._info.features == ex_iterable.features))\n            else self._info.features\n        )\n\n        if self._formatting and self._formatting.is_table:\n            # apply formatting before iter_arrow to keep map examples iterable happy\n            ex_iterable = FormattedExamplesIterable(\n                ex_iterable,\n                formatting=copy.deepcopy(self._formatting),\n                features=input_features,\n                token_per_repo_id=self._token_per_repo_id,\n            )\n            ex_iterable = RebatchedArrowExamplesIterable(\n                ex_iterable,\n                batch_size=batch_size if batched else 1,\n                drop_last_batch=drop_last_batch,\n                force_convert_to_arrow=True,\n            )\n        else:\n            if self._formatting and self._ex_iterable.iter_arrow:\n                ex_iterable = RebatchedArrowExamplesIterable(\n                    self._ex_iterable, batch_size=batch_size if batched else 1, drop_last_batch=drop_last_batch\n                )\n            if self._formatting or input_features:\n                # apply formatting after iter_arrow to avoid re-encoding the examples\n               
 ex_iterable = FormattedExamplesIterable(\n                    ex_iterable,\n                    formatting=copy.deepcopy(self._formatting),\n                    features=input_features,\n                    token_per_repo_id=self._token_per_repo_id,\n                    force_convert_to_python=True,\n                )\n\n        ex_iterable = MappedExamplesIterable(\n            ex_iterable,\n            function=function,\n            with_indices=with_indices,\n            input_columns=input_columns,\n            batched=batched,\n            batch_size=batch_size,\n            drop_last_batch=drop_last_batch,\n            remove_columns=remove_columns,\n            fn_kwargs=fn_kwargs,\n            formatting=self._formatting,\n            features=features,\n        )\n        info = self.info.copy()\n        info.features = features\n        return IterableDataset(\n            ex_iterable=ex_iterable,\n            info=info,\n            split=self._split,\n            formatting=self._formatting,\n            distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def filter(\n        self,\n        function: Optional[Callable] = None,\n        with_indices=False,\n        input_columns: Optional[Union[str, list[str]]] = None,\n        batched: bool = False,\n        batch_size: Optional[int] = 1000,\n        fn_kwargs: Optional[dict] = None,\n    ) -> \"IterableDataset\":\n        \"\"\"Apply a filter function to all the elements so that the dataset only includes examples according to the filter function.\n        The filtering is done on-the-fly when iterating over the dataset.\n\n        If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simultaneous calls (configurable).\n        It is recommended to use an `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.\n\n        
Args:\n            function (`Callable`):\n                Callable with one of the following signatures:\n\n                - `function(example: Dict[str, Any]) -> bool` if `with_indices=False, batched=False`\n                - `function(example: Dict[str, Any], indices: int) -> bool` if `with_indices=True, batched=False`\n                - `function(example: Dict[str, List]) -> List[bool]` if `with_indices=False, batched=True`\n                - `function(example: Dict[str, List], indices: List[int]) -> List[bool]` if `with_indices=True, batched=True`\n\n                If the function is asynchronous, then `filter` will run your function in parallel.\n                If no function is provided, defaults to an always True function: `lambda x: True`.\n            with_indices (`bool`, defaults to `False`):\n                Provide example indices to `function`. Note that in this case the signature of `function` should be `def function(example, idx): ...`.\n            input_columns (`str` or `List[str]`, *optional*):\n                The columns to be passed into `function` as\n                positional arguments. 
If `None`, a dict mapping to all formatted columns is passed as one argument.\n            batched (`bool`, defaults to `False`):\n                Provide batch of examples to `function`.\n            batch_size (`int`, *optional*, default `1000`):\n                Number of examples per batch provided to `function` if `batched=True`.\n            fn_kwargs (`Dict`, *optional*, default `None`):\n                Keyword arguments to be passed to `function`.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\", streaming=True)\n        >>> ds = ds.filter(lambda x: x[\"label\"] == 0)\n        >>> list(ds.take(3))\n        [{'label': 0, 'movie_review': 'simplistic , silly and tedious .'},\n         {'label': 0,\n         'movie_review': \"it's so laddish and juvenile , only teenage boys could possibly find it funny .\"},\n         {'label': 0,\n         'movie_review': 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable .'}]\n        ```\n        \"\"\"\n        if isinstance(input_columns, str):\n            input_columns = [input_columns]\n\n        # We need the examples to be decoded for certain feature types like Image or Audio,\n        # format and type before filtering\n        ex_iterable = self._ex_iterable\n        if self._info.features or self._formatting:\n            ex_iterable = FormattedExamplesIterable(\n                ex_iterable,\n                formatting=self._formatting,\n                features=ex_iterable.features if ex_iterable.is_typed else self._info.features,\n                token_per_repo_id=self._token_per_repo_id,\n            )\n\n        ex_iterable = FilteredExamplesIterable(\n            ex_iterable,\n            function=function,\n            with_indices=with_indices,\n            input_columns=input_columns,\n          
  batched=batched,\n            batch_size=batch_size,\n            fn_kwargs=fn_kwargs,\n            formatting=self._formatting,\n        )\n        return IterableDataset(\n            ex_iterable=ex_iterable,\n            info=self._info,\n            split=self._split,\n            formatting=self._formatting,\n            distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def shuffle(\n        self, seed=None, generator: Optional[np.random.Generator] = None, buffer_size: int = 1000\n    ) -> \"IterableDataset\":\n        \"\"\"\n        Randomly shuffles the elements of this dataset.\n\n        This dataset fills a buffer with `buffer_size` elements, then randomly samples elements from this buffer,\n        replacing the selected elements with new elements. For perfect shuffling, a buffer size greater than or\n        equal to the full size of the dataset is required.\n\n        For instance, if your dataset contains 10,000 elements but `buffer_size` is set to 1000, then `shuffle` will\n        initially select a random element from only the first 1000 elements in the buffer. Once an element is\n        selected, its space in the buffer is replaced by the next (i.e. 
1,001-st) element,\n        maintaining the 1000 element buffer.\n\n        If the dataset is made of several shards, it also does shuffle the order of the shards.\n        However if the order has been fixed by using [`~datasets.IterableDataset.skip`] or [`~datasets.IterableDataset.take`]\n        then the order of the shards is kept unchanged.\n\n        Args:\n            seed (`int`, *optional*, defaults to `None`):\n                Random seed that will be used to shuffle the dataset.\n                It is used to sample from the shuffle buffer and also to shuffle the data shards.\n            generator (`numpy.random.Generator`, *optional*):\n                Numpy random Generator to use to compute the permutation of the dataset rows.\n                If `generator=None` (default), uses `np.random.default_rng` (the default BitGenerator (PCG64) of NumPy).\n            buffer_size (`int`, defaults to `1000`):\n                Size of the buffer.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\", streaming=True)\n        >>> list(ds.take(3))\n        [{'label': 1,\n         'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},\n         {'label': 1,\n         'text': 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . 
tolkien\\'s middle-earth .'},\n         {'label': 1, 'text': 'effective but too-tepid biopic'}]\n        >>> shuffled_ds = ds.shuffle(seed=42)\n        >>> list(shuffled_ds.take(3))\n        [{'label': 1,\n         'text': \"a sports movie with action that's exciting on the field and a story you care about off it .\"},\n         {'label': 1,\n         'text': 'at its best , the good girl is a refreshingly adult take on adultery . . .'},\n         {'label': 1,\n         'text': \"sam jones became a very lucky filmmaker the day wilco got dropped from their record label , proving that one man's ruin may be another's fortune .\"}]\n        ```\n        \"\"\"\n        if generator is None:\n            generator = np.random.default_rng(seed)\n        else:\n            generator = deepcopy(generator)\n        return IterableDataset(\n            BufferShuffledExamplesIterable(\n                RebatchedArrowExamplesIterable(self._ex_iterable.shuffle_data_sources(generator), batch_size=1)\n                if self._ex_iterable.iter_arrow\n                else self._ex_iterable.shuffle_data_sources(generator),\n                buffer_size=buffer_size,\n                generator=generator,\n            ),\n            info=self._info.copy(),\n            split=self._split,\n            formatting=self._formatting,\n            distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def set_epoch(self, epoch: int):\n        self._epoch += epoch - self._epoch  # update torch value in shared memory in-place\n\n    def skip(self, n: int) -> \"IterableDataset\":\n        \"\"\"\n        Create a new [`IterableDataset`] that skips the first `n` elements.\n\n        Args:\n            n (`int`):\n                Number of elements to skip.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\", 
streaming=True)\n        >>> list(ds.take(3))\n        [{'label': 1,\n         'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},\n         {'label': 1,\n         'text': 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . tolkien\\'s middle-earth .'},\n         {'label': 1, 'text': 'effective but too-tepid biopic'}]\n        >>> ds = ds.skip(1)\n        >>> list(ds.take(3))\n        [{'label': 1,\n         'text': 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . tolkien\\'s middle-earth .'},\n         {'label': 1, 'text': 'effective but too-tepid biopic'},\n         {'label': 1,\n         'text': 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .'}]\n        ```\n        \"\"\"\n        ex_iterable = SkipExamplesIterable(\n            self._ex_iterable,\n            n,\n            split_when_sharding=self._distributed is None,\n        )\n        return IterableDataset(\n            ex_iterable=ex_iterable,\n            info=self._info.copy(),\n            split=self._split,\n            formatting=self._formatting,\n            distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def repeat(self, num_times: Optional[int]) -> \"IterableDataset\":\n        \"\"\"\n        Create a new [`IterableDataset`] that repeats the underlying dataset `num_times` times.\n\n        N.B. 
The effect of calling shuffle after repeat depends significantly on buffer size.\n        With buffer_size 1, duplicate data is never seen in the same iteration, even after shuffling:\n        ds.repeat(n).shuffle(seed=42, buffer_size=1) is equivalent to ds.shuffle(seed=42, buffer_size=1).repeat(n),\n        and only shuffles shard orders within each iteration.\n        With buffer size >= (num samples in the dataset * num_times), we get full shuffling of the repeated data, i.e. we can observe duplicates in\n        the same iteration.\n\n        Args:\n            num_times (`int` or `None`):\n                Number of times to repeat the dataset. If `None`, the dataset will be repeated indefinitely.\n\n        Example:\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\", streaming=True)\n        >>> ds = ds.take(3).repeat(2)\n        >>> list(ds)\n        [{'label': 1,\n         'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},\n         {'label': 1,\n         'text': 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . tolkien\\'s middle-earth .'},\n         {'label': 1, 'text': 'effective but too-tepid biopic'},\n         {'label': 1,\n         'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},\n         {'label': 1,\n         'text': 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . 
r . tolkien\\'s middle-earth .'},\n         {'label': 1, 'text': 'effective but too-tepid biopic'}]\n        ```\n        \"\"\"\n        return IterableDataset(\n            ex_iterable=RepeatExamplesIterable(self._ex_iterable, num_times=num_times),\n            info=self._info,\n            split=self._split,\n            formatting=self._formatting,\n            distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def take(self, n: int) -> \"IterableDataset\":\n        \"\"\"\n        Create a new [`IterableDataset`] with only the first `n` elements.\n\n        Args:\n            n (`int`):\n                Number of elements to take.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\", streaming=True)\n        >>> small_ds = ds.take(2)\n        >>> list(small_ds)\n        [{'label': 1,\n         'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'},\n         {'label': 1,\n         'text': 'the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\\'s expanded vision of j . r . r . 
tolkien\\'s middle-earth .'}]\n        ```\n        \"\"\"\n        ex_iterable = TakeExamplesIterable(\n            self._ex_iterable,\n            n,\n            split_when_sharding=self._distributed is None,\n        )\n        return IterableDataset(\n            ex_iterable=ex_iterable,\n            info=self._info.copy(),\n            split=self._split,\n            formatting=self._formatting,\n            distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def shard(\n        self,\n        num_shards: int,\n        index: int,\n        contiguous: bool = True,\n    ) -> \"IterableDataset\":\n        \"\"\"Return the `index`-th shard from dataset split into `num_shards` pieces.\n\n        This shards deterministically. `dataset.shard(n, i)` splits the dataset into contiguous chunks,\n        so it can be easily concatenated back together after processing. If `dataset.num_shards % n == l`, then the\n        first `l` datasets each have `(dataset.num_shards // n) + 1` shards, and the remaining datasets have `(dataset.num_shards // n)` shards.\n        `datasets.concatenate_datasets([dset.shard(n, i) for i in range(n)])` returns a dataset with the same order as the original.\n        In particular, `dataset.shard(dataset.num_shards, i)` returns a dataset with 1 shard.\n\n        Note: n should be less than or equal to the number of shards in the dataset `dataset.num_shards`.\n\n        On the other hand, `dataset.shard(n, i, contiguous=False)` contains all the shards of the dataset whose index mod `n = i`.\n\n        Be sure to shard before using any randomizing operator (such as `shuffle`).\n        It is best if the shard operator is used early in the dataset pipeline.\n\n        Args:\n            num_shards (`int`):\n                How many shards to split the dataset into.\n            index (`int`):\n                Which shard to select and return.\n            contiguous (`bool`, 
defaults to `True`):\n                Whether to select contiguous blocks of indices for shards.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"fancyzhx/amazon_polarity\", split=\"train\", streaming=True)\n        >>> ds\n        IterableDataset({\n            features: ['label', 'title', 'content'],\n            num_shards: 4\n        })\n        >>> ds.shard(num_shards=2, index=0)\n        IterableDataset({\n            features: ['label', 'title', 'content'],\n            num_shards: 2\n        })\n        ```\n        \"\"\"\n        ex_iterable = self._ex_iterable.shard_data_sources(num_shards=num_shards, index=index, contiguous=contiguous)\n        return IterableDataset(\n            ex_iterable=ex_iterable,\n            info=self._info.copy(),\n            split=self._split,\n            formatting=self._formatting,\n            distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def reshard(self) -> \"IterableDataset\":\n        \"\"\"Reshard the dataset if possible, i.e. 
split the current shards further into more shards.\n        This increases the number of shards and the resulting dataset has num_shards >= previous_num_shards.\n        Equality may happen if no shard can be split further.\n\n        The resharding mechanism depends on the dataset file format:\n\n        * Parquet: shard per row group instead of per file\n        * Other: not implemented yet (contributions are welcome !)\n\n        Be sure to reshard/shard before using any randomizing operator (such as `shuffle`).\n        It is best if the shard operator is used early in the dataset pipeline.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"fancyzhx/amazon_polarity\", split=\"train\", streaming=True)\n        >>> ds\n        IterableDataset({\n            features: ['label', 'title', 'content'],\n            num_shards: 4\n        })\n        >>> ds.reshard()\n        IterableDataset({\n            features: ['label', 'title', 'content'],\n            num_shards: 3600\n        })\n        ```\n        \"\"\"\n        ex_iterable = self._ex_iterable.reshard_data_sources()\n        return IterableDataset(\n            ex_iterable=ex_iterable,\n            info=self._info.copy(),\n            split=self._split,\n            formatting=self._formatting,\n            distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def add_column(self, name: str, column: Union[list, np.array]) -> \"IterableDataset\":\n        \"\"\"Add column to Dataset.\n\n        Args:\n            name (str): Column name.\n            column (list or np.array): Column data to be added.\n\n        Returns:\n            `IterableDataset`\n        \"\"\"\n        return self.map(partial(add_column_fn, name=name, column=column), with_indices=True)\n\n    def rename_column(self, original_column_name: str, new_column_name: str) -> \"IterableDataset\":\n        
\"\"\"\n        Rename a column in the dataset, and move the features associated to the original column under the new column\n        name.\n\n        Args:\n            original_column_name (`str`):\n                Name of the column to rename.\n            new_column_name (`str`):\n                New name for the column.\n\n        Returns:\n            `IterableDataset`: A copy of the dataset with a renamed column.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\", streaming=True)\n        >>> next(iter(ds))\n        {'label': 1,\n         'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}\n        >>> ds = ds.rename_column(\"text\", \"movie_review\")\n        >>> next(iter(ds))\n        {'label': 1,\n         'movie_review': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}\n        ```\n        \"\"\"\n        return self.rename_columns({original_column_name: new_column_name})\n\n    def rename_columns(self, column_mapping: dict[str, str]) -> \"IterableDataset\":\n        \"\"\"\n        Rename several columns in the dataset, and move the features associated to the original columns under\n        the new column names.\n\n        Args:\n            column_mapping (`Dict[str, str]`): A mapping of columns to rename to their new names\n\n        Returns:\n            `IterableDataset`: A copy of the dataset with renamed columns\n        \"\"\"\n\n        original_features = self._info.features.copy() if self._info.features else None\n        ds_iterable = self.map(\n            partial(_rename_columns_fn, column_mapping=column_mapping), 
remove_columns=list(column_mapping)\n        )\n        if original_features is not None:\n            ds_iterable._info.features = Features(\n                {\n                    column_mapping[col] if col in column_mapping.keys() else col: feature\n                    for col, feature in original_features.items()\n                }\n            )\n        return ds_iterable\n\n    def remove_columns(self, column_names: Union[str, list[str]]) -> \"IterableDataset\":\n        \"\"\"\n        Remove one or several column(s) in the dataset and the features associated to them.\n        The removal is done on-the-fly on the examples when iterating over the dataset.\n\n\n        Args:\n            column_names (`Union[str, List[str]]`):\n                Name of the column(s) to remove.\n\n        Returns:\n            `IterableDataset`: A copy of the dataset object without the columns to remove.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\", streaming=True)\n        >>> next(iter(ds))\n        {'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}\n        >>> ds = ds.remove_columns(\"label\")\n        >>> next(iter(ds))\n        {'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}\n        ```\n        \"\"\"\n        original_features = self._info.features.copy() if self._info.features else None\n        ds_iterable = self.map(remove_columns=column_names)\n        if original_features is not None:\n            ds_iterable._info.features = original_features.copy()\n            for col, _ in original_features.items():\n                if col in 
column_names:\n                    del ds_iterable._info.features[col]\n\n        return ds_iterable\n\n    def select_columns(self, column_names: Union[str, list[str]]) -> \"IterableDataset\":\n        \"\"\"Select one or several column(s) in the dataset and the features\n        associated to them. The selection is done on-the-fly on the examples\n        when iterating over the dataset.\n\n\n        Args:\n            column_names (`Union[str, List[str]]`):\n                Name of the column(s) to select.\n\n        Returns:\n            `IterableDataset`: A copy of the dataset object with selected columns.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\", streaming=True)\n        >>> next(iter(ds))\n        {'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}\n        >>> ds = ds.select_columns(\"text\")\n        >>> next(iter(ds))\n        {'text': 'the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}\n        ```\n        \"\"\"\n        if isinstance(column_names, str):\n            column_names = [column_names]\n\n        if self._info:\n            info = copy.deepcopy(self._info)\n            if self._info.features is not None:\n                missing_columns = set(column_names) - set(self._info.features.keys())\n                if missing_columns:\n                    raise ValueError(\n                        f\"Column name {list(missing_columns)} not in the \"\n                        \"dataset. 
Columns in the dataset: \"\n                        f\"{list(self._info.features.keys())}.\"\n                    )\n                info.features = Features({c: info.features[c] for c in column_names})\n\n        ex_iterable = SelectColumnsIterable(self._ex_iterable, column_names)\n        return IterableDataset(\n            ex_iterable=ex_iterable,\n            info=info,\n            split=self._split,\n            formatting=self._formatting,\n            distributed=self._distributed,\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def cast_column(self, column: str, feature: FeatureType) -> \"IterableDataset\":\n        \"\"\"Cast column to feature for decoding.\n\n        Args:\n            column (`str`):\n                Column name.\n            feature (`Feature`):\n                Target feature.\n\n        Returns:\n            `IterableDataset`\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset, Audio\n        >>> ds = load_dataset(\"PolyAI/minds14\", name=\"en-US\", split=\"train\", streaming=True)\n        >>> ds.features\n        {'audio': Audio(sampling_rate=8000, mono=True, decode=True, id=None),\n         'english_transcription': Value('string'),\n         'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan',  'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill']),\n         'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR',  'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN']),\n         'path': Value('string'),\n         'transcription': Value('string')}\n        >>> ds = ds.cast_column(\"audio\", Audio(sampling_rate=16000))\n        >>> ds.features\n        {'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),\n         'english_transcription': 
Value('string'),\n         'intent_class': ClassLabel(num_classes=14, names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan',  'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill']),\n         'lang_id': ClassLabel(num_classes=14, names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR',  'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN']),\n         'path': Value('string'),\n         'transcription': Value('string')}\n        ```\n        \"\"\"\n        feature = _fix_for_backward_compatible_features(feature)\n        info = self._info.copy()\n        info.features[column] = feature\n        return IterableDataset(\n            ex_iterable=self._ex_iterable,\n            info=info,\n            split=self._split,\n            formatting=self._formatting,\n            distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def cast(\n        self,\n        features: Features,\n    ) -> \"IterableDataset\":\n        \"\"\"\n        Cast the dataset to a new set of features.\n\n        Args:\n            features ([`Features`]):\n                New features to cast the dataset to.\n                The name of the fields in the features must match the current column names.\n                The type of the data must also be convertible from one type to the other.\n                For non-trivial conversion, e.g. 
`string` <-> `ClassLabel` you should use [`~Dataset.map`] to update the Dataset.\n\n        Returns:\n            `IterableDataset`: A copy of the dataset with casted features.\n\n        Example:\n\n        ```py\n        >>> from datasets import load_dataset, ClassLabel, Value\n        >>> ds = load_dataset(\"cornell-movie-review-data/rotten_tomatoes\", split=\"train\", streaming=True)\n        >>> ds.features\n        {'label': ClassLabel(names=['neg', 'pos']),\n         'text': Value('string')}\n        >>> new_features = ds.features.copy()\n        >>> new_features[\"label\"] = ClassLabel(names=[\"bad\", \"good\"])\n        >>> new_features[\"text\"] = Value(\"large_string\")\n        >>> ds = ds.cast(new_features)\n        >>> ds.features\n        {'label': ClassLabel(names=['bad', 'good']),\n         'text': Value('large_string')}\n        ```\n        \"\"\"\n        features = _fix_for_backward_compatible_features(features)\n        info = self._info.copy()\n        info.features = features\n        return IterableDataset(\n            ex_iterable=self._ex_iterable,\n            info=info,\n            split=self._split,\n            formatting=self._formatting,\n            distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def decode(self, enable: bool = True, num_threads: int = 0) -> \"IterableDataset\":\n        \"\"\"\n        Enable or disable the dataset features decoding for audio, image, video.\n\n        When enabled (default), media types are decoded:\n\n        * audio -> dict of \"array\" and \"sampling_rate\" and \"path\"\n        * image -> PIL.Image\n        * video -> torchvision.io.VideoReader\n\n        You can enable multithreading using `num_threads`. This is especially useful to speed up remote\n        data streaming. 
However it can be slower than `num_threads=0` for local data on fast disks.\n\n        Disabling decoding is useful if you want to iterate on the paths or bytes of the media files\n        without actually decoding their content. To disable decoding you can use `.decode(False)`, which\n        is equivalent to calling `.cast()` or `.cast_column()` with all the Audio, Image and Video types\n        set to `decode=False`.\n\n        Args:\n            enable (`bool`, defaults to `True`):\n                Enable or disable features decoding.\n            num_threads (`int`, defaults to `0`):\n                Enable multithreading for features decoding.\n\n        Returns:\n            `IterableDataset`: A copy of the dataset with casted features.\n\n        Examples:\n\n        Disable decoding:\n\n        ```py\n        >>> from datasets import load_dataset\n        >>> ds = load_dataset(\"sshh12/planet-textures\", split=\"train\", streaming=True)\n        >>> next(iter(ds))\n        {'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=2048x1024>,\n        'text': 'A distant celestial object with an icy crust, displaying a light blue shade, covered with round pits and rugged terrains.'}\n        >>> ds = ds.decode(False)\n        >>> ds.features\n        {'image': Image(mode=None, decode=False, id=None),\n        'text': Value('string')}\n        >>> next(iter(ds))\n        {\n          'image': {\n            'path': 'hf://datasets/sshh12/planet-textures@69dc4cef7a5c4b2cfe387727ec8ea73d4bff7302/train/textures/0000.png',\n            'bytes': None\n          },\n          'text': 'A distant celestial object with an icy crust, displaying a light blue shade, covered with round pits and rugged terrains.'\n        }\n        ```\n\n        Speed up streaming with multithreading:\n\n        ```py\n        >>> import os\n        >>> from datasets import load_dataset\n        >>> from tqdm import tqdm\n        >>> ds = load_dataset(\"sshh12/planet-textures\", 
split=\"train\", streaming=True)\n        >>> num_threads = min(32, (os.cpu_count() or 1) + 4)\n        >>> ds = ds.decode(num_threads=num_threads)\n        >>> for _ in tqdm(ds):  # 20 times faster !\n        ...     ...\n        ```\n        \"\"\"\n        if not self.features:\n            raise ValueError(\n                \"Features decoding is only available for datasets with known features, but features are Unknown. \"\n                \"Please set the datasets features with `ds = ds.cast(features)`.\"\n            )\n        ds = self\n\n        def set_decoding(decode: bool, feature):\n            if hasattr(feature, \"decode\"):\n                feature.decode = decode\n\n        if enable and num_threads > 0:\n            disabled_decoding_features = self.features.copy()\n            enabled_decoding_features = self.features.copy()\n\n            _visit(disabled_decoding_features, partial(set_decoding, False))\n            _visit(enabled_decoding_features, partial(set_decoding, True))\n            ds = ds.cast(disabled_decoding_features)\n            pool = multiprocessing.pool.ThreadPool(num_threads)\n            func = partial(_apply_async, pool, enabled_decoding_features.decode_example)\n            ds = ds.map(func, features=enabled_decoding_features)\n            assert isinstance(ds._ex_iterable, MappedExamplesIterable)\n            ds._ex_iterable.max_num_running_async_map_functions_in_parallel = 2 * num_threads\n        else:\n            features = ds.features.copy()\n            _visit(features, partial(set_decoding, enable))\n            ds = ds.cast(features)\n        return ds\n\n    def _step(self, step: int, offset: int) -> \"IterableDataset\":\n        ex_iterable = StepExamplesIterable(self._ex_iterable, step=step, offset=offset)\n        return IterableDataset(\n            ex_iterable=ex_iterable,\n            info=self._info.copy(),\n            split=self._split,\n            formatting=self._formatting,\n            
distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def _resolve_features(self):\n        if self.features is not None:\n            return self\n        elif self._ex_iterable.is_typed:\n            features = self._ex_iterable.features\n        else:\n            features = _infer_features_from_batch(self.with_format(None)._head())\n        info = self.info.copy()\n        info.features = features\n        return IterableDataset(\n            ex_iterable=self._ex_iterable,\n            info=info,\n            split=self._split,\n            formatting=self._formatting,\n            distributed=copy.deepcopy(self._distributed),\n            token_per_repo_id=self._token_per_repo_id,\n        )\n\n    def batch(self, batch_size: int, drop_last_batch: bool = False) -> \"IterableDataset\":\n        \"\"\"\n        Group samples from the dataset into batches.\n\n        Args:\n            batch_size (`int`): The number of samples in each batch.\n            drop_last_batch (`bool`, defaults to `False`): Whether to drop the last incomplete batch.\n\n        Example:\n        ```py\n        >>> ds = load_dataset(\"some_dataset\", streaming=True)\n        >>> batched_ds = ds.batch(batch_size=32)\n        ```\n        \"\"\"\n\n        if self.features:\n            features = Features({col: List(feature) for col, feature in self.features.items()})\n        else:\n            features = None\n        return self.map(\n            _batch_fn, batched=True, batch_size=batch_size, drop_last_batch=drop_last_batch, features=features\n        )\n\n    def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> Union[dict, Iterator[dict]]:\n        \"\"\"Returns the dataset as a Python dict. 
Can also return a generator for large datasets.\n\n        Args:\n            batch_size (`int`, *optional*): The size (number of rows) of the batches if `batched` is `True`.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n\n        Returns:\n            `dict` or `Iterator[dict]`\n\n        Example:\n\n        ```py\n        >>> ds.to_dict()\n        ```\n        \"\"\"\n        if batched:\n            for table in self.with_format(\"arrow\").iter(batch_size=batch_size):\n                yield Dataset(table, fingerprint=\"unset\").to_dict()\n        else:\n            table = pa.concat_tables(list(self.with_format(\"arrow\").iter(batch_size=1000)))\n            return Dataset(table, fingerprint=\"unset\").to_dict()\n\n    def to_list(self) -> list:\n        \"\"\"Returns the dataset as a Python list.\n\n        Returns:\n            `list`\n\n        Example:\n\n        ```py\n        >>> ds.to_list()\n        ```\n        \"\"\"\n        table = pa.concat_tables(list(self.with_format(\"arrow\").iter(batch_size=1000)))\n        return Dataset(table, fingerprint=\"unset\").to_list()\n\n    def to_pandas(\n        self, batch_size: Optional[int] = None, batched: bool = False\n    ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:\n        \"\"\"Returns the dataset as a `pandas.DataFrame`. Can also return a generator for large datasets.\n\n        Args:\n            batch_size (`int`, *optional*):\n                The size (number of rows) of the batches if `batched` is `True`.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n            batched (`bool`):\n                Set to `True` to return a generator that yields the dataset as batches\n                of `batch_size` rows. 
Defaults to `False` (returns the whole dataset at once).\n\n        Returns:\n            `pandas.DataFrame` or `Iterator[pandas.DataFrame]`\n\n        Example:\n\n        ```py\n        >>> ds.to_pandas()\n        ```\n        \"\"\"\n        info = DatasetInfo(features=self.features.copy()) if self.features is not None else None\n\n        def maybe_cast_to_declared_features(table: pa.Table):\n            if self.features is not None and table.schema != self.features.arrow_schema:\n                return cast_table_to_features(table, self.features)\n            return table\n\n        if batched:\n            return (\n                Dataset(maybe_cast_to_declared_features(table), info=info, fingerprint=\"unset\").to_pandas()\n                for table in self.with_format(\"arrow\").iter(batch_size=batch_size)\n            )\n        else:\n            table = pa.concat_tables(\n                [maybe_cast_to_declared_features(table) for table in self.with_format(\"arrow\").iter(batch_size=1000)]\n            )\n            return Dataset(table, info=info, fingerprint=\"unset\").to_pandas()\n\n    def to_polars(\n        self,\n        batch_size: Optional[int] = None,\n        batched: bool = False,\n        schema_overrides: Optional[dict] = None,\n        rechunk: bool = True,\n    ) -> Union[\"pl.DataFrame\", Iterator[\"pl.DataFrame\"]]:\n        \"\"\"Returns the dataset as a `polars.DataFrame`. Can also return a generator for large datasets.\n\n        Args:\n            batch_size (`int`, *optional*):\n                The size (number of rows) of the batches if `batched` is `True`.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n            batched (`bool`):\n                Set to `True` to return a generator that yields the dataset as batches\n                of `batch_size` rows. 
Defaults to `False` (returns the whole dataset at once).\n            schema_overrides (`dict`, *optional*):\n                Support type specification or override of one or more columns; note that\n                any dtypes inferred from the schema param will be overridden.\n            rechunk (`bool`):\n                Make sure that all data is in contiguous memory. Defaults to `True`.\n\n        Returns:\n            `polars.DataFrame` or `Iterator[polars.DataFrame]`\n\n        Example:\n\n        ```py\n        >>> ds.to_polars()\n        ```\n        \"\"\"\n        if batched:\n            for table in self.with_format(\"arrow\").iter(batch_size=batch_size):\n                yield Dataset(table, fingerprint=\"unset\").to_polars(schema_overrides=schema_overrides, rechunk=rechunk)\n        else:\n            table = pa.concat_tables(list(self.with_format(\"arrow\").iter(batch_size=1000)))\n            return Dataset(table, fingerprint=\"unset\").to_polars(schema_overrides=schema_overrides, rechunk=rechunk)\n\n    def to_csv(\n        self,\n        path_or_buf: Union[PathLike, BinaryIO],\n        batch_size: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n        **to_csv_kwargs,\n    ) -> int:\n        \"\"\"Exports the dataset to csv.\n\n        This iterates on the dataset and loads it completely in memory before writing it.\n\n        Args:\n            path_or_buf (`PathLike` or `FileOrBuffer`):\n                Either a path to a file (e.g. `file.csv`), a remote URI (e.g. 
`hf://datasets/username/my_dataset_name/data.csv`),\n                or a BinaryIO, where the dataset will be saved to in the specified format.\n            batch_size (`int`, *optional*):\n                Size of the batch to load in memory and write at once.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n            **to_csv_kwargs (additional keyword arguments):\n                Parameters to pass to pandas's [`pandas.DataFrame.to_csv`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html).\n                The parameter `index` defaults to `False` if not specified.\n                If you would like to write the index, pass `index=True` and also set a name for the index column by\n                passing `index_label`.\n\n        Returns:\n            `int`: The number of characters or bytes written.\n\n        Example:\n\n        ```py\n        >>> ds.to_csv(\"path/to/dataset/directory\")\n        ```\n        \"\"\"\n        table = pa.concat_tables(list(self.with_format(\"arrow\").iter(batch_size=1000)))\n        return Dataset(table, fingerprint=\"unset\").to_csv(\n            path_or_buf,\n            batch_size=batch_size,\n            storage_options=storage_options,\n            **to_csv_kwargs,\n        )\n\n    def to_json(\n        self,\n        path_or_buf: Union[PathLike, BinaryIO],\n        batch_size: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n        **to_json_kwargs,\n    ) -> int:\n        \"\"\"Export the dataset to JSON Lines or JSON.\n\n        This iterates on the dataset and loads it completely in memory before writing it.\n\n        The default output format is [JSON Lines](https://jsonlines.org/).\n        To export to [JSON](https://www.json.org), pass `lines=False` argument and the desired `orient`.\n\n        Args:\n          
  path_or_buf (`PathLike` or `FileOrBuffer`):\n                Either a path to a file (e.g. `file.json`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.json`),\n                or a BinaryIO, where the dataset will be saved to in the specified format.\n            batch_size (`int`, *optional*):\n                Size of the batch to load in memory and write at once.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n            **to_json_kwargs (additional keyword arguments):\n                Parameters to pass to pandas's [`pandas.DataFrame.to_json`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html).\n                Default arguments are `lines=True` and `orient=\"records\"`.\n                The parameter `index` defaults to `False` if `orient` is `\"split\"` or `\"table\"`.\n                If you would like to write the index, pass `index=True`.\n\n        Returns:\n            `int`: The number of characters or bytes written.\n\n        Example:\n\n        ```py\n        >>> ds.to_json(\"path/to/dataset/directory/filename.jsonl\")\n        ```\n\n        ```py\n        >>> num_shards = dataset.num_shards\n        >>> for index in range(num_shards):\n        ...     shard = dataset.shard(index, num_shards)\n        ...     
shard.to_json(f\"path/of/my/dataset/data-{index:05d}.jsonl\")\n        ```\n\n        \"\"\"\n        table = pa.concat_tables(list(self.with_format(\"arrow\").iter(batch_size=1000)))\n        return Dataset(table, fingerprint=\"unset\").to_json(\n            path_or_buf,\n            batch_size=batch_size,\n            storage_options=storage_options,\n            **to_json_kwargs,\n        )\n\n    def to_sql(\n        self,\n        name: str,\n        con: Union[str, \"sqlalchemy.engine.Connection\", \"sqlalchemy.engine.Engine\", \"sqlite3.Connection\"],\n        batch_size: Optional[int] = None,\n        **sql_writer_kwargs,\n    ) -> int:\n        \"\"\"Exports the dataset to a SQL database.\n\n        Args:\n            name (`str`):\n                Name of SQL table.\n            con (`str` or `sqlite3.Connection` or `sqlalchemy.engine.Connection` or `sqlalchemy.engine.Engine`):\n                A [URI string](https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls) or a SQLite3/SQLAlchemy connection object used to write to a database.\n            batch_size (`int`, *optional*):\n                Size of the batch to load in memory and write at once.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n            **sql_writer_kwargs (additional keyword arguments):\n                Parameters to pass to pandas's [`pandas.DataFrame.to_sql`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html).\n                The parameter `index` defaults to `False` if not specified.\n                If you would like to write the index, pass `index=True` and also set a name for the index column by\n                passing `index_label`.\n\n\n        Returns:\n            `int`: The number of records written.\n\n        Example:\n\n        ```py\n        >>> # con provided as a connection URI string\n        >>> ds.to_sql(\"data\", \"sqlite:///my_own_db.sql\")\n        >>> # con provided as a sqlite3 connection 
object\n        >>> import sqlite3\n        >>> con = sqlite3.connect(\"my_own_db.sql\")\n        >>> with con:\n        ...     ds.to_sql(\"data\", con)\n        ```\n        \"\"\"\n        table = pa.concat_tables(list(self.with_format(\"arrow\").iter(batch_size=1000)))\n        return Dataset(table, fingerprint=\"unset\").to_sql(name, con, batch_size=batch_size, **sql_writer_kwargs)\n\n    def to_parquet(\n        self,\n        path_or_buf: Union[PathLike, BinaryIO],\n        batch_size: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n        **parquet_writer_kwargs,\n    ) -> int:\n        \"\"\"Exports the dataset to parquet\n\n        Args:\n            path_or_buf (`PathLike` or `FileOrBuffer`):\n                Either a path to a file (e.g. `file.parquet`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.parquet`),\n                or a BinaryIO, where the dataset will be saved to in the specified format.\n            batch_size (`int`, *optional*):\n                Size of the batch to load in memory and write at once.\n                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n\n                <Added version=\"2.19.0\"/>\n            **parquet_writer_kwargs (additional keyword arguments):\n                Parameters to pass to PyArrow's `pyarrow.parquet.ParquetWriter`.\n\n        Returns:\n            `int`: The number of characters or bytes written.\n\n        Example:\n\n        ```py\n        >>> ds.to_parquet(\"path/to/dataset/directory\")\n        ```\n\n        ```py\n        >>> num_shards = dataset.num_shards\n        >>> for index in range(num_shards):\n        ...     shard = dataset.shard(index, num_shards)\n        ...     
shard.to_parquet(f\"path/of/my/dataset/data-{index:05d}.parquet\")\n        ```\n\n        \"\"\"\n        from .arrow_writer import get_arrow_writer_batch_size_from_features\n\n        batch_size = get_arrow_writer_batch_size_from_features(self.features) or config.DEFAULT_MAX_BATCH_SIZE\n        table = pa.concat_tables(list(self.with_format(\"arrow\").iter(batch_size=batch_size)))\n        return Dataset(table, fingerprint=\"unset\").to_parquet(\n            path_or_buf, storage_options=storage_options, **parquet_writer_kwargs\n        )\n\n    def _push_parquet_shards_to_hub_single(\n        self,\n        job_id: int,\n        num_jobs: int,\n        resolved_output_path: HfFileSystemResolvedPath,\n        data_dir: str,\n        split: str,\n        token: Optional[str],\n        create_pr: Optional[bool],\n        # max_shard_size: Optional[Union[int, str]] = None,  # TODO(QL): add arg\n        num_shards: int,\n        embed_external_files: bool,\n    ) -> Iterable[tuple[list[CommitOperationAdd], list[str], int, int]]:\n        \"\"\"Pushes the dataset shards as Parquet files to the hub.\n\n        Returns:\n            additions (`List[CommitOperation]`): list of the `CommitOperationAdd` of the uploaded shards\n            new_parquet_paths (`List[str]`): list of the paths of the uploaded parquet files\n            features (`Features`): features of the uploaded dataset\n            dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset after uncompression\n            num_examples (`int`): number of examples of the uploaded shards\n        \"\"\"\n\n        div = num_shards // num_jobs\n        mod = num_shards % num_jobs\n        start = div * job_id + min(job_id, mod)\n        end = start + div + (1 if job_id < mod else 0)\n\n        index_shards = (\n            (start + i, self.shard(num_shards=end - start, index=i, contiguous=True)) for i in range(end - start)\n        )\n\n        api = HfApi(endpoint=config.HF_ENDPOINT, 
token=token)\n\n        dataset_nbytes = 0\n        num_examples = 0\n        additions: list[CommitOperationAdd] = []\n        new_parquet_paths: list[str] = []\n        features = self.features\n        for index, shard in index_shards:\n            if embed_external_files:\n                from .arrow_writer import get_arrow_writer_batch_size_from_features\n\n                shard = shard.with_format(\"arrow\")\n                shard = shard.map(\n                    partial(embed_table_storage, token_per_repo_id=self._token_per_repo_id),\n                    batched=True,\n                    batch_size=get_arrow_writer_batch_size_from_features(shard.features),\n                )\n            shard_path_in_repo = f\"{data_dir}/{split}-{index:05d}-of-{num_shards:05d}.parquet\"\n            tmp_file = tempfile.NamedTemporaryFile(suffix=\".parquet\", delete=False)\n            try:\n                shard.to_parquet(tmp_file)\n                tmp_file.close()\n                parquet_metadata = pq.read_metadata(tmp_file.name)\n                if features is None:\n                    features = Features.from_arrow_schema(parquet_metadata.schema.to_arrow_schema())\n                num_examples += parquet_metadata.num_rows\n                dataset_nbytes += sum(\n                    parquet_metadata.row_group(i).total_byte_size for i in range(parquet_metadata.num_row_groups)\n                )\n                new_parquet_paths.append(shard_path_in_repo)\n                if (\n                    isinstance(resolved_output_path, HfFileSystemResolvedRepositoryPath)\n                    and not resolved_output_path.path_in_repo\n                ):\n                    shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=tmp_file.name)\n                    api.preupload_lfs_files(\n                        repo_id=resolved_output_path.repo_id,\n                        additions=[shard_addition],\n                        
repo_type=resolved_output_path.repo_type,\n                        revision=resolved_output_path.revision,\n                        create_pr=create_pr,\n                    )\n                    additions.append(shard_addition)\n                elif isinstance(resolved_output_path, HfFileSystemResolvedBucketPath):\n                    if resolved_output_path.path:\n                        shard_path_in_repo = resolved_output_path.path + \"/\" + shard_path_in_repo\n                    api.batch_bucket_files(\n                        bucket_id=resolved_output_path.bucket_id, add=[(tmp_file.name, shard_path_in_repo)]\n                    )\n                else:\n                    raise NotImplementedError(f\"Bad HF path: {resolved_output_path}\")\n            except (Exception, KeyboardInterrupt):\n                tmp_file.close()\n                Path(tmp_file.name).unlink()\n                raise\n            tmp_file.close()\n            Path(tmp_file.name).unlink()\n            yield job_id, False, 1\n\n        yield job_id, True, (additions, new_parquet_paths, features, dataset_nbytes, num_examples)\n\n    def _push_parquet_shards_to_hub(\n        self,\n        resolved_output_path: HfFileSystemResolvedPath,\n        data_dir: str,\n        split: str,\n        token: Optional[str],\n        create_pr: Optional[bool],\n        max_shard_size: Optional[Union[int, str]],\n        num_shards: Optional[int],\n        embed_external_files: bool,\n        num_proc: Optional[int],\n    ) -> tuple[list[CommitOperationAdd], list[str], Features, SplitInfo, int]:\n        \"\"\"Pushes the dataset shards as Parquet files to the hub.\n\n        Returns:\n            additions (`List[CommitOperation]`): list of the `CommitOperationAdd` of the uploaded shards\n            new_parquet_paths (`List[str]`): list of paths of the new files uploaded to the output path,\n                relative to output path\n            features (`features`): features of the uploaded 
dataset\n            split_info (`SplitInfo`): info of the uploaded split, including the approximate size in bytes of\n                the uploaded dataset after uncompression\n            uploaded_size (`int`): number of uploaded bytes to the repository or bucket\n        \"\"\"\n\n        # Find decodable columns, because if there are any, we need to:\n        # embed the bytes from the files in the shards\n        decodable_columns = (\n            [k for k, v in self._info.features.items() if require_decoding(v, ignore_decode_attribute=True)]\n            if embed_external_files\n            else []\n        )\n        embed_external_files = embed_external_files and bool(decodable_columns)\n\n        if num_shards is None:\n            if max_shard_size is None:\n                num_shards = self.num_shards\n            else:\n                max_shard_size = convert_file_size_to_int(max_shard_size or config.MAX_SHARD_SIZE)\n                estimated_nbytes = 0\n                for pa_table in self.with_format(\"arrow\").iter(batch_size=config.DEFAULT_MAX_BATCH_SIZE):\n                    estimated_nbytes += pa_table.nbytes\n                num_shards = int(estimated_nbytes / max_shard_size) + 1\n                num_shards = max(num_shards, num_proc or 1)\n\n        additions: list[CommitOperationAdd] = []\n        new_parquet_paths: list[str] = []\n        uploaded_size = 0\n        dataset_nbytes = 0\n        num_examples = 0\n        features = self.features\n\n        num_jobs = num_proc or 1\n        if num_shards <= 1:\n            logger.warning(\n                f\"Setting num_proc from {num_jobs} back to 1 for the {split} split to disable multiprocessing as it only contains one shard.\"\n            )\n            num_proc = None\n            num_jobs = 1\n        elif num_shards < num_jobs:\n            logger.warning(\n                f\"Setting num_proc from {num_jobs} to {num_shards} for the {split} split as it only contains {num_shards} shards.\"\n      
      )\n            num_proc = num_shards\n            num_jobs = num_shards\n        kwargs_iterable = [\n            {\n                \"self\": self.shard(num_shards=num_jobs, index=job_id, contiguous=True),\n                \"job_id\": job_id,\n                \"num_jobs\": num_jobs,\n                \"resolved_output_path\": resolved_output_path,\n                \"data_dir\": data_dir,\n                \"split\": split,\n                \"token\": token,\n                \"create_pr\": create_pr,\n                \"num_shards\": num_shards,\n                \"embed_external_files\": embed_external_files,\n            }\n            for job_id in range(num_jobs)\n        ]\n        desc = \"Uploading the dataset shards\"\n        desc += f\" (num_proc={num_proc})\" if num_proc is not None and num_proc >= 1 else \"\"\n        pbar = hf_tqdm(\n            unit=\" shards\",\n            total=num_shards,\n            desc=desc,\n        )\n        with (\n            contextlib.nullcontext()\n            if num_proc is None or num_proc < 1\n            else mp.get_context(\"spawn\").Pool(num_proc) as pool\n        ):\n            update_stream = (\n                IterableDataset._push_parquet_shards_to_hub_single(**kwargs_iterable[0])\n                if pool is None\n                else iflatmap_unordered(\n                    pool,\n                    IterableDataset._push_parquet_shards_to_hub_single,\n                    kwargs_iterable=kwargs_iterable,\n                )\n            )\n            for job_id, done, content in update_stream:\n                if not done:\n                    pbar.update(content)\n                else:\n                    job_additions, job_new_parquet_paths, job_features, job_uploaded_size, job_num_examples = content\n                    additions += job_additions\n                    new_parquet_paths += job_new_parquet_paths\n                    uploaded_size += job_uploaded_size\n                    num_examples += 
job_num_examples\n                    features = job_features\n            if pool is not None:\n                pool.close()\n                pool.join()\n\n        uploaded_size = sum(addition.upload_info.size for addition in additions)\n        split_info = SplitInfo(split, num_bytes=dataset_nbytes, num_examples=num_examples)\n        return additions, new_parquet_paths, features, split_info, uploaded_size\n\n    def push_to_hub(\n        self,\n        repo_id: str,\n        config_name: str = \"default\",\n        set_default: Optional[bool] = None,\n        split: Optional[str] = None,\n        data_dir: Optional[str] = None,\n        commit_message: Optional[str] = None,\n        commit_description: Optional[str] = None,\n        private: Optional[bool] = None,\n        token: Optional[str] = None,\n        revision: Optional[str] = None,\n        create_pr: Optional[bool] = False,\n        max_shard_size: Optional[Union[int, str]] = None,\n        num_shards: Optional[int] = None,\n        embed_external_files: bool = True,\n        num_proc: Optional[int] = None,\n    ) -> CommitInfo:\n        \"\"\"Pushes the dataset to the hub as a Parquet dataset.\n        The dataset is pushed using HTTP requests and does not require git or git-lfs to be installed.\n\n        The resulting Parquet files are self-contained by default. If your dataset contains [`Image`], [`Audio`] or [`Video`]\n        data, the Parquet files will store the bytes of your images or audio files.\n        You can disable this by setting `embed_external_files` to `False`.\n\n        Args:\n            repo_id (`str`):\n                The ID of the repository to push to in the following format: `<user>/<dataset_name>` or\n                `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace\n                of the logged-in user.\n\n                It could also be a location inside a bucket, e.g. 
`buckets/<user_or_org>/<bucket_name>/...`\n            config_name (`str`, defaults to \"default\"):\n                The configuration name (or subset) of a dataset. Defaults to \"default\".\n            set_default (`bool`, *optional*):\n                Whether to set this configuration as the default one. Otherwise, the default configuration is the one\n                named \"default\".\n            split (`str`, *optional*):\n                The name of the split that will be given to that dataset. Defaults to `self.split`.\n            data_dir (`str`, *optional*):\n                Directory name that will contain the uploaded data files. Defaults to the `config_name` if different\n                from \"default\", else \"data\".\n            commit_message (`str`, *optional*):\n                Message to commit while pushing. Will default to `\"Upload dataset\"`.\n            commit_description (`str`, *optional*):\n                Description of the commit that will be created.\n                Additionally, description of the PR if a PR is created (`create_pr` is True).\n            private (`bool`, *optional*):\n                Whether to make the repo private. If `None` (default), the repo will be public unless the\n                organization's default is private. This value is ignored if the repo already exists.\n            token (`str`, *optional*):\n                An optional authentication token for the Hugging Face Hub. If no token is passed, will default\n                to the token saved locally when logging in with `huggingface-cli login`. Will raise an error\n                if no token is passed and the user is not logged-in.\n            revision (`str`, *optional*):\n                Branch to push the uploaded files to. 
Defaults to the `\"main\"` branch.\n            create_pr (`bool`, *optional*, defaults to `False`):\n                Whether to create a PR with the uploaded files or directly commit.\n            max_shard_size (`int` or `str`, *optional*):\n                Optional maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed\n                by a unit (like `\"5MB\"`). If not provided, shard count defaults to this dataset's `.num_shards`.\n            num_shards (`int`, *optional*):\n                Number of shards to write. If `max_shard_size` is provided and `num_shards` is not, then the number of shards is estimated\n                from `max_shard_size`.\n            embed_external_files (`bool`, defaults to `True`):\n                Whether to embed file bytes in the shards.\n                In particular, this will do the following before the push for the fields of type:\n\n                - [`Audio`] and [`Image`]: remove local path information and embed file content in the Parquet files.\n            num_proc (`int`, *optional*, defaults to `None`):\n                Number of processes when preparing and uploading the dataset.\n                This is helpful if the dataset is made of many samples and transformations.\n                It uses \"spawn\" context to work with hf_xet, the rust client for fast uploads to HF.\n                Multiprocessing is disabled by default.\n\n        Return:\n            huggingface_hub.CommitInfo\n\n        Example:\n\n        ```python\n        >>> dataset.push_to_hub(\"<organization>/<dataset_id>\")\n        >>> dataset_dict.push_to_hub(\"<organization>/<dataset_id>\", private=True)\n        >>> dataset.push_to_hub(\"<organization>/<dataset_id>\", max_shard_size=\"1GB\")\n        >>> dataset.push_to_hub(\"<organization>/<dataset_id>\", num_shards=1024)\n        ```\n\n        If your dataset has multiple splits (e.g. 
train/validation/test):\n\n        ```python\n        >>> train_dataset.push_to_hub(\"<organization>/<dataset_id>\", split=\"train\")\n        >>> val_dataset.push_to_hub(\"<organization>/<dataset_id>\", split=\"validation\")\n        >>> # later\n        >>> dataset = load_dataset(\"<organization>/<dataset_id>\")\n        >>> train_dataset = dataset[\"train\"]\n        >>> val_dataset = dataset[\"validation\"]\n        ```\n\n        If you want to add a new configuration (or subset) to a dataset (e.g. if the dataset has multiple tasks/versions/languages):\n\n        ```python\n        >>> english_dataset.push_to_hub(\"<organization>/<dataset_id>\", \"en\")\n        >>> french_dataset.push_to_hub(\"<organization>/<dataset_id>\", \"fr\")\n        >>> # later\n        >>> english_dataset = load_dataset(\"<organization>/<dataset_id>\", \"en\")\n        >>> french_dataset = load_dataset(\"<organization>/<dataset_id>\", \"fr\")\n        ```\n        \"\"\"\n        if num_proc is not None and num_proc > self.num_shards:\n            logger.warning(\n                f\"Too many num_proc: {num_proc} (max is dataset.num_shards={self.num_shards}). \"\n                f\"Stopping {num_proc - self.num_shards} processes.\"\n            )\n            logger.info(\n                f\"To parallelize data loading, we give each process some shards (or data sources) to process. \"\n                f\"Therefore it's unnecessary to have a number of processes greater than dataset.num_shards={self.num_shards}. \"\n                f\"To enable more parallelism, please split the dataset in more files than {self.num_shards}.\"\n            )\n            num_proc = self.num_shards\n\n        if config_name == \"data\":\n            raise ValueError(\"`config_name` cannot be 'data'. 
Please, choose another name for configuration.\")\n\n        if max_shard_size is not None and num_shards is not None:\n            raise ValueError(\n                \"Failed to push_to_hub: please specify either max_shard_size or num_shards, but not both.\"\n            )\n\n        if split is None:\n            split = str(self.split) if self.split is not None else \"train\"\n\n        if not re.match(_split_re, split):\n            raise ValueError(f\"Split name should match '{_split_re}' but got '{split}'.\")\n\n        if not data_dir:\n            data_dir = config_name if config_name != \"default\" else \"data\"  # for backward compatibility\n\n        api = HfApi(endpoint=config.HF_ENDPOINT, token=token)\n        if repo_id.startswith(\"buckets/\"):\n            if BucketNotFoundError is None:\n                raise ImportError(\"Pushing datasets to buckets requires huggingface_hub>=1.6.0\")\n            _, _namespace, _bucket_name, *_path_segments = repo_id.split(\"/\")\n            try:\n                bucket_id = api.bucket_info(_namespace + \"/\" + _bucket_name).id\n            except BucketNotFoundError:\n                bucket_url = api.create_bucket(_namespace + \"/\" + _bucket_name, private=private, exist_ok=True)\n                bucket_id = bucket_url.bucket_id\n            path = \"/\".join(s for s in _path_segments if s)\n            return _push_to_bucket(\n                self,\n                bucket_id=bucket_id,\n                path=path,\n                config_name=config_name,\n                set_default=set_default,\n                split=split,\n                data_dir=data_dir,\n                token=token,\n                max_shard_size=max_shard_size,\n                num_shards=num_shards,\n                embed_external_files=embed_external_files,\n                num_proc=num_proc,\n            )\n        else:\n            try:\n                repo_id = api.repo_info(repo_id, repo_type=\"dataset\").id\n            except 
RepositoryNotFoundError:\n                repo_url = api.create_repo(\n                    repo_id,\n                    repo_type=\"dataset\",\n                    private=private,\n                    exist_ok=True,\n                )\n                repo_id = repo_url.repo_id\n\n            if revision is not None and not revision.startswith(\"refs/pr/\"):\n                # We do not call create_branch for a PR reference: 400 Bad Request\n                api.create_branch(repo_id, branch=revision, repo_type=\"dataset\", exist_ok=True)\n            return _push_to_repo(\n                self,\n                repo_id=repo_id,\n                config_name=config_name,\n                set_default=set_default,\n                split=split,\n                data_dir=data_dir,\n                commit_message=commit_message,\n                commit_description=commit_description,\n                token=token,\n                revision=revision,\n                create_pr=create_pr,\n                max_shard_size=max_shard_size,\n                num_shards=num_shards,\n                embed_external_files=embed_external_files,\n                num_proc=num_proc,\n            )\n\n\ndef _concatenate_iterable_datasets(\n    dsets: list[IterableDataset],\n    info: Optional[DatasetInfo] = None,\n    split: Optional[NamedSplit] = None,\n    axis: int = 0,\n) -> IterableDataset:\n    \"\"\"\n    Converts a list of `IterableDataset` with the same schema into a single `IterableDataset`.\n    Missing data are filled with None values.\n\n    <Added version=\"2.4.0\"/>\n\n    Args:\n        dsets (`List[datasets.IterableDataset]`): List of Datasets to concatenate.\n        info (`DatasetInfo`, optional): Dataset information, like description, citation, etc.\n        split (`NamedSplit`, optional): Name of the dataset split.\n        axis (``{0, 1}``, default ``0``, meaning over rows):\n            Axis to concatenate over, where ``0`` means over rows (vertically) and ``1`` 
means over columns\n            (horizontally).\n\n            *New in version 1.6.0*\n\n    Example:\n\n    ```py\n    >>> ds3 = _concatenate_iterable_datasets([ds1, ds2])\n    ```\n    \"\"\"\n    dsets = [d._resolve_features() for d in dsets]\n\n    # Perform checks (and a potentional cast if axis=0)\n    if axis == 0:\n        _check_if_features_can_be_aligned([dset.features for dset in dsets])\n    else:\n        _check_column_names([col_name for dset in dsets for col_name in dset.features])\n\n    # Check format is consistent; if so, will set format for concatenated dataset\n    if all(dset._formatting is None for dset in dsets):\n        formatting = None\n    elif any(dset._formatting is None for dset in dsets):\n        formatting = None\n        logger.info(\n            \"Some of the datasets have disparate format or format not set. Resetting the format of the concatenated dataset.\"\n        )\n    else:\n        format_type_set = {dset._formatting.format_type for dset in dsets}\n        if len(format_type_set) == 1:\n            format_type = format_type_set.pop()\n            formatting = FormattingConfig(format_type=format_type)\n        else:\n            formatting = None\n            logger.info(\n                \"Some of the datasets have disparate format or format not set. 
Resetting the format of the concatenated dataset.\"\n            )\n\n    # TODO: improve this to account for a mix of ClassLabel and Value for example\n    # right now it would keep the type of the first dataset in the list\n    features = Features(\n        {k: v for features in _align_features([dset.features for dset in dsets]) for k, v in features.items()}\n    )\n\n    ex_iterables = [copy.deepcopy(d._ex_iterable) for d in dsets]\n    if axis == 0:\n        ex_iterable = VerticallyConcatenatedMultiSourcesExamplesIterable(ex_iterables)\n    else:\n        if all(ex_iterable.iter_arrow for ex_iterable in ex_iterables):\n            from .arrow_writer import get_arrow_writer_batch_size_from_features\n\n            batch_size = get_arrow_writer_batch_size_from_features(features) or config.DEFAULT_MAX_BATCH_SIZE\n            ex_iterables = [\n                RebatchedArrowExamplesIterable(ex_iterable, batch_size=batch_size) for ex_iterable in ex_iterables\n            ]\n        ex_iterable = HorizontallyConcatenatedMultiSourcesExamplesIterable(ex_iterables)\n    # Set new info - we update the features\n    # setting the features also ensures to fill missing columns with None\n    if info is None:\n        info = DatasetInfo.from_merge([d.info for d in dsets])\n    else:\n        info = info.copy()\n    info.features = features\n    # Get all the auth tokens per repository - in case the datasets come from different private repositories\n    token_per_repo_id = {repo_id: token for dataset in dsets for repo_id, token in dataset._token_per_repo_id.items()}\n    # Return new daset\n    return IterableDataset(\n        ex_iterable=ex_iterable,\n        info=info,\n        split=split,\n        token_per_repo_id=token_per_repo_id,\n        formatting=formatting,\n    )\n\n\ndef _interleave_iterable_datasets(\n    datasets: list[IterableDataset],\n    probabilities: Optional[list[float]] = None,\n    seed: Optional[int] = None,\n    info: Optional[DatasetInfo] = None,\n   
 split: Optional[NamedSplit] = None,\n    stopping_strategy: Literal[\n        \"first_exhausted\", \"all_exhausted\", \"all_exhausted_without_replacement\"\n    ] = \"first_exhausted\",\n) -> IterableDataset:\n    \"\"\"\n    Interleave several iterable datasets (sources) into a single iterable dataset.\n    The new iterable dataset alternates between the sources to yield examples.\n    If `probabilities = None` (default) the iterable dataset will cycle through the sources in order for each next example in the iteration.\n    If `probabilities` is not `None`, the iterable dataset will sample a random source according to the provided probabilities for each next example in the iteration.\n\n    <Added version=\"2.4.0\"/>\n\n    Args:\n        datasets (`List[IterableDataset]`): list of datasets to interleave\n        probabilities (`List[float]`, optional, default None): If specified, the new iterable dataset samples\n            examples from one source at a time according to these probabilities.\n        seed (`int`, optional, default None): The random seed used to choose a source for each example.\n        stopping_strategy (`str`, defaults to `first_exhausted`):\n            Two strategies are proposed right now.\n            By default, `first_exhausted` is an undersampling strategy, i.e. the dataset construction is stopped as soon as one dataset has run out of samples.\n            If the strategy is `all_exhausted`, we use an oversampling strategy, i.e. the dataset construction is stopped as soon as every sample of every dataset has been added at least once.\n            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:\n            - with no probabilities, the resulting dataset will have max_length_datasets*nb_dataset samples.\n            - with given probabilities, the resulting dataset will have more samples if some datasets have really low probability of visiting.\n\n    Output:\n        
`datasets.IterableDataset`\n    \"\"\"\n    datasets = [d._resolve_features() for d in datasets]\n\n    # Perform checks\n    _check_if_features_can_be_aligned([dset.features for dset in datasets])\n    for i, dset in enumerate(datasets):\n        if datasets[0]._distributed != dset._distributed:\n            raise ValueError(\n                f\"Datasets should be identically split_by_node before interleaving, but got {datasets[0]._distributed}!={dset._distributed} at index 0 and {i}\"\n            )\n\n    # TODO: improve this to account for a mix of ClassLabel and Value for example\n    # right now it would keep the type of the first dataset in the list\n    features = Features(\n        {k: v for features in _align_features([dset.features for dset in datasets]) for k, v in features.items()}\n    )\n\n    ex_iterables = [copy.deepcopy(d._ex_iterable) for d in datasets]\n    if all(ex_iterable.iter_arrow for ex_iterable in ex_iterables):\n        ex_iterables = [RebatchedArrowExamplesIterable(ex_iterable, batch_size=1) for ex_iterable in ex_iterables]\n    # Use cycling or random cycling of sources\n    if probabilities is None:\n        ex_iterable = CyclingMultiSourcesExamplesIterable(ex_iterables, stopping_strategy=stopping_strategy)\n    else:\n        generator = np.random.default_rng(seed)\n        ex_iterable = RandomlyCyclingMultiSourcesExamplesIterable(\n            ex_iterables,\n            generator=generator,\n            probabilities=probabilities,\n            stopping_strategy=stopping_strategy,\n        )\n    # Set new info - we update the features\n    # setting the features also ensures to fill missing columns with None\n    if info is None:\n        info = DatasetInfo.from_merge([d.info for d in datasets])\n    else:\n        info = info.copy()\n    info.features = features\n    # Get all the auth tokens per repository - in case the datasets come from different private repositories\n    token_per_repo_id = {\n        repo_id: token for 
dataset in datasets for repo_id, token in dataset._token_per_repo_id.items()\n    }\n    # Return new daset\n    return IterableDataset(\n        ex_iterable=ex_iterable,\n        info=info,\n        split=split,\n        token_per_repo_id=token_per_repo_id,\n        distributed=datasets[0]._distributed,\n    )\n\n\ndef _split_by_node_iterable_dataset(dataset: IterableDataset, rank: int, world_size: int) -> IterableDataset:\n    \"\"\"\n    Split an iterable dataset for the node at rank `rank` in a pool of nodes of size `world_size`.\n\n    If the dataset has a number of shards that is a factor of `world_size` (i.e. if `dataset.num_shards % world_size == 0`),\n    then the shards are evenly assigned across the nodes, which is the most optimized.\n    Otherwise, each node keeps 1 example out of `world_size`, skipping the other examples.\n\n    Args:\n        dataset ([`IterableDataset`]):\n            The iterable dataset to split by node.\n        rank (`int`):\n            Rank of the current node.\n        world_size (`int`):\n            Total number of nodes.\n\n    Returns:\n        [`IterableDataset`]: The iterable dataset to be used on the node at rank `rank`.\n    \"\"\"\n    if dataset._distributed:\n        rank = world_size * dataset._distributed.rank + rank\n        world_size = world_size * dataset._distributed.world_size\n    distributed = DistributedConfig(rank=rank, world_size=world_size)\n    return IterableDataset(\n        ex_iterable=dataset._ex_iterable,\n        info=dataset._info.copy(),\n        split=dataset._split,\n        formatting=dataset._formatting,\n        distributed=distributed,\n        token_per_repo_id=dataset._token_per_repo_id,\n    )\n\n\nasync def _apply_async(pool, func, x):\n    future = pool.apply_async(func, (x,))\n    while True:\n        if future.ready():\n            return future.get()\n        else:\n            await asyncio.sleep(0)\n\n\ndef _batch_fn(unbatched):\n    return {k: [v] for k, v in 
unbatched.items()}\n\n\ndef _generate_tables_from_polars(df: Union[\"pl.DataFrame\", \"pl.LazyFrame\"]) -> Iterator[tuple[\"BuilderKey\", pa.Table]]:\n    import polars as pl\n\n    from .builder import Key as BuilderKey\n\n    for slice_idx, df_slice in enumerate(df.collect_batches() if isinstance(df, pl.LazyFrame) else df.iter_slices()):\n        yield BuilderKey(0, slice_idx), df_slice.to_arrow()\n"
  },
  {
    "path": "src/datasets/load.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"Access datasets.\"\"\"\n\nimport glob\nimport importlib\nimport inspect\nimport json\nimport os\nimport posixpath\nfrom collections import Counter\nfrom collections.abc import Mapping, Sequence\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom typing import Any, Literal, Optional, Union, overload\n\nimport fsspec\nimport httpx\nimport requests\nimport yaml\nfrom fsspec.core import url_to_fs\nfrom huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem\nfrom huggingface_hub.utils import (\n    EntryNotFoundError,\n    GatedRepoError,\n    LocalEntryNotFoundError,\n    OfflineModeIsEnabled,\n    RepositoryNotFoundError,\n    RevisionNotFoundError,\n    get_session,\n)\nfrom packaging import version\n\nfrom . 
import __version__, config\nfrom .arrow_dataset import Dataset\nfrom .builder import BuilderConfig, DatasetBuilder\nfrom .data_files import (\n    DataFilesDict,\n    DataFilesList,\n    DataFilesPatternsDict,\n    EmptyDatasetError,\n    get_data_patterns,\n    sanitize_patterns,\n)\nfrom .dataset_dict import DatasetDict, IterableDatasetDict\nfrom .download.download_config import DownloadConfig\nfrom .download.download_manager import DownloadMode\nfrom .download.streaming_download_manager import StreamingDownloadManager, xbasename, xglob, xjoin\nfrom .exceptions import DataFilesNotFoundError, DatasetNotFoundError\nfrom .features import Features\nfrom .features.features import _fix_for_backward_compatible_features\nfrom .fingerprint import Hasher\nfrom .info import DatasetInfo, DatasetInfosDict\nfrom .iterable_dataset import IterableDataset\nfrom .naming import camelcase_to_snakecase, snakecase_to_camelcase\nfrom .packaged_modules import (\n    _ALL_ALLOWED_EXTENSIONS,\n    _EXTENSION_TO_MODULE,\n    _MODULE_TO_EXTENSIONS,\n    _MODULE_TO_METADATA_EXTENSIONS,\n    _MODULE_TO_METADATA_FILE_NAMES,\n    _PACKAGED_DATASETS_MODULES,\n)\nfrom .packaged_modules.folder_based_builder.folder_based_builder import FolderBasedBuilder\nfrom .splits import Split\nfrom .utils import _dataset_viewer\nfrom .utils.file_utils import (\n    _raise_if_offline_mode_is_enabled,\n    cached_path,\n    get_datasets_user_agent,\n    is_relative_path,\n    relative_to_absolute_path,\n)\nfrom .utils.hub import hf_dataset_url\nfrom .utils.info_utils import VerificationMode, is_small_dataset\nfrom .utils.logging import get_logger\nfrom .utils.metadata import MetadataConfigs\nfrom .utils.typing import PathLike\nfrom .utils.version import Version\n\n\nif config.HF_HUB_VERSION >= version.parse(\"1.6.0\"):\n    from huggingface_hub.errors import BucketNotFoundError\n\nelse:\n    BucketNotFoundError = None\n\n\nlogger = get_logger(__name__)\n\n\nclass _InitializeConfiguredDatasetBuilder:\n    
\"\"\"\n    From https://stackoverflow.com/questions/4647566/pickle-a-dynamically-parameterized-sub-class\n    See also ConfiguredDatasetBuilder.__reduce__\n    When called with the param value as the only argument, returns an\n    un-initialized instance of the parameterized class. Subsequent __setstate__\n    will be called by pickle.\n    \"\"\"\n\n    def __call__(self, builder_cls, metadata_configs, default_config_name, name):\n        # make a simple object which has no complex __init__ (this one will do)\n        obj = _InitializeConfiguredDatasetBuilder()\n        obj.__class__ = configure_builder_class(\n            builder_cls, metadata_configs, default_config_name=default_config_name, dataset_name=name\n        )\n        return obj\n\n\ndef configure_builder_class(\n    builder_cls: type[DatasetBuilder],\n    builder_configs: list[BuilderConfig],\n    default_config_name: Optional[str],\n    dataset_name: str,\n) -> type[DatasetBuilder]:\n    \"\"\"\n    Dynamically create a builder class with custom builder configs parsed from README.md file,\n    i.e. 
set BUILDER_CONFIGS class variable of a builder class to custom configs list.\n    \"\"\"\n\n    class ConfiguredDatasetBuilder(builder_cls):\n        BUILDER_CONFIGS = builder_configs\n        DEFAULT_CONFIG_NAME = default_config_name\n\n        __module__ = builder_cls.__module__  # so that the actual packaged builder can be imported\n\n        def __reduce__(self):  # to make dynamically created class pickable, see _InitializeParameterizedDatasetBuilder\n            parent_builder_cls = self.__class__.__mro__[1]\n            return (\n                _InitializeConfiguredDatasetBuilder(),\n                (\n                    parent_builder_cls,\n                    self.BUILDER_CONFIGS,\n                    self.DEFAULT_CONFIG_NAME,\n                    self.dataset_name,\n                ),\n                self.__dict__.copy(),\n            )\n\n    ConfiguredDatasetBuilder.__name__ = (\n        f\"{builder_cls.__name__.lower().capitalize()}{snakecase_to_camelcase(dataset_name)}\"\n    )\n    ConfiguredDatasetBuilder.__qualname__ = (\n        f\"{builder_cls.__name__.lower().capitalize()}{snakecase_to_camelcase(dataset_name)}\"\n    )\n\n    return ConfiguredDatasetBuilder\n\n\ndef import_main_class(module_path) -> Optional[type[DatasetBuilder]]:\n    \"\"\"Import a module at module_path and return its main class: a DatasetBuilder\"\"\"\n    module = importlib.import_module(module_path)\n    # Find the main class in our imported module\n    module_main_cls = None\n    for name, obj in module.__dict__.items():\n        if inspect.isclass(obj) and issubclass(obj, DatasetBuilder):\n            if inspect.isabstract(obj):\n                continue\n            module_main_cls = obj\n            obj_module = inspect.getmodule(obj)\n            if obj_module is not None and module == obj_module:\n                break\n\n    return module_main_cls\n\n\ndef get_dataset_builder_class(\n    dataset_module: \"DatasetModule\", dataset_name: Optional[str] = None\n) -> 
type[DatasetBuilder]:\n    builder_cls = import_main_class(dataset_module.module_path)\n    if dataset_module.builder_configs_parameters.builder_configs:\n        dataset_name = dataset_name or dataset_module.builder_kwargs.get(\"dataset_name\")\n        if dataset_name is None:\n            raise ValueError(\"dataset_name should be specified but got None\")\n        builder_cls = configure_builder_class(\n            builder_cls,\n            builder_configs=dataset_module.builder_configs_parameters.builder_configs,\n            default_config_name=dataset_module.builder_configs_parameters.default_config_name,\n            dataset_name=dataset_name,\n        )\n    return builder_cls\n\n\ndef increase_load_count(name: str):\n    \"\"\"Update the download count of a dataset.\"\"\"\n    if not config.HF_HUB_OFFLINE and config.HF_UPDATE_DOWNLOAD_COUNTS:\n        try:\n            get_session().head(\n                \"/\".join((config.S3_DATASETS_BUCKET_PREFIX, name, name + \".py\")),\n                headers={\"User-Agent\": get_datasets_user_agent()},\n                timeout=3,\n            )\n        except Exception:\n            pass\n\n\ndef infer_module_for_data_files_list(\n    data_files_list: DataFilesList, download_config: Optional[DownloadConfig] = None\n) -> tuple[Optional[str], dict]:\n    \"\"\"Infer module (and builder kwargs) from list of data files.\n\n    It picks the module based on the most common file extension.\n    In case of a draw \".parquet\" is the favorite, and then alphabetical order.\n\n    Args:\n        data_files_list (DataFilesList): List of data files.\n        download_config (bool or str, optional): Mainly use `token` or `storage_options` to support different platforms and auth types.\n\n    Returns:\n        tuple[str, dict[str, Any]]: Tuple with\n            - inferred module name\n            - dict of builder kwargs\n    \"\"\"\n    extensions_counter = Counter(\n        (\".\" + suffix.lower(), xbasename(filepath) in 
FolderBasedBuilder.METADATA_FILENAMES)\n        for filepath in data_files_list\n        for suffix in xbasename(filepath).split(\".\")[1:]\n    )\n    if extensions_counter:\n\n        def sort_key(ext_count: tuple[tuple[str, bool], int]) -> tuple[int, bool]:\n            \"\"\"Sort by count and set \".parquet\" as the favorite in case of a draw, and ignore metadata files\"\"\"\n            (ext, is_metadata), count = ext_count\n            return (not is_metadata, count, ext == \".parquet\", ext == \".jsonl\", ext == \".json\", ext == \".csv\", ext)\n\n        for (ext, _), _ in sorted(extensions_counter.items(), key=sort_key, reverse=True):\n            if ext in _EXTENSION_TO_MODULE:\n                return _EXTENSION_TO_MODULE[ext]\n            elif ext == \".zip\":\n                return infer_module_for_data_files_list_in_archives(data_files_list, download_config=download_config)\n    return None, {}\n\n\ndef infer_module_for_data_files_list_in_archives(\n    data_files_list: DataFilesList, download_config: Optional[DownloadConfig] = None\n) -> tuple[Optional[str], dict]:\n    \"\"\"Infer module (and builder kwargs) from list of archive data files.\n\n    Args:\n        data_files_list (DataFilesList): List of data files.\n        download_config (bool or str, optional): Mainly use `token` or `storage_options` to support different platforms and auth types.\n\n    Returns:\n        tuple[str, dict[str, Any]]: Tuple with\n            - inferred module name\n            - dict of builder kwargs\n    \"\"\"\n    archived_files = []\n    archive_files_counter = 0\n    for filepath in data_files_list:\n        if str(filepath).endswith(\".zip\"):\n            archive_files_counter += 1\n            if archive_files_counter > config.ARCHIVES_MAX_NUMBER_FOR_MODULE_INFERENCE:\n                break\n            extracted = xjoin(StreamingDownloadManager().extract(filepath), \"**\")\n            archived_files += [\n                f.split(\"::\")[0] for f in 
xglob(extracted, recursive=True, download_config=download_config)\n            ]\n    extensions_counter = Counter(\n        \".\" + suffix.lower() for filepath in archived_files for suffix in xbasename(filepath).split(\".\")[1:]\n    )\n    if extensions_counter:\n        most_common = extensions_counter.most_common(1)[0][0]\n        if most_common in _EXTENSION_TO_MODULE:\n            return _EXTENSION_TO_MODULE[most_common]\n    return None, {}\n\n\ndef infer_module_for_data_files(\n    data_files: DataFilesDict, path: Optional[str] = None, download_config: Optional[DownloadConfig] = None\n) -> tuple[Optional[str], dict[str, Any]]:\n    \"\"\"Infer module (and builder kwargs) from data files. Raise if module names for different splits don't match.\n\n    Args:\n        data_files ([`DataFilesDict`]): Dict of list of data files.\n        path (str, *optional*): Dataset name or path.\n        download_config ([`DownloadConfig`], *optional*):\n            Specific download configuration parameters to authenticate on the Hugging Face Hub for private remote files.\n\n    Returns:\n        tuple[str, dict[str, Any]]: Tuple with\n            - inferred module name\n            - builder kwargs\n    \"\"\"\n    split_modules = {\n        split: infer_module_for_data_files_list(data_files_list, download_config=download_config)\n        for split, data_files_list in data_files.items()\n    }\n    module_name, default_builder_kwargs = next(iter(split_modules.values()))\n    if any((module_name, default_builder_kwargs) != split_module for split_module in split_modules.values()):\n        raise ValueError(f\"Couldn't infer the same data file format for all splits. 
Got {split_modules}\")\n    if not module_name:\n        raise DataFilesNotFoundError(\"No (supported) data files found\" + (f\" in {path}\" if path else \"\"))\n    return module_name, default_builder_kwargs\n\n\ndef create_builder_configs_from_metadata_configs(\n    module_path: str,\n    metadata_configs: MetadataConfigs,\n    base_path: Optional[str] = None,\n    default_builder_kwargs: dict[str, Any] = None,\n    download_config: Optional[DownloadConfig] = None,\n) -> tuple[list[BuilderConfig], str]:\n    builder_cls = import_main_class(module_path)\n    builder_config_cls = builder_cls.BUILDER_CONFIG_CLASS\n    default_config_name = metadata_configs.get_default_config_name()\n    builder_configs = []\n    default_builder_kwargs = {} if default_builder_kwargs is None else default_builder_kwargs\n\n    base_path = base_path if base_path is not None else \"\"\n    for config_name, config_params in metadata_configs.items():\n        config_data_files = config_params.get(\"data_files\")\n        config_data_dir = config_params.get(\"data_dir\")\n        config_base_path = xjoin(base_path, config_data_dir) if config_data_dir else base_path\n        try:\n            config_patterns = (\n                sanitize_patterns(config_data_files)\n                if config_data_files is not None\n                else get_data_patterns(config_base_path, download_config=download_config)\n            )\n            config_data_files_dict = DataFilesPatternsDict.from_patterns(\n                config_patterns,\n                allowed_extensions=_ALL_ALLOWED_EXTENSIONS,\n            )\n        except EmptyDatasetError as e:\n            raise EmptyDatasetError(\n                f\"Dataset at '{base_path}' doesn't contain data files matching the patterns for config '{config_name}',\"\n                f\" check `data_files` and `data_fir` parameters in the `configs` YAML field in README.md. 
\"\n            ) from e\n        ignored_params = [\n            param for param in config_params if not hasattr(builder_config_cls, param) and param != \"default\"\n        ]\n        if ignored_params:\n            logger.warning(\n                f\"Some datasets params were ignored: {ignored_params}. \"\n                \"Make sure to use only valid params for the dataset builder and to have \"\n                \"a up-to-date version of the `datasets` library.\"\n            )\n        builder_configs.append(\n            builder_config_cls(\n                name=config_name,\n                data_files=config_data_files_dict,\n                data_dir=config_data_dir,\n                **{\n                    param: value\n                    for param, value in {**default_builder_kwargs, **config_params}.items()\n                    if hasattr(builder_config_cls, param) and param not in (\"default\", \"data_files\", \"data_dir\")\n                },\n            )\n        )\n    return builder_configs, default_config_name\n\n\n@dataclass\nclass BuilderConfigsParameters:\n    \"\"\"Dataclass containing objects related to creation of builder configurations from yaml's metadata content.\n\n    Attributes:\n        metadata_configs (`MetadataConfigs`, *optional*):\n            Configs parsed from yaml's metadata.\n        builder_configs (`list[BuilderConfig]`, *optional*):\n            List of BuilderConfig objects created from metadata_configs above.\n        default_config_name (`str`):\n            Name of default config taken from yaml's metadata.\n    \"\"\"\n\n    metadata_configs: Optional[MetadataConfigs] = None\n    builder_configs: Optional[list[BuilderConfig]] = None\n    default_config_name: Optional[str] = None\n\n\n@dataclass\nclass DatasetModule:\n    module_path: str\n    hash: str\n    builder_kwargs: dict\n    builder_configs_parameters: BuilderConfigsParameters = field(default_factory=BuilderConfigsParameters)\n    dataset_infos: 
Optional[DatasetInfosDict] = None\n\n\nclass _DatasetModuleFactory:\n    def get_module(self) -> DatasetModule:\n        raise NotImplementedError\n\n\nclass LocalDatasetModuleFactory(_DatasetModuleFactory):\n    \"\"\"Get the module of a dataset loaded from the user's data files. The dataset builder module to use is inferred\n    from the data files extensions.\"\"\"\n\n    def __init__(\n        self,\n        path: str,\n        data_dir: Optional[str] = None,\n        data_files: Optional[Union[str, list, dict]] = None,\n        download_mode: Optional[Union[DownloadMode, str]] = None,\n    ):\n        if data_dir and os.path.isabs(data_dir):\n            raise ValueError(f\"`data_dir` must be relative to a dataset directory's root: {path}\")\n\n        self.path = Path(path).as_posix()\n        self.name = Path(path).stem\n        self.data_files = data_files\n        self.data_dir = data_dir\n        self.download_mode = download_mode\n\n    def get_module(self) -> DatasetModule:\n        readme_path = os.path.join(self.path, config.REPOCARD_FILENAME)\n        standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME)\n        dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()\n        if os.path.exists(standalone_yaml_path):\n            with open(standalone_yaml_path, encoding=\"utf-8\") as f:\n                standalone_yaml_data = yaml.safe_load(f.read())\n                if standalone_yaml_data:\n                    _dataset_card_data_dict = dataset_card_data.to_dict()\n                    _dataset_card_data_dict.update(standalone_yaml_data)\n                    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)\n        metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)\n        dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)\n        # we need a set of data files to find which dataset builder to use\n        # because 
we need to infer module name by files extensions\n        base_path = Path(self.path, self.data_dir or \"\").expanduser().resolve().as_posix()\n        if self.data_files is not None:\n            patterns = sanitize_patterns(self.data_files)\n        elif metadata_configs and not self.data_dir and \"data_files\" in next(iter(metadata_configs.values())):\n            patterns = sanitize_patterns(next(iter(metadata_configs.values()))[\"data_files\"])\n        else:\n            patterns = get_data_patterns(base_path)\n        data_files = DataFilesDict.from_patterns(\n            patterns,\n            base_path=base_path,\n            allowed_extensions=_ALL_ALLOWED_EXTENSIONS,\n        )\n        module_name, default_builder_kwargs = infer_module_for_data_files(\n            data_files=data_files,\n            path=self.path,\n        )\n        data_files = data_files.filter(\n            extensions=_MODULE_TO_EXTENSIONS[module_name] + _MODULE_TO_METADATA_EXTENSIONS[module_name],\n            file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name],\n        )\n        module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]\n        if metadata_configs:\n            builder_configs, default_config_name = create_builder_configs_from_metadata_configs(\n                module_path,\n                metadata_configs,\n                base_path=base_path,\n                default_builder_kwargs=default_builder_kwargs,\n            )\n        else:\n            builder_configs: list[BuilderConfig] = [\n                import_main_class(module_path).BUILDER_CONFIG_CLASS(\n                    data_files=data_files,\n                    **default_builder_kwargs,\n                )\n            ]\n            default_config_name = None\n        builder_kwargs = {\n            \"base_path\": self.path,\n            \"dataset_name\": camelcase_to_snakecase(Path(self.path).name),\n        }\n        if self.data_dir:\n            builder_kwargs[\"data_files\"] = data_files\n   
     # this file is deprecated and was created automatically in old versions of push_to_hub\n        if os.path.isfile(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME)):\n            with open(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME), encoding=\"utf-8\") as f:\n                legacy_dataset_infos = DatasetInfosDict(\n                    {\n                        config_name: DatasetInfo.from_dict(dataset_info_dict)\n                        for config_name, dataset_info_dict in json.load(f).items()\n                    }\n                )\n                if len(legacy_dataset_infos) == 1:\n                    # old config e.g. named \"username--dataset_name\"\n                    legacy_config_name = next(iter(legacy_dataset_infos))\n                    legacy_dataset_infos[\"default\"] = legacy_dataset_infos.pop(legacy_config_name)\n            legacy_dataset_infos.update(dataset_infos)\n            dataset_infos = legacy_dataset_infos\n        if default_config_name is None and len(dataset_infos) == 1:\n            default_config_name = next(iter(dataset_infos))\n\n        hash = Hasher.hash({\"dataset_infos\": dataset_infos, \"builder_configs\": builder_configs})\n        return DatasetModule(\n            module_path,\n            hash,\n            builder_kwargs,\n            dataset_infos=dataset_infos,\n            builder_configs_parameters=BuilderConfigsParameters(\n                metadata_configs=metadata_configs,\n                builder_configs=builder_configs,\n                default_config_name=default_config_name,\n            ),\n        )\n\n\nclass PackagedDatasetModuleFactory(_DatasetModuleFactory):\n    \"\"\"Get the dataset builder module from the ones that are packaged with the library: csv, json, etc.\"\"\"\n\n    def __init__(\n        self,\n        name: str,\n        data_dir: Optional[str] = None,\n        data_files: Optional[Union[str, list, dict]] = None,\n        download_config: 
Optional[DownloadConfig] = None,\n        download_mode: Optional[Union[DownloadMode, str]] = None,\n    ):\n        self.name = name\n        self.data_files = data_files\n        self.data_dir = data_dir\n        self.download_config = download_config\n        self.download_mode = download_mode\n        increase_load_count(name)\n\n    def get_module(self) -> DatasetModule:\n        base_path = Path(self.data_dir or \"\").expanduser().resolve().as_posix()\n        patterns = (\n            sanitize_patterns(self.data_files)\n            if self.data_files is not None\n            else get_data_patterns(base_path, download_config=self.download_config)\n        )\n        data_files = DataFilesDict.from_patterns(\n            patterns,\n            download_config=self.download_config,\n            base_path=base_path,\n        )\n\n        module_path, hash = _PACKAGED_DATASETS_MODULES[self.name]\n\n        builder_kwargs = {\n            \"data_files\": data_files,\n            \"dataset_name\": self.name,\n        }\n\n        return DatasetModule(module_path, hash, builder_kwargs)\n\n\nclass HubDatasetModuleFactory(_DatasetModuleFactory):\n    \"\"\"\n    Get the module of a dataset loaded from data files of a dataset repository.\n    The dataset builder module to use is inferred from the data files extensions.\n    \"\"\"\n\n    def __init__(\n        self,\n        name: str,\n        commit_hash: str,\n        data_dir: Optional[str] = None,\n        data_files: Optional[Union[str, list, dict]] = None,\n        download_config: Optional[DownloadConfig] = None,\n        download_mode: Optional[Union[DownloadMode, str]] = None,\n        use_exported_dataset_infos: bool = False,\n    ):\n        self.name = name\n        self.commit_hash = commit_hash\n        self.data_files = data_files\n        self.data_dir = data_dir\n        self.download_config = download_config or DownloadConfig()\n        self.download_mode = download_mode\n        
self.use_exported_dataset_infos = use_exported_dataset_infos\n        increase_load_count(name)\n\n    def get_module(self) -> DatasetModule:\n        # Get the Dataset Card and fix the revision in case there are new commits in the meantime\n        api = HfApi(\n            endpoint=config.HF_ENDPOINT,\n            token=self.download_config.token,\n            library_name=\"datasets\",\n            library_version=__version__,\n            user_agent=get_datasets_user_agent(self.download_config.user_agent),\n        )\n        try:\n            dataset_readme_path = api.hf_hub_download(\n                repo_id=self.name,\n                filename=config.REPOCARD_FILENAME,\n                repo_type=\"dataset\",\n                revision=self.commit_hash,\n                proxies=self.download_config.proxies,\n            )\n            dataset_card_data = DatasetCard.load(dataset_readme_path).data\n        except EntryNotFoundError:\n            dataset_card_data = DatasetCardData()\n        download_config = self.download_config.copy()\n        if download_config.download_desc is None:\n            download_config.download_desc = \"Downloading standalone yaml\"\n        try:\n            standalone_yaml_path = cached_path(\n                hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=self.commit_hash),\n                download_config=download_config,\n            )\n            with open(standalone_yaml_path, encoding=\"utf-8\") as f:\n                standalone_yaml_data = yaml.safe_load(f.read())\n                if standalone_yaml_data:\n                    _dataset_card_data_dict = dataset_card_data.to_dict()\n                    _dataset_card_data_dict.update(standalone_yaml_data)\n                    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)\n        except FileNotFoundError:\n            pass\n        base_path = f\"hf://datasets/{self.name}@{self.commit_hash}/{self.data_dir or ''}\".rstrip(\"/\")\n        
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)\n        dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)\n        if config.USE_PARQUET_EXPORT and self.use_exported_dataset_infos:\n            try:\n                exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(\n                    dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token\n                )\n                exported_dataset_infos = DatasetInfosDict(\n                    {\n                        config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name])\n                        for config_name in exported_dataset_infos\n                    }\n                )\n            except _dataset_viewer.DatasetViewerError:\n                exported_dataset_infos = None\n        else:\n            exported_dataset_infos = None\n        if exported_dataset_infos:\n            exported_dataset_infos.update(dataset_infos)\n            dataset_infos = exported_dataset_infos\n        # we need a set of data files to find which dataset builder to use\n        # because we need to infer module name by files extensions\n        if self.data_files is not None:\n            patterns = sanitize_patterns(self.data_files)\n        elif metadata_configs and not self.data_dir and \"data_files\" in next(iter(metadata_configs.values())):\n            patterns = sanitize_patterns(next(iter(metadata_configs.values()))[\"data_files\"])\n        else:\n            patterns = get_data_patterns(base_path, download_config=self.download_config)\n        data_files = DataFilesDict.from_patterns(\n            patterns,\n            base_path=base_path,\n            allowed_extensions=_ALL_ALLOWED_EXTENSIONS,\n            download_config=self.download_config,\n        )\n        module_name, default_builder_kwargs = infer_module_for_data_files(\n            data_files=data_files,\n            path=self.name,\n            
download_config=self.download_config,\n        )\n        data_files = data_files.filter(\n            extensions=_MODULE_TO_EXTENSIONS[module_name] + _MODULE_TO_METADATA_EXTENSIONS[module_name],\n            file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name],\n        )\n        module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]\n        if metadata_configs:\n            builder_configs, default_config_name = create_builder_configs_from_metadata_configs(\n                module_path,\n                metadata_configs,\n                base_path=base_path,\n                default_builder_kwargs=default_builder_kwargs,\n                download_config=self.download_config,\n            )\n        else:\n            builder_configs: list[BuilderConfig] = [\n                import_main_class(module_path).BUILDER_CONFIG_CLASS(\n                    data_files=data_files,\n                    **default_builder_kwargs,\n                )\n            ]\n            default_config_name = None\n        builder_kwargs = {\n            \"base_path\": hf_dataset_url(self.name, \"\", revision=self.commit_hash).rstrip(\"/\"),\n            \"repo_id\": self.name,\n            \"dataset_name\": camelcase_to_snakecase(Path(self.name).name),\n        }\n        if self.data_dir:\n            builder_kwargs[\"data_files\"] = data_files\n        download_config = self.download_config.copy()\n        if download_config.download_desc is None:\n            download_config.download_desc = \"Downloading metadata\"\n        try:\n            # this file is deprecated and was created automatically in old versions of push_to_hub\n            dataset_infos_path = cached_path(\n                hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.commit_hash),\n                download_config=download_config,\n            )\n            with open(dataset_infos_path, encoding=\"utf-8\") as f:\n                legacy_dataset_infos = DatasetInfosDict(\n              
      {\n                        config_name: DatasetInfo.from_dict(dataset_info_dict)\n                        for config_name, dataset_info_dict in json.load(f).items()\n                    }\n                )\n                if len(legacy_dataset_infos) == 1:\n                    # old config e.g. named \"username--dataset_name\"\n                    legacy_config_name = next(iter(legacy_dataset_infos))\n                    legacy_dataset_infos[\"default\"] = legacy_dataset_infos.pop(legacy_config_name)\n            legacy_dataset_infos.update(dataset_infos)\n            dataset_infos = legacy_dataset_infos\n        except FileNotFoundError:\n            pass\n        if default_config_name is None and len(dataset_infos) == 1:\n            default_config_name = next(iter(dataset_infos))\n\n        return DatasetModule(\n            module_path,\n            self.commit_hash,\n            builder_kwargs,\n            dataset_infos=dataset_infos,\n            builder_configs_parameters=BuilderConfigsParameters(\n                metadata_configs=metadata_configs,\n                builder_configs=builder_configs,\n                default_config_name=default_config_name,\n            ),\n        )\n\n\nclass HubDatasetModuleFactoryWithParquetExport(_DatasetModuleFactory):\n    \"\"\"\n    Get the module of a dataset loaded from parquet files of a dataset repository parquet export.\n    \"\"\"\n\n    def __init__(\n        self,\n        name: str,\n        commit_hash: str,\n        download_config: Optional[DownloadConfig] = None,\n    ):\n        self.name = name\n        self.commit_hash = commit_hash\n        self.download_config = download_config or DownloadConfig()\n        increase_load_count(name)\n\n    def get_module(self) -> DatasetModule:\n        exported_parquet_files = _dataset_viewer.get_exported_parquet_files(\n            dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token\n        )\n        exported_dataset_infos = 
_dataset_viewer.get_exported_dataset_infos(\n            dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token\n        )\n        dataset_infos = DatasetInfosDict(\n            {\n                config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name])\n                for config_name in exported_dataset_infos\n            }\n        )\n        parquet_commit_hash = (\n            HfApi(\n                endpoint=config.HF_ENDPOINT,\n                token=self.download_config.token,\n                library_name=\"datasets\",\n                library_version=__version__,\n                user_agent=get_datasets_user_agent(self.download_config.user_agent),\n            )\n            .dataset_info(\n                self.name,\n                revision=\"refs/convert/parquet\",\n                token=self.download_config.token,\n                timeout=100.0,\n            )\n            .sha\n        )  # fix the revision in case there are new commits in the meantime\n        metadata_configs = MetadataConfigs._from_exported_parquet_files_and_dataset_infos(\n            parquet_commit_hash=parquet_commit_hash,\n            exported_parquet_files=exported_parquet_files,\n            dataset_infos=dataset_infos,\n        )\n        module_path, _ = _PACKAGED_DATASETS_MODULES[\"parquet\"]\n        builder_configs, default_config_name = create_builder_configs_from_metadata_configs(\n            module_path,\n            metadata_configs,\n            download_config=self.download_config,\n        )\n        builder_kwargs = {\n            \"repo_id\": self.name,\n            \"dataset_name\": camelcase_to_snakecase(Path(self.name).name),\n        }\n\n        return DatasetModule(\n            module_path,\n            self.commit_hash,\n            builder_kwargs,\n            dataset_infos=dataset_infos,\n            builder_configs_parameters=BuilderConfigsParameters(\n                metadata_configs=metadata_configs,\n     
           builder_configs=builder_configs,\n                default_config_name=default_config_name,\n            ),\n        )\n\n\nclass CachedDatasetModuleFactory(_DatasetModuleFactory):\n    \"\"\"\n    Get the module of a dataset that has been loaded once already and cached.\n    \"\"\"\n\n    def __init__(\n        self,\n        name: str,\n        cache_dir: Optional[str] = None,\n    ):\n        self.name = name\n        self.cache_dir = cache_dir\n        assert self.name.count(\"/\") <= 1\n\n    def get_module(self) -> DatasetModule:\n        cache_dir = os.path.expanduser(str(self.cache_dir or config.HF_DATASETS_CACHE))\n        namespace_and_dataset_name = self.name.split(\"/\")\n        namespace_and_dataset_name[-1] = camelcase_to_snakecase(namespace_and_dataset_name[-1])\n        cached_relative_path = \"___\".join(namespace_and_dataset_name)\n        cached_datasets_directory_path_root = os.path.join(cache_dir, cached_relative_path)\n        cached_directory_paths = [\n            cached_directory_path\n            for cached_directory_path in glob.glob(os.path.join(cached_datasets_directory_path_root, \"*\", \"*\", \"*\"))\n            if os.path.isdir(cached_directory_path)\n        ]\n        if cached_directory_paths:\n            builder_kwargs = {\n                \"repo_id\": self.name,\n                \"dataset_name\": self.name.split(\"/\")[-1],\n            }\n            warning_msg = f\"Using the latest cached version of the dataset since {self.name} couldn't be found on the Hugging Face Hub\"\n            if config.HF_HUB_OFFLINE:\n                warning_msg += \" (offline mode is enabled).\"\n            logger.warning(warning_msg)\n            return DatasetModule(\n                \"datasets.packaged_modules.cache.cache\",\n                \"auto\",\n                {**builder_kwargs, \"version\": \"auto\"},\n            )\n        raise FileNotFoundError(f\"Dataset {self.name} is not cached in {self.cache_dir}\")\n\n\nclass 
HubBucketDatasetModuleFactory(_DatasetModuleFactory):\n    \"\"\"\n    Get the module of a dataset loaded from data files of a Storage Bucket.\n    The dataset builder module to use is inferred from the data files extensions.\n    \"\"\"\n\n    def __init__(\n        self,\n        path: str,\n        data_dir: Optional[str] = None,\n        data_files: Optional[Union[str, list, dict]] = None,\n        download_config: Optional[DownloadConfig] = None,\n        download_mode: Optional[Union[DownloadMode, str]] = None,\n    ):\n        self.path = Path(path).as_posix()\n        self.name = Path(path).stem\n        self.data_files = data_files\n        self.data_dir = data_dir\n        self.download_config = download_config\n        self.download_mode = download_mode\n\n    def get_module(self) -> DatasetModule:\n        hffs = HfFileSystem(\n            endpoint=config.HF_ENDPOINT,\n            token=self.download_config.token,\n        )\n        readme_path = xjoin(self.path, config.REPOCARD_FILENAME)\n        standalone_yaml_path = xjoin(self.path, config.REPOYAML_FILENAME)\n        try:\n            dataset_card_data = DatasetCard(hffs.read_text(readme_path, newline=\"\", encoding=\"utf-8\"))\n        except FileNotFoundError:\n            dataset_card_data = DatasetCardData()\n        try:\n            standalone_yaml_data = yaml.safe_load(hffs.read_text(standalone_yaml_path, newline=\"\", encoding=\"utf-8\"))\n        except FileNotFoundError:\n            dataset_card_data = DatasetCardData()\n        if hffs.exists(standalone_yaml_path):\n            with hffs.open(standalone_yaml_path, \"r\", encoding=\"utf-8\") as f:\n                standalone_yaml_data = yaml.safe_load(f.read())\n                if standalone_yaml_data:\n                    _dataset_card_data_dict = dataset_card_data.to_dict()\n                    _dataset_card_data_dict.update(standalone_yaml_data)\n                    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)\n    
    metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)\n        dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)\n        # we need a set of data files to find which dataset builder to use\n        # because we need to infer module name by files extensions\n        base_path = \"hf://\" + Path(self.path, self.data_dir or \"\").as_posix()\n        if self.data_files is not None:\n            patterns = sanitize_patterns(self.data_files)\n        elif metadata_configs and not self.data_dir and \"data_files\" in next(iter(metadata_configs.values())):\n            patterns = sanitize_patterns(next(iter(metadata_configs.values()))[\"data_files\"])\n        else:\n            patterns = get_data_patterns(base_path, download_config=self.download_config)\n        data_files = DataFilesDict.from_patterns(\n            patterns,\n            base_path=base_path,\n            allowed_extensions=_ALL_ALLOWED_EXTENSIONS,\n        )\n        module_name, default_builder_kwargs = infer_module_for_data_files(\n            data_files=data_files,\n            path=self.path,\n        )\n        data_files = data_files.filter(\n            extensions=_MODULE_TO_EXTENSIONS[module_name] + _MODULE_TO_METADATA_EXTENSIONS[module_name],\n            file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name],\n        )\n        module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]\n        if metadata_configs:\n            builder_configs, default_config_name = create_builder_configs_from_metadata_configs(\n                module_path,\n                metadata_configs,\n                base_path=base_path,\n                default_builder_kwargs=default_builder_kwargs,\n            )\n        else:\n            builder_configs: list[BuilderConfig] = [\n                import_main_class(module_path).BUILDER_CONFIG_CLASS(\n                    data_files=data_files,\n                    **default_builder_kwargs,\n                )\n          
  ]\n            default_config_name = None\n        builder_kwargs = {\n            \"base_path\": base_path,\n            \"dataset_name\": camelcase_to_snakecase(Path(self.path).name),\n        }\n        if self.data_dir:\n            builder_kwargs[\"data_files\"] = data_files\n        # this file is deprecated and was created automatically in old versions of push_to_hub\n        if hffs.isfile(xjoin(self.path, config.DATASETDICT_INFOS_FILENAME)):\n            with hffs.open(xjoin(self.path, config.DATASETDICT_INFOS_FILENAME), \"r\", encoding=\"utf-8\") as f:\n                legacy_dataset_infos = DatasetInfosDict(\n                    {\n                        config_name: DatasetInfo.from_dict(dataset_info_dict)\n                        for config_name, dataset_info_dict in json.load(f).items()\n                    }\n                )\n                if len(legacy_dataset_infos) == 1:\n                    # old config e.g. named \"username--dataset_name\"\n                    legacy_config_name = next(iter(legacy_dataset_infos))\n                    legacy_dataset_infos[\"default\"] = legacy_dataset_infos.pop(legacy_config_name)\n            legacy_dataset_infos.update(dataset_infos)\n            dataset_infos = legacy_dataset_infos\n        if default_config_name is None and len(dataset_infos) == 1:\n            default_config_name = next(iter(dataset_infos))\n\n        hash = Hasher.hash({\"dataset_infos\": dataset_infos, \"builder_configs\": builder_configs})\n        return DatasetModule(\n            module_path,\n            hash,\n            builder_kwargs,\n            dataset_infos=dataset_infos,\n            builder_configs_parameters=BuilderConfigsParameters(\n                metadata_configs=metadata_configs,\n                builder_configs=builder_configs,\n                default_config_name=default_config_name,\n            ),\n        )\n\n\ndef dataset_module_factory(\n    path: str,\n    revision: Optional[Union[str, Version]] = 
None,\n    download_config: Optional[DownloadConfig] = None,\n    download_mode: Optional[Union[DownloadMode, str]] = None,\n    data_dir: Optional[str] = None,\n    data_files: Optional[Union[dict, list, str, DataFilesDict]] = None,\n    cache_dir: Optional[str] = None,\n    **download_kwargs,\n) -> DatasetModule:\n    \"\"\"\n    Download/extract/cache a dataset module.\n\n    Dataset codes are cached inside the dynamic modules cache to allow easy import (avoid ugly sys.path tweaks).\n\n    Args:\n\n        path (str): Path or name of the dataset.\n            Depending on ``path``, the dataset builder that is used comes from one of the generic dataset builders (JSON, CSV, Parquet, text etc.).\n\n            For local datasets:\n\n            - if ``path`` is a local directory (containing data files only)\n              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory\n              e.g. ``'./path/to/directory/with/my/csv/data'``.\n\n            For datasets on the Hugging Face Hub (list all available datasets with ``huggingface_hub.list_datasets()``)\n\n            - if ``path`` is a dataset repository on the HF hub (containing data files only)\n              -> load a generic dataset builder (csv, text etc.) based on the content of the repository\n              e.g. ``'username/dataset_name'``, a dataset repository on the HF hub containing your data files.\n\n            For datasets in Storage Buckets on the Hugging Face Hub (list all available buckets with ``huggingface_hub.list_buckets()``)\n\n            - if `path` is a directory within a Storage Bucket on the HF Hub (containing data files only)\n              -> load the dataset from supported files in the directory (csv, json, parquet, etc.)\n              e.g. 
`'buckets/username/bucket_name/my_dataset'`.\n\n        revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load.\n            As datasets have their own git repository on the Datasets Hub, the default version \"main\" corresponds to their \"main\" branch.\n            You can specify a different version than the default \"main\" by using a commit SHA or a git tag of the dataset repository.\n        download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.\n        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.\n        data_dir (:obj:`str`, optional): Directory with the data files. Used only if `data_files` is not specified,\n            in which case it's equivalent to passing `os.path.join(data_dir, \"**\")` as `data_files`.\n        data_files (:obj:`Union[Dict, List, str]`, optional): Defining the data_files of the dataset configuration.\n        cache_dir (`str`, *optional*):\n            Directory to read/write data. 
Defaults to `\"~/.cache/huggingface/datasets\"`.\n\n            <Added version=\"2.16.0\"/>\n\n        **download_kwargs (additional keyword arguments): optional attributes for DownloadConfig() which will override\n            the attributes in download_config if supplied.\n\n    Returns:\n        DatasetModule\n    \"\"\"\n    if download_config is None:\n        download_config = DownloadConfig(**download_kwargs)\n    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)\n    download_config.extract_compressed_file = True\n    download_config.force_extract = True\n    download_config.force_download = download_mode == DownloadMode.FORCE_REDOWNLOAD\n\n    filename = list(filter(lambda x: x, path.replace(os.sep, \"/\").split(\"/\")))[-1]\n    if not filename.endswith(\".py\"):\n        filename = filename + \".py\"\n    combined_path = os.path.join(path, filename)\n\n    # We have several ways to get a dataset builder:\n    #\n    # - if path is the name of a packaged dataset module\n    #   -> use the packaged module (json, csv, etc.)\n    #\n    # - if path is a local directory (but no python file)\n    #   -> use a packaged module (csv, text etc.) based on content of the directory\n    #\n    # - if path has one \"/\" and is a dataset repository on the HF hub\n    #   -> use a packaged module (csv, text etc.) based on content of the repository\n    #\n    # - if path starts with \"buckets/\" and points to a Storage Bucket on the HF hub\n    #   -> use a packaged module (csv, text etc.) 
based on content of the directory in the bucket\n\n    if path.startswith(\"hf://datasets/\"):\n        path = path[len(\"hf://datasets/\") :]\n        remote_only = True\n    elif path.startswith(\"hf://buckets/\"):\n        path = path[len(\"hf://\") :]\n        remote_only = True\n    else:\n        remote_only = False\n\n    # Try packaged\n    if path in _PACKAGED_DATASETS_MODULES:\n        return PackagedDatasetModuleFactory(\n            path,\n            data_dir=data_dir,\n            data_files=data_files,\n            download_config=download_config,\n            download_mode=download_mode,\n        ).get_module()\n    # Try locally\n    elif path.endswith(filename):\n        raise RuntimeError(f\"Dataset scripts are no longer supported, but found {filename}\")\n    elif os.path.isfile(combined_path):\n        raise RuntimeError(f\"Dataset scripts are no longer supported, but found {filename}\")\n    elif os.path.isdir(path) and not remote_only:\n        return LocalDatasetModuleFactory(\n            path, data_dir=data_dir, data_files=data_files, download_mode=download_mode\n        ).get_module()\n    # Try remotely\n    elif path.startswith(\"buckets/\"):\n        if BucketNotFoundError is None:\n            raise ImportError(\"Loading datasets from buckets requires huggingface_hub>=1.6.0\")\n        # We check that the bucket exists, and the directory exists, and authentication in one call\n        api = HfApi(\n            endpoint=config.HF_ENDPOINT,\n            token=download_config.token,\n            library_name=\"datasets\",\n            library_version=__version__,\n            user_agent=get_datasets_user_agent(download_config.user_agent),\n        )\n        _, _namespace, _bucket_name, *_path_segments = path.split(\"/\")\n        bucket_id = _namespace + \"/\" + _bucket_name\n        prefix = \"/\".join(s for s in _path_segments if s)\n        try:\n            next(iter(api.list_bucket_tree(bucket_id, prefix)))\n        except (\n      
      OfflineModeIsEnabled,\n            requests.exceptions.Timeout,\n            requests.exceptions.ConnectionError,\n            httpx.ConnectError,\n            httpx.TimeoutException,\n        ) as e:\n            raise ConnectionError(f\"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})\") from e\n        except StopIteration as e:\n            raise DatasetNotFoundError(f\"Bucket directory at {path} doesn't exist\") from e\n        except BucketNotFoundError as e:\n            raise DatasetNotFoundError(f\"Bucket '{bucket_id}' doesn't exist on the Hub or cannot be accessed.\") from e\n        return HubBucketDatasetModuleFactory(\n            path,\n            data_dir=data_dir,\n            data_files=data_files,\n            download_config=download_config,\n            download_mode=download_mode,\n        ).get_module()\n    elif is_relative_path(path) and path.count(\"/\") <= 1:\n        try:\n            # Get the Dataset Card + get the revision + check authentication all in one call\n            # We fix the commit_hash in case there are new commits in the meantime\n            api = HfApi(\n                endpoint=config.HF_ENDPOINT,\n                token=download_config.token,\n                library_name=\"datasets\",\n                library_version=__version__,\n                user_agent=get_datasets_user_agent(download_config.user_agent),\n            )\n            try:\n                _raise_if_offline_mode_is_enabled()\n                dataset_readme_path = api.hf_hub_download(\n                    repo_id=path,\n                    filename=config.REPOCARD_FILENAME,\n                    repo_type=\"dataset\",\n                    revision=revision,\n                    proxies=download_config.proxies,\n                )\n                commit_hash = os.path.basename(os.path.dirname(dataset_readme_path))\n            except LocalEntryNotFoundError as e:\n                if isinstance(\n                    e.__cause__,\n   
                 (\n                        OfflineModeIsEnabled,\n                        requests.exceptions.Timeout,\n                        requests.exceptions.ConnectionError,\n                        httpx.ConnectError,\n                        httpx.TimeoutException,\n                    ),\n                ):\n                    raise ConnectionError(f\"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})\") from e\n                else:\n                    raise\n            except EntryNotFoundError:\n                commit_hash = api.dataset_info(\n                    path,\n                    revision=revision,\n                    timeout=100.0,\n                ).sha\n            except (\n                OfflineModeIsEnabled,\n                requests.exceptions.Timeout,\n                requests.exceptions.ConnectionError,\n                httpx.ConnectError,\n                httpx.TimeoutException,\n            ) as e:\n                raise ConnectionError(f\"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})\") from e\n            except GatedRepoError as e:\n                message = f\"Dataset '{path}' is a gated dataset on the Hub.\"\n                if e.response.status_code == 401:\n                    message += \" You must be authenticated to access it.\"\n                elif e.response.status_code == 403:\n                    message += f\" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access.\"\n                raise DatasetNotFoundError(message) from e\n            except RepositoryNotFoundError as e:\n                raise DatasetNotFoundError(f\"Dataset '{path}' doesn't exist on the Hub or cannot be accessed.\") from e\n            try:\n                api.hf_hub_download(\n                    repo_id=path,\n                    filename=filename,\n                    repo_type=\"dataset\",\n                    revision=commit_hash,\n                    
proxies=download_config.proxies,\n                )\n                raise RuntimeError(f\"Dataset scripts are no longer supported, but found {filename}\")\n            except EntryNotFoundError:\n                # Use the infos from the parquet export except in some cases:\n                if data_dir or data_files or (revision and revision != \"main\"):\n                    use_exported_dataset_infos = False\n                else:\n                    use_exported_dataset_infos = True\n                return HubDatasetModuleFactory(\n                    path,\n                    commit_hash=commit_hash,\n                    data_dir=data_dir,\n                    data_files=data_files,\n                    download_config=download_config,\n                    download_mode=download_mode,\n                    use_exported_dataset_infos=use_exported_dataset_infos,\n                ).get_module()\n            except GatedRepoError as e:\n                message = f\"Dataset '{path}' is a gated dataset on the Hub.\"\n                if e.response.status_code == 401:\n                    message += \" You must be authenticated to access it.\"\n                elif e.response.status_code == 403:\n                    message += f\" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access.\"\n                raise DatasetNotFoundError(message) from e\n        except RevisionNotFoundError as e:\n            raise DatasetNotFoundError(f\"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub.\") from e\n        except Exception as e1:\n            # All the attempts failed, before raising the error we should check if the module is already cached\n            try:\n                return CachedDatasetModuleFactory(path, cache_dir=cache_dir).get_module()\n            except Exception:\n                # If it's not in the cache, then it doesn't exist.\n                if isinstance(e1, OfflineModeIsEnabled):\n                    raise 
ConnectionError(f\"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}\") from None\n                if isinstance(e1, (DataFilesNotFoundError, DatasetNotFoundError, EmptyDatasetError)):\n                    raise e1 from None\n                if isinstance(e1, FileNotFoundError):\n                    raise FileNotFoundError(\n                        f\"Couldn't find any data file at {relative_to_absolute_path(path)}. \"\n                        f\"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}\"\n                    ) from None\n                raise e1 from None\n    else:\n        raise FileNotFoundError(f\"Couldn't find any data file at {relative_to_absolute_path(path)}.\")\n\n\ndef load_dataset_builder(\n    path: str,\n    name: Optional[str] = None,\n    data_dir: Optional[str] = None,\n    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,\n    cache_dir: Optional[str] = None,\n    features: Optional[Features] = None,\n    download_config: Optional[DownloadConfig] = None,\n    download_mode: Optional[Union[DownloadMode, str]] = None,\n    revision: Optional[Union[str, Version]] = None,\n    token: Optional[Union[bool, str]] = None,\n    storage_options: Optional[dict] = None,\n    **config_kwargs,\n) -> DatasetBuilder:\n    \"\"\"Load a dataset builder which can be used to:\n\n    - Inspect general information that is required to build a dataset (cache directory, config, dataset info, features, data files, etc.)\n    - Download and prepare the dataset as Arrow files in the cache\n    - Get a streaming dataset without downloading or caching anything\n\n    You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].\n\n    A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) 
and possibly\n    in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)\n\n    Args:\n\n        path (`str`):\n            Path or name of the dataset.\n\n            - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])\n              -> load the dataset builder from supported files in the repository (csv, json, parquet, etc.)\n              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.\n\n            - if `path` is a directory within a Storage Bucket on the HF Hub (list your buckets with [`huggingface_hub.list_buckets`])\n              -> load the dataset from supported files in the directory (csv, json, parquet, etc.)\n              e.g. `'buckets/username/bucket_name/my_dataset'`.\n\n            - if `path` is a local directory\n              -> load the dataset builder from supported files in the directory (csv, json, parquet, etc.)\n              e.g. `'./path/to/directory/with/my/csv/data'`.\n\n            - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified\n              (available builders are \"json\", \"csv\", \"parquet\", \"arrow\", \"text\", \"xml\", \"webdataset\", \"imagefolder\", \"audiofolder\", \"videofolder\")\n              -> load the dataset builder from the files in `data_files` or `data_dir`\n              e.g. `'parquet'`.\n\n            Use a `hf://` path like `'hf://datasets/username/dataset_name'` to allow remote only.\n            Use an absolute path to allow local only.\n\n        name (`str`, *optional*):\n            Defining the name of the dataset configuration.\n        data_dir (`str`, *optional*):\n            Defining the `data_dir` of the dataset configuration. If specified for the generic builders (csv, text etc.) 
or the Hub datasets and `data_files` is `None`,\n            the behavior is equal to passing `os.path.join(data_dir, **)` as `data_files` to reference all the files in a directory.\n        data_files (`str` or `Sequence` or `Mapping`, *optional*):\n            Path(s) to source data file(s).\n        cache_dir (`str`, *optional*):\n            Directory to read/write data. Defaults to `\"~/.cache/huggingface/datasets\"`.\n        features ([`Features`], *optional*):\n            Set the features type to use for this dataset.\n        download_config ([`DownloadConfig`], *optional*):\n            Specific download configuration parameters.\n        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):\n            Download/generate mode.\n        revision ([`Version`] or `str`, *optional*):\n            Version of the dataset to load.\n            As datasets have their own git repository on the Datasets Hub, the default version \"main\" corresponds to their \"main\" branch.\n            You can specify a different version than the default \"main\" by using a commit SHA or a git tag of the dataset repository.\n        token (`str` or `bool`, *optional*):\n            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.\n            If `True`, or not specified, will get token from `\"~/.huggingface\"`.\n        storage_options (`dict`, *optional*, defaults to `None`):\n            **Experimental**. 
Key/value pairs to be passed on to the dataset file-system backend, if any.\n\n            <Added version=\"2.11.0\"/>\n\n        **config_kwargs (additional keyword arguments):\n            Keyword arguments to be passed to the [`BuilderConfig`]\n            and used in the [`DatasetBuilder`].\n\n    Returns:\n        [`DatasetBuilder`]\n\n    Example:\n\n    ```py\n    >>> from datasets import load_dataset_builder\n    >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')\n    >>> ds_builder.info.features\n    {'label': ClassLabel(names=['neg', 'pos']),\n     'text': Value('string')}\n    ```\n    \"\"\"\n    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)\n    if token is not None:\n        download_config = download_config.copy() if download_config else DownloadConfig()\n        download_config.token = token\n    if storage_options is not None:\n        download_config = download_config.copy() if download_config else DownloadConfig()\n        download_config.storage_options.update(storage_options)\n    if features is not None:\n        features = _fix_for_backward_compatible_features(features)\n    dataset_module = dataset_module_factory(\n        path,\n        revision=revision,\n        download_config=download_config,\n        download_mode=download_mode,\n        data_dir=data_dir,\n        data_files=data_files,\n        cache_dir=cache_dir,\n    )\n    # Get dataset builder class\n    builder_kwargs = dataset_module.builder_kwargs\n    data_dir = builder_kwargs.pop(\"data_dir\", data_dir)\n    data_files = builder_kwargs.pop(\"data_files\", data_files)\n    config_name = builder_kwargs.pop(\n        \"config_name\", name or dataset_module.builder_configs_parameters.default_config_name\n    )\n    dataset_name = builder_kwargs.pop(\"dataset_name\", None)\n    info = dataset_module.dataset_infos.get(config_name) if dataset_module.dataset_infos else None\n\n    if (\n        path in 
_PACKAGED_DATASETS_MODULES\n        and data_files is None\n        and dataset_module.builder_configs_parameters.builder_configs[0].data_files is None\n    ):\n        error_msg = f\"Please specify the data files or data directory to load for the {path} dataset builder.\"\n        example_extensions = [\n            extension for extension in _EXTENSION_TO_MODULE if _EXTENSION_TO_MODULE[extension] == path\n        ]\n        if example_extensions:\n            error_msg += f'\\nFor example `data_files={{\"train\": \"path/to/data/train/*.{example_extensions[0]}\"}}`'\n        raise ValueError(error_msg)\n\n    # When users pass config kwargs, they should override module-provided defaults\n    # instead of colliding at constructor call time.\n    builder_kwargs = {key: value for key, value in builder_kwargs.items() if key not in config_kwargs}\n\n    builder_cls = get_dataset_builder_class(dataset_module, dataset_name=dataset_name)\n    # Instantiate the dataset builder\n    builder_instance: DatasetBuilder = builder_cls(\n        cache_dir=cache_dir,\n        dataset_name=dataset_name,\n        config_name=config_name,\n        data_dir=data_dir,\n        data_files=data_files,\n        hash=dataset_module.hash,\n        info=info,\n        features=features,\n        token=token,\n        storage_options=storage_options,\n        **builder_kwargs,\n        **config_kwargs,\n    )\n    builder_instance._use_legacy_cache_dir_if_possible(dataset_module)\n\n    return builder_instance\n\n\n@overload\ndef load_dataset(\n    path: str,\n    name: Optional[str] = None,\n    data_dir: Optional[str] = None,\n    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,\n    split: None = None,\n    cache_dir: Optional[str] = None,\n    features: Optional[Features] = None,\n    download_config: Optional[DownloadConfig] = None,\n    download_mode: Optional[Union[DownloadMode, str]] = None,\n    verification_mode: 
Optional[Union[VerificationMode, str]] = None,\n    keep_in_memory: Optional[bool] = None,\n    save_infos: bool = False,\n    revision: Optional[Union[str, Version]] = None,\n    token: Optional[Union[bool, str]] = None,\n    streaming: Literal[False] = False,\n    num_proc: Optional[int] = None,\n    storage_options: Optional[dict] = None,\n    **config_kwargs: Any,\n) -> DatasetDict: ...\n\n\n@overload\ndef load_dataset(\n    path: str,\n    name: Optional[str] = None,\n    data_dir: Optional[str] = None,\n    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,\n    *,\n    split: Union[str, Split, list[str], list[Split]],\n    cache_dir: Optional[str] = None,\n    features: Optional[Features] = None,\n    download_config: Optional[DownloadConfig] = None,\n    download_mode: Optional[Union[DownloadMode, str]] = None,\n    verification_mode: Optional[Union[VerificationMode, str]] = None,\n    keep_in_memory: Optional[bool] = None,\n    save_infos: bool = False,\n    revision: Optional[Union[Version, str]] = None,\n    token: Optional[Union[bool, str]] = None,\n    streaming: Literal[False] = False,\n    num_proc: Optional[int] = None,\n    storage_options: Optional[dict] = None,\n    **config_kwargs: Any,\n) -> Dataset: ...\n\n\n@overload\ndef load_dataset(\n    path: str,\n    name: Optional[str] = None,\n    data_dir: Optional[str] = None,\n    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,\n    split: None = None,\n    cache_dir: Optional[str] = None,\n    features: Optional[Features] = None,\n    download_config: Optional[DownloadConfig] = None,\n    download_mode: Optional[Union[DownloadMode, str]] = None,\n    verification_mode: Optional[Union[VerificationMode, str]] = None,\n    keep_in_memory: Optional[bool] = None,\n    save_infos: bool = False,\n    revision: Optional[Union[Version, str]] = None,\n    token: Optional[Union[bool, str]] = None,\n    *,\n    
streaming: Literal[True],\n    num_proc: Optional[int] = None,\n    storage_options: Optional[dict] = None,\n    **config_kwargs: Any,\n) -> IterableDatasetDict: ...\n\n\n@overload\ndef load_dataset(\n    path: str,\n    name: Optional[str] = None,\n    data_dir: Optional[str] = None,\n    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,\n    *,\n    split: Union[str, Split, list[str], list[Split]],\n    cache_dir: Optional[str] = None,\n    features: Optional[Features] = None,\n    download_config: Optional[DownloadConfig] = None,\n    download_mode: Optional[Union[DownloadMode, str]] = None,\n    verification_mode: Optional[Union[VerificationMode, str]] = None,\n    keep_in_memory: Optional[bool] = None,\n    save_infos: bool = False,\n    revision: Optional[Union[Version, str]] = None,\n    token: Optional[Union[bool, str]] = None,\n    streaming: Literal[True],\n    num_proc: Optional[int] = None,\n    storage_options: Optional[dict] = None,\n    **config_kwargs: Any,\n) -> IterableDataset: ...\n\n\ndef load_dataset(\n    path: str,\n    name: Optional[str] = None,\n    data_dir: Optional[str] = None,\n    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,\n    split: Optional[Union[str, Split, list[str], list[Split]]] = None,\n    cache_dir: Optional[str] = None,\n    features: Optional[Features] = None,\n    download_config: Optional[DownloadConfig] = None,\n    download_mode: Optional[Union[DownloadMode, str]] = None,\n    verification_mode: Optional[Union[VerificationMode, str]] = None,\n    keep_in_memory: Optional[bool] = None,\n    save_infos: bool = False,\n    revision: Optional[Union[str, Version]] = None,\n    token: Optional[Union[bool, str]] = None,\n    streaming: bool = False,\n    num_proc: Optional[int] = None,\n    storage_options: Optional[dict] = None,\n    **config_kwargs,\n) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]:\n   
 \"\"\"Load a dataset from the Hugging Face Hub, or a local dataset.\n\n    You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].\n\n    A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly\n    in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)\n\n    This function does the following under the hood:\n\n        1. Load a dataset builder:\n\n            * Find the most common data format in the dataset and pick its associated builder (JSON, CSV, Parquet, Webdataset, ImageFolder, AudioFolder, etc.)\n            * Find which file goes into which split (e.g. train/test) based on file and directory names or on the YAML configuration\n            * It is also possible to specify `data_files` manually, and which dataset builder to use (e.g. \"parquet\").\n\n        2. Run the dataset builder:\n\n            In the general case:\n\n            * Download the data files from the dataset if they are not already available locally or cached.\n            * Process and cache the dataset in typed Arrow tables for caching.\n\n                Arrow table are arbitrarily long, typed tables which can store nested objects and be mapped to numpy/pandas/python generic types.\n                They can be directly accessed from disk, loaded in RAM or even streamed over the web.\n\n            In the streaming case:\n\n            * Don't download or cache anything. Instead, the dataset is lazily loaded and will be streamed on-the-fly when iterating on it.\n\n        3. 
Return a dataset built from the requested splits in `split` (default: all).\n\n    Args:\n\n        path (`str`):\n            Path or name of the dataset.\n\n            - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])\n              -> load the dataset from supported files in the repository (csv, json, parquet, etc.)\n              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.\n\n            - if `path` is a directory within a Storage Bucket on the HF Hub (list your buckets with [`huggingface_hub.list_buckets`])\n              -> load the dataset from supported files in the directory (csv, json, parquet, etc.)\n              e.g. `'buckets/username/bucket_name/my_dataset'`.\n\n            - if `path` is a local directory\n              -> load the dataset from supported files in the directory (csv, json, parquet, etc.)\n              e.g. `'./path/to/directory/with/my/csv/data'`.\n\n            - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified\n              (available builders are \"json\", \"csv\", \"parquet\", \"arrow\", \"text\", \"xml\", \"webdataset\", \"imagefolder\", \"audiofolder\", \"videofolder\")\n              -> load the dataset from the files in `data_files` or `data_dir`\n              e.g. `'parquet'`.\n\n            Use a `hf://` path like `'hf://datasets/username/dataset_name'` to allow remote only.\n            Use an absolute path to allow local only.\n\n        name (`str`, *optional*):\n            Defining the name of the dataset configuration.\n        data_dir (`str`, *optional*):\n            Defining the `data_dir` of the dataset configuration. If specified for the generic builders (csv, text etc.) 
or the Hub datasets and `data_files` is `None`,\n            the behavior is equal to passing `os.path.join(data_dir, **)` as `data_files` to reference all the files in a directory.\n        data_files (`str` or `Sequence` or `Mapping`, *optional*):\n            Path(s) to source data file(s).\n        split (`Split` or `str`):\n            Which split of the data to load.\n            If `None`, will return a `dict` with all splits (typically `datasets.Split.TRAIN` and `datasets.Split.TEST`).\n            If given, will return a single Dataset.\n            Splits can be combined and specified like in tensorflow-datasets.\n        cache_dir (`str`, *optional*):\n            Directory to read/write data. Defaults to `\"~/.cache/huggingface/datasets\"`.\n        features (`Features`, *optional*):\n            Set the features type to use for this dataset.\n        download_config ([`DownloadConfig`], *optional*):\n            Specific download configuration parameters.\n        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):\n            Download/generate mode.\n        verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):\n            Verification mode determining the checks to run on the downloaded/processed dataset information (checksums/size/splits/...).\n\n            <Added version=\"2.9.1\"/>\n        keep_in_memory (`bool`, defaults to `None`):\n            Whether to copy the dataset in-memory. If `None`, the dataset\n            will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to\n            nonzero. 
See more details in the [improve performance](../cache#improve-performance) section.\n        revision ([`Version`] or `str`, *optional*):\n            Version of the dataset to load.\n            As datasets have their own git repository on the Datasets Hub, the default version \"main\" corresponds to their \"main\" branch.\n            You can specify a different version than the default \"main\" by using a commit SHA or a git tag of the dataset repository.\n        token (`str` or `bool`, *optional*):\n            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.\n            If `True`, or not specified, will get token from `\"~/.huggingface\"`.\n        streaming (`bool`, defaults to `False`):\n            If set to `True`, don't download the data files. Instead, it streams the data progressively while\n            iterating on the dataset. An [`IterableDataset`] or [`IterableDatasetDict`] is returned instead in this case.\n\n            Note that streaming works for datasets that use data formats that support being iterated over like txt, csv, jsonl for example.\n            Json files may be downloaded completely. Also streaming from remote zip or gzip files is supported but other compressed formats\n            like rar and xz are not yet supported. The tgz format doesn't allow streaming.\n        num_proc (`int`, *optional*, defaults to `None`):\n            Number of processes when downloading and generating the dataset locally.\n            Multiprocessing is disabled by default.\n\n            <Added version=\"2.7.0\"/>\n        storage_options (`dict`, *optional*, defaults to `None`):\n            **Experimental**. 
Key/value pairs to be passed on to the dataset file-system backend, if any.\n\n            <Added version=\"2.11.0\"/>\n        **config_kwargs (additional keyword arguments):\n            Keyword arguments to be passed to the `BuilderConfig`\n            and used in the [`DatasetBuilder`].\n\n    Returns:\n        [`Dataset`] or [`DatasetDict`]:\n        - if `split` is not `None`: the dataset requested,\n        - if `split` is `None`, a [`~datasets.DatasetDict`] with each split.\n\n        or [`IterableDataset`] or [`IterableDatasetDict`]: if `streaming=True`\n\n        - if `split` is not `None`, the dataset is requested\n        - if `split` is `None`, a [`~datasets.streaming.IterableDatasetDict`] with each split.\n\n    Example:\n\n    Load a dataset from the Hugging Face Hub:\n\n    ```py\n    >>> from datasets import load_dataset\n    >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train')\n\n    # Load a subset or dataset configuration (here 'sst2')\n    >>> from datasets import load_dataset\n    >>> ds = load_dataset('nyu-mll/glue', 'sst2', split='train')\n\n    # Manual mapping of data files to splits\n    >>> data_files = {'train': 'train.csv', 'test': 'test.csv'}\n    >>> ds = load_dataset('namespace/your_dataset_name', data_files=data_files)\n\n    # Manual selection of a directory to load\n    >>> ds = load_dataset('namespace/your_dataset_name', data_dir='folder_name')\n    ```\n\n    Load a dataset from a Storage Bucket on the Hugging Face Hub:\n\n    ```py\n    >>> from datasets import load_dataset\n    >>> ds = load_dataset('buckets/username/bucket_name/rotten_tomatoes', split='train')\n    ```\n\n    Load a local dataset:\n\n    ```py\n    # Load a CSV file\n    >>> from datasets import load_dataset\n    >>> ds = load_dataset('csv', data_files='path/to/local/my_dataset.csv')\n\n    # Load a JSON file\n    >>> from datasets import load_dataset\n    >>> ds = load_dataset('json', 
data_files='path/to/local/my_dataset.json')\n    ```\n\n    Load an [`~datasets.IterableDataset`]:\n\n    ```py\n    >>> from datasets import load_dataset\n    >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train', streaming=True)\n    ```\n\n    Load an image dataset with the `ImageFolder` dataset builder:\n\n    ```py\n    >>> from datasets import load_dataset\n    >>> ds = load_dataset('imagefolder', data_dir='/path/to/images', split='train')\n    ```\n    \"\"\"\n    if \"trust_remote_code\" in config_kwargs:\n        if config_kwargs.pop(\"trust_remote_code\"):\n            logger.error(\n                \"`trust_remote_code` is not supported anymore.\\n\"\n                f\"Please check that the Hugging Face dataset '{path}' isn't based on a loading script and remove `trust_remote_code`.\\n\"\n                \"If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.\"\n            )\n    if data_files is not None and not data_files:\n        raise ValueError(f\"Empty 'data_files': '{data_files}'. It should be either non-empty or None (default).\")\n    if Path(path, config.DATASET_STATE_JSON_FILENAME).exists():\n        raise ValueError(\n            \"You are trying to load a dataset that was saved using `save_to_disk`. \"\n            \"Please use `load_from_disk` instead.\"\n        )\n\n    if streaming and num_proc is not None:\n        raise NotImplementedError(\n            \"Loading a streaming dataset in parallel with `num_proc` is not implemented. 
\"\n            \"To parallelize streaming, you can wrap the dataset with a PyTorch DataLoader using `num_workers` > 1 instead.\"\n        )\n\n    download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)\n    verification_mode = VerificationMode(\n        (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS\n    )\n\n    # Create a dataset builder\n    builder_instance = load_dataset_builder(\n        path=path,\n        name=name,\n        data_dir=data_dir,\n        data_files=data_files,\n        cache_dir=cache_dir,\n        features=features,\n        download_config=download_config,\n        download_mode=download_mode,\n        revision=revision,\n        token=token,\n        storage_options=storage_options,\n        **config_kwargs,\n    )\n\n    # Return iterable dataset in case of streaming\n    if streaming:\n        return builder_instance.as_streaming_dataset(split=split)\n\n    # Download and prepare data\n    builder_instance.download_and_prepare(\n        download_config=download_config,\n        download_mode=download_mode,\n        verification_mode=verification_mode,\n        num_proc=num_proc,\n        storage_options=storage_options,\n    )\n\n    # Build dataset for splits\n    keep_in_memory = (\n        keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)\n    )\n    ds = builder_instance.as_dataset(split=split, verification_mode=verification_mode, in_memory=keep_in_memory)\n\n    return ds\n\n\ndef load_from_disk(\n    dataset_path: PathLike, keep_in_memory: Optional[bool] = None, storage_options: Optional[dict] = None\n) -> Union[Dataset, DatasetDict]:\n    \"\"\"\n    Loads a dataset that was previously saved using [`~Dataset.save_to_disk`] from a dataset directory, or\n    from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n    Args:\n        dataset_path (`path-like`):\n         
   Path (e.g. `\"dataset/train\"`) or remote URI (e.g. `\"s3://my-bucket/dataset/train\"`)\n            of the [`Dataset`] or [`DatasetDict`] directory where the dataset/dataset-dict will be\n            loaded from.\n        keep_in_memory (`bool`, defaults to `None`):\n            Whether to copy the dataset in-memory. If `None`, the dataset\n            will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to\n            nonzero. See more details in the [improve performance](../cache#improve-performance) section.\n\n        storage_options (`dict`, *optional*):\n            Key/value pairs to be passed on to the file-system backend, if any.\n\n            <Added version=\"2.9.0\"/>\n\n    Returns:\n        [`Dataset`] or [`DatasetDict`]:\n        - If `dataset_path` is a path of a dataset directory: the dataset requested.\n        - If `dataset_path` is a path of a dataset dict directory, a [`DatasetDict`] with each split.\n\n    Example:\n\n    ```py\n    >>> from datasets import load_from_disk\n    >>> ds = load_from_disk('path/to/dataset/directory')\n    ```\n    \"\"\"\n    fs: fsspec.AbstractFileSystem\n    fs, *_ = url_to_fs(dataset_path, **(storage_options or {}))\n    if not fs.exists(dataset_path):\n        raise FileNotFoundError(f\"Directory {dataset_path} not found\")\n    if fs.isfile(posixpath.join(dataset_path, config.DATASET_INFO_FILENAME)) and fs.isfile(\n        posixpath.join(dataset_path, config.DATASET_STATE_JSON_FILENAME)\n    ):\n        return Dataset.load_from_disk(dataset_path, keep_in_memory=keep_in_memory, storage_options=storage_options)\n    elif fs.isfile(posixpath.join(dataset_path, config.DATASETDICT_JSON_FILENAME)):\n        return DatasetDict.load_from_disk(dataset_path, keep_in_memory=keep_in_memory, storage_options=storage_options)\n    else:\n        raise FileNotFoundError(\n            f\"Directory {dataset_path} is neither a `Dataset` directory nor a `DatasetDict` 
directory.\"\n        )\n"
  },
  {
    "path": "src/datasets/naming.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"Utilities for file names.\"\"\"\n\nimport itertools\nimport os\nimport re\n\n\n_uppercase_uppercase_re = re.compile(r\"([A-Z]+)([A-Z][a-z])\")\n_lowercase_uppercase_re = re.compile(r\"([a-z\\d])([A-Z])\")\n\n_single_underscore_re = re.compile(r\"(?<!_)_(?!_)\")\n_multiple_underscores_re = re.compile(r\"(_{2,})\")\n\n_split_re = r\"^\\w+(\\.\\w+)*$\"\n\nINVALID_WINDOWS_CHARACTERS_IN_PATH = r\"<>:/\\|?*\"\n\n\ndef camelcase_to_snakecase(name):\n    \"\"\"Convert camel-case string to snake-case.\"\"\"\n    name = _uppercase_uppercase_re.sub(r\"\\1_\\2\", name)\n    name = _lowercase_uppercase_re.sub(r\"\\1_\\2\", name)\n    return name.lower()\n\n\ndef snakecase_to_camelcase(name):\n    \"\"\"Convert snake-case string to camel-case string.\"\"\"\n    name = _single_underscore_re.split(name)\n    name = [_multiple_underscores_re.split(n) for n in name]\n    return \"\".join(n.capitalize() for n in itertools.chain.from_iterable(name) if n != \"\")\n\n\ndef filename_prefix_for_name(name):\n    if os.path.basename(name) != name:\n        raise ValueError(f\"Should be a dataset name, not a path: {name}\")\n    return camelcase_to_snakecase(name)\n\n\ndef filename_prefix_for_split(name, split):\n    if os.path.basename(name) != name:\n        raise ValueError(f\"Should be a dataset name, not a 
path: {name}\")\n    if not re.match(_split_re, split):\n        raise ValueError(f\"Split name should match '{_split_re}'' but got '{split}'.\")\n    return f\"{filename_prefix_for_name(name)}-{split}\"\n\n\ndef filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):\n    prefix = filename_prefix_for_split(dataset_name, split)\n    if filetype_suffix:\n        prefix += f\".{filetype_suffix}\"\n    filepath = os.path.join(data_dir, prefix)\n    return f\"{filepath}*\"\n\n\ndef filenames_for_dataset_split(path, dataset_name, split, filetype_suffix=None, shard_lengths=None):\n    prefix = filename_prefix_for_split(dataset_name, split)\n    prefix = os.path.join(path, prefix)\n\n    if shard_lengths and len(shard_lengths) > 1:\n        num_shards = len(shard_lengths)\n        filenames = [f\"{prefix}-{shard_id:05d}-of-{num_shards:05d}\" for shard_id in range(num_shards)]\n        if filetype_suffix:\n            filenames = [filename + f\".{filetype_suffix}\" for filename in filenames]\n        return filenames\n    else:\n        filename = prefix\n        if filetype_suffix:\n            filename += f\".{filetype_suffix}\"\n        return [filename]\n"
  },
  {
    "path": "src/datasets/packaged_modules/__init__.py",
    "content": "import inspect\nimport re\nfrom typing import Dict, List, Tuple\n\nfrom huggingface_hub.utils import insecure_hashlib\n\nfrom .arrow import arrow\nfrom .audiofolder import audiofolder\nfrom .cache import cache\nfrom .csv import csv\nfrom .eval import eval\nfrom .hdf5 import hdf5\nfrom .imagefolder import imagefolder\nfrom .json import json\nfrom .lance import lance\nfrom .niftifolder import niftifolder\nfrom .pandas import pandas\nfrom .parquet import parquet\nfrom .pdffolder import pdffolder\nfrom .sql import sql\nfrom .text import text\nfrom .videofolder import videofolder\nfrom .webdataset import webdataset\nfrom .xml import xml\n\n\ndef _hash_python_lines(lines: list[str]) -> str:\n    filtered_lines = []\n    for line in lines:\n        line = re.sub(r\"#.*\", \"\", line)  # remove comments\n        if line:\n            filtered_lines.append(line)\n    full_str = \"\\n\".join(filtered_lines)\n\n    # Make a hash from all this code\n    full_bytes = full_str.encode(\"utf-8\")\n    return insecure_hashlib.sha256(full_bytes).hexdigest()\n\n\n# get importable module names and hash for caching\n_PACKAGED_DATASETS_MODULES = {\n    \"csv\": (csv.__name__, _hash_python_lines(inspect.getsource(csv).splitlines())),\n    \"json\": (json.__name__, _hash_python_lines(inspect.getsource(json).splitlines())),\n    \"pandas\": (pandas.__name__, _hash_python_lines(inspect.getsource(pandas).splitlines())),\n    \"parquet\": (parquet.__name__, _hash_python_lines(inspect.getsource(parquet).splitlines())),\n    \"arrow\": (arrow.__name__, _hash_python_lines(inspect.getsource(arrow).splitlines())),\n    \"text\": (text.__name__, _hash_python_lines(inspect.getsource(text).splitlines())),\n    \"imagefolder\": (imagefolder.__name__, _hash_python_lines(inspect.getsource(imagefolder).splitlines())),\n    \"audiofolder\": (audiofolder.__name__, _hash_python_lines(inspect.getsource(audiofolder).splitlines())),\n    \"videofolder\": (videofolder.__name__, 
_hash_python_lines(inspect.getsource(videofolder).splitlines())),\n    \"pdffolder\": (pdffolder.__name__, _hash_python_lines(inspect.getsource(pdffolder).splitlines())),\n    \"niftifolder\": (niftifolder.__name__, _hash_python_lines(inspect.getsource(niftifolder).splitlines())),\n    \"webdataset\": (webdataset.__name__, _hash_python_lines(inspect.getsource(webdataset).splitlines())),\n    \"xml\": (xml.__name__, _hash_python_lines(inspect.getsource(xml).splitlines())),\n    \"hdf5\": (hdf5.__name__, _hash_python_lines(inspect.getsource(hdf5).splitlines())),\n    \"eval\": (eval.__name__, _hash_python_lines(inspect.getsource(eval).splitlines())),\n    \"lance\": (lance.__name__, _hash_python_lines(inspect.getsource(lance).splitlines())),\n}\n\n# get importable module names and hash for caching\n_PACKAGED_DATASETS_MODULES_2_15_HASHES = {\n    \"csv\": \"eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d\",\n    \"json\": \"8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96\",\n    \"pandas\": \"3ac4ffc4563c796122ef66899b9485a3f1a977553e2d2a8a318c72b8cc6f2202\",\n    \"parquet\": \"ca31c69184d9832faed373922c2acccec0b13a0bb5bbbe19371385c3ff26f1d1\",\n    \"arrow\": \"74f69db2c14c2860059d39860b1f400a03d11bf7fb5a8258ca38c501c878c137\",\n    \"text\": \"c4a140d10f020282918b5dd1b8a49f0104729c6177f60a6b49ec2a365ec69f34\",\n    \"imagefolder\": \"7b7ce5247a942be131d49ad4f3de5866083399a0f250901bd8dc202f8c5f7ce5\",\n    \"audiofolder\": \"d3c1655c66c8f72e4efb5c79e952975fa6e2ce538473a6890241ddbddee9071c\",\n}\n\n# Used to infer the module to use based on the data files extensions\n_EXTENSION_TO_MODULE: dict[str, tuple[str, dict]] = {\n    \".csv\": (\"csv\", {}),\n    \".tsv\": (\"csv\", {\"sep\": \"\\t\"}),\n    \".json\": (\"json\", {}),\n    \".jsonl\": (\"json\", {}),\n    # ndjson is no longer maintained (see: https://github.com/ndjson/ndjson-spec/issues/35#issuecomment-1285673417)\n    \".ndjson\": (\"json\", {}),\n    \".parquet\": 
(\"parquet\", {}),\n    \".geoparquet\": (\"parquet\", {}),\n    \".gpq\": (\"parquet\", {}),\n    \".arrow\": (\"arrow\", {}),\n    \".txt\": (\"text\", {}),\n    \".tar\": (\"webdataset\", {}),\n    \".xml\": (\"xml\", {}),\n    \".hdf5\": (\"hdf5\", {}),\n    \".h5\": (\"hdf5\", {}),\n    \".eval\": (\"eval\", {}),\n    \".lance\": (\"lance\", {}),\n}\n_EXTENSION_TO_MODULE.update({ext: (\"imagefolder\", {}) for ext in imagefolder.ImageFolder.EXTENSIONS})\n_EXTENSION_TO_MODULE.update({ext.upper(): (\"imagefolder\", {}) for ext in imagefolder.ImageFolder.EXTENSIONS})\n_EXTENSION_TO_MODULE.update({ext: (\"audiofolder\", {}) for ext in audiofolder.AudioFolder.EXTENSIONS})\n_EXTENSION_TO_MODULE.update({ext.upper(): (\"audiofolder\", {}) for ext in audiofolder.AudioFolder.EXTENSIONS})\n_EXTENSION_TO_MODULE.update({ext: (\"videofolder\", {}) for ext in videofolder.VideoFolder.EXTENSIONS})\n_EXTENSION_TO_MODULE.update({ext.upper(): (\"videofolder\", {}) for ext in videofolder.VideoFolder.EXTENSIONS})\n_EXTENSION_TO_MODULE.update({ext: (\"pdffolder\", {}) for ext in pdffolder.PdfFolder.EXTENSIONS})\n_EXTENSION_TO_MODULE.update({ext.upper(): (\"pdffolder\", {}) for ext in pdffolder.PdfFolder.EXTENSIONS})\n_EXTENSION_TO_MODULE.update({ext: (\"niftifolder\", {}) for ext in niftifolder.NiftiFolder.EXTENSIONS})\n_EXTENSION_TO_MODULE.update({ext.upper(): (\"niftifolder\", {}) for ext in niftifolder.NiftiFolder.EXTENSIONS})\n\n# Used to filter data files based on extensions given a module name\n_MODULE_TO_EXTENSIONS: dict[str, list[str]] = {}\nfor _ext, (_module, _) in _EXTENSION_TO_MODULE.items():\n    _MODULE_TO_EXTENSIONS.setdefault(_module, []).append(_ext)\n\nfor _module in _MODULE_TO_EXTENSIONS:\n    _MODULE_TO_EXTENSIONS[_module].append(\".zip\")\n\n# Used to filter data files based on file names\n_MODULE_TO_METADATA_FILE_NAMES: Dict[str, List[str]] = {}\nfor _module in _MODULE_TO_EXTENSIONS:\n    _MODULE_TO_METADATA_FILE_NAMES[_module] = 
[]\n_MODULE_TO_METADATA_FILE_NAMES[\"imagefolder\"] = imagefolder.ImageFolder.METADATA_FILENAMES\n_MODULE_TO_METADATA_FILE_NAMES[\"audiofolder\"] = imagefolder.ImageFolder.METADATA_FILENAMES\n_MODULE_TO_METADATA_FILE_NAMES[\"videofolder\"] = imagefolder.ImageFolder.METADATA_FILENAMES\n_MODULE_TO_METADATA_FILE_NAMES[\"pdffolder\"] = imagefolder.ImageFolder.METADATA_FILENAMES\n_MODULE_TO_METADATA_FILE_NAMES[\"niftifolder\"] = imagefolder.ImageFolder.METADATA_FILENAMES\n\n_MODULE_TO_METADATA_EXTENSIONS: Dict[str, List[str]] = {}\nfor _module in _MODULE_TO_EXTENSIONS:\n    _MODULE_TO_METADATA_EXTENSIONS[_module] = []\n_MODULE_TO_METADATA_EXTENSIONS[\"lance\"] = lance.Lance.METADATA_EXTENSIONS\n\n# Total\n\n_ALL_EXTENSIONS = list(_EXTENSION_TO_MODULE.keys()) + [\".zip\"]\n_ALL_METADATA_EXTENSIONS = sorted({_ext for _exts in _MODULE_TO_METADATA_EXTENSIONS.values() for _ext in _exts})\n_ALL_ALLOWED_EXTENSIONS = _ALL_EXTENSIONS + _ALL_METADATA_EXTENSIONS\n"
  },
  {
    "path": "src/datasets/packaged_modules/arrow/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/arrow/arrow.py",
    "content": "from dataclasses import dataclass\nfrom typing import Optional\n\nimport pyarrow as pa\n\nimport datasets\nfrom datasets.builder import Key\nfrom datasets.table import table_cast\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\n@dataclass\nclass ArrowConfig(datasets.BuilderConfig):\n    \"\"\"BuilderConfig for Arrow.\"\"\"\n\n    features: Optional[datasets.Features] = None\n\n    def __post_init__(self):\n        super().__post_init__()\n\n\nclass Arrow(datasets.ArrowBasedBuilder):\n    BUILDER_CONFIG_CLASS = ArrowConfig\n\n    def _info(self):\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager):\n        \"\"\"We handle string, list and dicts in datafiles\"\"\"\n        if not self.config.data_files:\n            raise ValueError(f\"At least one data file must be specified, but got data_files={self.config.data_files}\")\n        data_files = dl_manager.download(self.config.data_files)\n        splits = []\n        for split_name, files in data_files.items():\n            # Infer features if they are stored in the arrow schema\n            if self.info.features is None:\n                for file in files:\n                    with open(file, \"rb\") as f:\n                        try:\n                            reader = pa.ipc.open_stream(f)\n                        except (OSError, pa.lib.ArrowInvalid):\n                            reader = pa.ipc.open_file(f)\n                    self.info.features = datasets.Features.from_arrow_schema(reader.schema)\n                    break\n            splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={\"files\": files}))\n        return splits\n\n    def _cast_table(self, pa_table: pa.Table) -> pa.Table:\n        if self.info.features is not None:\n            # more expensive cast to support nested features with keys in a different order\n            # allows str <-> int/float or str to Audio for example\n         
   pa_table = table_cast(pa_table, self.info.features.arrow_schema)\n        return pa_table\n\n    def _generate_shards(self, files):\n        yield from files\n\n    def _generate_tables(self, files):\n        for file_idx, file in enumerate(files):\n            with open(file, \"rb\") as f:\n                try:\n                    try:\n                        batches = pa.ipc.open_stream(f)\n                    except (OSError, pa.lib.ArrowInvalid):\n                        reader = pa.ipc.open_file(f)\n                        batches = (reader.get_batch(i) for i in range(reader.num_record_batches))\n                    for batch_idx, record_batch in enumerate(batches):\n                        pa_table = pa.Table.from_batches([record_batch])\n                        # Uncomment for debugging (will print the Arrow table size and elements)\n                        # logger.warning(f\"pa_table: {pa_table} num rows: {pa_table.num_rows}\")\n                        # logger.warning('\\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))\n                        yield Key(file_idx, batch_idx), self._cast_table(pa_table)\n                except ValueError as e:\n                    logger.error(f\"Failed to read file '{file}' with error {type(e)}: {e}\")\n                    raise\n"
  },
  {
    "path": "src/datasets/packaged_modules/audiofolder/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/audiofolder/audiofolder.py",
    "content": "import datasets\n\nfrom ..folder_based_builder import folder_based_builder\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\nclass AudioFolderConfig(folder_based_builder.FolderBasedBuilderConfig):\n    \"\"\"Builder Config for AudioFolder.\"\"\"\n\n    drop_labels: bool = None\n    drop_metadata: bool = None\n\n    def __post_init__(self):\n        super().__post_init__()\n\n\nclass AudioFolder(folder_based_builder.FolderBasedBuilder):\n    BASE_FEATURE = datasets.Audio\n    BASE_COLUMN_NAME = \"audio\"\n    BUILDER_CONFIG_CLASS = AudioFolderConfig\n    EXTENSIONS: list[str]  # definition at the bottom of the script\n\n\n# Obtained with:\n# ```\n# import soundfile as sf\n#\n# AUDIO_EXTENSIONS = [f\".{format.lower()}\" for format in sf.available_formats().keys()]\n#\n# # .opus decoding is supported if libsndfile >= 1.0.31:\n# AUDIO_EXTENSIONS.extend([\".opus\"])\n# ```\n# We intentionally did not run this code on launch because:\n# (1) Soundfile was an optional dependency, so importing it in global namespace is not allowed\n# (2) To ensure the list of supported extensions is deterministic\n# (3) We use TorchCodec now anyways instead of Soundfile\nAUDIO_EXTENSIONS = [\n    \".aiff\",\n    \".au\",\n    \".avr\",\n    \".caf\",\n    \".flac\",\n    \".htk\",\n    \".svx\",\n    \".mat4\",\n    \".mat5\",\n    \".mpc2k\",\n    \".ogg\",\n    \".paf\",\n    \".pvf\",\n    \".raw\",\n    \".rf64\",\n    \".sd2\",\n    \".sds\",\n    \".ircam\",\n    \".voc\",\n    \".w64\",\n    \".wav\",\n    \".nist\",\n    \".wavex\",\n    \".wve\",\n    \".xi\",\n    \".mp3\",\n    \".opus\",\n    \".3gp\",\n    \".3g2\",\n    \".avi\",\n    \".asf\",\n    \".flv\",\n    \".mp4\",\n    \".mov\",\n    \".m4v\",\n    \".mkv\",\n    \".mpg\",\n    \".webm\",\n    \".f4v\",\n    \".wmv\",\n    \".wma\",\n    \".ogg\",\n    \".ogm\",\n    \".mxf\",\n    \".nut\",\n]\nAudioFolder.EXTENSIONS = AUDIO_EXTENSIONS\n"
  },
  {
    "path": "src/datasets/packaged_modules/cache/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/cache/cache.py",
    "content": "import glob\nimport json\nimport os\nimport shutil\nimport time\nfrom pathlib import Path\nfrom typing import Optional, Union\n\nimport pyarrow as pa\n\nimport datasets\nimport datasets.config\nimport datasets.data_files\nfrom datasets.builder import Key\nfrom datasets.naming import camelcase_to_snakecase, filenames_for_dataset_split\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\ndef _get_modification_time(cached_directory_path):\n    return (Path(cached_directory_path)).stat().st_mtime\n\n\ndef _find_hash_in_cache(\n    dataset_name: str,\n    config_name: Optional[str],\n    cache_dir: Optional[str],\n    config_kwargs: dict,\n    custom_features: Optional[datasets.Features],\n) -> tuple[str, str, str]:\n    if config_name or config_kwargs or custom_features:\n        config_id = datasets.BuilderConfig(config_name or \"default\").create_config_id(\n            config_kwargs=config_kwargs, custom_features=custom_features\n        )\n    else:\n        config_id = None\n    cache_dir = os.path.expanduser(str(cache_dir or datasets.config.HF_DATASETS_CACHE))\n    namespace_and_dataset_name = dataset_name.split(\"/\")\n    namespace_and_dataset_name[-1] = camelcase_to_snakecase(namespace_and_dataset_name[-1])\n    cached_relative_path = \"___\".join(namespace_and_dataset_name)\n    cached_datasets_directory_path_root = os.path.join(cache_dir, cached_relative_path)\n    cached_directory_paths = [\n        cached_directory_path\n        for cached_directory_path in glob.glob(\n            os.path.join(cached_datasets_directory_path_root, config_id or \"*\", \"*\", \"*\")\n        )\n        if os.path.isdir(cached_directory_path)\n        and (\n            config_kwargs\n            or custom_features\n            or json.loads(Path(cached_directory_path, \"dataset_info.json\").read_text(encoding=\"utf-8\"))[\"config_name\"]\n            == Path(cached_directory_path).parts[-3]  # no extra params => config_id == config_name\n        )\n  
  ]\n    if not cached_directory_paths:\n        cached_directory_paths = [\n            cached_directory_path\n            for cached_directory_path in glob.glob(os.path.join(cached_datasets_directory_path_root, \"*\", \"*\", \"*\"))\n            if os.path.isdir(cached_directory_path)\n        ]\n        available_configs = sorted(\n            {Path(cached_directory_path).parts[-3] for cached_directory_path in cached_directory_paths}\n        )\n        raise ValueError(\n            f\"Couldn't find cache for {dataset_name}\"\n            + (f\" for config '{config_id}'\" if config_id else \"\")\n            + (f\"\\nAvailable configs in the cache: {available_configs}\" if available_configs else \"\")\n        )\n    # get most recent\n    cached_directory_path = Path(sorted(cached_directory_paths, key=_get_modification_time)[-1])\n    version, hash = cached_directory_path.parts[-2:]\n    other_configs = [\n        Path(_cached_directory_path).parts[-3]\n        for _cached_directory_path in glob.glob(os.path.join(cached_datasets_directory_path_root, \"*\", version, hash))\n        if os.path.isdir(_cached_directory_path)\n        and (\n            config_kwargs\n            or custom_features\n            or json.loads(Path(_cached_directory_path, \"dataset_info.json\").read_text(encoding=\"utf-8\"))[\"config_name\"]\n            == Path(_cached_directory_path).parts[-3]  # no extra params => config_id == config_name\n        )\n    ]\n    if not config_id and len(other_configs) > 1:\n        raise ValueError(\n            f\"There are multiple '{dataset_name}' configurations in the cache: {', '.join(other_configs)}\"\n            f\"\\nPlease specify which configuration to reload from the cache, e.g.\"\n            f\"\\n\\tload_dataset('{dataset_name}', '{other_configs[0]}')\"\n        )\n    config_name = cached_directory_path.parts[-3]\n    warning_msg = (\n        f\"Found the latest cached dataset configuration '{config_name}' at {cached_directory_path} 
\"\n        f\"(last modified on {time.ctime(_get_modification_time(cached_directory_path))}).\"\n    )\n    logger.warning(warning_msg)\n    return config_name, version, hash\n\n\nclass Cache(datasets.ArrowBasedBuilder):\n    def __init__(\n        self,\n        cache_dir: Optional[str] = None,\n        dataset_name: Optional[str] = None,\n        config_name: Optional[str] = None,\n        version: Optional[str] = \"0.0.0\",\n        hash: Optional[str] = None,\n        base_path: Optional[str] = None,\n        info: Optional[datasets.DatasetInfo] = None,\n        features: Optional[datasets.Features] = None,\n        token: Optional[Union[bool, str]] = None,\n        repo_id: Optional[str] = None,\n        data_files: Optional[Union[str, list, dict, datasets.data_files.DataFilesDict]] = None,\n        data_dir: Optional[str] = None,\n        storage_options: Optional[dict] = None,\n        writer_batch_size: Optional[int] = None,\n        **config_kwargs,\n    ):\n        if repo_id is None and dataset_name is None:\n            raise ValueError(\"repo_id or dataset_name is required for the Cache dataset builder\")\n        if data_files is not None:\n            config_kwargs[\"data_files\"] = data_files\n        if data_dir is not None:\n            config_kwargs[\"data_dir\"] = data_dir\n        if hash == \"auto\" and version == \"auto\":\n            config_name, version, hash = _find_hash_in_cache(\n                dataset_name=repo_id or dataset_name,\n                config_name=config_name,\n                cache_dir=cache_dir,\n                config_kwargs=config_kwargs,\n                custom_features=features,\n            )\n        elif hash == \"auto\" or version == \"auto\":\n            raise NotImplementedError(\"Pass both hash='auto' and version='auto' instead\")\n        super().__init__(\n            cache_dir=cache_dir,\n            dataset_name=dataset_name,\n            config_name=config_name,\n            version=version,\n           
 hash=hash,\n            base_path=base_path,\n            info=info,\n            token=token,\n            repo_id=repo_id,\n            storage_options=storage_options,\n            writer_batch_size=writer_batch_size,\n        )\n\n    def _info(self) -> datasets.DatasetInfo:\n        return datasets.DatasetInfo()\n\n    def download_and_prepare(self, output_dir: Optional[str] = None, *args, **kwargs):\n        if not os.path.exists(self.cache_dir):\n            raise ValueError(f\"Cache directory for {self.dataset_name} doesn't exist at {self.cache_dir}\")\n        if output_dir is not None and output_dir != self.cache_dir:\n            shutil.copytree(self.cache_dir, output_dir)\n\n    def _split_generators(self, dl_manager):\n        # used to stream from cache\n        if isinstance(self.info.splits, datasets.SplitDict):\n            split_infos: list[datasets.SplitInfo] = list(self.info.splits.values())\n        else:\n            raise ValueError(f\"Missing splits info for {self.dataset_name} in cache directory {self.cache_dir}\")\n        return [\n            datasets.SplitGenerator(\n                name=split_info.name,\n                gen_kwargs={\n                    \"files\": filenames_for_dataset_split(\n                        self.cache_dir,\n                        dataset_name=self.dataset_name,\n                        split=split_info.name,\n                        filetype_suffix=\"arrow\",\n                        shard_lengths=split_info.shard_lengths,\n                    )\n                },\n            )\n            for split_info in split_infos\n        ]\n\n    def _generate_shards(self, files):\n        yield from files\n\n    def _generate_tables(self, files):\n        # used to stream from cache\n        for file_idx, file in enumerate(files):\n            with open(file, \"rb\") as f:\n                try:\n                    for batch_idx, record_batch in enumerate(pa.ipc.open_stream(f)):\n                        pa_table 
= pa.Table.from_batches([record_batch])\n                        # Uncomment for debugging (will print the Arrow table size and elements)\n                        # logger.warning(f\"pa_table: {pa_table} num rows: {pa_table.num_rows}\")\n                        # logger.warning('\\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))\n                        yield Key(file_idx, batch_idx), pa_table\n                except ValueError as e:\n                    logger.error(f\"Failed to read file '{file}' with error {type(e)}: {e}\")\n                    raise\n"
  },
  {
    "path": "src/datasets/packaged_modules/csv/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/csv/csv.py",
    "content": "from dataclasses import dataclass\nfrom typing import Any, Callable, Optional, Union\n\nimport pandas as pd\nimport pyarrow as pa\n\nimport datasets\nimport datasets.config\nfrom datasets.builder import Key\nfrom datasets.features.features import require_storage_cast\nfrom datasets.table import table_cast\nfrom datasets.utils.py_utils import Literal\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n_PANDAS_READ_CSV_NO_DEFAULT_PARAMETERS = [\"names\", \"prefix\"]\n_PANDAS_READ_CSV_DEPRECATED_PARAMETERS = [\"warn_bad_lines\", \"error_bad_lines\", \"mangle_dupe_cols\"]\n_PANDAS_READ_CSV_NEW_1_3_0_PARAMETERS = [\"encoding_errors\", \"on_bad_lines\"]\n_PANDAS_READ_CSV_NEW_2_0_0_PARAMETERS = [\"date_format\"]\n_PANDAS_READ_CSV_DEPRECATED_2_2_0_PARAMETERS = [\"verbose\"]\n\n\n@dataclass\nclass CsvConfig(datasets.BuilderConfig):\n    \"\"\"BuilderConfig for CSV.\"\"\"\n\n    sep: str = \",\"\n    delimiter: Optional[str] = None\n    header: Optional[Union[int, list[int], str]] = \"infer\"\n    names: Optional[list[str]] = None\n    column_names: Optional[list[str]] = None\n    index_col: Optional[Union[int, str, list[int], list[str]]] = None\n    usecols: Optional[Union[list[int], list[str]]] = None\n    prefix: Optional[str] = None\n    mangle_dupe_cols: bool = True\n    engine: Optional[Literal[\"c\", \"python\", \"pyarrow\"]] = None\n    converters: dict[Union[int, str], Callable[[Any], Any]] = None\n    true_values: Optional[list] = None\n    false_values: Optional[list] = None\n    skipinitialspace: bool = False\n    skiprows: Optional[Union[int, list[int]]] = None\n    nrows: Optional[int] = None\n    na_values: Optional[Union[str, list[str]]] = None\n    keep_default_na: bool = True\n    na_filter: bool = True\n    verbose: bool = False\n    skip_blank_lines: bool = True\n    thousands: Optional[str] = None\n    decimal: str = \".\"\n    lineterminator: Optional[str] = None\n    quotechar: str = '\"'\n    quoting: int = 0\n    escapechar: 
Optional[str] = None\n    comment: Optional[str] = None\n    encoding: Optional[str] = None\n    dialect: Optional[str] = None\n    error_bad_lines: bool = True\n    warn_bad_lines: bool = True\n    skipfooter: int = 0\n    doublequote: bool = True\n    memory_map: bool = False\n    float_precision: Optional[str] = None\n    chunksize: int = 10_000\n    features: Optional[datasets.Features] = None\n    encoding_errors: Optional[str] = \"strict\"\n    on_bad_lines: Literal[\"error\", \"warn\", \"skip\"] = \"error\"\n    date_format: Optional[str] = None\n\n    def __post_init__(self):\n        super().__post_init__()\n        if self.delimiter is not None:\n            self.sep = self.delimiter\n        if self.column_names is not None:\n            self.names = self.column_names\n\n    @property\n    def pd_read_csv_kwargs(self):\n        pd_read_csv_kwargs = {\n            \"sep\": self.sep,\n            \"header\": self.header,\n            \"names\": self.names,\n            \"index_col\": self.index_col,\n            \"usecols\": self.usecols,\n            \"prefix\": self.prefix,\n            \"mangle_dupe_cols\": self.mangle_dupe_cols,\n            \"engine\": self.engine,\n            \"converters\": self.converters,\n            \"true_values\": self.true_values,\n            \"false_values\": self.false_values,\n            \"skipinitialspace\": self.skipinitialspace,\n            \"skiprows\": self.skiprows,\n            \"nrows\": self.nrows,\n            \"na_values\": self.na_values,\n            \"keep_default_na\": self.keep_default_na,\n            \"na_filter\": self.na_filter,\n            \"verbose\": self.verbose,\n            \"skip_blank_lines\": self.skip_blank_lines,\n            \"thousands\": self.thousands,\n            \"decimal\": self.decimal,\n            \"lineterminator\": self.lineterminator,\n            \"quotechar\": self.quotechar,\n            \"quoting\": self.quoting,\n            \"escapechar\": self.escapechar,\n           
 \"comment\": self.comment,\n            \"encoding\": self.encoding,\n            \"dialect\": self.dialect,\n            \"error_bad_lines\": self.error_bad_lines,\n            \"warn_bad_lines\": self.warn_bad_lines,\n            \"skipfooter\": self.skipfooter,\n            \"doublequote\": self.doublequote,\n            \"memory_map\": self.memory_map,\n            \"float_precision\": self.float_precision,\n            \"chunksize\": self.chunksize,\n            \"encoding_errors\": self.encoding_errors,\n            \"on_bad_lines\": self.on_bad_lines,\n            \"date_format\": self.date_format,\n        }\n\n        # some kwargs must not be passed if they don't have a default value\n        # some others are deprecated and we can also not pass them if they are the default value\n        for pd_read_csv_parameter in _PANDAS_READ_CSV_NO_DEFAULT_PARAMETERS + _PANDAS_READ_CSV_DEPRECATED_PARAMETERS:\n            if pd_read_csv_kwargs[pd_read_csv_parameter] == getattr(CsvConfig(), pd_read_csv_parameter):\n                del pd_read_csv_kwargs[pd_read_csv_parameter]\n\n        # Remove 1.3 new arguments\n        if not (datasets.config.PANDAS_VERSION.major >= 1 and datasets.config.PANDAS_VERSION.minor >= 3):\n            for pd_read_csv_parameter in _PANDAS_READ_CSV_NEW_1_3_0_PARAMETERS:\n                del pd_read_csv_kwargs[pd_read_csv_parameter]\n\n        # Remove 2.0 new arguments\n        if not (datasets.config.PANDAS_VERSION.major >= 2):\n            for pd_read_csv_parameter in _PANDAS_READ_CSV_NEW_2_0_0_PARAMETERS:\n                del pd_read_csv_kwargs[pd_read_csv_parameter]\n\n        # Remove 2.2 deprecated arguments\n        if datasets.config.PANDAS_VERSION.release >= (2, 2):\n            for pd_read_csv_parameter in _PANDAS_READ_CSV_DEPRECATED_2_2_0_PARAMETERS:\n                if pd_read_csv_kwargs[pd_read_csv_parameter] == getattr(CsvConfig(), pd_read_csv_parameter):\n                    del pd_read_csv_kwargs[pd_read_csv_parameter]\n\n   
     return pd_read_csv_kwargs\n\n\nclass Csv(datasets.ArrowBasedBuilder):\n    BUILDER_CONFIG_CLASS = CsvConfig\n\n    def _info(self):\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager):\n        \"\"\"We handle string, list and dicts in datafiles\"\"\"\n        if not self.config.data_files:\n            raise ValueError(f\"At least one data file must be specified, but got data_files={self.config.data_files}\")\n        dl_manager.download_config.extract_on_the_fly = True\n        base_data_files = dl_manager.download(self.config.data_files)\n        extracted_data_files = dl_manager.extract(base_data_files)\n        splits = []\n        for split_name, extracted_files in extracted_data_files.items():\n            files_iterables = [dl_manager.iter_files(extracted_file) for extracted_file in extracted_files]\n            splits.append(\n                datasets.SplitGenerator(\n                    name=split_name,\n                    gen_kwargs={\"files_iterables\": files_iterables, \"base_files\": base_data_files[split_name]},\n                )\n            )\n        return splits\n\n    def _cast_table(self, pa_table: pa.Table) -> pa.Table:\n        if self.config.features is not None:\n            schema = self.config.features.arrow_schema\n            if all(not require_storage_cast(feature) for feature in self.config.features.values()):\n                # cheaper cast\n                pa_table = pa.Table.from_arrays([pa_table[field.name] for field in schema], schema=schema)\n            else:\n                # more expensive cast; allows str <-> int/float or str to Audio for example\n                pa_table = table_cast(pa_table, schema)\n        return pa_table\n\n    def _generate_shards(self, base_files, files_iterables):\n        yield from base_files\n\n    def _generate_tables(self, base_files, files_iterables):\n        schema = self.config.features.arrow_schema if 
self.config.features else None\n        # dtype allows reading an int column as str\n        dtype = (\n            {\n                name: dtype.to_pandas_dtype() if not require_storage_cast(feature) else object\n                for name, dtype, feature in zip(schema.names, schema.types, self.config.features.values())\n            }\n            if schema is not None\n            else None\n        )\n        for shard_idx, files_iterable in enumerate(files_iterables):\n            for file in files_iterable:\n                csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.pd_read_csv_kwargs)\n                try:\n                    for batch_idx, df in enumerate(csv_file_reader):\n                        pa_table = pa.Table.from_pandas(df)\n                        # Uncomment for debugging (will print the Arrow table size and elements)\n                        # logger.warning(f\"pa_table: {pa_table} num rows: {pa_table.num_rows}\")\n                        # logger.warning('\\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))\n                        yield Key(shard_idx, batch_idx), self._cast_table(pa_table)\n                except ValueError as e:\n                    logger.error(f\"Failed to read file '{file}' with error {type(e)}: {e}\")\n                    raise\n"
  },
  {
    "path": "src/datasets/packaged_modules/eval/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/eval/eval.py",
    "content": "import json\nimport os\nfrom itertools import islice\nfrom typing import Iterable\n\nimport pyarrow as pa\n\nimport datasets\nfrom datasets.builder import Key\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\nclass Eval(datasets.GeneratorBasedBuilder):\n    NUM_EXAMPLES_FOR_FEATURES_INFERENCE = 5\n\n    def _info(self):\n        return datasets.DatasetInfo()\n\n    def _split_generators(self, dl_manager):\n        \"\"\"We handle string, list and dicts in datafiles\"\"\"\n        if not self.config.data_files:\n            raise ValueError(f\"At least one data file must be specified, but got data_files={self.config.data_files}\")\n        dl_manager.download_config.extract_on_the_fly = True\n        base_data_files = dl_manager.download(self.config.data_files)\n        extracted_data_files = dl_manager.extract(base_data_files)\n        splits = []\n        for split_name, logs in extracted_data_files.items():\n            logs_files_iterables = [dl_manager.iter_files(log) for log in logs]\n            splits.append(\n                datasets.SplitGenerator(\n                    name=split_name,\n                    gen_kwargs={\n                        \"logs_files_iterables\": logs_files_iterables,\n                        \"base_files\": base_data_files[split_name],\n                    },\n                )\n            )\n        if not self.info.features:\n            first_examples = list(\n                islice(\n                    self._iter_samples_from_log_files(logs_files_iterables[0]),\n                    self.NUM_EXAMPLES_FOR_FEATURES_INFERENCE,\n                )\n            )\n            pa_tables = [pa.Table.from_pylist([example]) for example in first_examples]\n            inferred_arrow_schema = pa.concat_tables(pa_tables, promote_options=\"default\").schema\n            self.info.features = datasets.Features.from_arrow_schema(inferred_arrow_schema)\n\n        return splits\n\n    def _sort_samples_key(self, 
sample_path: str):\n        # looks like \"{sample_idx}_epoch_{epoch_idx}\"\"\n        (sample_idx_str, epoch_idx_str) = os.path.splitext(os.path.basename(sample_path))[0].split(\"_epoch_\")\n        return (int(epoch_idx_str), int(sample_idx_str))\n\n    def _iter_samples_from_log_files(self, log_files: Iterable[str]):\n        sample_files = [log_file for log_file in log_files if os.path.basename(os.path.dirname(log_file)) == \"samples\"]\n        sample_files.sort(key=self._sort_samples_key)\n        for sample_file in sample_files:\n            with open(sample_file) as f:\n                sample = json.load(f)\n                for field in sample:\n                    if isinstance(sample[field], dict):\n                        sample[field] = json.dumps(sample[field])\n                    if isinstance(sample[field], list):\n                        sample[field] = [json.dumps(x) for x in sample[field]]\n                yield sample\n\n    def _generate_shards(self, base_files, logs_files_iterables):\n        yield from base_files\n\n    def _generate_examples(self, base_files, logs_files_iterables):\n        for file_idx, log_files in enumerate(logs_files_iterables):\n            for sample_idx, sample in enumerate(self._iter_samples_from_log_files(log_files)):\n                yield Key(file_idx, sample_idx), sample\n"
  },
  {
    "path": "src/datasets/packaged_modules/folder_based_builder/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py",
    "content": "import collections\nimport io\nimport itertools\nimport os\nfrom dataclasses import dataclass\nfrom typing import Any, Callable, Iterator, Optional, Union\n\nimport pandas as pd\nimport pyarrow as pa\nimport pyarrow.dataset as ds\nimport pyarrow.json as paj\nimport pyarrow.parquet as pq\n\nimport datasets\nfrom datasets import config\nfrom datasets.builder import Key\nfrom datasets.features.features import FeatureType, _visit, _visit_with_path, _VisitPath, require_storage_cast\nfrom datasets.utils.file_utils import readline\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\ndef count_path_segments(path):\n    return path.replace(\"\\\\\", \"/\").count(\"/\")\n\n\n@dataclass\nclass FolderBasedBuilderConfig(datasets.BuilderConfig):\n    \"\"\"BuilderConfig for AutoFolder.\"\"\"\n\n    features: Optional[datasets.Features] = None\n    drop_labels: bool = None\n    drop_metadata: bool = None\n    metadata_filenames: list[str] = None\n    filters: Optional[Union[ds.Expression, list[tuple], list[list[tuple]]]] = None\n\n    def __post_init__(self):\n        super().__post_init__()\n\n\nclass FolderBasedBuilder(datasets.GeneratorBasedBuilder):\n    \"\"\"\n    Base class for generic data loaders for vision and image data.\n\n\n    Abstract class attributes to be overridden by a child class:\n        BASE_FEATURE: feature object to decode data (i.e. datasets.Image, datasets.Audio, ...)\n        BASE_COLUMN_NAME: string key name of a base feature (i.e. 
\"image\", \"audio\", ...)\n        BUILDER_CONFIG_CLASS: builder config inherited from `folder_based_builder.FolderBasedBuilderConfig`\n        EXTENSIONS: list of allowed extensions (only files with these extensions and METADATA_FILENAME files\n            will be included in a dataset)\n    \"\"\"\n\n    BASE_FEATURE: type[FeatureType]\n    BASE_COLUMN_NAME: str\n    BUILDER_CONFIG_CLASS: FolderBasedBuilderConfig\n    EXTENSIONS: list[str]\n\n    METADATA_FILENAMES: list[str] = [\"metadata.csv\", \"metadata.jsonl\", \"metadata.parquet\"]\n\n    def _info(self):\n        if not self.config.data_dir and not self.config.data_files:\n            raise ValueError(\n                \"Folder-based datasets require either `data_dir` or `data_files` to be specified. \"\n                \"Neither was provided.\"\n            )\n\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager):\n        if not self.config.data_files:\n            raise ValueError(f\"At least one data file must be specified, but got data_files={self.config.data_files}\")\n        dl_manager.download_config.extract_on_the_fly = True\n        # Do an early pass if:\n        # * `drop_labels` is None (default) or False, to infer the class labels\n        # * `drop_metadata` is None (default) or False, to find the metadata files\n        do_analyze = not self.config.drop_labels or not self.config.drop_metadata\n        labels, path_depths = set(), set()\n        all_metadata_files = collections.defaultdict(set)\n        metadata_filenames = self.config.metadata_filenames or self.METADATA_FILENAMES\n\n        def analyze(files_or_archives, downloaded_files_or_dirs, split):\n            if len(downloaded_files_or_dirs) == 0:\n                return\n            # The files are separated from the archives at this point, so check the first sample\n            # to see if it's a file or a directory and iterate accordingly\n            if 
os.path.isfile(downloaded_files_or_dirs[0]):\n                original_files, downloaded_files = files_or_archives, downloaded_files_or_dirs\n                for original_file, downloaded_file in zip(original_files, downloaded_files):\n                    original_file, downloaded_file = str(original_file), str(downloaded_file)\n                    _, original_file_ext = os.path.splitext(original_file)\n                    if original_file_ext.lower() in self.EXTENSIONS:\n                        if not self.config.drop_labels:\n                            labels.add(os.path.basename(os.path.dirname(original_file)))\n                            path_depths.add(count_path_segments(original_file))\n                    elif os.path.basename(original_file) in metadata_filenames:\n                        all_metadata_files[split].add((original_file, None, downloaded_file))\n                    else:\n                        original_file_name = os.path.basename(original_file)\n                        logger.debug(\n                            f\"The file '{original_file_name}' was ignored: it is not a {self.BASE_COLUMN_NAME}, and is not {metadata_filenames} either.\"\n                        )\n            else:\n                archives, downloaded_dirs = files_or_archives, downloaded_files_or_dirs\n                for archive, downloaded_dir in zip(archives, downloaded_dirs):\n                    archive, downloaded_dir = str(archive), str(downloaded_dir)\n                    for downloaded_dir_file in dl_manager.iter_files(downloaded_dir):\n                        _, downloaded_dir_file_ext = os.path.splitext(downloaded_dir_file)\n                        if downloaded_dir_file_ext in self.EXTENSIONS:\n                            if not self.config.drop_labels:\n                                labels.add(os.path.basename(os.path.dirname(downloaded_dir_file)))\n                                path_depths.add(count_path_segments(downloaded_dir_file))\n                    
    elif os.path.basename(downloaded_dir_file) in metadata_filenames:\n                            all_metadata_files[split].add((None, downloaded_dir, downloaded_dir_file))\n                        else:\n                            archive_file_name = os.path.basename(archive)\n                            original_file_name = os.path.basename(downloaded_dir_file)\n                            logger.debug(\n                                f\"The file '{original_file_name}' from the archive '{archive_file_name}' was ignored: it is not a {self.BASE_COLUMN_NAME}, and is not {metadata_filenames} either.\"\n                            )\n\n        data_files = self.config.data_files\n        splits = []\n        for split_name, files in data_files.items():\n            files, metadata_files, archives = self._split_files_and_metadata_and_archives(files)\n            downloaded_files = dl_manager.download(files)\n            downloaded_metadata_files = dl_manager.download(metadata_files)\n            downloaded_dirs = dl_manager.download_and_extract(archives)\n            if do_analyze:  # drop_metadata is None or False, drop_labels is None or False\n                logger.info(f\"Searching for labels and/or metadata files in {split_name} data files...\")\n                analyze(files, downloaded_files, split_name)\n                analyze(metadata_files, downloaded_metadata_files, split_name)\n                analyze(archives, downloaded_dirs, split_name)\n\n                if all_metadata_files:\n                    # add metadata if `all_metadata_files` are found and `drop_metadata` is None (default) or False\n                    add_metadata = not self.config.drop_metadata\n                    # if `all_metadata_files` are found, don't add labels\n                    add_labels = False\n                else:\n                    # if `all_metadata_files` are not found, don't add metadata\n                    add_metadata = False\n                    # if 
`all_metadata_files` are not found and `drop_labels` is None (default) -\n                    # add labels if files are on the same level in directory hierarchy and there is more than one label\n                    add_labels = (\n                        (len(labels) > 1 and len(path_depths) == 1)\n                        if self.config.drop_labels is None\n                        else not self.config.drop_labels\n                    )\n\n                if add_labels:\n                    logger.info(\"Adding the labels inferred from data directories to the dataset's features...\")\n                if add_metadata:\n                    logger.info(\"Adding metadata to the dataset...\")\n            else:\n                add_labels, add_metadata, all_metadata_files = False, False, {}\n\n            # files info (original_file, None, downloaded_file)\n            files = tuple(zip(files, [None] * len(files), downloaded_files))\n            # archives info (original_archive_file, downloaded_dir, downloaded_files)\n            files += tuple(\n                (archive, downloaded_dir, dl_manager.iter_files(downloaded_dir))\n                for archive, downloaded_dir in zip(archives, downloaded_dirs)\n            )\n            splits.append(\n                datasets.SplitGenerator(\n                    name=split_name,\n                    gen_kwargs={\n                        \"files\": files,\n                        \"metadata_files\": all_metadata_files.get(split_name, []),\n                        \"add_labels\": add_labels,\n                        \"add_metadata\": add_metadata,\n                    },\n                )\n            )\n\n        if add_metadata:\n            # Verify that:\n            # * all metadata files have the same set of features in each split\n            # * the `file_name` key is one of the metadata keys and is of type string\n            features_per_metadata_file: list[tuple[str, datasets.Features]] = []\n\n            # Check 
that all metadata files share the same format\n            metadata_ext = {\n                os.path.splitext(original_metadata_file or downloaded_metadata_file)[-1]\n                for original_metadata_file, _, downloaded_metadata_file in itertools.chain.from_iterable(\n                    all_metadata_files.values()\n                )\n            }\n            if len(metadata_ext) > 1:\n                raise ValueError(f\"Found metadata files with different extensions: {list(metadata_ext)}\")\n            metadata_ext = metadata_ext.pop()\n\n            for split_metadata_files in all_metadata_files.values():\n                pa_metadata_table = None\n                for _, _, downloaded_metadata_file in split_metadata_files:\n                    for pa_metadata_table in self._read_metadata(downloaded_metadata_file, metadata_ext=metadata_ext):\n                        break  # just fetch the first rows\n                    if pa_metadata_table is not None:\n                        features_per_metadata_file.append(\n                            (downloaded_metadata_file, datasets.Features.from_arrow_schema(pa_metadata_table.schema))\n                        )\n                        break  # no need to fetch all the files\n            for downloaded_metadata_file, metadata_features in features_per_metadata_file:\n                if metadata_features != features_per_metadata_file[0][1]:\n                    raise ValueError(\n                        f\"Metadata files {downloaded_metadata_file} and {features_per_metadata_file[0][0]} have different features: {features_per_metadata_file[0]} != {metadata_features}\"\n                    )\n            metadata_features = features_per_metadata_file[0][1]\n            feature_not_found = True\n\n            def _set_feature(feature):\n                nonlocal feature_not_found\n                if isinstance(feature, dict):\n                    out = type(feature)()\n                    for key in feature:\n          
              if (key == \"file_name\" or key.endswith(\"_file_name\")) and (\n                            feature[key] == datasets.Value(\"string\") or feature[key] == datasets.Value(\"large_string\")\n                        ):\n                            key = key[: -len(\"_file_name\")] or self.BASE_COLUMN_NAME\n                            out[key] = self.BASE_FEATURE()\n                            feature_not_found = False\n                        elif (key == \"file_names\" or key.endswith(\"_file_names\")) and (\n                            feature[key]\n                            in [datasets.List(datasets.Value(\"string\")), datasets.List(datasets.Value(\"large_string\"))]\n                        ):\n                            key = key[: -len(\"_file_names\")] or (self.BASE_COLUMN_NAME + \"s\")\n                            out[key] = datasets.List(self.BASE_FEATURE())\n                            feature_not_found = False\n                        elif (key == \"file_names\" or key.endswith(\"_file_names\")) and (\n                            feature[key] == [datasets.Value(\"string\")]\n                            or feature[key] == [datasets.Value(\"large_string\")]\n                        ):\n                            key = key[: -len(\"_file_names\")] or (self.BASE_COLUMN_NAME + \"s\")\n                            out[key] = [self.BASE_FEATURE()]\n                            feature_not_found = False\n                        else:\n                            out[key] = feature[key]\n                    return out\n                return feature\n\n            metadata_features = _visit(metadata_features, _set_feature)\n\n            if feature_not_found:\n                raise ValueError(\n                    \"`file_name`, `*_file_name`, `file_names` or `*_file_names` must be present as dictionary key in metadata files\"\n                )\n        else:\n            metadata_features = None\n\n        # Normally, we would do this in _info, 
but we need to know the labels and/or metadata\n        # before building the features\n        if self.config.features is None:\n            if add_metadata:\n                self.info.features = metadata_features\n            elif add_labels:\n                self.info.features = datasets.Features(\n                    {\n                        self.BASE_COLUMN_NAME: self.BASE_FEATURE(),\n                        \"label\": datasets.ClassLabel(names=sorted(labels)),\n                    }\n                )\n            else:\n                self.info.features = datasets.Features({self.BASE_COLUMN_NAME: self.BASE_FEATURE()})\n\n        return splits\n\n    def _split_files_and_metadata_and_archives(self, data_files):\n        files, metadata_files, archives = [], [], []\n        metadata_filenames = self.config.metadata_filenames or self.METADATA_FILENAMES\n        for data_file in data_files:\n            data_file_root, data_file_ext = os.path.splitext(data_file)\n            _, second_data_file_ext = os.path.splitext(data_file_root)\n            if data_file_ext.lower() in self.EXTENSIONS or second_data_file_ext.lower() in self.EXTENSIONS:\n                files.append(data_file)\n            elif os.path.basename(data_file) in metadata_filenames:\n                metadata_files.append(data_file)\n            elif data_file_ext.lower() == \".zip\":\n                archives.append(data_file)\n        return files, metadata_files, archives\n\n    def _read_metadata(self, metadata_file: str, metadata_ext: str = \"\") -> Iterator[pa.Table]:\n        \"\"\"using the same logic as the Csv, Json and Parquet dataset builders to stream the data\"\"\"\n        if self.config.filters is not None:\n            filter_expr = (\n                pq.filters_to_expression(self.config.filters)\n                if isinstance(self.config.filters, list)\n                else self.config.filters\n            )\n        else:\n            filter_expr = None\n        if 
metadata_ext == \".csv\":\n            chunksize = 10_000  # 10k lines\n            schema = self.config.features.arrow_schema if self.config.features else None\n            # dtype allows reading an int column as str\n            dtype = (\n                {\n                    name: dtype.to_pandas_dtype() if not require_storage_cast(feature) else object\n                    for name, dtype, feature in zip(schema.names, schema.types, self.config.features.values())\n                }\n                if schema is not None\n                else None\n            )\n            csv_file_reader = pd.read_csv(metadata_file, iterator=True, dtype=dtype, chunksize=chunksize)\n            for df in csv_file_reader:\n                pa_table = pa.Table.from_pandas(df)\n                if self.config.filters is not None:\n                    pa_table = pa_table.filter(filter_expr)\n                if len(pa_table) > 0:\n                    yield pa_table\n        elif metadata_ext == \".jsonl\":\n            with open(metadata_file, \"rb\") as f:\n                chunksize: int = 10 << 20  # 10MB\n                # Use block_size equal to the chunk size divided by 32 to leverage multithreading\n                # Set a default minimum value of 16kB if the chunk size is really small\n                block_size = max(chunksize // 32, 16 << 10)\n                while True:\n                    batch = f.read(chunksize)\n                    if not batch:\n                        break\n                    # Finish current line\n                    try:\n                        batch += f.readline()\n                    except (AttributeError, io.UnsupportedOperation):\n                        batch += readline(f)\n                    while True:\n                        try:\n                            pa_table = paj.read_json(\n                                io.BytesIO(batch), read_options=paj.ReadOptions(block_size=block_size)\n                            )\n                
            break\n                        except (pa.ArrowInvalid, pa.ArrowNotImplementedError) as e:\n                            if (\n                                isinstance(e, pa.ArrowInvalid)\n                                and \"straddling\" not in str(e)\n                                or block_size > len(batch)\n                            ):\n                                raise\n                            else:\n                                # Increase the block size in case it was too small.\n                                # The block size will be reset for the next file.\n                                logger.debug(\n                                    f\"Batch of {len(batch)} bytes couldn't be parsed with block_size={block_size}. Retrying with block_size={block_size * 2}.\"\n                                )\n                                block_size *= 2\n                    if self.config.filters is not None:\n                        pa_table = pa_table.filter(filter_expr)\n                    if len(pa_table) > 0:\n                        yield pa_table\n        else:\n            with open(metadata_file, \"rb\") as f:\n                parquet_fragment = ds.ParquetFileFormat().make_fragment(f)\n                if parquet_fragment.row_groups:\n                    batch_size = parquet_fragment.row_groups[0].num_rows\n                else:\n                    batch_size = config.DEFAULT_MAX_BATCH_SIZE\n                for record_batch in parquet_fragment.to_batches(\n                    batch_size=batch_size,\n                    filter=filter_expr,\n                    batch_readahead=0,\n                    fragment_readahead=0,\n                ):\n                    yield pa.Table.from_batches([record_batch])\n\n    def _generate_shards(self, files, metadata_files, add_metadata, add_labels):\n        if add_metadata:\n            for _, _, downloaded_metadata_file in metadata_files:\n                yield downloaded_metadata_file\n   
     else:\n            for _, downloaded_dir, downloaded_file in files:\n                yield downloaded_dir or downloaded_file\n\n    def _generate_examples(self, files, metadata_files, add_metadata, add_labels):\n        if add_metadata:\n            feature_paths = []\n\n            def find_feature_path(feature, feature_path):\n                nonlocal feature_paths\n                if feature_path and isinstance(feature, self.BASE_FEATURE):\n                    feature_paths.append(feature_path)\n\n            _visit_with_path(self.info.features, find_feature_path)\n\n            for shard_idx, metadata_file_info in enumerate(metadata_files):\n                if len(metadata_file_info) == 2:\n                    original_metadata_file, downloaded_metadata_file = metadata_file_info\n                else:\n                    original_metadata_file, downloaded_metadata_dir, downloaded_metadata_file = metadata_file_info\n                metadata_ext = os.path.splitext(original_metadata_file or downloaded_metadata_file)[-1]\n                downloaded_metadata_dir = os.path.dirname(downloaded_metadata_file)\n\n                def set_feature(item, feature_path: _VisitPath):\n                    if len(feature_path) == 2 and isinstance(feature_path[0], str) and feature_path[1] == 0:\n                        item[feature_path[0]] = item.pop(\"file_names\", None) or item.pop(\n                            feature_path[0] + \"_file_names\", None\n                        )\n                    elif len(feature_path) == 1 and isinstance(feature_path[0], str):\n                        item[feature_path[0]] = item.pop(\"file_name\", None) or item.pop(\n                            feature_path[0] + \"_file_name\", None\n                        )\n                    elif len(feature_path) == 0:\n                        file_relpath = os.path.normpath(item).replace(\"\\\\\", \"/\")\n                        item = os.path.join(downloaded_metadata_dir, file_relpath)\n        
            return item\n\n                for pa_metadata_table in self._read_metadata(downloaded_metadata_file, metadata_ext=metadata_ext):\n                    for sample_idx, sample in enumerate(pa_metadata_table.to_pylist()):\n                        for feature_path in feature_paths:\n                            _nested_apply(sample, feature_path, set_feature)\n                        yield Key(shard_idx, sample_idx), sample\n        else:\n            if self.config.filters is not None:\n                filter_expr = (\n                    pq.filters_to_expression(self.config.filters)\n                    if isinstance(self.config.filters, list)\n                    else self.config.filters\n                )\n            for shard_idx, (original_file, _, downloaded_files) in enumerate(files):\n                if isinstance(downloaded_files, str):\n                    downloaded_files = [downloaded_files]\n                for sample_idx, downloaded_file in enumerate(downloaded_files):\n                    sample = {self.BASE_COLUMN_NAME: downloaded_file}\n                    if add_labels:\n                        sample[\"label\"] = os.path.basename(os.path.dirname(original_file or downloaded_file))\n                    if self.config.filters is not None:\n                        pa_table = pa.Table.from_pylist([sample]).filter(filter_expr)\n                        if len(pa_table) == 0:\n                            continue\n                    yield Key(shard_idx, sample_idx), sample\n\n\ndef _nested_apply(item: Any, feature_path: _VisitPath, func: Callable[[Any, _VisitPath], Any]):\n    # see _visit_with_path() to see how feature paths are constructed\n    item = func(item, feature_path)\n    if feature_path:\n        key = feature_path[0]\n        if key == 0:\n            for i in range(len(item)):\n                item[i] = _nested_apply(item[i], feature_path[1:], func)\n        else:\n            item[key] = _nested_apply(item[key], feature_path[1:], 
func)\n    return item\n"
  },
  {
    "path": "src/datasets/packaged_modules/generator/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/generator/generator.py",
    "content": "from dataclasses import dataclass\nfrom typing import Callable, Optional\n\nimport datasets\nfrom datasets.builder import Key\nfrom datasets.utils.sharding import _number_of_shards_in_gen_kwargs, _split_gen_kwargs\n\n\n@dataclass\nclass GeneratorConfig(datasets.BuilderConfig):\n    generator: Optional[Callable] = None\n    gen_kwargs: Optional[dict] = None\n    features: Optional[datasets.Features] = None\n    split: datasets.NamedSplit = datasets.Split.TRAIN\n\n    def __post_init__(self):\n        super().__post_init__()\n        if self.generator is None:\n            raise ValueError(\"generator must be specified\")\n\n        if self.gen_kwargs is None:\n            self.gen_kwargs = {}\n\n\nclass Generator(datasets.GeneratorBasedBuilder):\n    BUILDER_CONFIG_CLASS = GeneratorConfig\n\n    def _info(self):\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager):\n        return [datasets.SplitGenerator(name=self.config.split, gen_kwargs=self.config.gen_kwargs)]\n\n    def _generate_examples(self, **gen_kwargs):\n        num_shards = _number_of_shards_in_gen_kwargs(gen_kwargs)\n        for shard_idx, shard_gen_kwargs in enumerate(_split_gen_kwargs(gen_kwargs, max_num_jobs=num_shards)):\n            for sample_idx, sample in enumerate(self.config.generator(**shard_gen_kwargs)):\n                yield Key(shard_idx, sample_idx), sample\n"
  },
  {
    "path": "src/datasets/packaged_modules/hdf5/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/hdf5/hdf5.py",
    "content": "from dataclasses import dataclass, field\nfrom typing import TYPE_CHECKING, Optional\n\nimport numpy as np\nimport pyarrow as pa\n\nimport datasets\nfrom datasets.builder import Key\nfrom datasets.features.features import (\n    Array2D,\n    Array3D,\n    Array4D,\n    Array5D,\n    Features,\n    LargeList,\n    List,\n    Value,\n    _ArrayXD,\n    _arrow_to_datasets_dtype,\n)\nfrom datasets.table import cast_table_to_features\n\n\nif TYPE_CHECKING:\n    import h5py\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\nEXTENSIONS = [\".h5\", \".hdf5\"]\n\n\n@dataclass\nclass HDF5Config(datasets.BuilderConfig):\n    \"\"\"BuilderConfig for HDF5.\"\"\"\n\n    batch_size: Optional[int] = None\n    features: Optional[datasets.Features] = None\n\n\nclass HDF5(datasets.ArrowBasedBuilder):\n    \"\"\"ArrowBasedBuilder that converts HDF5 files to Arrow tables using the HF extension types.\"\"\"\n\n    BUILDER_CONFIG_CLASS = HDF5Config\n\n    def _info(self):\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager):\n        import h5py\n\n        if not self.config.data_files:\n            raise ValueError(f\"At least one data file must be specified, but got data_files={self.config.data_files}\")\n        data_files = dl_manager.download(self.config.data_files)\n        splits = []\n        for split_name, files in data_files.items():\n            # Infer features from first file\n            if self.info.features is None:\n                for first_file in files:\n                    with open(first_file, \"rb\") as f:\n                        with h5py.File(f, \"r\") as h5:\n                            self.info.features = _recursive_infer_features(h5)\n                    break\n            splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={\"files\": files}))\n        return splits\n\n    def _generate_shards(self, files):\n        yield from files\n\n    def 
_generate_tables(self, files):\n        import h5py\n\n        batch_size_cfg = self.config.batch_size\n        for file_idx, file in enumerate(files):\n            try:\n                with open(file, \"rb\") as f:\n                    with h5py.File(f, \"r\") as h5:\n                        # Infer features and lengths from first file\n                        if self.info.features is None:\n                            self.info.features = _recursive_infer_features(h5)\n                        num_rows = _check_dataset_lengths(h5, self.info.features)\n                        if num_rows is None:\n                            logger.warning(f\"File {file} contains no data, skipping...\")\n                            continue\n                        effective_batch = batch_size_cfg or self._writer_batch_size or num_rows\n                        for batch_idx, start in enumerate(range(0, num_rows, effective_batch)):\n                            end = min(start + effective_batch, num_rows)\n                            pa_table = _recursive_load_arrays(h5, self.info.features, start, end)\n                            if pa_table is None:\n                                logger.warning(f\"File {file} contains no data, skipping...\")\n                                continue\n                            yield Key(file_idx, batch_idx), cast_table_to_features(pa_table, self.info.features)\n            except ValueError as e:\n                logger.error(f\"Failed to read file '{file}' with error {type(e)}: {e}\")\n                raise\n\n\n# ┌───────────┐\n# │  Complex  │\n# └───────────┘\n\n\ndef _is_complex_dtype(dtype: np.dtype) -> bool:\n    if dtype.kind == \"c\":\n        return True\n    if dtype.subdtype is not None:\n        return _is_complex_dtype(dtype.subdtype[0])\n    return False\n\n\ndef _create_complex_features(dset) -> Features:\n    if dset.dtype.subdtype is not None:\n        dtype, data_shape = dset.dtype.subdtype\n    else:\n        data_shape = 
dset.shape[1:]\n        dtype = dset.dtype\n\n    if dtype == np.complex64:\n        # two float32s\n        value_type = Value(\"float32\")\n    elif dtype == np.complex128:\n        # two float64s\n        value_type = Value(\"float64\")\n    else:\n        logger.warning(f\"Found complex dtype {dtype} that is not supported. Converting to float64...\")\n        value_type = Value(\"float64\")\n\n    return Features(\n        {\n            \"real\": _create_sized_feature_impl(data_shape, value_type),\n            \"imag\": _create_sized_feature_impl(data_shape, value_type),\n        }\n    )\n\n\ndef _convert_complex_to_nested(arr: np.ndarray) -> pa.StructArray:\n    data = {\n        \"real\": datasets.features.features.numpy_to_pyarrow_listarray(arr.real),\n        \"imag\": datasets.features.features.numpy_to_pyarrow_listarray(arr.imag),\n    }\n    return pa.StructArray.from_arrays([data[\"real\"], data[\"imag\"]], names=[\"real\", \"imag\"])\n\n\n# ┌────────────┐\n# │  Compound  │\n# └────────────┘\n\n\ndef _is_compound_dtype(dtype: np.dtype) -> bool:\n    return dtype.kind == \"V\"\n\n\n@dataclass\nclass _CompoundGroup:\n    dset: \"h5py.Dataset\"\n    data: np.ndarray = None\n\n    def items(self):\n        for field_name in self.dset.dtype.names:\n            field_dtype = self.dset.dtype[field_name]\n            yield field_name, _CompoundField(self.data, field_name, field_dtype)\n\n\n@dataclass\nclass _CompoundField:\n    data: Optional[np.ndarray]\n    name: str\n    dtype: np.dtype\n    shape: tuple[int, ...] 
= field(init=False)\n\n    def __post_init__(self):\n        self.shape = (len(self.data) if self.data is not None else 0,) + self.dtype.shape\n\n    def __getitem__(self, key):\n        return self.data[key][self.name]\n\n\ndef _create_compound_features(dset) -> Features:\n    mock_group = _CompoundGroup(dset)\n    return _recursive_infer_features(mock_group)\n\n\ndef _convert_compound_to_nested(arr, dset) -> pa.StructArray:\n    mock_group = _CompoundGroup(dset, data=arr)\n    features = _create_compound_features(dset)\n    return _recursive_load_arrays(mock_group, features, 0, len(arr))\n\n\n# ┌───────────────────┐\n# │  Variable-Length  │\n# └───────────────────┘\n\n\ndef _is_vlen_dtype(dtype: np.dtype) -> bool:\n    if dtype.metadata and \"vlen\" in dtype.metadata:\n        return True\n    return False\n\n\ndef _create_vlen_features(dset) -> Features:\n    vlen_dtype = dset.dtype.metadata[\"vlen\"]\n    if vlen_dtype in (str, bytes):\n        return Value(\"string\")\n    inner_feature = _np_to_pa_to_hf_value(vlen_dtype)\n    return List(inner_feature)\n\n\ndef _convert_vlen_to_array(arr: np.ndarray) -> pa.Array:\n    return datasets.features.features.numpy_to_pyarrow_listarray(arr)\n\n\n# ┌───────────┐\n# │  Generic  │\n# └───────────┘\n\n\ndef _recursive_infer_features(h5_obj) -> Features:\n    features_dict = {}\n    for path, dset in h5_obj.items():\n        if _is_group(dset):\n            features = _recursive_infer_features(dset)\n            if features:\n                features_dict[path] = features\n        elif _is_dataset(dset):\n            features = _infer_feature(dset)\n            if features:\n                features_dict[path] = features\n\n    return Features(features_dict)\n\n\ndef _infer_feature(dset):\n    if _is_complex_dtype(dset.dtype):\n        return _create_complex_features(dset)\n    elif _is_compound_dtype(dset.dtype) or dset.dtype.kind == \"V\":\n        return _create_compound_features(dset)\n    elif 
_is_vlen_dtype(dset.dtype):\n        return _create_vlen_features(dset)\n    return _create_sized_feature(dset)\n\n\ndef _load_array(dset, path: str, start: int, end: int) -> pa.Array:\n    arr = dset[start:end]\n\n    if _is_vlen_dtype(dset.dtype):\n        return _convert_vlen_to_array(arr)\n    elif _is_complex_dtype(dset.dtype):\n        return _convert_complex_to_nested(arr)\n    elif _is_compound_dtype(dset.dtype):\n        return _convert_compound_to_nested(arr, dset)\n    elif dset.dtype.kind == \"O\":\n        raise ValueError(\n            f\"Object dtype dataset '{path}' is not supported. \"\n            f\"For variable-length data, please use h5py.vlen_dtype() \"\n            f\"when creating the HDF5 file. \"\n            f\"See: https://docs.h5py.org/en/stable/special.html#variable-length-strings\"\n        )\n    else:\n        # If any non-batch dimension is zero, emit an unsized pa.list_\n        # to avoid creating FixedSizeListArray with list_size=0.\n        if any(dim == 0 for dim in dset.shape[1:]):\n            inner_type = pa.from_numpy_dtype(dset.dtype)\n            return pa.array([[] for _ in arr], type=pa.list_(inner_type))\n        else:\n            return datasets.features.features.numpy_to_pyarrow_listarray(arr)\n\n\ndef _recursive_load_arrays(h5_obj, features: Features, start: int, end: int):\n    batch_dict = {}\n    for path, dset in h5_obj.items():\n        if path not in features:\n            continue\n        if _is_group(dset):\n            arr = _recursive_load_arrays(dset, features[path], start, end)\n        elif _is_dataset(dset):\n            arr = _load_array(dset, path, start, end)\n        else:\n            raise ValueError(f\"Unexpected type {type(dset)}\")\n\n        if arr is not None:\n            batch_dict[path] = arr\n\n    if _is_file(h5_obj):\n        return pa.Table.from_pydict(batch_dict)\n\n    if batch_dict:\n        should_chunk, keys, values = False, [], []\n        for k, v in batch_dict.items():\n    
        if isinstance(v, pa.ChunkedArray):\n                should_chunk = True\n                v = v.combine_chunks()\n            keys.append(k)\n            values.append(v)\n\n        sarr = pa.StructArray.from_arrays(values, names=keys)\n        return pa.chunked_array(sarr) if should_chunk else sarr\n\n\n# ┌─────────────┐\n# │  Utilities  │\n# └─────────────┘\n\n\ndef _create_sized_feature(dset):\n    dset_shape = dset.shape[1:]\n    value_feature = _np_to_pa_to_hf_value(dset.dtype)\n    return _create_sized_feature_impl(dset_shape, value_feature)\n\n\ndef _create_sized_feature_impl(dset_shape, value_feature):\n    dtype_str = value_feature.dtype\n    if any(dim == 0 for dim in dset_shape):\n        logger.warning(\n            f\"HDF5 to Arrow: Found a dataset with shape {dset_shape} and dtype {dtype_str} that has a dimension with size 0. Shape information will be lost in the conversion to List({value_feature}).\"\n        )\n        return List(value_feature)\n\n    rank = len(dset_shape)\n    if rank == 0:\n        return value_feature\n    elif rank == 1:\n        return List(value_feature, length=dset_shape[0])\n    elif rank <= 5:\n        return _sized_arrayxd(rank)(shape=dset_shape, dtype=dtype_str)\n    else:\n        raise TypeError(f\"Array{rank}D not supported. 
Maximum 5 dimensions allowed.\")\n\n\ndef _sized_arrayxd(rank: int):\n    return {2: Array2D, 3: Array3D, 4: Array4D, 5: Array5D}[rank]\n\n\ndef _np_to_pa_to_hf_value(numpy_dtype: np.dtype) -> Value:\n    return Value(dtype=_arrow_to_datasets_dtype(pa.from_numpy_dtype(numpy_dtype)))\n\n\ndef _first_dataset(h5_obj, features: Features, prefix=\"\"):\n    for path, dset in h5_obj.items():\n        if path not in features:\n            continue\n        if _is_group(dset):\n            found = _first_dataset(dset, features[path], prefix=f\"{prefix}{path}/\")\n            if found is not None:\n                return found\n        elif _is_dataset(dset):\n            return f\"{prefix}{path}\"\n\n\ndef _check_dataset_lengths(h5_obj, features: Features) -> int:\n    first_path = _first_dataset(h5_obj, features)\n    if first_path is None:\n        return None\n\n    num_rows = h5_obj[first_path].shape[0]\n    for path, dset in h5_obj.items():\n        if path not in features:\n            continue\n        if _is_dataset(dset):\n            if dset.shape[0] != num_rows:\n                raise ValueError(f\"Dataset '{path}' has length {dset.shape[0]} but expected {num_rows}\")\n    return num_rows\n\n\ndef _is_group(h5_obj) -> bool:\n    import h5py\n\n    return isinstance(h5_obj, h5py.Group) or isinstance(h5_obj, _CompoundGroup)\n\n\ndef _is_dataset(h5_obj) -> bool:\n    import h5py\n\n    return isinstance(h5_obj, h5py.Dataset) or isinstance(h5_obj, _CompoundField)\n\n\ndef _is_file(h5_obj) -> bool:\n    import h5py\n\n    return isinstance(h5_obj, h5py.File)\n\n\ndef _has_zero_dimensions(feature):\n    if isinstance(feature, _ArrayXD):\n        return any(dim == 0 for dim in feature.shape)\n    elif isinstance(feature, List):\n        return feature.length == 0 or _has_zero_dimensions(feature.feature)\n    elif isinstance(feature, LargeList):\n        return _has_zero_dimensions(feature.feature)\n    else:\n        return False\n"
  },
  {
    "path": "src/datasets/packaged_modules/imagefolder/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/imagefolder/imagefolder.py",
    "content": "import datasets\n\nfrom ..folder_based_builder import folder_based_builder\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\nclass ImageFolderConfig(folder_based_builder.FolderBasedBuilderConfig):\n    \"\"\"BuilderConfig for ImageFolder.\"\"\"\n\n    drop_labels: bool = None\n    drop_metadata: bool = None\n\n    def __post_init__(self):\n        super().__post_init__()\n\n\nclass ImageFolder(folder_based_builder.FolderBasedBuilder):\n    BASE_FEATURE = datasets.Image\n    BASE_COLUMN_NAME = \"image\"\n    BUILDER_CONFIG_CLASS = ImageFolderConfig\n    EXTENSIONS: list[str]  # definition at the bottom of the script\n\n\n# Obtained with:\n# ```\n# import PIL.Image\n# IMAGE_EXTENSIONS = []\n# PIL.Image.init()\n# for ext, format in PIL.Image.EXTENSION.items():\n#     if format in PIL.Image.OPEN:\n#         IMAGE_EXTENSIONS.append(ext[1:])\n# ```\n# We intentionally do not run this code on launch because:\n# (1) Pillow is an optional dependency, so importing Pillow in global namespace is not allowed\n# (2) To ensure the list of supported extensions is deterministic\nIMAGE_EXTENSIONS = [\n    \".blp\",\n    \".bmp\",\n    \".dib\",\n    \".bufr\",\n    \".cur\",\n    \".pcx\",\n    \".dcx\",\n    \".dds\",\n    \".ps\",\n    \".eps\",\n    \".fit\",\n    \".fits\",\n    \".fli\",\n    \".flc\",\n    \".ftc\",\n    \".ftu\",\n    \".gbr\",\n    \".gif\",\n    \".grib\",\n    # \".h5\",   # may contain zero or several images\n    # \".hdf\",  # may contain zero or several images\n    \".png\",\n    \".apng\",\n    \".jp2\",\n    \".j2k\",\n    \".jpc\",\n    \".jpf\",\n    \".jpx\",\n    \".j2c\",\n    \".icns\",\n    \".ico\",\n    \".im\",\n    \".iim\",\n    \".tif\",\n    \".tiff\",\n    \".jfif\",\n    \".jpe\",\n    \".jpg\",\n    \".jpeg\",\n    \".mpg\",\n    \".mpeg\",\n    \".msp\",\n    \".pcd\",\n    \".pxr\",\n    \".pbm\",\n    \".pgm\",\n    \".ppm\",\n    \".pnm\",\n    \".psd\",\n    \".bw\",\n    \".rgb\",\n    \".rgba\",\n   
 \".sgi\",\n    \".ras\",\n    \".tga\",\n    \".icb\",\n    \".vda\",\n    \".vst\",\n    \".webp\",\n    \".wmf\",\n    \".emf\",\n    \".xbm\",\n    \".xpm\",\n]\nImageFolder.EXTENSIONS = IMAGE_EXTENSIONS\n"
  },
  {
    "path": "src/datasets/packaged_modules/json/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/json/json.py",
    "content": "import io\nfrom dataclasses import dataclass\nfrom typing import Literal, Optional\n\nimport pandas as pd\nimport pyarrow as pa\nimport pyarrow.json as paj\n\nimport datasets\nimport datasets.config\nfrom datasets.builder import Key\nfrom datasets.table import table_cast\nfrom datasets.utils.file_utils import readline\nfrom datasets.utils.json import (\n    find_mixed_struct_types_field_paths,\n    get_json_field_path_from_pyarrow_json_error,\n    get_json_field_paths_from_feature,\n    insert_json_field_path,\n    json_encode_field,\n    json_encode_fields_in_json_lines,\n    set_json_types_in_feature,\n    ujson_dumps,\n    ujson_loads,\n)\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\ndef pandas_read_json(path_or_buf, **kwargs):\n    if datasets.config.PANDAS_VERSION.major >= 2:\n        kwargs[\"dtype_backend\"] = \"pyarrow\"\n    return pd.read_json(path_or_buf, **kwargs)\n\n\nclass FullReadDisallowed(Exception):\n    pass\n\n\n@dataclass\nclass JsonConfig(datasets.BuilderConfig):\n    \"\"\"BuilderConfig for JSON.\"\"\"\n\n    features: Optional[datasets.Features] = None\n    encoding: str = \"utf-8\"\n    encoding_errors: Optional[str] = None\n    field: Optional[str] = None\n    use_threads: bool = True  # deprecated\n    block_size: Optional[int] = None  # deprecated\n    chunksize: int = 10 << 20  # 10MB\n    newlines_in_values: Optional[bool] = None\n    on_mixed_types: Optional[Literal[\"use_json\"]] = \"use_json\"\n\n    def __post_init__(self):\n        super().__post_init__()\n\n\nclass Json(datasets.ArrowBasedBuilder):\n    BUILDER_CONFIG_CLASS = JsonConfig\n\n    def _info(self):\n        if self.config.block_size is not None:\n            logger.warning(\"The JSON loader parameter `block_size` is deprecated. 
Please use `chunksize` instead\")\n            self.config.chunksize = self.config.block_size\n        if self.config.use_threads is not True:\n            logger.warning(\n                \"The JSON loader parameter `use_threads` is deprecated and doesn't have any effect anymore.\"\n            )\n        if self.config.newlines_in_values is not None:\n            raise ValueError(\"The JSON loader parameter `newlines_in_values` is no longer supported\")\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager):\n        \"\"\"We handle string, list and dicts in datafiles\"\"\"\n        if not self.config.data_files:\n            raise ValueError(f\"At least one data file must be specified, but got data_files={self.config.data_files}\")\n        dl_manager.download_config.extract_on_the_fly = True\n        base_data_files = dl_manager.download(self.config.data_files)\n        extracted_data_files = dl_manager.extract(base_data_files)\n        splits = []\n        for split_name, extracted_files in extracted_data_files.items():\n            files_iterables = [dl_manager.iter_files(extracted_file) for extracted_file in extracted_files]\n            splits.append(\n                datasets.SplitGenerator(\n                    name=split_name,\n                    gen_kwargs={\"files_iterables\": files_iterables, \"base_files\": base_data_files[split_name]},\n                )\n            )\n        if self.info.features is None:\n            try:\n                pa_table = next(iter(self._generate_tables(**splits[0].gen_kwargs, allow_full_read=False)))[1]\n                self.info.features = datasets.Features.from_arrow_schema(pa_table.schema)\n            except FullReadDisallowed:\n                pass\n        return splits\n\n    def _cast_table(self, pa_table: pa.Table, json_field_paths=()) -> pa.Table:\n        if self.info.features is not None:\n            # adding missing columns\n            for 
column_name in set(self.info.features) - set(pa_table.column_names):\n                type = self.info.features.arrow_schema.field(column_name).type\n                pa_table = pa_table.append_column(column_name, pa.array([None] * len(pa_table), type=type))\n            # convert to string when needed\n            for i, column_name in enumerate(pa_table.column_names):\n                if pa.types.is_struct(pa_table[column_name].type) and self.info.features.get(\n                    column_name, None\n                ) == datasets.Value(\"string\"):\n                    jsonl = (\n                        pa_table[column_name]\n                        .to_pandas(types_mapper=pd.ArrowDtype)\n                        .to_json(orient=\"records\", lines=True)\n                    )\n                    string_array = pa.array(\n                        (None if x.strip() == \"null\" else x.strip() for x in jsonl.split(\"\\n\") if x.strip()),\n                        type=pa.string(),\n                    )\n                    pa_table = pa_table.set_column(i, column_name, string_array)\n            # more expensive cast to support nested structures with keys in a different order\n            # allows str <-> int/float or str to Audio for example\n            pa_table = table_cast(pa_table, self.info.features.arrow_schema)\n        elif json_field_paths:\n            features = datasets.Features.from_arrow_schema(pa_table.schema)\n            features = set_json_types_in_feature(features, json_field_paths)\n            pa_table = table_cast(pa_table, features.arrow_schema)\n        return pa_table\n\n    def _generate_shards(self, base_files, files_iterables):\n        yield from base_files\n\n    def _generate_tables(self, base_files, files_iterables, allow_full_read=True):\n        json_field_paths = []\n\n        if self.info.features is not None:\n            json_field_paths = get_json_field_paths_from_feature(self.info.features)\n\n        for shard_idx, 
files_iterable in enumerate(files_iterables):\n            for file in files_iterable:\n                # If the file is one json object and if we need to look at the items in one specific field\n                if self.config.field is not None:\n                    if not allow_full_read:\n                        raise FullReadDisallowed()\n                    with open(file, encoding=self.config.encoding, errors=self.config.encoding_errors) as f:\n                        dataset = ujson_loads(f.read())\n                    # We keep only the field we are interested in\n                    dataset = dataset[self.config.field]\n                    df = pandas_read_json(io.StringIO(ujson_dumps(dataset)))\n                    if df.columns.tolist() == [0]:\n                        df.columns = list(self.config.features) if self.config.features else [\"text\"]\n                    pa_table = pa.Table.from_pandas(df, preserve_index=False)\n                    yield Key(shard_idx, 0), self._cast_table(pa_table)\n\n                # If the file has one json object per line\n                else:\n                    with open(file, \"rb\") as f:\n                        batch_idx = 0\n                        # Use block_size equal to the chunk size divided by 32 to leverage multithreading\n                        # Set a default minimum value of 16kB if the chunk size is really small\n                        block_size = max(self.config.chunksize // 32, 16 << 10)\n                        encoding_errors = (\n                            self.config.encoding_errors if self.config.encoding_errors is not None else \"strict\"\n                        )\n                        while True:\n                            batch = f.read(self.config.chunksize)\n                            if not batch:\n                                break\n                            if batch.startswith(b\"[\"):\n                                if not allow_full_read:\n                            
        raise FullReadDisallowed()\n                                else:\n                                    # convert to JSON Lines\n                                    full_data = batch + f.read()\n                                    if b\"{\" in batch[:100].split(b'\"', 1)[0]:  # list of objects\n                                        batch = \"\\n\".join(ujson_dumps(x) for x in ujson_loads(full_data)).encode()\n                                    else:  # list of strings\n                                        batch = \"\\n\".join(\n                                            ujson_dumps({\"text\": x}) for x in ujson_loads(full_data)\n                                        ).encode()\n                            # Finish current line\n                            try:\n                                batch += f.readline()\n                            except (AttributeError, io.UnsupportedOperation):\n                                batch += readline(f)\n                            # PyArrow only accepts utf-8 encoded bytes\n                            if self.config.encoding != \"utf-8\":\n                                batch = batch.decode(self.config.encoding, errors=encoding_errors).encode(\"utf-8\")\n                            # On first batch we check for lists of objects with arbitrary fields\n                            if (\n                                shard_idx == 0\n                                and batch_idx == 0\n                                and self.info.features is None\n                                and self.config.on_mixed_types == \"use_json\"\n                            ):\n                                examples = [ujson_loads(line) for line in batch.splitlines()]\n                                json_field_paths += find_mixed_struct_types_field_paths(examples)\n                            # Re-encode JSON fields\n                            original_batch = batch\n                            if json_field_paths:\n          
                      examples = [ujson_loads(line) for line in batch.splitlines()]\n                                for json_field_path in json_field_paths:\n                                    examples = [json_encode_field(examples, json_field_path) for examples in examples]\n                                batch = \"\\n\".join(ujson_dumps(example) for example in examples).encode()\n                            # Disable parallelism if block size is ~ len(batch) to avoid segfault\n                            block_size = len(batch) if len(batch) // 8 > block_size else block_size\n                            try:\n                                while True:\n                                    try:\n                                        pa_table = paj.read_json(\n                                            io.BytesIO(batch), read_options=paj.ReadOptions(block_size=block_size)\n                                        )\n                                        break\n                                    except (pa.ArrowInvalid, pa.ArrowNotImplementedError) as e:\n                                        if batch.startswith(b\"[\"):  # paj.read_json only supports json lines\n                                            raise\n                                        elif self.config.on_mixed_types == \"use_json\" and (\n                                            isinstance(e, pa.ArrowInvalid)\n                                            and \"JSON parse error: Column(\" in str(e)\n                                            and \") changed from\" in str(e)\n                                        ):\n                                            json_field_path = get_json_field_path_from_pyarrow_json_error(str(e))\n                                            insert_json_field_path(json_field_paths, json_field_path)\n                                            batch = json_encode_fields_in_json_lines(original_batch, json_field_paths)\n                                      
  elif (\n                                            \"straddling\" in str(e) or \"JSON conversion to\" in str(e)\n                                        ) and block_size < len(batch):\n                                            # Increase the block size in case it was too small.\n                                            # The block size will be reset for the next file.\n                                            # this is needed in case of \"straddling\" or for some JSON conversions (see https://github.com/huggingface/datasets/issues/2799)\n                                            logger.debug(\n                                                f\"Batch of {len(batch)} bytes couldn't be parsed with block_size={block_size}. Retrying with block_size={block_size * 2}.\"\n                                            )\n                                            block_size *= 2\n                                        else:\n                                            raise\n                            except pa.ArrowInvalid as e:\n                                if not allow_full_read:\n                                    raise FullReadDisallowed()\n                                try:\n                                    with open(\n                                        file, encoding=self.config.encoding, errors=self.config.encoding_errors\n                                    ) as f:\n                                        df = pandas_read_json(f)\n                                except ValueError:\n                                    logger.error(f\"Failed to load JSON from file '{file}' with error {type(e)}: {e}\")\n                                    raise e\n                                if df.columns.tolist() == [0]:\n                                    df.columns = list(self.config.features) if self.config.features else [\"text\"]\n                                try:\n                                    pa_table = pa.Table.from_pandas(df, 
preserve_index=False)\n                                except pa.ArrowInvalid as e:\n                                    logger.error(\n                                        f\"Failed to convert pandas DataFrame to Arrow Table from file '{file}' with error {type(e)}: {e}\"\n                                    )\n                                    raise ValueError(\n                                        f\"Failed to convert pandas DataFrame to Arrow Table from file {file}.\"\n                                    ) from None\n                                yield Key(shard_idx, 0), self._cast_table(pa_table)\n                                break\n                            yield (\n                                Key(shard_idx, batch_idx),\n                                self._cast_table(pa_table, json_field_paths=json_field_paths),\n                            )\n                            batch_idx += 1\n"
  },
  {
    "path": "src/datasets/packaged_modules/lance/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/lance/lance.py",
    "content": "import re\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Dict, List, Optional\n\nimport pyarrow as pa\nfrom huggingface_hub import HfApi\n\nimport datasets\nfrom datasets import Audio, Image, Video\nfrom datasets.builder import Key\nfrom datasets.table import table_cast\nfrom datasets.utils.file_utils import is_local_path\n\n\nif TYPE_CHECKING:\n    import lance\n    import lance.file\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\nMAGIC_BYTES_EXTENSION_AND_FEATURE_TYPES = [\n    (\"1A 45 DF A3\", \".mkv\", Video()),\n    (\"66 74 79 70 69 73 6F 6D\", \".mp4\", Video()),\n    (\"66 74 79 70 4D 53 4E 56\", \".mp4\", Video()),\n    (\"52 49 46 46\", \".avi\", Video()),\n    (\"00 00 01 BA\", \".mpeg\", Video()),\n    (\"00 00 01 BA\", \".mpeg\", Video()),\n    (\"00 00 01 B3\", \".mov\", Video()),\n    (\"89 50 4E 47\", \".png\", Image()),\n    (\"FF D8\", \".jpg\", Image()),\n    (\"49 49\", \".tif\", Image()),\n    (\"47 49 46 38\", \".gif\", Image()),\n    (\"52 49 46 46\", \".wav\", Audio()),\n    (\"49 44 33\", \".mp3\", Audio()),\n    (\"66 4C 61 43\", \".flac\", Audio()),\n]\n\n\n@dataclass\nclass LanceConfig(datasets.BuilderConfig):\n    \"\"\"\n    BuilderConfig for Lance format.\n\n    Args:\n        features: (`Features`, *optional*):\n            Cast the data to `features`.\n        columns: (`List[str]`, *optional*):\n            List of columns to load, the other ones are ignored.\n        batch_size: (`int`, *optional*):\n            Size of the RecordBatches to iterate on. 
Default to 256.\n        token: (`str`, *optional*):\n            Optional HF token to use to download datasets.\n    \"\"\"\n\n    features: Optional[datasets.Features] = None\n    columns: Optional[List[str]] = None\n    batch_size: Optional[int] = 256\n    token: Optional[str] = None\n\n\ndef resolve_dataset_uris(files: List[str]) -> Dict[str, List[str]]:\n    dataset_uris = set()\n    for file_path in files:\n        path = Path(file_path)\n        if path.parent.name in {\"_transactions\", \"_indices\", \"_versions\"}:\n            dataset_root = path.parent.parent\n            dataset_uris.add(str(dataset_root))\n    return list(dataset_uris)\n\n\ndef _fix_hf_uri(uri: str) -> str:\n    # replace the revision tag from hf uri\n    if \"@\" in uri:\n        matched = re.match(r\"(hf://.+?)(@[0-9a-f]+)(/.*)\", uri)\n        if matched:\n            uri = matched.group(1) + matched.group(3)\n    return uri\n\n\ndef _fix_local_version_file(uri: str) -> str:\n    # replace symlinks with real files for _version\n    if \"/_versions/\" in uri and is_local_path(uri):\n        path = Path(uri)\n        if path.is_symlink():\n            data = path.read_bytes()\n            path.unlink()\n            path.write_bytes(data)\n    return uri\n\n\nclass Lance(datasets.ArrowBasedBuilder, datasets.builder._CountableBuilderMixin):\n    BUILDER_CONFIG_CLASS = LanceConfig\n    METADATA_EXTENSIONS = [\".idx\", \".txn\", \".manifest\"]\n\n    def _info(self):\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager):\n        import lance\n        import lance.file\n\n        if not self.config.data_files:\n            raise ValueError(f\"At least one data file must be specified, but got data_files={self.config.data_files}\")\n        if self.repo_id:\n            api = HfApi(**dl_manager.download_config.storage_options.get(\"hf\", {}))\n            dataset_sha = api.dataset_info(self.repo_id).sha\n            if dataset_sha 
!= self.hash:\n                raise NotImplementedError(\n                    f\"lance doesn't support loading other revisions than 'main' yet, but got {self.hash}\"\n                )\n        data_files = dl_manager.download(self.config.data_files)\n\n        # TODO: remove once Lance supports HF links with revisions\n        data_files = {split: [_fix_hf_uri(file) for file in files] for split, files in data_files.items()}\n        # TODO: remove once Lance supports symlinks for _version files\n        data_files = {split: [_fix_local_version_file(file) for file in files] for split, files in data_files.items()}\n\n        splits: list[datasets.SplitGenerator] = []\n        for split_name, files in data_files.items():\n            storage_options = dl_manager.download_config.storage_options.get(files[0].split(\"://\", 0)[0] + \"://\")\n\n            lance_dataset_uris = resolve_dataset_uris(files)\n            if lance_dataset_uris:\n                lance_datasets = [lance.dataset(uri, storage_options=storage_options) for uri in lance_dataset_uris]\n                fragments = [frag for lance_dataset in lance_datasets for frag in lance_dataset.get_fragments()]\n                if self.info.features is None:\n                    pa_schema = fragments[0]._ds.schema\n                    first_row_first_bytes = {}\n                    for field in pa_schema:\n                        if self.config.columns is not None and field.name not in self.config.columns:\n                            continue\n                        if pa.types.is_binary(field.type) or pa.types.is_large_binary(field.type):\n                            try:\n                                first_row_first_bytes[field.name] = (\n                                    lance_datasets[0].take_blobs(field.name, [0])[0].read(16)\n                                )\n                            except ValueError:\n                                first_row_first_bytes[field.name] = (\n                         
           lance_datasets[0].take([0], [field.name]).to_pylist()[0][field.name][:16]\n                                )\n                splits.append(\n                    datasets.SplitGenerator(\n                        name=split_name,\n                        gen_kwargs={\"fragments\": fragments, \"lance_files_paths\": None, \"lance_files\": None},\n                    )\n                )\n            else:\n                lance_files = [\n                    lance.file.LanceFileReader(file, storage_options=storage_options, columns=self.config.columns)\n                    for file in files\n                ]\n                if self.info.features is None:\n                    pa_schema = lance_files[0].metadata().schema\n                    first_row_first_bytes = {\n                        field_name: value[:16]\n                        for field_name, value in lance_files[0].take_rows([0]).to_table().to_pylist()[0].items()\n                        if isinstance(value, bytes)\n                    }\n                splits.append(\n                    datasets.SplitGenerator(\n                        name=split_name,\n                        gen_kwargs={\"fragments\": None, \"lance_files_paths\": files, \"lance_files\": lance_files},\n                    )\n                )\n            if self.info.features is None:\n                if self.config.columns:\n                    fields = [\n                        pa_schema.field(name) for name in self.config.columns if pa_schema.get_field_index(name) != -1\n                    ]\n                    pa_schema = pa.schema(fields)\n                features = datasets.Features.from_arrow_schema(pa_schema)\n                for field_name, first_bytes in first_row_first_bytes.items():\n                    for magic_bytes_hex, _, feature_type in MAGIC_BYTES_EXTENSION_AND_FEATURE_TYPES:\n                        magic_bytes = bytes.fromhex(magic_bytes_hex)\n                        if magic_bytes in first_bytes[: 
len(magic_bytes) * 2]:  # allow some padding\n                            features[field_name] = feature_type\n                            break\n                self.info.features = features\n\n        return splits\n\n    def _cast_table(self, pa_table: pa.Table) -> pa.Table:\n        if self.info.features is not None:\n            # more expensive cast to support nested features with keys in a different order\n            # allows str <-> int/float or str to Audio for example\n            pa_table = table_cast(pa_table, self.info.features.arrow_schema)\n        return pa_table\n\n    def _generate_shards(\n        self,\n        fragments: Optional[List[\"lance.LanceFragment\"]],\n        lance_files_paths: Optional[list[str]],\n        lance_files: Optional[List[\"lance.file.LanceFileReader\"]],\n    ):\n        if fragments:\n            for fragment in fragments:\n                paths = [data_file.path for data_file in fragment.metadata.data_files()]\n                yield paths[0] if len(paths) == 1 else {\"fragment_data_files\": paths}\n        else:\n            yield from lance_files_paths\n\n    def _generate_num_examples(\n        self,\n        fragments: Optional[List[\"lance.LanceFragment\"]],\n        lance_files_paths: Optional[list[str]],\n        lance_files: Optional[List[\"lance.file.LanceFileReader\"]],\n    ):\n        if fragments:\n            for fragment in fragments:\n                yield fragment.count_rows()\n        else:\n            for lance_file in lance_files:\n                yield lance_file.num_rows()\n\n    def _generate_tables(\n        self,\n        fragments: Optional[List[\"lance.LanceFragment\"]],\n        lance_files_paths: Optional[list[str]],\n        lance_files: Optional[List[\"lance.file.LanceFileReader\"]],\n    ):\n        if fragments:\n            for frag_idx, fragment in enumerate(fragments):\n                for batch_idx, batch in enumerate(\n                    fragment.to_batches(\n                     
   columns=self.config.columns, batch_size=self.config.batch_size, blob_handling=\"all_binary\"\n                    )\n                ):\n                    table = pa.Table.from_batches([batch])\n                    yield Key(frag_idx, batch_idx), self._cast_table(table)\n        else:\n            for file_idx, lance_file in enumerate(lance_files):\n                for batch_idx, batch in enumerate(lance_file.read_all(batch_size=self.config.batch_size).to_batches()):\n                    table = pa.Table.from_batches([batch])\n                    yield Key(file_idx, batch_idx), self._cast_table(table)\n"
  },
  {
    "path": "src/datasets/packaged_modules/niftifolder/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/niftifolder/niftifolder.py",
    "content": "import datasets\n\nfrom ..folder_based_builder import folder_based_builder\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\nclass NiftiFolderConfig(folder_based_builder.FolderBasedBuilderConfig):\n    \"\"\"BuilderConfig for NiftiFolder.\"\"\"\n\n    drop_labels: bool = None\n    drop_metadata: bool = None\n\n    def __post_init__(self):\n        super().__post_init__()\n\n\nclass NiftiFolder(folder_based_builder.FolderBasedBuilder):\n    BASE_FEATURE = datasets.Nifti\n    BASE_COLUMN_NAME = \"nifti\"\n    BUILDER_CONFIG_CLASS = NiftiFolderConfig\n    EXTENSIONS: list[str] = [\".nii\"]\n"
  },
  {
    "path": "src/datasets/packaged_modules/pandas/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/pandas/pandas.py",
    "content": "import warnings\nfrom dataclasses import dataclass\nfrom typing import Optional\n\nimport pandas as pd\nimport pyarrow as pa\n\nimport datasets\nfrom datasets.builder import Key\nfrom datasets.table import table_cast\n\n\n@dataclass\nclass PandasConfig(datasets.BuilderConfig):\n    \"\"\"BuilderConfig for Pandas.\"\"\"\n\n    features: Optional[datasets.Features] = None\n\n    def __post_init__(self):\n        super().__post_init__()\n\n\nclass Pandas(datasets.ArrowBasedBuilder):\n    BUILDER_CONFIG_CLASS = PandasConfig\n\n    def _info(self):\n        warnings.warn(\n            \"The Pandas builder is deprecated and will be removed in the next major version of datasets.\",\n            FutureWarning,\n        )\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager):\n        \"\"\"We handle string, list and dicts in datafiles\"\"\"\n        if not self.config.data_files:\n            raise ValueError(f\"At least one data file must be specified, but got data_files={self.config.data_files}\")\n        data_files = dl_manager.download(self.config.data_files)\n        splits = []\n        for split_name, files in data_files.items():\n            splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={\"files\": files}))\n        return splits\n\n    def _cast_table(self, pa_table: pa.Table) -> pa.Table:\n        if self.config.features is not None:\n            # more expensive cast to support nested features with keys in a different order\n            # allows str <-> int/float or str to Audio for example\n            pa_table = table_cast(pa_table, self.config.features.arrow_schema)\n        return pa_table\n\n    def _generate_shards(self, files):\n        yield from files\n\n    def _generate_tables(self, files):\n        for i, file in enumerate(files):\n            with open(file, \"rb\") as f:\n                pa_table = pa.Table.from_pandas(pd.read_pickle(f))\n              
  yield Key(i, 0), self._cast_table(pa_table)\n"
  },
  {
    "path": "src/datasets/packaged_modules/parquet/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/parquet/parquet.py",
    "content": "from dataclasses import dataclass\nfrom typing import Literal, Optional, Union\n\nimport pyarrow as pa\nimport pyarrow.dataset as ds\nimport pyarrow.parquet as pq\n\nimport datasets\nfrom datasets.builder import Key\nfrom datasets.table import table_cast\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\n@dataclass\nclass ParquetConfig(datasets.BuilderConfig):\n    \"\"\"\n    BuilderConfig for Parquet.\n\n    Args:\n        batch_size (`int`, *optional*):\n            Size of the RecordBatches to iterate on.\n            The default is the row group size (defined by the first row group).\n        columns (`list[str]`, *optional*)\n            List of columns to load, the other ones are ignored.\n            All columns are loaded by default.\n        features: (`Features`, *optional*):\n            Cast the data to `features`.\n        filters (`Union[pyarrow.dataset.Expression, list[tuple], list[list[tuple]]]`, *optional*):\n            Return only the rows matching the filter.\n            If possible the predicate will be pushed down to exploit the partition information\n            or internal metadata found in the data source, e.g. Parquet statistics.\n            Otherwise filters the loaded RecordBatches before yielding them.\n        fragment_scan_options (`pyarrow.dataset.ParquetFragmentScanOptions`, *optional*)\n            Scan-specific options for Parquet fragments.\n            This is especially useful to configure buffering and caching.\n\n            <Added version=\"4.2.0\"/>\n        on_bad_files (`Literal[\"error\", \"warn\", \"skip\"]`, *optional*, defaults to \"error\")\n            Specify what to do upon encountering a bad file (a file that can't be read). 
Allowed values are :\n            * 'error', raise an Exception when a bad file is encountered.\n            * 'warn', raise a warning when a bad file is encountered and skip that file.\n            * 'skip', skip bad files without raising or warning when they are encountered.\n\n            <Added version=\"4.2.0\"/>\n\n    Example:\n\n    Load a subset of columns:\n\n    ```python\n    >>> ds = load_dataset(parquet_dataset_id, columns=[\"col_0\", \"col_1\"])\n    ```\n\n    Stream data and efficiently filter data, possibly skipping entire files or row groups:\n\n    ```python\n    >>> filters = [(\"col_0\", \"==\", 0)]\n    >>> ds = load_dataset(parquet_dataset_id, streaming=True, filters=filters)\n    ```\n\n    Increase the minimum request size when streaming from 32MiB (default) to 128MiB and enable prefetching:\n\n    ```python\n    >>> import pyarrow\n    >>> import pyarrow.dataset\n    >>> fragment_scan_options = pyarrow.dataset.ParquetFragmentScanOptions(\n    ...     cache_options=pyarrow.CacheOptions(\n    ...         prefetch_limit=1,\n    ...         range_size_limit=128 << 20\n    ...     ),\n    ... 
)\n    >>> ds = load_dataset(parquet_dataset_id, streaming=True, fragment_scan_options=fragment_scan_options)\n    ```\n\n    \"\"\"\n\n    batch_size: Optional[int] = None\n    columns: Optional[list[str]] = None\n    features: Optional[datasets.Features] = None\n    filters: Optional[Union[ds.Expression, list[tuple], list[list[tuple]]]] = None\n    fragment_scan_options: Optional[ds.ParquetFragmentScanOptions] = None\n    on_bad_files: Literal[\"error\", \"warn\", \"skip\"] = \"error\"\n\n    def __post_init__(self):\n        super().__post_init__()\n\n\nclass Parquet(datasets.ArrowBasedBuilder):\n    BUILDER_CONFIG_CLASS = ParquetConfig\n\n    def _info(self):\n        if (\n            self.config.columns is not None\n            and self.config.features is not None\n            and set(self.config.columns) != set(self.config.features)\n        ):\n            raise ValueError(\n                \"The columns and features argument must contain the same columns, but got \",\n                f\"{self.config.columns} and {self.config.features}\",\n            )\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager):\n        \"\"\"We handle string, list and dicts in datafiles\"\"\"\n        if not self.config.data_files:\n            raise ValueError(f\"At least one data file must be specified, but got data_files={self.config.data_files}\")\n        dl_manager.download_config.extract_on_the_fly = True\n        data_files = dl_manager.download(self.config.data_files)\n        splits = []\n        for split_name, files in data_files.items():\n            # Infer features if they are stored in the arrow schema\n            if self.info.features is None:\n                for file in files:\n                    try:\n                        with open(file, \"rb\") as f:\n                            self.info.features = datasets.Features.from_arrow_schema(pq.read_schema(f))\n                            break\n   
                 except pa.ArrowInvalid as e:\n                        if self.config.on_bad_files == \"error\":\n                            logger.error(f\"Failed to read schema from '{file}' with error {type(e).__name__}: {e}\")\n                            raise\n                        elif self.config.on_bad_files == \"warn\":\n                            logger.warning(f\"Skipping bad schema from '{file}'. {type(e).__name__}: {e}`\")\n                        else:\n                            logger.debug(f\"Skipping bad schema from '{file}'. {type(e).__name__}: {e}`\")\n            if self.info.features is None:\n                raise ValueError(\n                    f\"At least one valid data file must be specified, all the data_files are invalid: {self.config.data_files}\"\n                )\n            splits.append(\n                datasets.SplitGenerator(\n                    name=split_name, gen_kwargs={\"files\": files, \"row_groups_list\": [None] * len(files)}\n                )\n            )\n        if self.config.columns is not None and set(self.config.columns) != set(self.info.features):\n            self.info.features = datasets.Features(\n                {col: feat for col, feat in self.info.features.items() if col in self.config.columns}\n            )\n        return splits\n\n    def _cast_table(self, pa_table: pa.Table) -> pa.Table:\n        if self.info.features is not None:\n            # more expensive cast to support nested features with keys in a different order\n            # allows str <-> int/float or str to Audio for example\n            pa_table = table_cast(pa_table, self.info.features.arrow_schema)\n        return pa_table\n\n    def _generate_shards(self, files, row_groups_list):\n        if not row_groups_list:\n            yield from files\n        else:\n            for file, row_groups in zip(files, row_groups_list):\n                yield {\n                    \"fragment_data_file\": file,\n                    
\"fragment_row_groups\": row_groups,\n                }\n\n    def _generate_more_gen_kwargs(self, files, row_groups_list):\n        if not row_groups_list:\n            parquet_file_format = ds.ParquetFileFormat(default_fragment_scan_options=self.config.fragment_scan_options)\n            for file in files:\n                with open(file, \"rb\") as f:\n                    parquet_fragment = parquet_file_format.make_fragment(f)\n                    yield {\n                        \"files\": [file] * parquet_fragment.num_row_groups,\n                        \"row_groups_list\": [\n                            (row_group_id,) for row_group_id in range(parquet_fragment.num_row_groups)\n                        ],\n                    }\n        else:\n            for file, row_groups in zip(files, row_groups_list):\n                yield {\"files\": [file], \"row_groups_list\": [row_groups]}\n\n    def _generate_tables(self, files, row_groups_list):\n        if self.config.features is not None and self.config.columns is not None:\n            if sorted(field.name for field in self.info.features.arrow_schema) != sorted(self.config.columns):\n                raise ValueError(\n                    f\"Tried to load parquet data with columns '{self.config.columns}' with mismatching features '{self.info.features}'\"\n                )\n        filter_expr = (\n            pq.filters_to_expression(self.config.filters)\n            if isinstance(self.config.filters, list)\n            else self.config.filters\n        )\n        parquet_file_format = ds.ParquetFileFormat(default_fragment_scan_options=self.config.fragment_scan_options)\n        for file_idx, (file, row_groups) in enumerate(zip(files, row_groups_list)):\n            try:\n                with open(file, \"rb\") as f:\n                    parquet_fragment = parquet_file_format.make_fragment(f)\n                    if row_groups is not None:\n                        
parquet_fragment.subset(row_group_ids=row_groups)\n                    if parquet_fragment.row_groups:\n                        batch_size = self.config.batch_size or parquet_fragment.row_groups[0].num_rows\n                        for batch_idx, record_batch in enumerate(\n                            parquet_fragment.to_batches(\n                                batch_size=batch_size,\n                                columns=self.config.columns,\n                                filter=filter_expr,\n                                batch_readahead=0,\n                                fragment_readahead=0,\n                            )\n                        ):\n                            pa_table = pa.Table.from_batches([record_batch])\n                            # Uncomment for debugging (will print the Arrow table size and elements)\n                            # logger.warning(f\"pa_table: {pa_table} num rows: {pa_table.num_rows}\")\n                            # logger.warning('\\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))\n                            yield Key(file_idx, batch_idx), self._cast_table(pa_table)\n            except (pa.ArrowInvalid, ValueError) as e:\n                if self.config.on_bad_files == \"error\":\n                    logger.error(f\"Failed to read file '{file}' with error {type(e).__name__}: {e}\")\n                    raise\n                elif self.config.on_bad_files == \"warn\":\n                    logger.warning(f\"Skipping bad file '{file}'. {type(e).__name__}: {e}`\")\n                else:\n                    logger.debug(f\"Skipping bad file '{file}'. {type(e).__name__}: {e}`\")\n"
  },
  {
    "path": "src/datasets/packaged_modules/pdffolder/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/pdffolder/pdffolder.py",
    "content": "import datasets\n\nfrom ..folder_based_builder import folder_based_builder\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\nclass PdfFolderConfig(folder_based_builder.FolderBasedBuilderConfig):\n    \"\"\"BuilderConfig for ImageFolder.\"\"\"\n\n    drop_labels: bool = None\n    drop_metadata: bool = None\n\n    def __post_init__(self):\n        super().__post_init__()\n\n\nclass PdfFolder(folder_based_builder.FolderBasedBuilder):\n    BASE_FEATURE = datasets.Pdf\n    BASE_COLUMN_NAME = \"pdf\"\n    BUILDER_CONFIG_CLASS = PdfFolderConfig\n    EXTENSIONS: list[str] = [\".pdf\"]\n"
  },
  {
    "path": "src/datasets/packaged_modules/spark/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/spark/spark.py",
    "content": "import os\nimport posixpath\nimport uuid\nfrom collections.abc import Iterable\nfrom dataclasses import dataclass\nfrom itertools import islice\nfrom typing import TYPE_CHECKING, Optional, Union\n\nimport numpy as np\nimport pyarrow as pa\n\nimport datasets\nfrom datasets.arrow_writer import ArrowWriter, ParquetWriter\nfrom datasets.config import MAX_SHARD_SIZE\nfrom datasets.filesystems import (\n    is_remote_filesystem,\n    rename,\n)\nfrom datasets.iterable_dataset import _BaseExamplesIterable\nfrom datasets.utils import experimental\nfrom datasets.utils.py_utils import convert_file_size_to_int\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\nif TYPE_CHECKING:\n    import pyspark\n    import pyspark.sql\n\n\n@dataclass\nclass SparkConfig(datasets.BuilderConfig):\n    \"\"\"BuilderConfig for Spark.\"\"\"\n\n    features: Optional[datasets.Features] = None\n\n    def __post_init__(self):\n        super().__post_init__()\n\n\ndef _reorder_dataframe_by_partition(df: \"pyspark.sql.DataFrame\", new_partition_order: list[int]):\n    df_combined = df.select(\"*\").where(f\"part_id = {new_partition_order[0]}\")\n    for partition_id in new_partition_order[1:]:\n        partition_df = df.select(\"*\").where(f\"part_id = {partition_id}\")\n        df_combined = df_combined.union(partition_df)\n    return df_combined\n\n\ndef _generate_iterable_examples(\n    df: \"pyspark.sql.DataFrame\",\n    partition_order: list[int],\n    state_dict: Optional[dict] = None,\n):\n    import pyspark\n\n    df_with_partition_id = df.select(\"*\", pyspark.sql.functions.spark_partition_id().alias(\"part_id\"))\n    partition_idx_start = state_dict[\"partition_idx\"] if state_dict else 0\n    partition_df = _reorder_dataframe_by_partition(df_with_partition_id, partition_order[partition_idx_start:])\n    # pipeline next partition in parallel to hide latency\n    rows = partition_df.toLocalIterator(prefetchPartitions=True)\n    curr_partition = None\n    row_id = 
state_dict[\"partition_example_idx\"] if state_dict else 0\n    for row in islice(rows, row_id, None):\n        row_as_dict = row.asDict()\n        part_id = row_as_dict[\"part_id\"]\n        row_as_dict.pop(\"part_id\")\n        if curr_partition != part_id:\n            if state_dict and curr_partition is not None:\n                state_dict[\"partition_idx\"] += 1\n            curr_partition = part_id\n            row_id = 0\n        if state_dict:\n            state_dict[\"partition_example_idx\"] = row_id + 1\n        yield (part_id, row_id), row_as_dict\n        row_id += 1\n\n\nclass SparkExamplesIterable(_BaseExamplesIterable):\n    def __init__(\n        self,\n        df: \"pyspark.sql.DataFrame\",\n        partition_order=None,\n    ):\n        super().__init__()\n        self.df = df\n        self.partition_order = partition_order or range(self.df.rdd.getNumPartitions())\n\n    def _init_state_dict(self) -> dict:\n        self._state_dict = {\"partition_idx\": 0, \"partition_example_idx\": 0}\n        return self._state_dict\n\n    @experimental\n    def load_state_dict(self, state_dict: dict) -> dict:\n        return super().load_state_dict(state_dict)\n\n    def __iter__(self):\n        yield from _generate_iterable_examples(self.df, self.partition_order, self._state_dict)\n\n    def shuffle_data_sources(self, generator: np.random.Generator) -> \"SparkExamplesIterable\":\n        partition_order = list(range(self.df.rdd.getNumPartitions()))\n        generator.shuffle(partition_order)\n        return SparkExamplesIterable(self.df, partition_order=partition_order)\n\n    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> \"SparkExamplesIterable\":\n        partition_order = self.split_shard_indices_by_worker(num_shards=num_shards, index=index, contiguous=contiguous)\n        return SparkExamplesIterable(self.df, partition_order=partition_order)\n\n    @property\n    def num_shards(self) -> int:\n        return 
len(self.partition_order)\n\n\nclass Spark(datasets.DatasetBuilder):\n    BUILDER_CONFIG_CLASS = SparkConfig\n\n    def __init__(\n        self,\n        df: \"pyspark.sql.DataFrame\",\n        cache_dir: str = None,\n        working_dir: str = None,\n        **config_kwargs,\n    ):\n        import pyspark\n\n        self._spark = pyspark.sql.SparkSession.builder.getOrCreate()\n        self.df = df\n        self._working_dir = working_dir\n\n        super().__init__(\n            cache_dir=cache_dir,\n            config_name=str(self.df.semanticHash()),\n            **config_kwargs,\n        )\n\n    def _validate_cache_dir(self):\n        # Define this so that we don't reference self in create_cache_and_write_probe, which will result in a pickling\n        # error due to pickling the SparkContext.\n        cache_dir = self._cache_dir\n\n        # Returns the path of the created file.\n        def create_cache_and_write_probe(context):\n            # makedirs with exist_ok will recursively create the directory. 
It will not throw an error if directories\n            # already exist.\n            os.makedirs(cache_dir, exist_ok=True)\n            probe_file = os.path.join(cache_dir, \"fs_test\" + uuid.uuid4().hex)\n            # Opening the file in append mode will create a new file unless it already exists, in which case it will not\n            # change the file contents.\n            open(probe_file, \"a\")\n            return [probe_file]\n\n        if self._spark.conf.get(\"spark.master\", \"\").startswith(\"local\"):\n            return\n\n        # If the cluster is multi-node, make sure that the user provided a cache_dir and that it is on an NFS\n        # accessible to the driver.\n        # TODO: Stream batches to the driver using ArrowCollectSerializer instead of throwing an error.\n        if self._cache_dir:\n            probe = (\n                self._spark.sparkContext.parallelize(range(1), 1).mapPartitions(create_cache_and_write_probe).collect()\n            )\n            if os.path.isfile(probe[0]):\n                return\n\n        raise ValueError(\n            \"When using Dataset.from_spark on a multi-node cluster, the driver and all workers should be able to access cache_dir\"\n        )\n\n    def _info(self):\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager: datasets.download.download_manager.DownloadManager):\n        return [datasets.SplitGenerator(name=datasets.Split.TRAIN)]\n\n    def _repartition_df_if_needed(self, max_shard_size):\n        import pyspark\n\n        def get_arrow_batch_size(it):\n            for batch in it:\n                yield pa.RecordBatch.from_pydict({\"batch_bytes\": [batch.nbytes]})\n\n        df_num_rows = self.df.count()\n        sample_num_rows = df_num_rows if df_num_rows <= 100 else 100\n        # Approximate the size of each row (in Arrow format) by averaging over a max-100-row sample.\n        approx_bytes_per_row = (\n            
self.df.limit(sample_num_rows)\n            .repartition(1)\n            .mapInArrow(get_arrow_batch_size, \"batch_bytes: long\")\n            .agg(pyspark.sql.functions.sum(\"batch_bytes\").alias(\"sample_bytes\"))\n            .collect()[0]\n            .sample_bytes\n            / sample_num_rows\n        )\n        approx_total_size = approx_bytes_per_row * df_num_rows\n        if approx_total_size > max_shard_size:\n            # Make sure there is at least one row per partition.\n            new_num_partitions = min(df_num_rows, int(approx_total_size / max_shard_size))\n            self.df = self.df.repartition(new_num_partitions)\n\n    def _prepare_split_single(\n        self,\n        fpath: str,\n        file_format: str,\n        max_shard_size: int,\n    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:\n        import pyspark\n\n        writer_class = ParquetWriter if file_format == \"parquet\" else ArrowWriter\n        working_fpath = os.path.join(self._working_dir, os.path.basename(fpath)) if self._working_dir else fpath\n        embed_local_files = file_format == \"parquet\"\n\n        # Define these so that we don't reference self in write_arrow, which will result in a pickling error due to\n        # pickling the SparkContext.\n        features = self.config.features\n        writer_batch_size = self._writer_batch_size\n        storage_options = self._fs.storage_options\n\n        def write_arrow(it):\n            # Within the same SparkContext, no two task attempts will share the same attempt ID.\n            task_id = pyspark.TaskContext().taskAttemptId()\n            first_batch = next(it, None)\n            if first_batch is None:\n                # Some partitions might not receive any data.\n                return pa.RecordBatch.from_arrays(\n                    [[task_id], [0], [0]],\n                    names=[\"task_id\", \"num_examples\", \"num_bytes\"],\n                )\n            shard_id = 0\n            writer = writer_class(\n 
               features=features,\n                path=working_fpath.replace(\"SSSSS\", f\"{shard_id:05d}\").replace(\"TTTTT\", f\"{task_id:05d}\"),\n                writer_batch_size=writer_batch_size,\n                storage_options=storage_options,\n                embed_local_files=embed_local_files,\n            )\n            table = pa.Table.from_batches([first_batch])\n            writer.write_table(table)\n            for batch in it:\n                if max_shard_size is not None and writer._num_bytes >= max_shard_size:\n                    num_examples, num_bytes = writer.finalize()\n                    writer.close()\n                    yield pa.RecordBatch.from_arrays(\n                        [[task_id], [num_examples], [num_bytes]],\n                        names=[\"task_id\", \"num_examples\", \"num_bytes\"],\n                    )\n                    shard_id += 1\n                    writer = writer_class(\n                        features=writer._features,\n                        path=working_fpath.replace(\"SSSSS\", f\"{shard_id:05d}\").replace(\"TTTTT\", f\"{task_id:05d}\"),\n                        writer_batch_size=writer_batch_size,\n                        storage_options=storage_options,\n                        embed_local_files=embed_local_files,\n                    )\n                table = pa.Table.from_batches([batch])\n                writer.write_table(table)\n\n            if writer._num_bytes > 0:\n                num_examples, num_bytes = writer.finalize()\n                writer.close()\n                yield pa.RecordBatch.from_arrays(\n                    [[task_id], [num_examples], [num_bytes]],\n                    names=[\"task_id\", \"num_examples\", \"num_bytes\"],\n                )\n\n            if working_fpath != fpath:\n                for file in os.listdir(os.path.dirname(working_fpath)):\n                    dest = os.path.join(os.path.dirname(fpath), os.path.basename(file))\n                    
shutil.move(file, dest)\n\n        stats = (\n            self.df.mapInArrow(write_arrow, \"task_id: long, num_examples: long, num_bytes: long\")\n            .groupBy(\"task_id\")\n            .agg(\n                pyspark.sql.functions.sum(\"num_examples\").alias(\"total_num_examples\"),\n                pyspark.sql.functions.sum(\"num_bytes\").alias(\"total_num_bytes\"),\n                pyspark.sql.functions.count(\"num_bytes\").alias(\"num_shards\"),\n                pyspark.sql.functions.collect_list(\"num_examples\").alias(\"shard_lengths\"),\n            )\n            .collect()\n        )\n        for row in stats:\n            yield row.task_id, (row.total_num_examples, row.total_num_bytes, row.num_shards, row.shard_lengths)\n\n    def _prepare_split(\n        self,\n        split_generator: \"datasets.SplitGenerator\",\n        file_format: str = \"arrow\",\n        max_shard_size: Optional[Union[str, int]] = None,\n        num_proc: Optional[int] = None,\n        **kwargs,\n    ):\n        self._validate_cache_dir()\n\n        max_shard_size = convert_file_size_to_int(max_shard_size or MAX_SHARD_SIZE)\n        self._repartition_df_if_needed(max_shard_size)\n        is_local = not is_remote_filesystem(self._fs)\n        path_join = os.path.join if is_local else posixpath.join\n\n        SUFFIX = \"-TTTTT-SSSSS-of-NNNNN\"\n        fname = f\"{self.name}-{split_generator.name}{SUFFIX}.{file_format}\"\n        fpath = path_join(self._output_dir, fname)\n\n        total_num_examples = 0\n        total_num_bytes = 0\n        total_shards = 0\n        task_id_and_num_shards = []\n        all_shard_lengths = []\n\n        for task_id, content in self._prepare_split_single(fpath, file_format, max_shard_size):\n            (\n                num_examples,\n                num_bytes,\n                num_shards,\n                shard_lengths,\n            ) = content\n            if num_bytes > 0:\n                total_num_examples += num_examples\n            
    total_num_bytes += num_bytes\n                total_shards += num_shards\n                task_id_and_num_shards.append((task_id, num_shards))\n                all_shard_lengths.extend(shard_lengths)\n\n        split_generator.split_info.num_examples = total_num_examples\n        split_generator.split_info.num_bytes = total_num_bytes\n\n        # should rename everything at the end\n        logger.debug(f\"Renaming {total_shards} shards.\")\n        if total_shards > 1:\n            split_generator.split_info.shard_lengths = all_shard_lengths\n\n            # Define fs outside of _rename_shard so that we don't reference self in the function, which will result in a\n            # pickling error due to pickling the SparkContext.\n            fs = self._fs\n\n            # use the -SSSSS-of-NNNNN pattern\n            def _rename_shard(\n                task_id: int,\n                shard_id: int,\n                global_shard_id: int,\n            ):\n                rename(\n                    fs,\n                    fpath.replace(\"SSSSS\", f\"{shard_id:05d}\").replace(\"TTTTT\", f\"{task_id:05d}\"),\n                    fpath.replace(\"TTTTT-SSSSS\", f\"{global_shard_id:05d}\").replace(\"NNNNN\", f\"{total_shards:05d}\"),\n                )\n\n            args = []\n            global_shard_id = 0\n            for i in range(len(task_id_and_num_shards)):\n                task_id, num_shards = task_id_and_num_shards[i]\n                for shard_id in range(num_shards):\n                    args.append([task_id, shard_id, global_shard_id])\n                    global_shard_id += 1\n            self._spark.sparkContext.parallelize(args, len(args)).map(lambda args: _rename_shard(*args)).collect()\n        else:\n            # don't use any pattern\n            shard_id = 0\n            task_id = task_id_and_num_shards[0][0]\n            self._rename(\n                fpath.replace(\"SSSSS\", f\"{shard_id:05d}\").replace(\"TTTTT\", f\"{task_id:05d}\"),\n         
       fpath.replace(SUFFIX, \"\"),\n            )\n\n    def _get_examples_iterable_for_split(\n        self,\n        split_generator: \"datasets.SplitGenerator\",\n    ) -> SparkExamplesIterable:\n        return SparkExamplesIterable(self.df)\n"
  },
  {
    "path": "src/datasets/packaged_modules/sql/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/sql/sql.py",
    "content": "import sys\nfrom dataclasses import dataclass\nfrom typing import TYPE_CHECKING, Optional, Union\n\nimport pandas as pd\nimport pyarrow as pa\n\nimport datasets\nimport datasets.config\nfrom datasets.builder import Key\nfrom datasets.features.features import require_storage_cast\nfrom datasets.table import table_cast\n\n\nif TYPE_CHECKING:\n    import sqlite3\n\n    import sqlalchemy\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\n@dataclass\nclass SqlConfig(datasets.BuilderConfig):\n    \"\"\"BuilderConfig for SQL.\"\"\"\n\n    sql: Union[str, \"sqlalchemy.sql.Selectable\"] = None\n    con: Union[str, \"sqlalchemy.engine.Connection\", \"sqlalchemy.engine.Engine\", \"sqlite3.Connection\"] = None\n    index_col: Optional[Union[str, list[str]]] = None\n    coerce_float: bool = True\n    params: Optional[Union[list, tuple, dict]] = None\n    parse_dates: Optional[Union[list, dict]] = None\n    columns: Optional[list[str]] = None\n    chunksize: Optional[int] = 10_000\n    features: Optional[datasets.Features] = None\n\n    def __post_init__(self):\n        super().__post_init__()\n        if self.sql is None:\n            raise ValueError(\"sql must be specified\")\n        if self.con is None:\n            raise ValueError(\"con must be specified\")\n\n    def create_config_id(\n        self,\n        config_kwargs: dict,\n        custom_features: Optional[datasets.Features] = None,\n    ) -> str:\n        config_kwargs = config_kwargs.copy()\n        # We need to stringify the Selectable object to make its hash deterministic\n\n        # The process of stringifying is explained here: http://docs.sqlalchemy.org/en/latest/faq/sqlexpressions.html\n        sql = config_kwargs[\"sql\"]\n        if not isinstance(sql, str):\n            if datasets.config.SQLALCHEMY_AVAILABLE and \"sqlalchemy\" in sys.modules:\n                import sqlalchemy\n\n                if isinstance(sql, sqlalchemy.sql.Selectable):\n                    engine = 
sqlalchemy.create_engine(config_kwargs[\"con\"].split(\"://\")[0] + \"://\")\n                    sql_str = str(sql.compile(dialect=engine.dialect))\n                    config_kwargs[\"sql\"] = sql_str\n                else:\n                    raise TypeError(\n                        f\"Supported types for 'sql' are string and sqlalchemy.sql.Selectable but got {type(sql)}: {sql}\"\n                    )\n            else:\n                raise TypeError(\n                    f\"Supported types for 'sql' are string and sqlalchemy.sql.Selectable but got {type(sql)}: {sql}\"\n                )\n        con = config_kwargs[\"con\"]\n        if not isinstance(con, str):\n            config_kwargs[\"con\"] = id(con)\n            logger.info(\n                f\"SQL connection 'con' of type {type(con)} couldn't be hashed properly. To enable hashing, specify 'con' as URI string instead.\"\n            )\n\n        return super().create_config_id(config_kwargs, custom_features=custom_features)\n\n    @property\n    def pd_read_sql_kwargs(self):\n        pd_read_sql_kwargs = {\n            \"index_col\": self.index_col,\n            \"columns\": self.columns,\n            \"params\": self.params,\n            \"coerce_float\": self.coerce_float,\n            \"parse_dates\": self.parse_dates,\n        }\n        return pd_read_sql_kwargs\n\n\nclass Sql(datasets.ArrowBasedBuilder):\n    BUILDER_CONFIG_CLASS = SqlConfig\n\n    def _info(self):\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager):\n        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={})]\n\n    def _cast_table(self, pa_table: pa.Table) -> pa.Table:\n        if self.config.features is not None:\n            schema = self.config.features.arrow_schema\n            if all(not require_storage_cast(feature) for feature in self.config.features.values()):\n                # cheaper cast\n                pa_table = 
pa.Table.from_arrays([pa_table[field.name] for field in schema], schema=schema)\n            else:\n                # more expensive cast; allows str <-> int/float or str to Audio for example\n                pa_table = table_cast(pa_table, schema)\n        return pa_table\n\n    def _generate_tables(self):\n        chunksize = self.config.chunksize\n        sql_reader = pd.read_sql(\n            self.config.sql, self.config.con, chunksize=chunksize, **self.config.pd_read_sql_kwargs\n        )\n        sql_reader = [sql_reader] if chunksize is None else sql_reader\n        for chunk_idx, df in enumerate(sql_reader):\n            pa_table = pa.Table.from_pandas(df)\n            yield Key(0, chunk_idx), self._cast_table(pa_table)\n"
  },
  {
    "path": "src/datasets/packaged_modules/text/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/text/text.py",
    "content": "from dataclasses import dataclass\nfrom io import StringIO\nfrom typing import Literal, Optional\n\nimport pyarrow as pa\n\nimport datasets\nfrom datasets.builder import Key\nfrom datasets.features.features import require_storage_cast\nfrom datasets.table import table_cast\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\n@dataclass\nclass TextConfig(datasets.BuilderConfig):\n    \"\"\"BuilderConfig for text files.\n\n    Args:\n        features: (`Features`, *optional*):\n            Cast the data to `features`.\n        encoding: (`str`, defaults to \"utf-8\"):\n            Encoding to decode the file.\n        encoding_errors: (`str`, *optional*):\n            Argument to define what to do in case of encoding error.\n            This is the same as the `error` argument in `open()`.\n        chunksize: (`Features`, *optional*, defaults to \"10MB\"):\n            Chunk size to read the data.\n        keep_linebreaks: (`bool`, defaults to False):\n            Whether to keep line breaks.\n        sample_by (`Literal[\"line\", \"paragraph\", \"document\"]`, defaults to \"line\"):\n            Whether to load data per line, praragraph or document.\n            By default one row in the dataset = one line.\n    \"\"\"\n\n    features: Optional[datasets.Features] = None\n    encoding: str = \"utf-8\"\n    encoding_errors: Optional[str] = None\n    chunksize: int = 10 << 20  # 10MB\n    keep_linebreaks: bool = False\n    sample_by: Literal[\"line\", \"paragraph\", \"document\"] = \"line\"\n\n\nclass Text(datasets.ArrowBasedBuilder):\n    BUILDER_CONFIG_CLASS = TextConfig\n\n    def _info(self):\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager):\n        \"\"\"The `data_files` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]].\n\n        If str or List[str], then the dataset returns only the 'train' split.\n        If dict, then keys should 
be from the `datasets.Split` enum.\n        \"\"\"\n        if not self.config.data_files:\n            raise ValueError(f\"At least one data file must be specified, but got data_files={self.config.data_files}\")\n        dl_manager.download_config.extract_on_the_fly = True\n        base_data_files = dl_manager.download(self.config.data_files)\n        extracted_data_files = dl_manager.extract(base_data_files)\n        splits = []\n        for split_name, files in extracted_data_files.items():\n            files_iterables = [dl_manager.iter_files(file) for file in files]\n            splits.append(\n                datasets.SplitGenerator(\n                    name=split_name,\n                    gen_kwargs={\"files_iterables\": files_iterables, \"base_files\": base_data_files[split_name]},\n                )\n            )\n        return splits\n\n    def _cast_table(self, pa_table: pa.Table) -> pa.Table:\n        if self.config.features is not None:\n            schema = self.config.features.arrow_schema\n            if all(not require_storage_cast(feature) for feature in self.config.features.values()):\n                # cheaper cast\n                pa_table = pa_table.cast(schema)\n            else:\n                # more expensive cast; allows str <-> int/float or str to Audio for example\n                pa_table = table_cast(pa_table, schema)\n            return pa_table\n        else:\n            return pa_table.cast(pa.schema({\"text\": pa.string()}))\n\n    def _generate_shards(self, base_files, files_iterables):\n        yield from base_files\n\n    def _generate_tables(self, base_files, files_iterables):\n        pa_table_names = list(self.config.features) if self.config.features is not None else [\"text\"]\n        for shard_idx, files_iterable in enumerate(files_iterables):\n            for file in files_iterable:\n                # open in text mode, by default translates universal newlines (\"\\n\", \"\\r\\n\" and \"\\r\") into \"\\n\"\n        
        with open(file, encoding=self.config.encoding, errors=self.config.encoding_errors) as f:\n                    if self.config.sample_by == \"line\":\n                        batch_idx = 0\n                        while True:\n                            batch = f.read(self.config.chunksize)\n                            if not batch:\n                                break\n                            batch += f.readline()  # finish current line\n                            # StringIO.readlines, by default splits only on \"\\n\" (and keeps line breaks)\n                            batch = StringIO(batch).readlines()\n                            if not self.config.keep_linebreaks:\n                                batch = [line.rstrip(\"\\n\") for line in batch]\n                            pa_table = pa.Table.from_arrays([pa.array(batch)], names=pa_table_names)\n                            # Uncomment for debugging (will print the Arrow table size and elements)\n                            # logger.warning(f\"pa_table: {pa_table} num rows: {pa_table.num_rows}\")\n                            # logger.warning('\\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))\n                            yield Key(shard_idx, batch_idx), self._cast_table(pa_table)\n                            batch_idx += 1\n                    elif self.config.sample_by == \"paragraph\":\n                        batch_idx = 0\n                        batch = \"\"\n                        while True:\n                            new_batch = f.read(self.config.chunksize)\n                            if not new_batch:\n                                break\n                            batch += new_batch\n                            batch += f.readline()  # finish current line\n                            batch = batch.split(\"\\n\\n\")\n                            pa_table = pa.Table.from_arrays(\n                                [pa.array([example for example in 
batch[:-1] if example])], names=pa_table_names\n                            )\n                            # Uncomment for debugging (will print the Arrow table size and elements)\n                            # logger.warning(f\"pa_table: {pa_table} num rows: {pa_table.num_rows}\")\n                            # logger.warning('\\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))\n                            yield Key(shard_idx, batch_idx), self._cast_table(pa_table)\n                            batch_idx += 1\n                            batch = batch[-1]\n                        if batch:\n                            pa_table = pa.Table.from_arrays([pa.array([batch])], names=pa_table_names)\n                            yield (shard_idx, batch_idx), self._cast_table(pa_table)\n                    elif self.config.sample_by == \"document\":\n                        text = f.read()\n                        pa_table = pa.Table.from_arrays([pa.array([text])], names=pa_table_names)\n                        yield Key(shard_idx, 0), self._cast_table(pa_table)\n"
  },
  {
    "path": "src/datasets/packaged_modules/videofolder/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/videofolder/videofolder.py",
    "content": "import datasets\n\nfrom ..folder_based_builder import folder_based_builder\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\nclass VideoFolderConfig(folder_based_builder.FolderBasedBuilderConfig):\n    \"\"\"BuilderConfig for VideoFolder.\"\"\"\n\n    drop_labels: bool = None\n    drop_metadata: bool = None\n\n    def __post_init__(self):\n        super().__post_init__()\n\n\nclass VideoFolder(folder_based_builder.FolderBasedBuilder):\n    BASE_FEATURE = datasets.Video\n    BASE_COLUMN_NAME = \"video\"\n    BUILDER_CONFIG_CLASS = VideoFolderConfig\n    EXTENSIONS: list[str]  # definition at the bottom of the script\n\n\n# TODO: initial list, we should check the compatibility of other formats\nVIDEO_EXTENSIONS = [\n    \".mkv\",\n    \".mp4\",\n    \".avi\",\n    \".mpeg\",\n    \".mov\",\n]\nVideoFolder.EXTENSIONS = VIDEO_EXTENSIONS\n"
  },
  {
    "path": "src/datasets/packaged_modules/webdataset/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/webdataset/_tenbin.py",
    "content": "#\n# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved.\n# This file comes from the WebDataset library.\n# See the LICENSE file for licensing terms (BSD-style).\n#\n\n\"\"\"\nBinary tensor encodings for PyTorch and NumPy.\n\nThis defines efficient binary encodings for tensors. The format is 8 byte\naligned and can be used directly for computations when transmitted, say,\nvia RDMA. The format is supported by WebDataset with the `.ten` filename\nextension. It is also used by Tensorcom, Tensorcom RDMA, and can be used\nfor fast tensor storage with LMDB and in disk files (which can be memory\nmapped)\n\nData is encoded as a series of chunks:\n\n- magic number (int64)\n- length in bytes (int64)\n- bytes (multiple of 64 bytes long)\n\nArrays are a header chunk followed by a data chunk.\nHeader chunks have the following structure:\n\n- dtype (int64)\n- 8 byte array name\n- ndim (int64)\n- dim[0]\n- dim[1]\n- ...\n\"\"\"\n\nimport struct\nimport sys\n\nimport numpy as np\n\n\ndef bytelen(a):\n    \"\"\"Determine the length of a in bytes.\"\"\"\n    if hasattr(a, \"nbytes\"):\n        return a.nbytes\n    elif isinstance(a, (bytearray, bytes)):\n        return len(a)\n    else:\n        raise ValueError(a, \"cannot determine nbytes\")\n\n\ndef bytedata(a):\n    \"\"\"Return the raw data corresponding to a.\"\"\"\n    if isinstance(a, (bytearray, bytes, memoryview)):\n        return a\n    elif hasattr(a, \"data\"):\n        return a.data\n    else:\n        raise ValueError(a, \"cannot return bytedata\")\n\n\n# tables for converting between long/short NumPy dtypes\n\nlong_to_short = \"\"\"\nfloat16 f2\nfloat32 f4\nfloat64 f8\nint8 i1\nint16 i2\nint32 i4\nint64 i8\nuint8 u1\nuint16 u2\nunit32 u4\nuint64 u8\n\"\"\".strip()\nlong_to_short = [x.split() for x in long_to_short.split(\"\\n\")]\nlong_to_short = {x[0]: x[1] for x in long_to_short}\nshort_to_long = {v: k for k, v in long_to_short.items()}\n\n\ndef check_acceptable_input_type(data, 
allow64):\n    \"\"\"Check that the data has an acceptable type for tensor encoding.\n\n    :param data: array\n    :param allow64: allow 64 bit types\n    \"\"\"\n    for a in data:\n        if a.dtype.name not in long_to_short:\n            raise ValueError(\"unsupported dataypte\")\n        if not allow64 and a.dtype.name not in [\"float64\", \"int64\", \"uint64\"]:\n            raise ValueError(\"64 bit datatypes not allowed unless explicitly enabled\")\n\n\ndef str64(s):\n    \"\"\"Convert a string to an int64.\"\"\"\n    s = s + \"\\0\" * (8 - len(s))\n    s = s.encode(\"ascii\")\n    return struct.unpack(\"@q\", s)[0]\n\n\ndef unstr64(i):\n    \"\"\"Convert an int64 to a string.\"\"\"\n    b = struct.pack(\"@q\", i)\n    return b.decode(\"ascii\").strip(\"\\0\")\n\n\ndef check_infos(data, infos, required_infos=None):\n    \"\"\"Verify the info strings.\"\"\"\n    if required_infos is False or required_infos is None:\n        return data\n    if required_infos is True:\n        return data, infos\n    if not isinstance(required_infos, (tuple, list)):\n        raise ValueError(\"required_infos must be tuple or list\")\n    for required, actual in zip(required_infos, infos):\n        raise ValueError(f\"actual info {actual} doesn't match required info {required}\")\n    return data\n\n\ndef encode_header(a, info=\"\"):\n    \"\"\"Encode an array header as a byte array.\"\"\"\n    if a.ndim >= 10:\n        raise ValueError(\"too many dimensions\")\n    if a.nbytes != np.prod(a.shape) * a.itemsize:\n        raise ValueError(\"mismatch between size and shape\")\n    if a.dtype.name not in long_to_short:\n        raise ValueError(\"unsupported array type\")\n    header = [str64(long_to_short[a.dtype.name]), str64(info), len(a.shape)] + list(a.shape)\n    return bytedata(np.array(header, dtype=\"i8\"))\n\n\ndef decode_header(h):\n    \"\"\"Decode a byte array into an array header.\"\"\"\n    h = np.frombuffer(h, dtype=\"i8\")\n    if unstr64(h[0]) not in 
short_to_long:\n        raise ValueError(\"unsupported array type\")\n    dtype = np.dtype(short_to_long[unstr64(h[0])])\n    info = unstr64(h[1])\n    rank = int(h[2])\n    shape = tuple(h[3 : 3 + rank])\n    return shape, dtype, info\n\n\ndef encode_list(l, infos=None):  # noqa: E741\n    \"\"\"Given a list of arrays, encode them into a list of byte arrays.\"\"\"\n    if infos is None:\n        infos = [\"\"]\n    else:\n        if len(l) != len(infos):\n            raise ValueError(f\"length of list {l} must muatch length of infos {infos}\")\n    result = []\n    for i, a in enumerate(l):\n        header = encode_header(a, infos[i % len(infos)])\n        result += [header, bytedata(a)]\n    return result\n\n\ndef decode_list(l, infos=False):  # noqa: E741\n    \"\"\"Given a list of byte arrays, decode them into arrays.\"\"\"\n    result = []\n    infos0 = []\n    for header, data in zip(l[::2], l[1::2]):\n        shape, dtype, info = decode_header(header)\n        a = np.frombuffer(data, dtype=dtype, count=np.prod(shape)).reshape(*shape)\n        result += [a]\n        infos0 += [info]\n    return check_infos(result, infos0, infos)\n\n\nmagic_str = \"~TenBin~\"\nmagic = str64(magic_str)\nmagic_bytes = unstr64(magic).encode(\"ascii\")\n\n\ndef roundup(n, k=64):\n    \"\"\"Round up to the next multiple of 64.\"\"\"\n    return k * ((n + k - 1) // k)\n\n\ndef encode_chunks(l):  # noqa: E741\n    \"\"\"Encode a list of chunks into a single byte array, with lengths and magics..\"\"\"\n    size = sum(16 + roundup(b.nbytes) for b in l)\n    result = bytearray(size)\n    offset = 0\n    for b in l:\n        result[offset : offset + 8] = magic_bytes\n        offset += 8\n        result[offset : offset + 8] = struct.pack(\"@q\", b.nbytes)\n        offset += 8\n        result[offset : offset + bytelen(b)] = b\n        offset += roundup(bytelen(b))\n    return result\n\n\ndef decode_chunks(buf):\n    \"\"\"Decode a byte array into a list of chunks.\"\"\"\n    result = []\n  
  offset = 0\n    total = bytelen(buf)\n    while offset < total:\n        if magic_bytes != buf[offset : offset + 8]:\n            raise ValueError(\"magic bytes mismatch\")\n        offset += 8\n        nbytes = struct.unpack(\"@q\", buf[offset : offset + 8])[0]\n        offset += 8\n        b = buf[offset : offset + nbytes]\n        offset += roundup(nbytes)\n        result.append(b)\n    return result\n\n\ndef encode_buffer(l, infos=None):  # noqa: E741\n    \"\"\"Encode a list of arrays into a single byte array.\"\"\"\n    if not isinstance(l, list):\n        raise ValueError(\"requires list\")\n    return encode_chunks(encode_list(l, infos=infos))\n\n\ndef decode_buffer(buf, infos=False):\n    \"\"\"Decode a byte array into a list of arrays.\"\"\"\n    return decode_list(decode_chunks(buf), infos=infos)\n\n\ndef write_chunk(stream, buf):\n    \"\"\"Write a byte chunk to the stream with magics, length, and padding.\"\"\"\n    nbytes = bytelen(buf)\n    stream.write(magic_bytes)\n    stream.write(struct.pack(\"@q\", nbytes))\n    stream.write(bytedata(buf))\n    padding = roundup(nbytes) - nbytes\n    if padding > 0:\n        stream.write(b\"\\0\" * padding)\n\n\ndef read_chunk(stream):\n    \"\"\"Read a byte chunk from a stream with magics, length, and padding.\"\"\"\n    magic = stream.read(8)\n    if magic == b\"\":\n        return None\n    if magic != magic_bytes:\n        raise ValueError(\"magic number does not match\")\n    nbytes = stream.read(8)\n    nbytes = struct.unpack(\"@q\", nbytes)[0]\n    if nbytes < 0:\n        raise ValueError(\"negative nbytes\")\n    data = stream.read(nbytes)\n    padding = roundup(nbytes) - nbytes\n    if padding > 0:\n        stream.read(padding)\n    return data\n\n\ndef write(stream, l, infos=None):  # noqa: E741\n    \"\"\"Write a list of arrays to a stream, with magics, length, and padding.\"\"\"\n    for chunk in encode_list(l, infos=infos):\n        write_chunk(stream, chunk)\n\n\ndef read(stream, n=sys.maxsize, 
infos=False):\n    \"\"\"Read a list of arrays from a stream, with magics, length, and padding.\"\"\"\n    chunks = []\n    for _ in range(n):\n        header = read_chunk(stream)\n        if header is None:\n            break\n        data = read_chunk(stream)\n        if data is None:\n            raise ValueError(\"premature EOF\")\n        chunks += [header, data]\n    return decode_list(chunks, infos=infos)\n\n\ndef save(fname, *args, infos=None, nocheck=False):\n    \"\"\"Save a list of arrays to a file, with magics, length, and padding.\"\"\"\n    if not nocheck and not fname.endswith(\".ten\"):\n        raise ValueError(\"file name should end in .ten\")\n    with open(fname, \"wb\") as stream:\n        write(stream, args, infos=infos)\n\n\ndef load(fname, infos=False, nocheck=False):\n    \"\"\"Read a list of arrays from a file, with magics, length, and padding.\"\"\"\n    if not nocheck and not fname.endswith(\".ten\"):\n        raise ValueError(\"file name should end in .ten\")\n    with open(fname, \"rb\") as stream:\n        return read(stream, infos=infos)\n"
  },
  {
    "path": "src/datasets/packaged_modules/webdataset/webdataset.py",
    "content": "import io\nimport json\nimport re\nfrom itertools import islice\nfrom typing import Any, Callable\n\nimport fsspec\nimport numpy as np\nimport pyarrow as pa\n\nimport datasets\nfrom datasets.builder import Key\nfrom datasets.features.features import cast_to_python_objects\nfrom datasets.utils.file_utils import SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL, xbasename\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\nclass WebDataset(datasets.GeneratorBasedBuilder):\n    DEFAULT_WRITER_BATCH_SIZE = 100\n    IMAGE_EXTENSIONS: list[str]  # definition at the bottom of the script\n    AUDIO_EXTENSIONS: list[str]  # definition at the bottom of the script\n    VIDEO_EXTENSIONS: list[str]  # definition at the bottom of the script\n    DECODERS: dict[str, Callable[[Any], Any]]  # definition at the bottom of the script\n    NUM_EXAMPLES_FOR_FEATURES_INFERENCE = 5\n\n    @classmethod\n    def _get_pipeline_from_tar(cls, tar_path, tar_iterator):\n        current_example = {}\n        fs: fsspec.AbstractFileSystem = fsspec.filesystem(\"memory\")\n        streaming_download_manager = datasets.StreamingDownloadManager()\n        for filename, f in tar_iterator:\n            example_key, field_name = base_plus_ext(filename)\n            if example_key is None:\n                continue\n            if current_example and current_example[\"__key__\"] != example_key:\n                # reposition some keys in last position\n                current_example[\"__key__\"] = current_example.pop(\"__key__\")\n                current_example[\"__url__\"] = current_example.pop(\"__url__\")\n                yield current_example\n                current_example = {}\n            current_example[\"__key__\"] = example_key\n            current_example[\"__url__\"] = tar_path\n            current_example[field_name] = f.read()\n            if field_name.split(\".\")[-1].lower() in SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL:\n                
fs.write_bytes(filename, current_example[field_name])\n                extracted_file_path = streaming_download_manager.extract(f\"memory://{filename}\")\n                with fsspec.open(extracted_file_path) as f:\n                    current_example[field_name] = f.read()\n                fs.delete(filename)\n                data_extension = xbasename(extracted_file_path).split(\".\")[-1].lower()\n            else:\n                data_extension = field_name.split(\".\")[-1].lower()\n            if data_extension in cls.DECODERS:\n                current_example[field_name] = cls.DECODERS[data_extension](current_example[field_name])\n        if current_example:\n            yield current_example\n\n    def _info(self) -> datasets.DatasetInfo:\n        return datasets.DatasetInfo()\n\n    def _split_generators(self, dl_manager):\n        \"\"\"We handle string, list and dicts in datafiles\"\"\"\n        # Download the data files\n        if not self.config.data_files:\n            raise ValueError(f\"At least one data file must be specified, but got data_files={self.config.data_files}\")\n        data_files = dl_manager.download(self.config.data_files)\n        splits = []\n        for split_name, tar_paths in data_files.items():\n            tar_iterators = [dl_manager.iter_archive(tar_path) for tar_path in tar_paths]\n            splits.append(\n                datasets.SplitGenerator(\n                    name=split_name, gen_kwargs={\"tar_paths\": tar_paths, \"tar_iterators\": tar_iterators}\n                )\n            )\n        if not self.info.features:\n            # Get one example to get the feature types\n            pipeline = self._get_pipeline_from_tar(tar_paths[0], tar_iterators[0])\n            first_examples = list(islice(pipeline, self.NUM_EXAMPLES_FOR_FEATURES_INFERENCE))\n            if any(example.keys() != first_examples[0].keys() for example in first_examples):\n                raise ValueError(\n                    \"The TAR archives 
of the dataset should be in WebDataset format, \"\n                    \"but the files in the archive don't share the same prefix or the same types.\"\n                )\n            pa_tables = [\n                pa.Table.from_pylist(cast_to_python_objects([example], only_1d_for_numpy=True))\n                for example in first_examples\n            ]\n            inferred_arrow_schema = pa.concat_tables(pa_tables, promote_options=\"default\").schema\n            features = datasets.Features.from_arrow_schema(inferred_arrow_schema)\n\n            for field_name in first_examples[0]:\n                extension = field_name.rsplit(\".\", 1)[-1].lower()\n                # Set Image types\n                if extension in self.IMAGE_EXTENSIONS:\n                    features[field_name] = datasets.Image()\n                # Set Audio types\n                if extension in self.AUDIO_EXTENSIONS:\n                    features[field_name] = datasets.Audio()\n                # Set Video types\n                if extension in self.VIDEO_EXTENSIONS:\n                    features[field_name] = datasets.Video()\n            self.info.features = features\n\n        return splits\n\n    def _generate_shards(self, tar_paths, tar_iterators):\n        yield from tar_paths\n\n    def _generate_examples(self, tar_paths, tar_iterators):\n        image_field_names = [\n            field_name for field_name, feature in self.info.features.items() if isinstance(feature, datasets.Image)\n        ]\n        audio_field_names = [\n            field_name for field_name, feature in self.info.features.items() if isinstance(feature, datasets.Audio)\n        ]\n        all_field_names = list(self.info.features.keys())\n        for tar_idx, (tar_path, tar_iterator) in enumerate(zip(tar_paths, tar_iterators)):\n            for example_idx, example in enumerate(self._get_pipeline_from_tar(tar_path, tar_iterator)):\n                for field_name in all_field_names:\n                    if field_name 
not in example:\n                        example[field_name] = None\n                for field_name in image_field_names + audio_field_names:\n                    if example[field_name] is not None:\n                        example[field_name] = {\n                            \"path\": example[\"__key__\"] + \".\" + field_name,\n                            \"bytes\": example[field_name],\n                        }\n                yield Key(tar_idx, example_idx), example\n\n\n# Source: https://github.com/webdataset/webdataset/blob/87bd5aa41602d57f070f65a670893ee625702f2f/webdataset/tariterators.py#L25\ndef base_plus_ext(path):\n    \"\"\"Split off all file extensions.\n\n    Returns base, allext.\n    \"\"\"\n    match = re.match(r\"^((?:.*/|)[^.]+)[.]([^/]*)$\", path)\n    if not match:\n        return None, None\n    return match.group(1), match.group(2)\n\n\n# Obtained with:\n# ```\n# import PIL.Image\n# IMAGE_EXTENSIONS = []\n# PIL.Image.init()\n# for ext, format in PIL.Image.EXTENSION.items():\n#     if format in PIL.Image.OPEN:\n#         IMAGE_EXTENSIONS.append(ext[1:])\n# ```\n# We intentionally do not run this code on launch because:\n# (1) Pillow is an optional dependency, so importing Pillow in global namespace is not allowed\n# (2) To ensure the list of supported extensions is deterministic\nIMAGE_EXTENSIONS = [\n    \"blp\",\n    \"bmp\",\n    \"dib\",\n    \"bufr\",\n    \"cur\",\n    \"pcx\",\n    \"dcx\",\n    \"dds\",\n    \"ps\",\n    \"eps\",\n    \"fit\",\n    \"fits\",\n    \"fli\",\n    \"flc\",\n    \"ftc\",\n    \"ftu\",\n    \"gbr\",\n    \"gif\",\n    \"grib\",\n    \"h5\",\n    \"hdf\",\n    \"png\",\n    \"apng\",\n    \"jp2\",\n    \"j2k\",\n    \"jpc\",\n    \"jpf\",\n    \"jpx\",\n    \"j2c\",\n    \"icns\",\n    \"ico\",\n    \"im\",\n    \"iim\",\n    \"tif\",\n    \"tiff\",\n    \"jfif\",\n    \"jpe\",\n    \"jpg\",\n    \"jpeg\",\n    \"mpg\",\n    \"mpeg\",\n    \"msp\",\n    \"pcd\",\n    \"pxr\",\n    \"pbm\",\n    \"pgm\",\n   
 \"ppm\",\n    \"pnm\",\n    \"psd\",\n    \"bw\",\n    \"rgb\",\n    \"rgba\",\n    \"sgi\",\n    \"ras\",\n    \"tga\",\n    \"icb\",\n    \"vda\",\n    \"vst\",\n    \"webp\",\n    \"wmf\",\n    \"emf\",\n    \"xbm\",\n    \"xpm\",\n]\nWebDataset.IMAGE_EXTENSIONS = IMAGE_EXTENSIONS\n\n\n# Obtained with:\n# ```\n# import soundfile as sf\n#\n# AUDIO_EXTENSIONS = [f\".{format.lower()}\" for format in sf.available_formats().keys()]\n#\n# # .opus decoding is supported if libsndfile >= 1.0.31:\n# AUDIO_EXTENSIONS.extend([\".mp3\", \".opus\"])\n# ```\n# We intentionally did not run this code on launch because:\n# (1) Soundfile was an optional dependency, so importing it in global namespace is not allowed\n# (2) To ensure the list of supported extensions is deterministic\n# (3) We use TorchCodec now anyways instead of Soundfile\nAUDIO_EXTENSIONS = [\n    \"aiff\",\n    \"au\",\n    \"avr\",\n    \"caf\",\n    \"flac\",\n    \"htk\",\n    \"svx\",\n    \"mat4\",\n    \"mat5\",\n    \"mpc2k\",\n    \"ogg\",\n    \"paf\",\n    \"pvf\",\n    \"raw\",\n    \"rf64\",\n    \"sd2\",\n    \"sds\",\n    \"ircam\",\n    \"voc\",\n    \"w64\",\n    \"wav\",\n    \"nist\",\n    \"wavex\",\n    \"wve\",\n    \"xi\",\n    \"mp3\",\n    \"opus\",\n]\nWebDataset.AUDIO_EXTENSIONS = AUDIO_EXTENSIONS\n\n\n# TODO: initial list, we should check the compatibility of other formats\nVIDEO_EXTENSIONS = [\n    \"mkv\",\n    \"mp4\",\n    \"avi\",\n    \"mpeg\",\n    \"mov\",\n]\nWebDataset.VIDEO_EXTENSIONS = VIDEO_EXTENSIONS\n\n\ndef text_loads(data: bytes):\n    return data.decode(\"utf-8\")\n\n\ndef tenbin_loads(data: bytes):\n    from . 
import _tenbin\n\n    return _tenbin.decode_buffer(data)\n\n\ndef msgpack_loads(data: bytes):\n    import msgpack\n\n    return msgpack.unpackb(data)\n\n\ndef npy_loads(data: bytes):\n    import numpy.lib.format\n\n    stream = io.BytesIO(data)\n    return numpy.lib.format.read_array(stream, allow_pickle=False)\n\n\ndef npz_loads(data: bytes):\n    return np.load(io.BytesIO(data), allow_pickle=False)\n\n\ndef cbor_loads(data: bytes):\n    import cbor\n\n    return cbor.loads(data)\n\n\ndef torch_loads(data: bytes):\n    import torch\n\n    return torch.load(io.BytesIO(data), weights_only=True)\n\n\n# Obtained by checking `decoders` in `webdataset.autodecode`\n# and removing unsafe extension decoders.\n# Removed Pickle decoders:\n# - \"pyd\": lambda data: pickle.loads(data)\n# - \"pickle\": lambda data: pickle.loads(data)\n# Modified NumPy decoders to fix CVE-2019-6446 (add allow_pickle=False and weights_only=True):\n# - \"npy\": npy_loads,\n# - \"npz\": lambda data: np.load(io.BytesIO(data)),\n# - \"pth\": lambda data: torch_loads(data)\nDECODERS = {\n    \"txt\": text_loads,\n    \"text\": text_loads,\n    \"transcript\": text_loads,\n    \"cls\": int,\n    \"cls2\": int,\n    \"index\": int,\n    \"inx\": int,\n    \"id\": int,\n    \"json\": json.loads,\n    \"jsn\": json.loads,\n    \"ten\": tenbin_loads,\n    \"tb\": tenbin_loads,\n    \"mp\": msgpack_loads,\n    \"msg\": msgpack_loads,\n    \"npy\": npy_loads,\n    \"npz\": npz_loads,\n    \"cbor\": cbor_loads,\n    \"pth\": torch_loads,\n}\nWebDataset.DECODERS = DECODERS\n"
  },
  {
    "path": "src/datasets/packaged_modules/xml/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/packaged_modules/xml/xml.py",
    "content": "from dataclasses import dataclass\nfrom typing import Optional\n\nimport pyarrow as pa\n\nimport datasets\nfrom datasets.features.features import require_storage_cast\nfrom datasets.table import table_cast\n\n\nlogger = datasets.utils.logging.get_logger(__name__)\n\n\n@dataclass\nclass XmlConfig(datasets.BuilderConfig):\n    \"\"\"BuilderConfig for xml files.\"\"\"\n\n    features: Optional[datasets.Features] = None\n    encoding: str = \"utf-8\"\n    encoding_errors: Optional[str] = None\n\n\nclass Xml(datasets.ArrowBasedBuilder):\n    BUILDER_CONFIG_CLASS = XmlConfig\n\n    def _info(self):\n        return datasets.DatasetInfo(features=self.config.features)\n\n    def _split_generators(self, dl_manager):\n        \"\"\"The `data_files` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]].\n\n        If str or List[str], then the dataset returns only the 'train' split.\n        If dict, then keys should be from the `datasets.Split` enum.\n        \"\"\"\n        if not self.config.data_files:\n            raise ValueError(f\"At least one data file must be specified, but got data_files={self.config.data_files}\")\n        dl_manager.download_config.extract_on_the_fly = True\n        data_files = dl_manager.download_and_extract(self.config.data_files)\n        splits = []\n        for split_name, files in data_files.items():\n            if isinstance(files, str):\n                files = [files]\n            files = [dl_manager.iter_files(file) for file in files]\n            splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={\"files\": files}))\n        return splits\n\n    def _cast_table(self, pa_table: pa.Table) -> pa.Table:\n        if self.config.features is not None:\n            schema = self.config.features.arrow_schema\n            if all(not require_storage_cast(feature) for feature in self.config.features.values()):\n                # cheaper cast\n                pa_table = 
pa_table.cast(schema)\n            else:\n                # more expensive cast; allows str <-> int/float or str to Audio for example\n                pa_table = table_cast(pa_table, schema)\n            return pa_table\n        else:\n            return pa_table.cast(pa.schema({\"xml\": pa.string()}))\n\n    def _generate_shards(self, files):\n        yield from files\n\n    def _generate_tables(self, files):\n        pa_table_names = list(self.config.features) if self.config.features is not None else [\"xml\"]\n        for file_idx, file in enumerate(files):\n            # open in text mode, by default translates universal newlines (\"\\n\", \"\\r\\n\" and \"\\r\") into \"\\n\"\n            with open(file, encoding=self.config.encoding, errors=self.config.encoding_errors) as f:\n                xml = f.read()\n                pa_table = pa.Table.from_arrays([pa.array([xml])], names=pa_table_names)\n                yield (file_idx, 0), self._cast_table(pa_table)\n"
  },
  {
    "path": "src/datasets/parallel/__init__.py",
    "content": "from .parallel import ParallelBackendConfig, parallel_backend, parallel_map\n"
  },
  {
    "path": "src/datasets/parallel/parallel.py",
    "content": "import contextlib\nfrom multiprocessing import Pool, RLock\n\nfrom tqdm.auto import tqdm\n\nfrom ..utils import experimental, logging\n\n\nlogger = logging.get_logger(__name__)\n\n\nclass ParallelBackendConfig:\n    backend_name = None\n\n\n@experimental\ndef parallel_map(function, iterable, num_proc, batched, batch_size, types, disable_tqdm, desc, single_map_nested_func):\n    \"\"\"\n    **Experimental.** Apply a function to iterable elements in parallel, where the implementation uses either\n    multiprocessing.Pool or joblib for parallelization.\n\n    Args:\n        function (`Callable[[Any], Any]`): Function to be applied to `iterable`.\n        iterable (`list`, `tuple` or `np.ndarray`): Iterable elements to apply function to.\n        num_proc (`int`): Number of processes (if no backend specified) or jobs (using joblib).\n        types (`tuple`): Additional types (besides `dict` values) to apply `function` recursively to their elements.\n        disable_tqdm (`bool`): Whether to disable the tqdm progressbar.\n        desc (`str`): Prefix for the tqdm progressbar.\n        single_map_nested_func (`Callable`): Map function that applies `function` to an element from `iterable`.\n            Takes a tuple of function, data_struct, types, rank, disable_tqdm, desc as input, where data_struct is an\n            element of `iterable`, and `rank` is used for progress bar.\n    \"\"\"\n    if ParallelBackendConfig.backend_name is None:\n        return _map_with_multiprocessing_pool(\n            function, iterable, num_proc, batched, batch_size, types, disable_tqdm, desc, single_map_nested_func\n        )\n\n    return _map_with_joblib(\n        function, iterable, num_proc, batched, batch_size, types, disable_tqdm, desc, single_map_nested_func\n    )\n\n\ndef _map_with_multiprocessing_pool(\n    function, iterable, num_proc, batched, batch_size, types, disable_tqdm, desc, single_map_nested_func\n):\n    num_proc = num_proc if num_proc <= 
len(iterable) else len(iterable)\n    split_kwds = []  # We organize the splits ourselves (contiguous splits)\n    for index in range(num_proc):\n        div = len(iterable) // num_proc\n        mod = len(iterable) % num_proc\n        start = div * index + min(index, mod)\n        end = start + div + (1 if index < mod else 0)\n        split_kwds.append((function, iterable[start:end], batched, batch_size, types, index, disable_tqdm, desc))\n\n    if len(iterable) != sum(len(i[1]) for i in split_kwds):\n        raise ValueError(\n            f\"Error dividing inputs iterable among processes. \"\n            f\"Total number of objects {len(iterable)}, \"\n            f\"length: {sum(len(i[1]) for i in split_kwds)}\"\n        )\n\n    logger.info(\n        f\"Spawning {num_proc} processes for {len(iterable)} objects in slices of {[len(i[1]) for i in split_kwds]}\"\n    )\n    initargs, initializer = None, None\n    if not disable_tqdm:\n        initargs, initializer = (RLock(),), tqdm.set_lock\n    with Pool(num_proc, initargs=initargs, initializer=initializer) as pool:\n        mapped = pool.map(single_map_nested_func, split_kwds)\n    logger.info(f\"Finished {num_proc} processes\")\n    mapped = [obj for proc_res in mapped for obj in proc_res]\n    logger.info(f\"Unpacked {len(mapped)} objects\")\n\n    return mapped\n\n\ndef _map_with_joblib(\n    function, iterable, num_proc, batched, batch_size, types, disable_tqdm, desc, single_map_nested_func\n):\n    # progress bar is not yet supported for _map_with_joblib, because tqdm couldn't accurately be applied to joblib,\n    # and it requires monkey-patching joblib internal classes which is subject to change\n    import joblib\n\n    with joblib.parallel_backend(ParallelBackendConfig.backend_name, n_jobs=num_proc):\n        return joblib.Parallel()(\n            joblib.delayed(single_map_nested_func)((function, obj, batched, batch_size, types, None, True, None))\n            for obj in iterable\n        
)\n\n\n@experimental\n@contextlib.contextmanager\ndef parallel_backend(backend_name: str):\n    \"\"\"\n    **Experimental.**  Configures the parallel backend for parallelized dataset loading, which uses the parallelization\n    implemented by joblib.\n\n    Args:\n        backend_name (str): Name of backend for parallelization implementation, has to be supported by joblib.\n\n     Example usage:\n     ```py\n     with parallel_backend('spark'):\n       dataset = load_dataset(..., num_proc=2)\n     ```\n    \"\"\"\n    ParallelBackendConfig.backend_name = backend_name\n\n    if backend_name == \"spark\":\n        from joblibspark import register_spark\n\n        register_spark()\n\n        # TODO: call create_cache_and_write_probe if \"download\" in steps\n        # TODO: raise NotImplementedError when Dataset.map etc is called\n\n    try:\n        yield\n    finally:\n        ParallelBackendConfig.backend_name = None\n"
  },
  {
    "path": "src/datasets/search.py",
    "content": "import importlib.util\nimport os\nimport tempfile\nfrom pathlib import PurePath\nfrom typing import TYPE_CHECKING, NamedTuple, Optional, Union\n\nimport fsspec\nimport numpy as np\n\nfrom .features import List\nfrom .utils import logging\nfrom .utils import tqdm as hf_tqdm\n\n\nif TYPE_CHECKING:\n    from .arrow_dataset import Dataset  # noqa: F401\n\n    try:\n        from elasticsearch import Elasticsearch  # noqa: F401\n\n    except ImportError:\n        pass\n    try:\n        import faiss  # noqa: F401\n\n    except ImportError:\n        pass\n\n_has_elasticsearch = importlib.util.find_spec(\"elasticsearch\") is not None\n_has_faiss = importlib.util.find_spec(\"faiss\") is not None\n\n\nlogger = logging.get_logger(__name__)\n\n\nclass MissingIndex(Exception):\n    pass\n\n\nclass SearchResults(NamedTuple):\n    scores: list[float]\n    indices: list[int]\n\n\nclass BatchedSearchResults(NamedTuple):\n    total_scores: list[list[float]]\n    total_indices: list[list[int]]\n\n\nclass NearestExamplesResults(NamedTuple):\n    scores: list[float]\n    examples: dict\n\n\nclass BatchedNearestExamplesResults(NamedTuple):\n    total_scores: list[list[float]]\n    total_examples: list[dict]\n\n\nclass BaseIndex:\n    \"\"\"Base class for indexing\"\"\"\n\n    def search(self, query, k: int = 10, **kwargs) -> SearchResults:\n        \"\"\"\n        To implement.\n        This method has to return the scores and the indices of the retrieved examples given a certain query.\n        \"\"\"\n        raise NotImplementedError\n\n    def search_batch(self, queries, k: int = 10, **kwargs) -> BatchedSearchResults:\n        \"\"\"Find the nearest examples indices to the query.\n\n        Args:\n            queries (`Union[List[str], np.ndarray]`): The queries as a list of strings if `column` is a text index or as a numpy array if `column` is a vector index.\n            k (`int`): The number of examples to retrieve per query.\n\n        Output:\n            
total_scores (`List[List[float]`): The retrieval scores of the retrieved examples per query.\n            total_indices (`List[List[int]]`): The indices of the retrieved examples per query.\n        \"\"\"\n        total_scores, total_indices = [], []\n        for query in queries:\n            scores, indices = self.search(query, k)\n            total_scores.append(scores)\n            total_indices.append(indices)\n        return BatchedSearchResults(total_scores, total_indices)\n\n    def save(self, file: Union[str, PurePath]):\n        \"\"\"Serialize the index on disk\"\"\"\n        raise NotImplementedError\n\n    @classmethod\n    def load(cls, file: Union[str, PurePath]) -> \"BaseIndex\":\n        \"\"\"Deserialize the index from disk\"\"\"\n        raise NotImplementedError\n\n\nclass ElasticSearchIndex(BaseIndex):\n    \"\"\"\n    Sparse index using Elasticsearch. It is used to index text and run queries based on BM25 similarity.\n    An Elasticsearch server needs to be accessible, and a python client is declared with\n    ```\n    es_client = Elasticsearch([{'host': 'localhost', 'port': '9200'}])\n    ```\n    for example.\n    \"\"\"\n\n    def __init__(\n        self,\n        host: Optional[str] = None,\n        port: Optional[int] = None,\n        es_client: Optional[\"Elasticsearch\"] = None,\n        es_index_name: Optional[str] = None,\n        es_index_config: Optional[dict] = None,\n    ):\n        if not _has_elasticsearch:\n            raise ImportError(\n                \"You must install ElasticSearch to use ElasticSearchIndex. 
To do so you can run `pip install elasticsearch==7.7.1 for example`\"\n            )\n        if es_client is not None and (host is not None or port is not None):\n            raise ValueError(\"Please specify either `es_client` or `(host, port)`, but not both.\")\n        host = host or \"localhost\"\n        port = port or 9200\n\n        import elasticsearch.helpers  # noqa: F401 - need this to properly load all the es features\n        from elasticsearch import Elasticsearch  # noqa: F811\n\n        self.es_client = es_client if es_client is not None else Elasticsearch([{\"host\": host, \"port\": str(port)}])\n        self.es_index_name = (\n            es_index_name\n            if es_index_name is not None\n            else \"huggingface_datasets_\" + os.path.basename(tempfile.NamedTemporaryFile().name)\n        )\n        self.es_index_config = (\n            es_index_config\n            if es_index_config is not None\n            else {\n                \"settings\": {\n                    \"number_of_shards\": 1,\n                    \"analysis\": {\"analyzer\": {\"stop_standard\": {\"type\": \"standard\", \" stopwords\": \"_english_\"}}},\n                },\n                \"mappings\": {\"properties\": {\"text\": {\"type\": \"text\", \"analyzer\": \"standard\", \"similarity\": \"BM25\"}}},\n            }\n        )\n\n    def add_documents(self, documents: Union[list[str], \"Dataset\"], column: Optional[str] = None):\n        \"\"\"\n        Add documents to the index.\n        If the documents are inside a certain column, you can specify it using the `column` argument.\n        \"\"\"\n        index_name = self.es_index_name\n        index_config = self.es_index_config\n        self.es_client.indices.create(index=index_name, body=index_config)\n        number_of_docs = len(documents)\n        progress = hf_tqdm(unit=\"docs\", total=number_of_docs)\n        successes = 0\n\n        def passage_generator():\n            if column is not None:\n          
      for i, example in enumerate(documents):\n                    yield {\"text\": example[column], \"_id\": i}\n            else:\n                for i, example in enumerate(documents):\n                    yield {\"text\": example, \"_id\": i}\n\n        # create the ES index\n        import elasticsearch as es\n\n        for ok, action in es.helpers.streaming_bulk(\n            client=self.es_client,\n            index=index_name,\n            actions=passage_generator(),\n        ):\n            progress.update(1)\n            successes += ok\n        if successes != len(documents):\n            logger.warning(\n                f\"Some documents failed to be added to ElasticSearch. Failures: {len(documents) - successes}/{len(documents)}\"\n            )\n        logger.info(f\"Indexed {successes:d} documents\")\n\n    def search(self, query: str, k=10, **kwargs) -> SearchResults:\n        \"\"\"Find the nearest examples indices to the query.\n\n        Args:\n            query (`str`): The query as a string.\n            k (`int`): The number of examples to retrieve.\n\n        Output:\n            scores (`List[List[float]`): The retrieval scores of the retrieved examples.\n            indices (`List[List[int]]`): The indices of the retrieved examples.\n        \"\"\"\n        response = self.es_client.search(\n            index=self.es_index_name,\n            body={\"query\": {\"multi_match\": {\"query\": query, \"fields\": [\"text\"], \"type\": \"cross_fields\"}}, \"size\": k},\n            **kwargs,\n        )\n        hits = response[\"hits\"][\"hits\"]\n        return SearchResults([hit[\"_score\"] for hit in hits], [int(hit[\"_id\"]) for hit in hits])\n\n    def search_batch(self, queries, k: int = 10, max_workers=10, **kwargs) -> BatchedSearchResults:\n        import concurrent.futures\n\n        total_scores, total_indices = [None] * len(queries), [None] * len(queries)\n        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as 
executor:\n            future_to_index = {executor.submit(self.search, query, k, **kwargs): i for i, query in enumerate(queries)}\n            for future in concurrent.futures.as_completed(future_to_index):\n                index = future_to_index[future]\n                results: SearchResults = future.result()\n                total_scores[index] = results.scores\n                total_indices[index] = results.indices\n        return BatchedSearchResults(total_indices=total_indices, total_scores=total_scores)\n\n\nclass FaissIndex(BaseIndex):\n    \"\"\"\n    Dense index using Faiss. It is used to index vectors.\n    Faiss is a library for efficient similarity search and clustering of dense vectors.\n    It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM.\n    You can find more information about Faiss here:\n    - For index types and the string factory: https://github.com/facebookresearch/faiss/wiki/The-index-factory\n    - For GPU settings: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU\n    \"\"\"\n\n    def __init__(\n        self,\n        device: Optional[Union[int, list[int]]] = None,\n        string_factory: Optional[str] = None,\n        metric_type: Optional[int] = None,\n        custom_index: Optional[\"faiss.Index\"] = None,\n    ):\n        \"\"\"\n        Create a Dense index using Faiss. You can specify `device` if you want to run it on GPU (`device` must be the GPU index).\n        You can find more information about Faiss here:\n        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory\n        \"\"\"\n        if string_factory is not None and custom_index is not None:\n            raise ValueError(\"Please specify either `string_factory` or `custom_index` but not both.\")\n        if device is not None and custom_index is not None:\n            raise ValueError(\n                \"Cannot pass both 'custom_index' and 'device'. 
\"\n                \"Pass 'custom_index' already transferred to the target device instead.\"\n            )\n        self.device = device\n        self.string_factory = string_factory\n        self.metric_type = metric_type\n        self.faiss_index = custom_index\n        if not _has_faiss:\n            raise ImportError(\n                \"You must install Faiss to use FaissIndex. To do so you can run `conda install -c pytorch faiss-cpu` or `conda install -c pytorch faiss-gpu`. \"\n                \"A community supported package is also available on pypi: `pip install faiss-cpu` or `pip install faiss-gpu`. \"\n                \"Note that pip may not have the latest version of FAISS, and thus, some of the latest features and bug fixes may not be available.\"\n            )\n\n    def add_vectors(\n        self,\n        vectors: Union[np.array, \"Dataset\"],\n        column: Optional[str] = None,\n        batch_size: int = 1000,\n        train_size: Optional[int] = None,\n        faiss_verbose: Optional[bool] = None,\n    ):\n        \"\"\"\n        Add vectors to the index.\n        If the arrays are inside a certain column, you can specify it using the `column` argument.\n        \"\"\"\n        import faiss  # noqa: F811\n\n        if column and not isinstance(vectors.features[column], List):\n            raise ValueError(\n                f\"Wrong feature type for column '{column}'. 
Expected 1d array, got {vectors.features[column]}\"\n            )\n\n        # Create index\n        if self.faiss_index is None:\n            size = len(vectors[0]) if column is None else len(vectors[0][column])\n            if self.string_factory is not None:\n                if self.metric_type is None:\n                    index = faiss.index_factory(size, self.string_factory)\n                else:\n                    index = faiss.index_factory(size, self.string_factory, self.metric_type)\n            else:\n                if self.metric_type is None:\n                    index = faiss.IndexFlat(size)\n                else:\n                    index = faiss.IndexFlat(size, self.metric_type)\n\n            self.faiss_index = self._faiss_index_to_device(index, self.device)\n            logger.info(f\"Created faiss index of type {type(self.faiss_index)}\")\n\n        # Set verbosity level\n        if faiss_verbose is not None:\n            self.faiss_index.verbose = faiss_verbose\n            if hasattr(self.faiss_index, \"index\") and self.faiss_index.index is not None:\n                self.faiss_index.index.verbose = faiss_verbose\n            if hasattr(self.faiss_index, \"quantizer\") and self.faiss_index.quantizer is not None:\n                self.faiss_index.quantizer.verbose = faiss_verbose\n            if hasattr(self.faiss_index, \"clustering_index\") and self.faiss_index.clustering_index is not None:\n                self.faiss_index.clustering_index.verbose = faiss_verbose\n\n        # Train\n        if train_size is not None:\n            train_vecs = vectors[:train_size] if column is None else vectors[:train_size][column]\n            logger.info(f\"Training the index with the first {len(train_vecs)} vectors\")\n            self.faiss_index.train(train_vecs)\n        else:\n            logger.info(\"Ignored the training step of the faiss index as `train_size` is None.\")\n\n        # Add vectors\n        logger.info(f\"Adding {len(vectors)} 
vectors to the faiss index\")\n        for i in hf_tqdm(range(0, len(vectors), batch_size)):\n            vecs = vectors[i : i + batch_size] if column is None else vectors[i : i + batch_size][column]\n            self.faiss_index.add(vecs)\n\n    @staticmethod\n    def _faiss_index_to_device(index: \"faiss.Index\", device: Optional[Union[int, list[int]]] = None) -> \"faiss.Index\":\n        \"\"\"\n        Sends a faiss index to a device.\n        A device can either be a positive integer (GPU id), a negative integer (all GPUs),\n            or a list of positive integers (select GPUs to use), or `None` for CPU.\n        \"\"\"\n\n        # If device is not specified, then it runs on CPU.\n        if device is None:\n            return index\n\n        import faiss  # noqa: F811\n\n        # If the device id is given as an integer\n        if isinstance(device, int):\n            # Positive integers are directly mapped to GPU ids\n            if device > -1:\n                faiss_res = faiss.StandardGpuResources()\n                index = faiss.index_cpu_to_gpu(faiss_res, device, index)\n            # And negative integers mean using all GPUs\n            else:\n                index = faiss.index_cpu_to_all_gpus(index)\n        # Device ids given as a list mean mapping to those devices specified.\n        elif isinstance(device, (list, tuple)):\n            index = faiss.index_cpu_to_gpus_list(index, gpus=list(device))\n        else:\n            raise TypeError(\n                f\"The argument type: {type(device)} is not expected. 
\"\n                + \"Please pass in either nothing, a positive int, a negative int, or a list of positive ints.\"\n            )\n\n        return index\n\n    def search(self, query: np.array, k=10, **kwargs) -> SearchResults:\n        \"\"\"Find the nearest examples indices to the query.\n\n        Args:\n            query (`np.array`): The query as a numpy array.\n            k (`int`): The number of examples to retrieve.\n\n        Output:\n            scores (`List[List[float]`): The retrieval scores of the retrieved examples.\n            indices (`List[List[int]]`): The indices of the retrieved examples.\n        \"\"\"\n        if len(query.shape) != 1 and (len(query.shape) != 2 or query.shape[0] != 1):\n            raise ValueError(\"Shape of query is incorrect, it has to be either a 1D array or 2D (1, N)\")\n\n        queries = query.reshape(1, -1)\n        if not queries.flags.c_contiguous:\n            queries = np.asarray(queries, order=\"C\")\n        scores, indices = self.faiss_index.search(queries, k, **kwargs)\n        return SearchResults(scores[0], indices[0].astype(int))\n\n    def search_batch(self, queries: np.array, k=10, **kwargs) -> BatchedSearchResults:\n        \"\"\"Find the nearest examples indices to the queries.\n\n        Args:\n            queries (`np.array`): The queries as a numpy array.\n            k (`int`): The number of examples to retrieve.\n\n        Output:\n            total_scores (`List[List[float]`): The retrieval scores of the retrieved examples per query.\n            total_indices (`List[List[int]]`): The indices of the retrieved examples per query.\n        \"\"\"\n        if len(queries.shape) != 2:\n            raise ValueError(\"Shape of query must be 2D\")\n        if not queries.flags.c_contiguous:\n            queries = np.asarray(queries, order=\"C\")\n        scores, indices = self.faiss_index.search(queries, k, **kwargs)\n        return BatchedSearchResults(scores, indices.astype(int))\n\n    def 
save(self, file: Union[str, PurePath], storage_options: Optional[dict] = None):\n        \"\"\"Serialize the FaissIndex on disk\"\"\"\n        import faiss  # noqa: F811\n\n        if self.device is not None and isinstance(self.device, (int, list, tuple)):\n            index = faiss.index_gpu_to_cpu(self.faiss_index)\n        else:\n            index = self.faiss_index\n\n        with fsspec.open(str(file), \"wb\", **(storage_options or {})) as f:\n            faiss.write_index(index, faiss.BufferedIOWriter(faiss.PyCallbackIOWriter(f.write)))\n\n    @classmethod\n    def load(\n        cls,\n        file: Union[str, PurePath],\n        device: Optional[Union[int, list[int]]] = None,\n        storage_options: Optional[dict] = None,\n    ) -> \"FaissIndex\":\n        \"\"\"Deserialize the FaissIndex from disk\"\"\"\n        import faiss  # noqa: F811\n\n        # Instances of FaissIndex is essentially just a wrapper for faiss indices.\n        faiss_index = cls(device=device)\n        with fsspec.open(str(file), \"rb\", **(storage_options or {})) as f:\n            index = faiss.read_index(faiss.BufferedIOReader(faiss.PyCallbackIOReader(f.read)))\n        faiss_index.faiss_index = faiss_index._faiss_index_to_device(index, faiss_index.device)\n        return faiss_index\n\n\nclass IndexableMixin:\n    \"\"\"Add indexing features to `datasets.Dataset`\"\"\"\n\n    def __init__(self):\n        self._indexes: dict[str, BaseIndex] = {}\n\n    def __len__(self):\n        raise NotImplementedError\n\n    def __getitem__(self, key):\n        raise NotImplementedError\n\n    def is_index_initialized(self, index_name: str) -> bool:\n        return index_name in self._indexes\n\n    def _check_index_is_initialized(self, index_name: str):\n        if not self.is_index_initialized(index_name):\n            raise MissingIndex(\n                f\"Index with index_name '{index_name}' not initialized yet. 
Please make sure that you call `add_faiss_index` or `add_elasticsearch_index` first.\"\n            )\n\n    def list_indexes(self) -> list[str]:\n        \"\"\"List the `index_name`/identifiers of all the attached indexes.\"\"\"\n        return list(self._indexes)\n\n    def get_index(self, index_name: str) -> BaseIndex:\n        \"\"\"Return the attached index registered under `index_name`.\n\n        Args:\n            index_name (`str`): Index name.\n\n        Returns:\n            [`BaseIndex`]\n        \"\"\"\n        self._check_index_is_initialized(index_name)\n        return self._indexes[index_name]\n\n    def add_faiss_index(\n        self,\n        column: str,\n        index_name: Optional[str] = None,\n        device: Optional[Union[int, list[int]]] = None,\n        string_factory: Optional[str] = None,\n        metric_type: Optional[int] = None,\n        custom_index: Optional[\"faiss.Index\"] = None,\n        batch_size: int = 1000,\n        train_size: Optional[int] = None,\n        faiss_verbose: bool = False,\n    ):\n        \"\"\"Add a dense index using Faiss for fast retrieval.\n        The index is created using the vectors of the specified column.\n        You can specify `device` if you want to run it on GPU (`device` must be the GPU index, see more below).\n        You can find more information about Faiss here:\n        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory\n\n        Args:\n            column (`str`): The column of the vectors to add to the index.\n            index_name (Optional `str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.\n                By default it corresponds to `column`.\n            device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. 
If negative integer, use all GPUs.\n                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.\n            string_factory (Optional `str`): This is passed to the index factory of Faiss to create the index. Default index class is IndexFlatIP.\n            metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.\n            custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs.\n            batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.\n                <Added version=\"2.4.0\"/>\n            train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index.\n            faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.\n        \"\"\"\n        index_name = index_name if index_name is not None else column\n        faiss_index = FaissIndex(\n            device=device, string_factory=string_factory, metric_type=metric_type, custom_index=custom_index\n        )\n        faiss_index.add_vectors(\n            self, column=column, batch_size=batch_size, train_size=train_size, faiss_verbose=faiss_verbose\n        )\n        self._indexes[index_name] = faiss_index\n\n    def add_faiss_index_from_external_arrays(\n        self,\n        external_arrays: np.array,\n        index_name: str,\n        device: Optional[Union[int, list[int]]] = None,\n        string_factory: Optional[str] = None,\n        metric_type: Optional[int] = None,\n        custom_index: Optional[\"faiss.Index\"] = None,\n        batch_size: int = 1000,\n        train_size: Optional[int] = None,\n        faiss_verbose: bool = False,\n    ):\n        \"\"\"Add a dense index using Faiss for fast retrieval.\n        The index is created using the vectors of `external_arrays`.\n        
You can specify `device` if you want to run it on GPU (`device` must be the GPU index).\n        You can find more information about Faiss here:\n        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory\n\n        Args:\n            external_arrays (`np.array`): If you want to use arrays from outside the lib for the index, you can set `external_arrays`.\n                It will use `external_arrays` to create the Faiss index instead of the arrays in the given `column`.\n            index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.\n            device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.\n                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.\n            string_factory (Optional `str`): This is passed to the index factory of Faiss to create the index. Default index class is IndexFlatIP.\n            metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.\n            custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs.\n            batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. 
Default value is 1000.\n                <Added version=\"2.4.0\"/>\n            train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index.\n            faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.\n        \"\"\"\n        faiss_index = FaissIndex(\n            device=device, string_factory=string_factory, metric_type=metric_type, custom_index=custom_index\n        )\n        faiss_index.add_vectors(\n            external_arrays, column=None, batch_size=batch_size, train_size=train_size, faiss_verbose=faiss_verbose\n        )\n        self._indexes[index_name] = faiss_index\n\n    def save_faiss_index(self, index_name: str, file: Union[str, PurePath], storage_options: Optional[dict] = None):\n        \"\"\"Save a FaissIndex on disk.\n\n        Args:\n            index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.\n            file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. 
`\"s3://my-bucket/index.faiss\"`).\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n\n                <Added version=\"2.11.0\"/>\n\n        \"\"\"\n        index = self.get_index(index_name)\n        if not isinstance(index, FaissIndex):\n            raise ValueError(f\"Index '{index_name}' is not a FaissIndex but a '{type(index)}'\")\n        index.save(file, storage_options=storage_options)\n        logger.info(f\"Saved FaissIndex {index_name} at {file}\")\n\n    def load_faiss_index(\n        self,\n        index_name: str,\n        file: Union[str, PurePath],\n        device: Optional[Union[int, list[int]]] = None,\n        storage_options: Optional[dict] = None,\n    ):\n        \"\"\"Load a FaissIndex from disk.\n\n        If you want to do additional configurations, you can have access to the faiss index object by doing\n        `.get_index(index_name).faiss_index` to make it fit your needs.\n\n        Args:\n            index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to\n                call `.get_nearest` or `.search`.\n            file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `\"s3://my-bucket/index.faiss\"`).\n            device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.\n                If a list of positive integers is passed in, run only on those GPUs. 
By default it uses the CPU.\n            storage_options (`dict`, *optional*):\n                Key/value pairs to be passed on to the file-system backend, if any.\n\n                <Added version=\"2.11.0\"/>\n\n        \"\"\"\n        index = FaissIndex.load(file, device=device, storage_options=storage_options)\n        if index.faiss_index.ntotal != len(self):\n            raise ValueError(\n                f\"Index size should match Dataset size, but Index '{index_name}' at {file} has {index.faiss_index.ntotal} elements while the dataset has {len(self)} examples.\"\n            )\n        self._indexes[index_name] = index\n        logger.info(f\"Loaded FaissIndex {index_name} from {file}\")\n\n    def add_elasticsearch_index(\n        self,\n        column: str,\n        index_name: Optional[str] = None,\n        host: Optional[str] = None,\n        port: Optional[int] = None,\n        es_client: Optional[\"Elasticsearch\"] = None,\n        es_index_name: Optional[str] = None,\n        es_index_config: Optional[dict] = None,\n    ):\n        \"\"\"Add a text index using ElasticSearch for fast retrieval.\n\n        Args:\n            column (`str`): The column of the documents to add to the index.\n            index_name (Optional `str`): The index_name/identifier of the index. 
This is the index name that is used to call `.get_nearest` or `.search`.\n                By default it corresponds to `column`.\n            host (Optional `str`, defaults to localhost):\n                host of where ElasticSearch is running\n            port (Optional `str`, defaults to 9200):\n                port of where ElasticSearch is running\n            es_client (Optional `elasticsearch.Elasticsearch`):\n                The elasticsearch client used to create the index if host and port are None.\n            es_index_name (Optional `str`): The elasticsearch index name used to create the index.\n            es_index_config (Optional `dict`):\n                The configuration of the elasticsearch index.\n                Default config is:\n\n        Config::\n\n            {\n                \"settings\": {\n                    \"number_of_shards\": 1,\n                    \"analysis\": {\"analyzer\": {\"stop_standard\": {\"type\": \"standard\", \" stopwords\": \"_english_\"}}},\n                },\n                \"mappings\": {\n                    \"properties\": {\n                        \"text\": {\n                            \"type\": \"text\",\n                            \"analyzer\": \"standard\",\n                            \"similarity\": \"BM25\"\n                        },\n                    }\n                },\n            }\n        \"\"\"\n        index_name = index_name if index_name is not None else column\n        es_index = ElasticSearchIndex(\n            host=host, port=port, es_client=es_client, es_index_name=es_index_name, es_index_config=es_index_config\n        )\n        es_index.add_documents(self, column=column)\n        self._indexes[index_name] = es_index\n\n    def load_elasticsearch_index(\n        self,\n        index_name: str,\n        es_index_name: str,\n        host: Optional[str] = None,\n        port: Optional[int] = None,\n        es_client: Optional[\"Elasticsearch\"] = None,\n        es_index_config: 
Optional[dict] = None,\n    ):\n        \"\"\"Load an existing text index using ElasticSearch for fast retrieval.\n\n        Args:\n            index_name (`str`):\n                The `index_name`/identifier of the index. This is the index name that is used to call `get_nearest` or `search`.\n            es_index_name (`str`):\n                The name of elasticsearch index to load.\n            host (`str`, *optional*, defaults to `localhost`):\n                Host of where ElasticSearch is running.\n            port (`str`, *optional*, defaults to `9200`):\n                Port of where ElasticSearch is running.\n            es_client (`elasticsearch.Elasticsearch`, *optional*):\n                The elasticsearch client used to create the index if host and port are `None`.\n            es_index_config (`dict`, *optional*):\n                The configuration of the elasticsearch index.\n                Default config is:\n                    ```\n                    {\n                        \"settings\": {\n                            \"number_of_shards\": 1,\n                            \"analysis\": {\"analyzer\": {\"stop_standard\": {\"type\": \"standard\", \" stopwords\": \"_english_\"}}},\n                        },\n                        \"mappings\": {\n                            \"properties\": {\n                                \"text\": {\n                                    \"type\": \"text\",\n                                    \"analyzer\": \"standard\",\n                                    \"similarity\": \"BM25\"\n                                },\n                            }\n                        },\n                    }\n                    ```\n        \"\"\"\n        self._indexes[index_name] = ElasticSearchIndex(\n            host=host, port=port, es_client=es_client, es_index_name=es_index_name, es_index_config=es_index_config\n        )\n\n    def drop_index(self, index_name: str):\n        \"\"\"Drop the index with the 
specified name.\n\n        Args:\n            index_name (`str`):\n                The `index_name`/identifier of the index.\n        \"\"\"\n        del self._indexes[index_name]\n\n    def search(self, index_name: str, query: Union[str, np.array], k: int = 10, **kwargs) -> SearchResults:\n        \"\"\"Find the nearest examples indices in the dataset to the query.\n\n        Args:\n            index_name (`str`):\n                The name/identifier of the index.\n            query (`Union[str, np.ndarray]`):\n                The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.\n            k (`int`):\n                The number of examples to retrieve.\n\n        Returns:\n            `(scores, indices)`:\n                A tuple of `(scores, indices)` where:\n                - **scores** (`List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples\n                - **indices** (`List[int]`): the indices of the retrieved examples\n        \"\"\"\n        self._check_index_is_initialized(index_name)\n        return self._indexes[index_name].search(query, k, **kwargs)\n\n    def search_batch(\n        self, index_name: str, queries: Union[list[str], np.array], k: int = 10, **kwargs\n    ) -> BatchedSearchResults:\n        \"\"\"Find the nearest examples indices in the dataset to the queries.\n\n        Args:\n            index_name (`str`):\n                The `index_name`/identifier of the index.\n            queries (`Union[List[str], np.ndarray]`):\n                The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.\n            k (`int`):\n                The number of examples to retrieve per query.\n\n        Returns:\n            `(total_scores, total_indices)`:\n                A tuple of `(total_scores, total_indices)` where:\n                - 
**total_scores** (`List[List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples per query\n                - **total_indices** (`List[List[int]]`): the indices of the retrieved examples per query\n        \"\"\"\n        self._check_index_is_initialized(index_name)\n        return self._indexes[index_name].search_batch(queries, k, **kwargs)\n\n    def get_nearest_examples(\n        self, index_name: str, query: Union[str, np.array], k: int = 10, **kwargs\n    ) -> NearestExamplesResults:\n        \"\"\"Find the nearest examples in the dataset to the query.\n\n        Args:\n            index_name (`str`):\n                The index_name/identifier of the index.\n            query (`Union[str, np.ndarray]`):\n                The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.\n            k (`int`):\n                The number of examples to retrieve.\n\n        Returns:\n            `(scores, examples)`:\n                A tuple of `(scores, examples)` where:\n                - **scores** (`List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples\n                - **examples** (`dict`): the retrieved examples\n        \"\"\"\n        self._check_index_is_initialized(index_name)\n        scores, indices = self.search(index_name, query, k, **kwargs)\n        top_indices = [i for i in indices if i >= 0]\n        return NearestExamplesResults(scores[: len(top_indices)], self[top_indices])\n\n    def get_nearest_examples_batch(\n        self, index_name: str, queries: Union[list[str], np.array], k: int = 10, **kwargs\n    ) -> BatchedNearestExamplesResults:\n        \"\"\"Find the nearest examples in the dataset to the query.\n\n        Args:\n            index_name (`str`):\n                The `index_name`/identifier of the index.\n            queries (`Union[List[str], 
np.ndarray]`):\n                The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.\n            k (`int`):\n                The number of examples to retrieve per query.\n\n        Returns:\n            `(total_scores, total_examples)`:\n                A tuple of `(total_scores, total_examples)` where:\n                - **total_scores** (`List[List[float]]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples per query\n                - **total_examples** (`List[dict]`): the retrieved examples per query\n        \"\"\"\n        self._check_index_is_initialized(index_name)\n        total_scores, total_indices = self.search_batch(index_name, queries, k, **kwargs)\n        total_scores = [\n            scores_i[: len([i for i in indices_i if i >= 0])]\n            for scores_i, indices_i in zip(total_scores, total_indices)\n        ]\n        total_samples = [self[[i for i in indices if i >= 0]] for indices in total_indices]\n        return BatchedNearestExamplesResults(total_scores, total_samples)\n"
  },
  {
    "path": "src/datasets/splits.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"Splits related API.\"\"\"\n\nimport abc\nimport collections\nimport copy\nimport dataclasses\nimport re\nfrom dataclasses import dataclass\nfrom typing import Optional, Union\n\nfrom .arrow_reader import FileInstructions, make_file_instructions\nfrom .naming import _split_re\nfrom .utils.py_utils import NonMutableDict, asdict\n\n\n@dataclass\nclass SplitInfo:\n    name: str = dataclasses.field(default=\"\", metadata={\"include_in_asdict_even_if_is_default\": True})\n    num_bytes: int = dataclasses.field(default=0, metadata={\"include_in_asdict_even_if_is_default\": True})\n    num_examples: int = dataclasses.field(default=0, metadata={\"include_in_asdict_even_if_is_default\": True})\n    shard_lengths: Optional[list[int]] = None\n    original_shard_lengths: Optional[list[int]] = None\n\n    # Deprecated\n    # For backward compatibility, this field needs to always be included in files like\n    # dataset_infos.json and dataset_info.json files\n    # To do so, we always include it in the output of datasets.utils.py_utils.asdict(split_info)\n    dataset_name: Optional[str] = dataclasses.field(\n        default=None, metadata={\"include_in_asdict_even_if_is_default\": True}\n    )\n\n    @property\n    def file_instructions(self):\n        \"\"\"Returns the list of dict(filename, take, 
skip).\"\"\"\n        # `self.dataset_name` is assigned in `SplitDict.add()`.\n        instructions = make_file_instructions(\n            name=self.dataset_name,\n            split_infos=[self],\n            instruction=str(self.name),\n        )\n        return instructions.file_instructions\n\n\n@dataclass\nclass SubSplitInfo:\n    \"\"\"Wrapper around a sub split info.\n    This class exposes info on the subsplit:\n    ```\n    ds, info = datasets.load_dataset(..., split='train[75%:]', with_info=True)\n    info.splits['train[75%:]'].num_examples\n    ```\n    \"\"\"\n\n    instructions: FileInstructions\n\n    @property\n    def num_examples(self):\n        \"\"\"Returns the number of examples in the subsplit.\"\"\"\n        return self.instructions.num_examples\n\n    @property\n    def file_instructions(self):\n        \"\"\"Returns the list of dict(filename, take, skip).\"\"\"\n        return self.instructions.file_instructions\n\n\nclass SplitBase(metaclass=abc.ABCMeta):\n    # pylint: disable=line-too-long\n    \"\"\"Abstract base class for Split compositionality.\n\n    See the\n    [guide on splits](../loading#slice-splits)\n    for more information.\n\n    There are three parts to the composition:\n        1) The splits are composed (defined, merged, split,...) together before\n             calling the `.as_dataset()` function. This is done with the `__add__`,\n             `__getitem__`, which return a tree of `SplitBase` (whose leaf\n             are the `NamedSplit` objects)\n\n        ```\n        split = datasets.Split.TRAIN + datasets.Split.TEST.subsplit(datasets.percent[:50])\n        ```\n\n        2) The `SplitBase` is forwarded to the `.as_dataset()` function\n             to be resolved into actual read instruction. This is done by the\n             `.get_read_instruction()` method which takes the real dataset splits\n             (name, number of shards,...) 
and parse the tree to return a\n             `SplitReadInstruction()` object\n\n        ```\n        read_instruction = split.get_read_instruction(self.info.splits)\n        ```\n\n        3) The `SplitReadInstruction` is then used in the `tf.data.Dataset` pipeline\n             to define which files to read and how to skip examples within file.\n\n    \"\"\"\n\n    # pylint: enable=line-too-long\n\n    @abc.abstractmethod\n    def get_read_instruction(self, split_dict):\n        \"\"\"Parse the descriptor tree and compile all read instructions together.\n\n        Args:\n            split_dict: `dict`, The `dict[split_name, SplitInfo]` of the dataset\n\n        Returns:\n            split_read_instruction: `SplitReadInstruction`\n        \"\"\"\n        raise NotImplementedError(\"Abstract method\")\n\n    def __eq__(self, other):\n        \"\"\"Equality: datasets.Split.TRAIN == 'train'.\"\"\"\n        if isinstance(other, (NamedSplit, str)):\n            return False\n        raise NotImplementedError(\"Equality is not implemented between merged/sub splits.\")\n\n    def __ne__(self, other):\n        \"\"\"InEquality: datasets.Split.TRAIN != 'test'.\"\"\"\n        return not self.__eq__(other)\n\n    def __add__(self, other):\n        \"\"\"Merging: datasets.Split.TRAIN + datasets.Split.TEST.\"\"\"\n        return _SplitMerged(self, other)\n\n    def subsplit(self, arg=None, k=None, percent=None, weighted=None):  # pylint: disable=redefined-outer-name\n        \"\"\"Divides this split into subsplits.\n\n        There are 3 ways to define subsplits, which correspond to the 3\n        arguments `k` (get `k` even subsplits), `percent` (get a slice of the\n        dataset with `datasets.percent`), and `weighted` (get subsplits with proportions\n        specified by `weighted`).\n\n        Example::\n\n        ```\n        # 50% train, 50% test\n        train, test = split.subsplit(k=2)\n        # 50% train, 25% test, 25% validation\n        train, test, validation = 
split.subsplit(weighted=[2, 1, 1])\n        # Extract last 20%\n        subsplit = split.subsplit(datasets.percent[-20:])\n        ```\n\n        Warning: k and weighted will be converted into percent which mean that\n        values below the percent will be rounded up or down. The final split may be\n        bigger to deal with remainders. For instance:\n\n        ```\n        train, test, valid = split.subsplit(k=3)  # 33%, 33%, 34%\n        s1, s2, s3, s4 = split.subsplit(weighted=[2, 2, 1, 1])  # 33%, 33%, 16%, 18%\n        ```\n\n        Args:\n            arg: If no kwargs are given, `arg` will be interpreted as one of\n                `k`, `percent`, or `weighted` depending on the type.\n                For example:\n                ```\n                split.subsplit(10)  # Equivalent to split.subsplit(k=10)\n                split.subsplit(datasets.percent[:-20])  # percent=datasets.percent[:-20]\n                split.subsplit([1, 1, 2])  # weighted=[1, 1, 2]\n                ```\n            k: `int` If set, subdivide the split into `k` equal parts.\n            percent: `datasets.percent slice`, return a single subsplit corresponding to\n                a slice of the original split. For example:\n                `split.subsplit(datasets.percent[-20:])  # Last 20% of the dataset`.\n            weighted: `list[int]`, return a list of subsplits whose proportions match\n                the normalized sum of the list. For example:\n                `split.subsplit(weighted=[1, 1, 2])  # 25%, 25%, 50%`.\n\n        Returns:\n            A subsplit or list of subsplits extracted from this split object.\n        \"\"\"\n        # Note that the percent kwargs redefine the outer name datasets.percent. 
This\n        # is done for consistency (.subsplit(percent=datasets.percent[:40]))\n        if sum(bool(x) for x in (arg, k, percent, weighted)) != 1:\n            raise ValueError(\"Only one argument of subsplit should be set.\")\n\n        # Auto deduce k\n        if isinstance(arg, int):\n            k = arg\n        elif isinstance(arg, slice):\n            percent = arg\n        elif isinstance(arg, list):\n            weighted = arg\n\n        if not (k or percent or weighted):\n            raise ValueError(\n                f\"Invalid split argument {arg}. Only list, slice and int supported. \"\n                \"One of k, weighted or percent should be set to a non empty value.\"\n            )\n\n        def assert_slices_coverage(slices):\n            # Ensure that the expanded slices cover all percents.\n            assert sum((list(range(*s.indices(100))) for s in slices), []) == list(range(100))\n\n        if k:\n            if not 0 < k <= 100:\n                raise ValueError(f\"Subsplit k should be between 0 and 100, got {k}\")\n            shift = 100 // k\n            slices = [slice(i * shift, (i + 1) * shift) for i in range(k)]\n            # Round up last element to ensure all elements are taken\n            slices[-1] = slice(slices[-1].start, 100)\n            # Internal check to ensure full coverage\n            assert_slices_coverage(slices)\n            return tuple(_SubSplit(self, s) for s in slices)\n        elif percent:\n            return _SubSplit(self, percent)\n        elif weighted:\n            # Normalize the weighted sum\n            total = sum(weighted)\n            weighted = [100 * x // total for x in weighted]\n            # Create the slice for each of the elements\n            start = 0\n            stop = 0\n            slices = []\n            for v in weighted:\n                stop += v\n                slices.append(slice(start, stop))\n                start = stop\n            # Round up last element to ensure all 
elements are taken\n            slices[-1] = slice(slices[-1].start, 100)\n            # Internal check to ensure full coverage\n            assert_slices_coverage(slices)\n            return tuple(_SubSplit(self, s) for s in slices)\n        else:\n            # Should not be possible\n            raise ValueError(\"Could not determine the split\")\n\n\n# 2 requirements:\n# 1. datasets.percent be sliceable\n# 2. datasets.percent be documented\n#\n# Instances are not documented, so we want datasets.percent to be a class, but to\n# have it be sliceable, we need this metaclass.\nclass PercentSliceMeta(type):\n    def __getitem__(cls, slice_value):\n        if not isinstance(slice_value, slice):\n            raise ValueError(f\"datasets.percent should only be called with slice, not {slice_value}\")\n        return slice_value\n\n\nclass PercentSlice(metaclass=PercentSliceMeta):\n    # pylint: disable=line-too-long\n    \"\"\"Syntactic sugar for defining slice subsplits: `datasets.percent[75:-5]`.\n\n    See the\n    [guide on splits](../loading#slice-splits)\n    for more information.\n    \"\"\"\n\n    # pylint: enable=line-too-long\n    pass\n\n\npercent = PercentSlice  # pylint: disable=invalid-name\n\n\nclass _SplitMerged(SplitBase):\n    \"\"\"Represent two split descriptors merged together.\"\"\"\n\n    def __init__(self, split1, split2):\n        self._split1 = split1\n        self._split2 = split2\n\n    def get_read_instruction(self, split_dict):\n        read_instruction1 = self._split1.get_read_instruction(split_dict)\n        read_instruction2 = self._split2.get_read_instruction(split_dict)\n        return read_instruction1 + read_instruction2\n\n    def __repr__(self):\n        return f\"({repr(self._split1)} + {repr(self._split2)})\"\n\n\nclass _SubSplit(SplitBase):\n    \"\"\"Represent a sub split of a split descriptor.\"\"\"\n\n    def __init__(self, split, slice_value):\n        self._split = split\n        self._slice_value = slice_value\n\n    def 
get_read_instruction(self, split_dict):\n        return self._split.get_read_instruction(split_dict)[self._slice_value]\n\n    def __repr__(self):\n        slice_str = \"{start}:{stop}\"\n        if self._slice_value.step is not None:\n            slice_str += \":{step}\"\n        slice_str = slice_str.format(\n            start=\"\" if self._slice_value.start is None else self._slice_value.start,\n            stop=\"\" if self._slice_value.stop is None else self._slice_value.stop,\n            step=self._slice_value.step,\n        )\n        return f\"{repr(self._split)}(datasets.percent[{slice_str}])\"\n\n\nclass NamedSplit(SplitBase):\n    \"\"\"Descriptor corresponding to a named split (train, test, ...).\n\n    Example:\n        Each descriptor can be composed with other using addition or slice:\n\n            ```py\n            split = datasets.Split.TRAIN.subsplit(datasets.percent[0:25]) + datasets.Split.TEST\n            ```\n\n        The resulting split will correspond to 25% of the train split merged with\n        100% of the test split.\n\n        A split cannot be added twice, so the following will fail:\n\n            ```py\n            split = (\n                    datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +\n                    datasets.Split.TRAIN.subsplit(datasets.percent[75:])\n            )  # Error\n            split = datasets.Split.TEST + datasets.Split.ALL  # Error\n            ```\n\n        The slices can be applied only one time. 
So the following are valid:\n\n            ```py\n            split = (\n                    datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +\n                    datasets.Split.TEST.subsplit(datasets.percent[:50])\n            )\n            split = (datasets.Split.TRAIN + datasets.Split.TEST).subsplit(datasets.percent[:50])\n            ```\n\n        But this is not valid:\n\n            ```py\n            train = datasets.Split.TRAIN\n            test = datasets.Split.TEST\n            split = train.subsplit(datasets.percent[:25]).subsplit(datasets.percent[:25])\n            split = (train.subsplit(datasets.percent[:25]) + test).subsplit(datasets.percent[:50])\n            ```\n    \"\"\"\n\n    def __init__(self, name: str):\n        self._name = name\n        split_names_from_instruction = [split_instruction.split(\"[\")[0] for split_instruction in name.split(\"+\")]\n        for split_name in split_names_from_instruction:\n            if not re.match(_split_re, split_name):\n                raise ValueError(f\"Split name should match '{_split_re}' but got '{split_name}'.\")\n\n    def __str__(self):\n        return self._name\n\n    def __repr__(self):\n        return f\"NamedSplit({self._name!r})\"\n\n    def __eq__(self, other):\n        \"\"\"Equality: datasets.Split.TRAIN == 'train'.\"\"\"\n        if isinstance(other, NamedSplit):\n            return self._name == other._name  # pylint: disable=protected-access\n        elif isinstance(other, SplitBase):\n            return False\n        elif isinstance(other, str):  # Other should be string\n            return self._name == other\n        else:\n            return False\n\n    def __lt__(self, other):\n        return self._name < other._name  # pylint: disable=protected-access\n\n    def __hash__(self):\n        return hash(self._name)\n\n    def get_read_instruction(self, split_dict):\n        return SplitReadInstruction(split_dict[self._name])\n\n\nclass NamedSplitAll(NamedSplit):\n    
\"\"\"Split corresponding to the union of all defined dataset splits.\"\"\"\n\n    def __init__(self):\n        super().__init__(\"all\")\n\n    def __repr__(self):\n        return \"NamedSplitAll()\"\n\n    def get_read_instruction(self, split_dict):\n        # Merge all dataset splits together\n        read_instructions = [SplitReadInstruction(s) for s in split_dict.values()]\n        return sum(read_instructions, SplitReadInstruction())\n\n\nclass Split:\n    # pylint: disable=line-too-long\n    \"\"\"`Enum` for dataset splits.\n\n    Datasets are typically split into different subsets to be used at various\n    stages of training and evaluation.\n\n    - `TRAIN`: the training data.\n    - `VALIDATION`: the validation data. If present, this is typically used as\n      evaluation data while iterating on a model (e.g. changing hyperparameters,\n      model architecture, etc.).\n    - `TEST`: the testing data. This is the data to report metrics on. Typically\n      you do not want to use this during model iteration as you may overfit to it.\n    - `ALL`: the union of all defined dataset splits.\n\n    All splits, including compositions inherit from `datasets.SplitBase`.\n\n    See the [guide](../load_hub#splits) on splits for more information.\n\n    Example:\n\n    ```py\n    >>> datasets.SplitGenerator(\n    ...     name=datasets.Split.TRAIN,\n    ...     gen_kwargs={\"split_key\": \"train\", \"files\": dl_manager.download_and_extract(url)},\n    ... ),\n    ... datasets.SplitGenerator(\n    ...     name=datasets.Split.VALIDATION,\n    ...     gen_kwargs={\"split_key\": \"validation\", \"files\": dl_manager.download_and_extract(url)},\n    ... ),\n    ... datasets.SplitGenerator(\n    ...     name=datasets.Split.TEST,\n    ...     gen_kwargs={\"split_key\": \"test\", \"files\": dl_manager.download_and_extract(url)},\n    ... 
)\n    ```\n    \"\"\"\n\n    # pylint: enable=line-too-long\n    TRAIN = NamedSplit(\"train\")\n    TEST = NamedSplit(\"test\")\n    VALIDATION = NamedSplit(\"validation\")\n    ALL = NamedSplitAll()\n\n    def __new__(cls, name):\n        \"\"\"Create a custom split with datasets.Split('custom_name').\"\"\"\n        return NamedSplitAll() if name == \"all\" else NamedSplit(name)\n\n\n# Similar to SplitInfo, but contain an additional slice info\nSlicedSplitInfo = collections.namedtuple(\n    \"SlicedSplitInfo\",\n    [\n        \"split_info\",\n        \"slice_value\",\n    ],\n)  # noqa: E231\n\n\nclass SplitReadInstruction:\n    \"\"\"Object containing the reading instruction for the dataset.\n\n    Similarly to `SplitDescriptor` nodes, this object can be composed with itself,\n    but the resolution happens instantaneously, instead of keeping track of the\n    tree, such as all instructions are compiled and flattened in a single\n    SplitReadInstruction object containing the list of files and slice to use.\n\n    Once resolved, the instructions can be accessed with:\n\n    ```\n    read_instructions.get_list_sliced_split_info()  # List of splits to use\n    ```\n\n    \"\"\"\n\n    def __init__(self, split_info=None):\n        self._splits = NonMutableDict(error_msg=\"Overlap between splits. 
Split {key} has been added with itself.\")\n\n        if split_info:\n            self.add(SlicedSplitInfo(split_info=split_info, slice_value=None))\n\n    def add(self, sliced_split):\n        \"\"\"Add a SlicedSplitInfo to the read instructions.\"\"\"\n        # TODO(epot): Check that the number of examples per shard % 100 == 0\n        # Otherwise the slices value may be unbalanced and not exactly reflect the\n        # requested slice.\n        self._splits[sliced_split.split_info.name] = sliced_split\n\n    def __add__(self, other):\n        \"\"\"Merging splits together.\"\"\"\n        # Will raise error if a split has already been added (NonMutableDict)\n        # TODO(epot): If a split is already added but there is no overlap between\n        # the slices, should merge the slices (ex: [:10] + [80:])\n        split_instruction = SplitReadInstruction()\n        split_instruction._splits.update(self._splits)  # pylint: disable=protected-access\n        split_instruction._splits.update(other._splits)  # pylint: disable=protected-access\n        return split_instruction\n\n    def __getitem__(self, slice_value):\n        \"\"\"Sub-splits.\"\"\"\n        # Will raise an error if a split has already been sliced\n        split_instruction = SplitReadInstruction()\n        for v in self._splits.values():\n            if v.slice_value is not None:\n                raise ValueError(f\"Trying to slice Split {v.split_info.name} which has already been sliced\")\n            v = v._asdict()\n            v[\"slice_value\"] = slice_value\n            split_instruction.add(SlicedSplitInfo(**v))\n        return split_instruction\n\n    def get_list_sliced_split_info(self):\n        return list(self._splits.values())\n\n\nclass SplitDict(dict[str, SplitInfo]):\n    \"\"\"Split info object.\"\"\"\n\n    def __init__(self, *args, dataset_name=None, **kwargs):\n        super().__init__(*args, **kwargs)\n        self.dataset_name = dataset_name\n\n    def __getitem__(self, key: 
Union[SplitBase, str]):\n        # 1st case: The key exists: `info.splits['train']`\n        if str(key) in self:\n            return super().__getitem__(str(key))\n        # 2nd case: Uses instructions: `info.splits['train[50%]']`\n        else:\n            instructions = make_file_instructions(\n                name=self.dataset_name,\n                split_infos=self.values(),\n                instruction=key,\n            )\n            return SubSplitInfo(instructions)\n\n    def __setitem__(self, key: Union[SplitBase, str], value: SplitInfo):\n        if key != value.name:\n            raise ValueError(f\"Cannot add elem. (key mismatch: '{key}' != '{value.name}')\")\n        super().__setitem__(key, value)\n\n    def add(self, split_info: SplitInfo):\n        \"\"\"Add the split info.\"\"\"\n        if split_info.name in self:\n            raise ValueError(f\"Split {split_info.name} already present\")\n        split_info.dataset_name = self.dataset_name\n        super().__setitem__(split_info.name, split_info)\n\n    @property\n    def total_num_examples(self):\n        \"\"\"Return the total number of examples.\"\"\"\n        return sum(s.num_examples for s in self.values())\n\n    @classmethod\n    def from_split_dict(cls, split_infos: Union[list, dict], dataset_name: Optional[str] = None):\n        \"\"\"Returns a new SplitDict initialized from a Dict or List of `split_infos`.\"\"\"\n        if isinstance(split_infos, dict):\n            split_infos = list(split_infos.values())\n\n        if dataset_name is None:\n            dataset_name = split_infos[0].get(\"dataset_name\") if split_infos else None\n\n        split_dict = cls(dataset_name=dataset_name)\n\n        for split_info in split_infos:\n            if isinstance(split_info, dict):\n                split_info = SplitInfo(**split_info)\n            split_dict.add(split_info)\n\n        return split_dict\n\n    def to_split_dict(self):\n        \"\"\"Returns a list of SplitInfo protos that we 
have.\"\"\"\n        out = []\n        for split_name, split_info in self.items():\n            split_info = copy.deepcopy(split_info)\n            split_info.name = split_name\n            out.append(split_info)\n        return out\n\n    def copy(self):\n        return SplitDict.from_split_dict(self.to_split_dict(), self.dataset_name)\n\n    def _to_yaml_list(self) -> list:\n        out = [asdict(s) for s in self.to_split_dict()]\n        # we don't need the shard lengths in YAML\n        for split_info_dict in out:\n            split_info_dict.pop(\"shard_lengths\", None)\n            split_info_dict.pop(\"original_shard_lengths\", None)\n        # we don't need the dataset_name attribute that is deprecated\n        for split_info_dict in out:\n            split_info_dict.pop(\"dataset_name\", None)\n        return out\n\n    @classmethod\n    def _from_yaml_list(cls, yaml_data: list) -> \"SplitDict\":\n        return cls.from_split_dict(yaml_data)\n\n\n@dataclass\nclass SplitGenerator:\n    \"\"\"Defines the split information for the generator.\n\n    This should be used as returned value of\n    `GeneratorBasedBuilder._split_generators`.\n    See `GeneratorBasedBuilder._split_generators` for more info and example\n    of usage.\n\n    Args:\n        name (`str`):\n            Name of the `Split` for which the generator will\n            create the examples.\n        **gen_kwargs (additional keyword arguments):\n            Keyword arguments to forward to the `DatasetBuilder._generate_examples` method\n            of the builder.\n\n    Example:\n\n    ```py\n    >>> datasets.SplitGenerator(\n    ...     name=datasets.Split.TRAIN,\n    ...     gen_kwargs={\"split_key\": \"train\", \"files\": dl_manager.download_and_extract(url)},\n    ... 
)\n    ```\n    \"\"\"\n\n    name: str\n    gen_kwargs: dict = dataclasses.field(default_factory=dict)\n    split_info: SplitInfo = dataclasses.field(init=False)\n\n    def __post_init__(self):\n        self.name = str(self.name)  # Make sure we convert NamedSplits in strings\n        NamedSplit(self.name)  # check that it's a valid split name\n        self.split_info = SplitInfo(name=self.name)\n"
  },
  {
    "path": "src/datasets/streaming.py",
    "content": "import importlib\nfrom functools import wraps\nfrom typing import TYPE_CHECKING, Optional\n\nfrom .download.download_config import DownloadConfig\nfrom .utils.file_utils import (\n    xbasename,\n    xdirname,\n    xet_parse,\n    xexists,\n    xgetsize,\n    xglob,\n    xgzip_open,\n    xisdir,\n    xisfile,\n    xjoin,\n    xlistdir,\n    xnumpy_load,\n    xopen,\n    xpandas_read_csv,\n    xpandas_read_excel,\n    xPath,\n    xpyarrow_parquet_read_table,\n    xrelpath,\n    xsio_loadmat,\n    xsplit,\n    xsplitext,\n    xwalk,\n    xxml_dom_minidom_parse,\n)\nfrom .utils.logging import get_logger\nfrom .utils.patching import patch_submodule\n\n\nlogger = get_logger(__name__)\n\n\nif TYPE_CHECKING:\n    from .builder import DatasetBuilder\n\n\ndef extend_module_for_streaming(module_path, download_config: Optional[DownloadConfig] = None):\n    \"\"\"Extend the module to support streaming.\n\n    We patch some functions in the module to use `fsspec` to support data streaming:\n    - We use `fsspec.open` to open and read remote files. We patch the module function:\n      - `open`\n    - We use the \"::\" hop separator to join paths and navigate remote compressed/archive files. 
We patch the module\n      functions:\n      - `os.path.join`\n      - `pathlib.Path.joinpath` and `pathlib.Path.__truediv__` (called when using the \"/\" operator)\n\n    The patched functions are replaced with custom functions defined to work with the\n    :class:`~download.streaming_download_manager.StreamingDownloadManager`.\n\n    Args:\n        module_path: Path to the module to be extended.\n        download_config: Mainly use `token` or `storage_options` to support different platforms and auth types.\n    \"\"\"\n\n    module = importlib.import_module(module_path)\n\n    # TODO(QL): always update the module to add subsequent new authentication without removing old ones\n    if hasattr(module, \"_patched_for_streaming\") and module._patched_for_streaming:\n        if isinstance(module._patched_for_streaming, DownloadConfig):\n            module._patched_for_streaming.token = download_config.token\n            module._patched_for_streaming.storage_options = download_config.storage_options\n        return\n\n    def wrap_auth(function):\n        @wraps(function)\n        def wrapper(*args, **kwargs):\n            return function(*args, download_config=download_config, **kwargs)\n\n        wrapper._decorator_name_ = \"wrap_auth\"\n        return wrapper\n\n    # open files in a streaming fashion\n    patch_submodule(module, \"open\", wrap_auth(xopen)).start()\n    patch_submodule(module, \"os.listdir\", wrap_auth(xlistdir)).start()\n    patch_submodule(module, \"os.walk\", wrap_auth(xwalk)).start()\n    patch_submodule(module, \"glob.glob\", wrap_auth(xglob)).start()\n    # allow to navigate in remote zip files\n    patch_submodule(module, \"os.path.join\", xjoin).start()\n    patch_submodule(module, \"os.path.dirname\", xdirname).start()\n    patch_submodule(module, \"os.path.basename\", xbasename).start()\n    patch_submodule(module, \"os.path.relpath\", xrelpath).start()\n    patch_submodule(module, \"os.path.split\", xsplit).start()\n    
patch_submodule(module, \"os.path.splitext\", xsplitext).start()\n    # allow checks on paths\n    patch_submodule(module, \"os.path.exists\", wrap_auth(xexists)).start()\n    patch_submodule(module, \"os.path.isdir\", wrap_auth(xisdir)).start()\n    patch_submodule(module, \"os.path.isfile\", wrap_auth(xisfile)).start()\n    patch_submodule(module, \"os.path.getsize\", wrap_auth(xgetsize)).start()\n    patch_submodule(module, \"pathlib.Path\", xPath).start()\n    # file readers\n    patch_submodule(module, \"gzip.open\", wrap_auth(xgzip_open)).start()\n    patch_submodule(module, \"numpy.load\", wrap_auth(xnumpy_load)).start()\n    patch_submodule(module, \"pandas.read_csv\", wrap_auth(xpandas_read_csv), attrs=[\"__version__\"]).start()\n    patch_submodule(module, \"pandas.read_excel\", wrap_auth(xpandas_read_excel), attrs=[\"__version__\"]).start()\n    patch_submodule(module, \"scipy.io.loadmat\", wrap_auth(xsio_loadmat), attrs=[\"__version__\"]).start()\n    patch_submodule(module, \"xml.etree.ElementTree.parse\", wrap_auth(xet_parse)).start()\n    patch_submodule(module, \"xml.dom.minidom.parse\", wrap_auth(xxml_dom_minidom_parse)).start()\n    # pyarrow: do not patch pyarrow attribute in packaged modules\n    if not module.__name__.startswith(\"datasets.packaged_modules.\"):\n        patch_submodule(module, \"pyarrow.parquet.read_table\", wrap_auth(xpyarrow_parquet_read_table)).start()\n    module._patched_for_streaming = download_config\n\n\ndef extend_dataset_builder_for_streaming(builder: \"DatasetBuilder\"):\n    \"\"\"Extend the dataset builder module and the modules imported by it to support streaming.\n\n    Args:\n        builder (:class:`DatasetBuilder`): Dataset builder instance.\n    \"\"\"\n    # this extends the open and os.path.join functions for data streaming\n    download_config = DownloadConfig(storage_options=builder.storage_options, token=builder.token)\n    extend_module_for_streaming(builder.__module__, 
download_config=download_config)\n\n    # builders can inherit from other builders that might use streaming functionality\n    # (for example, ImageFolder and AudioFolder inherit from FolderBuilder which implements examples generation)\n    # but these parents builders are not patched automatically as they are not instantiated, so we patch them here\n    from .builder import DatasetBuilder\n\n    parent_builder_modules = [\n        cls.__module__\n        for cls in type(builder).__mro__[1:]  # make sure it's not the same module we've already patched\n        if issubclass(cls, DatasetBuilder) and cls.__module__ != DatasetBuilder.__module__\n    ]  # check it's not a standard builder from datasets.builder\n    for module in parent_builder_modules:\n        extend_module_for_streaming(module, download_config=download_config)\n"
  },
  {
    "path": "src/datasets/table.py",
    "content": "import copy\nimport os\nfrom collections.abc import Iterator\nfrom functools import partial\nfrom itertools import groupby\nfrom typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union\n\nimport numpy as np\nimport pyarrow as pa\nimport pyarrow.compute as pc\n\nfrom .utils.logging import get_logger\n\n\nif TYPE_CHECKING:\n    from .features.features import Features, FeatureType\n\n\nlogger = get_logger(__name__)\n\n\ndef inject_arrow_table_documentation(arrow_table_method):\n    def wrapper(fn):\n        fn.__doc__ = arrow_table_method.__doc__ + (fn.__doc__ if fn.__doc__ is not None else \"\")\n        fn.__doc__ = fn.__doc__.replace(\"pyarrow.Table\", \"Table\")\n        if hasattr(arrow_table_method, \"__annotations__\"):\n            fn.__annotations__ = arrow_table_method.__annotations__\n        return fn\n\n    return wrapper\n\n\ndef _in_memory_arrow_table_from_file(filename: str) -> pa.Table:\n    in_memory_stream = pa.input_stream(filename)\n    opened_stream = pa.ipc.open_stream(in_memory_stream)\n    pa_table = opened_stream.read_all()\n    return pa_table\n\n\ndef _in_memory_arrow_table_from_buffer(buffer: pa.Buffer) -> pa.Table:\n    stream = pa.BufferReader(buffer)\n    opened_stream = pa.ipc.open_stream(stream)\n    table = opened_stream.read_all()\n    return table\n\n\ndef _memory_mapped_record_batch_reader_from_file(filename: str) -> pa.RecordBatchStreamReader:\n    memory_mapped_stream = pa.memory_map(filename)\n    return pa.ipc.open_stream(memory_mapped_stream)\n\n\ndef read_schema_from_file(filename: str) -> pa.Schema:\n    \"\"\"\n    Infer arrow table schema from file without loading whole file into memory.\n    Useful especially while having very big files.\n    \"\"\"\n    with pa.memory_map(filename) as memory_mapped_stream:\n        schema = pa.ipc.open_stream(memory_mapped_stream).schema\n    return schema\n\n\ndef _memory_mapped_arrow_table_from_file(filename: str) -> pa.Table:\n    opened_stream = 
_memory_mapped_record_batch_reader_from_file(filename)\n    pa_table = opened_stream.read_all()\n    return pa_table\n\n\ndef _deepcopy(x, memo: dict):\n    \"\"\"deepcopy a regular class instance\"\"\"\n    cls = x.__class__\n    result = cls.__new__(cls)\n    memo[id(x)] = result\n    for k, v in x.__dict__.items():\n        setattr(result, k, copy.deepcopy(v, memo))\n    return result\n\n\ndef _interpolation_search(arr: list[int], x: int) -> int:\n    \"\"\"\n    Return the position i of a sorted array so that arr[i] <= x < arr[i+1]\n\n    Args:\n        arr (`List[int]`): non-empty sorted list of integers\n        x (`int`): query\n\n    Returns:\n        `int`: the position i so that arr[i] <= x < arr[i+1]\n\n    Raises:\n        `IndexError`: if the array is empty or if the query is outside the array values\n    \"\"\"\n    i, j = 0, len(arr) - 1\n    while i < j and arr[i] <= x < arr[j]:\n        k = i + ((j - i) * (x - arr[i]) // (arr[j] - arr[i]))\n        if arr[k] <= x < arr[k + 1]:\n            return k\n        elif arr[k] < x:\n            i, j = k + 1, j\n        else:\n            i, j = i, k\n    raise IndexError(f\"Invalid query '{x}' for size {arr[-1] if len(arr) else 'none'}.\")\n\n\nclass IndexedTableMixin:\n    def __init__(self, table: pa.Table):\n        self._schema: pa.Schema = table.schema\n        self._batches: list[pa.RecordBatch] = [\n            recordbatch for recordbatch in table.to_batches() if len(recordbatch) > 0\n        ]\n        self._offsets: np.ndarray = np.cumsum([0] + [len(b) for b in self._batches], dtype=np.int64)\n\n    def fast_gather(self, indices: Union[list[int], np.ndarray]) -> pa.Table:\n        \"\"\"\n        Create a pa.Table by gathering the records at the specified indices. 
Should be faster\n        than pa.concat_tables(table.fast_slice(int(i) % table.num_rows, 1) for i in indices) since NumPy can compute\n        the binary searches in parallel, highly optimized C\n        \"\"\"\n        if not len(indices):\n            raise ValueError(\"Indices must be non-empty\")\n        batch_indices = np.searchsorted(self._offsets, indices, side=\"right\") - 1\n        return pa.Table.from_batches(\n            [\n                self._batches[batch_idx].slice(i - self._offsets[batch_idx], 1)\n                for batch_idx, i in zip(batch_indices, indices)\n            ],\n            schema=self._schema,\n        )\n\n    def fast_slice(self, offset=0, length=None) -> pa.Table:\n        \"\"\"\n        Slice the Table using interpolation search.\n        The behavior is the same as `pyarrow.Table.slice` but it's significantly faster.\n\n        Interpolation search is used to find the start and end indexes of the batches we want to keep.\n        The batches to keep are then concatenated to form the sliced Table.\n        \"\"\"\n        if offset < 0:\n            raise IndexError(\"Offset must be non-negative\")\n        elif offset >= self._offsets[-1] or (length is not None and length <= 0):\n            return pa.Table.from_batches([], schema=self._schema)\n        i = _interpolation_search(self._offsets, offset)\n        if length is None or length + offset >= self._offsets[-1]:\n            batches = self._batches[i:]\n            batches[0] = batches[0].slice(offset - self._offsets[i])\n        else:\n            j = _interpolation_search(self._offsets, offset + length - 1)\n            batches = self._batches[i : j + 1]\n            batches[-1] = batches[-1].slice(0, offset + length - self._offsets[j])\n            batches[0] = batches[0].slice(offset - self._offsets[i])\n        return pa.Table.from_batches(batches, schema=self._schema)\n\n\nclass Table(IndexedTableMixin):\n    \"\"\"\n    Wraps a pyarrow Table by using 
composition.\n    This is the base class for `InMemoryTable`, `MemoryMappedTable` and `ConcatenationTable`.\n\n    It implements all the basic attributes/methods of the pyarrow Table class except\n    the Table transforms: `slice, filter, flatten, combine_chunks, cast, add_column,\n    append_column, remove_column, set_column, rename_columns` and `drop`.\n\n    The implementation of these methods differs for the subclasses.\n    \"\"\"\n\n    def __init__(self, table: pa.Table):\n        super().__init__(table)\n        self.table = table\n\n    def __deepcopy__(self, memo: dict):\n        # arrow tables are immutable, so there's no need to copy self.table\n        # moreover calling deepcopy on a pyarrow table seems to make pa.total_allocated_bytes() decrease for some reason\n        # by adding it to the memo, self.table won't be copied\n        memo[id(self.table)] = self.table\n        # same for the recordbatches used by the index\n        memo[id(self._batches)] = list(self._batches)\n        return _deepcopy(self, memo)\n\n    def validate(self, *args, **kwargs):\n        \"\"\"\n        Perform validation checks.  An exception is raised if validation fails.\n\n        By default only cheap validation checks are run.  
Pass `full=True`\n        for thorough validation checks (potentially `O(n)`).\n\n        Args:\n            full (`bool`, defaults to `False`):\n                If `True`, run expensive checks, otherwise cheap checks only.\n\n        Raises:\n            `pa.lib.ArrowInvalid`: if validation fails\n        \"\"\"\n        return self.table.validate(*args, **kwargs)\n\n    def equals(self, *args, **kwargs):\n        \"\"\"\n        Check if contents of two tables are equal.\n\n        Args:\n            other ([`~datasets.table.Table`]):\n                Table to compare against.\n            check_metadata (`bool`, defaults to `False`):\n                Whether schema metadata equality should be checked as well.\n\n        Returns:\n            `bool`\n        \"\"\"\n        args = tuple(arg.table if isinstance(arg, Table) else arg for arg in args)\n        kwargs = {k: v.table if isinstance(v, Table) else v for k, v in kwargs}\n        return self.table.equals(*args, **kwargs)\n\n    def to_batches(self, *args, **kwargs):\n        \"\"\"\n        Convert Table to list of (contiguous) `RecordBatch` objects.\n\n        Args:\n            max_chunksize (`int`, defaults to `None`):\n                Maximum size for `RecordBatch` chunks. 
Individual chunks may be\n                smaller depending on the chunk layout of individual columns.\n\n        Returns:\n            `List[pyarrow.RecordBatch]`\n        \"\"\"\n        return self.table.to_batches(*args, **kwargs)\n\n    def to_pydict(self, *args, **kwargs):\n        \"\"\"\n        Convert the Table to a `dict` or `OrderedDict`.\n\n        Returns:\n            `dict`\n        \"\"\"\n        return self.table.to_pydict(*args, **kwargs)\n\n    def to_pylist(self, *args, **kwargs):\n        \"\"\"\n        Convert the Table to a list\n\n        Returns:\n            `list`\n        \"\"\"\n        return self.table.to_pylist(*args, **kwargs)\n\n    def to_pandas(self, *args, **kwargs):\n        \"\"\"\n        Convert to a pandas-compatible NumPy array or DataFrame, as appropriate.\n\n        Args:\n            memory_pool (`MemoryPool`, defaults to `None`):\n                Arrow MemoryPool to use for allocations. Uses the default memory\n                pool if not passed.\n            strings_to_categorical (`bool`, defaults to `False`):\n                Encode string (UTF8) and binary types to `pandas.Categorical`.\n            categories (`list`, defaults to `empty`):\n                List of fields that should be returned as `pandas.Categorical`. Only\n                applies to table-like data structures.\n            zero_copy_only (`bool`, defaults to `False`):\n                Raise an `ArrowException` if this function call would require copying\n                the underlying data.\n            integer_object_nulls (`bool`, defaults to `False`):\n                Cast integers with nulls to objects.\n            date_as_object (`bool`, defaults to `True`):\n                Cast dates to objects. If `False`, convert to `datetime64[ns]` dtype.\n            timestamp_as_object (`bool`, defaults to `False`):\n                Cast non-nanosecond timestamps (`np.datetime64`) to objects. 
This is\n                useful if you have timestamps that don't fit in the normal date\n                range of nanosecond timestamps (1678 CE-2262 CE).\n                If `False`, all timestamps are converted to `datetime64[ns]` dtype.\n            use_threads (`bool`, defaults to `True`):\n                Whether to parallelize the conversion using multiple threads.\n            deduplicate_objects (`bool`, defaults to `False`):\n                Do not create multiple copies Python objects when created, to save\n                on memory use. Conversion will be slower.\n            ignore_metadata (`bool`, defaults to `False`):\n                If `True`, do not use the 'pandas' metadata to reconstruct the\n                DataFrame index, if present.\n            safe (`bool`, defaults to `True`):\n                For certain data types, a cast is needed in order to store the\n                data in a pandas DataFrame or Series (e.g. timestamps are always\n                stored as nanoseconds in pandas). This option controls whether it\n                is a safe cast or not.\n            split_blocks (`bool`, defaults to `False`):\n                If `True`, generate one internal \"block\" for each column when\n                creating a pandas.DataFrame from a `RecordBatch` or `Table`. While this\n                can temporarily reduce memory note that various pandas operations\n                can trigger \"consolidation\" which may balloon memory use.\n            self_destruct (`bool`, defaults to `False`):\n                EXPERIMENTAL: If `True`, attempt to deallocate the originating Arrow\n                memory while converting the Arrow object to pandas. 
If you use the\n                object after calling `to_pandas` with this option it will crash your\n                program.\n            types_mapper (`function`, defaults to `None`):\n                A function mapping a pyarrow DataType to a pandas `ExtensionDtype`.\n                This can be used to override the default pandas type for conversion\n                of built-in pyarrow types or in absence of `pandas_metadata` in the\n                Table schema. The function receives a pyarrow DataType and is\n                expected to return a pandas `ExtensionDtype` or `None` if the\n                default conversion should be used for that type. If you have\n                a dictionary mapping, you can pass `dict.get` as function.\n\n        Returns:\n            `pandas.Series` or `pandas.DataFrame`: `pandas.Series` or `pandas.DataFrame` depending on type of object\n        \"\"\"\n        return self.table.to_pandas(*args, **kwargs)\n\n    def to_string(self, *args, **kwargs):\n        return self.table.to_string(*args, **kwargs)\n\n    def to_reader(self, max_chunksize: Optional[int] = None):\n        \"\"\"\n        Convert the Table to a RecordBatchReader.\n\n        Note that this method is zero-copy, it merely exposes the same data under a different API.\n\n        Args:\n            max_chunksize (`int`, defaults to `None`)\n                Maximum size for RecordBatch chunks. 
Individual chunks may be smaller depending\n                on the chunk layout of individual columns.\n\n        Returns:\n            `pyarrow.RecordBatchReader`\n        \"\"\"\n        return self.table.to_reader(max_chunksize=max_chunksize)\n\n    def field(self, *args, **kwargs):\n        \"\"\"\n        Select a schema field by its column name or numeric index.\n\n        Args:\n            i (`Union[int, str]`):\n                The index or name of the field to retrieve.\n\n        Returns:\n            `pyarrow.Field`\n        \"\"\"\n        return self.table.field(*args, **kwargs)\n\n    def column(self, *args, **kwargs):\n        \"\"\"\n        Select a column by its column name, or numeric index.\n\n        Args:\n            i (`Union[int, str]`):\n                The index or name of the column to retrieve.\n\n        Returns:\n            `pyarrow.ChunkedArray`\n        \"\"\"\n        return self.table.column(*args, **kwargs)\n\n    def itercolumns(self, *args, **kwargs):\n        \"\"\"\n        Iterator over all columns in their numerical order.\n\n        Yields:\n            `pyarrow.ChunkedArray`\n        \"\"\"\n        return self.table.itercolumns(*args, **kwargs)\n\n    @property\n    def schema(self):\n        \"\"\"\n        Schema of the table and its columns.\n\n        Returns:\n            `pyarrow.Schema`\n        \"\"\"\n        return self.table.schema\n\n    @property\n    def columns(self):\n        \"\"\"\n        List of all columns in numerical order.\n\n        Returns:\n            `List[pa.ChunkedArray]`\n        \"\"\"\n        return self.table.columns\n\n    @property\n    def num_columns(self):\n        \"\"\"\n        Number of columns in this table.\n\n        Returns:\n            int\n        \"\"\"\n        return self.table.num_columns\n\n    @property\n    def num_rows(self):\n        \"\"\"\n        Number of rows in this table.\n\n        Due to the definition of a table, all columns have the same number 
of\n        rows.\n\n        Returns:\n            int\n        \"\"\"\n        return self.table.num_rows\n\n    @property\n    def shape(self):\n        \"\"\"\n        Dimensions of the table: (#rows, #columns).\n\n        Returns:\n            `(int, int)`: Number of rows and number of columns.\n        \"\"\"\n        return self.table.shape\n\n    @property\n    def nbytes(self):\n        \"\"\"\n        Total number of bytes consumed by the elements of the table.\n        \"\"\"\n        return self.table.nbytes\n\n    @property\n    def column_names(self):\n        \"\"\"\n        Names of the table's columns.\n        \"\"\"\n        return self.table.column_names\n\n    def __eq__(self, other):\n        return self.equals(other)\n\n    def __getitem__(self, i):\n        return self.table[i]\n\n    def __len__(self):\n        return len(self.table)\n\n    def __repr__(self):\n        return self.table.__repr__().replace(\"pyarrow.Table\", self.__class__.__name__)\n\n    def __str__(self):\n        return self.table.__str__().replace(\"pyarrow.Table\", self.__class__.__name__)\n\n    def slice(self, *args, **kwargs):\n        \"\"\"\n        Compute zero-copy slice of this Table.\n\n        Args:\n            offset (`int`, defaults to `0`):\n                Offset from start of table to slice.\n            length (`int`, defaults to `None`):\n                Length of slice (default is until end of table starting from\n                offset).\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        raise NotImplementedError()\n\n    def filter(self, *args, **kwargs):\n        \"\"\"\n        Select records from a Table. See `pyarrow.compute.filter` for full usage.\n        \"\"\"\n        raise NotImplementedError()\n\n    def flatten(self, *args, **kwargs):\n        \"\"\"\n        Flatten this Table.  Each column with a struct type is flattened\n        into one column per struct field.  
Other columns are left unchanged.\n\n        Args:\n            memory_pool (`MemoryPool`, defaults to `None`):\n                For memory allocations, if required, otherwise use default pool.\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        raise NotImplementedError()\n\n    def combine_chunks(self, *args, **kwargs):\n        \"\"\"\n        Make a new table by combining the chunks this table has.\n\n        All the underlying chunks in the `ChunkedArray` of each column are\n        concatenated into zero or one chunk.\n\n        Args:\n            memory_pool (`MemoryPool`, defaults to `None`):\n                For memory allocations, if required, otherwise use default pool.\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        raise NotImplementedError()\n\n    def cast(self, *args, **kwargs):\n        \"\"\"\n        Cast table values to another schema.\n\n        Args:\n            target_schema (`Schema`):\n                Schema to cast to, the names and order of fields must match.\n            safe (`bool`, defaults to `True`):\n                Check for overflows or other unsafe conversions.\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        raise NotImplementedError()\n\n    def replace_schema_metadata(self, *args, **kwargs):\n        \"\"\"\n        EXPERIMENTAL: Create shallow copy of table by replacing schema\n        key-value metadata with the indicated new metadata (which may be `None`,\n        which deletes any existing metadata).\n\n        Args:\n            metadata (`dict`, defaults to `None`):\n\n        Returns:\n            `datasets.table.Table`: shallow_copy\n        \"\"\"\n        raise NotImplementedError()\n\n    def add_column(self, *args, **kwargs):\n        \"\"\"\n        Add column to Table at position.\n\n        A new table is returned with the column added, the original table\n        object is left unchanged.\n\n        Args:\n            i 
(`int`):\n                Index to place the column at.\n            field_ (`Union[str, pyarrow.Field]`):\n                If a string is passed then the type is deduced from the column\n                data.\n            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):\n                Column data.\n\n        Returns:\n            `datasets.table.Table`: New table with the passed column added.\n        \"\"\"\n        raise NotImplementedError()\n\n    def append_column(self, *args, **kwargs):\n        \"\"\"\n        Append column at end of columns.\n\n        Args:\n            field_ (`Union[str, pyarrow.Field]`):\n                If a string is passed then the type is deduced from the column\n                data.\n            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):\n                Column data.\n\n        Returns:\n            `datasets.table.Table`:  New table with the passed column added.\n        \"\"\"\n        raise NotImplementedError()\n\n    def remove_column(self, *args, **kwargs):\n        \"\"\"\n        Create new Table with the indicated column removed.\n\n        Args:\n            i (`int`):\n                Index of column to remove.\n\n        Returns:\n            `datasets.table.Table`: New table without the column.\n        \"\"\"\n        raise NotImplementedError()\n\n    def set_column(self, *args, **kwargs):\n        \"\"\"\n        Replace column in Table at position.\n\n        Args:\n            i (`int`):\n                Index to place the column at.\n            field_ (`Union[str, pyarrow.Field]`):\n                If a string is passed then the type is deduced from the column\n                data.\n            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):\n                Column data.\n\n        Returns:\n            `datasets.table.Table`: New table with the passed column set.\n        \"\"\"\n        raise NotImplementedError()\n\n    def rename_columns(self, *args, **kwargs):\n        \"\"\"\n    
    Create new table with columns renamed to provided names.\n        \"\"\"\n        raise NotImplementedError()\n\n    def drop(self, *args, **kwargs):\n        \"\"\"\n        Drop one or more columns and return a new table.\n\n        Args:\n            columns (`List[str]`):\n                List of field names referencing existing columns.\n\n        Raises:\n            `KeyError`: if any of the passed column names do not exist.\n\n        Returns:\n            `datasets.table.Table`: New table without the columns.\n        \"\"\"\n        raise NotImplementedError()\n\n    def select(self, *args, **kwargs):\n        \"\"\"\n        Select columns of the table.\n\n        Returns a new table with the specified columns, and metadata preserved.\n\n        Args:\n            columns (:obj:`Union[List[str], List[int]]`):\n                The column names or integer indices to select.\n\n        Returns:\n            `datasets.table.Table`: table with only a subset of the columns\n        \"\"\"\n        raise NotImplementedError()\n\n\nclass TableBlock(Table):\n    \"\"\"\n    `TableBlock` is the allowed class inside a `ConcatenationTable`.\n    Only `MemoryMappedTable` and `InMemoryTable` are `TableBlock`.\n    This is because we don't want a `ConcatenationTable` made out of other `ConcatenationTables`.\n    \"\"\"\n\n    pass\n\n\nclass InMemoryTable(TableBlock):\n    \"\"\"\n    The table is said in-memory when it is loaded into the user's RAM.\n\n    Pickling it does copy all the data using memory.\n    Its implementation is simple and uses the underlying pyarrow Table methods directly.\n\n    This is different from the `MemoryMapped` table, for which pickling doesn't copy all the\n    data in memory. 
For a `MemoryMapped`, unpickling instead reloads the table from the disk.\n\n    `InMemoryTable` must be used when data fit in memory, while `MemoryMapped` are reserved for\n    data bigger than memory or when you want the memory footprint of your application to\n    stay low.\n    \"\"\"\n\n    @classmethod\n    def from_file(cls, filename: str):\n        table = _in_memory_arrow_table_from_file(filename)\n        return cls(table)\n\n    @classmethod\n    def from_buffer(cls, buffer: pa.Buffer):\n        table = _in_memory_arrow_table_from_buffer(buffer)\n        return cls(table)\n\n    @classmethod\n    def from_pandas(cls, *args, **kwargs):\n        \"\"\"\n        Convert pandas.DataFrame to an Arrow Table.\n\n        The column types in the resulting Arrow Table are inferred from the\n        dtypes of the pandas.Series in the DataFrame. In the case of non-object\n        Series, the NumPy dtype is translated to its Arrow equivalent. In the\n        case of `object`, we need to guess the datatype by looking at the\n        Python objects in this Series.\n\n        Be aware that Series of the `object` dtype don't carry enough\n        information to always lead to a meaningful Arrow type. In the case that\n        we cannot infer a type, e.g. because the DataFrame is of length 0 or\n        the Series only contains `None/nan` objects, the type is set to\n        null. This behavior can be avoided by constructing an explicit schema\n        and passing it to this function.\n\n        Args:\n            df (`pandas.DataFrame`):\n            schema (`pyarrow.Schema`, *optional*):\n                The expected schema of the Arrow Table. This can be used to\n                indicate the type of columns if we cannot infer it automatically.\n                If passed, the output will have exactly this schema. Columns\n                specified in the schema that are not found in the DataFrame columns\n                or its index will raise an error. 
Additional columns or index\n                levels in the DataFrame which are not specified in the schema will\n                be ignored.\n            preserve_index (`bool`, *optional*):\n                Whether to store the index as an additional column in the resulting\n                `Table`. The default of None will store the index as a column,\n                except for RangeIndex which is stored as metadata only. Use\n                `preserve_index=True` to force it to be stored as a column.\n            nthreads (`int`, defaults to `None` (may use up to system CPU count threads))\n                If greater than 1, convert columns to Arrow in parallel using\n                indicated number of threads.\n            columns (`List[str]`, *optional*):\n               List of column to be converted. If `None`, use all columns.\n            safe (`bool`, defaults to `True`):\n               Check for overflows or other unsafe conversions,\n\n        Returns:\n            `datasets.table.Table`:\n\n        Examples:\n        ```python\n        >>> import pandas as pd\n        >>> import pyarrow as pa\n        >>> df = pd.DataFrame({\n            ...     'int': [1, 2],\n            ...     'str': ['a', 'b']\n            ... })\n        >>> pa.Table.from_pandas(df)\n        <pyarrow.lib.Table object at 0x7f05d1fb1b40>\n        ```\n        \"\"\"\n        return cls(pa.Table.from_pandas(*args, **kwargs))\n\n    @classmethod\n    def from_arrays(cls, *args, **kwargs):\n        \"\"\"\n        Construct a Table from Arrow arrays.\n\n        Args:\n            arrays (`List[Union[pyarrow.Array, pyarrow.ChunkedArray]]`):\n                Equal-length arrays that should form the table.\n            names (`List[str]`, *optional*):\n                Names for the table columns. If not passed, schema must be passed.\n            schema (`Schema`, defaults to `None`):\n                Schema for the created table. 
If not passed, names must be passed.\n            metadata (`Union[dict, Mapping]`, defaults to `None`):\n                Optional metadata for the schema (if inferred).\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        return cls(pa.Table.from_arrays(*args, **kwargs))\n\n    @classmethod\n    def from_pydict(cls, *args, **kwargs):\n        \"\"\"\n        Construct a Table from Arrow arrays or columns.\n\n        Args:\n            mapping (`Union[dict, Mapping]`):\n                A mapping of strings to Arrays or Python lists.\n            schema (`Schema`, defaults to `None`):\n                If not passed, will be inferred from the Mapping values\n            metadata (`Union[dict, Mapping]`, defaults to `None`):\n                Optional metadata for the schema (if inferred).\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        return cls(pa.Table.from_pydict(*args, **kwargs))\n\n    @classmethod\n    def from_pylist(cls, mapping, *args, **kwargs):\n        \"\"\"\n        Construct a Table from list of rows / dictionaries.\n\n        Args:\n            mapping (`List[dict]`):\n                A mapping of strings to row values.\n            schema (`Schema`, defaults to `None`):\n                If not passed, will be inferred from the Mapping values\n            metadata (`Union[dict, Mapping]`, defaults to `None`):\n                Optional metadata for the schema (if inferred).\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        return cls(pa.Table.from_pylist(mapping, *args, **kwargs))\n\n    @classmethod\n    def from_batches(cls, *args, **kwargs):\n        \"\"\"\n        Construct a Table from a sequence or iterator of Arrow `RecordBatches`.\n\n        Args:\n            batches (`Union[Sequence[pyarrow.RecordBatch], Iterator[pyarrow.RecordBatch]]`):\n                Sequence of `RecordBatch` to be converted, all schemas must be equal.\n            schema 
(`Schema`, defaults to `None`):\n                If not passed, will be inferred from the first `RecordBatch`.\n\n        Returns:\n            `datasets.table.Table`:\n        \"\"\"\n        return cls(pa.Table.from_batches(*args, **kwargs))\n\n    def slice(self, offset=0, length=None):\n        \"\"\"\n        Compute zero-copy slice of this Table.\n\n        Args:\n            offset (`int`, defaults to `0`):\n                Offset from start of table to slice.\n            length (`int`, defaults to `None`):\n                Length of slice (default is until end of table starting from\n                offset).\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        # Use fast slicing here\n        return InMemoryTable(self.fast_slice(offset=offset, length=length))\n\n    def filter(self, *args, **kwargs):\n        \"\"\"\n        Select records from a Table. See `pyarrow.compute.filter` for full usage.\n        \"\"\"\n        return InMemoryTable(self.table.filter(*args, **kwargs))\n\n    def flatten(self, *args, **kwargs):\n        \"\"\"\n        Flatten this Table.  Each column with a struct type is flattened\n        into one column per struct field.  
Other columns are left unchanged.\n\n        Args:\n            memory_pool (`MemoryPool`, defaults to `None`):\n                For memory allocations, if required, otherwise use default pool.\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        return InMemoryTable(table_flatten(self.table, *args, **kwargs))\n\n    def combine_chunks(self, *args, **kwargs):\n        \"\"\"\n        Make a new table by combining the chunks this table has.\n\n        All the underlying chunks in the `ChunkedArray` of each column are\n        concatenated into zero or one chunk.\n\n        Args:\n            memory_pool (`MemoryPool`, defaults to `None`):\n                For memory allocations, if required, otherwise use default pool.\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        return InMemoryTable(self.table.combine_chunks(*args, **kwargs))\n\n    def cast(self, *args, **kwargs):\n        \"\"\"\n        Cast table values to another schema.\n\n        Args:\n            target_schema (`Schema`):\n                Schema to cast to, the names and order of fields must match.\n            safe (`bool`, defaults to `True`):\n                Check for overflows or other unsafe conversions.\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        return InMemoryTable(table_cast(self.table, *args, **kwargs))\n\n    def replace_schema_metadata(self, *args, **kwargs):\n        \"\"\"\n        EXPERIMENTAL: Create shallow copy of table by replacing schema\n        key-value metadata with the indicated new metadata (which may be `None`,\n        which deletes any existing metadata).\n\n        Args:\n            metadata (`dict`, defaults to `None`):\n\n        Returns:\n            `datasets.table.Table`: shallow_copy\n        \"\"\"\n        return InMemoryTable(self.table.replace_schema_metadata(*args, **kwargs))\n\n    def add_column(self, *args, **kwargs):\n        \"\"\"\n        Add column to Table 
at position.\n\n        A new table is returned with the column added, the original table\n        object is left unchanged.\n\n        Args:\n            i (`int`):\n                Index to place the column at.\n            field_ (`Union[str, pyarrow.Field]`):\n                If a string is passed then the type is deduced from the column\n                data.\n            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):\n                Column data.\n\n        Returns:\n            `datasets.table.Table`: New table with the passed column added.\n        \"\"\"\n        return InMemoryTable(self.table.add_column(*args, **kwargs))\n\n    def append_column(self, *args, **kwargs):\n        \"\"\"\n        Append column at end of columns.\n\n        Args:\n            field_ (`Union[str, pyarrow.Field]`):\n                If a string is passed then the type is deduced from the column\n                data.\n            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):\n                Column data.\n\n        Returns:\n            `datasets.table.Table`:\n                New table with the passed column added.\n        \"\"\"\n        return InMemoryTable(self.table.append_column(*args, **kwargs))\n\n    def remove_column(self, *args, **kwargs):\n        \"\"\"\n        Create new Table with the indicated column removed.\n\n        Args:\n            i (`int`):\n                Index of column to remove.\n\n        Returns:\n            `datasets.table.Table`:\n                New table without the column.\n        \"\"\"\n        return InMemoryTable(self.table.remove_column(*args, **kwargs))\n\n    def set_column(self, *args, **kwargs):\n        \"\"\"\n        Replace column in Table at position.\n\n        Args:\n            i (`int`):\n                Index to place the column at.\n            field_ (`Union[str, pyarrow.Field]`):\n                If a string is passed then the type is deduced from the column\n                data.\n            column 
(`Union[pyarrow.Array, List[pyarrow.Array]]`):\n                Column data.\n\n        Returns:\n            `datasets.table.Table`:\n                New table with the passed column set.\n        \"\"\"\n        return InMemoryTable(self.table.set_column(*args, **kwargs))\n\n    def rename_columns(self, *args, **kwargs):\n        \"\"\"\n        Create new table with columns renamed to provided names.\n        \"\"\"\n        return InMemoryTable(self.table.rename_columns(*args, **kwargs))\n\n    def drop(self, *args, **kwargs):\n        \"\"\"\n        Drop one or more columns and return a new table.\n\n        Args:\n            columns (`List[str]`):\n                List of field names referencing existing columns.\n\n        Raises:\n            `KeyError` : if any of the passed columns name are not existing.\n\n        Returns:\n            `datasets.table.Table`:\n                New table without the columns.\n        \"\"\"\n        return InMemoryTable(self.table.drop(*args, **kwargs))\n\n    def select(self, *args, **kwargs):\n        \"\"\"\n        Select columns of the table.\n\n        Returns a new table with the specified columns, and metadata preserved.\n\n        Args:\n            columns (:obj:`Union[List[str], List[int]]`):\n                The column names or integer indices to select.\n\n        Returns:\n            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.\n        \"\"\"\n        return InMemoryTable(self.table.select(*args, **kwargs))\n\n\n# The MemoryMappedTable needs replays to properly reload tables from the disk\nReplay = tuple[str, tuple, dict]\n\n\nclass MemoryMappedTable(TableBlock):\n    \"\"\"\n    The table is said to be memory mapped when it doesn't use the user's RAM but loads the data\n    from the disk instead.\n\n    Pickling it doesn't copy the data into memory.\n    Instead, only the path to the memory mapped arrow file is pickled, as well as the list\n    of transforms to 
\"replay\" when reloading the table from the disk.\n\n    Its implementation requires storing a history of all the transforms that were applied\n    to the underlying pyarrow Table, so that they can be \"replayed\" when reloading the Table\n    from the disk.\n\n    This is different from the `InMemoryTable` table, for which pickling does copy all the\n    data in memory.\n\n    `InMemoryTable` must be used when data fit in memory, while `MemoryMappedTable` objects are reserved for\n    data bigger than memory or when you want the memory footprint of your application to\n    stay low.\n    \"\"\"\n\n    def __init__(self, table: pa.Table, path: str, replays: Optional[list[Replay]] = None):\n        super().__init__(table)\n        self.path = os.path.abspath(path)\n        self.replays: list[Replay] = replays if replays is not None else []\n\n    @classmethod\n    def from_file(cls, filename: str, replays=None):\n        table = _memory_mapped_arrow_table_from_file(filename)\n        table = cls._apply_replays(table, replays)\n        return cls(table, filename, replays)\n\n    def __getstate__(self):\n        return {\"path\": self.path, \"replays\": self.replays}\n\n    def __setstate__(self, state):\n        path = state[\"path\"]\n        replays = state[\"replays\"]\n        table = _memory_mapped_arrow_table_from_file(path)\n        table = self._apply_replays(table, replays)\n        MemoryMappedTable.__init__(self, table, path=path, replays=replays)\n\n    @staticmethod\n    def _apply_replays(table: pa.Table, replays: Optional[list[Replay]] = None) -> pa.Table:\n        if replays is not None:\n            for name, args, kwargs in replays:\n                if name == \"cast\":\n                    table = table_cast(table, *args, **kwargs)\n                elif name == \"flatten\":\n                    table = table_flatten(table, *args, **kwargs)\n                else:\n                    table = getattr(table, name)(*args, **kwargs)\n        return table\n\n    
def _append_replay(self, replay: Replay) -> list[Replay]:\n        replays = copy.deepcopy(self.replays)\n        replays.append(replay)\n        return replays\n\n    def slice(self, offset=0, length=None):\n        \"\"\"\n        Compute zero-copy slice of this Table.\n\n        Args:\n            offset (`int`, defaults to `0`):\n                Offset from start of table to slice.\n            length (`int`, defaults to `None`):\n                Length of slice (default is until end of table starting from\n                offset).\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        replay = (\"slice\", (offset, length), {})\n        replays = self._append_replay(replay)\n        # Use fast slicing here\n        return MemoryMappedTable(self.fast_slice(offset=offset, length=length), self.path, replays)\n\n    def filter(self, *args, **kwargs):\n        \"\"\"\n        Select records from a Table. See `pyarrow.compute.filter` for full usage.\n        \"\"\"\n        replay = (\"filter\", copy.deepcopy(args), copy.deepcopy(kwargs))\n        replays = self._append_replay(replay)\n        return MemoryMappedTable(self.table.filter(*args, **kwargs), self.path, replays)\n\n    def flatten(self, *args, **kwargs):\n        \"\"\"\n        Flatten this Table.  Each column with a struct type is flattened\n        into one column per struct field.  
Other columns are left unchanged.\n\n        Args:\n            memory_pool (`MemoryPool`, defaults to `None`):\n                For memory allocations, if required, otherwise use default pool.\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        replay = (\"flatten\", copy.deepcopy(args), copy.deepcopy(kwargs))\n        replays = self._append_replay(replay)\n        return MemoryMappedTable(table_flatten(self.table, *args, **kwargs), self.path, replays)\n\n    def combine_chunks(self, *args, **kwargs):\n        \"\"\"\n        Make a new table by combining the chunks this table has.\n\n        All the underlying chunks in the ChunkedArray of each column are\n        concatenated into zero or one chunk.\n\n        Args:\n            memory_pool (`MemoryPool`, defaults to `None`):\n                For memory allocations, if required, otherwise use default pool.\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        replay = (\"combine_chunks\", copy.deepcopy(args), copy.deepcopy(kwargs))\n        replays = self._append_replay(replay)\n        return MemoryMappedTable(self.table.combine_chunks(*args, **kwargs), self.path, replays)\n\n    def cast(self, *args, **kwargs):\n        \"\"\"\n        Cast table values to another schema\n\n        Args:\n            target_schema (`Schema`):\n                Schema to cast to, the names and order of fields must match.\n            safe (`bool`, defaults to `True`):\n                Check for overflows or other unsafe conversions.\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        replay = (\"cast\", copy.deepcopy(args), copy.deepcopy(kwargs))\n        replays = self._append_replay(replay)\n        return MemoryMappedTable(table_cast(self.table, *args, **kwargs), self.path, replays)\n\n    def replace_schema_metadata(self, *args, **kwargs):\n        \"\"\"\n        EXPERIMENTAL: Create shallow copy of table by replacing schema\n        
key-value metadata with the indicated new metadata (which may be `None`,\n        which deletes any existing metadata).\n\n        Args:\n            metadata (`dict`, defaults to `None`):\n\n        Returns:\n            `datasets.table.Table`: shallow_copy\n        \"\"\"\n        replay = (\"replace_schema_metadata\", copy.deepcopy(args), copy.deepcopy(kwargs))\n        replays = self._append_replay(replay)\n        return MemoryMappedTable(self.table.replace_schema_metadata(*args, **kwargs), self.path, replays)\n\n    def add_column(self, *args, **kwargs):\n        \"\"\"\n        Add column to Table at position.\n\n        A new table is returned with the column added, the original table\n        object is left unchanged.\n\n        Args:\n            i (`int`):\n                Index to place the column at.\n            field_ (`Union[str, pyarrow.Field]`):\n                If a string is passed then the type is deduced from the column\n                data.\n            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):\n                Column data.\n\n        Returns:\n            `datasets.table.Table`: New table with the passed column added.\n        \"\"\"\n        replay = (\"add_column\", copy.deepcopy(args), copy.deepcopy(kwargs))\n        replays = self._append_replay(replay)\n        return MemoryMappedTable(self.table.add_column(*args, **kwargs), self.path, replays)\n\n    def append_column(self, *args, **kwargs):\n        \"\"\"\n        Append column at end of columns.\n\n        Args:\n            field_ (`Union[str, pyarrow.Field]`):\n                If a string is passed then the type is deduced from the column\n                data.\n            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):\n                Column data.\n\n        Returns:\n            `datasets.table.Table`:\n                New table with the passed column added.\n        \"\"\"\n        replay = (\"append_column\", copy.deepcopy(args), copy.deepcopy(kwargs))\n      
  replays = self._append_replay(replay)\n        return MemoryMappedTable(self.table.append_column(*args, **kwargs), self.path, replays)\n\n    def remove_column(self, *args, **kwargs):\n        \"\"\"\n        Create new Table with the indicated column removed.\n\n        Args:\n            i (`int`):\n                Index of column to remove.\n\n        Returns:\n            `datasets.table.Table`:\n                New table without the column.\n        \"\"\"\n        replay = (\"remove_column\", copy.deepcopy(args), copy.deepcopy(kwargs))\n        replays = self._append_replay(replay)\n        return MemoryMappedTable(self.table.remove_column(*args, **kwargs), self.path, replays)\n\n    def set_column(self, *args, **kwargs):\n        \"\"\"\n        Replace column in Table at position.\n\n        Args:\n            i (`int`):\n                Index to place the column at.\n            field_ (`Union[str, pyarrow.Field]`):\n                If a string is passed then the type is deduced from the column\n                data.\n            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):\n                Column data.\n\n        Returns:\n            `datasets.table.Table`:\n                New table with the passed column set.\n        \"\"\"\n        replay = (\"set_column\", copy.deepcopy(args), copy.deepcopy(kwargs))\n        replays = self._append_replay(replay)\n        return MemoryMappedTable(self.table.set_column(*args, **kwargs), self.path, replays)\n\n    def rename_columns(self, *args, **kwargs):\n        \"\"\"\n        Create new table with columns renamed to provided names.\n        \"\"\"\n        replay = (\"rename_columns\", copy.deepcopy(args), copy.deepcopy(kwargs))\n        replays = self._append_replay(replay)\n        return MemoryMappedTable(self.table.rename_columns(*args, **kwargs), self.path, replays)\n\n    def drop(self, *args, **kwargs):\n        \"\"\"\n        Drop one or more columns and return a new table.\n\n        Args:\n   
         columns (`List[str]`):\n                List of field names referencing existing columns.\n\n        Raises:\n            `KeyError` : if any of the passed columns name are not existing.\n\n        Returns:\n            `datasets.table.Table`:\n                New table without the columns.\n        \"\"\"\n        replay = (\"drop\", copy.deepcopy(args), copy.deepcopy(kwargs))\n        replays = self._append_replay(replay)\n        return MemoryMappedTable(self.table.drop(*args, **kwargs), self.path, replays)\n\n    def select(self, *args, **kwargs):\n        \"\"\"\n        Select columns of the table.\n\n        Returns a new table with the specified columns, and metadata preserved.\n\n        Args:\n            columns (:obj:`Union[List[str], List[int]]`):\n                The column names or integer indices to select.\n\n        Returns:\n            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.\n        \"\"\"\n        replay = (\"select\", copy.deepcopy(args), copy.deepcopy(kwargs))\n        replays = self._append_replay(replay)\n        return MemoryMappedTable(self.table.select(*args, **kwargs), self.path, replays)\n\n\n# A ConcatenationTable is the concatenation of several tables.\n# The ``blocks`` attributes stores a list of list of blocks.\n# The first axis concatenates the tables along the axis 0 (it appends rows),\n# while the second axis concatenates tables along the axis 1 (it appends columns).\nTableBlockContainer = TypeVar(\"TableBlockContainer\", TableBlock, list[TableBlock], list[list[TableBlock]])\n\n\nclass ConcatenationTable(Table):\n    \"\"\"\n    The table comes from the concatenation of several tables called blocks.\n    It enables concatenation on both axis 0 (append rows) and axis 1 (append columns).\n\n    The underlying tables are called \"blocks\" and can be either `InMemoryTable`\n    or `MemoryMappedTable` objects.\n    This allows to combine tables that come from memory or 
that are memory mapped.\n    When a `ConcatenationTable` is pickled, then each block is pickled:\n    - the `InMemoryTable` objects are pickled by copying all the data in memory.\n    - the `MemoryMappedTable` objects are pickled without copying the data into memory.\n    Instead, only the path to the memory mapped arrow file is pickled, as well as the list\n    of transforms to \"replay\" when reloading the table from the disk.\n\n    Its implementation requires storing each block separately.\n    The `blocks` attribute stores a list of lists of blocks.\n    The first axis concatenates the tables along the axis 0 (it appends rows),\n    while the second axis concatenates tables along the axis 1 (it appends columns).\n\n    If some columns are missing when concatenating on axis 0, they are filled with null values.\n    This is done using `pyarrow.concat_tables(tables, promote_options=\"default\")`.\n\n    You can access the fully combined table by accessing the `ConcatenationTable.table` attribute,\n    and the blocks by accessing the `ConcatenationTable.blocks` attribute.\n    \"\"\"\n\n    def __init__(self, table: pa.Table, blocks: list[list[TableBlock]]):\n        super().__init__(table)\n        self.blocks = blocks\n        # Check that all the blocks have the right type.\n        # Only InMemoryTable and MemoryMappedTable are allowed.\n        for subtables in blocks:\n            for subtable in subtables:\n                if not isinstance(subtable, TableBlock):\n                    raise TypeError(\n                        \"The blocks of a ConcatenationTable must be InMemoryTable or MemoryMappedTable objects\"\n                        f\", but got {_short_str(subtable)}.\"\n                    )\n\n    def __getstate__(self):\n        return {\"blocks\": self.blocks, \"schema\": self.table.schema}\n\n    def __setstate__(self, state):\n        blocks = state[\"blocks\"]\n        schema = state[\"schema\"]\n        table = 
self._concat_blocks_horizontally_and_vertically(blocks)\n        if schema is not None and table.schema != schema:\n            # We fix the columns by concatenating with an empty table with the right columns\n            empty_table = pa.Table.from_batches([], schema=schema)\n            # We set promote_options=\"default\" to fill missing columns with null values\n            table = pa.concat_tables([table, empty_table], promote_options=\"default\")\n        ConcatenationTable.__init__(self, table, blocks=blocks)\n\n    @staticmethod\n    def _concat_blocks(blocks: list[Union[TableBlock, pa.Table]], axis: int = 0) -> pa.Table:\n        pa_tables = [table.table if hasattr(table, \"table\") else table for table in blocks]\n        if axis == 0:\n            # We set promote_options=\"default\" to fill missing columns with null values\n            return pa.concat_tables(pa_tables, promote_options=\"default\")\n        elif axis == 1:\n            for i, table in enumerate(pa_tables):\n                if i == 0:\n                    pa_table = table\n                else:\n                    for name, col in zip(table.column_names, table.columns):\n                        pa_table = pa_table.append_column(name, col)\n            return pa_table\n        else:\n            raise ValueError(\"'axis' must be either 0 or 1\")\n\n    @classmethod\n    def _concat_blocks_horizontally_and_vertically(cls, blocks: list[list[TableBlock]]) -> pa.Table:\n        pa_tables_to_concat_vertically = []\n        for i, tables in enumerate(blocks):\n            if not tables:\n                continue\n            pa_table_horizontally_concatenated = cls._concat_blocks(tables, axis=1)\n            pa_tables_to_concat_vertically.append(pa_table_horizontally_concatenated)\n        return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)\n\n    @classmethod\n    def _merge_blocks(cls, blocks: TableBlockContainer, axis: Optional[int] = None) -> TableBlockContainer:\n        if 
axis is not None:\n            merged_blocks = []\n            for is_in_memory, block_group in groupby(blocks, key=lambda x: isinstance(x, InMemoryTable)):\n                if is_in_memory:\n                    block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]\n                merged_blocks += list(block_group)\n        else:  # both\n            merged_blocks = [cls._merge_blocks(row_block, axis=1) for row_block in blocks]\n            if all(len(row_block) == 1 for row_block in merged_blocks):\n                merged_blocks = cls._merge_blocks(\n                    [block for row_block in merged_blocks for block in row_block], axis=0\n                )\n        return merged_blocks\n\n    @classmethod\n    def _consolidate_blocks(cls, blocks: TableBlockContainer) -> TableBlockContainer:\n        if isinstance(blocks, TableBlock):\n            return blocks\n        elif isinstance(blocks[0], TableBlock):\n            return cls._merge_blocks(blocks, axis=0)\n        else:\n            return cls._merge_blocks(blocks)\n\n    @classmethod\n    def from_blocks(cls, blocks: TableBlockContainer) -> \"ConcatenationTable\":\n        blocks = cls._consolidate_blocks(blocks)\n        if isinstance(blocks, TableBlock):\n            table = blocks\n            return cls(table.table, [[table]])\n        elif isinstance(blocks[0], TableBlock):\n            table = cls._concat_blocks(blocks, axis=0)\n            blocks = [[t] for t in blocks]\n            return cls(table, blocks)\n        else:\n            table = cls._concat_blocks_horizontally_and_vertically(blocks)\n            return cls(table, blocks)\n\n    @classmethod\n    def from_tables(cls, tables: list[Union[pa.Table, Table]], axis: int = 0) -> \"ConcatenationTable\":\n        \"\"\"Create `ConcatenationTable` from list of tables.\n\n        Args:\n            tables (list of `Table` or list of `pyarrow.Table`):\n                List of tables.\n            axis (`{0, 1}`, defaults 
to `0`, meaning over rows):\n                Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns\n                (horizontally).\n\n                <Added version=\"1.6.0\"/>\n        \"\"\"\n\n        def to_blocks(table: Union[pa.Table, Table]) -> list[list[TableBlock]]:\n            if isinstance(table, pa.Table):\n                return [[InMemoryTable(table)]]\n            elif isinstance(table, ConcatenationTable):\n                return copy.deepcopy(table.blocks)\n            else:\n                return [[table]]\n\n        def _slice_row_block(row_block: list[TableBlock], length: int) -> tuple[list[TableBlock], list[TableBlock]]:\n            sliced = [table.slice(0, length) for table in row_block]\n            remainder = [table.slice(length, len(row_block[0]) - length) for table in row_block]\n            return sliced, remainder\n\n        def _split_both_like(\n            result: list[list[TableBlock]], blocks: list[list[TableBlock]]\n        ) -> tuple[list[list[TableBlock]], list[list[TableBlock]]]:\n            \"\"\"\n            Make sure each row_block contain the same num_rows to be able to concatenate them on axis=1.\n\n            To do so, we modify both blocks sets to have the same row_blocks boundaries.\n            For example, if `result` has 2 row_blocks of 3 rows and `blocks` has 3 row_blocks of 2 rows,\n            we modify both to have 4 row_blocks of size 2, 1, 1 and 2:\n\n                    [ x   x   x | x   x   x ]\n                +   [ y   y | y   y | y   y ]\n                -----------------------------\n                =   [ x   x | x | x | x   x ]\n                    [ y   y | y | y | y   y ]\n\n            \"\"\"\n            result, blocks = list(result), list(blocks)\n            new_result, new_blocks = [], []\n            while result and blocks:\n                # we slice the longest row block to save two row blocks of same length\n                # and we replace the 
long row block by its remainder if necessary\n                if len(result[0][0]) > len(blocks[0][0]):\n                    new_blocks.append(blocks[0])\n                    sliced, result[0] = _slice_row_block(result[0], len(blocks.pop(0)[0]))\n                    new_result.append(sliced)\n                elif len(result[0][0]) < len(blocks[0][0]):\n                    new_result.append(result[0])\n                    sliced, blocks[0] = _slice_row_block(blocks[0], len(result.pop(0)[0]))\n                    new_blocks.append(sliced)\n                else:\n                    new_result.append(result.pop(0))\n                    new_blocks.append(blocks.pop(0))\n            if result or blocks:\n                raise ValueError(\"Failed to concatenate on axis=1 because tables don't have the same number of rows\")\n            return new_result, new_blocks\n\n        def _extend_blocks(\n            result: list[list[TableBlock]], blocks: list[list[TableBlock]], axis: int = 0\n        ) -> list[list[TableBlock]]:\n            if axis == 0:\n                result.extend(blocks)\n            elif axis == 1:\n                # We make sure each row_block have the same num_rows\n                result, blocks = _split_both_like(result, blocks)\n                for i, row_block in enumerate(blocks):\n                    result[i].extend(row_block)\n            return result\n\n        blocks = to_blocks(tables[0])\n        for table in tables[1:]:\n            table_blocks = to_blocks(table)\n            blocks = _extend_blocks(blocks, table_blocks, axis=axis)\n        return cls.from_blocks(blocks)\n\n    @property\n    def _slices(self):\n        offset = 0\n        for tables in self.blocks:\n            length = len(tables[0])\n            yield (offset, length)\n            offset += length\n\n    def slice(self, offset=0, length=None):\n        \"\"\"\n        Compute zero-copy slice of this Table.\n\n        Args:\n            offset (`int`, defaults to 
`0`):\n                Offset from start of table to slice.\n            length (`int`, defaults to `None`):\n                Length of slice (default is until end of table starting from\n                offset).\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        table = self.table.slice(offset, length=length)\n        length = length if length is not None else self.num_rows - offset\n        blocks = []\n        for tables in self.blocks:\n            n_rows = len(tables[0])\n            if length == 0:\n                break\n            elif n_rows <= offset:\n                offset = offset - n_rows\n            elif n_rows <= offset + length:\n                blocks.append([t.slice(offset) for t in tables])\n                length, offset = length + offset - n_rows, 0\n            else:\n                blocks.append([t.slice(offset, length) for t in tables])\n                length, offset = 0, 0\n        return ConcatenationTable(table, blocks)\n\n    def filter(self, mask, *args, **kwargs):\n        \"\"\"\n        Select records from a Table. See `pyarrow.compute.filter` for full usage.\n        \"\"\"\n        table = self.table.filter(mask, *args, **kwargs)\n        blocks = []\n        for (offset, length), tables in zip(self._slices, self.blocks):\n            submask = mask.slice(offset, length)\n            blocks.append([t.filter(submask, *args, **kwargs) for t in tables])\n        return ConcatenationTable(table, blocks)\n\n    def flatten(self, *args, **kwargs):\n        \"\"\"\n        Flatten this Table.  Each column with a struct type is flattened\n        into one column per struct field.  
Other columns are left unchanged.\n\n        Args:\n            memory_pool (`MemoryPool`, defaults to `None`):\n                For memory allocations, if required, otherwise use default pool.\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        table = table_flatten(self.table, *args, **kwargs)\n        blocks = []\n        for tables in self.blocks:\n            blocks.append([t.flatten(*args, **kwargs) for t in tables])\n        return ConcatenationTable(table, blocks)\n\n    def combine_chunks(self, *args, **kwargs):\n        \"\"\"\n        Make a new table by combining the chunks this table has.\n\n        All the underlying chunks in the `ChunkedArray` of each column are\n        concatenated into zero or one chunk.\n\n        Args:\n            memory_pool (`MemoryPool`, defaults to `None`):\n                For memory allocations, if required, otherwise use default pool.\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        table = self.table.combine_chunks(*args, **kwargs)\n        blocks = []\n        for tables in self.blocks:\n            blocks.append([t.combine_chunks(*args, **kwargs) for t in tables])\n        return ConcatenationTable(table, blocks)\n\n    def cast(self, target_schema, *args, **kwargs):\n        \"\"\"\n        Cast table values to another schema.\n\n        Args:\n            target_schema (`Schema`):\n                Schema to cast to, the names and order of fields must match.\n            safe (`bool`, defaults to `True`):\n                Check for overflows or other unsafe conversions.\n\n        Returns:\n            `datasets.table.Table`\n        \"\"\"\n        from .features import Features\n\n        table = table_cast(self.table, target_schema, *args, **kwargs)\n        target_features = Features.from_arrow_schema(target_schema)\n        blocks = []\n        for subtables in self.blocks:\n            new_tables = []\n            fields = list(target_schema)\n       
     for subtable in subtables:\n                subfields = []\n                for name in subtable.column_names:\n                    subfields.append(fields.pop(next(i for i, field in enumerate(fields) if field.name == name)))\n                subfeatures = Features({subfield.name: target_features[subfield.name] for subfield in subfields})\n                subschema = subfeatures.arrow_schema\n                new_tables.append(subtable.cast(subschema, *args, **kwargs))\n            blocks.append(new_tables)\n        return ConcatenationTable(table, blocks)\n\n    def replace_schema_metadata(self, *args, **kwargs):\n        \"\"\"\n        EXPERIMENTAL: Create shallow copy of table by replacing schema\n        key-value metadata with the indicated new metadata (which may be `None`,\n        which deletes any existing metadata).\n\n        Args:\n            metadata (`dict`, defaults to `None`):\n\n        Returns:\n            `datasets.table.Table`: shallow_copy\n        \"\"\"\n        table = self.table.replace_schema_metadata(*args, **kwargs)\n        blocks = []\n        for tables in self.blocks:\n            blocks.append([t.replace_schema_metadata(*args, **kwargs) for t in tables])\n        return ConcatenationTable(table, self.blocks)\n\n    def add_column(self, *args, **kwargs):\n        \"\"\"\n        Add column to Table at position.\n\n        A new table is returned with the column added, the original table\n        object is left unchanged.\n\n        Args:\n            i (`int`):\n                Index to place the column at.\n            field_ (`Union[str, pyarrow.Field]`):\n                If a string is passed then the type is deduced from the column\n                data.\n            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):\n                Column data.\n\n        Returns:\n            `datasets.table.Table`: New table with the passed column added.\n        \"\"\"\n        raise NotImplementedError()\n\n    def 
append_column(self, *args, **kwargs):\n        \"\"\"\n        Append column at end of columns.\n\n        Args:\n            field_ (`Union[str, pyarrow.Field]`):\n                If a string is passed then the type is deduced from the column\n                data.\n            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):\n                Column data.\n\n        Returns:\n            `datasets.table.Table`:\n                New table with the passed column added.\n        \"\"\"\n        raise NotImplementedError()\n\n    def remove_column(self, i, *args, **kwargs):\n        \"\"\"\n        Create new Table with the indicated column removed.\n\n        Args:\n            i (`int`):\n                Index of column to remove.\n\n        Returns:\n            `datasets.table.Table`:\n                New table without the column.\n        \"\"\"\n        table = self.table.remove_column(i, *args, **kwargs)\n        name = self.table.column_names[i]\n        blocks = []\n        for tables in self.blocks:\n            blocks.append(\n                [\n                    t.remove_column(t.column_names.index(name), *args, **kwargs) if name in t.column_names else t\n                    for t in tables\n                ]\n            )\n        return ConcatenationTable(table, blocks)\n\n    def set_column(self, *args, **kwargs):\n        \"\"\"\n        Replace column in Table at position.\n\n        Args:\n            i (`int`):\n                Index to place the column at.\n            field_ (`Union[str, pyarrow.Field]`):\n                If a string is passed then the type is deduced from the column\n                data.\n            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):\n                Column data.\n\n        Returns:\n            `datasets.table.Table`:\n                New table with the passed column set.\n        \"\"\"\n        raise NotImplementedError()\n\n    def rename_columns(self, names, *args, **kwargs):\n        \"\"\"\n    
    Create new table with columns renamed to provided names.\n        \"\"\"\n        table = self.table.rename_columns(names, *args, **kwargs)\n        names = dict(zip(self.table.column_names, names))\n        blocks = []\n        for tables in self.blocks:\n            blocks.append(\n                [t.rename_columns([names[name] for name in t.column_names], *args, **kwargs) for t in tables]\n            )\n        return ConcatenationTable(table, blocks)\n\n    def drop(self, columns, *args, **kwargs):\n        \"\"\"\n        Drop one or more columns and return a new table.\n\n        Args:\n            columns (`List[str]`):\n                List of field names referencing existing columns.\n\n        Raises:\n            `KeyError` : if any of the passed columns name are not existing.\n\n        Returns:\n            `datasets.table.Table`:\n                New table without the columns.\n        \"\"\"\n        table = self.table.drop(columns, *args, **kwargs)\n        blocks = []\n        for tables in self.blocks:\n            blocks.append([t.drop([c for c in columns if c in t.column_names], *args, **kwargs) for t in tables])\n        return ConcatenationTable(table, blocks)\n\n    def select(self, columns, *args, **kwargs):\n        \"\"\"\n        Select columns of the table.\n\n        Returns a new table with the specified columns, and metadata preserved.\n\n        Args:\n            columns (:obj:`Union[List[str], List[int]]`):\n                The column names or integer indices to select.\n\n        Returns:\n            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.\n        \"\"\"\n        table = self.table.select(columns, *args, **kwargs)\n        blocks = []\n        for tables in self.blocks:\n            blocks.append([t.select([c for c in columns if c in t.column_names], *args, **kwargs) for t in tables])\n        return ConcatenationTable(table, blocks)\n\n\ndef concat_tables(tables: 
list[Table], axis: int = 0) -> Table:\n    \"\"\"\n    Concatenate tables.\n\n    Args:\n        tables (list of `Table`):\n            List of tables to be concatenated.\n        axis (`{0, 1}`, defaults to `0`, meaning over rows):\n            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns\n            (horizontally).\n\n            <Added version=\"1.6.0\"/>\n    Returns:\n        `datasets.table.Table`:\n            If the number of input tables is > 1, then the returned table is a `datasets.table.ConcatenationTable`.\n            Otherwise if there's only one table, it is returned as is.\n    \"\"\"\n    tables = list(tables)\n    if len(tables) == 1:\n        return tables[0]\n    return ConcatenationTable.from_tables(tables, axis=axis)\n\n\ndef list_table_cache_files(table: Table) -> list[str]:\n    \"\"\"\n    Get the cache files that are loaded by the table.\n    Cache file are used when parts of the table come from the disk via memory mapping.\n\n    Returns:\n        `List[str]`:\n            A list of paths to the cache files loaded by the table.\n    \"\"\"\n    if isinstance(table, ConcatenationTable):\n        cache_files = []\n        for subtables in table.blocks:\n            for subtable in subtables:\n                cache_files += list_table_cache_files(subtable)\n        return cache_files\n    elif isinstance(table, MemoryMappedTable):\n        return [table.path]\n    else:\n        return []\n\n\ndef _wrap_for_chunked_arrays(func):\n    \"\"\"Apply the function on each chunk of a `pyarrow.ChunkedArray`, or on the array directly\"\"\"\n\n    def wrapper(array, *args, **kwargs):\n        if isinstance(array, pa.ChunkedArray):\n            return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])\n        else:\n            return func(array, *args, **kwargs)\n\n    return wrapper\n\n\ndef _are_list_values_of_length(array: pa.ListArray, length: int) -> bool:\n    \"\"\"Check 
if all the sub-lists of a `pa.ListArray` have the specified length.\"\"\"\n    return pc.all(pc.equal(array.value_lengths(), length)).as_py() or array.null_count == len(array)\n\n\ndef _combine_list_array_offsets_with_mask(array: pa.ListArray) -> pa.Array:\n    \"\"\"Add the null bitmap to the offsets of a `pa.ListArray`.\"\"\"\n    offsets = array.offsets\n    if array.null_count > 0:\n        offsets = pa.concat_arrays(\n            [\n                pc.replace_with_mask(offsets[:-1], array.is_null(), pa.nulls(len(array), pa.int32())),\n                offsets[-1:],\n            ]\n        )\n    return offsets\n\n\ndef _storage_type(type: pa.DataType) -> pa.DataType:\n    \"\"\"Convert a (possibly nested) `pa.ExtensionType` to its storage type.\"\"\"\n    if isinstance(type, pa.ExtensionType):\n        return _storage_type(type.storage_type)\n    elif isinstance(type, pa.StructType):\n        return pa.struct([pa.field(field.name, _storage_type(field.type)) for field in type])\n    elif isinstance(type, pa.ListType):\n        return pa.list_(_storage_type(type.value_type))\n    elif isinstance(type, pa.FixedSizeListType):\n        return pa.list_(_storage_type(type.value_type), type.list_size)\n    return type\n\n\ndef _short_str(value: Any) -> str:\n    out = str(value)\n    if len(out) > 3000:\n        out = out[:1500] + \"\\n...\\n\" + out[-1500:]\n    return out\n\n\n@_wrap_for_chunked_arrays\ndef array_cast(\n    array: pa.Array, pa_type: pa.DataType, allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True\n) -> Union[pa.Array, pa.FixedSizeListArray, pa.ListArray, pa.StructArray, pa.ExtensionArray]:\n    \"\"\"Improved version of `pa.Array.cast`\n\n    It supports casting `pa.StructArray` objects to re-order the fields.\n    It also let you control certain aspects of the casting, e.g. 
whether\n    to disable casting primitives (`booleans`, `floats` or `ints`) or\n    disable casting decimals to strings.\n\n    Args:\n        array (`pa.Array`):\n            PyArrow array to cast\n        pa_type (`pa.DataType`):\n            Target PyArrow type\n        allow_primitive_to_str (`bool`, defaults to `True`):\n            Whether to allow casting primitives to strings.\n            Defaults to `True`.\n        allow_decimal_to_str (`bool`, defaults to `True`):\n            Whether to allow casting decimals to strings.\n            Defaults to `True`.\n\n    Raises:\n        `pa.ArrowInvalidError`: if the arrow data casting fails\n        `TypeError`: if the target type is not supported according, e.g.\n\n            - if a field is missing\n            - if casting from primitives to strings and `allow_primitive_to_str` is `False`\n            - if casting from decimals to strings and `allow_decimal_to_str` is `False`\n\n    Returns:\n        `List[pyarrow.Array]`: the casted array\n    \"\"\"\n    _c = partial(array_cast, allow_primitive_to_str=allow_primitive_to_str, allow_decimal_to_str=allow_decimal_to_str)\n    if isinstance(array, pa.ExtensionArray):\n        array = array.storage\n    if isinstance(pa_type, pa.ExtensionType):\n        return pa_type.wrap_array(_c(array, pa_type.storage_type))\n    elif array.type == pa_type:\n        return array\n    elif pa.types.is_struct(array.type):\n        if pa.types.is_struct(pa_type) and ({field.name for field in pa_type} == {field.name for field in array.type}):\n            if array.type.num_fields == 0:\n                return array\n            arrays = [_c(array.field(field.name), field.type) for field in pa_type]\n            return pa.StructArray.from_arrays(arrays, fields=list(pa_type), mask=array.is_null())\n    elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):\n        if pa.types.is_fixed_size_list(pa_type):\n            if _are_list_values_of_length(array, 
pa_type.list_size):\n                if array.null_count > 0:\n                    # Ensure each null value in the array translates to [null] * pa_type.list_size in the array's values array\n                    array_type = array.type\n                    storage_type = _storage_type(array_type)\n                    if array_type != storage_type:\n                        # Temporarily convert to the storage type to support extension types in the slice operation\n                        array = _c(array, storage_type)\n                        array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)\n                        array = _c(array, array_type)\n                    else:\n                        array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)\n                    array_values = array.values\n                    return pa.FixedSizeListArray.from_arrays(\n                        _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()\n                    )\n                else:\n                    array_values = array.values[\n                        array.offset * pa_type.list_size : (array.offset + len(array)) * pa_type.list_size\n                    ]\n                    return pa.FixedSizeListArray.from_arrays(_c(array_values, pa_type.value_type), pa_type.list_size)\n        elif pa.types.is_list(pa_type):\n            # Merge offsets with the null bitmap to avoid the \"Null bitmap with offsets slice not supported\" ArrowNotImplementedError\n            array_offsets = _combine_list_array_offsets_with_mask(array)\n            return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))\n        elif pa.types.is_large_list(pa_type):\n            # Merge offsets with the null bitmap to avoid the \"Null bitmap with offsets slice not supported\" ArrowNotImplementedError\n            array_offsets = _combine_list_array_offsets_with_mask(array)\n            
return pa.LargeListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))\n    elif pa.types.is_fixed_size_list(array.type):\n        if pa.types.is_fixed_size_list(pa_type):\n            if pa_type.list_size == array.type.list_size:\n                array_values = array.values[\n                    array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size\n                ]\n                return pa.FixedSizeListArray.from_arrays(\n                    _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()\n                )\n        elif pa.types.is_list(pa_type):\n            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size\n            return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null())\n        elif pa.types.is_large_list(pa_type):\n            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size\n            return pa.LargeListArray.from_arrays(\n                array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null()\n            )\n    else:\n        if pa.types.is_string(pa_type):\n            if not allow_primitive_to_str and pa.types.is_primitive(array.type):\n                raise TypeError(\n                    f\"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} \"\n                    f\"since allow_primitive_to_str is set to {allow_primitive_to_str} \"\n                )\n            if not allow_decimal_to_str and pa.types.is_decimal(array.type):\n                raise TypeError(\n                    f\"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} \"\n                    f\"and allow_decimal_to_str is set to {allow_decimal_to_str}\"\n                )\n        if pa.types.is_null(pa_type) and not pa.types.is_null(array.type):\n            raise TypeError(f\"Couldn't cast array of type 
{_short_str(array.type)} to {_short_str(pa_type)}\")\n        return array.cast(pa_type)\n    raise TypeError(f\"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}\")\n\n\n@_wrap_for_chunked_arrays\ndef cast_array_to_feature(\n    array: pa.Array, feature: \"FeatureType\", allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True\n) -> pa.Array:\n    \"\"\"Cast an array to the arrow type that corresponds to the requested feature type.\n    For custom features like [`Audio`] or [`Image`], it takes into account the \"cast_storage\" methods\n    they defined to enable casting from other arrow types.\n\n    Args:\n        array (`pa.Array`):\n            The PyArrow array to cast.\n        feature (`datasets.features.FeatureType`):\n            The target feature type.\n        allow_primitive_to_str (`bool`, defaults to `True`):\n            Whether to allow casting primitives to strings.\n            Defaults to `True`.\n        allow_decimal_to_str (`bool`, defaults to `True`):\n            Whether to allow casting decimals to strings.\n            Defaults to `True`.\n\n    Raises:\n        `pa.ArrowInvalidError`: if the arrow data casting fails\n        `TypeError`: if the target type is not supported according, e.g.\n\n            - if a field is missing\n            - if casting from primitives and `allow_primitive_to_str` is `False`\n            - if casting from decimals and `allow_decimal_to_str` is `False`\n\n    Returns:\n        array (`pyarrow.Array`): the casted array\n    \"\"\"\n    from .features.features import LargeList, List, get_nested_type\n\n    _c = partial(\n        cast_array_to_feature,\n        allow_primitive_to_str=allow_primitive_to_str,\n        allow_decimal_to_str=allow_decimal_to_str,\n    )\n\n    if isinstance(array, pa.ExtensionArray):\n        array = array.storage\n    if hasattr(feature, \"cast_storage\"):\n        return feature.cast_storage(array)\n\n    if 
pa.types.is_struct(array.type):\n        # feature must be a dict\n        if isinstance(feature, dict) and (array_fields := {field.name for field in array.type}) <= set(feature):\n            null_array = pa.array([None] * len(array))\n            arrays = [\n                _c(array.field(name) if name in array_fields else null_array, subfeature)\n                for name, subfeature in feature.items()\n            ]\n            return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())\n    elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type):\n        # feature must be either List(subfeature) or LargeList(subfeature)\n        if isinstance(feature, LargeList):\n            casted_array_values = _c(array.values, feature.feature)\n            if pa.types.is_large_list(array.type) and casted_array_values.type == array.values.type:\n                # Both array and feature have equal large_list type and values (within the list) type\n                return array\n            else:\n                # Merge offsets with the null bitmap to avoid the \"Null bitmap with offsets slice not supported\" ArrowNotImplementedError\n                array_offsets = _combine_list_array_offsets_with_mask(array)\n                return pa.LargeListArray.from_arrays(array_offsets, casted_array_values)\n        elif isinstance(feature, List):\n            if feature.length > -1:\n                if _are_list_values_of_length(array, feature.length):\n                    if array.null_count > 0:\n                        # Ensure each null value in the array translates to [null] * pa_type.list_size in the array's values array\n                        array_type = array.type\n                        storage_type = _storage_type(array_type)\n                        if array_type != storage_type:\n                            # Temporarily convert to the storage type to support extension types in the slice operation\n                            
array = array_cast(\n                                array,\n                                storage_type,\n                                allow_primitive_to_str=allow_primitive_to_str,\n                                allow_decimal_to_str=allow_decimal_to_str,\n                            )\n                            array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)\n                            array = array_cast(\n                                array,\n                                array_type,\n                                allow_primitive_to_str=allow_primitive_to_str,\n                                allow_decimal_to_str=allow_decimal_to_str,\n                            )\n                        else:\n                            array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)\n                        array_values = array.values\n                        casted_array_values = _c(array_values, feature.feature)\n                        return pa.FixedSizeListArray.from_arrays(\n                            casted_array_values, feature.length, mask=array.is_null()\n                        )\n                    else:\n                        array_values = array.values[\n                            array.offset * feature.length : (array.offset + len(array)) * feature.length\n                        ]\n                        return pa.FixedSizeListArray.from_arrays(_c(array_values, feature.feature), feature.length)\n            else:\n                casted_array_values = _c(array.values, feature.feature)\n                if pa.types.is_list(array.type) and casted_array_values.type == array.values.type:\n                    # Both array and feature have equal list type and values (within the list) type\n                    return array\n                else:\n                    # Merge offsets with the null bitmap to avoid the \"Null bitmap with offsets slice not supported\" 
ArrowNotImplementedError\n                    array_offsets = _combine_list_array_offsets_with_mask(array)\n                    return pa.ListArray.from_arrays(array_offsets, casted_array_values)\n    elif pa.types.is_fixed_size_list(array.type):\n        # feature must be List(subfeature)\n        if isinstance(feature, LargeList):\n            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size\n            return pa.LargeListArray.from_arrays(\n                array_offsets, _c(array.values, feature.feature), mask=array.is_null()\n            )\n        elif isinstance(feature, List):\n            if feature.length > -1:\n                if feature.length == array.type.list_size:\n                    array_values = array.values[\n                        array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size\n                    ]\n                    casted_array_values = _c(array_values, feature.feature)\n                    return pa.FixedSizeListArray.from_arrays(casted_array_values, feature.length, mask=array.is_null())\n            else:\n                array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size\n                return pa.ListArray.from_arrays(array_offsets, _c(array.values, feature.feature), mask=array.is_null())\n    if pa.types.is_null(array.type):\n        return array_cast(\n            array,\n            get_nested_type(feature),\n            allow_primitive_to_str=allow_primitive_to_str,\n            allow_decimal_to_str=allow_decimal_to_str,\n        )\n    elif not isinstance(feature, (List, LargeList, dict)):\n        return array_cast(\n            array,\n            feature(),\n            allow_primitive_to_str=allow_primitive_to_str,\n            allow_decimal_to_str=allow_decimal_to_str,\n        )\n    raise TypeError(f\"Couldn't cast array of 
type\\n{_short_str(array.type)}\\nto\\n{_short_str(feature)}\")\n\n\n@_wrap_for_chunked_arrays\ndef embed_array_storage(array: pa.Array, feature: \"FeatureType\", token_per_repo_id=None):\n    \"\"\"Embed data into an arrays's storage.\n    For custom features like Audio or Image, it takes into account the \"embed_storage\" methods\n    they define to embed external data (e.g. an image file) into an array.\n\n    <Added version=\"2.4.0\"/>\n\n    Args:\n        array (`pa.Array`):\n            The PyArrow array in which to embed data.\n        feature (`datasets.features.FeatureType`):\n            Array features.\n\n    Raises:\n        `TypeError`: if the target type is not supported according, e.g.\n\n            - if a field is missing\n\n    Returns:\n         array (`pyarrow.Array`): the casted array\n    \"\"\"\n    from .features import LargeList, List\n\n    _e = partial(embed_array_storage, token_per_repo_id=token_per_repo_id)\n\n    if isinstance(array, pa.ExtensionArray):\n        array = array.storage\n    if hasattr(feature, \"embed_storage\"):\n        return feature.embed_storage(array, token_per_repo_id=token_per_repo_id)\n    elif pa.types.is_struct(array.type):\n        # feature must be a dict\n        if isinstance(feature, dict):\n            arrays = [_e(array.field(name), subfeature) for name, subfeature in feature.items()]\n            return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())\n    elif pa.types.is_list(array.type):\n        # feature must be either List(subfeature)\n        # Merge offsets with the null bitmap to avoid the \"Null bitmap with offsets slice not supported\" ArrowNotImplementedError\n        array_offsets = _combine_list_array_offsets_with_mask(array)\n        if isinstance(feature, List) and feature.length == -1:\n            return pa.ListArray.from_arrays(array_offsets, _e(array.values, feature.feature))\n    elif pa.types.is_large_list(array.type):\n        # feature must be 
LargeList(subfeature)\n        # Merge offsets with the null bitmap to avoid the \"Null bitmap with offsets slice not supported\" ArrowNotImplementedError\n        array_offsets = _combine_list_array_offsets_with_mask(array)\n        return pa.LargeListArray.from_arrays(array_offsets, _e(array.values, feature.feature))\n    elif pa.types.is_fixed_size_list(array.type):\n        # feature must be List(subfeature)\n        if isinstance(feature, List) and feature.length > -1:\n            array_values = array.values[\n                array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size\n            ]\n            embedded_array_values = _e(array_values, feature.feature)\n            return pa.FixedSizeListArray.from_arrays(embedded_array_values, feature.length, mask=array.is_null())\n    if not isinstance(feature, (List, LargeList, dict)):\n        return array\n    raise TypeError(f\"Couldn't embed array of type\\n{_short_str(array.type)}\\nwith\\n{_short_str(feature)}\")\n\n\nclass CastError(ValueError):\n    \"\"\"When it's not possible to cast an Arrow table to a specific schema or set of features\"\"\"\n\n    def __init__(self, *args, table_column_names: list[str], requested_column_names: list[str]) -> None:\n        super().__init__(*args)\n        self.table_column_names = table_column_names\n        self.requested_column_names = requested_column_names\n\n    def __reduce__(self):\n        # Fix unpickling: TypeError: __init__() missing 2 required keyword-only arguments: 'table_column_names' and 'requested_column_names'\n        return partial(\n            CastError, table_column_names=self.table_column_names, requested_column_names=self.requested_column_names\n        ), ()\n\n    def details(self):\n        new_columns = set(self.table_column_names) - set(self.requested_column_names)\n        missing_columns = set(self.requested_column_names) - set(self.table_column_names)\n        if new_columns and missing_columns:\n    
        return f\"there are {len(new_columns)} new columns ({_short_str(new_columns)}) and {len(missing_columns)} missing columns ({_short_str(missing_columns)}).\"\n        elif new_columns:\n            return f\"there are {len(new_columns)} new columns ({_short_str(new_columns)})\"\n        else:\n            return f\"there are {len(missing_columns)} missing columns ({_short_str(missing_columns)})\"\n\n\ndef cast_table_to_features(table: pa.Table, features: \"Features\"):\n    \"\"\"Cast a table to the arrow schema that corresponds to the requested features.\n\n    Args:\n        table (`pyarrow.Table`):\n            PyArrow table to cast.\n        features ([`Features`]):\n            Target features.\n\n    Returns:\n        table (`pyarrow.Table`): the casted table\n    \"\"\"\n    if sorted(table.column_names) != sorted(features):\n        raise CastError(\n            f\"Couldn't cast\\n{_short_str(table.schema)}\\nto\\n{_short_str(features)}\\nbecause column names don't match\",\n            table_column_names=table.column_names,\n            requested_column_names=list(features),\n        )\n    arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]\n    return pa.Table.from_arrays(arrays, schema=features.arrow_schema)\n\n\ndef cast_table_to_schema(table: pa.Table, schema: pa.Schema):\n    \"\"\"Cast a table to the arrow schema. 
Different from `cast_table_to_features`, this method can preserve nullability.\n\n    Args:\n        table (`pa.Table`):\n            PyArrow table to cast.\n        features ([`Features`]):\n            Target features.\n\n    Returns:\n        `pa.Table`: the casted table\n    \"\"\"\n    from .features import Features\n\n    features = Features.from_arrow_schema(schema)\n    table_column_names = set(table.column_names)\n    if not table_column_names <= set(schema.names):\n        raise CastError(\n            f\"Couldn't cast\\n{_short_str(table.schema)}\\nto\\n{_short_str(features)}\\nbecause column names don't match\",\n            table_column_names=table.column_names,\n            requested_column_names=list(features),\n        )\n    arrays = [\n        cast_array_to_feature(\n            table[name] if name in table_column_names else pa.array([None] * len(table), type=schema.field(name).type),\n            feature,\n        )\n        for name, feature in features.items()\n    ]\n    return pa.Table.from_arrays(arrays, schema=schema)\n\n\ndef embed_table_storage(table: pa.Table, token_per_repo_id=None):\n    \"\"\"Embed external data into a table's storage.\n\n    <Added version=\"2.4.0\"/>\n\n    Args:\n        table (`pyarrow.Table`):\n            PyArrow table in which to embed data.\n\n    Returns:\n        table (`pyarrow.Table`): the table with embedded data\n    \"\"\"\n    from .features.features import Features, require_storage_embed\n\n    features = Features.from_arrow_schema(table.schema)\n    arrays = [\n        embed_array_storage(table[name], feature, token_per_repo_id=token_per_repo_id)\n        if require_storage_embed(feature)\n        else table[name]\n        for name, feature in features.items()\n    ]\n    return pa.Table.from_arrays(arrays, schema=features.arrow_schema)\n\n\ndef table_cast(table: pa.Table, schema: pa.Schema):\n    \"\"\"Improved version of `pa.Table.cast`.\n\n    It supports casting to feature types stored in the 
schema metadata.\n\n    Args:\n        table (`pyarrow.Table`):\n            PyArrow table to cast.\n        schema (`pyarrow.Schema`):\n            Target PyArrow schema.\n\n    Returns:\n        table (`pyarrow.Table`): the casted table\n    \"\"\"\n    if table.schema != schema:\n        return cast_table_to_schema(table, schema)\n    elif table.schema.metadata != schema.metadata:\n        return table.replace_schema_metadata(schema.metadata)\n    else:\n        return table\n\n\ndef table_flatten(table: pa.Table):\n    \"\"\"Improved version of `pa.Table.flatten`.\n\n    It behaves as `pa.Table.flatten` in a sense it does 1-step flatten of the columns with a struct type into one column per struct field,\n    but updates the metadata and skips decodable features unless the `decode` attribute of these features is set to False.\n\n    Args:\n        table (`pa.Table`):\n            PyArrow table to flatten.\n\n    Returns:\n        `Table`: the flattened table\n    \"\"\"\n    from .features import Features\n\n    features = Features.from_arrow_schema(table.schema)\n    if any(hasattr(subfeature, \"flatten\") and subfeature.flatten() == subfeature for subfeature in features.values()):\n        flat_arrays = []\n        flat_column_names = []\n        for field in table.schema:\n            array = table.column(field.name)\n            subfeature = features[field.name]\n            if pa.types.is_struct(field.type) and (\n                not hasattr(subfeature, \"flatten\") or subfeature.flatten() != subfeature\n            ):\n                flat_arrays.extend(array.flatten())\n                flat_column_names.extend([f\"{field.name}.{subfield.name}\" for subfield in field.type])\n            else:\n                flat_arrays.append(array)\n                flat_column_names.append(field.name)\n        flat_table = pa.Table.from_arrays(\n            flat_arrays,\n            names=flat_column_names,\n        )\n    else:\n        flat_table = table.flatten()\n   
 # Preserve complex types in the metadata\n    flat_features = features.flatten(max_depth=2)\n    flat_features = Features({column_name: flat_features[column_name] for column_name in flat_table.column_names})\n    return flat_table.replace_schema_metadata(flat_features.arrow_schema.metadata)\n\n\ndef table_visitor(table: pa.Table, function: Callable[[pa.Array], None]):\n    \"\"\"Visit all arrays in a table and apply a function to them.\n\n    Args:\n        table (`pyarrow.Table`):\n            PyArrow table to visit.\n        function (`Callable[[pa.Array], None]`):\n            Function to apply to each array.\n    \"\"\"\n    from .features import Features, LargeList, List\n\n    features = Features.from_arrow_schema(table.schema)\n\n    def _visit(array, feature):\n        if isinstance(array, pa.ChunkedArray):\n            for chunk in array.chunks:\n                _visit(chunk, feature)\n        else:\n            if isinstance(array, pa.ExtensionArray):\n                array = array.storage\n            function(array, feature)\n            if pa.types.is_struct(array.type) and not hasattr(feature, \"cast_storage\"):\n                for name, subfeature in feature.items():\n                    _visit(array.field(name), subfeature)\n            elif pa.types.is_list(array.type):\n                if isinstance(feature, (LargeList, List)):\n                    _visit(array.values, feature.feature)\n\n    for name, feature in features.items():\n        _visit(table[name], feature)\n\n\ndef table_iter(table: Table, batch_size: int, drop_last_batch=False) -> Iterator[pa.Table]:\n    \"\"\"Iterate over sub-tables of size `batch_size`.\n\n    Args:\n        table (`pyarrow.Table`):\n            PyArrow table to iterate over.\n        batch_size (`int`):\n            Size of each sub-table to yield.\n        drop_last_batch (`bool`, defaults to `False`):\n            Drop the last batch if it is smaller than `batch_size`.\n    \"\"\"\n    chunks_buffer = []\n    
chunks_buffer_size = 0\n    for chunk in table.to_reader(max_chunksize=batch_size):\n        if len(chunk) == 0:\n            continue\n        elif chunks_buffer_size + len(chunk) < batch_size:\n            chunks_buffer.append(chunk)\n            chunks_buffer_size += len(chunk)\n            continue\n        elif chunks_buffer_size + len(chunk) == batch_size:\n            chunks_buffer.append(chunk)\n            yield pa.Table.from_batches(chunks_buffer)\n            chunks_buffer = []\n            chunks_buffer_size = 0\n        else:\n            cropped_chunk_length = batch_size - chunks_buffer_size\n            chunks_buffer.append(chunk.slice(0, cropped_chunk_length))\n            yield pa.Table.from_batches(chunks_buffer)\n            chunks_buffer = [chunk.slice(cropped_chunk_length, len(chunk) - cropped_chunk_length)]\n            chunks_buffer_size = len(chunk) - cropped_chunk_length\n    if not drop_last_batch and chunks_buffer:\n        yield pa.Table.from_batches(chunks_buffer)\n"
  },
  {
    "path": "src/datasets/utils/__init__.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom . import tqdm as _tqdm  # _tqdm is the module\nfrom .experimental import experimental\nfrom .info_utils import VerificationMode\nfrom .logging import disable_progress_bar, enable_progress_bar, is_progress_bar_enabled\nfrom .tqdm import (\n    are_progress_bars_disabled,\n    disable_progress_bars,\n    enable_progress_bars,\n    tqdm,\n)\nfrom .version import Version\n"
  },
  {
    "path": "src/datasets/utils/_dataset_viewer.py",
    "content": "from typing import Any, Optional, Union\n\nfrom huggingface_hub.utils import get_session\n\nfrom .. import config\nfrom ..exceptions import DatasetsError\nfrom .file_utils import (\n    get_authentication_headers_for_url,\n)\nfrom .logging import get_logger\n\n\nlogger = get_logger(__name__)\n\n\nclass DatasetViewerError(DatasetsError):\n    \"\"\"Dataset viewer error.\n\n    Raised when trying to use the dataset viewer HTTP API and when trying to access:\n    - a missing dataset, or\n    - a private/gated dataset and the user is not authenticated.\n    - unavailable /parquet or /info responses\n    \"\"\"\n\n\ndef get_exported_parquet_files(\n    dataset: str, commit_hash: str, token: Optional[Union[str, bool]]\n) -> list[dict[str, Any]]:\n    \"\"\"\n    Get the dataset exported parquet files\n    Docs: https://huggingface.co/docs/datasets-server/parquet\n    \"\"\"\n    dataset_viewer_parquet_url = config.HF_ENDPOINT.replace(\"://\", \"://datasets-server.\") + \"/parquet?dataset=\"\n    try:\n        parquet_data_files_response = get_session().get(\n            url=dataset_viewer_parquet_url + dataset,\n            headers=get_authentication_headers_for_url(config.HF_ENDPOINT + f\"datasets/{dataset}\", token=token),\n            timeout=100.0,\n        )\n        parquet_data_files_response.raise_for_status()\n        if \"X-Revision\" in parquet_data_files_response.headers:\n            if parquet_data_files_response.headers[\"X-Revision\"] == commit_hash or commit_hash is None:\n                parquet_data_files_response_json = parquet_data_files_response.json()\n                if (\n                    parquet_data_files_response_json.get(\"partial\") is False\n                    and not parquet_data_files_response_json.get(\"pending\", True)\n                    and not parquet_data_files_response_json.get(\"failed\", True)\n                    and \"parquet_files\" in parquet_data_files_response_json\n                ):\n                  
  return parquet_data_files_response_json[\"parquet_files\"]\n                else:\n                    logger.debug(f\"Parquet export for {dataset} is not completely ready yet.\")\n            else:\n                logger.debug(\n                    f\"Parquet export for {dataset} is available but outdated (commit_hash='{parquet_data_files_response.headers['X-Revision']}')\"\n                )\n    except Exception as e:  # noqa catch any exception of the dataset viewer API and consider the parquet export doesn't exist\n        logger.debug(f\"No parquet export for {dataset} available ({type(e).__name__}: {e})\")\n    raise DatasetViewerError(\"No exported Parquet files available.\")\n\n\ndef get_exported_dataset_infos(\n    dataset: str, commit_hash: str, token: Optional[Union[str, bool]]\n) -> dict[str, dict[str, Any]]:\n    \"\"\"\n    Get the dataset information, can be useful to get e.g. the dataset features.\n    Docs: https://huggingface.co/docs/datasets-server/info\n    \"\"\"\n    dataset_viewer_info_url = config.HF_ENDPOINT.replace(\"://\", \"://datasets-server.\") + \"/info?dataset=\"\n    try:\n        info_response = get_session().get(\n            url=dataset_viewer_info_url + dataset,\n            headers=get_authentication_headers_for_url(config.HF_ENDPOINT + f\"datasets/{dataset}\", token=token),\n            timeout=100.0,\n        )\n        info_response.raise_for_status()\n        if \"X-Revision\" in info_response.headers:\n            if info_response.headers[\"X-Revision\"] == commit_hash or commit_hash is None:\n                info_response = info_response.json()\n                if (\n                    info_response.get(\"partial\") is False\n                    and not info_response.get(\"pending\", True)\n                    and not info_response.get(\"failed\", True)\n                    and \"dataset_info\" in info_response\n                ):\n                    return info_response[\"dataset_info\"]\n                else:\n    
                logger.debug(f\"Dataset info for {dataset} is not completely ready yet.\")\n            else:\n                logger.debug(\n                    f\"Dataset info for {dataset} is available but outdated (commit_hash='{info_response.headers['X-Revision']}')\"\n                )\n    except Exception as e:  # noqa catch any exception of the dataset viewer API and consider the dataset info doesn't exist\n        logger.debug(f\"No dataset info for {dataset} available ({type(e).__name__}: {e})\")\n    raise DatasetViewerError(\"No exported dataset infos available.\")\n"
  },
  {
    "path": "src/datasets/utils/_dill.py",
    "content": "# Copyright 2023 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Extends `dill` to support pickling more types and produce more consistent dumps.\"\"\"\n\nimport os\nimport sys\nfrom io import BytesIO\nfrom types import CodeType, FunctionType\n\nimport dill\nfrom packaging import version\n\nfrom .. import config\n\n\nclass Pickler(dill.Pickler):\n    dispatch = dill._dill.MetaCatchingDict(dill.Pickler.dispatch.copy())\n    _legacy_no_dict_keys_sorting = False\n\n    def save(self, obj, save_persistent_id=True):\n        obj_type = type(obj)\n        if obj_type not in self.dispatch:\n            if \"regex\" in sys.modules:\n                import regex  # type: ignore\n\n                if obj_type is regex.Pattern:\n                    pklregister(obj_type)(_save_regexPattern)\n            if \"spacy\" in sys.modules:\n                import spacy  # type: ignore\n\n                if issubclass(obj_type, spacy.Language):\n                    pklregister(obj_type)(_save_spacyLanguage)\n            if \"tiktoken\" in sys.modules:\n                import tiktoken  # type: ignore\n\n                if obj_type is tiktoken.Encoding:\n                    pklregister(obj_type)(_save_tiktokenEncoding)\n            if \"torch\" in sys.modules:\n                import torch  # type: ignore\n\n                if issubclass(obj_type, torch.Tensor):\n                    
pklregister(obj_type)(_save_torchTensor)\n\n                if obj_type is torch.Generator:\n                    pklregister(obj_type)(_save_torchGenerator)\n\n                # Unwrap `torch.compile`-ed modules\n                if issubclass(obj_type, torch.nn.Module):\n                    obj = getattr(obj, \"_orig_mod\", obj)\n            if \"transformers\" in sys.modules:\n                import transformers  # type: ignore\n\n                if issubclass(obj_type, transformers.PreTrainedTokenizerBase):\n                    pklregister(obj_type)(_save_transformersPreTrainedTokenizerBase)\n\n        # Unwrap `torch.compile`-ed functions\n        if obj_type is FunctionType:\n            obj = getattr(obj, \"_torchdynamo_orig_callable\", obj)\n        dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)\n\n    def _batch_setitems(self, items, *args, **kwargs):\n        # Ignore the order of keys in a dict\n        try:\n            # Faster, but fails for unorderable elements\n            items = sorted(items)\n        except Exception:  # TypeError, decimal.InvalidOperation, etc.\n            from datasets.fingerprint import Hasher\n\n            items = sorted(items, key=lambda x: Hasher.hash(x[0]))\n        return super()._batch_setitems(items, *args, **kwargs)\n\n    def memoize(self, obj):\n        # Don't memoize strings since two identical strings can have different Python ids\n        if type(obj) is not str:  # noqa: E721\n            dill.Pickler.memoize(self, obj)\n\n\ndef pklregister(t):\n    \"\"\"Register a custom reducer for the type.\"\"\"\n\n    def proxy(func):\n        Pickler.dispatch[t] = func\n        return func\n\n    return proxy\n\n\ndef _is_supported_dill_version():\n    \"\"\"Check if the current dill version is in the supported range.\"\"\"\n    return config.DILL_VERSION.release[:3] in [\n        version.parse(\"0.3.6\").release,\n        version.parse(\"0.3.7\").release,\n        version.parse(\"0.3.8\").release,\n  
      version.parse(\"0.3.9\").release,\n        version.parse(\"0.4.0\").release,\n        version.parse(\"0.4.1\").release,\n    ]\n\n\ndef dump(obj, file):\n    \"\"\"Pickle an object to a file.\"\"\"\n    Pickler(file, recurse=True).dump(obj)\n\n\ndef dumps(obj):\n    \"\"\"Pickle an object to a string.\"\"\"\n    file = BytesIO()\n    dump(obj, file)\n    return file.getvalue()\n\n\nif config.DILL_VERSION < version.parse(\"0.3.6\"):\n\n    def log(pickler, msg):\n        dill._dill.log.info(msg)\n\nelif _is_supported_dill_version():\n\n    def log(pickler, msg):\n        dill._dill.logger.trace(pickler, msg)\n\n\n@pklregister(set)\ndef _save_set(pickler, obj):\n    log(pickler, f\"Se: {obj}\")\n    try:\n        # Faster, but fails for unorderable elements\n        args = (sorted(obj),)\n    except Exception:  # TypeError, decimal.InvalidOperation, etc.\n        from datasets.fingerprint import Hasher\n\n        args = (sorted(obj, key=Hasher.hash),)\n\n    pickler.save_reduce(set, args, obj=obj)\n    log(pickler, \"# Se\")\n\n\ndef _save_regexPattern(pickler, obj):\n    import regex  # type: ignore\n\n    log(pickler, f\"Re: {obj}\")\n    args = (obj.pattern, obj.flags)\n    pickler.save_reduce(regex.compile, args, obj=obj)\n    log(pickler, \"# Re\")\n\n\ndef _save_tiktokenEncoding(pickler, obj):\n    import tiktoken  # type: ignore\n\n    log(pickler, f\"Enc: {obj}\")\n    args = (obj.name, obj._pat_str, obj._mergeable_ranks, obj._special_tokens)\n    pickler.save_reduce(tiktoken.Encoding, args, obj=obj)\n    log(pickler, \"# Enc\")\n\n\ndef _save_torchTensor(pickler, obj):\n    import torch  # type: ignore\n\n    # `torch.from_numpy` is not picklable in `torch>=1.11.0`\n    def create_torchTensor(np_array, dtype=None):\n        tensor = torch.from_numpy(np_array)\n        if dtype:\n            tensor = tensor.type(dtype)\n        return tensor\n\n    log(pickler, f\"To: {obj}\")\n    if obj.dtype == torch.bfloat16:\n        args = 
(obj.detach().to(torch.float).cpu().numpy(), torch.bfloat16)\n    else:\n        args = (obj.detach().cpu().numpy(),)\n    pickler.save_reduce(create_torchTensor, args, obj=obj)\n    log(pickler, \"# To\")\n\n\ndef _save_torchGenerator(pickler, obj):\n    import torch  # type: ignore\n\n    def create_torchGenerator(state):\n        generator = torch.Generator()\n        generator.set_state(state)\n        return generator\n\n    log(pickler, f\"Ge: {obj}\")\n    args = (obj.get_state(),)\n    pickler.save_reduce(create_torchGenerator, args, obj=obj)\n    log(pickler, \"# Ge\")\n\n\ndef _save_spacyLanguage(pickler, obj):\n    import spacy  # type: ignore\n\n    def create_spacyLanguage(config, bytes):\n        lang_cls = spacy.util.get_lang_class(config[\"nlp\"][\"lang\"])\n        lang_inst = lang_cls.from_config(config)\n        return lang_inst.from_bytes(bytes)\n\n    log(pickler, f\"Sp: {obj}\")\n    args = (obj.config, obj.to_bytes())\n    pickler.save_reduce(create_spacyLanguage, args, obj=obj)\n    log(pickler, \"# Sp\")\n\n\ndef _save_transformersPreTrainedTokenizerBase(pickler, obj):\n    log(pickler, f\"Tok: {obj}\")\n    # Ignore the `cache` attribute and make hashing stable.\n    #\n    # Some tokenizers backed by the `tokenizers` library mutate their internal `_tokenizer` state when called\n    # (e.g. by enabling truncation/padding). 
This can change the serialized bytes across runs and make dataset\n    # fingerprints unstable, which prevents `.map(load_from_cache_file=True)` from reusing cache files.\n    #\n    # For hashing/fingerprinting, we temporarily disable backend truncation/padding to avoid these runtime settings\n    # affecting the fingerprint, then restore the original settings.\n    state = obj.__dict__.copy()\n    if \"cache\" in state and isinstance(state[\"cache\"], dict):\n        state[\"cache\"] = {}\n    if \"deprecation_warnings\" in state and isinstance(state[\"deprecation_warnings\"], dict):\n        state[\"deprecation_warnings\"] = {}\n\n    backend_tokenizer = obj.__dict__.get(\"_tokenizer\")\n    truncation = padding = None\n    if (\n        backend_tokenizer is not None\n        and hasattr(backend_tokenizer, \"truncation\")\n        and hasattr(backend_tokenizer, \"padding\")\n    ):\n        truncation = backend_tokenizer.truncation\n        padding = backend_tokenizer.padding\n        try:\n            if truncation is not None and hasattr(backend_tokenizer, \"no_truncation\"):\n                backend_tokenizer.no_truncation()\n            if padding is not None and hasattr(backend_tokenizer, \"no_padding\"):\n                backend_tokenizer.no_padding()\n        except Exception:\n            truncation = padding = None\n\n    try:\n        pickler.save_reduce(type(obj), (), state=state, obj=obj)\n    finally:\n        try:\n            if backend_tokenizer is not None:\n                if truncation is not None and hasattr(backend_tokenizer, \"enable_truncation\"):\n                    backend_tokenizer.enable_truncation(**truncation)\n                if padding is not None and hasattr(backend_tokenizer, \"enable_padding\"):\n                    backend_tokenizer.enable_padding(**padding)\n        except Exception:\n            pass\n    log(pickler, \"# Tok\")\n\n\nif config.DILL_VERSION < version.parse(\"0.3.6\"):\n\n    @pklregister(CodeType)\n    def 
_save_code(pickler, obj):\n        \"\"\"\n        From dill._dill.save_code\n        This is a modified version that removes the origin (filename + line no.)\n        of functions created in notebooks or shells for example.\n        \"\"\"\n        dill._dill.log.info(f\"Co: {obj}\")\n        # The filename of a function is the .py file where it is defined.\n        # Filenames of functions created in notebooks or shells start with '<'\n        # ex: <ipython-input-13-9ed2afe61d25> for ipython, and <stdin> for shell\n        # Filenames of functions created in ipykernel the filename\n        # look like f\"{tempdir}/ipykernel_{id1}/{id2}.py\"\n        # Moreover lambda functions have a special name: '<lambda>'\n        # ex: (lambda x: x).__code__.co_name == \"<lambda>\"  # True\n        #\n        # For the hashing mechanism we ignore where the function has been defined\n        # More specifically:\n        # - we ignore the filename of special functions (filename starts with '<')\n        # - we always ignore the line number\n        # - we only use the base name of the file instead of the whole path,\n        # to be robust in case a script is moved for example.\n        #\n        # Only those two lines are different from the original implementation:\n        co_filename = (\n            \"\"\n            if obj.co_filename.startswith(\"<\")\n            or (\n                len(obj.co_filename.split(os.path.sep)) > 1\n                and obj.co_filename.split(os.path.sep)[-2].startswith(\"ipykernel_\")\n            )\n            or obj.co_name == \"<lambda>\"\n            else os.path.basename(obj.co_filename)\n        )\n        co_firstlineno = 1\n        # The rest is the same as in the original dill implementation (with also a version check for 3.10)\n        if dill._dill.PY3:\n            if hasattr(obj, \"co_posonlyargcount\"):  # python 3.8 (16 args)\n                args = (\n                    obj.co_argcount,\n                    
obj.co_posonlyargcount,\n                    obj.co_kwonlyargcount,\n                    obj.co_nlocals,\n                    obj.co_stacksize,\n                    obj.co_flags,\n                    obj.co_code,\n                    obj.co_consts,\n                    obj.co_names,\n                    obj.co_varnames,\n                    co_filename,\n                    obj.co_name,\n                    co_firstlineno,\n                    obj.co_linetable if sys.version_info >= (3, 10) else obj.co_lnotab,\n                    obj.co_freevars,\n                    obj.co_cellvars,\n                )\n            else:  # python 3.7 (15 args)\n                args = (\n                    obj.co_argcount,\n                    obj.co_kwonlyargcount,\n                    obj.co_nlocals,\n                    obj.co_stacksize,\n                    obj.co_flags,\n                    obj.co_code,\n                    obj.co_consts,\n                    obj.co_names,\n                    obj.co_varnames,\n                    co_filename,\n                    obj.co_name,\n                    co_firstlineno,\n                    obj.co_lnotab,\n                    obj.co_freevars,\n                    obj.co_cellvars,\n                )\n        else:\n            args = (\n                obj.co_argcount,\n                obj.co_nlocals,\n                obj.co_stacksize,\n                obj.co_flags,\n                obj.co_code,\n                obj.co_consts,\n                obj.co_names,\n                obj.co_varnames,\n                co_filename,\n                obj.co_name,\n                co_firstlineno,\n                obj.co_lnotab,\n                obj.co_freevars,\n                obj.co_cellvars,\n            )\n        pickler.save_reduce(CodeType, args, obj=obj)\n        dill._dill.log.info(\"# Co\")\n        return\n\nelif _is_supported_dill_version():\n    # From: https://github.com/uqfoundation/dill/blob/dill-0.3.6/dill/_dill.py#L1104\n    
@pklregister(CodeType)\n    def save_code(pickler, obj):\n        dill._dill.logger.trace(pickler, \"Co: %s\", obj)\n\n        ############################################################################################################\n        # Modification here for huggingface/datasets\n        # The filename of a function is the .py file where it is defined.\n        # Filenames of functions created in notebooks or shells start with '<'\n        # ex: <ipython-input-13-9ed2afe61d25> for ipython, and <stdin> for shell\n        # Filenames of functions created in ipykernel the filename\n        # look like f\"{tempdir}/ipykernel_{id1}/{id2}.py\"\n        # Moreover lambda functions have a special name: '<lambda>'\n        # ex: (lambda x: x).__code__.co_name == \"<lambda>\"  # True\n        #\n        # For the hashing mechanism we ignore where the function has been defined\n        # More specifically:\n        # - we ignore the filename of special functions (filename starts with '<')\n        # - we always ignore the line number\n        # - we only use the base name of the file instead of the whole path,\n        # to be robust in case a script is moved for example.\n        #\n        # Only those two lines are different from the original implementation:\n        co_filename = (\n            \"\"\n            if obj.co_filename.startswith(\"<\")\n            or (\n                len(obj.co_filename.split(os.path.sep)) > 1\n                and obj.co_filename.split(os.path.sep)[-2].startswith(\"ipykernel_\")\n            )\n            or obj.co_name == \"<lambda>\"\n            else os.path.basename(obj.co_filename)\n        )\n        co_firstlineno = 1\n        # The rest is the same as in the original dill implementation, except for the replacements:\n        # - obj.co_filename => co_filename\n        # - obj.co_firstlineno => co_firstlineno\n        # - obj.co_lnotab => obj.co_linetable for >= 3.10 since co_lnotab was deprecated\n        
############################################################################################################\n\n        if hasattr(obj, \"co_endlinetable\"):  # python 3.11a (20 args)\n            args = (\n                obj.co_linetable,  # Modification for huggingface/datasets ############################################\n                obj.co_argcount,\n                obj.co_posonlyargcount,\n                obj.co_kwonlyargcount,\n                obj.co_nlocals,\n                obj.co_stacksize,\n                obj.co_flags,\n                obj.co_code,\n                obj.co_consts,\n                obj.co_names,\n                obj.co_varnames,\n                co_filename,  # Modification for huggingface/datasets ############################################\n                obj.co_name,\n                obj.co_qualname,\n                co_firstlineno,  # Modification for huggingface/datasets #########################################\n                obj.co_linetable,\n                obj.co_endlinetable,\n                obj.co_columntable,\n                obj.co_exceptiontable,\n                obj.co_freevars,\n                obj.co_cellvars,\n            )\n        elif hasattr(obj, \"co_exceptiontable\"):  # python 3.11 (18 args)\n            args = (\n                obj.co_linetable,  # Modification for huggingface/datasets #######################################\n                obj.co_argcount,\n                obj.co_posonlyargcount,\n                obj.co_kwonlyargcount,\n                obj.co_nlocals,\n                obj.co_stacksize,\n                obj.co_flags,\n                obj.co_code,\n                obj.co_consts,\n                obj.co_names,\n                obj.co_varnames,\n                co_filename,  # Modification for huggingface/datasets ############################################\n                obj.co_name,\n                obj.co_qualname,\n                co_firstlineno,  # Modification for 
huggingface/datasets #########################################\n                obj.co_linetable,\n                obj.co_exceptiontable,\n                obj.co_freevars,\n                obj.co_cellvars,\n            )\n        elif hasattr(obj, \"co_linetable\"):  # python 3.10 (16 args)\n            args = (\n                obj.co_linetable,  # Modification for huggingface/datasets #######################################\n                obj.co_argcount,\n                obj.co_posonlyargcount,\n                obj.co_kwonlyargcount,\n                obj.co_nlocals,\n                obj.co_stacksize,\n                obj.co_flags,\n                obj.co_code,\n                obj.co_consts,\n                obj.co_names,\n                obj.co_varnames,\n                co_filename,  # Modification for huggingface/datasets ############################################\n                obj.co_name,\n                co_firstlineno,  # Modification for huggingface/datasets #########################################\n                obj.co_linetable,\n                obj.co_freevars,\n                obj.co_cellvars,\n            )\n        elif hasattr(obj, \"co_posonlyargcount\"):  # python 3.8 (16 args)\n            args = (\n                obj.co_argcount,\n                obj.co_posonlyargcount,\n                obj.co_kwonlyargcount,\n                obj.co_nlocals,\n                obj.co_stacksize,\n                obj.co_flags,\n                obj.co_code,\n                obj.co_consts,\n                obj.co_names,\n                obj.co_varnames,\n                co_filename,  # Modification for huggingface/datasets ############################################\n                obj.co_name,\n                co_firstlineno,  # Modification for huggingface/datasets #########################################\n                obj.co_lnotab,\n                obj.co_freevars,\n                obj.co_cellvars,\n            )\n        else:  # python 3.7 (15 
args)\n            args = (\n                obj.co_argcount,\n                obj.co_kwonlyargcount,\n                obj.co_nlocals,\n                obj.co_stacksize,\n                obj.co_flags,\n                obj.co_code,\n                obj.co_consts,\n                obj.co_names,\n                obj.co_varnames,\n                co_filename,  # Modification for huggingface/datasets ############################################\n                obj.co_name,\n                co_firstlineno,  # Modification for huggingface/datasets #########################################\n                obj.co_lnotab,\n                obj.co_freevars,\n                obj.co_cellvars,\n            )\n\n        pickler.save_reduce(dill._dill._create_code, args, obj=obj)\n        dill._dill.logger.trace(pickler, \"# Co\")\n        return\n"
  },
  {
    "path": "src/datasets/utils/_filelock.py",
    "content": "#!/usr/bin/env python\n# Copyright 2023 The HuggingFace Inc. team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License\n\"\"\"Utilities to handle file locking in `datasets`.\"\"\"\n\nimport os\n\nfrom filelock import FileLock as FileLock_\nfrom filelock import UnixFileLock\nfrom filelock import __version__ as _filelock_version\nfrom packaging import version\n\n\nclass FileLock(FileLock_):\n    \"\"\"\n    A `filelock.FileLock` initializer that handles long paths.\n    It also uses the current umask for lock files.\n    \"\"\"\n\n    MAX_FILENAME_LENGTH = 255\n\n    def __init__(self, lock_file, *args, **kwargs):\n        # The \"mode\" argument is required if we want to use the current umask in filelock >= 3.10\n        # In previous versions it was already using the current umask.\n        if \"mode\" not in kwargs and version.parse(_filelock_version) >= version.parse(\"3.10.0\"):\n            umask = os.umask(0o666)\n            os.umask(umask)\n            kwargs[\"mode\"] = 0o666 & ~umask\n        lock_file = self.hash_filename_if_too_long(lock_file)\n        super().__init__(lock_file, *args, **kwargs)\n\n    @classmethod\n    def hash_filename_if_too_long(cls, path: str) -> str:\n        path = os.path.abspath(os.path.expanduser(path))\n        filename = os.path.basename(path)\n        max_filename_length = cls.MAX_FILENAME_LENGTH\n        if issubclass(cls, UnixFileLock):\n            max_filename_length = min(max_filename_length, os.statvfs(os.path.dirname(path)).f_namemax)\n        if len(filename) > max_filename_length:\n            dirname = os.path.dirname(path)\n            hashed_filename = str(hash(filename))\n            new_filename = (\n                filename[: max_filename_length - len(hashed_filename) - 8] + \"...\" + hashed_filename + \".lock\"\n            )\n            return os.path.join(dirname, new_filename)\n        else:\n            return path\n"
  },
  {
    "path": "src/datasets/utils/deprecation_utils.py",
    "content": "import enum\nimport inspect\nimport warnings\nfrom functools import wraps\nfrom typing import Callable, Optional\n\nfrom .logging import get_logger\n\n\n_emitted_deprecation_warnings = set()\nlogger = get_logger(__name__)\n\n\ndef deprecated(help_message: Optional[str] = None):\n    \"\"\"Decorator to mark a class or a function as deprecated.\n\n    Args:\n        help_message (:obj:`str`, optional): An optional message to guide the user on how to\n            switch to non-deprecated usage of the library.\n    \"\"\"\n\n    def decorator(deprecated_class_or_function: Callable):\n        global _emitted_deprecation_warnings\n\n        if inspect.isclass(deprecated_class_or_function):\n            deprecated_function = deprecated_class_or_function.__init__\n            name = deprecated_class_or_function.__name__\n        else:\n            deprecated_function = deprecated_class_or_function\n            name = deprecated_function.__name__\n            # Support deprecating __init__ class method: class name instead\n            name = name if name != \"__init__\" else deprecated_function.__qualname__.split(\".\")[-2]\n\n        warning_msg = (\n            f\"{name} is deprecated and will be removed in the next major version of datasets.\" + f\" {help_message}\"\n            if help_message\n            else \"\"\n        )\n\n        @wraps(deprecated_function)\n        def wrapper(*args, **kwargs):\n            func_hash = hash(deprecated_function)\n            if func_hash not in _emitted_deprecation_warnings:\n                warnings.warn(warning_msg, category=FutureWarning, stacklevel=2)\n                _emitted_deprecation_warnings.add(func_hash)\n            return deprecated_function(*args, **kwargs)\n\n        wrapper._decorator_name_ = \"deprecated\"\n\n        if inspect.isclass(deprecated_class_or_function):\n            deprecated_class_or_function.__init__ = wrapper\n            return deprecated_class_or_function\n        else:\n      
      return wrapper\n\n    return decorator\n\n\nclass OnAccess(enum.EnumMeta):\n    \"\"\"\n    Enum metaclass that calls a user-specified function whenever a member is accessed.\n    \"\"\"\n\n    def __getattribute__(cls, name):\n        obj = super().__getattribute__(name)\n        if isinstance(obj, enum.Enum) and obj._on_access:\n            obj._on_access()\n        return obj\n\n    def __getitem__(cls, name):\n        member = super().__getitem__(name)\n        if member._on_access:\n            member._on_access()\n        return member\n\n    def __call__(cls, value, names=None, *, module=None, qualname=None, type=None, start=1):\n        obj = super().__call__(value, names, module=module, qualname=qualname, type=type, start=start)\n        if isinstance(obj, enum.Enum) and obj._on_access:\n            obj._on_access()\n        return obj\n\n\nclass DeprecatedEnum(enum.Enum, metaclass=OnAccess):\n    \"\"\"\n    Enum class that calls `deprecate` method whenever a member is accessed.\n    \"\"\"\n\n    def __new__(cls, value):\n        member = object.__new__(cls)\n        member._value_ = value\n        member._on_access = member.deprecate\n        return member\n\n    @property\n    def help_message(self):\n        return \"\"\n\n    def deprecate(self):\n        help_message = f\" {self.help_message}\" if self.help_message else \"\"\n        warnings.warn(\n            f\"'{self.__objclass__.__name__}' is deprecated and will be removed in the next major version of datasets.\"\n            + help_message,\n            FutureWarning,\n            stacklevel=3,\n        )\n"
  },
  {
    "path": "src/datasets/utils/doc_utils.py",
    "content": "from typing import Callable\n\n\ndef is_documented_by(function_with_docstring: Callable):\n    \"\"\"Decorator to share docstrings across common functions.\n\n    Args:\n        function_with_docstring (`Callable`): The function whose docstring is copied to the decorated function.\n    \"\"\"\n\n    def wrapper(target_function):\n        target_function.__doc__ = function_with_docstring.__doc__\n        return target_function\n\n    return wrapper\n"
  },
  {
    "path": "src/datasets/utils/experimental.py",
    "content": "\"\"\"Contains utilities to flag a feature as \"experimental\" in datasets.\"\"\"\n\nimport warnings\nfrom functools import wraps\nfrom typing import Callable\n\n\ndef experimental(fn: Callable) -> Callable:\n    \"\"\"Decorator to flag a feature as experimental.\n\n    An experimental feature triggers a warning when used as it might be subject to breaking changes in the future.\n\n    Args:\n        fn (`Callable`):\n            The function to flag as experimental.\n\n    Returns:\n        `Callable`: The decorated function.\n\n    Example:\n\n    ```python\n    >>> from datasets.utils import experimental\n\n    >>> @experimental\n    ... def my_function():\n    ...     print(\"Hello world!\")\n\n    >>> my_function()\n    UserWarning: 'my_function' is experimental and might be subject to breaking changes in the future.\n    Hello world!\n    ```\n    \"\"\"\n\n    @wraps(fn)\n    def _inner_fn(*args, **kwargs):\n        warnings.warn(\n            (f\"'{fn.__name__}' is experimental and might be subject to breaking changes in the future.\"),\n            UserWarning,\n        )\n        return fn(*args, **kwargs)\n\n    return _inner_fn\n"
  },
  {
    "path": "src/datasets/utils/extract.py",
    "content": "import bz2\nimport gzip\nimport lzma\nimport os\nimport shutil\nimport struct\nimport tarfile\nimport warnings\nimport zipfile\nfrom abc import ABC, abstractmethod\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Optional, Union\n\nfrom .. import config\nfrom ._filelock import FileLock\nfrom .logging import get_logger\n\n\nif TYPE_CHECKING:\n    import py7zr\n    import rarfile\n\n\nlogger = get_logger(__name__)\n\n\nclass ExtractManager:\n    def __init__(self, cache_dir: Optional[str] = None):\n        self.extract_dir = (\n            os.path.join(cache_dir, config.EXTRACTED_DATASETS_DIR) if cache_dir else config.EXTRACTED_DATASETS_PATH\n        )\n        self.extractor = Extractor\n\n    def _get_output_path(self, path: str) -> str:\n        from .file_utils import hash_url_to_filename\n\n        # Path where we extract compressed archives\n        # We extract in the cache dir, and get the extracted path name by hashing the original path\"\n        abs_path = os.path.abspath(path)\n        return os.path.join(self.extract_dir, hash_url_to_filename(abs_path))\n\n    def _do_extract(self, output_path: str, force_extract: bool) -> bool:\n        return force_extract or (\n            not os.path.isfile(output_path) and not (os.path.isdir(output_path) and os.listdir(output_path))\n        )\n\n    def extract(self, input_path: str, force_extract: bool = False) -> str:\n        extractor_format = self.extractor.infer_extractor_format(input_path)\n        if not extractor_format:\n            return input_path\n        output_path = self._get_output_path(input_path)\n        if self._do_extract(output_path, force_extract):\n            self.extractor.extract(input_path, output_path, extractor_format)\n        return output_path\n\n\nclass BaseExtractor(ABC):\n    @classmethod\n    @abstractmethod\n    def is_extractable(cls, path: Union[Path, str], **kwargs) -> bool: ...\n\n    @staticmethod\n    @abstractmethod\n    def 
extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None: ...\n\n\nclass MagicNumberBaseExtractor(BaseExtractor, ABC):\n    magic_numbers: list[bytes] = []\n\n    @staticmethod\n    def read_magic_number(path: Union[Path, str], magic_number_length: int):\n        with open(path, \"rb\") as f:\n            return f.read(magic_number_length)\n\n    @classmethod\n    def is_extractable(cls, path: Union[Path, str], magic_number: bytes = b\"\") -> bool:\n        if not magic_number:\n            magic_number_length = max(len(cls_magic_number) for cls_magic_number in cls.magic_numbers)\n            try:\n                magic_number = cls.read_magic_number(path, magic_number_length)\n            except OSError:\n                return False\n        return any(magic_number.startswith(cls_magic_number) for cls_magic_number in cls.magic_numbers)\n\n\nclass TarExtractor(BaseExtractor):\n    @classmethod\n    def is_extractable(cls, path: Union[Path, str], **kwargs) -> bool:\n        return tarfile.is_tarfile(path)\n\n    @staticmethod\n    def safemembers(members: tarfile.TarFile, output_path: Union[Path, str]):\n        \"\"\"\n        Fix for CVE-2007-4559\n        Desc:\n            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile\n            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. 
(dot dot)\n            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.\n        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559\n        From: https://stackoverflow.com/a/10077309\n        \"\"\"\n\n        def resolved(path: Union[Path, str]) -> str:\n            return os.path.realpath(os.path.abspath(path))\n\n        def badpath(path: str, base: str) -> bool:\n            # joinpath will ignore base if path is absolute\n            return not resolved(os.path.join(base, path)).startswith(base)\n\n        def badlink(info: tarfile.TarInfo, base: str) -> bool:\n            # Links are interpreted relative to the directory containing the link\n            tip = resolved(os.path.join(base, os.path.dirname(info.name)))\n            return badpath(info.linkname, base=tip)\n\n        base = resolved(output_path)\n\n        for finfo in members:\n            if badpath(finfo.name, base):\n                logger.error(f\"Extraction of {finfo.name} is blocked (illegal path)\")\n            elif finfo.issym() and badlink(finfo, base):\n                logger.error(f\"Extraction of {finfo.name} is blocked: Symlink to {finfo.linkname}\")\n            elif finfo.islnk() and badlink(finfo, base):\n                logger.error(f\"Extraction of {finfo.name} is blocked: Hard link to {finfo.linkname}\")\n            else:\n                yield finfo\n\n    @staticmethod\n    def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None:\n        os.makedirs(output_path, exist_ok=True)\n        tar_file = tarfile.open(input_path)\n        tar_file.extractall(output_path, members=TarExtractor.safemembers(tar_file, output_path))\n        tar_file.close()\n\n\nclass GzipExtractor(MagicNumberBaseExtractor):\n    magic_numbers = [b\"\\x1f\\x8b\"]\n\n    @staticmethod\n    def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None:\n        with gzip.open(input_path, \"rb\") as gzip_file:\n            
with open(output_path, \"wb\") as extracted_file:\n                shutil.copyfileobj(gzip_file, extracted_file)\n\n\nclass ZipExtractor(MagicNumberBaseExtractor):\n    magic_numbers = [\n        b\"PK\\x03\\x04\",\n        b\"PK\\x05\\x06\",  # empty archive\n        b\"PK\\x07\\x08\",  # spanned archive\n    ]\n\n    @classmethod\n    def is_extractable(cls, path: Union[Path, str], magic_number: bytes = b\"\") -> bool:\n        if super().is_extractable(path, magic_number=magic_number):\n            return True\n        try:\n            # Alternative version of zipfile.is_zipfile that has less false positives, but misses executable zip archives.\n            # From: https://github.com/python/cpython/pull/5053\n            from zipfile import (\n                _CD_SIGNATURE,\n                _ECD_DISK_NUMBER,\n                _ECD_DISK_START,\n                _ECD_ENTRIES_TOTAL,\n                _ECD_OFFSET,\n                _ECD_SIZE,\n                _EndRecData,\n                sizeCentralDir,\n                stringCentralDir,\n                structCentralDir,\n            )\n\n            with open(path, \"rb\") as fp:\n                endrec = _EndRecData(fp)\n                if endrec:\n                    if endrec[_ECD_ENTRIES_TOTAL] == 0 and endrec[_ECD_SIZE] == 0 and endrec[_ECD_OFFSET] == 0:\n                        return True  # Empty zipfiles are still zipfiles\n                    elif endrec[_ECD_DISK_NUMBER] == endrec[_ECD_DISK_START]:\n                        fp.seek(endrec[_ECD_OFFSET])  # Central directory is on the same disk\n                        if fp.tell() == endrec[_ECD_OFFSET] and endrec[_ECD_SIZE] >= sizeCentralDir:\n                            data = fp.read(sizeCentralDir)  # CD is where we expect it to be\n                            if len(data) == sizeCentralDir:\n                                centdir = struct.unpack(structCentralDir, data)  # CD is the right size\n                                if centdir[_CD_SIGNATURE] 
== stringCentralDir:\n                                    return True  # First central directory entry  has correct magic number\n            return False\n        except Exception:  # catch all errors in case future python versions change the zipfile internals\n            return False\n\n    @staticmethod\n    def safemembers(members: list[zipfile.ZipInfo], output_path: Union[Path, str]):\n        \"\"\"\n        Fix for CVE-2007-4559\n        Desc:\n            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile\n            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)\n            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.\n        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559\n        From: https://stackoverflow.com/a/10077309\n\n        This additional mitigation is applied for zipfile as well.\n        \"\"\"\n\n        def resolved(path: Union[Path, str]) -> str:\n            return os.path.realpath(os.path.abspath(path))\n\n        def badpath(path: str, base: str) -> bool:\n            # joinpath will ignore base if path is absolute\n            return not resolved(os.path.join(base, path)).startswith(base)\n\n        base = resolved(output_path)\n\n        for finfo in members:\n            if badpath(finfo.filename, base):\n                logger.error(f\"Extraction of {finfo.filename} is blocked (illegal path)\")\n            # zipfile doesn't support symlinks\n            # elif finfo.is_symlink and badlink(finfo, base):\n            #     logger.error(f\"Extraction of {finfo.name} is blocked: Symlink to {finfo.linkname}\")\n            else:\n                yield finfo\n\n    @staticmethod\n    def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None:\n        os.makedirs(output_path, exist_ok=True)\n        with zipfile.ZipFile(input_path, \"r\") as zip_file:\n            
zip_file.extractall(output_path, members=ZipExtractor.safemembers(zip_file.filelist, output_path))\n            zip_file.close()\n\n\nclass XzExtractor(MagicNumberBaseExtractor):\n    magic_numbers = [b\"\\xfd\\x37\\x7a\\x58\\x5a\\x00\"]\n\n    @staticmethod\n    def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None:\n        with lzma.open(input_path) as compressed_file:\n            with open(output_path, \"wb\") as extracted_file:\n                shutil.copyfileobj(compressed_file, extracted_file)\n\n\nclass RarExtractor(MagicNumberBaseExtractor):\n    magic_numbers = [b\"Rar!\\x1a\\x07\\x00\", b\"Rar!\\x1a\\x07\\x01\\x00\"]  # RAR_ID  # RAR5_ID\n\n    @staticmethod\n    def safemembers(members: list[\"rarfile.RarInfo\"], output_path: Union[Path, str]):\n        \"\"\"\n        Fix for CVE-2007-4559\n        Desc:\n            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile\n            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. 
(dot dot)\n            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.\n        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559\n        From: https://stackoverflow.com/a/10077309\n\n        This additional mitigation is applied for rarfile as well.\n        \"\"\"\n\n        def resolved(path: Union[Path, str]) -> str:\n            return os.path.realpath(os.path.abspath(path))\n\n        def badpath(path: str, base: str) -> bool:\n            # joinpath will ignore base if path is absolute\n            return not resolved(os.path.join(base, path)).startswith(base)\n\n        def badlink(info: \"rarfile.RarInfo\", base: str) -> bool:\n            # Links are interpreted relative to the directory containing the link\n            tip = resolved(os.path.join(base, os.path.dirname(info.filename)))\n            redir_type, redir_flags, link_name = info.file_redir\n            return badpath(link_name, base=tip)\n\n        base = resolved(output_path)\n\n        for finfo in members:\n            if badpath(finfo.filename, base):\n                logger.error(f\"Extraction of {finfo.filename} is blocked (illegal path)\")\n            elif finfo.is_symlink() and badlink(finfo, base):\n                logger.error(f\"Extraction of {finfo.filename} is blocked: Symlink to {finfo.file_redir}\")\n            else:\n                yield finfo\n\n    @staticmethod\n    def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None:\n        if not config.RARFILE_AVAILABLE:\n            raise ImportError(\"Please pip install rarfile\")\n        import rarfile\n\n        os.makedirs(output_path, exist_ok=True)\n        rf = rarfile.RarFile(input_path)\n        rf.extractall(output_path, members=RarExtractor.safemembers(rf.infolist(), output_path))\n        rf.close()\n\n\nclass ZstdExtractor(MagicNumberBaseExtractor):\n    magic_numbers = [b\"\\x28\\xb5\\x2f\\xfd\"]\n\n    @staticmethod\n    def extract(input_path: 
Union[Path, str], output_path: Union[Path, str]) -> None:\n        if not config.ZSTANDARD_AVAILABLE:\n            raise ImportError(\"Please pip install zstandard\")\n        import zstandard as zstd\n\n        dctx = zstd.ZstdDecompressor()\n        with open(input_path, \"rb\") as ifh, open(output_path, \"wb\") as ofh:\n            dctx.copy_stream(ifh, ofh)\n\n\nclass Bzip2Extractor(MagicNumberBaseExtractor):\n    magic_numbers = [b\"\\x42\\x5a\\x68\"]\n\n    @staticmethod\n    def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None:\n        with bz2.open(input_path, \"rb\") as compressed_file:\n            with open(output_path, \"wb\") as extracted_file:\n                shutil.copyfileobj(compressed_file, extracted_file)\n\n\nclass SevenZipExtractor(MagicNumberBaseExtractor):\n    magic_numbers = [b\"\\x37\\x7a\\xbc\\xaf\\x27\\x1c\"]\n\n    @staticmethod\n    def safemembers(members: list[\"py7zr.FileInfo\"], output_path: Union[Path, str]):\n        \"\"\"\n        Fix for CVE-2007-4559\n        Desc:\n            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile\n            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. 
(dot dot)\n            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.\n        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559\n        From: https://stackoverflow.com/a/10077309\n\n        This additional mitigation is applied for py7zr as well.\n        \"\"\"\n\n        def resolved(path: Union[Path, str]) -> str:\n            return os.path.realpath(os.path.abspath(path))\n\n        def badpath(path: str, base: str) -> bool:\n            # joinpath will ignore base if path is absolute\n            return not resolved(os.path.join(base, path)).startswith(base)\n\n        def badlink(info: \"py7zr.FileInfo\", base: str) -> bool:\n            # Links are interpreted relative to the directory containing the link\n            tip = resolved(os.path.join(base, os.path.dirname(info.filename)))\n            return badpath(os.path.basename(info.filename), base=tip)\n\n        base = resolved(output_path)\n\n        for finfo in members:\n            if badpath(finfo.filename, base):\n                logger.error(f\"Extraction of {finfo.filename} is blocked (illegal path)\")\n            # py7zr already checks symlinks validity\n            # elif finfo.is_symlink and badlink(finfo, base):\n            #     logger.error(f\"Extraction of {finfo.name} is blocked: Symlink to {finfo.linkname}\")\n            else:\n                yield finfo\n\n    @staticmethod\n    def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None:\n        if not config.PY7ZR_AVAILABLE:\n            raise ImportError(\"Please pip install py7zr\")\n        import py7zr\n\n        os.makedirs(output_path, exist_ok=True)\n        with py7zr.SevenZipFile(input_path, \"r\") as archive:\n            targets = [finfo.filename for finfo in SevenZipExtractor.safemembers(archive.list(), output_path)]\n            archive.extract(output_path, targets=targets)\n\n\nclass Lz4Extractor(MagicNumberBaseExtractor):\n    magic_numbers = 
[b\"\\x04\\x22\\x4d\\x18\"]\n\n    @staticmethod\n    def extract(input_path: Union[Path, str], output_path: Union[Path, str]) -> None:\n        if not config.LZ4_AVAILABLE:\n            raise ImportError(\"Please pip install lz4\")\n        import lz4.frame\n\n        with lz4.frame.open(input_path, \"rb\") as compressed_file:\n            with open(output_path, \"wb\") as extracted_file:\n                shutil.copyfileobj(compressed_file, extracted_file)\n\n\nclass Extractor:\n    #  Put zip file to the last, b/c it is possible wrongly detected as zip (I guess it means: as tar or gzip)\n    extractors: dict[str, type[BaseExtractor]] = {\n        \"tar\": TarExtractor,\n        \"gzip\": GzipExtractor,\n        \"zip\": ZipExtractor,\n        \"xz\": XzExtractor,\n        \"rar\": RarExtractor,\n        \"zstd\": ZstdExtractor,\n        \"bz2\": Bzip2Extractor,\n        \"7z\": SevenZipExtractor,  # <Added version=\"2.4.0\"/>\n        \"lz4\": Lz4Extractor,  # <Added version=\"2.4.0\"/>\n    }\n\n    @classmethod\n    def _get_magic_number_max_length(cls):\n        return max(\n            len(extractor_magic_number)\n            for extractor in cls.extractors.values()\n            if issubclass(extractor, MagicNumberBaseExtractor)\n            for extractor_magic_number in extractor.magic_numbers\n        )\n\n    @staticmethod\n    def _read_magic_number(path: Union[Path, str], magic_number_length: int):\n        try:\n            return MagicNumberBaseExtractor.read_magic_number(path, magic_number_length=magic_number_length)\n        except OSError:\n            return b\"\"\n\n    @classmethod\n    def is_extractable(cls, path: Union[Path, str], return_extractor: bool = False) -> bool:\n        warnings.warn(\n            \"Method 'is_extractable' was deprecated in version 2.4.0 and will be removed in 3.0.0. 
\"\n            \"Use 'infer_extractor_format' instead.\",\n            category=FutureWarning,\n        )\n        extractor_format = cls.infer_extractor_format(path)\n        if extractor_format:\n            return True if not return_extractor else (True, cls.extractors[extractor_format])\n        return False if not return_extractor else (False, None)\n\n    @classmethod\n    def infer_extractor_format(cls, path: Union[Path, str]) -> Optional[str]:  # <Added version=\"2.4.0\"/>\n        magic_number_max_length = cls._get_magic_number_max_length()\n        magic_number = cls._read_magic_number(path, magic_number_max_length)\n        for extractor_format, extractor in cls.extractors.items():\n            if extractor.is_extractable(path, magic_number=magic_number):\n                return extractor_format\n\n    @classmethod\n    def extract(\n        cls,\n        input_path: Union[Path, str],\n        output_path: Union[Path, str],\n        extractor_format: str,\n    ) -> None:\n        os.makedirs(os.path.dirname(output_path), exist_ok=True)\n        # Prevent parallel extractions\n        lock_path = str(Path(output_path).with_suffix(\".lock\"))\n        with FileLock(lock_path):\n            shutil.rmtree(output_path, ignore_errors=True)\n            extractor = cls.extractors[extractor_format]\n            return extractor.extract(input_path, output_path)\n"
  },
  {
    "path": "src/datasets/utils/file_utils.py",
    "content": "\"\"\"\nUtilities for working with the local dataset cache.\nThis file is adapted from the AllenNLP library at https://github.com/allenai/allennlp\nCopyright by the AllenNLP authors.\n\"\"\"\n\nimport asyncio\nimport glob\nimport io\nimport json\nimport multiprocessing\nimport os\nimport posixpath\nimport re\nimport shutil\nimport tarfile\nimport time\nimport xml.dom.minidom\nimport zipfile\nfrom collections.abc import Generator\nfrom io import BytesIO\nfrom itertools import chain\nfrom pathlib import Path, PurePosixPath\nfrom typing import Any, Optional, TypeVar, Union\nfrom unittest.mock import patch\nfrom urllib.parse import urlparse\nfrom xml.etree import ElementTree as ET\n\nimport fsspec\nimport httpx\nimport huggingface_hub\nimport huggingface_hub.errors\nimport requests\nfrom fsspec.core import strip_protocol, url_to_fs\nfrom fsspec.utils import can_be_local\nfrom huggingface_hub.utils import get_session, insecure_hashlib\nfrom packaging import version\n\nfrom .. import __version__, config\nfrom ..download.download_config import DownloadConfig\nfrom ..filesystems import COMPRESSION_FILESYSTEMS\nfrom . 
import _tqdm, logging\nfrom ._filelock import FileLock\nfrom .extract import ExtractManager\nfrom .track import TrackedIterableFromGenerator\n\n\ntry:\n    from aiohttp.client_exceptions import ClientError as _AiohttpClientError\nexcept ImportError:\n    # aiohttp is not available; synthesize an exception type\n    # that will never be raised by any actual code for use in the `except`\n    # clause only.\n    class _AiohttpClientError(Exception):\n        pass\n\n\nlogger = logging.get_logger(__name__)  # pylint: disable=invalid-name\n\nINCOMPLETE_SUFFIX = \".incomplete\"\n\nT = TypeVar(\"T\", str, Path)\n\nCONNECTION_ERRORS_TO_RETRY = (\n    _AiohttpClientError,\n    asyncio.TimeoutError,\n    requests.exceptions.ConnectionError,\n    requests.exceptions.Timeout,\n    httpx.RequestError,\n)\nSERVER_UNAVAILABLE_CODE = 504\nRATE_LIMIT_CODE = 429\n\n\ndef is_remote_url(url_or_filename: str) -> bool:\n    return urlparse(url_or_filename).scheme != \"\" and not os.path.ismount(urlparse(url_or_filename).scheme + \":/\")\n\n\ndef is_local_path(url_or_filename: str) -> bool:\n    # On unix the scheme of a local path is empty (for both absolute and relative),\n    # while on windows the scheme is the drive name (ex: \"c\") for absolute paths.\n    # for details on the windows behavior, see https://bugs.python.org/issue42215\n    return urlparse(url_or_filename).scheme == \"\" or os.path.ismount(urlparse(url_or_filename).scheme + \":/\")\n\n\ndef is_relative_path(url_or_filename: str) -> bool:\n    return urlparse(url_or_filename).scheme == \"\" and not os.path.isabs(url_or_filename)\n\n\ndef relative_to_absolute_path(path: T) -> T:\n    \"\"\"Convert relative path to absolute path.\"\"\"\n    abs_path_str = os.path.abspath(os.path.expanduser(os.path.expandvars(str(path))))\n    return Path(abs_path_str) if isinstance(path, Path) else abs_path_str\n\n\ndef url_or_path_join(base_name: str, *pathnames: str) -> str:\n    if is_remote_url(base_name):\n        return 
posixpath.join(base_name, *(str(pathname).replace(os.sep, \"/\").lstrip(\"/\") for pathname in pathnames))\n    else:\n        return Path(base_name, *pathnames).as_posix()\n\n\ndef url_or_path_parent(url_or_path: str) -> str:\n    if is_remote_url(url_or_path):\n        return url_or_path[: url_or_path.rindex(\"/\")]\n    else:\n        return os.path.dirname(url_or_path)\n\n\ndef hash_url_to_filename(url, etag=None):\n    \"\"\"\n    Convert `url` into a hashed filename in a repeatable way.\n    If `etag` is specified, append its hash to the url's, delimited\n    by a period.\n    If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name\n    so that TF 2.0 can identify it as a HDF5 file\n    (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)\n    \"\"\"\n    url_bytes = url.encode(\"utf-8\")\n    url_hash = insecure_hashlib.sha256(url_bytes)\n    filename = url_hash.hexdigest()\n\n    if etag:\n        etag_bytes = etag.encode(\"utf-8\")\n        etag_hash = insecure_hashlib.sha256(etag_bytes)\n        filename += \".\" + etag_hash.hexdigest()\n\n    if url.endswith(\".py\"):\n        filename += \".py\"\n\n    return filename\n\n\ndef cached_path(\n    url_or_filename,\n    download_config=None,\n    **download_kwargs,\n) -> str:\n    \"\"\"\n    Given something that might be a URL (or might be a local path),\n    determine which. If it's a URL, download the file and cache it, and\n    return the path to the cached file. 
If it's already a local path,\n    make sure the file exists and then return the path.\n\n    Return:\n        Local path (string)\n\n    Raises:\n        FileNotFoundError: in case of non-recoverable file\n            (non-existent or no cache on disk)\n        ConnectionError: in case of unreachable url\n            and no cache on disk\n        ValueError: if it couldn't parse the url or filename correctly\n        httpx.NetworkError or requests.exceptions.ConnectionError: in case of internet connection issue\n    \"\"\"\n    if download_config is None:\n        download_config = DownloadConfig(**download_kwargs)\n\n    cache_dir = download_config.cache_dir or config.DOWNLOADED_DATASETS_PATH\n    if isinstance(cache_dir, Path):\n        cache_dir = str(cache_dir)\n    if isinstance(url_or_filename, Path):\n        url_or_filename = str(url_or_filename)\n\n    # Convert fsspec URL in the format \"file://local/path\" to \"local/path\"\n    if can_be_local(url_or_filename):\n        url_or_filename = strip_protocol(url_or_filename)\n\n    if is_remote_url(url_or_filename):\n        # URL, so get it from the cache (downloading if necessary)\n        url_or_filename, storage_options = _prepare_path_and_storage_options(\n            url_or_filename, download_config=download_config\n        )\n        # Download files from Hugging Face.\n        # Note: no need to check for https://huggingface.co file URLs since _prepare_path_and_storage_options\n        # prepares Hugging Face HTTP URLs as hf:// paths already\n        if url_or_filename.startswith(\"hf://\") and not url_or_filename.startswith(\"hf://buckets/\"):\n            resolved_path = huggingface_hub.HfFileSystem(\n                endpoint=config.HF_ENDPOINT, token=download_config.token\n            ).resolve_path(url_or_filename)\n            try:\n                output_path = huggingface_hub.HfApi(\n                    endpoint=config.HF_ENDPOINT,\n                    token=download_config.token,\n            
        library_name=\"datasets\",\n                    library_version=__version__,\n                    user_agent=get_datasets_user_agent(download_config.user_agent),\n                ).hf_hub_download(\n                    repo_id=resolved_path.repo_id,\n                    repo_type=resolved_path.repo_type,\n                    revision=resolved_path.revision,\n                    filename=resolved_path.path_in_repo,\n                    force_download=download_config.force_download,\n                    proxies=download_config.proxies,\n                )\n            except (\n                huggingface_hub.utils.RepositoryNotFoundError,\n                huggingface_hub.utils.EntryNotFoundError,\n                huggingface_hub.utils.RevisionNotFoundError,\n                huggingface_hub.utils.GatedRepoError,\n            ) as e:\n                raise FileNotFoundError(str(e)) from e\n        # Download external files\n        else:\n            output_path = get_from_cache(\n                url_or_filename,\n                cache_dir=cache_dir,\n                force_download=download_config.force_download,\n                user_agent=download_config.user_agent,\n                use_etag=download_config.use_etag,\n                token=download_config.token,\n                storage_options=storage_options,\n                download_desc=download_config.download_desc,\n                disable_tqdm=download_config.disable_tqdm,\n            )\n    elif os.path.exists(url_or_filename):\n        # File, and it exists.\n        output_path = url_or_filename\n    elif is_local_path(url_or_filename):\n        # File, but it doesn't exist.\n        raise FileNotFoundError(f\"Local file {url_or_filename} doesn't exist\")\n    else:\n        # Something unknown\n        raise ValueError(f\"unable to parse {url_or_filename} as a URL or as a local path\")\n\n    if output_path is None:\n        return output_path\n\n    if download_config.extract_compressed_file:\n  
      if download_config.extract_on_the_fly:\n            # Add a compression prefix to the compressed file so that it can be extracted\n            # as it's being read using xopen.\n            protocol = _get_extraction_protocol(output_path, download_config=download_config)\n            extension = _get_path_extension(url_or_filename.split(\"::\")[0])\n            if (\n                protocol\n                and extension not in [\"tgz\", \"tar\"]\n                and not url_or_filename.split(\"::\")[0].endswith((\".tar.gz\", \".tar.bz2\", \".tar.xz\"))\n            ):\n                output_path = relative_to_absolute_path(output_path)\n                if protocol in SINGLE_FILE_COMPRESSION_PROTOCOLS:\n                    # there is one single file which is the uncompressed file\n                    inner_file = os.path.basename(output_path)\n                    inner_file = inner_file[: inner_file.rindex(\".\")] if \".\" in inner_file else inner_file\n                    output_path = f\"{protocol}://{inner_file}::{output_path}\"\n                else:\n                    output_path = f\"{protocol}://::{output_path}\"\n                return output_path\n\n        # Eager extraction\n        output_path = ExtractManager(cache_dir=download_config.cache_dir).extract(\n            output_path, force_extract=download_config.force_extract\n        )\n    return relative_to_absolute_path(output_path)\n\n\ndef get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str:\n    ua = f\"datasets/{__version__}\"\n    ua += f\"; python/{config.PY_VERSION}\"\n    ua += f\"; hf_hub/{huggingface_hub.__version__}\"\n    ua += f\"; pyarrow/{config.PYARROW_VERSION}\"\n    if config.TORCH_AVAILABLE:\n        ua += f\"; torch/{config.TORCH_VERSION}\"\n    if config.TF_AVAILABLE:\n        ua += f\"; tensorflow/{config.TF_VERSION}\"\n    if config.JAX_AVAILABLE:\n        ua += f\"; jax/{config.JAX_VERSION}\"\n    if isinstance(user_agent, dict):\n        ua 
+= f\"; {'; '.join(f'{k}/{v}' for k, v in user_agent.items())}\"\n    elif isinstance(user_agent, str):\n        ua += \"; \" + user_agent\n    return ua\n\n\ndef get_authentication_headers_for_url(url: str, token: Optional[Union[str, bool]] = None) -> dict:\n    \"\"\"Handle the HF authentication\"\"\"\n    if url.startswith(config.HF_ENDPOINT):\n        return huggingface_hub.utils.build_hf_headers(\n            token=token, library_name=\"datasets\", library_version=__version__\n        )\n    else:\n        return {}\n\n\ndef _raise_if_offline_mode_is_enabled(msg: Optional[str] = None):\n    \"\"\"Raise an OfflineModeIsEnabled error (subclass of ConnectionError) if HF_HUB_OFFLINE is True.\"\"\"\n    if config.HF_HUB_OFFLINE:\n        raise huggingface_hub.errors.OfflineModeIsEnabled(\n            \"Offline mode is enabled.\" if msg is None else \"Offline mode is enabled. \" + str(msg)\n        )\n\n\ndef fsspec_head(url, storage_options=None):\n    _raise_if_offline_mode_is_enabled(f\"Tried to reach {url}\")\n    fs, path = url_to_fs(url, **(storage_options or {}))\n    return fs.info(path)\n\n\ndef stack_multiprocessing_download_progress_bars():\n    # Stack downloads progress bars automatically using HF_DATASETS_STACK_MULTIPROCESSING_DOWNLOAD_PROGRESS_BARS=1\n    # We use environment variables since the download may happen in a subprocess\n    return patch.dict(os.environ, {\"HF_DATASETS_STACK_MULTIPROCESSING_DOWNLOAD_PROGRESS_BARS\": \"1\"})\n\n\nclass TqdmCallback(fsspec.callbacks.TqdmCallback):\n    def __init__(self, tqdm_kwargs=None, *args, **kwargs):\n        if config.FSSPEC_VERSION < version.parse(\"2024.2.0\"):\n            super().__init__(tqdm_kwargs, *args, **kwargs)\n            self._tqdm = _tqdm  # replace tqdm module by datasets.utils.tqdm module\n        else:\n            kwargs[\"tqdm_cls\"] = _tqdm.tqdm\n            super().__init__(tqdm_kwargs, *args, **kwargs)\n\n\ndef fsspec_get(url, temp_file, storage_options=None, desc=None, 
disable_tqdm=False):\n    _raise_if_offline_mode_is_enabled(f\"Tried to reach {url}\")\n    fs, path = url_to_fs(url, **(storage_options or {}))\n    callback = TqdmCallback(\n        tqdm_kwargs={\n            \"desc\": desc or \"Downloading\",\n            \"unit\": \"B\",\n            \"unit_scale\": True,\n            \"position\": multiprocessing.current_process()._identity[-1]  # contains the ranks of subprocesses\n            if os.environ.get(\"HF_DATASETS_STACK_MULTIPROCESSING_DOWNLOAD_PROGRESS_BARS\") == \"1\"\n            and multiprocessing.current_process()._identity\n            else None,\n            \"disable\": disable_tqdm,\n        }\n    )\n    fs.get_file(path, temp_file.name, callback=callback)\n\n\ndef get_from_cache(\n    url,\n    cache_dir=None,\n    force_download=False,\n    user_agent=None,\n    use_etag=True,\n    token=None,\n    storage_options=None,\n    download_desc=None,\n    disable_tqdm=False,\n) -> str:\n    \"\"\"\n    Given a URL, look for the corresponding file in the local cache.\n    If it's not there, download it. 
Then return the path to the cached file.\n\n    Return:\n        Local path (string)\n\n    Raises:\n        FileNotFoundError: in case of non-recoverable file\n            (non-existent or no cache on disk)\n        ConnectionError: in case of unreachable url\n            and no cache on disk\n    \"\"\"\n    if storage_options is None:\n        storage_options = {}\n    if cache_dir is None:\n        cache_dir = config.HF_DATASETS_CACHE\n    if isinstance(cache_dir, Path):\n        cache_dir = str(cache_dir)\n\n    os.makedirs(cache_dir, exist_ok=True)\n\n    response = None\n    etag = None\n\n    # Try a first time to file the file on the local file system without eTag (None)\n    # if we don't ask for 'force_download' then we spare a request\n    filename = hash_url_to_filename(url, etag=None)\n    cache_path = os.path.join(cache_dir, filename)\n\n    if os.path.exists(cache_path) and not force_download and not use_etag:\n        return cache_path\n\n    # Prepare headers for authentication\n    headers = get_authentication_headers_for_url(url, token=token)\n    if user_agent is not None:\n        headers[\"user-agent\"] = user_agent\n\n    response = fsspec_head(url, storage_options=storage_options)\n    etag = (response.get(\"ETag\", None) or response.get(\"etag\", None)) if use_etag else None\n\n    # Try a second time\n    filename = hash_url_to_filename(url, etag)\n    cache_path = os.path.join(cache_dir, filename)\n\n    if os.path.exists(cache_path) and not force_download:\n        return cache_path\n\n    # Prevent parallel downloads of the same file with a lock.\n    lock_path = cache_path + \".lock\"\n    with FileLock(lock_path):\n        # Retry in case previously locked processes just enter after the precedent process releases the lock\n        if os.path.exists(cache_path) and not force_download:\n            return cache_path\n\n        incomplete_path = cache_path + \".incomplete\"\n\n        # Download to temporary file, then copy to cache 
path once finished.\n        # Otherwise, you get corrupt cache entries if the download gets interrupted.\n        with open(incomplete_path, \"w+b\") as temp_file:\n            logger.info(f\"{url} not found in cache or force_download set to True, downloading to {temp_file.name}\")\n            # GET file object\n            fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc, disable_tqdm=disable_tqdm)\n\n        logger.info(f\"storing {url} in cache at {cache_path}\")\n        shutil.move(temp_file.name, cache_path)\n\n        logger.info(f\"creating metadata file for {cache_path}\")\n        meta = {\"url\": url, \"etag\": etag}\n        meta_path = cache_path + \".json\"\n        with open(meta_path, \"w\", encoding=\"utf-8\") as meta_file:\n            json.dump(meta, meta_file)\n\n    return cache_path\n\n\ndef add_start_docstrings(*docstr):\n    def docstring_decorator(fn):\n        fn.__doc__ = \"\".join(docstr) + \"\\n\\n\" + (fn.__doc__ if fn.__doc__ is not None else \"\")\n        return fn\n\n    return docstring_decorator\n\n\ndef add_end_docstrings(*docstr):\n    def docstring_decorator(fn):\n        fn.__doc__ = (fn.__doc__ if fn.__doc__ is not None else \"\") + \"\\n\\n\" + \"\".join(docstr)\n        return fn\n\n    return docstring_decorator\n\n\ndef estimate_dataset_size(paths):\n    return sum(path.stat().st_size for path in paths)\n\n\ndef readline(f: io.RawIOBase):\n    # From: https://github.com/python/cpython/blob/d27e2f4d118e7a9909b6a3e5da06c5ff95806a85/Lib/_pyio.py#L525\n    res = bytearray()\n    while True:\n        b = f.read(1)\n        if not b:\n            break\n        res += b\n        if res.endswith(b\"\\n\"):\n            break\n    return bytes(res)\n\n\n#######################\n# Streaming utilities #\n#######################\n\nBASE_KNOWN_EXTENSIONS = [\n    \"txt\",\n    \"csv\",\n    \"json\",\n    \"jsonl\",\n    \"tsv\",\n    \"conll\",\n    \"conllu\",\n    \"orig\",\n    \"parquet\",\n    
\"pkl\",\n    \"pickle\",\n    \"rel\",\n    \"xml\",\n    \"arrow\",\n]\nCOMPRESSION_EXTENSION_TO_PROTOCOL = {\n    # single file compression\n    **{\n        extension.lstrip(\".\"): fs_class.protocol\n        for fs_class in COMPRESSION_FILESYSTEMS\n        for extension in fs_class.extensions\n    },\n    # archive compression\n    \"zip\": \"zip\",\n    \"eval\": \"zip\",\n}\nSINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL = {\n    extension.lstrip(\".\"): fs_class.protocol\n    for fs_class in COMPRESSION_FILESYSTEMS\n    for extension in fs_class.extensions\n}\nSINGLE_FILE_COMPRESSION_PROTOCOLS = {fs_class.protocol for fs_class in COMPRESSION_FILESYSTEMS}\nSINGLE_SLASH_AFTER_PROTOCOL_PATTERN = re.compile(r\"(?<!:):/\")\n\n\nMAGIC_NUMBER_TO_COMPRESSION_PROTOCOL = {\n    bytes.fromhex(\"504B0304\"): \"zip\",\n    bytes.fromhex(\"504B0506\"): \"zip\",  # empty archive\n    bytes.fromhex(\"504B0708\"): \"zip\",  # spanned archive\n    bytes.fromhex(\"425A68\"): \"bz2\",\n    bytes.fromhex(\"1F8B\"): \"gzip\",\n    bytes.fromhex(\"FD377A585A00\"): \"xz\",\n    bytes.fromhex(\"04224D18\"): \"lz4\",\n    bytes.fromhex(\"28B52FFD\"): \"zstd\",\n}\nMAGIC_NUMBER_TO_UNSUPPORTED_COMPRESSION_PROTOCOL = {\n    b\"Rar!\": \"rar\",\n}\nMAGIC_NUMBER_MAX_LENGTH = max(\n    len(magic_number)\n    for magic_number in chain(MAGIC_NUMBER_TO_COMPRESSION_PROTOCOL, MAGIC_NUMBER_TO_UNSUPPORTED_COMPRESSION_PROTOCOL)\n)\n\n\nclass NonStreamableDatasetError(Exception):\n    pass\n\n\ndef _get_path_extension(path: str) -> str:\n    # Get extension: https://foo.bar/train.json.gz -> gz\n    extension = path.split(\".\")[-1]\n    # Remove query params (\"dl=1\", \"raw=true\"): gz?dl=1 -> gz\n    # Remove shards infos (\".txt_1\", \".txt-00000-of-00100\"): txt_1 -> txt\n    for symb in \"?-_\":\n        extension = extension.split(symb)[0]\n    return extension\n\n\ndef _get_extraction_protocol_with_magic_number(f) -> Optional[str]:\n    \"\"\"read the magic number from a file-like object and 
return the compression protocol\"\"\"\n    # Check if the file object is seekable even before reading the magic number (to avoid https://bugs.python.org/issue26440)\n    try:\n        f.seek(0)\n    except (AttributeError, io.UnsupportedOperation):\n        return None\n    magic_number = f.read(MAGIC_NUMBER_MAX_LENGTH)\n    f.seek(0)\n    for i in range(MAGIC_NUMBER_MAX_LENGTH):\n        compression = MAGIC_NUMBER_TO_COMPRESSION_PROTOCOL.get(magic_number[: MAGIC_NUMBER_MAX_LENGTH - i])\n        if compression is not None:\n            return compression\n        compression = MAGIC_NUMBER_TO_UNSUPPORTED_COMPRESSION_PROTOCOL.get(magic_number[: MAGIC_NUMBER_MAX_LENGTH - i])\n        if compression is not None:\n            raise NotImplementedError(f\"Compression protocol '{compression}' not implemented.\")\n\n\ndef _get_extraction_protocol(urlpath: str, download_config: Optional[DownloadConfig] = None) -> Optional[str]:\n    # get inner file: zip://train-00000.json.gz::https://foo.bar/data.zip -> zip://train-00000.json.gz\n    urlpath = str(urlpath)\n    path = urlpath.split(\"::\")[0]\n    extension = _get_path_extension(path)\n    if (\n        extension in BASE_KNOWN_EXTENSIONS\n        or extension in [\"tgz\", \"tar\"]\n        or path.endswith((\".tar.gz\", \".tar.bz2\", \".tar.xz\"))\n    ):\n        return None\n    elif extension in COMPRESSION_EXTENSION_TO_PROTOCOL:\n        return COMPRESSION_EXTENSION_TO_PROTOCOL[extension]\n    urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config)\n    try:\n        with fsspec.open(urlpath, **(storage_options or {})) as f:\n            return _get_extraction_protocol_with_magic_number(f)\n    except FileNotFoundError:\n        if urlpath.startswith(config.HF_ENDPOINT):\n            raise FileNotFoundError(\n                urlpath + \"\\nIf the repo is private or gated, make sure to log in with `huggingface-cli login`.\"\n            ) from None\n        else:\n         
   raise\n\n\ndef xjoin(a, *p):\n    \"\"\"\n    This function extends os.path.join to support the \"::\" hop separator. It supports both paths and urls.\n\n    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator \"::\".\n    This is used to access files inside a zip file over http for example.\n\n    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.\n    Then you can just chain the url this way:\n\n        zip://folder1/file.txt::https://host.com/archive.zip\n\n    The xjoin function allows you to apply the join on the first path of the chain.\n\n    Example::\n\n        >>> xjoin(\"zip://folder1::https://host.com/archive.zip\", \"file.txt\")\n        zip://folder1/file.txt::https://host.com/archive.zip\n    \"\"\"\n    a, *b = str(a).split(\"::\")\n    if is_local_path(a):\n        return os.path.join(a, *p)\n    else:\n        a = posixpath.join(a, *p)\n        return \"::\".join([a] + b)\n\n\ndef xdirname(a):\n    \"\"\"\n    This function extends os.path.dirname to support the \"::\" hop separator. 
It supports both paths and urls.\n\n    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator \"::\".\n    This is used to access files inside a zip file over http for example.\n\n    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.\n    Then you can just chain the url this way:\n\n        zip://folder1/file.txt::https://host.com/archive.zip\n\n    The xdirname function allows you to apply the dirname on the first path of the chain.\n\n    Example::\n\n        >>> xdirname(\"zip://folder1/file.txt::https://host.com/archive.zip\")\n        zip://folder1::https://host.com/archive.zip\n    \"\"\"\n    a, *b = str(a).split(\"::\")\n    if is_local_path(a):\n        a = os.path.dirname(Path(a).as_posix())\n    else:\n        a = posixpath.dirname(a)\n    # if we end up at the root of the protocol, we get for example a = 'http:'\n    # so we have to fix it by adding the '//' that was removed:\n    if a.endswith(\":\"):\n        a += \"//\"\n    return \"::\".join([a] + b)\n\n\ndef xexists(urlpath: str, download_config: Optional[DownloadConfig] = None):\n    \"\"\"Extend `os.path.exists` function to support both local and remote files.\n\n    Args:\n        urlpath (`str`): URL path.\n        download_config : mainly use token or storage_options to support different platforms and auth types.\n\n    Returns:\n        `bool`\n    \"\"\"\n\n    main_hop, *rest_hops = _as_str(urlpath).split(\"::\")\n    if is_local_path(main_hop):\n        return os.path.exists(main_hop)\n    else:\n        urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config)\n        main_hop, *rest_hops = urlpath.split(\"::\")\n        fs, *_ = url_to_fs(urlpath, **storage_options)\n        return fs.exists(main_hop)\n\n\ndef xbasename(a):\n    \"\"\"\n    This function extends os.path.basename to support the 
\"::\" hop separator. It supports both paths and urls.\n\n    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator \"::\".\n    This is used to access files inside a zip file over http for example.\n\n    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.\n    Then you can just chain the url this way:\n\n        zip://folder1/file.txt::https://host.com/archive.zip\n\n    The xbasename function allows you to apply the basename on the first path of the chain.\n\n    Example::\n\n        >>> xbasename(\"zip://folder1/file.txt::https://host.com/archive.zip\")\n        file.txt\n    \"\"\"\n    a, *b = str(a).split(\"::\")\n    if is_local_path(a):\n        return os.path.basename(Path(a).as_posix())\n    else:\n        return posixpath.basename(a)\n\n\ndef xsplit(a):\n    \"\"\"\n    This function extends os.path.split to support the \"::\" hop separator. 
It supports both paths and urls.\n\n    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator \"::\".\n    This is used to access files inside a zip file over http for example.\n\n    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.\n    Then you can just chain the url this way:\n\n        zip://folder1/file.txt::https://host.com/archive.zip\n\n    The xsplit function allows you to apply the xsplit on the first path of the chain.\n\n    Example::\n\n        >>> xsplit(\"zip://folder1/file.txt::https://host.com/archive.zip\")\n        ('zip://folder1::https://host.com/archive.zip', 'file.txt')\n    \"\"\"\n    a, *b = str(a).split(\"::\")\n    if is_local_path(a):\n        return os.path.split(Path(a).as_posix())\n    else:\n        a, tail = posixpath.split(a)\n        return \"::\".join([a + \"//\" if a.endswith(\":\") else a] + b), tail\n\n\ndef xsplitext(a):\n    \"\"\"\n    This function extends os.path.splitext to support the \"::\" hop separator. 
It supports both paths and urls.\n\n    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator \"::\".\n    This is used to access files inside a zip file over http for example.\n\n    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.\n    Then you can just chain the url this way:\n\n        zip://folder1/file.txt::https://host.com/archive.zip\n\n    The xsplitext function allows you to apply the splitext on the first path of the chain.\n\n    Example::\n\n        >>> xsplitext(\"zip://folder1/file.txt::https://host.com/archive.zip\")\n        ('zip://folder1/file::https://host.com/archive.zip', '.txt')\n    \"\"\"\n    a, *b = str(a).split(\"::\")\n    if is_local_path(a):\n        return os.path.splitext(Path(a).as_posix())\n    else:\n        a, ext = posixpath.splitext(a)\n        return \"::\".join([a] + b), ext\n\n\ndef xisfile(path, download_config: Optional[DownloadConfig] = None) -> bool:\n    \"\"\"Extend `os.path.isfile` function to support remote files.\n\n    Args:\n        path (`str`): URL path.\n        download_config : mainly use token or storage_options to support different platforms and auth types.\n\n    Returns:\n        `bool`\n    \"\"\"\n    main_hop, *rest_hops = str(path).split(\"::\")\n    if is_local_path(main_hop):\n        return os.path.isfile(path)\n    else:\n        path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config)\n        main_hop, *rest_hops = path.split(\"::\")\n        fs, *_ = url_to_fs(path, **storage_options)\n        return fs.isfile(main_hop)\n\n\ndef xgetsize(path, download_config: Optional[DownloadConfig] = None) -> int:\n    \"\"\"Extend `os.path.getsize` function to support remote files.\n\n    Args:\n        path (`str`): URL path.\n        download_config : mainly use token or storage_options to support different platforms and 
auth types.\n\n    Returns:\n        `int`: optional\n    \"\"\"\n    main_hop, *rest_hops = str(path).split(\"::\")\n    if is_local_path(main_hop):\n        return os.path.getsize(path)\n    else:\n        path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config)\n        main_hop, *rest_hops = path.split(\"::\")\n        fs, *_ = fs, *_ = url_to_fs(path, **storage_options)\n        try:\n            size = fs.size(main_hop)\n        except huggingface_hub.utils.EntryNotFoundError:\n            raise FileNotFoundError(f\"No such file: {path}\")\n        if size is None:\n            # use xopen instead of fs.open to make data fetching more robust\n            with xopen(path, download_config=download_config) as f:\n                size = len(f.read())\n        return size\n\n\ndef xisdir(path, download_config: Optional[DownloadConfig] = None) -> bool:\n    \"\"\"Extend `os.path.isdir` function to support remote files.\n\n    Args:\n        path (`str`): URL path.\n        download_config : mainly use token or storage_options to support different platforms and auth types.\n\n    Returns:\n        `bool`\n    \"\"\"\n    main_hop, *rest_hops = str(path).split(\"::\")\n    if is_local_path(main_hop):\n        return os.path.isdir(path)\n    else:\n        path, storage_options = _prepare_path_and_storage_options(path, download_config=download_config)\n        main_hop, *rest_hops = path.split(\"::\")\n        fs, *_ = fs, *_ = url_to_fs(path, **storage_options)\n        inner_path = main_hop.split(\"://\")[-1]\n        if not inner_path.strip(\"/\"):\n            return True\n        return fs.isdir(inner_path)\n\n\ndef xrelpath(path, start=None):\n    \"\"\"Extend `os.path.relpath` function to support remote files.\n\n    Args:\n        path (`str`): URL path.\n        start (`str`): Start URL directory path.\n\n    Returns:\n        `str`\n    \"\"\"\n    main_hop, *rest_hops = str(path).split(\"::\")\n    if 
is_local_path(main_hop):\n        return os.path.relpath(main_hop, start=start) if start else os.path.relpath(main_hop)\n    else:\n        return posixpath.relpath(main_hop, start=str(start).split(\"::\")[0]) if start else os.path.relpath(main_hop)\n\n\nclass _OverridableIOWrapper(io.RawIOBase):\n    def __init__(self, f):\n        self._overrides = {}\n        self.f = f\n\n    def __getattribute__(self, attr):\n        if attr == \"_overrides\":\n            return object.__getattribute__(self, attr)\n        elif attr in self._overrides:\n            return self._overrides[attr]\n        return getattr(self.f, attr)\n\n    def __setattr__(self, attr, value):\n        if attr == \"_overrides\":\n            object.__setattr__(self, attr, value)\n        else:\n            self._overrides[attr] = value\n\n\ndef _add_retries_to_file_obj_read_method(file_obj):\n    read = file_obj.read\n    max_retries = config.STREAMING_READ_MAX_RETRIES\n\n    def read_with_retries(*args, **kwargs):\n        disconnect_err = None\n        for retry in range(1, max_retries + 1):\n            try:\n                out = read(*args, **kwargs)\n                break\n            except CONNECTION_ERRORS_TO_RETRY as err:\n                disconnect_err = err\n                logger.warning(\n                    f\"Got disconnected from remote data host. Retrying in {config.STREAMING_READ_RETRY_INTERVAL}sec [{retry}/{max_retries}]\"\n                )\n                time.sleep(config.STREAMING_READ_RETRY_INTERVAL)\n            except huggingface_hub.errors.HfHubHTTPError as err:\n                if err.response is not None and err.response.status_code == SERVER_UNAVAILABLE_CODE:\n                    disconnect_err = err\n                    logger.warning(\n                        f\"Got disconnected from remote data host. 
Retrying in {config.STREAMING_READ_SERVER_UNAVAILABLE_RETRY_INTERVAL}sec [{retry}/{max_retries}]\"\n                    )\n                    time.sleep(config.STREAMING_READ_SERVER_UNAVAILABLE_RETRY_INTERVAL)\n                elif err.response is not None and err.response.status_code == RATE_LIMIT_CODE:\n                    disconnect_err = err\n                    logger.warning(str(err))\n                    logger.warning(\n                        f\"Got disconnected from remote data host. Retrying in {config.STREAMING_READ_RATE_LIMIT_RETRY_INTERVAL}sec [{retry}/{max_retries}]\"\n                    )\n                    time.sleep(config.STREAMING_READ_RATE_LIMIT_RETRY_INTERVAL)\n                else:\n                    raise\n        else:\n            raise ConnectionError(\"Server Disconnected\") from disconnect_err\n        return out\n\n    try:\n        file_obj.read = read_with_retries\n    except AttributeError:  # read-only attribute\n        file_obj = _OverridableIOWrapper(file_obj)\n        file_obj.read = read_with_retries\n    return file_obj\n\n\ndef _prepare_path_and_storage_options(\n    urlpath: str, download_config: Optional[DownloadConfig] = None\n) -> tuple[str, dict[str, dict[str, Any]]]:\n    prepared_urlpath = []\n    prepared_storage_options = {}\n    for hop in urlpath.split(\"::\"):\n        hop, storage_options = _prepare_single_hop_path_and_storage_options(hop, download_config=download_config)\n        prepared_urlpath.append(hop)\n        prepared_storage_options.update(storage_options)\n    return \"::\".join(prepared_urlpath), prepared_storage_options\n\n\ndef _prepare_single_hop_path_and_storage_options(\n    urlpath: str, download_config: Optional[DownloadConfig] = None\n) -> tuple[str, dict[str, dict[str, Any]]]:\n    \"\"\"\n    Prepare the URL and the kwargs that must be passed to the HttpFileSystem or HfFileSystem\n\n    In particular it resolves google drive URLs\n    It also adds the authentication headers for the Hugging 
Face Hub, for both https:// and hf:// paths.\n\n    Storage options are formatted in the form {protocol: storage_options_for_protocol}\n    \"\"\"\n    token = None if download_config is None else download_config.token\n    if urlpath.startswith(config.HF_ENDPOINT) and \"/resolve/\" in urlpath:\n        urlpath = \"hf://\" + urlpath[len(config.HF_ENDPOINT) + 1 :].replace(\"/resolve/\", \"@\", 1)\n    protocol = urlpath.split(\"://\")[0] if \"://\" in urlpath else \"file\"\n    if download_config is not None and protocol in download_config.storage_options:\n        storage_options = download_config.storage_options[protocol].copy()\n    elif download_config is not None and protocol not in download_config.storage_options:\n        storage_options = {\n            option_name: option_value\n            for option_name, option_value in download_config.storage_options.items()\n            if option_name not in fsspec.available_protocols()\n        }\n    else:\n        storage_options = {}\n    if protocol in {\"http\", \"https\"}:\n        client_kwargs = storage_options.pop(\"client_kwargs\", {})\n        storage_options[\"client_kwargs\"] = {\"trust_env\": True, **client_kwargs}  # Enable reading proxy env variables\n        if \"drive.google.com\" in urlpath:\n            response = get_session().head(urlpath, timeout=10)\n            for k, v in response.cookies.items():\n                if k.startswith(\"download_warning\"):\n                    urlpath += \"&confirm=\" + v\n                    cookies = response.cookies\n                    storage_options = {\"cookies\": cookies, **storage_options}\n            # Fix Google Drive URL to avoid Virus scan warning\n            if \"confirm=\" not in urlpath:\n                urlpath += \"&confirm=t\"\n        if urlpath.startswith(\"https://raw.githubusercontent.com/\"):\n            # Workaround for served data with gzip content-encoding: https://github.com/fsspec/filesystem_spec/issues/389\n            headers = 
storage_options.pop(\"headers\", {})\n            storage_options[\"headers\"] = {\"Accept-Encoding\": \"identity\", **headers}\n    elif protocol == \"hf\":\n        storage_options = {\n            \"endpoint\": config.HF_ENDPOINT,\n            \"token\": token,\n            **storage_options,\n        }\n    if storage_options:\n        storage_options = {protocol: storage_options}\n    return urlpath, storage_options\n\n\ndef xopen(file: str, mode=\"r\", *args, download_config: Optional[DownloadConfig] = None, **kwargs):\n    \"\"\"Extend `open` function to support remote files using `fsspec`.\n\n    It also has a retry mechanism in case connection fails.\n    The `args` and `kwargs` are passed to `fsspec.open`, except `token` which is used for queries to private repos on huggingface.co\n\n    Args:\n        file (`str`): Path name of the file to be opened.\n        mode (`str`, *optional*, default \"r\"): Mode in which the file is opened.\n        *args: Arguments to be passed to `fsspec.open`.\n        download_config : mainly use token or storage_options to support different platforms and auth types.\n        **kwargs: Keyword arguments to be passed to `fsspec.open`.\n\n    Returns:\n        file object\n    \"\"\"\n    # This works as well for `xopen(str(Path(...)))`\n    file_str = _as_str(file)\n    main_hop, *rest_hops = file_str.split(\"::\")\n    if is_local_path(main_hop):\n        # ignore fsspec-specific kwargs\n        kwargs.pop(\"block_size\", None)\n        return open(main_hop, mode, *args, **kwargs)\n    # add headers and cookies for authentication on the HF Hub and for Google Drive\n    file, storage_options = _prepare_path_and_storage_options(file_str, download_config=download_config)\n    kwargs = {**kwargs, **(storage_options or {})}\n\n    max_retries = config.STREAMING_OPEN_MAX_RETRIES\n\n    disconnect_err = None\n    for retry in range(1, max_retries + 1):\n        try:\n            file_obj = fsspec.open(file, mode=mode, *args, 
**kwargs).open()\n            break\n        except CONNECTION_ERRORS_TO_RETRY as err:\n            disconnect_err = err\n            logger.warning(\n                f\"Failed to connect to remote data host. Retrying in {config.STREAMING_OPEN_RETRY_INTERVAL}sec [{retry}/{max_retries}]\"\n            )\n            time.sleep(config.STREAMING_OPEN_RETRY_INTERVAL)\n        except ValueError as e:\n            if str(e) == \"Cannot seek streaming HTTP file\":\n                raise NonStreamableDatasetError(\n                    \"Streaming is not possible for this dataset because data host server doesn't support HTTP range \"\n                    \"requests. You can still load this dataset in non-streaming mode by passing `streaming=False` (default)\"\n                ) from e\n            else:\n                raise\n        except FileNotFoundError:\n            if file.startswith(config.HF_ENDPOINT):\n                raise FileNotFoundError(\n                    file + \"\\nIf the repo is private or gated, make sure to log in with `huggingface-cli login`.\"\n                ) from None\n            else:\n                raise\n    else:\n        raise ConnectionError(\"Server Disconnected\") from disconnect_err\n    file_obj = _add_retries_to_file_obj_read_method(file_obj)\n    return file_obj\n\n\ndef xlistdir(path: str, download_config: Optional[DownloadConfig] = None) -> list[str]:\n    \"\"\"Extend `os.listdir` function to support remote files.\n\n    Args:\n        path (`str`): URL path.\n        download_config : mainly use token or storage_options to support different platforms and auth types.\n\n    Returns:\n        `list` of `str`\n    \"\"\"\n    main_hop, *rest_hops = _as_str(path).split(\"::\")\n    if is_local_path(main_hop):\n        return os.listdir(path)\n    else:\n        # globbing inside a zip in a private repo requires authentication\n        path, storage_options = _prepare_path_and_storage_options(path, 
download_config=download_config)\n        main_hop, *rest_hops = path.split(\"::\")\n        fs, *_ = url_to_fs(path, **storage_options)\n        inner_path = main_hop.split(\"://\")[-1]\n        if inner_path.strip(\"/\") and not fs.isdir(inner_path):\n            raise FileNotFoundError(f\"Directory doesn't exist: {path}\")\n        paths = fs.listdir(inner_path, detail=False)\n        return [os.path.basename(path.rstrip(\"/\")) for path in paths]\n\n\ndef xglob(urlpath, *, recursive=False, download_config: Optional[DownloadConfig] = None):\n    \"\"\"Extend `glob.glob` function to support remote files.\n\n    Args:\n        urlpath (`str`): URL path with shell-style wildcard patterns.\n        recursive (`bool`, default `False`): Whether to match the \"**\" pattern recursively to zero or more\n            directories or subdirectories.\n        download_config : mainly use token or storage_options to support different platforms and auth types.\n\n    Returns:\n        `list` of `str`\n    \"\"\"\n    main_hop, *rest_hops = _as_str(urlpath).split(\"::\")\n    if is_local_path(main_hop):\n        return glob.glob(main_hop, recursive=recursive)\n    else:\n        # globbing inside a zip in a private repo requires authentication\n        urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config)\n        main_hop, *rest_hops = urlpath.split(\"::\")\n        fs, *_ = url_to_fs(urlpath, **storage_options)\n        inner_path = main_hop.split(\"://\")[1]\n        globbed_paths = fs.glob(inner_path)\n        protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[-1]\n        return [\"::\".join([f\"{protocol}://{globbed_path}\"] + rest_hops) for globbed_path in globbed_paths]\n\n\ndef xwalk(urlpath, download_config: Optional[DownloadConfig] = None, **kwargs):\n    \"\"\"Extend `os.walk` function to support remote files.\n\n    Args:\n        urlpath (`str`): URL root path.\n        download_config : mainly 
use token or storage_options to support different platforms and auth types.\n        **kwargs: Additional keyword arguments forwarded to the underlying filesystem.\n\n\n    Yields:\n        `tuple`: 3-tuple (dirpath, dirnames, filenames).\n    \"\"\"\n    main_hop, *rest_hops = _as_str(urlpath).split(\"::\")\n    if is_local_path(main_hop):\n        yield from os.walk(main_hop, **kwargs)\n    else:\n        # walking inside a zip in a private repo requires authentication\n        urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config)\n        main_hop, *rest_hops = urlpath.split(\"::\")\n        fs, *_ = url_to_fs(urlpath, **storage_options)\n        inner_path = main_hop.split(\"://\")[-1]\n        if inner_path.strip(\"/\") and not fs.isdir(inner_path):\n            return []\n        protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[-1]\n        for dirpath, dirnames, filenames in fs.walk(inner_path, **kwargs):\n            yield \"::\".join([f\"{protocol}://{dirpath}\"] + rest_hops), dirnames, filenames\n\n\nclass xPath(type(Path())):\n    \"\"\"Extension of `pathlib.Path` to support both local paths and remote URLs.\"\"\"\n\n    def __str__(self):\n        path_str = super().__str__()\n        main_hop, *rest_hops = path_str.split(\"::\")\n        if is_local_path(main_hop):\n            return main_hop\n        path_as_posix = path_str.replace(\"\\\\\", \"/\")\n        path_as_posix = SINGLE_SLASH_AFTER_PROTOCOL_PATTERN.sub(\"://\", path_as_posix)\n        path_as_posix += \"//\" if path_as_posix.endswith(\":\") else \"\"  # Add slashes to root of the protocol\n        return path_as_posix\n\n    def exists(self, download_config: Optional[DownloadConfig] = None):\n        \"\"\"Extend `pathlib.Path.exists` method to support both local and remote files.\n\n        Args:\n            download_config : mainly use token or storage_options to support different platforms and auth types.\n\n       
 Returns:\n            `bool`\n        \"\"\"\n        return xexists(str(self), download_config=download_config)\n\n    def glob(self, pattern, download_config: Optional[DownloadConfig] = None):\n        \"\"\"Glob function for argument of type :obj:`~pathlib.Path` that supports both local paths and remote URLs.\n\n        Args:\n            pattern (`str`): Pattern that resulting paths must match.\n            download_config : mainly use token or storage_options to support different platforms and auth types.\n\n        Yields:\n            [`xPath`]\n        \"\"\"\n        posix_path = self.as_posix()\n        main_hop, *rest_hops = posix_path.split(\"::\")\n        if is_local_path(main_hop):\n            yield from Path(main_hop).glob(pattern)\n        else:\n            # globbing inside a zip in a private repo requires authentication\n            if rest_hops:\n                urlpath = rest_hops[0]\n                urlpath, storage_options = _prepare_path_and_storage_options(urlpath, download_config=download_config)\n                storage_options = {urlpath.split(\"://\")[0]: storage_options}\n                posix_path = \"::\".join([main_hop, urlpath, *rest_hops[1:]])\n            else:\n                storage_options = None\n            fs, *_ = url_to_fs(xjoin(posix_path, pattern), **(storage_options or {}))\n            globbed_paths = fs.glob(xjoin(main_hop, pattern))\n            for globbed_path in globbed_paths:\n                yield type(self)(\"::\".join([f\"{fs.protocol}://{globbed_path}\"] + rest_hops))\n\n    def rglob(self, pattern, **kwargs):\n        \"\"\"Rglob function for argument of type :obj:`~pathlib.Path` that supports both local paths and remote URLs.\n\n        Args:\n            pattern (`str`): Pattern that resulting paths must match.\n\n        Yields:\n            [`xPath`]\n        \"\"\"\n        return self.glob(\"**/\" + pattern, **kwargs)\n\n    @property\n    def parent(self) -> \"xPath\":\n        \"\"\"Parent 
function for argument of type :obj:`~pathlib.Path` that supports both local paths and remote URLs.\n\n        Returns:\n            [`xPath`]\n        \"\"\"\n        return type(self)(xdirname(self.as_posix()))\n\n    @property\n    def name(self) -> str:\n        \"\"\"Name function for argument of type :obj:`~pathlib.Path` that supports both local paths and remote URLs.\n\n        Returns:\n            `str`\n        \"\"\"\n        return PurePosixPath(self.as_posix().split(\"::\")[0]).name\n\n    @property\n    def stem(self) -> str:\n        \"\"\"Stem function for argument of type :obj:`~pathlib.Path` that supports both local paths and remote URLs.\n\n        Returns:\n            `str`\n        \"\"\"\n        return PurePosixPath(self.as_posix().split(\"::\")[0]).stem\n\n    @property\n    def suffix(self) -> str:\n        \"\"\"Suffix function for argument of type :obj:`~pathlib.Path` that supports both local paths and remote URLs.\n\n        Returns:\n            `str`\n        \"\"\"\n        return PurePosixPath(self.as_posix().split(\"::\")[0]).suffix\n\n    def open(self, *args, **kwargs):\n        \"\"\"Extend :func:`xopen` to support argument of type :obj:`~pathlib.Path`.\n\n        Args:\n            *args: Arguments passed to :func:`fsspec.open`.\n            **kwargs: Keyword arguments passed to :func:`fsspec.open`.\n\n        Returns:\n            `io.FileIO`: File-like object.\n        \"\"\"\n        return xopen(str(self), *args, **kwargs)\n\n    def joinpath(self, *p: tuple[str, ...]) -> \"xPath\":\n        \"\"\"Extend :func:`xjoin` to support argument of type :obj:`~pathlib.Path`.\n\n        Args:\n            *p (`tuple` of `str`): Other path components.\n\n        Returns:\n            [`xPath`]\n        \"\"\"\n        return type(self)(xjoin(self.as_posix(), *p))\n\n    def __truediv__(self, p: str) -> \"xPath\":\n        return self.joinpath(p)\n\n    def with_suffix(self, suffix):\n        main_hop, *rest_hops = 
str(self).split(\"::\")\n        if is_local_path(main_hop):\n            return type(self)(str(super().with_suffix(suffix)))\n        return type(self)(\"::\".join([type(self)(PurePosixPath(main_hop).with_suffix(suffix)).as_posix()] + rest_hops))\n\n\ndef _as_str(path: Union[str, Path, xPath]):\n    return str(path) if isinstance(path, xPath) else str(xPath(str(path)))\n\n\ndef xgzip_open(filepath_or_buffer, *args, download_config: Optional[DownloadConfig] = None, **kwargs):\n    import gzip\n\n    if hasattr(filepath_or_buffer, \"read\"):\n        return gzip.open(filepath_or_buffer, *args, **kwargs)\n    else:\n        filepath_or_buffer = str(filepath_or_buffer)\n        return gzip.open(xopen(filepath_or_buffer, \"rb\", download_config=download_config), *args, **kwargs)\n\n\ndef xnumpy_load(filepath_or_buffer, *args, download_config: Optional[DownloadConfig] = None, **kwargs):\n    import numpy as np\n\n    if hasattr(filepath_or_buffer, \"read\"):\n        return np.load(filepath_or_buffer, *args, **kwargs)\n    else:\n        filepath_or_buffer = str(filepath_or_buffer)\n        return np.load(xopen(filepath_or_buffer, \"rb\", download_config=download_config), *args, **kwargs)\n\n\ndef xpandas_read_csv(filepath_or_buffer, download_config: Optional[DownloadConfig] = None, **kwargs):\n    import pandas as pd\n\n    if hasattr(filepath_or_buffer, \"read\"):\n        return pd.read_csv(filepath_or_buffer, **kwargs)\n    else:\n        filepath_or_buffer = str(filepath_or_buffer)\n        if kwargs.get(\"compression\", \"infer\") == \"infer\":\n            kwargs[\"compression\"] = _get_extraction_protocol(filepath_or_buffer, download_config=download_config)\n        return pd.read_csv(xopen(filepath_or_buffer, \"rb\", download_config=download_config), **kwargs)\n\n\ndef xpandas_read_excel(filepath_or_buffer, download_config: Optional[DownloadConfig] = None, **kwargs):\n    import pandas as pd\n\n    if hasattr(filepath_or_buffer, \"read\"):\n        try:\n       
     return pd.read_excel(filepath_or_buffer, **kwargs)\n        except ValueError:  # Cannot seek streaming HTTP file\n            return pd.read_excel(BytesIO(filepath_or_buffer.read()), **kwargs)\n    else:\n        filepath_or_buffer = str(filepath_or_buffer)\n        try:\n            return pd.read_excel(xopen(filepath_or_buffer, \"rb\", download_config=download_config), **kwargs)\n        except ValueError:  # Cannot seek streaming HTTP file\n            return pd.read_excel(\n                BytesIO(xopen(filepath_or_buffer, \"rb\", download_config=download_config).read()), **kwargs\n            )\n\n\ndef xpyarrow_parquet_read_table(filepath_or_buffer, download_config: Optional[DownloadConfig] = None, **kwargs):\n    import pyarrow.parquet as pq\n\n    if hasattr(filepath_or_buffer, \"read\"):\n        return pq.read_table(filepath_or_buffer, **kwargs)\n    else:\n        filepath_or_buffer = str(filepath_or_buffer)\n        return pq.read_table(xopen(filepath_or_buffer, mode=\"rb\", download_config=download_config), **kwargs)\n\n\ndef xsio_loadmat(filepath_or_buffer, download_config: Optional[DownloadConfig] = None, **kwargs):\n    import scipy.io as sio\n\n    if hasattr(filepath_or_buffer, \"read\"):\n        return sio.loadmat(filepath_or_buffer, **kwargs)\n    else:\n        return sio.loadmat(xopen(filepath_or_buffer, \"rb\", download_config=download_config), **kwargs)\n\n\ndef xet_parse(source, parser=None, download_config: Optional[DownloadConfig] = None):\n    \"\"\"Extend `xml.etree.ElementTree.parse` function to support remote files.\n\n    Args:\n        source: File path or file object.\n        parser (`XMLParser`, *optional*, default `XMLParser`): Parser instance.\n        download_config : mainly use token or storage_options to support different platforms and auth types.\n\n    Returns:\n        `xml.etree.ElementTree.Element`: Root element of the given source document.\n    \"\"\"\n    if hasattr(source, \"read\"):\n        return 
ET.parse(source, parser=parser)\n    else:\n        with xopen(source, \"rb\", download_config=download_config) as f:\n            return ET.parse(f, parser=parser)\n\n\ndef xxml_dom_minidom_parse(filename_or_file, download_config: Optional[DownloadConfig] = None, **kwargs):\n    \"\"\"Extend `xml.dom.minidom.parse` function to support remote files.\n\n    Args:\n        filename_or_file (`str` or file): File path or file object.\n        download_config : mainly use token or storage_options to support different platforms and auth types.\n        **kwargs (optional): Additional keyword arguments passed to `xml.dom.minidom.parse`.\n\n    Returns:\n        :obj:`xml.dom.minidom.Document`: Parsed document.\n    \"\"\"\n    if hasattr(filename_or_file, \"read\"):\n        return xml.dom.minidom.parse(filename_or_file, **kwargs)\n    else:\n        with xopen(filename_or_file, \"rb\", download_config=download_config) as f:\n            return xml.dom.minidom.parse(f, **kwargs)\n\n\nclass ArchiveIterable(TrackedIterableFromGenerator):\n    \"\"\"An iterable of (path, fileobj) from a TAR archive, used by `iter_archive`\"\"\"\n\n    @staticmethod\n    def _iter_tar(f):\n        stream = tarfile.open(fileobj=f, mode=\"r|*\")\n        for tarinfo in stream:\n            file_path = tarinfo.name\n            if not tarinfo.isreg():\n                continue\n            if file_path is None:\n                continue\n            if os.path.basename(file_path).startswith((\".\", \"__\")):\n                # skipping hidden files\n                continue\n            file_obj = stream.extractfile(tarinfo)\n            yield file_path, file_obj\n            stream.members = []\n        del stream\n\n    @staticmethod\n    def _iter_zip(f):\n        zipf = zipfile.ZipFile(f)\n        for member in zipf.infolist():\n            file_path = member.filename\n            if member.is_dir():\n                continue\n            if file_path is None:\n                continue\n     
       if os.path.basename(file_path).startswith((\".\", \"__\")):\n                # skipping hidden files\n                continue\n            file_obj = zipf.open(member)\n            yield file_path, file_obj\n\n    @classmethod\n    def _iter_from_fileobj(cls, f) -> Generator[tuple, None, None]:\n        compression = _get_extraction_protocol_with_magic_number(f)\n        if compression == \"zip\":\n            yield from cls._iter_zip(f)\n        else:\n            yield from cls._iter_tar(f)\n\n    @classmethod\n    def _iter_from_urlpath(\n        cls, urlpath: str, download_config: Optional[DownloadConfig] = None\n    ) -> Generator[tuple, None, None]:\n        compression = _get_extraction_protocol(urlpath, download_config=download_config)\n        # Set block_size=0 to get faster streaming\n        # (e.g. for hf:// and https:// it uses streaming Requests file-like instances)\n        with xopen(urlpath, \"rb\", download_config=download_config, block_size=0) as f:\n            if compression == \"zip\":\n                yield from cls._iter_zip(f)\n            else:\n                yield from cls._iter_tar(f)\n\n    @classmethod\n    def from_buf(cls, fileobj) -> \"ArchiveIterable\":\n        return cls(cls._iter_from_fileobj, fileobj)\n\n    @classmethod\n    def from_urlpath(cls, urlpath_or_buf, download_config: Optional[DownloadConfig] = None) -> \"ArchiveIterable\":\n        return cls(cls._iter_from_urlpath, urlpath_or_buf, download_config)\n\n\nclass FilesIterable(TrackedIterableFromGenerator):\n    \"\"\"An iterable of paths from a list of directories or files\"\"\"\n\n    @classmethod\n    def _iter_from_urlpaths(\n        cls, urlpaths: Union[str, list[str]], download_config: Optional[DownloadConfig] = None\n    ) -> Generator[str, None, None]:\n        if not isinstance(urlpaths, list):\n            urlpaths = [urlpaths]\n        for urlpath in urlpaths:\n            if xisfile(urlpath, download_config=download_config):\n                
yield urlpath\n            elif xisdir(urlpath, download_config=download_config):\n                for dirpath, dirnames, filenames in xwalk(urlpath, download_config=download_config):\n                    # in-place modification to prune the search\n                    dirnames[:] = sorted([dirname for dirname in dirnames if not dirname.startswith((\".\", \"__\"))])\n                    if xbasename(dirpath).startswith((\".\", \"__\")):\n                        # skipping hidden directories\n                        continue\n                    for filename in sorted(filenames):\n                        if filename.startswith((\".\", \"__\")):\n                            # skipping hidden files\n                            continue\n                        yield xjoin(dirpath, filename)\n            else:\n                raise FileNotFoundError(urlpath)\n\n    @classmethod\n    def from_urlpaths(cls, urlpaths, download_config: Optional[DownloadConfig] = None) -> \"FilesIterable\":\n        return cls(cls._iter_from_urlpaths, urlpaths, download_config)\n"
  },
  {
    "path": "src/datasets/utils/filelock.py",
    "content": "# deprecated, please use the `filelock` package instead\n\nfrom filelock import (  # noqa: F401 # imported for backward compatibility TODO: remove in 3.0.0\n    BaseFileLock,\n    SoftFileLock,\n    Timeout,\n    UnixFileLock,\n    WindowsFileLock,\n)\n\nfrom ._filelock import FileLock  # noqa: F401 # imported for backward compatibility. TODO: remove in 3.0.0\n"
  },
  {
    "path": "src/datasets/utils/hub.py",
    "content": "from functools import partial\n\nfrom huggingface_hub import hf_hub_url\n\n\nhf_dataset_url = partial(hf_hub_url, repo_type=\"dataset\")\n"
  },
  {
    "path": "src/datasets/utils/info_utils.py",
    "content": "import enum\nimport os\nfrom typing import Optional\n\nfrom huggingface_hub.utils import insecure_hashlib\n\nfrom .. import config\nfrom ..exceptions import (\n    ExpectedMoreDownloadedFilesError,\n    ExpectedMoreSplitsError,\n    NonMatchingChecksumError,\n    NonMatchingSplitsSizesError,\n    UnexpectedDownloadedFileError,\n    UnexpectedSplitsError,\n)\nfrom .logging import get_logger\n\n\nlogger = get_logger(__name__)\n\n\nclass VerificationMode(enum.Enum):\n    \"\"\"`Enum` that specifies which verification checks to run.\n\n    The default mode is `BASIC_CHECKS`, which will perform only rudimentary checks to avoid slowdowns\n    when generating/downloading a dataset for the first time.\n\n    The verification modes:\n\n    |                           | Verification checks                                                           |\n    |---------------------------|------------------------------------------------------------------------------ |\n    | `ALL_CHECKS`              | Split checks and validity (number of files, checksums) of downloaded files    |\n    | `BASIC_CHECKS` (default)  | Same as `ALL_CHECKS` but without checking downloaded files                    |\n    | `NO_CHECKS`               | None                                                                          |\n\n    \"\"\"\n\n    ALL_CHECKS = \"all_checks\"\n    BASIC_CHECKS = \"basic_checks\"\n    NO_CHECKS = \"no_checks\"\n\n\ndef verify_checksums(expected_checksums: Optional[dict], recorded_checksums: dict, verification_name=None):\n    if expected_checksums is None:\n        logger.info(\"Unable to verify checksums.\")\n        return\n    if len(set(expected_checksums) - set(recorded_checksums)) > 0:\n        raise ExpectedMoreDownloadedFilesError(str(set(expected_checksums) - set(recorded_checksums)))\n    if len(set(recorded_checksums) - set(expected_checksums)) > 0:\n        raise UnexpectedDownloadedFileError(str(set(recorded_checksums) - 
set(expected_checksums)))\n    bad_urls = [url for url in expected_checksums if expected_checksums[url] != recorded_checksums[url]]\n    for_verification_name = \" for \" + verification_name if verification_name is not None else \"\"\n    if len(bad_urls) > 0:\n        raise NonMatchingChecksumError(\n            f\"Checksums didn't match{for_verification_name}:\\n\"\n            f\"{bad_urls}\\n\"\n            \"Set `verification_mode='no_checks'` to skip checksums verification and ignore this error\"\n        )\n    logger.info(\"All the checksums matched successfully\" + for_verification_name)\n\n\ndef verify_splits(expected_splits: Optional[dict], recorded_splits: dict):\n    if expected_splits is None:\n        logger.info(\"Unable to verify splits sizes.\")\n        return\n    if len(set(expected_splits) - set(recorded_splits)) > 0:\n        raise ExpectedMoreSplitsError(str(set(expected_splits) - set(recorded_splits)))\n    if len(set(recorded_splits) - set(expected_splits)) > 0:\n        raise UnexpectedSplitsError(str(set(recorded_splits) - set(expected_splits)))\n    bad_splits = [\n        {\"expected\": expected_splits[name], \"recorded\": recorded_splits[name]}\n        for name in expected_splits\n        if expected_splits[name].num_examples != recorded_splits[name].num_examples\n    ]\n    if len(bad_splits) > 0:\n        raise NonMatchingSplitsSizesError(str(bad_splits))\n    logger.info(\"All the splits matched successfully.\")\n\n\ndef get_size_checksum_dict(path: str, record_checksum: bool = True) -> dict:\n    \"\"\"Compute the file size and the sha256 checksum of a file\"\"\"\n    if record_checksum:\n        m = insecure_hashlib.sha256()\n        with open(path, \"rb\") as f:\n            for chunk in iter(lambda: f.read(1 << 20), b\"\"):\n                m.update(chunk)\n            checksum = m.hexdigest()\n    else:\n        checksum = None\n    return {\"num_bytes\": os.path.getsize(path), \"checksum\": checksum}\n\n\ndef 
is_small_dataset(dataset_size):\n    \"\"\"Check if `dataset_size` is smaller than `config.IN_MEMORY_MAX_SIZE`.\n\n    Args:\n        dataset_size (int): Dataset size in bytes.\n\n    Returns:\n        bool: Whether `dataset_size` is smaller than `config.IN_MEMORY_MAX_SIZE`.\n    \"\"\"\n    if dataset_size and config.IN_MEMORY_MAX_SIZE:\n        return dataset_size < config.IN_MEMORY_MAX_SIZE\n    else:\n        return False\n"
  },
  {
    "path": "src/datasets/utils/json.py",
    "content": "from typing import TYPE_CHECKING, Any\n\nimport pandas as pd\n\n\nif TYPE_CHECKING:\n    from ..features.features import FeatureType\n\n\ndef ujson_dumps(*args, **kwargs):\n    try:\n        return pd.io.json.ujson_dumps(*args, **kwargs)\n    except AttributeError:\n        # Before pandas-2.2.0, ujson_dumps was renamed to dumps: import ujson_dumps as dumps\n        return pd.io.json.dumps(*args, **kwargs)\n\n\ndef ujson_loads(*args, **kwargs):\n    try:\n        return pd.io.json.ujson_loads(*args, **kwargs)\n    except AttributeError:\n        # Before pandas-2.2.0, ujson_loads was renamed to loads: import ujson_loads as loads\n        return pd.io.json.loads(*args, **kwargs)\n\n\ndef json_encode_field(example: Any, json_field_path: str) -> Any:\n    if json_field_path:\n        field, *json_field_path = json_field_path\n        if example is None:\n            return None\n        elif field == 0:\n            return [json_encode_field(x, json_field_path) for x in example]\n        else:\n            return {**example, field: json_encode_field(example.get(field), json_field_path)}\n        return example\n    else:\n        try:\n            ujson_loads(example)\n        except Exception:\n            return ujson_dumps(example)\n        else:\n            return example\n\n\ndef find_mixed_struct_types_field_paths(examples: list, allow_root=False) -> list[str]:\n    mixed_struct_types_field_paths = []\n    examples = [example for example in examples if example is not None]\n    if not examples:\n        return []\n    paths_and_content_to_check = [([], examples)]\n    while paths_and_content_to_check:\n        path, content = paths_and_content_to_check.pop(0)\n        if all(isinstance(x, dict) for x in content):\n            if (allow_root or path) and (any(set(x) != set(content[0]) for x in content) or not content[0]):\n                mixed_struct_types_field_paths.append(path)\n            else:\n                for subfield in {field for x 
in content for field in x}:\n                    examples = [x[subfield] for x in content if subfield in x and x[subfield] is not None]\n                    if not examples:\n                        continue\n                    paths_and_content_to_check.append((path + [subfield], examples))\n        elif all(isinstance(x, list) for x in content):\n            examples = [x for sublist in content for x in sublist if x is not None]\n            if not examples:\n                continue\n            paths_and_content_to_check.append((path + [0], examples))\n        elif any(isinstance(x, (dict, list)) for x in content):\n            mixed_struct_types_field_paths.append(path)\n    return mixed_struct_types_field_paths\n\n\ndef get_json_field_path_from_pyarrow_json_error(err_str: str) -> str:\n    # e.g. json_field_path_str = \"col/subfield_containing_a_list/[]/subsubfield_in_item_in_the_list\"\n    json_field_path_str = err_str.split(\"Column(\", 1)[1].rsplit(\") changed from\", 1)[0].strip(\"/\")\n    # e.g. 
json_field_path = [\"col\", \"subfield_containing_a_list\", 0, \"subsubfield_in_item_in_the_list\"]\n    json_field_path = [0 if seg == \"[]\" else seg for seg in json_field_path_str.split(\"/\")]\n    return json_field_path\n\n\ndef insert_json_field_path(json_field_paths: list[str], json_field_path: str) -> None:\n    # Add json_field_path to json_field_paths; if it is a prefix of an existing path, replace that path instead of appending\n    for i in range(len(json_field_paths)):\n        if json_field_paths[i][: len(json_field_path)] == json_field_path:\n            json_field_paths[i] = json_field_path\n            break\n    else:\n        json_field_paths.append(json_field_path)\n\n\ndef json_encode_fields_in_json_lines(original_batch: bytes, json_field_paths: list[str]) -> bytes:\n    examples = [ujson_loads(line) for line in original_batch.splitlines()]\n    for json_field_path in json_field_paths:\n        examples = [json_encode_field(example, json_field_path) for example in examples]\n    batch = \"\\n\".join([ujson_dumps(example) for example in examples]).encode()\n    return batch\n\n\ndef get_json_field_paths_from_feature(feature: \"FeatureType\") -> list[str]:\n    from datasets.features.features import Json, _visit_with_path\n\n    json_field_paths = []\n\n    def get_json_type_path(_feature, feature_path):\n        if isinstance(_feature, Json):\n            json_field_paths.append(feature_path)\n        return _feature\n\n    _visit_with_path(feature, get_json_type_path)\n    return json_field_paths\n\n\ndef set_json_types_in_feature(feature: \"FeatureType\", json_field_paths: list[str]) -> None:\n    from datasets.features.features import Json, _visit_with_path\n\n    def set_json_type(feature, feature_path):\n        return Json() if feature_path in json_field_paths else feature\n\n    feature = _visit_with_path(feature, set_json_type)\n    return feature\n"
  },
  {
    "path": "src/datasets/utils/logging.py",
    "content": "# Copyright 2020 Optuna, Hugging Face\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Logging utilities.\"\"\"\n\nimport logging\nimport os\nfrom logging import (\n    CRITICAL,  # NOQA\n    DEBUG,  # NOQA\n    ERROR,  # NOQA\n    FATAL,  # NOQA\n    INFO,  # NOQA\n    NOTSET,  # NOQA\n    WARN,  # NOQA\n    WARNING,  # NOQA\n)\nfrom typing import Optional\n\nfrom .tqdm import (  # noqa: F401 # imported for backward compatibility\n    disable_progress_bar,\n    enable_progress_bar,\n    is_progress_bar_enabled,\n    tqdm,\n)\n\n\nlog_levels = {\n    \"debug\": logging.DEBUG,\n    \"info\": logging.INFO,\n    \"warning\": logging.WARNING,\n    \"error\": logging.ERROR,\n    \"critical\": logging.CRITICAL,\n}\n\n_default_log_level = logging.WARNING\n\n\ndef _get_default_logging_level():\n    \"\"\"\n    If DATASETS_VERBOSITY env var is set to one of the valid choices return that as the new default level.\n    If it is not - fall back to ``_default_log_level``\n    \"\"\"\n    env_level_str = os.getenv(\"DATASETS_VERBOSITY\", None)\n    if env_level_str:\n        if env_level_str in log_levels:\n            return log_levels[env_level_str]\n        else:\n            logging.getLogger().warning(\n                f\"Unknown option DATASETS_VERBOSITY={env_level_str}, has to be one of: {', '.join(log_levels.keys())}\"\n            )\n    return _default_log_level\n\n\ndef _get_library_name() -> str:\n    return __name__.split(\".\")[0]\n\n\ndef 
_get_library_root_logger() -> logging.Logger:\n    return logging.getLogger(_get_library_name())\n\n\ndef _configure_library_root_logger() -> None:\n    # Apply our default configuration to the library root logger.\n    library_root_logger = _get_library_root_logger()\n    library_root_logger.addHandler(logging.StreamHandler())\n    library_root_logger.setLevel(_get_default_logging_level())\n\n\ndef _reset_library_root_logger() -> None:\n    library_root_logger = _get_library_root_logger()\n    library_root_logger.setLevel(logging.NOTSET)\n\n\ndef get_logger(name: Optional[str] = None) -> logging.Logger:\n    \"\"\"Return a logger with the specified name.\n    This function can be used in dataset builders.\n    \"\"\"\n    if name is None:\n        name = _get_library_name()\n    return logging.getLogger(name)\n\n\ndef get_verbosity() -> int:\n    \"\"\"Return the current level for the HuggingFace datasets library's root logger.\n    Returns:\n        Logging level, e.g., `datasets.logging.DEBUG` and `datasets.logging.INFO`.\n\n    > [!TIP]\n    > HuggingFace datasets library has following logging levels:\n    >     - `datasets.logging.CRITICAL`, `datasets.logging.FATAL`\n    >     - `datasets.logging.ERROR`\n    >     - `datasets.logging.WARNING`, `datasets.logging.WARN`\n    >     - `datasets.logging.INFO`\n    >     - `datasets.logging.DEBUG`\n    \"\"\"\n    return _get_library_root_logger().getEffectiveLevel()\n\n\ndef set_verbosity(verbosity: int) -> None:\n    \"\"\"Set the level for the Hugging Face Datasets library's root logger.\n    Args:\n        verbosity:\n            Logging level, e.g., `datasets.logging.DEBUG` and `datasets.logging.INFO`.\n    \"\"\"\n    _get_library_root_logger().setLevel(verbosity)\n\n\ndef set_verbosity_info():\n    \"\"\"Set the level for the Hugging Face datasets library's root logger to `INFO`.\n\n    This will display most of the logging information and tqdm bars.\n\n    Shortcut to 
`datasets.logging.set_verbosity(datasets.logging.INFO)`.\n    \"\"\"\n    return set_verbosity(INFO)\n\n\ndef set_verbosity_warning():\n    \"\"\"Set the level for the Hugging Face datasets library's root logger to `WARNING`.\n\n    This will display only the warning and error logging information and tqdm bars.\n\n    Shortcut to `datasets.logging.set_verbosity(datasets.logging.WARNING)`.\n    \"\"\"\n    return set_verbosity(WARNING)\n\n\ndef set_verbosity_debug():\n    \"\"\"Set the level for the Hugging Face datasets library's root logger to `DEBUG`.\n\n    This will display all the logging information and tqdm bars.\n\n    Shortcut to `datasets.logging.set_verbosity(datasets.logging.DEBUG)`.\n    \"\"\"\n    return set_verbosity(DEBUG)\n\n\ndef set_verbosity_error():\n    \"\"\"Set the level for the Hugging Face datasets library's root logger to `ERROR`.\n\n    This will display only the error logging information and tqdm bars.\n\n    Shortcut to `datasets.logging.set_verbosity(datasets.logging.ERROR)`.\n    \"\"\"\n    return set_verbosity(ERROR)\n\n\ndef disable_propagation() -> None:\n    \"\"\"Disable propagation of the library log outputs.\n    Note that log propagation is enabled by default.\n    \"\"\"\n    _get_library_root_logger().propagate = False\n\n\ndef enable_propagation() -> None:\n    \"\"\"Enable propagation of the library log outputs.\n    Please disable the Hugging Face datasets library's default handler to prevent double logging if the root logger has\n    been configured.\n    \"\"\"\n    _get_library_root_logger().propagate = True\n\n\n# Configure the library root logger at the module level (singleton-like)\n_configure_library_root_logger()\n"
  },
  {
    "path": "src/datasets/utils/metadata.py",
    "content": "import re\nimport textwrap\nfrom collections import Counter\nfrom itertools import groupby\nfrom operator import itemgetter\nfrom typing import Any, ClassVar, Optional\n\nimport yaml\nfrom huggingface_hub import DatasetCardData\n\nfrom ..config import METADATA_CONFIGS_FIELD\nfrom ..features import Features\nfrom ..info import DatasetInfo, DatasetInfosDict\nfrom ..naming import _split_re\nfrom ..utils.logging import get_logger\n\n\nlogger = get_logger(__name__)\n\n\nclass _NoDuplicateSafeLoader(yaml.SafeLoader):\n    def _check_no_duplicates_on_constructed_node(self, node):\n        keys = [self.constructed_objects[key_node] for key_node, _ in node.value]\n        keys = [tuple(key) if isinstance(key, list) else key for key in keys]\n        counter = Counter(keys)\n        duplicate_keys = [key for key in counter if counter[key] > 1]\n        if duplicate_keys:\n            raise TypeError(f\"Got duplicate yaml keys: {duplicate_keys}\")\n\n    def construct_mapping(self, node, deep=False):\n        mapping = super().construct_mapping(node, deep=deep)\n        self._check_no_duplicates_on_constructed_node(node)\n        return mapping\n\n\ndef _split_yaml_from_readme(readme_content: str) -> tuple[Optional[str], str]:\n    full_content = list(readme_content.splitlines())\n    if full_content and full_content[0] == \"---\" and \"---\" in full_content[1:]:\n        sep_idx = full_content[1:].index(\"---\") + 1\n        yamlblock = \"\\n\".join(full_content[1:sep_idx])\n        return yamlblock, \"\\n\".join(full_content[sep_idx + 1 :])\n\n    return None, \"\\n\".join(full_content)\n\n\nclass MetadataConfigs(dict[str, dict[str, Any]]):\n    \"\"\"Should be in format {config_name: {**config_params}}.\"\"\"\n\n    FIELD_NAME: ClassVar[str] = METADATA_CONFIGS_FIELD\n\n    @staticmethod\n    def _raise_if_data_files_field_not_valid(metadata_config: dict):\n        yaml_data_files = metadata_config.get(\"data_files\")\n        if yaml_data_files is not 
None:\n            yaml_error_message = textwrap.dedent(\n                f\"\"\"\n                Expected data_files in YAML to be either a string or a list of strings\n                or a list of dicts with two keys: 'split' and 'path', but got {yaml_data_files}\n                Examples of data_files in YAML:\n\n                   data_files: data.csv\n\n                   data_files: data/*.png\n\n                   data_files:\n                    - part0/*\n                    - part1/*\n\n                   data_files:\n                    - split: train\n                      path: train/*\n                    - split: test\n                      path: test/*\n\n                   data_files:\n                    - split: train\n                      path:\n                      - train/part1/*\n                      - train/part2/*\n                    - split: test\n                      path: test/*\n\n                PS: some symbols like dashes '-' are not allowed in split names\n                \"\"\"\n            )\n            if not isinstance(yaml_data_files, (list, str)):\n                raise ValueError(yaml_error_message)\n            if isinstance(yaml_data_files, list):\n                for yaml_data_files_item in yaml_data_files:\n                    if (\n                        not isinstance(yaml_data_files_item, (str, dict))\n                        or isinstance(yaml_data_files_item, dict)\n                        and not (\n                            len(yaml_data_files_item) == 2\n                            and \"split\" in yaml_data_files_item\n                            and re.match(_split_re, yaml_data_files_item[\"split\"])\n                            and isinstance(yaml_data_files_item.get(\"path\"), (str, list))\n                        )\n                    ):\n                        raise ValueError(yaml_error_message)\n\n    @classmethod\n    def _from_exported_parquet_files_and_dataset_infos(\n        cls,\n        
parquet_commit_hash: str,\n        exported_parquet_files: list[dict[str, Any]],\n        dataset_infos: DatasetInfosDict,\n    ) -> \"MetadataConfigs\":\n        metadata_configs = {\n            config_name: {\n                \"data_files\": [\n                    {\n                        \"split\": split_name,\n                        \"path\": [\n                            parquet_file[\"url\"].replace(\"refs%2Fconvert%2Fparquet\", parquet_commit_hash)\n                            for parquet_file in parquet_files_for_split\n                        ],\n                    }\n                    for split_name, parquet_files_for_split in groupby(parquet_files_for_config, itemgetter(\"split\"))\n                ],\n                \"version\": str(dataset_infos.get(config_name, DatasetInfo()).version or \"0.0.0\"),\n            }\n            for config_name, parquet_files_for_config in groupby(exported_parquet_files, itemgetter(\"config\"))\n        }\n        if dataset_infos:\n            # Preserve order of configs and splits\n            metadata_configs = {\n                config_name: {\n                    \"data_files\": [\n                        data_file\n                        for split_name in dataset_info.splits\n                        for data_file in metadata_configs[config_name][\"data_files\"]\n                        if data_file[\"split\"] == split_name\n                    ],\n                    \"version\": metadata_configs[config_name][\"version\"],\n                }\n                for config_name, dataset_info in dataset_infos.items()\n            }\n        return cls(metadata_configs)\n\n    @classmethod\n    def from_dataset_card_data(cls, dataset_card_data: DatasetCardData) -> \"MetadataConfigs\":\n        if dataset_card_data.get(cls.FIELD_NAME):\n            metadata_configs = dataset_card_data[cls.FIELD_NAME]\n            if not isinstance(metadata_configs, list):\n                raise ValueError(f\"Expected 
{cls.FIELD_NAME} to be a list, but got '{metadata_configs}'\")\n            for metadata_config in metadata_configs:\n                if \"config_name\" not in metadata_config:\n                    raise ValueError(\n                        f\"Each config must include `config_name` field with a string name of a config, \"\n                        f\"but got {metadata_config}. \"\n                    )\n                cls._raise_if_data_files_field_not_valid(metadata_config)\n            return cls(\n                {\n                    config.pop(\"config_name\"): {\n                        param: value if param != \"features\" else Features._from_yaml_list(value)\n                        for param, value in config.items()\n                    }\n                    for metadata_config in metadata_configs\n                    if (config := metadata_config.copy())\n                }\n            )\n        return cls()\n\n    def to_dataset_card_data(self, dataset_card_data: DatasetCardData) -> None:\n        if self:\n            for metadata_config in self.values():\n                self._raise_if_data_files_field_not_valid(metadata_config)\n            current_metadata_configs = self.from_dataset_card_data(dataset_card_data)\n            total_metadata_configs = dict(sorted({**current_metadata_configs, **self}.items()))\n            for config_name, config_metadata in total_metadata_configs.items():\n                config_metadata.pop(\"config_name\", None)\n            dataset_card_data[self.FIELD_NAME] = [\n                {\"config_name\": config_name, **config_metadata}\n                for config_name, config_metadata in total_metadata_configs.items()\n            ]\n\n    def get_default_config_name(self) -> Optional[str]:\n        default_config_name = None\n        for config_name, metadata_config in self.items():\n            if len(self) == 1 or config_name == \"default\" or metadata_config.get(\"default\"):\n                if default_config_name 
is None:\n                    default_config_name = config_name\n                else:\n                    raise ValueError(\n                        f\"Dataset has several default configs: '{default_config_name}' and '{config_name}'.\"\n                    )\n        return default_config_name\n\n\n# DEPRECATED - just here to support old versions of evaluate like 0.2.2\n# To support new tasks on the Hugging Face Hub, please open a PR for this file:\n# https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/pipelines.ts\nknown_task_ids = {\n    \"image-classification\": [],\n    \"translation\": [],\n    \"image-segmentation\": [],\n    \"fill-mask\": [],\n    \"automatic-speech-recognition\": [],\n    \"token-classification\": [],\n    \"sentence-similarity\": [],\n    \"audio-classification\": [],\n    \"question-answering\": [],\n    \"summarization\": [],\n    \"zero-shot-classification\": [],\n    \"table-to-text\": [],\n    \"feature-extraction\": [],\n    \"other\": [],\n    \"multiple-choice\": [],\n    \"text-classification\": [],\n    \"text-to-image\": [],\n    \"text2text-generation\": [],\n    \"zero-shot-image-classification\": [],\n    \"tabular-classification\": [],\n    \"tabular-regression\": [],\n    \"image-to-image\": [],\n    \"tabular-to-text\": [],\n    \"unconditional-image-generation\": [],\n    \"text-retrieval\": [],\n    \"text-to-speech\": [],\n    \"object-detection\": [],\n    \"audio-to-audio\": [],\n    \"text-generation\": [],\n    \"conversational\": [],\n    \"table-question-answering\": [],\n    \"visual-question-answering\": [],\n    \"image-to-text\": [],\n    \"reinforcement-learning\": [],\n    \"voice-activity-detection\": [],\n    \"time-series-forecasting\": [],\n    \"document-question-answering\": [],\n}\n"
  },
  {
    "path": "src/datasets/utils/patching.py",
    "content": "from importlib import import_module\n\nfrom .logging import get_logger\n\n\nlogger = get_logger(__name__)\n\n\nclass _PatchedModuleObj:\n    \"\"\"Set all the modules components as attributes of the _PatchedModuleObj object.\"\"\"\n\n    def __init__(self, module, attrs=None):\n        attrs = attrs or []\n        if module is not None:\n            for key in module.__dict__:\n                if key in attrs or not key.startswith(\"__\"):\n                    setattr(self, key, getattr(module, key))\n        self._original_module = module._original_module if isinstance(module, _PatchedModuleObj) else module\n\n\nclass patch_submodule:\n    \"\"\"\n    Patch a submodule attribute of an object, by keeping all other submodules intact at all levels.\n\n    Example::\n\n        >>> import importlib\n        >>> from datasets.load import dataset_module_factory\n        >>> from datasets.streaming import patch_submodule, xjoin\n        >>>\n        >>> dataset_module = dataset_module_factory(\"stanfordnlp/snli\")\n        >>> snli_module = importlib.import_module(dataset_module.module_path)\n        >>> patcher = patch_submodule(snli_module, \"os.path.join\", xjoin)\n        >>> patcher.start()\n        >>> assert snli_module.os.path.join is xjoin\n    \"\"\"\n\n    _active_patches = []\n\n    def __init__(self, obj, target: str, new, attrs=None):\n        self.obj = obj\n        self.target = target\n        self.new = new\n        self.key = target.split(\".\")[0]\n        self.original = {}\n        self.attrs = attrs or []\n\n    def __enter__(self):\n        *submodules, target_attr = self.target.split(\".\")\n\n        # Patch modules:\n        # it's used to patch attributes of submodules like \"os.path.join\";\n        # in this case we need to patch \"os\" and \"os.path\"\n\n        for i in range(len(submodules)):\n            try:\n                submodule = import_module(\".\".join(submodules[: i + 1]))\n            except 
ModuleNotFoundError:\n                continue\n            # We iterate over all the globals in self.obj in case we find \"os\" or \"os.path\"\n            for attr in self.obj.__dir__():\n                obj_attr = getattr(self.obj, attr)\n                # We don't check for the name of the global, but rather if its value *is* \"os\" or \"os.path\".\n                # This allows to patch renamed modules like \"from os import path as ospath\".\n                if obj_attr is submodule or (\n                    isinstance(obj_attr, _PatchedModuleObj) and obj_attr._original_module is submodule\n                ):\n                    self.original[attr] = obj_attr\n                    # patch at top level\n                    setattr(self.obj, attr, _PatchedModuleObj(obj_attr, attrs=self.attrs))\n                    patched = getattr(self.obj, attr)\n                    # construct lower levels patches\n                    for key in submodules[i + 1 :]:\n                        setattr(patched, key, _PatchedModuleObj(getattr(patched, key, None), attrs=self.attrs))\n                        patched = getattr(patched, key)\n                    # finally set the target attribute\n                    setattr(patched, target_attr, self.new)\n\n        # Patch attribute itself:\n        # it's used for builtins like \"open\",\n        # and also to patch \"os.path.join\" we may also need to patch \"join\"\n        # itself if it was imported as \"from os.path import join\".\n\n        if submodules:  # if it's an attribute of a submodule like \"os.path.join\"\n            try:\n                attr_value = getattr(import_module(\".\".join(submodules)), target_attr)\n            except (AttributeError, ModuleNotFoundError):\n                return\n            # We iterate over all the globals in self.obj in case we find \"os.path.join\"\n            for attr in self.obj.__dir__():\n                # We don't check for the name of the global, but rather if its value *is* 
\"os.path.join\".\n                # This allows to patch renamed attributes like \"from os.path import join as pjoin\".\n                if getattr(self.obj, attr) is attr_value:\n                    self.original[attr] = getattr(self.obj, attr)\n                    setattr(self.obj, attr, self.new)\n        elif target_attr in globals()[\"__builtins__\"]:  # if it's a builtin like \"open\"\n            self.original[target_attr] = globals()[\"__builtins__\"][target_attr]\n            setattr(self.obj, target_attr, self.new)\n        else:\n            raise RuntimeError(f\"Tried to patch attribute {target_attr} instead of a submodule.\")\n\n    def __exit__(self, *exc_info):\n        for attr in list(self.original):\n            setattr(self.obj, attr, self.original.pop(attr))\n\n    def start(self):\n        \"\"\"Activate a patch.\"\"\"\n        self.__enter__()\n        self._active_patches.append(self)\n\n    def stop(self):\n        \"\"\"Stop an active patch.\"\"\"\n        try:\n            self._active_patches.remove(self)\n        except ValueError:\n            # If the patch hasn't been started this will fail\n            return None\n\n        return self.__exit__()\n"
  },
  {
    "path": "src/datasets/utils/py_utils.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"Some python utils function and classes.\"\"\"\n\nimport copy\nimport functools\nimport itertools\nimport multiprocessing.pool\nimport os\nimport queue\nimport re\nimport types\nimport warnings\nfrom collections.abc import Iterable\nfrom contextlib import contextmanager\nfrom dataclasses import fields, is_dataclass\nfrom queue import Empty\nfrom shutil import disk_usage\nfrom typing import Any, Callable, Optional, TypeVar, Union\n\nimport multiprocess\nimport multiprocess.pool\nimport numpy as np\nfrom tqdm.auto import tqdm\n\nfrom .. import config\nfrom ..parallel import parallel_map\nfrom . import logging\nfrom . import tqdm as hf_tqdm\nfrom ._dill import (  # noqa: F401 # imported for backward compatibility. 
TODO: remove in 3.0.0\n    Pickler,\n    dump,\n    dumps,\n    pklregister,\n)\n\n\ntry:  # pragma: no branch\n    from typing import Final\n\n    import typing_extensions as _typing_extensions\n    from typing_extensions import Literal\nexcept ImportError:\n    _typing_extensions = Literal = Final = None\n\n\nlogger = logging.get_logger(__name__)\n\n\n# NOTE: When used on an instance method, the cache is shared across all\n# instances and IS NOT per-instance.\n# See\n# https://stackoverflow.com/questions/14946264/python-lru-cache-decorator-per-instance\n# For @property methods, use @memoized_property below.\nmemoize = functools.lru_cache\n\n\ndef size_str(size_in_bytes):\n    \"\"\"Returns a human readable size string.\n\n    If size_in_bytes is None, then returns \"Unknown size\".\n\n    For example `size_str(1.5 * datasets.units.GiB) == \"1.50 GiB\"`.\n\n    Args:\n        size_in_bytes: `int` or `None`, the size, in bytes, that we want to\n            format as a human-readable size string.\n    \"\"\"\n    if not size_in_bytes:\n        return \"Unknown size\"\n\n    _NAME_LIST = [(\"PiB\", 2**50), (\"TiB\", 2**40), (\"GiB\", 2**30), (\"MiB\", 2**20), (\"KiB\", 2**10)]\n\n    size_in_bytes = float(size_in_bytes)\n    for name, size_bytes in _NAME_LIST:\n        value = size_in_bytes / size_bytes\n        if value >= 1.0:\n            return f\"{value:.2f} {name}\"\n    return f\"{int(size_in_bytes)} bytes\"\n\n\ndef convert_file_size_to_int(size: Union[int, str]) -> int:\n    \"\"\"\n    Converts a size expressed as a string with digits an unit (like `\"50MB\"`) to an integer (in bytes).\n\n    Args:\n        size (`int` or `str`): The size to convert. 
Will be directly returned if an `int`.\n\n    Example:\n\n    ```py\n    >>> convert_file_size_to_int(\"1MiB\")\n    1048576\n    ```\n    \"\"\"\n    if isinstance(size, int):\n        return size\n    if size.upper().endswith(\"PIB\"):\n        return int(size[:-3]) * (2**50)\n    if size.upper().endswith(\"TIB\"):\n        return int(size[:-3]) * (2**40)\n    if size.upper().endswith(\"GIB\"):\n        return int(size[:-3]) * (2**30)\n    if size.upper().endswith(\"MIB\"):\n        return int(size[:-3]) * (2**20)\n    if size.upper().endswith(\"KIB\"):\n        return int(size[:-3]) * (2**10)\n    if size.upper().endswith(\"PB\"):\n        int_size = int(size[:-2]) * (10**15)\n        return int_size // 8 if size.endswith(\"b\") else int_size\n    if size.upper().endswith(\"TB\"):\n        int_size = int(size[:-2]) * (10**12)\n        return int_size // 8 if size.endswith(\"b\") else int_size\n    if size.upper().endswith(\"GB\"):\n        int_size = int(size[:-2]) * (10**9)\n        return int_size // 8 if size.endswith(\"b\") else int_size\n    if size.upper().endswith(\"MB\"):\n        int_size = int(size[:-2]) * (10**6)\n        return int_size // 8 if size.endswith(\"b\") else int_size\n    if size.upper().endswith(\"KB\"):\n        int_size = int(size[:-2]) * (10**3)\n        return int_size // 8 if size.endswith(\"b\") else int_size\n    raise ValueError(f\"`size={size}` is not in a valid format. 
Use an integer followed by the unit, e.g., '5GB'.\")\n\n\ndef glob_pattern_to_regex(pattern):\n    # partially taken from fsspec:\n    # https://github.com/fsspec/filesystem_spec/blob/697d0f8133d8a5fbc3926e4761d7ecd51337ce50/fsspec/asyn.py#L735\n    return (\n        pattern.replace(\"\\\\\", r\"\\\\\")\n        .replace(\".\", r\"\\.\")\n        .replace(\"*\", \".*\")\n        .replace(\"+\", r\"\\+\")\n        .replace(\"//\", \"/\")\n        .replace(\"(\", r\"\\(\")\n        .replace(\")\", r\"\\)\")\n        .replace(\"|\", r\"\\|\")\n        .replace(\"^\", r\"\\^\")\n        .replace(\"$\", r\"\\$\")\n        .rstrip(\"/\")\n        .replace(\"?\", \".\")\n    )\n\n\ndef string_to_dict(string: str, pattern: str) -> Optional[dict[str, str]]:\n    \"\"\"Un-format a string using a python f-string pattern.\n    From https://stackoverflow.com/a/36838374\n\n    Example::\n\n        >>> p = 'hello, my name is {name} and I am a {age} year old {what}'\n        >>> s = p.format(name='cody', age=18, what='quarterback')\n        >>> s\n        'hello, my name is cody and I am a 18 year old quarterback'\n        >>> string_to_dict(s, p)\n        {'age': '18', 'name': 'cody', 'what': 'quarterback'}\n\n    Args:\n        string (str): input string\n        pattern (str): pattern formatted like a python f-string\n            This can be a regex - so in case of un-formatting paths you should use posix paths.\n            Otherwise backslashes for windows paths can cause issues.\n\n    Returns:\n        Optional[dict[str, str]]: dictionary of variable -> value, retrieved from the input using the pattern, or\n        `None` if the string does not match the pattern.\n    \"\"\"\n    pattern = re.sub(r\"{([^:}]+)(?::[^}]+)?}\", r\"{\\1}\", pattern)  # remove format specifiers, e.g. 
{rank:05d} -> {rank}\n    regex = re.sub(r\"{(.+?)}\", r\"(?P<_\\1>.+)\", pattern)\n    result = re.search(regex, string)\n    if result is None:\n        return None\n    values = list(result.groups())\n    keys = re.findall(r\"{(.+?)}\", pattern)\n    _dict = dict(zip(keys, values))\n    return _dict\n\n\ndef asdict(obj):\n    \"\"\"Convert an object to its dictionary representation recursively.\n\n    <Added version=\"2.4.0\"/>\n    \"\"\"\n\n    # Implementation based on https://docs.python.org/3/library/dataclasses.html#dataclasses.asdict\n\n    def _is_dataclass_instance(obj):\n        # https://docs.python.org/3/library/dataclasses.html#dataclasses.is_dataclass\n        return is_dataclass(obj) and not isinstance(obj, type)\n\n    def _asdict_inner(obj):\n        if _is_dataclass_instance(obj):\n            result = {}\n            for f in fields(obj):\n                value = _asdict_inner(getattr(obj, f.name))\n                if not f.init or value != f.default or f.metadata.get(\"include_in_asdict_even_if_is_default\", False):\n                    result[f.name] = value\n            return result\n        elif isinstance(obj, tuple) and hasattr(obj, \"_fields\"):\n            # obj is a namedtuple\n            return type(obj)(*[_asdict_inner(v) for v in obj])\n        elif isinstance(obj, (list, tuple)):\n            # Assume we can create an object of this type by passing in a\n            # generator (which is not true for namedtuples, handled\n            # above).\n            return type(obj)(_asdict_inner(v) for v in obj)\n        elif isinstance(obj, dict):\n            return {_asdict_inner(k): _asdict_inner(v) for k, v in obj.items()}\n        else:\n            return copy.deepcopy(obj)\n\n    if not isinstance(obj, dict) and not _is_dataclass_instance(obj):\n        raise TypeError(f\"{obj} is not a dict or a dataclass\")\n\n    return _asdict_inner(obj)\n\n\n@contextmanager\ndef temporary_assignment(obj, attr, value):\n    \"\"\"Temporarily 
assign obj.attr to value.\"\"\"\n    original = getattr(obj, attr, None)\n    setattr(obj, attr, value)\n    try:\n        yield\n    finally:\n        setattr(obj, attr, original)\n\n\n@contextmanager\ndef temp_seed(seed: int, set_pytorch=False, set_tensorflow=False):\n    \"\"\"Temporarily set the random seed. This works for python numpy, pytorch and tensorflow.\"\"\"\n    np_state = np.random.get_state()\n    np.random.seed(seed)\n\n    if set_pytorch and config.TORCH_AVAILABLE:\n        import torch\n\n        torch_state = torch.random.get_rng_state()\n        torch.random.manual_seed(seed)\n\n        if torch.cuda.is_available():\n            torch_cuda_states = torch.cuda.get_rng_state_all()\n            torch.cuda.manual_seed_all(seed)\n\n    if set_tensorflow and config.TF_AVAILABLE:\n        import tensorflow as tf\n        from tensorflow.python.eager import context as tfpycontext\n\n        tf_state = tf.random.get_global_generator()\n        temp_gen = tf.random.Generator.from_seed(seed)\n        tf.random.set_global_generator(temp_gen)\n\n        if not tf.executing_eagerly():\n            raise ValueError(\"Setting random seed for TensorFlow is only available in eager mode\")\n\n        tf_context = tfpycontext.context()  # eager mode context\n        tf_seed = tf_context._seed\n        tf_rng_initialized = hasattr(tf_context, \"_rng\")\n        if tf_rng_initialized:\n            tf_rng = tf_context._rng\n        tf_context._set_global_seed(seed)\n\n    try:\n        yield\n    finally:\n        np.random.set_state(np_state)\n\n        if set_pytorch and config.TORCH_AVAILABLE:\n            torch.random.set_rng_state(torch_state)\n            if torch.cuda.is_available():\n                torch.cuda.set_rng_state_all(torch_cuda_states)\n\n        if set_tensorflow and config.TF_AVAILABLE:\n            tf.random.set_global_generator(tf_state)\n\n            tf_context._seed = tf_seed\n            if tf_rng_initialized:\n                
tf_context._rng = tf_rng\n            else:\n                delattr(tf_context, \"_rng\")\n\n\ndef unique_values(values):\n    \"\"\"Iterate over iterable and return only unique values in order.\"\"\"\n    seen = set()\n    for value in values:\n        if value not in seen:\n            seen.add(value)\n            yield value\n\n\ndef no_op_if_value_is_null(func):\n    \"\"\"If the value is None, return None, else call `func`.\"\"\"\n\n    def wrapper(value):\n        return func(value) if value is not None else None\n\n    return wrapper\n\n\ndef first_non_null_value(iterable):\n    \"\"\"Return the index and the value of the first non-null value in the iterable. If all values are None, return -1 as index.\"\"\"\n    for i, value in enumerate(iterable):\n        if value is not None:\n            return i, value\n    return -1, None\n\n\ndef first_non_null_non_empty_value(iterable):\n    \"\"\"Return the index and the value of the first non-null non-empty value in the iterable. If all values are None or empty, return -1 as index.\"\"\"\n    for i, value in enumerate(iterable):\n        if value is not None and not (isinstance(value, (dict, list)) and len(value) == 0):\n            return i, value\n    return -1, None\n\n\ndef zip_dict(*dicts):\n    \"\"\"Iterate over items of dictionaries grouped by their keys.\"\"\"\n    for key in unique_values(itertools.chain(*dicts)):  # set merge all keys\n        # Will raise KeyError if the dict don't have the same keys\n        yield key, tuple(d[key] for d in dicts)\n\n\nclass NonMutableDict(dict):\n    \"\"\"Dict where keys can only be added but not modified.\n\n    Will raise an error if the user try to overwrite one key. The error message\n    can be customized during construction. 
It will be formatted using {key} for\n    the overwritten key.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        self._error_msg = kwargs.pop(\n            \"error_msg\",\n            \"Try to overwrite existing key: {key}\",\n        )\n        if kwargs:\n            raise ValueError(\"NonMutableDict cannot be initialized with kwargs.\")\n        super().__init__(*args, **kwargs)\n\n    def __setitem__(self, key, value):\n        if key in self:\n            raise ValueError(self._error_msg.format(key=key))\n        return super().__setitem__(key, value)\n\n    def update(self, other):\n        if any(k in self for k in other):\n            raise ValueError(self._error_msg.format(key=set(self) & set(other)))\n        return super().update(other)\n\n\nclass classproperty(property):  # pylint: disable=invalid-name\n    \"\"\"Descriptor to be used as decorator for @classmethods.\"\"\"\n\n    def __get__(self, obj, objtype=None):\n        return self.fget.__get__(None, objtype)()\n\n\ndef _single_map_nested(args):\n    \"\"\"Apply a function recursively to each element of a nested data struct.\"\"\"\n    function, data_struct, batched, batch_size, types, rank, disable_tqdm, desc = args\n\n    # Singleton first to spare some computation\n    if not isinstance(data_struct, dict) and not isinstance(data_struct, types):\n        if batched:\n            return function([data_struct])[0]\n        else:\n            return function(data_struct)\n    if (\n        batched\n        and not isinstance(data_struct, dict)\n        and isinstance(data_struct, types)\n        and all(not isinstance(v, (dict, types)) for v in data_struct)\n    ):\n        return [mapped_item for batch in iter_batched(data_struct, batch_size) for mapped_item in function(batch)]\n\n    # Reduce logging to keep things readable in multiprocessing with tqdm\n    if rank is not None and logging.get_verbosity() < logging.WARNING:\n        logging.set_verbosity_warning()\n    # Print at 
least one thing to fix tqdm in notebooks in multiprocessing\n    # see https://github.com/tqdm/tqdm/issues/485#issuecomment-473338308\n    if rank is not None and not disable_tqdm and any(\"notebook\" in tqdm_cls.__name__ for tqdm_cls in tqdm.__mro__):\n        print(\" \", end=\"\", flush=True)\n\n    # Loop over single examples or batches and write to buffer/file if examples are to be updated\n    pbar_iterable = data_struct.items() if isinstance(data_struct, dict) else data_struct\n    pbar_desc = (desc + \" \" if desc is not None else \"\") + \"#\" + str(rank) if rank is not None else desc\n    with hf_tqdm(pbar_iterable, disable=disable_tqdm, position=rank, unit=\"obj\", desc=pbar_desc) as pbar:\n        if isinstance(data_struct, dict):\n            return {\n                k: _single_map_nested((function, v, batched, batch_size, types, None, True, None)) for k, v in pbar\n            }\n        else:\n            mapped = [_single_map_nested((function, v, batched, batch_size, types, None, True, None)) for v in pbar]\n            if isinstance(data_struct, list):\n                return mapped\n            elif isinstance(data_struct, tuple):\n                return tuple(mapped)\n            else:\n                return np.array(mapped)\n\n\ndef map_nested(\n    function: Callable[[Any], Any],\n    data_struct: Any,\n    dict_only: bool = False,\n    map_list: bool = True,\n    map_tuple: bool = False,\n    map_numpy: bool = False,\n    num_proc: Optional[int] = None,\n    parallel_min_length: int = 2,\n    batched: bool = False,\n    batch_size: Optional[int] = 1000,\n    types: Optional[tuple] = None,\n    disable_tqdm: bool = True,\n    desc: Optional[str] = None,\n) -> Any:\n    \"\"\"Apply a function recursively to each element of a nested data struct.\n\n    Use multiprocessing if num_proc > 1 and the length of data_struct is greater than or equal to\n    `parallel_min_length`.\n\n    <Changed version=\"2.5.0\">\n\n    Before version 2.5.0, 
multiprocessing was not used if `num_proc` was greater than or equal to ``len(iterable)``.\n\n    Now, if `num_proc` is greater than or equal to ``len(iterable)``, `num_proc` is set to ``len(iterable)`` and\n    multiprocessing is used.\n\n    </Changed>\n\n    Args:\n        function (`Callable`): Function to be applied to `data_struct`.\n        data_struct (`Any`): Data structure to apply `function` to.\n        dict_only (`bool`, default `False`): Whether only apply `function` recursively to `dict` values in\n            `data_struct`.\n        map_list (`bool`, default `True`): Whether also apply `function` recursively to `list` elements (besides `dict`\n            values).\n        map_tuple (`bool`, default `False`): Whether also apply `function` recursively to `tuple` elements (besides\n            `dict` values).\n        map_numpy (`bool, default `False`): Whether also apply `function` recursively to `numpy.array` elements (besides\n            `dict` values).\n        num_proc (`int`, *optional*): Number of processes.\n            The level in the data struct used for multiprocessing is the first level that has smaller sub-structs,\n            starting from the root.\n        parallel_min_length (`int`, default `2`): Minimum length of `data_struct` required for parallel\n            processing.\n            <Added version=\"2.5.0\"/>\n        batched (`bool`, defaults to `False`):\n            Provide batch of items to `function`.\n            <Added version=\"2.19.0\"/>\n        batch_size (`int`, *optional*, defaults to `1000`):\n            Number of items per batch provided to `function` if `batched=True`.\n            If `batch_size <= 0` or `batch_size == None`, provide the full iterable as a single batch to `function`.\n            <Added version=\"2.19.0\"/>\n        types (`tuple`, *optional*): Additional types (besides `dict` values) to apply `function` recursively to their\n            elements.\n        disable_tqdm (`bool`, default 
`True`): Whether to disable the tqdm progressbar.\n        desc (`str`, *optional*): Prefix for the tqdm progressbar.\n\n    Returns:\n        `Any`\n    \"\"\"\n    if types is None:\n        types = []\n        if not dict_only:\n            if map_list:\n                types.append(list)\n            if map_tuple:\n                types.append(tuple)\n            if map_numpy:\n                types.append(np.ndarray)\n        types = tuple(types)\n\n    # Singleton\n    if not isinstance(data_struct, dict) and not isinstance(data_struct, types):\n        if batched:\n            data_struct = [data_struct]\n        mapped = function(data_struct)\n        if batched:\n            mapped = mapped[0]\n        return mapped\n\n    iterable = list(data_struct.values()) if isinstance(data_struct, dict) else data_struct\n\n    if num_proc is None:\n        num_proc = 1\n    if any(isinstance(v, types) and len(v) > len(iterable) for v in iterable):\n        mapped = [\n            map_nested(\n                function=function,\n                data_struct=obj,\n                num_proc=num_proc,\n                parallel_min_length=parallel_min_length,\n                batched=batched,\n                batch_size=batch_size,\n                types=types,\n            )\n            for obj in iterable\n        ]\n    elif num_proc != -1 and num_proc <= 1 or len(iterable) < parallel_min_length:\n        if batched:\n            if batch_size is None or batch_size <= 0:\n                batch_size = max(len(iterable) // num_proc + int(len(iterable) % num_proc > 0), 1)\n            iterable = list(iter_batched(iterable, batch_size))\n        mapped = [\n            _single_map_nested((function, obj, batched, batch_size, types, None, True, None))\n            for obj in hf_tqdm(iterable, disable=disable_tqdm, desc=desc)\n        ]\n        if batched:\n            mapped = [mapped_item for mapped_batch in mapped for mapped_item in mapped_batch]\n    else:\n        with 
warnings.catch_warnings():\n            warnings.filterwarnings(\n                \"ignore\",\n                message=\".* is experimental and might be subject to breaking changes in the future\\\\.$\",\n                category=UserWarning,\n            )\n            if batched:\n                if batch_size is None or batch_size <= 0:\n                    batch_size = len(iterable) // num_proc + int(len(iterable) % num_proc > 0)\n                iterable = list(iter_batched(iterable, batch_size))\n            mapped = parallel_map(\n                function, iterable, num_proc, batched, batch_size, types, disable_tqdm, desc, _single_map_nested\n            )\n            if batched:\n                mapped = [mapped_item for mapped_batch in mapped for mapped_item in mapped_batch]\n\n    if isinstance(data_struct, dict):\n        return dict(zip(data_struct.keys(), mapped))\n    else:\n        if isinstance(data_struct, list):\n            return mapped\n        elif isinstance(data_struct, tuple):\n            return tuple(mapped)\n        else:\n            return np.array(mapped)\n\n\nclass NestedDataStructure:\n    def __init__(self, data=None):\n        self.data = data if data is not None else []\n\n    def flatten(self, data=None):\n        data = data if data is not None else self.data\n        if isinstance(data, dict):\n            return self.flatten(list(data.values()))\n        elif isinstance(data, (list, tuple)):\n            return [flattened for item in data for flattened in self.flatten(item)]\n        else:\n            return [data]\n\n\ndef has_sufficient_disk_space(needed_bytes, directory=\".\"):\n    try:\n        free_bytes = disk_usage(os.path.abspath(directory)).free\n    except OSError:\n        return True\n    return needed_bytes < free_bytes\n\n\ndef copyfunc(func):\n    result = types.FunctionType(func.__code__, func.__globals__, func.__name__, func.__defaults__, func.__closure__)\n    result.__kwdefaults__ = func.__kwdefaults__\n 
   return result\n\n\nY = TypeVar(\"Y\")\n\n\ndef _write_generator_to_queue(queue: queue.Queue, func: Callable[..., Iterable[Y]], kwargs: dict) -> int:\n    for i, result in enumerate(func(**kwargs)):\n        queue.put(result)\n    return i\n\n\ndef _get_pool_pid(pool: Union[multiprocessing.pool.Pool, multiprocess.pool.Pool]) -> set[int]:\n    return {f.pid for f in pool._pool}\n\n\ndef iflatmap_unordered(\n    pool: Union[multiprocessing.pool.Pool, multiprocess.pool.Pool],\n    func: Callable[..., Iterable[Y]],\n    *,\n    kwargs_iterable: Iterable[dict],\n) -> Iterable[Y]:\n    initial_pool_pid = _get_pool_pid(pool)\n    pool_changed = False\n    with pool._ctx.Manager() as manager:\n        queue = manager.Queue()\n        async_results = [\n            pool.apply_async(_write_generator_to_queue, (queue, func, kwargs)) for kwargs in kwargs_iterable\n        ]\n        try:\n            while True:\n                try:\n                    yield queue.get(timeout=0.05)\n                except Empty:\n                    if all(async_result.ready() for async_result in async_results) and queue.empty():\n                        break\n                if _get_pool_pid(pool) != initial_pool_pid:\n                    pool_changed = True\n                    # One of the subprocesses has died. 
We should not wait forever.\n                    raise RuntimeError(\n                        \"One of the subprocesses has abruptly died during map operation.\"\n                        \"To debug the error, disable multiprocessing.\"\n                    )\n        finally:\n            if not pool_changed:\n                # we get the result in case there's an error to raise\n                [async_result.get(timeout=0.05) for async_result in async_results]\n\n\nT = TypeVar(\"T\")\n\n\ndef iter_batched(iterable: Iterable[T], n: int) -> Iterable[list[T]]:\n    if n < 1:\n        raise ValueError(f\"Invalid batch size {n}\")\n    batch = []\n    for item in iterable:\n        batch.append(item)\n        if len(batch) == n:\n            yield batch\n            batch = []\n    if batch:\n        yield batch\n"
  },
  {
    "path": "src/datasets/utils/resources/__init__.py",
    "content": ""
  },
  {
    "path": "src/datasets/utils/resources/creators.json",
    "content": "{\n  \"language\": [\n    \"found\",\n    \"crowdsourced\",\n    \"expert-generated\",\n    \"machine-generated\",\n    \"other\"\n  ],\n  \"annotations\": [\n    \"found\",\n    \"crowdsourced\",\n    \"expert-generated\",\n    \"machine-generated\",\n    \"no-annotation\",\n    \"other\"\n  ]\n}\n"
  },
  {
    "path": "src/datasets/utils/resources/languages.json",
    "content": "{\n    \"code\": \"Programming language (C++, Java, Javascript, Python, etc.)\",\n    \"aa\": \"Afar\",\n    \"aaa\": \"Ghotuo\",\n    \"aab\": \"Alumu-Tesu\",\n    \"aac\": \"Ari\",\n    \"aad\": \"Amal\",\n    \"aae\": \"Arbëreshë Albanian\",\n    \"aaf\": \"Aranadan\",\n    \"aag\": \"Ambrak\",\n    \"aah\": \"Abu' Arapesh\",\n    \"aai\": \"Arifama-Miniafia\",\n    \"aak\": \"Ankave\",\n    \"aal\": \"Afade\",\n    \"aan\": \"Anambé\",\n    \"aao\": \"Algerian Saharan Arabic\",\n    \"aap\": \"Pará Arára\",\n    \"aaq\": \"Eastern Abnaki\",\n    \"aas\": \"Aasáx\",\n    \"aat\": \"Arvanitika Albanian\",\n    \"aau\": \"Abau\",\n    \"aav\": \"Austro-Asiatic languages\",\n    \"aaw\": \"Solong\",\n    \"aax\": \"Mandobo Atas\",\n    \"aaz\": \"Amarasi\",\n    \"ab\": \"Abkhazian\",\n    \"aba\": \"Abé\",\n    \"abb\": \"Bankon\",\n    \"abc\": \"Ambala Ayta\",\n    \"abd\": \"Manide\",\n    \"abe\": \"Western Abnaki\",\n    \"abf\": \"Abai Sungai\",\n    \"abg\": \"Abaga\",\n    \"abh\": \"Tajiki Arabic\",\n    \"abi\": \"Abidji\",\n    \"abj\": \"Aka-Bea\",\n    \"abl\": \"Lampung Nyo\",\n    \"abm\": \"Abanyom\",\n    \"abn\": \"Abua\",\n    \"abo\": \"Abon\",\n    \"abp\": \"Abellen Ayta\",\n    \"abq\": \"Abaza\",\n    \"abr\": \"Abron\",\n    \"abs\": \"Ambonese Malay\",\n    \"abt\": \"Ambulas\",\n    \"abu\": \"Abure\",\n    \"abv\": \"Baharna Arabic\",\n    \"abw\": \"Pal\",\n    \"abx\": \"Inabaknon\",\n    \"aby\": \"Aneme Wake\",\n    \"abz\": \"Abui\",\n    \"aca\": \"Achagua\",\n    \"acb\": \"Áncá\",\n    \"acd\": \"Gikyode\",\n    \"ace\": \"Achinese\",\n    \"acf\": \"Saint Lucian Creole French\",\n    \"ach\": \"Acoli\",\n    \"aci\": \"Aka-Cari\",\n    \"ack\": \"Aka-Kora\",\n    \"acl\": \"Akar-Bale\",\n    \"acm\": \"Mesopotamian Arabic\",\n    \"acn\": \"Achang\",\n    \"acp\": \"Eastern Acipa\",\n    \"acq\": \"Ta'izzi-Adeni Arabic\",\n    \"acr\": \"Achi\",\n    \"acs\": \"Acroá\",\n    \"act\": \"Achterhoeks\",\n    
\"acu\": \"Achuar-Shiwiar\",\n    \"acv\": \"Achumawi\",\n    \"acw\": \"Hijazi Arabic\",\n    \"acx\": \"Omani Arabic\",\n    \"acy\": \"Cypriot Arabic\",\n    \"acz\": \"Acheron\",\n    \"ada\": \"Adangme\",\n    \"adb\": \"Atauran\",\n    \"add\": \"Lidzonka; Dzodinka\",\n    \"ade\": \"Adele\",\n    \"adf\": \"Dhofari Arabic\",\n    \"adg\": \"Andegerebinha\",\n    \"adh\": \"Adhola\",\n    \"adi\": \"Adi\",\n    \"adj\": \"Adioukrou\",\n    \"adl\": \"Galo\",\n    \"adn\": \"Adang\",\n    \"ado\": \"Abu\",\n    \"adq\": \"Adangbe\",\n    \"adr\": \"Adonara\",\n    \"ads\": \"Adamorobe Sign Language\",\n    \"adt\": \"Adnyamathanha\",\n    \"adu\": \"Aduge\",\n    \"adw\": \"Amundava\",\n    \"adx\": \"Amdo Tibetan\",\n    \"ady\": \"Adyghe; Adygei\",\n    \"adz\": \"Adzera\",\n    \"ae\": \"Avestan\",\n    \"aea\": \"Areba\",\n    \"aeb\": \"Tunisian Arabic\",\n    \"aec\": \"Saidi Arabic\",\n    \"aed\": \"Argentine Sign Language\",\n    \"aee\": \"Northeast Pashai; Northeast Pashayi\",\n    \"aek\": \"Haeke\",\n    \"ael\": \"Ambele\",\n    \"aem\": \"Arem\",\n    \"aen\": \"Armenian Sign Language\",\n    \"aeq\": \"Aer\",\n    \"aer\": \"Eastern Arrernte\",\n    \"aes\": \"Alsea\",\n    \"aeu\": \"Akeu\",\n    \"aew\": \"Ambakich\",\n    \"aey\": \"Amele\",\n    \"aez\": \"Aeka\",\n    \"af\": \"Afrikaans\",\n    \"afa\": \"Afro-Asiatic languages\",\n    \"afb\": \"Gulf Arabic\",\n    \"afd\": \"Andai\",\n    \"afe\": \"Putukwam\",\n    \"afg\": \"Afghan Sign Language\",\n    \"afh\": \"Afrihili\",\n    \"afi\": \"Akrukay; Chini\",\n    \"afk\": \"Nanubae\",\n    \"afn\": \"Defaka\",\n    \"afo\": \"Eloyi\",\n    \"afp\": \"Tapei\",\n    \"afs\": \"Afro-Seminole Creole\",\n    \"aft\": \"Afitti\",\n    \"afu\": \"Awutu\",\n    \"afz\": \"Obokuitai\",\n    \"aga\": \"Aguano\",\n    \"agb\": \"Legbo\",\n    \"agc\": \"Agatu\",\n    \"agd\": \"Agarabi\",\n    \"age\": \"Angal\",\n    \"agf\": \"Arguni\",\n    \"agg\": \"Angor\",\n    \"agh\": \"Ngelima\",\n    
\"agi\": \"Agariya\",\n    \"agj\": \"Argobba\",\n    \"agk\": \"Isarog Agta\",\n    \"agl\": \"Fembe\",\n    \"agm\": \"Angaataha\",\n    \"agn\": \"Agutaynen\",\n    \"ago\": \"Tainae\",\n    \"agq\": \"Aghem\",\n    \"agr\": \"Aguaruna\",\n    \"ags\": \"Esimbi\",\n    \"agt\": \"Central Cagayan Agta\",\n    \"agu\": \"Aguacateco\",\n    \"agv\": \"Remontado Dumagat\",\n    \"agw\": \"Kahua\",\n    \"agx\": \"Aghul\",\n    \"agy\": \"Southern Alta\",\n    \"agz\": \"Mt. Iriga Agta\",\n    \"aha\": \"Ahanta\",\n    \"ahb\": \"Axamb\",\n    \"ahg\": \"Qimant\",\n    \"ahh\": \"Aghu\",\n    \"ahi\": \"Tiagbamrin Aizi\",\n    \"ahk\": \"Akha\",\n    \"ahl\": \"Igo\",\n    \"ahm\": \"Mobumrin Aizi\",\n    \"ahn\": \"Àhàn\",\n    \"aho\": \"Ahom\",\n    \"ahp\": \"Aproumu Aizi\",\n    \"ahr\": \"Ahirani\",\n    \"ahs\": \"Ashe\",\n    \"aht\": \"Ahtena\",\n    \"aia\": \"Arosi\",\n    \"aib\": \"Ainu (China)\",\n    \"aic\": \"Ainbai\",\n    \"aid\": \"Alngith\",\n    \"aie\": \"Amara\",\n    \"aif\": \"Agi\",\n    \"aig\": \"Antigua and Barbuda Creole English\",\n    \"aih\": \"Ai-Cham\",\n    \"aii\": \"Assyrian Neo-Aramaic\",\n    \"aij\": \"Lishanid Noshan\",\n    \"aik\": \"Ake\",\n    \"ail\": \"Aimele\",\n    \"aim\": \"Aimol\",\n    \"ain\": \"Ainu (Japan)\",\n    \"aio\": \"Aiton\",\n    \"aip\": \"Burumakok\",\n    \"aiq\": \"Aimaq\",\n    \"air\": \"Airoran\",\n    \"ait\": \"Arikem\",\n    \"aiw\": \"Aari\",\n    \"aix\": \"Aighon\",\n    \"aiy\": \"Ali\",\n    \"aja\": \"Aja (South Sudan)\",\n    \"ajg\": \"Aja (Benin)\",\n    \"aji\": \"Ajië\",\n    \"ajn\": \"Andajin\",\n    \"ajp\": \"South Levantine Arabic\",\n    \"ajs\": \"Algerian Jewish Sign Language\",\n    \"aju\": \"Judeo-Moroccan Arabic\",\n    \"ajw\": \"Ajawa\",\n    \"ajz\": \"Amri Karbi\",\n    \"ak\": \"Akan\",\n    \"akb\": \"Batak Angkola\",\n    \"akc\": \"Mpur\",\n    \"akd\": \"Ukpet-Ehom\",\n    \"ake\": \"Akawaio\",\n    \"akf\": \"Akpa\",\n    \"akg\": \"Anakalangu\",\n    
\"akh\": \"Angal Heneng\",\n    \"aki\": \"Aiome\",\n    \"akj\": \"Aka-Jeru\",\n    \"akk\": \"Akkadian\",\n    \"akl\": \"Aklanon\",\n    \"akm\": \"Aka-Bo\",\n    \"ako\": \"Akurio\",\n    \"akp\": \"Siwu\",\n    \"akq\": \"Ak\",\n    \"akr\": \"Araki\",\n    \"aks\": \"Akaselem\",\n    \"akt\": \"Akolet\",\n    \"aku\": \"Akum\",\n    \"akv\": \"Akhvakh\",\n    \"akw\": \"Akwa\",\n    \"akx\": \"Aka-Kede\",\n    \"aky\": \"Aka-Kol\",\n    \"akz\": \"Alabama\",\n    \"ala\": \"Alago\",\n    \"alc\": \"Qawasqar\",\n    \"ald\": \"Alladian\",\n    \"ale\": \"Aleut\",\n    \"alf\": \"Alege\",\n    \"alg\": \"Algonquian languages\",\n    \"alh\": \"Alawa\",\n    \"ali\": \"Amaimon\",\n    \"alj\": \"Alangan\",\n    \"alk\": \"Alak\",\n    \"all\": \"Allar\",\n    \"alm\": \"Amblong\",\n    \"aln\": \"Gheg Albanian\",\n    \"alo\": \"Larike-Wakasihu\",\n    \"alp\": \"Alune\",\n    \"alq\": \"Algonquin\",\n    \"alr\": \"Alutor\",\n    \"als\": \"Tosk Albanian\",\n    \"alt\": \"Southern Altai\",\n    \"alu\": \"'Are'are\",\n    \"alv\": \"Atlantic-Congo languages\",\n    \"alw\": \"Alaba-K’abeena; Wanbasana\",\n    \"alx\": \"Amol\",\n    \"aly\": \"Alyawarr\",\n    \"alz\": \"Alur\",\n    \"am\": \"Amharic\",\n    \"ama\": \"Amanayé\",\n    \"amb\": \"Ambo\",\n    \"amc\": \"Amahuaca\",\n    \"ame\": \"Yanesha'\",\n    \"amf\": \"Hamer-Banna\",\n    \"amg\": \"Amurdak\",\n    \"ami\": \"Amis\",\n    \"amj\": \"Amdang\",\n    \"amk\": \"Ambai\",\n    \"aml\": \"War-Jaintia\",\n    \"amm\": \"Ama (Papua New Guinea)\",\n    \"amn\": \"Amanab\",\n    \"amo\": \"Amo\",\n    \"amp\": \"Alamblak\",\n    \"amq\": \"Amahai\",\n    \"amr\": \"Amarakaeri\",\n    \"ams\": \"Southern Amami-Oshima\",\n    \"amt\": \"Amto\",\n    \"amu\": \"Guerrero Amuzgo\",\n    \"amv\": \"Ambelau\",\n    \"amw\": \"Western Neo-Aramaic\",\n    \"amx\": \"Anmatyerre\",\n    \"amy\": \"Ami\",\n    \"amz\": \"Atampaya\",\n    \"an\": \"Aragonese\",\n    \"ana\": \"Andaqui\",\n    \"anb\": 
\"Andoa\",\n    \"anc\": \"Ngas\",\n    \"and\": \"Ansus\",\n    \"ane\": \"Xârâcùù\",\n    \"anf\": \"Animere\",\n    \"ang\": \"Old English (ca. 450-1100)\",\n    \"anh\": \"Nend\",\n    \"ani\": \"Andi\",\n    \"anj\": \"Anor\",\n    \"ank\": \"Goemai\",\n    \"anl\": \"Anu-Hkongso Chin\",\n    \"anm\": \"Anal\",\n    \"ann\": \"Obolo\",\n    \"ano\": \"Andoque\",\n    \"anp\": \"Angika\",\n    \"anq\": \"Jarawa (India)\",\n    \"anr\": \"Andh\",\n    \"ans\": \"Anserma\",\n    \"ant\": \"Antakarinya; Antikarinya\",\n    \"anu\": \"Anuak\",\n    \"anv\": \"Denya\",\n    \"anw\": \"Anaang\",\n    \"anx\": \"Andra-Hus\",\n    \"any\": \"Anyin\",\n    \"anz\": \"Anem\",\n    \"aoa\": \"Angolar\",\n    \"aob\": \"Abom\",\n    \"aoc\": \"Pemon\",\n    \"aod\": \"Andarum\",\n    \"aoe\": \"Angal Enen\",\n    \"aof\": \"Bragat\",\n    \"aog\": \"Angoram\",\n    \"aoi\": \"Anindilyakwa\",\n    \"aoj\": \"Mufian\",\n    \"aok\": \"Arhö\",\n    \"aol\": \"Alor\",\n    \"aom\": \"Ömie\",\n    \"aon\": \"Bumbita Arapesh\",\n    \"aor\": \"Aore\",\n    \"aos\": \"Taikat\",\n    \"aot\": \"Atong (India); A'tong\",\n    \"aou\": \"A'ou\",\n    \"aox\": \"Atorada\",\n    \"aoz\": \"Uab Meto\",\n    \"apa\": \"Apache languages\",\n    \"apb\": \"Sa'a\",\n    \"apc\": \"North Levantine Arabic\",\n    \"apd\": \"Sudanese Arabic\",\n    \"ape\": \"Bukiyip\",\n    \"apf\": \"Pahanan Agta\",\n    \"apg\": \"Ampanang\",\n    \"aph\": \"Athpariya\",\n    \"api\": \"Apiaká\",\n    \"apj\": \"Jicarilla Apache\",\n    \"apk\": \"Kiowa Apache\",\n    \"apl\": \"Lipan Apache\",\n    \"apm\": \"Mescalero-Chiricahua Apache\",\n    \"apn\": \"Apinayé\",\n    \"apo\": \"Ambul\",\n    \"app\": \"Apma\",\n    \"apq\": \"A-Pucikwar\",\n    \"apr\": \"Arop-Lokep\",\n    \"aps\": \"Arop-Sissano\",\n    \"apt\": \"Apatani\",\n    \"apu\": \"Apurinã\",\n    \"apv\": \"Alapmunte\",\n    \"apw\": \"Western Apache\",\n    \"apx\": \"Aputai\",\n    \"apy\": \"Apalaí\",\n    \"apz\": \"Safeyoka\",\n    
\"aqa\": \"Alacalufan languages\",\n    \"aqc\": \"Archi\",\n    \"aqd\": \"Ampari Dogon\",\n    \"aqg\": \"Arigidi\",\n    \"aqk\": \"Aninka\",\n    \"aql\": \"Algic languages\",\n    \"aqm\": \"Atohwaim\",\n    \"aqn\": \"Northern Alta\",\n    \"aqp\": \"Atakapa\",\n    \"aqr\": \"Arhâ\",\n    \"aqt\": \"Angaité\",\n    \"aqz\": \"Akuntsu\",\n    \"ar\": \"Arabic\",\n    \"arb\": \"Standard Arabic\",\n    \"arc\": \"Official Aramaic (700-300 BCE); Imperial Aramaic (700-300 BCE)\",\n    \"ard\": \"Arabana\",\n    \"are\": \"Western Arrarnta\",\n    \"arh\": \"Arhuaco\",\n    \"ari\": \"Arikara\",\n    \"arj\": \"Arapaso\",\n    \"ark\": \"Arikapú\",\n    \"arl\": \"Arabela\",\n    \"arn\": \"Mapudungun; Mapuche\",\n    \"aro\": \"Araona\",\n    \"arp\": \"Arapaho\",\n    \"arq\": \"Algerian Arabic\",\n    \"arr\": \"Karo (Brazil)\",\n    \"ars\": \"Najdi Arabic\",\n    \"art\": \"Artificial languages\",\n    \"aru\": \"Aruá (Amazonas State); Arawá\",\n    \"arv\": \"Arbore\",\n    \"arw\": \"Arawak\",\n    \"arx\": \"Aruá (Rodonia State)\",\n    \"ary\": \"Moroccan Arabic\",\n    \"arz\": \"Egyptian Arabic\",\n    \"as\": \"Assamese\",\n    \"asa\": \"Asu (Tanzania)\",\n    \"asb\": \"Assiniboine\",\n    \"asc\": \"Casuarina Coast Asmat\",\n    \"ase\": \"American Sign Language\",\n    \"asf\": \"Auslan; Australian Sign Language\",\n    \"asg\": \"Cishingini\",\n    \"ash\": \"Abishira\",\n    \"asi\": \"Buruwai\",\n    \"asj\": \"Sari\",\n    \"ask\": \"Ashkun\",\n    \"asl\": \"Asilulu\",\n    \"asn\": \"Xingú Asuriní\",\n    \"aso\": \"Dano\",\n    \"asp\": \"Algerian Sign Language\",\n    \"asq\": \"Austrian Sign Language\",\n    \"asr\": \"Asuri\",\n    \"ass\": \"Ipulo\",\n    \"ast\": \"Asturian; Asturleonese; Bable; Leonese\",\n    \"asu\": \"Tocantins Asurini\",\n    \"asv\": \"Asoa\",\n    \"asw\": \"Australian Aborigines Sign Language\",\n    \"asx\": \"Muratayak\",\n    \"asy\": \"Yaosakor Asmat\",\n    \"asz\": \"As\",\n    \"ata\": \"Pele-Ata\",\n    
\"atb\": \"Zaiwa\",\n    \"atc\": \"Atsahuaca\",\n    \"atd\": \"Ata Manobo\",\n    \"ate\": \"Atemble\",\n    \"atg\": \"Ivbie North-Okpela-Arhe\",\n    \"ath\": \"Athapascan languages\",\n    \"ati\": \"Attié\",\n    \"atj\": \"Atikamekw\",\n    \"atk\": \"Ati\",\n    \"atl\": \"Mt. Iraya Agta\",\n    \"atm\": \"Ata\",\n    \"atn\": \"Ashtiani\",\n    \"ato\": \"Atong (Cameroon)\",\n    \"atp\": \"Pudtol Atta\",\n    \"atq\": \"Aralle-Tabulahan\",\n    \"atr\": \"Waimiri-Atroari\",\n    \"ats\": \"Gros Ventre\",\n    \"att\": \"Pamplona Atta\",\n    \"atu\": \"Reel\",\n    \"atv\": \"Northern Altai\",\n    \"atw\": \"Atsugewi\",\n    \"atx\": \"Arutani\",\n    \"aty\": \"Aneityum\",\n    \"atz\": \"Arta\",\n    \"aua\": \"Asumboa\",\n    \"aub\": \"Alugu\",\n    \"auc\": \"Waorani\",\n    \"aud\": \"Anuta\",\n    \"auf\": \"Arauan languages\",\n    \"aug\": \"Aguna\",\n    \"auh\": \"Aushi\",\n    \"aui\": \"Anuki\",\n    \"auj\": \"Awjilah\",\n    \"auk\": \"Heyo\",\n    \"aul\": \"Aulua\",\n    \"aum\": \"Asu (Nigeria)\",\n    \"aun\": \"Molmo One\",\n    \"auo\": \"Auyokawa\",\n    \"aup\": \"Makayam\",\n    \"auq\": \"Anus; Korur\",\n    \"aur\": \"Aruek\",\n    \"aus\": \"Australian languages\",\n    \"aut\": \"Austral\",\n    \"auu\": \"Auye\",\n    \"auw\": \"Awyi\",\n    \"aux\": \"Aurá\",\n    \"auy\": \"Awiyaana\",\n    \"auz\": \"Uzbeki Arabic\",\n    \"av\": \"Avaric\",\n    \"avb\": \"Avau\",\n    \"avd\": \"Alviri-Vidari\",\n    \"avi\": \"Avikam\",\n    \"avk\": \"Kotava\",\n    \"avl\": \"Eastern Egyptian Bedawi Arabic\",\n    \"avm\": \"Angkamuthi\",\n    \"avn\": \"Avatime\",\n    \"avo\": \"Agavotaguerra\",\n    \"avs\": \"Aushiri\",\n    \"avt\": \"Au\",\n    \"avu\": \"Avokaya\",\n    \"avv\": \"Avá-Canoeiro\",\n    \"awa\": \"Awadhi\",\n    \"awb\": \"Awa (Papua New Guinea)\",\n    \"awc\": \"Cicipu\",\n    \"awd\": \"Arawakan languages\",\n    \"awe\": \"Awetí\",\n    \"awg\": \"Anguthimri\",\n    \"awh\": \"Awbono\",\n    \"awi\": 
\"Aekyom\",\n    \"awk\": \"Awabakal\",\n    \"awm\": \"Arawum\",\n    \"awn\": \"Awngi\",\n    \"awo\": \"Awak\",\n    \"awr\": \"Awera\",\n    \"aws\": \"South Awyu\",\n    \"awt\": \"Araweté\",\n    \"awu\": \"Central Awyu\",\n    \"awv\": \"Jair Awyu\",\n    \"aww\": \"Awun\",\n    \"awx\": \"Awara\",\n    \"awy\": \"Edera Awyu\",\n    \"axb\": \"Abipon\",\n    \"axe\": \"Ayerrerenge\",\n    \"axg\": \"Mato Grosso Arára\",\n    \"axk\": \"Yaka (Central African Republic)\",\n    \"axl\": \"Lower Southern Aranda\",\n    \"axm\": \"Middle Armenian\",\n    \"axx\": \"Xârâgurè\",\n    \"ay\": \"Aymara\",\n    \"aya\": \"Awar\",\n    \"ayb\": \"Ayizo Gbe\",\n    \"ayc\": \"Southern Aymara\",\n    \"ayd\": \"Ayabadhu\",\n    \"aye\": \"Ayere\",\n    \"ayg\": \"Ginyanga\",\n    \"ayh\": \"Hadrami Arabic\",\n    \"ayi\": \"Leyigha\",\n    \"ayk\": \"Akuku\",\n    \"ayl\": \"Libyan Arabic\",\n    \"ayn\": \"Sanaani Arabic\",\n    \"ayo\": \"Ayoreo\",\n    \"ayp\": \"North Mesopotamian Arabic\",\n    \"ayq\": \"Ayi (Papua New Guinea)\",\n    \"ayr\": \"Central Aymara\",\n    \"ays\": \"Sorsogon Ayta\",\n    \"ayt\": \"Magbukun Ayta\",\n    \"ayu\": \"Ayu\",\n    \"ayz\": \"Mai Brat\",\n    \"az\": \"Azerbaijani\",\n    \"aza\": \"Azha\",\n    \"azb\": \"South Azerbaijani\",\n    \"azc\": \"Uto-Aztecan languages\",\n    \"azd\": \"Eastern Durango Nahuatl\",\n    \"azg\": \"San Pedro Amuzgos Amuzgo\",\n    \"azj\": \"North Azerbaijani\",\n    \"azm\": \"Ipalapa Amuzgo\",\n    \"azn\": \"Western Durango Nahuatl\",\n    \"azo\": \"Awing\",\n    \"azt\": \"Faire Atta\",\n    \"azz\": \"Highland Puebla Nahuatl\",\n    \"ba\": \"Bashkir\",\n    \"baa\": \"Babatana\",\n    \"bab\": \"Bainouk-Gunyuño\",\n    \"bac\": \"Badui\",\n    \"bad\": \"Banda languages\",\n    \"bae\": \"Baré\",\n    \"baf\": \"Nubaca\",\n    \"bag\": \"Tuki\",\n    \"bah\": \"Bahamas Creole English\",\n    \"bai\": \"Bamileke languages\",\n    \"baj\": \"Barakai\",\n    \"bal\": \"Baluchi\",\n    \"ban\": 
\"Balinese\",\n    \"bao\": \"Waimaha\",\n    \"bap\": \"Bantawa\",\n    \"bar\": \"Bavarian\",\n    \"bas\": \"Basa (Cameroon)\",\n    \"bat\": \"Baltic languages\",\n    \"bau\": \"Bada (Nigeria)\",\n    \"bav\": \"Vengo\",\n    \"baw\": \"Bambili-Bambui\",\n    \"bax\": \"Bamun\",\n    \"bay\": \"Batuley\",\n    \"bba\": \"Baatonum\",\n    \"bbb\": \"Barai\",\n    \"bbc\": \"Batak Toba\",\n    \"bbd\": \"Bau\",\n    \"bbe\": \"Bangba\",\n    \"bbf\": \"Baibai\",\n    \"bbg\": \"Barama\",\n    \"bbh\": \"Bugan\",\n    \"bbi\": \"Barombi\",\n    \"bbj\": \"Ghomálá'\",\n    \"bbk\": \"Babanki\",\n    \"bbl\": \"Bats\",\n    \"bbm\": \"Babango\",\n    \"bbn\": \"Uneapa\",\n    \"bbo\": \"Northern Bobo Madaré; Konabéré\",\n    \"bbp\": \"West Central Banda\",\n    \"bbq\": \"Bamali\",\n    \"bbr\": \"Girawa\",\n    \"bbs\": \"Bakpinka\",\n    \"bbt\": \"Mburku\",\n    \"bbu\": \"Kulung (Nigeria)\",\n    \"bbv\": \"Karnai\",\n    \"bbw\": \"Baba\",\n    \"bbx\": \"Bubia\",\n    \"bby\": \"Befang\",\n    \"bca\": \"Central Bai\",\n    \"bcb\": \"Bainouk-Samik\",\n    \"bcc\": \"Southern Balochi\",\n    \"bcd\": \"North Babar\",\n    \"bce\": \"Bamenyam\",\n    \"bcf\": \"Bamu\",\n    \"bcg\": \"Baga Pokur\",\n    \"bch\": \"Bariai\",\n    \"bci\": \"Baoulé\",\n    \"bcj\": \"Bardi\",\n    \"bck\": \"Bunuba\",\n    \"bcl\": \"Central Bikol\",\n    \"bcm\": \"Bannoni\",\n    \"bcn\": \"Bali (Nigeria)\",\n    \"bco\": \"Kaluli\",\n    \"bcp\": \"Bali (Democratic Republic of Congo)\",\n    \"bcq\": \"Bench\",\n    \"bcr\": \"Babine\",\n    \"bcs\": \"Kohumono\",\n    \"bct\": \"Bendi\",\n    \"bcu\": \"Awad Bing\",\n    \"bcv\": \"Shoo-Minda-Nye\",\n    \"bcw\": \"Bana\",\n    \"bcy\": \"Bacama\",\n    \"bcz\": \"Bainouk-Gunyaamolo\",\n    \"bda\": \"Bayot\",\n    \"bdb\": \"Basap\",\n    \"bdc\": \"Emberá-Baudó\",\n    \"bdd\": \"Bunama\",\n    \"bde\": \"Bade\",\n    \"bdf\": \"Biage\",\n    \"bdg\": \"Bonggi\",\n    \"bdh\": \"Baka (South Sudan)\",\n    \"bdi\": 
\"Burun\",\n    \"bdj\": \"Bai (South Sudan); Bai\",\n    \"bdk\": \"Budukh\",\n    \"bdl\": \"Indonesian Bajau\",\n    \"bdm\": \"Buduma\",\n    \"bdn\": \"Baldemu\",\n    \"bdo\": \"Morom\",\n    \"bdp\": \"Bende\",\n    \"bdq\": \"Bahnar\",\n    \"bdr\": \"West Coast Bajau\",\n    \"bds\": \"Burunge\",\n    \"bdt\": \"Bokoto\",\n    \"bdu\": \"Oroko\",\n    \"bdv\": \"Bodo Parja\",\n    \"bdw\": \"Baham\",\n    \"bdx\": \"Budong-Budong\",\n    \"bdy\": \"Bandjalang\",\n    \"bdz\": \"Badeshi\",\n    \"be\": \"Belarusian\",\n    \"bea\": \"Beaver\",\n    \"beb\": \"Bebele\",\n    \"bec\": \"Iceve-Maci\",\n    \"bed\": \"Bedoanas\",\n    \"bee\": \"Byangsi\",\n    \"bef\": \"Benabena\",\n    \"beg\": \"Belait\",\n    \"beh\": \"Biali\",\n    \"bei\": \"Bekati'\",\n    \"bej\": \"Beja; Bedawiyet\",\n    \"bek\": \"Bebeli\",\n    \"bem\": \"Bemba (Zambia)\",\n    \"beo\": \"Beami\",\n    \"bep\": \"Besoa\",\n    \"beq\": \"Beembe\",\n    \"ber\": \"Berber languages\",\n    \"bes\": \"Besme\",\n    \"bet\": \"Guiberoua Béte\",\n    \"beu\": \"Blagar\",\n    \"bev\": \"Daloa Bété\",\n    \"bew\": \"Betawi\",\n    \"bex\": \"Jur Modo\",\n    \"bey\": \"Beli (Papua New Guinea)\",\n    \"bez\": \"Bena (Tanzania)\",\n    \"bfa\": \"Bari\",\n    \"bfb\": \"Pauri Bareli\",\n    \"bfc\": \"Panyi Bai; Northern Bai\",\n    \"bfd\": \"Bafut\",\n    \"bfe\": \"Betaf; Tena\",\n    \"bff\": \"Bofi\",\n    \"bfg\": \"Busang Kayan\",\n    \"bfh\": \"Blafe\",\n    \"bfi\": \"British Sign Language\",\n    \"bfj\": \"Bafanji\",\n    \"bfk\": \"Ban Khor Sign Language\",\n    \"bfl\": \"Banda-Ndélé\",\n    \"bfm\": \"Mmen\",\n    \"bfn\": \"Bunak\",\n    \"bfo\": \"Malba Birifor\",\n    \"bfp\": \"Beba\",\n    \"bfq\": \"Badaga\",\n    \"bfr\": \"Bazigar\",\n    \"bfs\": \"Southern Bai\",\n    \"bft\": \"Balti\",\n    \"bfu\": \"Gahri\",\n    \"bfw\": \"Bondo\",\n    \"bfx\": \"Bantayanon\",\n    \"bfy\": \"Bagheli\",\n    \"bfz\": \"Mahasu Pahari\",\n    \"bg\": \"Bulgarian\",\n    
\"bga\": \"Gwamhi-Wuri\",\n    \"bgb\": \"Bobongko\",\n    \"bgc\": \"Haryanvi\",\n    \"bgd\": \"Rathwi Bareli\",\n    \"bge\": \"Bauria\",\n    \"bgf\": \"Bangandu\",\n    \"bgg\": \"Bugun\",\n    \"bgi\": \"Giangan\",\n    \"bgj\": \"Bangolan\",\n    \"bgk\": \"Bit; Buxinhua\",\n    \"bgl\": \"Bo (Laos)\",\n    \"bgn\": \"Western Balochi\",\n    \"bgo\": \"Baga Koga\",\n    \"bgp\": \"Eastern Balochi\",\n    \"bgq\": \"Bagri\",\n    \"bgr\": \"Bawm Chin\",\n    \"bgs\": \"Tagabawa\",\n    \"bgt\": \"Bughotu\",\n    \"bgu\": \"Mbongno\",\n    \"bgv\": \"Warkay-Bipim\",\n    \"bgw\": \"Bhatri\",\n    \"bgx\": \"Balkan Gagauz Turkish\",\n    \"bgy\": \"Benggoi\",\n    \"bgz\": \"Banggai\",\n    \"bh\": \"Bihari languages\",\n    \"bha\": \"Bharia\",\n    \"bhb\": \"Bhili\",\n    \"bhc\": \"Biga\",\n    \"bhd\": \"Bhadrawahi\",\n    \"bhe\": \"Bhaya\",\n    \"bhf\": \"Odiai\",\n    \"bhg\": \"Binandere\",\n    \"bhh\": \"Bukharic\",\n    \"bhi\": \"Bhilali\",\n    \"bhj\": \"Bahing\",\n    \"bhl\": \"Bimin\",\n    \"bhm\": \"Bathari\",\n    \"bhn\": \"Bohtan Neo-Aramaic\",\n    \"bho\": \"Bhojpuri\",\n    \"bhp\": \"Bima\",\n    \"bhq\": \"Tukang Besi South\",\n    \"bhr\": \"Bara Malagasy\",\n    \"bhs\": \"Buwal\",\n    \"bht\": \"Bhattiyali\",\n    \"bhu\": \"Bhunjia\",\n    \"bhv\": \"Bahau\",\n    \"bhw\": \"Biak\",\n    \"bhx\": \"Bhalay\",\n    \"bhy\": \"Bhele\",\n    \"bhz\": \"Bada (Indonesia)\",\n    \"bi\": \"Bislama\",\n    \"bia\": \"Badimaya\",\n    \"bib\": \"Bissa; Bisa\",\n    \"bid\": \"Bidiyo\",\n    \"bie\": \"Bepour\",\n    \"bif\": \"Biafada\",\n    \"big\": \"Biangai\",\n    \"bik\": \"Bikol\",\n    \"bil\": \"Bile\",\n    \"bim\": \"Bimoba\",\n    \"bin\": \"Bini; Edo\",\n    \"bio\": \"Nai\",\n    \"bip\": \"Bila\",\n    \"biq\": \"Bipi\",\n    \"bir\": \"Bisorio\",\n    \"bit\": \"Berinomo\",\n    \"biu\": \"Biete\",\n    \"biv\": \"Southern Birifor\",\n    \"biw\": \"Kol (Cameroon)\",\n    \"bix\": \"Bijori\",\n    \"biy\": \"Birhor\",\n  
  \"biz\": \"Baloi\",\n    \"bja\": \"Budza\",\n    \"bjb\": \"Banggarla\",\n    \"bjc\": \"Bariji\",\n    \"bje\": \"Biao-Jiao Mien\",\n    \"bjf\": \"Barzani Jewish Neo-Aramaic\",\n    \"bjg\": \"Bidyogo\",\n    \"bjh\": \"Bahinemo\",\n    \"bji\": \"Burji\",\n    \"bjj\": \"Kanauji\",\n    \"bjk\": \"Barok\",\n    \"bjl\": \"Bulu (Papua New Guinea)\",\n    \"bjm\": \"Bajelani\",\n    \"bjn\": \"Banjar\",\n    \"bjo\": \"Mid-Southern Banda\",\n    \"bjp\": \"Fanamaket\",\n    \"bjr\": \"Binumarien\",\n    \"bjs\": \"Bajan\",\n    \"bjt\": \"Balanta-Ganja\",\n    \"bju\": \"Busuu\",\n    \"bjv\": \"Bedjond\",\n    \"bjw\": \"Bakwé\",\n    \"bjx\": \"Banao Itneg\",\n    \"bjy\": \"Bayali\",\n    \"bjz\": \"Baruga\",\n    \"bka\": \"Kyak\",\n    \"bkc\": \"Baka (Cameroon)\",\n    \"bkd\": \"Binukid; Talaandig\",\n    \"bkf\": \"Beeke\",\n    \"bkg\": \"Buraka\",\n    \"bkh\": \"Bakoko\",\n    \"bki\": \"Baki\",\n    \"bkj\": \"Pande\",\n    \"bkk\": \"Brokskat\",\n    \"bkl\": \"Berik\",\n    \"bkm\": \"Kom (Cameroon)\",\n    \"bkn\": \"Bukitan\",\n    \"bko\": \"Kwa'\",\n    \"bkp\": \"Boko (Democratic Republic of Congo)\",\n    \"bkq\": \"Bakairí\",\n    \"bkr\": \"Bakumpai\",\n    \"bks\": \"Northern Sorsoganon\",\n    \"bkt\": \"Boloki\",\n    \"bku\": \"Buhid\",\n    \"bkv\": \"Bekwarra\",\n    \"bkw\": \"Bekwel\",\n    \"bkx\": \"Baikeno\",\n    \"bky\": \"Bokyi\",\n    \"bkz\": \"Bungku\",\n    \"bla\": \"Siksika\",\n    \"blb\": \"Bilua\",\n    \"blc\": \"Bella Coola\",\n    \"bld\": \"Bolango\",\n    \"ble\": \"Balanta-Kentohe\",\n    \"blf\": \"Buol\",\n    \"blh\": \"Kuwaa\",\n    \"bli\": \"Bolia\",\n    \"blj\": \"Bolongan\",\n    \"blk\": \"Pa'o Karen; Pa'O\",\n    \"bll\": \"Biloxi\",\n    \"blm\": \"Beli (South Sudan)\",\n    \"bln\": \"Southern Catanduanes Bikol\",\n    \"blo\": \"Anii\",\n    \"blp\": \"Blablanga\",\n    \"blq\": \"Baluan-Pam\",\n    \"blr\": \"Blang\",\n    \"bls\": \"Balaesang\",\n    \"blt\": \"Tai Dam\",\n    \"blv\": \"Kibala; 
Bolo\",\n    \"blw\": \"Balangao\",\n    \"blx\": \"Mag-Indi Ayta\",\n    \"bly\": \"Notre\",\n    \"blz\": \"Balantak\",\n    \"bm\": \"Bambara\",\n    \"bma\": \"Lame\",\n    \"bmb\": \"Bembe\",\n    \"bmc\": \"Biem\",\n    \"bmd\": \"Baga Manduri\",\n    \"bme\": \"Limassa\",\n    \"bmf\": \"Bom-Kim\",\n    \"bmg\": \"Bamwe\",\n    \"bmh\": \"Kein\",\n    \"bmi\": \"Bagirmi\",\n    \"bmj\": \"Bote-Majhi\",\n    \"bmk\": \"Ghayavi\",\n    \"bml\": \"Bomboli\",\n    \"bmm\": \"Northern Betsimisaraka Malagasy\",\n    \"bmn\": \"Bina (Papua New Guinea)\",\n    \"bmo\": \"Bambalang\",\n    \"bmp\": \"Bulgebi\",\n    \"bmq\": \"Bomu\",\n    \"bmr\": \"Muinane\",\n    \"bms\": \"Bilma Kanuri\",\n    \"bmt\": \"Biao Mon\",\n    \"bmu\": \"Somba-Siawari\",\n    \"bmv\": \"Bum\",\n    \"bmw\": \"Bomwali\",\n    \"bmx\": \"Baimak\",\n    \"bmz\": \"Baramu\",\n    \"bn\": \"Bengali; Bangla\",\n    \"bna\": \"Bonerate\",\n    \"bnb\": \"Bookan\",\n    \"bnc\": \"Bontok\",\n    \"bnd\": \"Banda (Indonesia)\",\n    \"bne\": \"Bintauna\",\n    \"bnf\": \"Masiwang\",\n    \"bng\": \"Benga\",\n    \"bni\": \"Bangi\",\n    \"bnj\": \"Eastern Tawbuid\",\n    \"bnk\": \"Bierebo\",\n    \"bnl\": \"Boon\",\n    \"bnm\": \"Batanga\",\n    \"bnn\": \"Bunun\",\n    \"bno\": \"Bantoanon\",\n    \"bnp\": \"Bola\",\n    \"bnq\": \"Bantik\",\n    \"bnr\": \"Butmas-Tur\",\n    \"bns\": \"Bundeli\",\n    \"bnt\": \"Bantu languages\",\n    \"bnu\": \"Bentong\",\n    \"bnv\": \"Bonerif; Beneraf; Edwas\",\n    \"bnw\": \"Bisis\",\n    \"bnx\": \"Bangubangu\",\n    \"bny\": \"Bintulu\",\n    \"bnz\": \"Beezen\",\n    \"bo\": \"Tibetan\",\n    \"boa\": \"Bora\",\n    \"bob\": \"Aweer\",\n    \"boe\": \"Mundabli\",\n    \"bof\": \"Bolon\",\n    \"bog\": \"Bamako Sign Language\",\n    \"boh\": \"Boma\",\n    \"boi\": \"Barbareño\",\n    \"boj\": \"Anjam\",\n    \"bok\": \"Bonjo\",\n    \"bol\": \"Bole\",\n    \"bom\": \"Berom\",\n    \"bon\": \"Bine\",\n    \"boo\": \"Tiemacèwè Bozo\",\n    \"bop\": 
\"Bonkiman\",\n    \"boq\": \"Bogaya\",\n    \"bor\": \"Borôro\",\n    \"bot\": \"Bongo\",\n    \"bou\": \"Bondei\",\n    \"bov\": \"Tuwuli\",\n    \"bow\": \"Rema\",\n    \"box\": \"Buamu\",\n    \"boy\": \"Bodo (Central African Republic)\",\n    \"boz\": \"Tiéyaxo Bozo\",\n    \"bpa\": \"Daakaka\",\n    \"bpc\": \"Mbuk\",\n    \"bpd\": \"Banda-Banda\",\n    \"bpe\": \"Bauni\",\n    \"bpg\": \"Bonggo\",\n    \"bph\": \"Botlikh\",\n    \"bpi\": \"Bagupi\",\n    \"bpj\": \"Binji\",\n    \"bpk\": \"Orowe; 'Ôrôê\",\n    \"bpl\": \"Broome Pearling Lugger Pidgin\",\n    \"bpm\": \"Biyom\",\n    \"bpn\": \"Dzao Min\",\n    \"bpo\": \"Anasi\",\n    \"bpp\": \"Kaure\",\n    \"bpq\": \"Banda Malay\",\n    \"bpr\": \"Koronadal Blaan\",\n    \"bps\": \"Sarangani Blaan\",\n    \"bpt\": \"Barrow Point\",\n    \"bpu\": \"Bongu\",\n    \"bpv\": \"Bian Marind\",\n    \"bpw\": \"Bo (Papua New Guinea)\",\n    \"bpx\": \"Palya Bareli\",\n    \"bpy\": \"Bishnupriya\",\n    \"bpz\": \"Bilba\",\n    \"bqa\": \"Tchumbuli\",\n    \"bqb\": \"Bagusa\",\n    \"bqc\": \"Boko (Benin); Boo\",\n    \"bqd\": \"Bung\",\n    \"bqf\": \"Baga Kaloum\",\n    \"bqg\": \"Bago-Kusuntu\",\n    \"bqh\": \"Baima\",\n    \"bqi\": \"Bakhtiari\",\n    \"bqj\": \"Bandial\",\n    \"bqk\": \"Banda-Mbrès\",\n    \"bql\": \"Bilakura\",\n    \"bqm\": \"Wumboko\",\n    \"bqn\": \"Bulgarian Sign Language\",\n    \"bqo\": \"Balo\",\n    \"bqp\": \"Busa\",\n    \"bqq\": \"Biritai\",\n    \"bqr\": \"Burusu\",\n    \"bqs\": \"Bosngun\",\n    \"bqt\": \"Bamukumbit\",\n    \"bqu\": \"Boguru\",\n    \"bqv\": \"Koro Wachi; Begbere-Ejar\",\n    \"bqw\": \"Buru (Nigeria)\",\n    \"bqx\": \"Baangi\",\n    \"bqy\": \"Bengkala Sign Language\",\n    \"bqz\": \"Bakaka\",\n    \"br\": \"Breton\",\n    \"bra\": \"Braj\",\n    \"brb\": \"Brao; Lave\",\n    \"brc\": \"Berbice Creole Dutch\",\n    \"brd\": \"Baraamu\",\n    \"brf\": \"Bira\",\n    \"brg\": \"Baure\",\n    \"brh\": \"Brahui\",\n    \"bri\": \"Mokpwe\",\n    \"brj\": 
\"Bieria\",\n    \"brk\": \"Birked\",\n    \"brl\": \"Birwa\",\n    \"brm\": \"Barambu\",\n    \"brn\": \"Boruca\",\n    \"bro\": \"Brokkat\",\n    \"brp\": \"Barapasi\",\n    \"brq\": \"Breri\",\n    \"brr\": \"Birao\",\n    \"brs\": \"Baras\",\n    \"brt\": \"Bitare\",\n    \"bru\": \"Eastern Bru\",\n    \"brv\": \"Western Bru\",\n    \"brw\": \"Bellari\",\n    \"brx\": \"Bodo (India)\",\n    \"bry\": \"Burui\",\n    \"brz\": \"Bilbil\",\n    \"bs\": \"Bosnian\",\n    \"bsa\": \"Abinomn\",\n    \"bsb\": \"Brunei Bisaya\",\n    \"bsc\": \"Bassari; Oniyan\",\n    \"bse\": \"Wushi\",\n    \"bsf\": \"Bauchi\",\n    \"bsg\": \"Bashkardi\",\n    \"bsh\": \"Kati\",\n    \"bsi\": \"Bassossi\",\n    \"bsj\": \"Bangwinji\",\n    \"bsk\": \"Burushaski\",\n    \"bsl\": \"Basa-Gumna\",\n    \"bsm\": \"Busami\",\n    \"bsn\": \"Barasana-Eduria\",\n    \"bso\": \"Buso\",\n    \"bsp\": \"Baga Sitemu\",\n    \"bsq\": \"Bassa\",\n    \"bsr\": \"Bassa-Kontagora\",\n    \"bss\": \"Akoose\",\n    \"bst\": \"Basketo\",\n    \"bsu\": \"Bahonsuai\",\n    \"bsv\": \"Baga Sobané\",\n    \"bsw\": \"Baiso\",\n    \"bsx\": \"Yangkam\",\n    \"bsy\": \"Sabah Bisaya\",\n    \"bta\": \"Bata\",\n    \"btc\": \"Bati (Cameroon)\",\n    \"btd\": \"Batak Dairi\",\n    \"bte\": \"Gamo-Ningi\",\n    \"btf\": \"Birgit\",\n    \"btg\": \"Gagnoa Bété\",\n    \"bth\": \"Biatah Bidayuh\",\n    \"bti\": \"Burate\",\n    \"btj\": \"Bacanese Malay\",\n    \"btk\": \"Batak languages\",\n    \"btm\": \"Batak Mandailing\",\n    \"btn\": \"Ratagnon\",\n    \"bto\": \"Rinconada Bikol\",\n    \"btp\": \"Budibud\",\n    \"btq\": \"Batek\",\n    \"btr\": \"Baetora\",\n    \"bts\": \"Batak Simalungun\",\n    \"btt\": \"Bete-Bendi\",\n    \"btu\": \"Batu\",\n    \"btv\": \"Bateri\",\n    \"btw\": \"Butuanon\",\n    \"btx\": \"Batak Karo\",\n    \"bty\": \"Bobot\",\n    \"btz\": \"Batak Alas-Kluet\",\n    \"bua\": \"Buriat\",\n    \"bub\": \"Bua\",\n    \"buc\": \"Bushi\",\n    \"bud\": \"Ntcham\",\n    \"bue\": 
\"Beothuk\",\n    \"buf\": \"Bushoong\",\n    \"bug\": \"Buginese\",\n    \"buh\": \"Younuo Bunu\",\n    \"bui\": \"Bongili\",\n    \"buj\": \"Basa-Gurmana\",\n    \"buk\": \"Bugawac\",\n    \"bum\": \"Bulu (Cameroon)\",\n    \"bun\": \"Sherbro\",\n    \"buo\": \"Terei\",\n    \"bup\": \"Busoa\",\n    \"buq\": \"Brem\",\n    \"bus\": \"Bokobaru\",\n    \"but\": \"Bungain\",\n    \"buu\": \"Budu\",\n    \"buv\": \"Bun\",\n    \"buw\": \"Bubi\",\n    \"bux\": \"Boghom\",\n    \"buy\": \"Bullom So\",\n    \"buz\": \"Bukwen\",\n    \"bva\": \"Barein\",\n    \"bvb\": \"Bube\",\n    \"bvc\": \"Baelelea\",\n    \"bvd\": \"Baeggu\",\n    \"bve\": \"Berau Malay\",\n    \"bvf\": \"Boor\",\n    \"bvg\": \"Bonkeng\",\n    \"bvh\": \"Bure\",\n    \"bvi\": \"Belanda Viri\",\n    \"bvj\": \"Baan\",\n    \"bvk\": \"Bukat\",\n    \"bvl\": \"Bolivian Sign Language\",\n    \"bvm\": \"Bamunka\",\n    \"bvn\": \"Buna\",\n    \"bvo\": \"Bolgo\",\n    \"bvp\": \"Bumang\",\n    \"bvq\": \"Birri\",\n    \"bvr\": \"Burarra\",\n    \"bvt\": \"Bati (Indonesia)\",\n    \"bvu\": \"Bukit Malay\",\n    \"bvv\": \"Baniva\",\n    \"bvw\": \"Boga\",\n    \"bvx\": \"Dibole\",\n    \"bvy\": \"Baybayanon\",\n    \"bvz\": \"Bauzi\",\n    \"bwa\": \"Bwatoo\",\n    \"bwb\": \"Namosi-Naitasiri-Serua\",\n    \"bwc\": \"Bwile\",\n    \"bwd\": \"Bwaidoka\",\n    \"bwe\": \"Bwe Karen\",\n    \"bwf\": \"Boselewa\",\n    \"bwg\": \"Barwe\",\n    \"bwh\": \"Bishuo\",\n    \"bwi\": \"Baniwa\",\n    \"bwj\": \"Láá Láá Bwamu\",\n    \"bwk\": \"Bauwaki\",\n    \"bwl\": \"Bwela\",\n    \"bwm\": \"Biwat\",\n    \"bwn\": \"Wunai Bunu\",\n    \"bwo\": \"Boro (Ethiopia); Borna (Ethiopia)\",\n    \"bwp\": \"Mandobo Bawah\",\n    \"bwq\": \"Southern Bobo Madaré\",\n    \"bwr\": \"Bura-Pabir\",\n    \"bws\": \"Bomboma\",\n    \"bwt\": \"Bafaw-Balong\",\n    \"bwu\": \"Buli (Ghana)\",\n    \"bww\": \"Bwa\",\n    \"bwx\": \"Bu-Nao Bunu\",\n    \"bwy\": \"Cwi Bwamu\",\n    \"bwz\": \"Bwisi\",\n    \"bxa\": \"Tairaha\",\n    
\"bxb\": \"Belanda Bor\",\n    \"bxc\": \"Molengue\",\n    \"bxd\": \"Pela\",\n    \"bxe\": \"Birale\",\n    \"bxf\": \"Bilur; Minigir\",\n    \"bxg\": \"Bangala\",\n    \"bxh\": \"Buhutu\",\n    \"bxi\": \"Pirlatapa\",\n    \"bxj\": \"Bayungu\",\n    \"bxk\": \"Bukusu; Lubukusu\",\n    \"bxl\": \"Jalkunan\",\n    \"bxm\": \"Mongolia Buriat\",\n    \"bxn\": \"Burduna\",\n    \"bxo\": \"Barikanchi\",\n    \"bxp\": \"Bebil\",\n    \"bxq\": \"Beele\",\n    \"bxr\": \"Russia Buriat\",\n    \"bxs\": \"Busam\",\n    \"bxu\": \"China Buriat\",\n    \"bxv\": \"Berakou\",\n    \"bxw\": \"Bankagooma\",\n    \"bxz\": \"Binahari\",\n    \"bya\": \"Batak\",\n    \"byb\": \"Bikya\",\n    \"byc\": \"Ubaghara\",\n    \"byd\": \"Benyadu'\",\n    \"bye\": \"Pouye\",\n    \"byf\": \"Bete\",\n    \"byg\": \"Baygo\",\n    \"byh\": \"Bhujel\",\n    \"byi\": \"Buyu\",\n    \"byj\": \"Bina (Nigeria)\",\n    \"byk\": \"Biao\",\n    \"byl\": \"Bayono\",\n    \"bym\": \"Bidjara\",\n    \"byn\": \"Bilin; Blin\",\n    \"byo\": \"Biyo\",\n    \"byp\": \"Bumaji\",\n    \"byq\": \"Basay\",\n    \"byr\": \"Baruya; Yipma\",\n    \"bys\": \"Burak\",\n    \"byt\": \"Berti\",\n    \"byv\": \"Medumba\",\n    \"byw\": \"Belhariya\",\n    \"byx\": \"Qaqet\",\n    \"byz\": \"Banaro\",\n    \"bza\": \"Bandi\",\n    \"bzb\": \"Andio\",\n    \"bzc\": \"Southern Betsimisaraka Malagasy\",\n    \"bzd\": \"Bribri\",\n    \"bze\": \"Jenaama Bozo\",\n    \"bzf\": \"Boikin\",\n    \"bzg\": \"Babuza\",\n    \"bzh\": \"Mapos Buang\",\n    \"bzi\": \"Bisu\",\n    \"bzj\": \"Belize Kriol English\",\n    \"bzk\": \"Nicaragua Creole English\",\n    \"bzl\": \"Boano (Sulawesi)\",\n    \"bzm\": \"Bolondo\",\n    \"bzn\": \"Boano (Maluku)\",\n    \"bzo\": \"Bozaba\",\n    \"bzp\": \"Kemberano\",\n    \"bzq\": \"Buli (Indonesia)\",\n    \"bzr\": \"Biri\",\n    \"bzs\": \"Brazilian Sign Language\",\n    \"bzt\": \"Brithenig\",\n    \"bzu\": \"Burmeso\",\n    \"bzv\": \"Naami\",\n    \"bzw\": \"Basa (Nigeria)\",\n    \"bzx\": 
\"Kɛlɛngaxo Bozo\",\n    \"bzy\": \"Obanliku\",\n    \"bzz\": \"Evant\",\n    \"ca\": \"Catalan; Valencian\",\n    \"caa\": \"Chortí\",\n    \"cab\": \"Garifuna\",\n    \"cac\": \"Chuj\",\n    \"cad\": \"Caddo\",\n    \"cae\": \"Lehar; Laalaa\",\n    \"caf\": \"Southern Carrier\",\n    \"cag\": \"Nivaclé\",\n    \"cah\": \"Cahuarano\",\n    \"cai\": \"Central American Indian languages\",\n    \"caj\": \"Chané\",\n    \"cak\": \"Kaqchikel; Cakchiquel\",\n    \"cal\": \"Carolinian\",\n    \"cam\": \"Cemuhî\",\n    \"can\": \"Chambri\",\n    \"cao\": \"Chácobo\",\n    \"cap\": \"Chipaya\",\n    \"caq\": \"Car Nicobarese\",\n    \"car\": \"Galibi Carib\",\n    \"cas\": \"Tsimané\",\n    \"cau\": \"Caucasian languages\",\n    \"cav\": \"Cavineña\",\n    \"caw\": \"Callawalla\",\n    \"cax\": \"Chiquitano\",\n    \"cay\": \"Cayuga\",\n    \"caz\": \"Canichana\",\n    \"cba\": \"Chibchan languages\",\n    \"cbb\": \"Cabiyarí\",\n    \"cbc\": \"Carapana\",\n    \"cbd\": \"Carijona\",\n    \"cbg\": \"Chimila\",\n    \"cbi\": \"Chachi\",\n    \"cbj\": \"Ede Cabe\",\n    \"cbk\": \"Chavacano\",\n    \"cbl\": \"Bualkhaw Chin\",\n    \"cbn\": \"Nyahkur\",\n    \"cbo\": \"Izora\",\n    \"cbq\": \"Tsucuba; Cuba\",\n    \"cbr\": \"Cashibo-Cacataibo\",\n    \"cbs\": \"Cashinahua\",\n    \"cbt\": \"Chayahuita\",\n    \"cbu\": \"Candoshi-Shapra\",\n    \"cbv\": \"Cacua\",\n    \"cbw\": \"Kinabalian\",\n    \"cby\": \"Carabayo\",\n    \"ccc\": \"Chamicuro\",\n    \"ccd\": \"Cafundo Creole\",\n    \"cce\": \"Chopi\",\n    \"ccg\": \"Samba Daka\",\n    \"cch\": \"Atsam\",\n    \"ccj\": \"Kasanga\",\n    \"ccl\": \"Cutchi-Swahili\",\n    \"ccm\": \"Malaccan Creole Malay\",\n    \"ccn\": \"North Caucasian languages\",\n    \"cco\": \"Comaltepec Chinantec\",\n    \"ccp\": \"Chakma\",\n    \"ccr\": \"Cacaopera\",\n    \"ccs\": \"South Caucasian languages\",\n    \"cda\": \"Choni\",\n    \"cdc\": \"Chadic languages\",\n    \"cdd\": \"Caddoan languages\",\n    \"cde\": \"Chenchu\",\n    
\"cdf\": \"Chiru\",\n    \"cdh\": \"Chambeali\",\n    \"cdi\": \"Chodri\",\n    \"cdj\": \"Churahi\",\n    \"cdm\": \"Chepang\",\n    \"cdn\": \"Chaudangsi\",\n    \"cdo\": \"Min Dong Chinese\",\n    \"cdr\": \"Cinda-Regi-Tiyal\",\n    \"cds\": \"Chadian Sign Language\",\n    \"cdy\": \"Chadong\",\n    \"cdz\": \"Koda\",\n    \"ce\": \"Chechen\",\n    \"cea\": \"Lower Chehalis\",\n    \"ceb\": \"Cebuano\",\n    \"ceg\": \"Chamacoco\",\n    \"cek\": \"Eastern Khumi Chin\",\n    \"cel\": \"Celtic languages\",\n    \"cen\": \"Cen\",\n    \"cet\": \"Centúúm\",\n    \"cey\": \"Ekai Chin\",\n    \"cfa\": \"Dijim-Bwilim\",\n    \"cfd\": \"Cara\",\n    \"cfg\": \"Como Karim\",\n    \"cfm\": \"Falam Chin\",\n    \"cga\": \"Changriwa\",\n    \"cgc\": \"Kagayanen\",\n    \"cgg\": \"Chiga\",\n    \"cgk\": \"Chocangacakha\",\n    \"ch\": \"Chamorro\",\n    \"chb\": \"Chibcha\",\n    \"chc\": \"Catawba\",\n    \"chd\": \"Highland Oaxaca Chontal\",\n    \"chf\": \"Tabasco Chontal\",\n    \"chg\": \"Chagatai\",\n    \"chh\": \"Chinook\",\n    \"chj\": \"Ojitlán Chinantec\",\n    \"chk\": \"Chuukese\",\n    \"chl\": \"Cahuilla\",\n    \"chm\": \"Mari (Russia)\",\n    \"chn\": \"Chinook jargon\",\n    \"cho\": \"Choctaw\",\n    \"chp\": \"Chipewyan; Dene Suline\",\n    \"chq\": \"Quiotepec Chinantec\",\n    \"chr\": \"Cherokee\",\n    \"cht\": \"Cholón\",\n    \"chw\": \"Chuwabu\",\n    \"chx\": \"Chantyal\",\n    \"chy\": \"Cheyenne\",\n    \"chz\": \"Ozumacín Chinantec\",\n    \"cia\": \"Cia-Cia\",\n    \"cib\": \"Ci Gbe\",\n    \"cic\": \"Chickasaw\",\n    \"cid\": \"Chimariko\",\n    \"cie\": \"Cineni\",\n    \"cih\": \"Chinali\",\n    \"cik\": \"Chitkuli Kinnauri\",\n    \"cim\": \"Cimbrian\",\n    \"cin\": \"Cinta Larga\",\n    \"cip\": \"Chiapanec\",\n    \"cir\": \"Tiri; Haméa; Méa\",\n    \"ciw\": \"Chippewa\",\n    \"ciy\": \"Chaima\",\n    \"cja\": \"Western Cham\",\n    \"cje\": \"Chru\",\n    \"cjh\": \"Upper Chehalis\",\n    \"cji\": \"Chamalal\",\n    \"cjk\": 
\"Chokwe\",\n    \"cjm\": \"Eastern Cham\",\n    \"cjn\": \"Chenapian\",\n    \"cjo\": \"Ashéninka Pajonal\",\n    \"cjp\": \"Cabécar\",\n    \"cjs\": \"Shor\",\n    \"cjv\": \"Chuave\",\n    \"cjy\": \"Jinyu Chinese\",\n    \"ckb\": \"Central Kurdish\",\n    \"ckh\": \"Chak\",\n    \"ckl\": \"Cibak\",\n    \"ckm\": \"Chakavian\",\n    \"ckn\": \"Kaang Chin\",\n    \"cko\": \"Anufo\",\n    \"ckq\": \"Kajakse\",\n    \"ckr\": \"Kairak\",\n    \"cks\": \"Tayo\",\n    \"ckt\": \"Chukot\",\n    \"cku\": \"Koasati\",\n    \"ckv\": \"Kavalan\",\n    \"ckx\": \"Caka\",\n    \"cky\": \"Cakfem-Mushere\",\n    \"ckz\": \"Cakchiquel-Quiché Mixed Language\",\n    \"cla\": \"Ron\",\n    \"clc\": \"Chilcotin\",\n    \"cld\": \"Chaldean Neo-Aramaic\",\n    \"cle\": \"Lealao Chinantec\",\n    \"clh\": \"Chilisso\",\n    \"cli\": \"Chakali\",\n    \"clj\": \"Laitu Chin\",\n    \"clk\": \"Idu-Mishmi\",\n    \"cll\": \"Chala\",\n    \"clm\": \"Clallam\",\n    \"clo\": \"Lowland Oaxaca Chontal\",\n    \"clt\": \"Lautu Chin\",\n    \"clu\": \"Caluyanun\",\n    \"clw\": \"Chulym\",\n    \"cly\": \"Eastern Highland Chatino\",\n    \"cma\": \"Maa\",\n    \"cmc\": \"Chamic languages\",\n    \"cme\": \"Cerma\",\n    \"cmg\": \"Classical Mongolian\",\n    \"cmi\": \"Emberá-Chamí\",\n    \"cml\": \"Campalagian\",\n    \"cmm\": \"Michigamea\",\n    \"cmn\": \"Mandarin Chinese\",\n    \"cmo\": \"Central Mnong\",\n    \"cmr\": \"Mro-Khimi Chin\",\n    \"cms\": \"Messapic\",\n    \"cmt\": \"Camtho\",\n    \"cna\": \"Changthang\",\n    \"cnb\": \"Chinbon Chin\",\n    \"cnc\": \"Côông\",\n    \"cng\": \"Northern Qiang\",\n    \"cnh\": \"Hakha Chin; Haka Chin\",\n    \"cni\": \"Asháninka\",\n    \"cnk\": \"Khumi Chin\",\n    \"cnl\": \"Lalana Chinantec\",\n    \"cno\": \"Con\",\n    \"cnp\": \"Northern Ping Chinese; Northern Pinghua\",\n    \"cnq\": \"Chung\",\n    \"cnr\": \"Montenegrin\",\n    \"cns\": \"Central Asmat\",\n    \"cnt\": \"Tepetotutla Chinantec\",\n    \"cnu\": \"Chenoua\",\n    
\"cnw\": \"Ngawn Chin\",\n    \"cnx\": \"Middle Cornish\",\n    \"co\": \"Corsican\",\n    \"coa\": \"Cocos Islands Malay\",\n    \"cob\": \"Chicomuceltec\",\n    \"coc\": \"Cocopa\",\n    \"cod\": \"Cocama-Cocamilla\",\n    \"coe\": \"Koreguaje\",\n    \"cof\": \"Colorado\",\n    \"cog\": \"Chong\",\n    \"coh\": \"Chonyi-Dzihana-Kauma; Chichonyi-Chidzihana-Chikauma\",\n    \"coj\": \"Cochimi\",\n    \"cok\": \"Santa Teresa Cora\",\n    \"col\": \"Columbia-Wenatchi\",\n    \"com\": \"Comanche\",\n    \"con\": \"Cofán\",\n    \"coo\": \"Comox\",\n    \"cop\": \"Coptic\",\n    \"coq\": \"Coquille\",\n    \"cot\": \"Caquinte\",\n    \"cou\": \"Wamey\",\n    \"cov\": \"Cao Miao\",\n    \"cow\": \"Cowlitz\",\n    \"cox\": \"Nanti\",\n    \"coz\": \"Chochotec\",\n    \"cpa\": \"Palantla Chinantec\",\n    \"cpb\": \"Ucayali-Yurúa Ashéninka\",\n    \"cpc\": \"Ajyíninka Apurucayali\",\n    \"cpe\": \"English-based creoles and pidgins\",\n    \"cpf\": \"French-based creoles and pidgins\",\n    \"cpg\": \"Cappadocian Greek\",\n    \"cpi\": \"Chinese Pidgin English\",\n    \"cpn\": \"Cherepon\",\n    \"cpo\": \"Kpeego\",\n    \"cpp\": \"Portuguese-based creoles and pidgins\",\n    \"cps\": \"Capiznon\",\n    \"cpu\": \"Pichis Ashéninka\",\n    \"cpx\": \"Pu-Xian Chinese\",\n    \"cpy\": \"South Ucayali Ashéninka\",\n    \"cqd\": \"Chuanqiandian Cluster Miao\",\n    \"cr\": \"Cree\",\n    \"cra\": \"Chara\",\n    \"crb\": \"Island Carib\",\n    \"crc\": \"Lonwolwol\",\n    \"crd\": \"Coeur d'Alene\",\n    \"crf\": \"Caramanta\",\n    \"crg\": \"Michif\",\n    \"crh\": \"Crimean Tatar; Crimean Turkish\",\n    \"cri\": \"Sãotomense\",\n    \"crj\": \"Southern East Cree\",\n    \"crk\": \"Plains Cree\",\n    \"crl\": \"Northern East Cree\",\n    \"crm\": \"Moose Cree\",\n    \"crn\": \"El Nayar Cora\",\n    \"cro\": \"Crow\",\n    \"crp\": \"Creoles and pidgins\",\n    \"crq\": \"Iyo'wujwa Chorote\",\n    \"crr\": \"Carolina Algonquian\",\n    \"crs\": \"Seselwa Creole 
French\",\n    \"crt\": \"Iyojwa'ja Chorote\",\n    \"crv\": \"Chaura\",\n    \"crw\": \"Chrau\",\n    \"crx\": \"Carrier\",\n    \"cry\": \"Cori\",\n    \"crz\": \"Cruzeño\",\n    \"cs\": \"Czech\",\n    \"csa\": \"Chiltepec Chinantec\",\n    \"csb\": \"Kashubian\",\n    \"csc\": \"Catalan Sign Language; Lengua de señas catalana; Llengua de Signes Catalana\",\n    \"csd\": \"Chiangmai Sign Language\",\n    \"cse\": \"Czech Sign Language\",\n    \"csf\": \"Cuba Sign Language\",\n    \"csg\": \"Chilean Sign Language\",\n    \"csh\": \"Asho Chin\",\n    \"csi\": \"Coast Miwok\",\n    \"csj\": \"Songlai Chin\",\n    \"csk\": \"Jola-Kasa\",\n    \"csl\": \"Chinese Sign Language\",\n    \"csm\": \"Central Sierra Miwok\",\n    \"csn\": \"Colombian Sign Language\",\n    \"cso\": \"Sochiapam Chinantec; Sochiapan Chinantec\",\n    \"csp\": \"Southern Ping Chinese; Southern Pinghua\",\n    \"csq\": \"Croatia Sign Language\",\n    \"csr\": \"Costa Rican Sign Language\",\n    \"css\": \"Southern Ohlone\",\n    \"cst\": \"Northern Ohlone\",\n    \"csu\": \"Central Sudanic languages\",\n    \"csv\": \"Sumtu Chin\",\n    \"csw\": \"Swampy Cree\",\n    \"csx\": \"Cambodian Sign Language\",\n    \"csy\": \"Siyin Chin\",\n    \"csz\": \"Coos\",\n    \"cta\": \"Tataltepec Chatino\",\n    \"ctc\": \"Chetco\",\n    \"ctd\": \"Tedim Chin\",\n    \"cte\": \"Tepinapa Chinantec\",\n    \"ctg\": \"Chittagonian\",\n    \"cth\": \"Thaiphum Chin\",\n    \"ctl\": \"Tlacoatzintepec Chinantec\",\n    \"ctm\": \"Chitimacha\",\n    \"ctn\": \"Chhintange\",\n    \"cto\": \"Emberá-Catío\",\n    \"ctp\": \"Western Highland Chatino\",\n    \"cts\": \"Northern Catanduanes Bikol\",\n    \"ctt\": \"Wayanad Chetti\",\n    \"ctu\": \"Chol\",\n    \"cty\": \"Moundadan Chetty\",\n    \"ctz\": \"Zacatepec Chatino\",\n    \"cu\": \"Church Slavic; Church Slavonic; Old Bulgarian; Old Church Slavonic; Old Slavonic\",\n    \"cua\": \"Cua\",\n    \"cub\": \"Cubeo\",\n    \"cuc\": \"Usila Chinantec\",\n    \"cuh\": 
\"Chuka; Gichuka\",\n    \"cui\": \"Cuiba\",\n    \"cuj\": \"Mashco Piro\",\n    \"cuk\": \"San Blas Kuna\",\n    \"cul\": \"Culina; Kulina\",\n    \"cuo\": \"Cumanagoto\",\n    \"cup\": \"Cupeño\",\n    \"cuq\": \"Cun\",\n    \"cur\": \"Chhulung\",\n    \"cus\": \"Cushitic languages\",\n    \"cut\": \"Teutila Cuicatec\",\n    \"cuu\": \"Tai Ya\",\n    \"cuv\": \"Cuvok\",\n    \"cuw\": \"Chukwa\",\n    \"cux\": \"Tepeuxila Cuicatec\",\n    \"cuy\": \"Cuitlatec\",\n    \"cv\": \"Chuvash\",\n    \"cvg\": \"Chug\",\n    \"cvn\": \"Valle Nacional Chinantec\",\n    \"cwa\": \"Kabwa\",\n    \"cwb\": \"Maindo\",\n    \"cwd\": \"Woods Cree\",\n    \"cwe\": \"Kwere\",\n    \"cwg\": \"Chewong; Cheq Wong\",\n    \"cwt\": \"Kuwaataay\",\n    \"cy\": \"Welsh\",\n    \"cya\": \"Nopala Chatino\",\n    \"cyb\": \"Cayubaba\",\n    \"cyo\": \"Cuyonon\",\n    \"czh\": \"Huizhou Chinese\",\n    \"czk\": \"Knaanic\",\n    \"czn\": \"Zenzontepec Chatino\",\n    \"czo\": \"Min Zhong Chinese\",\n    \"czt\": \"Zotung Chin\",\n    \"da\": \"Danish\",\n    \"daa\": \"Dangaléat\",\n    \"dac\": \"Dambi\",\n    \"dad\": \"Marik\",\n    \"dae\": \"Duupa\",\n    \"dag\": \"Dagbani\",\n    \"dah\": \"Gwahatike\",\n    \"dai\": \"Day\",\n    \"daj\": \"Dar Fur Daju\",\n    \"dak\": \"Dakota\",\n    \"dal\": \"Dahalo\",\n    \"dam\": \"Damakawa\",\n    \"dao\": \"Daai Chin\",\n    \"daq\": \"Dandami Maria\",\n    \"dar\": \"Dargwa\",\n    \"das\": \"Daho-Doo\",\n    \"dau\": \"Dar Sila Daju\",\n    \"dav\": \"Taita; Dawida\",\n    \"daw\": \"Davawenyo\",\n    \"dax\": \"Dayi\",\n    \"day\": \"Land Dayak languages\",\n    \"daz\": \"Dao\",\n    \"dba\": \"Bangime\",\n    \"dbb\": \"Deno\",\n    \"dbd\": \"Dadiya\",\n    \"dbe\": \"Dabe\",\n    \"dbf\": \"Edopi\",\n    \"dbg\": \"Dogul Dom Dogon\",\n    \"dbi\": \"Doka\",\n    \"dbj\": \"Ida'an\",\n    \"dbl\": \"Dyirbal\",\n    \"dbm\": \"Duguri\",\n    \"dbn\": \"Duriankere\",\n    \"dbo\": \"Dulbu\",\n    \"dbp\": \"Duwai\",\n    \"dbq\": 
\"Daba\",\n    \"dbr\": \"Dabarre\",\n    \"dbt\": \"Ben Tey Dogon\",\n    \"dbu\": \"Bondum Dom Dogon\",\n    \"dbv\": \"Dungu\",\n    \"dbw\": \"Bankan Tey Dogon\",\n    \"dby\": \"Dibiyaso\",\n    \"dcc\": \"Deccan\",\n    \"dcr\": \"Negerhollands\",\n    \"dda\": \"Dadi Dadi\",\n    \"ddd\": \"Dongotono\",\n    \"dde\": \"Doondo\",\n    \"ddg\": \"Fataluku\",\n    \"ddi\": \"West Goodenough\",\n    \"ddj\": \"Jaru\",\n    \"ddn\": \"Dendi (Benin)\",\n    \"ddo\": \"Dido\",\n    \"ddr\": \"Dhudhuroa\",\n    \"dds\": \"Donno So Dogon\",\n    \"ddw\": \"Dawera-Daweloor\",\n    \"de\": \"German\",\n    \"dec\": \"Dagik\",\n    \"ded\": \"Dedua\",\n    \"dee\": \"Dewoin\",\n    \"def\": \"Dezfuli\",\n    \"deg\": \"Degema\",\n    \"deh\": \"Dehwari\",\n    \"dei\": \"Demisa\",\n    \"dek\": \"Dek\",\n    \"del\": \"Delaware\",\n    \"dem\": \"Dem\",\n    \"den\": \"Slave (Athapascan)\",\n    \"dep\": \"Pidgin Delaware\",\n    \"deq\": \"Dendi (Central African Republic)\",\n    \"der\": \"Deori\",\n    \"des\": \"Desano\",\n    \"dev\": \"Domung\",\n    \"dez\": \"Dengese\",\n    \"dga\": \"Southern Dagaare\",\n    \"dgb\": \"Bunoge Dogon\",\n    \"dgc\": \"Casiguran Dumagat Agta\",\n    \"dgd\": \"Dagaari Dioula\",\n    \"dge\": \"Degenan\",\n    \"dgg\": \"Doga\",\n    \"dgh\": \"Dghwede\",\n    \"dgi\": \"Northern Dagara\",\n    \"dgk\": \"Dagba\",\n    \"dgl\": \"Andaandi; Dongolawi\",\n    \"dgn\": \"Dagoman\",\n    \"dgo\": \"Dogri (individual language)\",\n    \"dgr\": \"Dogrib; Tłı̨chǫ\",\n    \"dgs\": \"Dogoso\",\n    \"dgt\": \"Ndra'ngith\",\n    \"dgw\": \"Daungwurrung\",\n    \"dgx\": \"Doghoro\",\n    \"dgz\": \"Daga\",\n    \"dhd\": \"Dhundari\",\n    \"dhg\": \"Dhangu-Djangu; Dhangu; Djangu\",\n    \"dhi\": \"Dhimal\",\n    \"dhl\": \"Dhalandji\",\n    \"dhm\": \"Zemba\",\n    \"dhn\": \"Dhanki\",\n    \"dho\": \"Dhodia\",\n    \"dhr\": \"Dhargari\",\n    \"dhs\": \"Dhaiso\",\n    \"dhu\": \"Dhurga\",\n    \"dhv\": \"Dehu; Drehu\",\n    \"dhw\": 
\"Dhanwar (Nepal)\",\n    \"dhx\": \"Dhungaloo\",\n    \"dia\": \"Dia\",\n    \"dib\": \"South Central Dinka\",\n    \"dic\": \"Lakota Dida\",\n    \"did\": \"Didinga\",\n    \"dif\": \"Dieri; Diyari\",\n    \"dig\": \"Digo; Chidigo\",\n    \"dih\": \"Kumiai\",\n    \"dii\": \"Dimbong\",\n    \"dij\": \"Dai\",\n    \"dik\": \"Southwestern Dinka\",\n    \"dil\": \"Dilling\",\n    \"dim\": \"Dime\",\n    \"din\": \"Dinka\",\n    \"dio\": \"Dibo\",\n    \"dip\": \"Northeastern Dinka\",\n    \"diq\": \"Dimli (individual language)\",\n    \"dir\": \"Dirim\",\n    \"dis\": \"Dimasa\",\n    \"diu\": \"Diriku\",\n    \"diw\": \"Northwestern Dinka\",\n    \"dix\": \"Dixon Reef\",\n    \"diy\": \"Diuwe\",\n    \"diz\": \"Ding\",\n    \"dja\": \"Djadjawurrung\",\n    \"djb\": \"Djinba\",\n    \"djc\": \"Dar Daju Daju\",\n    \"djd\": \"Djamindjung; Ngaliwurru\",\n    \"dje\": \"Zarma\",\n    \"djf\": \"Djangun\",\n    \"dji\": \"Djinang\",\n    \"djj\": \"Djeebbana\",\n    \"djk\": \"Eastern Maroon Creole; Businenge Tongo; Nenge\",\n    \"djm\": \"Jamsay Dogon\",\n    \"djn\": \"Jawoyn; Djauan\",\n    \"djo\": \"Jangkang\",\n    \"djr\": \"Djambarrpuyngu\",\n    \"dju\": \"Kapriman\",\n    \"djw\": \"Djawi\",\n    \"dka\": \"Dakpakha\",\n    \"dkg\": \"Kadung\",\n    \"dkk\": \"Dakka\",\n    \"dkr\": \"Kuijau\",\n    \"dks\": \"Southeastern Dinka\",\n    \"dkx\": \"Mazagway\",\n    \"dlg\": \"Dolgan\",\n    \"dlk\": \"Dahalik\",\n    \"dlm\": \"Dalmatian\",\n    \"dln\": \"Darlong\",\n    \"dma\": \"Duma\",\n    \"dmb\": \"Mombo Dogon\",\n    \"dmc\": \"Gavak\",\n    \"dmd\": \"Madhi Madhi\",\n    \"dme\": \"Dugwor\",\n    \"dmf\": \"Medefaidrin\",\n    \"dmg\": \"Upper Kinabatangan\",\n    \"dmk\": \"Domaaki\",\n    \"dml\": \"Dameli\",\n    \"dmm\": \"Dama\",\n    \"dmn\": \"Mande languages\",\n    \"dmo\": \"Kemedzung\",\n    \"dmr\": \"East Damar\",\n    \"dms\": \"Dampelas\",\n    \"dmu\": \"Dubu; Tebi\",\n    \"dmv\": \"Dumpas\",\n    \"dmw\": \"Mudburra\",\n    
\"dmx\": \"Dema\",\n    \"dmy\": \"Demta; Sowari\",\n    \"dna\": \"Upper Grand Valley Dani\",\n    \"dnd\": \"Daonda\",\n    \"dne\": \"Ndendeule\",\n    \"dng\": \"Dungan\",\n    \"dni\": \"Lower Grand Valley Dani\",\n    \"dnj\": \"Dan\",\n    \"dnk\": \"Dengka\",\n    \"dnn\": \"Dzùùngoo\",\n    \"dno\": \"Ndrulo; Northern Lendu\",\n    \"dnr\": \"Danaru\",\n    \"dnt\": \"Mid Grand Valley Dani\",\n    \"dnu\": \"Danau\",\n    \"dnv\": \"Danu\",\n    \"dnw\": \"Western Dani\",\n    \"dny\": \"Dení\",\n    \"doa\": \"Dom\",\n    \"dob\": \"Dobu\",\n    \"doc\": \"Northern Dong\",\n    \"doe\": \"Doe\",\n    \"dof\": \"Domu\",\n    \"doh\": \"Dong\",\n    \"doi\": \"Dogri (macrolanguage)\",\n    \"dok\": \"Dondo\",\n    \"dol\": \"Doso\",\n    \"don\": \"Toura (Papua New Guinea)\",\n    \"doo\": \"Dongo\",\n    \"dop\": \"Lukpa\",\n    \"doq\": \"Dominican Sign Language\",\n    \"dor\": \"Dori'o\",\n    \"dos\": \"Dogosé\",\n    \"dot\": \"Dass\",\n    \"dov\": \"Dombe\",\n    \"dow\": \"Doyayo\",\n    \"dox\": \"Bussa\",\n    \"doy\": \"Dompo\",\n    \"doz\": \"Dorze\",\n    \"dpp\": \"Papar\",\n    \"dra\": \"Dravidian languages\",\n    \"drb\": \"Dair\",\n    \"drc\": \"Minderico\",\n    \"drd\": \"Darmiya\",\n    \"dre\": \"Dolpo\",\n    \"drg\": \"Rungus\",\n    \"dri\": \"C'Lela\",\n    \"drl\": \"Paakantyi\",\n    \"drn\": \"West Damar\",\n    \"dro\": \"Daro-Matu Melanau\",\n    \"drq\": \"Dura\",\n    \"drs\": \"Gedeo\",\n    \"drt\": \"Drents\",\n    \"dru\": \"Rukai\",\n    \"dry\": \"Darai\",\n    \"dsb\": \"Lower Sorbian\",\n    \"dse\": \"Dutch Sign Language\",\n    \"dsh\": \"Daasanach\",\n    \"dsi\": \"Disa\",\n    \"dsl\": \"Danish Sign Language\",\n    \"dsn\": \"Dusner\",\n    \"dso\": \"Desiya\",\n    \"dsq\": \"Tadaksahak\",\n    \"dsz\": \"Mardin Sign Language\",\n    \"dta\": \"Daur\",\n    \"dtb\": \"Labuk-Kinabatangan Kadazan\",\n    \"dtd\": \"Ditidaht\",\n    \"dth\": \"Adithinngithigh\",\n    \"dti\": \"Ana Tinga Dogon\",\n    
\"dtk\": \"Tene Kan Dogon\",\n    \"dtm\": \"Tomo Kan Dogon\",\n    \"dtn\": \"Daatsʼíin\",\n    \"dto\": \"Tommo So Dogon\",\n    \"dtp\": \"Kadazan Dusun; Central Dusun\",\n    \"dtr\": \"Lotud\",\n    \"dts\": \"Toro So Dogon\",\n    \"dtt\": \"Toro Tegu Dogon\",\n    \"dtu\": \"Tebul Ure Dogon\",\n    \"dty\": \"Dotyali\",\n    \"dua\": \"Duala\",\n    \"dub\": \"Dubli\",\n    \"duc\": \"Duna\",\n    \"due\": \"Umiray Dumaget Agta\",\n    \"duf\": \"Dumbea; Drubea\",\n    \"dug\": \"Duruma; Chiduruma\",\n    \"duh\": \"Dungra Bhil\",\n    \"dui\": \"Dumun\",\n    \"duk\": \"Uyajitaya\",\n    \"dul\": \"Alabat Island Agta\",\n    \"dum\": \"Middle Dutch (ca. 1050-1350)\",\n    \"dun\": \"Dusun Deyah\",\n    \"duo\": \"Dupaninan Agta\",\n    \"dup\": \"Duano\",\n    \"duq\": \"Dusun Malang\",\n    \"dur\": \"Dii\",\n    \"dus\": \"Dumi\",\n    \"duu\": \"Drung\",\n    \"duv\": \"Duvle\",\n    \"duw\": \"Dusun Witu\",\n    \"dux\": \"Duungooma\",\n    \"duy\": \"Dicamay Agta\",\n    \"duz\": \"Duli-Gey\",\n    \"dv\": \"Dhivehi; Divehi; Maldivian\",\n    \"dva\": \"Duau\",\n    \"dwa\": \"Diri\",\n    \"dwk\": \"Dawik Kui\",\n    \"dwr\": \"Dawro\",\n    \"dws\": \"Dutton World Speedwords\",\n    \"dwu\": \"Dhuwal\",\n    \"dww\": \"Dawawa\",\n    \"dwy\": \"Dhuwaya\",\n    \"dwz\": \"Dewas Rai\",\n    \"dya\": \"Dyan\",\n    \"dyb\": \"Dyaberdyaber\",\n    \"dyd\": \"Dyugun\",\n    \"dyg\": \"Villa Viciosa Agta\",\n    \"dyi\": \"Djimini Senoufo\",\n    \"dym\": \"Yanda Dom Dogon\",\n    \"dyn\": \"Dyangadi; Dhanggatti\",\n    \"dyo\": \"Jola-Fonyi\",\n    \"dyu\": \"Dyula\",\n    \"dyy\": \"Djabugay; Dyaabugay\",\n    \"dz\": \"Dzongkha\",\n    \"dza\": \"Tunzu\",\n    \"dze\": \"Djiwarli\",\n    \"dzg\": \"Dazaga\",\n    \"dzl\": \"Dzalakha\",\n    \"dzn\": \"Dzando\",\n    \"eaa\": \"Karenggapa\",\n    \"ebc\": \"Beginci\",\n    \"ebg\": \"Ebughu\",\n    \"ebk\": \"Eastern Bontok\",\n    \"ebo\": \"Teke-Ebo\",\n    \"ebr\": \"Ebrié\",\n    \"ebu\": \"Embu; 
Kiembu\",\n    \"ecr\": \"Eteocretan\",\n    \"ecs\": \"Ecuadorian Sign Language\",\n    \"ecy\": \"Eteocypriot\",\n    \"ee\": \"Ewe\",\n    \"eee\": \"E\",\n    \"efa\": \"Efai\",\n    \"efe\": \"Efe\",\n    \"efi\": \"Efik\",\n    \"ega\": \"Ega\",\n    \"egl\": \"Emilian\",\n    \"egm\": \"Benamanga\",\n    \"ego\": \"Eggon\",\n    \"egx\": \"Egyptian languages\",\n    \"egy\": \"Egyptian (Ancient)\",\n    \"ehs\": \"Miyakubo Sign Language\",\n    \"ehu\": \"Ehueun\",\n    \"eip\": \"Eipomek\",\n    \"eit\": \"Eitiep\",\n    \"eiv\": \"Askopan\",\n    \"eja\": \"Ejamat\",\n    \"eka\": \"Ekajuk\",\n    \"eke\": \"Ekit\",\n    \"ekg\": \"Ekari\",\n    \"eki\": \"Eki\",\n    \"ekk\": \"Standard Estonian\",\n    \"ekl\": \"Kol (Bangladesh); Kol\",\n    \"ekm\": \"Elip\",\n    \"eko\": \"Koti\",\n    \"ekp\": \"Ekpeye\",\n    \"ekr\": \"Yace\",\n    \"eky\": \"Eastern Kayah\",\n    \"el\": \"Modern Greek (1453-)\",\n    \"ele\": \"Elepi\",\n    \"elh\": \"El Hugeirat\",\n    \"eli\": \"Nding\",\n    \"elk\": \"Elkei\",\n    \"elm\": \"Eleme\",\n    \"elo\": \"El Molo\",\n    \"elu\": \"Elu\",\n    \"elx\": \"Elamite\",\n    \"ema\": \"Emai-Iuleha-Ora\",\n    \"emb\": \"Embaloh\",\n    \"eme\": \"Emerillon\",\n    \"emg\": \"Eastern Meohang\",\n    \"emi\": \"Mussau-Emira\",\n    \"emk\": \"Eastern Maninkakan\",\n    \"emm\": \"Mamulique\",\n    \"emn\": \"Eman\",\n    \"emp\": \"Northern Emberá\",\n    \"emq\": \"Eastern Minyag\",\n    \"ems\": \"Pacific Gulf Yupik\",\n    \"emu\": \"Eastern Muria\",\n    \"emw\": \"Emplawas\",\n    \"emx\": \"Erromintxela\",\n    \"emy\": \"Epigraphic Mayan\",\n    \"emz\": \"Mbessa\",\n    \"en\": \"English\",\n    \"ena\": \"Apali\",\n    \"enb\": \"Markweeta\",\n    \"enc\": \"En\",\n    \"end\": \"Ende\",\n    \"enf\": \"Forest Enets\",\n    \"enh\": \"Tundra Enets\",\n    \"enl\": \"Enlhet\",\n    \"enm\": \"Middle English (1100-1500)\",\n    \"enn\": \"Engenni\",\n    \"eno\": \"Enggano\",\n    \"enq\": \"Enga\",\n    
\"enr\": \"Emumu; Emem\",\n    \"enu\": \"Enu\",\n    \"env\": \"Enwan (Edo State)\",\n    \"enw\": \"Enwan (Akwa Ibom State)\",\n    \"enx\": \"Enxet\",\n    \"eo\": \"Esperanto\",\n    \"eot\": \"Beti (Côte d'Ivoire)\",\n    \"epi\": \"Epie\",\n    \"era\": \"Eravallan\",\n    \"erg\": \"Sie\",\n    \"erh\": \"Eruwa\",\n    \"eri\": \"Ogea\",\n    \"erk\": \"South Efate\",\n    \"ero\": \"Horpa\",\n    \"err\": \"Erre\",\n    \"ers\": \"Ersu\",\n    \"ert\": \"Eritai\",\n    \"erw\": \"Erokwanas\",\n    \"es\": \"Spanish; Castilian\",\n    \"ese\": \"Ese Ejja\",\n    \"esg\": \"Aheri Gondi\",\n    \"esh\": \"Eshtehardi\",\n    \"esi\": \"North Alaskan Inupiatun\",\n    \"esk\": \"Northwest Alaska Inupiatun\",\n    \"esl\": \"Egypt Sign Language\",\n    \"esm\": \"Esuma\",\n    \"esn\": \"Salvadoran Sign Language\",\n    \"eso\": \"Estonian Sign Language\",\n    \"esq\": \"Esselen\",\n    \"ess\": \"Central Siberian Yupik\",\n    \"esu\": \"Central Yupik\",\n    \"esx\": \"Eskimo-Aleut languages\",\n    \"esy\": \"Eskayan\",\n    \"et\": \"Estonian\",\n    \"etb\": \"Etebi\",\n    \"etc\": \"Etchemin\",\n    \"eth\": \"Ethiopian Sign Language\",\n    \"etn\": \"Eton (Vanuatu)\",\n    \"eto\": \"Eton (Cameroon)\",\n    \"etr\": \"Edolo\",\n    \"ets\": \"Yekhee\",\n    \"ett\": \"Etruscan\",\n    \"etu\": \"Ejagham\",\n    \"etx\": \"Eten\",\n    \"etz\": \"Semimi\",\n    \"eu\": \"Basque\",\n    \"euq\": \"Basque (family)\",\n    \"eve\": \"Even\",\n    \"evh\": \"Uvbie\",\n    \"evn\": \"Evenki\",\n    \"ewo\": \"Ewondo\",\n    \"ext\": \"Extremaduran\",\n    \"eya\": \"Eyak\",\n    \"eyo\": \"Keiyo\",\n    \"eza\": \"Ezaa\",\n    \"eze\": \"Uzekwe\",\n    \"fa\": \"Persian\",\n    \"faa\": \"Fasu\",\n    \"fab\": \"Fa d'Ambu\",\n    \"fad\": \"Wagi\",\n    \"faf\": \"Fagani\",\n    \"fag\": \"Finongan\",\n    \"fah\": \"Baissa Fali\",\n    \"fai\": \"Faiwol\",\n    \"faj\": \"Faita\",\n    \"fak\": \"Fang (Cameroon)\",\n    \"fal\": \"South Fali\",\n    \"fam\": 
\"Fam\",\n    \"fan\": \"Fang (Equatorial Guinea)\",\n    \"fap\": \"Paloor\",\n    \"far\": \"Fataleka\",\n    \"fat\": \"Fanti\",\n    \"fau\": \"Fayu\",\n    \"fax\": \"Fala\",\n    \"fay\": \"Southwestern Fars\",\n    \"faz\": \"Northwestern Fars\",\n    \"fbl\": \"West Albay Bikol\",\n    \"fcs\": \"Quebec Sign Language\",\n    \"fer\": \"Feroge\",\n    \"ff\": \"Fulah\",\n    \"ffi\": \"Foia Foia\",\n    \"ffm\": \"Maasina Fulfulde\",\n    \"fgr\": \"Fongoro\",\n    \"fi\": \"Finnish\",\n    \"fia\": \"Nobiin\",\n    \"fie\": \"Fyer\",\n    \"fif\": \"Faifi\",\n    \"fil\": \"Filipino; Pilipino\",\n    \"fip\": \"Fipa\",\n    \"fir\": \"Firan\",\n    \"fit\": \"Tornedalen Finnish; Meänkieli\",\n    \"fiu\": \"Finno-Ugrian languages\",\n    \"fiw\": \"Fiwaga\",\n    \"fj\": \"Fijian\",\n    \"fkk\": \"Kirya-Konzəl\",\n    \"fkv\": \"Kven Finnish\",\n    \"fla\": \"Kalispel-Pend d'Oreille\",\n    \"flh\": \"Foau\",\n    \"fli\": \"Fali\",\n    \"fll\": \"North Fali\",\n    \"fln\": \"Flinders Island\",\n    \"flr\": \"Fuliiru\",\n    \"fly\": \"Flaaitaal; Tsotsitaal\",\n    \"fmp\": \"Fe'fe'\",\n    \"fmu\": \"Far Western Muria\",\n    \"fnb\": \"Fanbak\",\n    \"fng\": \"Fanagalo\",\n    \"fni\": \"Fania\",\n    \"fo\": \"Faroese\",\n    \"fod\": \"Foodo\",\n    \"foi\": \"Foi\",\n    \"fom\": \"Foma\",\n    \"fon\": \"Fon\",\n    \"for\": \"Fore\",\n    \"fos\": \"Siraya\",\n    \"fox\": \"Formosan languages\",\n    \"fpe\": \"Fernando Po Creole English\",\n    \"fqs\": \"Fas\",\n    \"fr\": \"French\",\n    \"frc\": \"Cajun French\",\n    \"frd\": \"Fordata\",\n    \"frk\": \"Frankish\",\n    \"frm\": \"Middle French (ca. 1400-1600)\",\n    \"fro\": \"Old French (842-ca. 
1400)\",\n    \"frp\": \"Arpitan; Francoprovençal\",\n    \"frq\": \"Forak\",\n    \"frr\": \"Northern Frisian\",\n    \"frs\": \"Eastern Frisian\",\n    \"frt\": \"Fortsenal\",\n    \"fse\": \"Finnish Sign Language\",\n    \"fsl\": \"French Sign Language\",\n    \"fss\": \"Finland-Swedish Sign Language; finlandssvenskt teckenspråk; suomenruotsalainen viittomakieli\",\n    \"fub\": \"Adamawa Fulfulde\",\n    \"fuc\": \"Pulaar\",\n    \"fud\": \"East Futuna\",\n    \"fue\": \"Borgu Fulfulde\",\n    \"fuf\": \"Pular\",\n    \"fuh\": \"Western Niger Fulfulde\",\n    \"fui\": \"Bagirmi Fulfulde\",\n    \"fuj\": \"Ko\",\n    \"fum\": \"Fum\",\n    \"fun\": \"Fulniô\",\n    \"fuq\": \"Central-Eastern Niger Fulfulde\",\n    \"fur\": \"Friulian\",\n    \"fut\": \"Futuna-Aniwa\",\n    \"fuu\": \"Furu\",\n    \"fuv\": \"Nigerian Fulfulde\",\n    \"fuy\": \"Fuyug\",\n    \"fvr\": \"Fur\",\n    \"fwa\": \"Fwâi\",\n    \"fwe\": \"Fwe\",\n    \"fy\": \"Western Frisian\",\n    \"ga\": \"Irish\",\n    \"gaa\": \"Ga\",\n    \"gab\": \"Gabri\",\n    \"gac\": \"Mixed Great Andamanese\",\n    \"gad\": \"Gaddang\",\n    \"gae\": \"Guarequena\",\n    \"gaf\": \"Gende\",\n    \"gag\": \"Gagauz\",\n    \"gah\": \"Alekano\",\n    \"gai\": \"Borei\",\n    \"gaj\": \"Gadsup\",\n    \"gak\": \"Gamkonora\",\n    \"gal\": \"Galolen\",\n    \"gam\": \"Kandawo\",\n    \"gan\": \"Gan Chinese\",\n    \"gao\": \"Gants\",\n    \"gap\": \"Gal\",\n    \"gaq\": \"Gata'\",\n    \"gar\": \"Galeya\",\n    \"gas\": \"Adiwasi Garasia\",\n    \"gat\": \"Kenati\",\n    \"gau\": \"Mudhili Gadaba\",\n    \"gaw\": \"Nobonob\",\n    \"gax\": \"Borana-Arsi-Guji Oromo\",\n    \"gay\": \"Gayo\",\n    \"gaz\": \"West Central Oromo\",\n    \"gba\": \"Gbaya (Central African Republic)\",\n    \"gbb\": \"Kaytetye\",\n    \"gbd\": \"Karajarri\",\n    \"gbe\": \"Niksek\",\n    \"gbf\": \"Gaikundi\",\n    \"gbg\": \"Gbanziri\",\n    \"gbh\": \"Defi Gbe\",\n    \"gbi\": \"Galela\",\n    \"gbj\": \"Bodo Gadaba\",\n    \"gbk\": 
\"Gaddi\",\n    \"gbl\": \"Gamit\",\n    \"gbm\": \"Garhwali\",\n    \"gbn\": \"Mo'da\",\n    \"gbo\": \"Northern Grebo\",\n    \"gbp\": \"Gbaya-Bossangoa\",\n    \"gbq\": \"Gbaya-Bozoum\",\n    \"gbr\": \"Gbagyi\",\n    \"gbs\": \"Gbesi Gbe\",\n    \"gbu\": \"Gagadu\",\n    \"gbv\": \"Gbanu\",\n    \"gbw\": \"Gabi-Gabi\",\n    \"gbx\": \"Eastern Xwla Gbe\",\n    \"gby\": \"Gbari\",\n    \"gbz\": \"Zoroastrian Dari\",\n    \"gcc\": \"Mali\",\n    \"gcd\": \"Ganggalida\",\n    \"gce\": \"Galice\",\n    \"gcf\": \"Guadeloupean Creole French\",\n    \"gcl\": \"Grenadian Creole English\",\n    \"gcn\": \"Gaina\",\n    \"gcr\": \"Guianese Creole French\",\n    \"gct\": \"Colonia Tovar German\",\n    \"gd\": \"Scottish Gaelic; Gaelic\",\n    \"gda\": \"Gade Lohar\",\n    \"gdb\": \"Pottangi Ollar Gadaba\",\n    \"gdc\": \"Gugu Badhun\",\n    \"gdd\": \"Gedaged\",\n    \"gde\": \"Gude\",\n    \"gdf\": \"Guduf-Gava\",\n    \"gdg\": \"Ga'dang\",\n    \"gdh\": \"Gadjerawang; Gajirrabeng\",\n    \"gdi\": \"Gundi\",\n    \"gdj\": \"Gurdjar\",\n    \"gdk\": \"Gadang\",\n    \"gdl\": \"Dirasha\",\n    \"gdm\": \"Laal\",\n    \"gdn\": \"Umanakaina\",\n    \"gdo\": \"Ghodoberi\",\n    \"gdq\": \"Mehri\",\n    \"gdr\": \"Wipi\",\n    \"gds\": \"Ghandruk Sign Language\",\n    \"gdt\": \"Kungardutyi\",\n    \"gdu\": \"Gudu\",\n    \"gdx\": \"Godwari\",\n    \"gea\": \"Geruma\",\n    \"geb\": \"Kire\",\n    \"gec\": \"Gboloo Grebo\",\n    \"ged\": \"Gade\",\n    \"gef\": \"Gerai\",\n    \"geg\": \"Gengle\",\n    \"geh\": \"Hutterite German; Hutterisch\",\n    \"gei\": \"Gebe\",\n    \"gej\": \"Gen\",\n    \"gek\": \"Ywom\",\n    \"gel\": \"ut-Ma'in\",\n    \"gem\": \"Germanic languages\",\n    \"geq\": \"Geme\",\n    \"ges\": \"Geser-Gorom\",\n    \"gev\": \"Eviya\",\n    \"gew\": \"Gera\",\n    \"gex\": \"Garre\",\n    \"gey\": \"Enya\",\n    \"gez\": \"Geez\",\n    \"gfk\": \"Patpatar\",\n    \"gft\": \"Gafat\",\n    \"gga\": \"Gao\",\n    \"ggb\": \"Gbii\",\n    \"ggd\": 
\"Gugadj\",\n    \"gge\": \"Gurr-goni\",\n    \"ggg\": \"Gurgula\",\n    \"ggk\": \"Kungarakany\",\n    \"ggl\": \"Ganglau\",\n    \"ggt\": \"Gitua\",\n    \"ggu\": \"Gagu; Gban\",\n    \"ggw\": \"Gogodala\",\n    \"gha\": \"Ghadamès\",\n    \"ghc\": \"Hiberno-Scottish Gaelic\",\n    \"ghe\": \"Southern Ghale\",\n    \"ghh\": \"Northern Ghale\",\n    \"ghk\": \"Geko Karen\",\n    \"ghl\": \"Ghulfan\",\n    \"ghn\": \"Ghanongga\",\n    \"gho\": \"Ghomara\",\n    \"ghr\": \"Ghera\",\n    \"ghs\": \"Guhu-Samane\",\n    \"ght\": \"Kuke; Kutang Ghale\",\n    \"gia\": \"Kija\",\n    \"gib\": \"Gibanawa\",\n    \"gic\": \"Gail\",\n    \"gid\": \"Gidar\",\n    \"gie\": \"Gaɓogbo; Guébie\",\n    \"gig\": \"Goaria\",\n    \"gih\": \"Githabul\",\n    \"gii\": \"Girirra\",\n    \"gil\": \"Gilbertese\",\n    \"gim\": \"Gimi (Eastern Highlands)\",\n    \"gin\": \"Hinukh\",\n    \"gip\": \"Gimi (West New Britain)\",\n    \"giq\": \"Green Gelao\",\n    \"gir\": \"Red Gelao\",\n    \"gis\": \"North Giziga\",\n    \"git\": \"Gitxsan\",\n    \"giu\": \"Mulao\",\n    \"giw\": \"White Gelao\",\n    \"gix\": \"Gilima\",\n    \"giy\": \"Giyug\",\n    \"giz\": \"South Giziga\",\n    \"gjk\": \"Kachi Koli\",\n    \"gjm\": \"Gunditjmara\",\n    \"gjn\": \"Gonja\",\n    \"gjr\": \"Gurindji Kriol\",\n    \"gju\": \"Gujari\",\n    \"gka\": \"Guya\",\n    \"gkd\": \"Magɨ (Madang Province)\",\n    \"gke\": \"Ndai\",\n    \"gkn\": \"Gokana\",\n    \"gko\": \"Kok-Nar\",\n    \"gkp\": \"Guinea Kpelle\",\n    \"gku\": \"ǂUngkue\",\n    \"gl\": \"Galician\",\n    \"glb\": \"Belning\",\n    \"glc\": \"Bon Gula\",\n    \"gld\": \"Nanai\",\n    \"glh\": \"Northwest Pashai; Northwest Pashayi\",\n    \"glj\": \"Gula Iro\",\n    \"glk\": \"Gilaki\",\n    \"gll\": \"Garlali\",\n    \"glo\": \"Galambu\",\n    \"glr\": \"Glaro-Twabo\",\n    \"glu\": \"Gula (Chad)\",\n    \"glw\": \"Glavda\",\n    \"gly\": \"Gule\",\n    \"gma\": \"Gambera\",\n    \"gmb\": \"Gula'alaa\",\n    \"gmd\": \"Mághdì\",\n    \"gme\": 
\"East Germanic languages\",\n    \"gmg\": \"Magɨyi\",\n    \"gmh\": \"Middle High German (ca. 1050-1500)\",\n    \"gml\": \"Middle Low German\",\n    \"gmm\": \"Gbaya-Mbodomo\",\n    \"gmn\": \"Gimnime\",\n    \"gmq\": \"North Germanic languages\",\n    \"gmr\": \"Mirning; Mirniny\",\n    \"gmu\": \"Gumalu\",\n    \"gmv\": \"Gamo\",\n    \"gmw\": \"West Germanic languages\",\n    \"gmx\": \"Magoma\",\n    \"gmy\": \"Mycenaean Greek\",\n    \"gmz\": \"Mgbolizhia\",\n    \"gn\": \"Guarani\",\n    \"gna\": \"Kaansa\",\n    \"gnb\": \"Gangte\",\n    \"gnc\": \"Guanche\",\n    \"gnd\": \"Zulgo-Gemzek\",\n    \"gne\": \"Ganang\",\n    \"gng\": \"Ngangam\",\n    \"gnh\": \"Lere\",\n    \"gni\": \"Gooniyandi\",\n    \"gnj\": \"Ngen\",\n    \"gnk\": \"ǁGana\",\n    \"gnl\": \"Gangulu\",\n    \"gnm\": \"Ginuman\",\n    \"gnn\": \"Gumatj\",\n    \"gno\": \"Northern Gondi\",\n    \"gnq\": \"Gana\",\n    \"gnr\": \"Gureng Gureng\",\n    \"gnt\": \"Guntai\",\n    \"gnu\": \"Gnau\",\n    \"gnw\": \"Western Bolivian Guaraní\",\n    \"gnz\": \"Ganzi\",\n    \"goa\": \"Guro\",\n    \"gob\": \"Playero\",\n    \"goc\": \"Gorakor\",\n    \"god\": \"Godié\",\n    \"goe\": \"Gongduk\",\n    \"gof\": \"Gofa\",\n    \"gog\": \"Gogo\",\n    \"goh\": \"Old High German (ca. 
750-1050)\",\n    \"goi\": \"Gobasi\",\n    \"goj\": \"Gowlan\",\n    \"gok\": \"Gowli\",\n    \"gol\": \"Gola\",\n    \"gom\": \"Goan Konkani\",\n    \"gon\": \"Gondi\",\n    \"goo\": \"Gone Dau\",\n    \"gop\": \"Yeretuar\",\n    \"goq\": \"Gorap\",\n    \"gor\": \"Gorontalo\",\n    \"gos\": \"Gronings\",\n    \"got\": \"Gothic\",\n    \"gou\": \"Gavar\",\n    \"gov\": \"Goo\",\n    \"gow\": \"Gorowa\",\n    \"gox\": \"Gobu\",\n    \"goy\": \"Goundo\",\n    \"goz\": \"Gozarkhani\",\n    \"gpa\": \"Gupa-Abawa\",\n    \"gpe\": \"Ghanaian Pidgin English\",\n    \"gpn\": \"Taiap\",\n    \"gqa\": \"Ga'anda\",\n    \"gqi\": \"Guiqiong\",\n    \"gqn\": \"Guana (Brazil)\",\n    \"gqr\": \"Gor\",\n    \"gqu\": \"Qau\",\n    \"gra\": \"Rajput Garasia\",\n    \"grb\": \"Grebo\",\n    \"grc\": \"Ancient Greek (to 1453)\",\n    \"grd\": \"Guruntum-Mbaaru\",\n    \"grg\": \"Madi\",\n    \"grh\": \"Gbiri-Niragu\",\n    \"gri\": \"Ghari\",\n    \"grj\": \"Southern Grebo\",\n    \"grk\": \"Greek languages\",\n    \"grm\": \"Kota Marudu Talantang\",\n    \"gro\": \"Groma\",\n    \"grq\": \"Gorovu\",\n    \"grr\": \"Taznatit\",\n    \"grs\": \"Gresi\",\n    \"grt\": \"Garo\",\n    \"gru\": \"Kistane\",\n    \"grv\": \"Central Grebo\",\n    \"grw\": \"Gweda\",\n    \"grx\": \"Guriaso\",\n    \"gry\": \"Barclayville Grebo\",\n    \"grz\": \"Guramalum\",\n    \"gse\": \"Ghanaian Sign Language\",\n    \"gsg\": \"German Sign Language\",\n    \"gsl\": \"Gusilay\",\n    \"gsm\": \"Guatemalan Sign Language\",\n    \"gsn\": \"Nema; Gusan\",\n    \"gso\": \"Southwest Gbaya\",\n    \"gsp\": \"Wasembo\",\n    \"gss\": \"Greek Sign Language\",\n    \"gsw\": \"Swiss German; Alemannic; Alsatian\",\n    \"gta\": \"Guató\",\n    \"gtu\": \"Aghu-Tharnggala\",\n    \"gu\": \"Gujarati\",\n    \"gua\": \"Shiki\",\n    \"gub\": \"Guajajára\",\n    \"guc\": \"Wayuu\",\n    \"gud\": \"Yocoboué Dida\",\n    \"gue\": \"Gurindji\",\n    \"guf\": \"Gupapuyngu\",\n    \"gug\": \"Paraguayan Guaraní\",\n    
\"guh\": \"Guahibo\",\n    \"gui\": \"Eastern Bolivian Guaraní\",\n    \"guk\": \"Gumuz\",\n    \"gul\": \"Sea Island Creole English\",\n    \"gum\": \"Guambiano\",\n    \"gun\": \"Mbyá Guaraní\",\n    \"guo\": \"Guayabero\",\n    \"gup\": \"Gunwinggu\",\n    \"guq\": \"Aché\",\n    \"gur\": \"Farefare\",\n    \"gus\": \"Guinean Sign Language\",\n    \"gut\": \"Maléku Jaíka\",\n    \"guu\": \"Yanomamö\",\n    \"guw\": \"Gun\",\n    \"gux\": \"Gourmanchéma\",\n    \"guz\": \"Gusii; Ekegusii\",\n    \"gv\": \"Manx\",\n    \"gva\": \"Guana (Paraguay)\",\n    \"gvc\": \"Guanano\",\n    \"gve\": \"Duwet\",\n    \"gvf\": \"Golin\",\n    \"gvj\": \"Guajá\",\n    \"gvl\": \"Gulay\",\n    \"gvm\": \"Gurmana\",\n    \"gvn\": \"Kuku-Yalanji\",\n    \"gvo\": \"Gavião Do Jiparaná\",\n    \"gvp\": \"Pará Gavião\",\n    \"gvr\": \"Gurung\",\n    \"gvs\": \"Gumawana\",\n    \"gvy\": \"Guyani\",\n    \"gwa\": \"Mbato\",\n    \"gwb\": \"Gwa\",\n    \"gwc\": \"Gawri; Kalami\",\n    \"gwd\": \"Gawwada\",\n    \"gwe\": \"Gweno\",\n    \"gwf\": \"Gowro\",\n    \"gwg\": \"Moo\",\n    \"gwi\": \"Gwichʼin\",\n    \"gwj\": \"ǀGwi\",\n    \"gwm\": \"Awngthim\",\n    \"gwn\": \"Gwandara\",\n    \"gwr\": \"Gwere\",\n    \"gwt\": \"Gawar-Bati\",\n    \"gwu\": \"Guwamu\",\n    \"gww\": \"Kwini\",\n    \"gwx\": \"Gua\",\n    \"gxx\": \"Wè Southern\",\n    \"gya\": \"Northwest Gbaya\",\n    \"gyb\": \"Garus\",\n    \"gyd\": \"Kayardild\",\n    \"gye\": \"Gyem\",\n    \"gyf\": \"Gungabula\",\n    \"gyg\": \"Gbayi\",\n    \"gyi\": \"Gyele\",\n    \"gyl\": \"Gayil\",\n    \"gym\": \"Ngäbere\",\n    \"gyn\": \"Guyanese Creole English\",\n    \"gyo\": \"Gyalsumdo\",\n    \"gyr\": \"Guarayu\",\n    \"gyy\": \"Gunya\",\n    \"gyz\": \"Geji; Gyaazi\",\n    \"gza\": \"Ganza\",\n    \"gzi\": \"Gazi\",\n    \"gzn\": \"Gane\",\n    \"ha\": \"Hausa\",\n    \"haa\": \"Han\",\n    \"hab\": \"Hanoi Sign Language\",\n    \"hac\": \"Gurani\",\n    \"had\": \"Hatam\",\n    \"hae\": \"Eastern Oromo\",\n    \"haf\": 
\"Haiphong Sign Language\",\n    \"hag\": \"Hanga\",\n    \"hah\": \"Hahon\",\n    \"hai\": \"Haida\",\n    \"haj\": \"Hajong\",\n    \"hak\": \"Hakka Chinese\",\n    \"hal\": \"Halang\",\n    \"ham\": \"Hewa\",\n    \"han\": \"Hangaza\",\n    \"hao\": \"Hakö\",\n    \"hap\": \"Hupla\",\n    \"haq\": \"Ha\",\n    \"har\": \"Harari\",\n    \"has\": \"Haisla\",\n    \"hav\": \"Havu\",\n    \"haw\": \"Hawaiian\",\n    \"hax\": \"Southern Haida\",\n    \"hay\": \"Haya\",\n    \"haz\": \"Hazaragi\",\n    \"hba\": \"Hamba\",\n    \"hbb\": \"Huba\",\n    \"hbn\": \"Heiban\",\n    \"hbo\": \"Ancient Hebrew\",\n    \"hbu\": \"Habu\",\n    \"hca\": \"Andaman Creole Hindi\",\n    \"hch\": \"Huichol\",\n    \"hdn\": \"Northern Haida\",\n    \"hds\": \"Honduras Sign Language\",\n    \"hdy\": \"Hadiyya\",\n    \"he\": \"Hebrew\",\n    \"hea\": \"Northern Qiandong Miao\",\n    \"hed\": \"Herdé\",\n    \"heg\": \"Helong\",\n    \"heh\": \"Hehe\",\n    \"hei\": \"Heiltsuk\",\n    \"hem\": \"Hemba\",\n    \"hgm\": \"Haiǁom\",\n    \"hgw\": \"Haigwai\",\n    \"hhi\": \"Hoia Hoia\",\n    \"hhr\": \"Kerak\",\n    \"hhy\": \"Hoyahoya\",\n    \"hi\": \"Hindi\",\n    \"hia\": \"Lamang\",\n    \"hib\": \"Hibito\",\n    \"hid\": \"Hidatsa\",\n    \"hif\": \"Fiji Hindi\",\n    \"hig\": \"Kamwe\",\n    \"hih\": \"Pamosu\",\n    \"hii\": \"Hinduri\",\n    \"hij\": \"Hijuk\",\n    \"hik\": \"Seit-Kaitetu\",\n    \"hil\": \"Hiligaynon\",\n    \"him\": \"Himachali languages; Western Pahari languages\",\n    \"hio\": \"Tsoa\",\n    \"hir\": \"Himarimã\",\n    \"hit\": \"Hittite\",\n    \"hiw\": \"Hiw\",\n    \"hix\": \"Hixkaryána\",\n    \"hji\": \"Haji\",\n    \"hka\": \"Kahe\",\n    \"hke\": \"Hunde\",\n    \"hkh\": \"Khah; Poguli\",\n    \"hkk\": \"Hunjara-Kaina Ke\",\n    \"hkn\": \"Mel-Khaonh\",\n    \"hks\": \"Hong Kong Sign Language; Heung Kong Sau Yue\",\n    \"hla\": \"Halia\",\n    \"hlb\": \"Halbi\",\n    \"hld\": \"Halang Doan\",\n    \"hle\": \"Hlersu\",\n    \"hlt\": \"Matu Chin\",\n 
   \"hlu\": \"Hieroglyphic Luwian\",\n    \"hma\": \"Southern Mashan Hmong; Southern Mashan Miao\",\n    \"hmb\": \"Humburi Senni Songhay\",\n    \"hmc\": \"Central Huishui Hmong; Central Huishui Miao\",\n    \"hmd\": \"Large Flowery Miao; A-hmaos; Da-Hua Miao\",\n    \"hme\": \"Eastern Huishui Hmong; Eastern Huishui Miao\",\n    \"hmf\": \"Hmong Don\",\n    \"hmg\": \"Southwestern Guiyang Hmong\",\n    \"hmh\": \"Southwestern Huishui Hmong; Southwestern Huishui Miao\",\n    \"hmi\": \"Northern Huishui Hmong; Northern Huishui Miao\",\n    \"hmj\": \"Ge; Gejia\",\n    \"hmk\": \"Maek\",\n    \"hml\": \"Luopohe Hmong; Luopohe Miao\",\n    \"hmm\": \"Central Mashan Hmong; Central Mashan Miao\",\n    \"hmn\": \"Hmong; Mong\",\n    \"hmp\": \"Northern Mashan Hmong; Northern Mashan Miao\",\n    \"hmq\": \"Eastern Qiandong Miao\",\n    \"hmr\": \"Hmar\",\n    \"hms\": \"Southern Qiandong Miao\",\n    \"hmt\": \"Hamtai\",\n    \"hmu\": \"Hamap\",\n    \"hmv\": \"Hmong Dô\",\n    \"hmw\": \"Western Mashan Hmong; Western Mashan Miao\",\n    \"hmx\": \"Hmong-Mien languages\",\n    \"hmy\": \"Southern Guiyang Hmong; Southern Guiyang Miao\",\n    \"hmz\": \"Hmong Shua; Sinicized Miao\",\n    \"hna\": \"Mina (Cameroon)\",\n    \"hnd\": \"Southern Hindko\",\n    \"hne\": \"Chhattisgarhi\",\n    \"hng\": \"Hungu\",\n    \"hnh\": \"ǁAni\",\n    \"hni\": \"Hani\",\n    \"hnj\": \"Hmong Njua; Mong Leng; Mong Njua\",\n    \"hnn\": \"Hanunoo\",\n    \"hno\": \"Northern Hindko\",\n    \"hns\": \"Caribbean Hindustani\",\n    \"hnu\": \"Hung\",\n    \"ho\": \"Hiri Motu\",\n    \"hoa\": \"Hoava\",\n    \"hob\": \"Mari (Madang Province)\",\n    \"hoc\": \"Ho\",\n    \"hod\": \"Holma\",\n    \"hoe\": \"Horom\",\n    \"hoh\": \"Hobyót\",\n    \"hoi\": \"Holikachuk\",\n    \"hoj\": \"Hadothi; Haroti\",\n    \"hok\": \"Hokan languages\",\n    \"hol\": \"Holu\",\n    \"hom\": \"Homa\",\n    \"hoo\": \"Holoholo\",\n    \"hop\": \"Hopi\",\n    \"hor\": \"Horo\",\n    \"hos\": \"Ho Chi Minh City 
Sign Language\",\n    \"hot\": \"Hote; Malê\",\n    \"hov\": \"Hovongan\",\n    \"how\": \"Honi\",\n    \"hoy\": \"Holiya\",\n    \"hoz\": \"Hozo\",\n    \"hpo\": \"Hpon\",\n    \"hps\": \"Hawai'i Sign Language (HSL); Hawai'i Pidgin Sign Language\",\n    \"hr\": \"Croatian\",\n    \"hra\": \"Hrangkhol\",\n    \"hrc\": \"Niwer Mil\",\n    \"hre\": \"Hre\",\n    \"hrk\": \"Haruku\",\n    \"hrm\": \"Horned Miao\",\n    \"hro\": \"Haroi\",\n    \"hrp\": \"Nhirrpi\",\n    \"hrt\": \"Hértevin\",\n    \"hru\": \"Hruso\",\n    \"hrw\": \"Warwar Feni\",\n    \"hrx\": \"Hunsrik\",\n    \"hrz\": \"Harzani\",\n    \"hsb\": \"Upper Sorbian\",\n    \"hsh\": \"Hungarian Sign Language\",\n    \"hsl\": \"Hausa Sign Language\",\n    \"hsn\": \"Xiang Chinese\",\n    \"hss\": \"Harsusi\",\n    \"ht\": \"Haitian; Haitian Creole\",\n    \"hti\": \"Hoti\",\n    \"hto\": \"Minica Huitoto\",\n    \"hts\": \"Hadza\",\n    \"htu\": \"Hitu\",\n    \"htx\": \"Middle Hittite\",\n    \"hu\": \"Hungarian\",\n    \"hub\": \"Huambisa\",\n    \"huc\": \"ǂHua; ǂʼAmkhoe\",\n    \"hud\": \"Huaulu\",\n    \"hue\": \"San Francisco Del Mar Huave\",\n    \"huf\": \"Humene\",\n    \"hug\": \"Huachipaeri\",\n    \"huh\": \"Huilliche\",\n    \"hui\": \"Huli\",\n    \"huj\": \"Northern Guiyang Hmong; Northern Guiyang Miao\",\n    \"huk\": \"Hulung\",\n    \"hul\": \"Hula\",\n    \"hum\": \"Hungana\",\n    \"huo\": \"Hu\",\n    \"hup\": \"Hupa\",\n    \"huq\": \"Tsat\",\n    \"hur\": \"Halkomelem\",\n    \"hus\": \"Huastec\",\n    \"hut\": \"Humla\",\n    \"huu\": \"Murui Huitoto\",\n    \"huv\": \"San Mateo Del Mar Huave\",\n    \"huw\": \"Hukumina\",\n    \"hux\": \"Nüpode Huitoto\",\n    \"huy\": \"Hulaulá\",\n    \"huz\": \"Hunzib\",\n    \"hvc\": \"Haitian Vodoun Culture Language\",\n    \"hve\": \"San Dionisio Del Mar Huave\",\n    \"hvk\": \"Haveke\",\n    \"hvn\": \"Sabu\",\n    \"hvv\": \"Santa María Del Mar Huave\",\n    \"hwa\": \"Wané\",\n    \"hwc\": \"Hawai'i Creole English; Hawai'i Pidgin\",\n    
\"hwo\": \"Hwana\",\n    \"hy\": \"Armenian\",\n    \"hya\": \"Hya\",\n    \"hyw\": \"Western Armenian\",\n    \"hyx\": \"Armenian (family)\",\n    \"hz\": \"Herero\",\n    \"ia\": \"Interlingua (International Auxiliary Language Association)\",\n    \"iai\": \"Iaai\",\n    \"ian\": \"Iatmul\",\n    \"iar\": \"Purari\",\n    \"iba\": \"Iban\",\n    \"ibb\": \"Ibibio\",\n    \"ibd\": \"Iwaidja\",\n    \"ibe\": \"Akpes\",\n    \"ibg\": \"Ibanag\",\n    \"ibh\": \"Bih\",\n    \"ibl\": \"Ibaloi\",\n    \"ibm\": \"Agoi\",\n    \"ibn\": \"Ibino\",\n    \"ibr\": \"Ibuoro\",\n    \"ibu\": \"Ibu\",\n    \"iby\": \"Ibani\",\n    \"ica\": \"Ede Ica\",\n    \"ich\": \"Etkywan\",\n    \"icl\": \"Icelandic Sign Language\",\n    \"icr\": \"Islander Creole English\",\n    \"id\": \"Indonesian\",\n    \"ida\": \"Idakho-Isukha-Tiriki; Luidakho-Luisukha-Lutirichi\",\n    \"idb\": \"Indo-Portuguese\",\n    \"idc\": \"Idon; Ajiya\",\n    \"idd\": \"Ede Idaca\",\n    \"ide\": \"Idere\",\n    \"idi\": \"Idi\",\n    \"idr\": \"Indri\",\n    \"ids\": \"Idesa\",\n    \"idt\": \"Idaté\",\n    \"idu\": \"Idoma\",\n    \"ie\": \"Interlingue; Occidental\",\n    \"ifa\": \"Amganad Ifugao\",\n    \"ifb\": \"Batad Ifugao; Ayangan Ifugao\",\n    \"ife\": \"Ifè\",\n    \"iff\": \"Ifo\",\n    \"ifk\": \"Tuwali Ifugao\",\n    \"ifm\": \"Teke-Fuumu\",\n    \"ifu\": \"Mayoyao Ifugao\",\n    \"ify\": \"Keley-I Kallahan\",\n    \"ig\": \"Igbo\",\n    \"igb\": \"Ebira\",\n    \"ige\": \"Igede\",\n    \"igg\": \"Igana\",\n    \"igl\": \"Igala\",\n    \"igm\": \"Kanggape\",\n    \"ign\": \"Ignaciano\",\n    \"igo\": \"Isebe\",\n    \"igs\": \"Interglossa\",\n    \"igw\": \"Igwe\",\n    \"ihb\": \"Iha Based Pidgin\",\n    \"ihi\": \"Ihievbe\",\n    \"ihp\": \"Iha\",\n    \"ihw\": \"Bidhawal\",\n    \"ii\": \"Sichuan Yi; Nuosu\",\n    \"iin\": \"Thiin\",\n    \"iir\": \"Indo-Iranian languages\",\n    \"ijc\": \"Izon\",\n    \"ije\": \"Biseni\",\n    \"ijj\": \"Ede Ije\",\n    \"ijn\": \"Kalabari\",\n    
\"ijo\": \"Ijo languages\",\n    \"ijs\": \"Southeast Ijo\",\n    \"ik\": \"Inupiaq\",\n    \"ike\": \"Eastern Canadian Inuktitut\",\n    \"iki\": \"Iko\",\n    \"ikk\": \"Ika\",\n    \"ikl\": \"Ikulu\",\n    \"iko\": \"Olulumo-Ikom\",\n    \"ikp\": \"Ikpeshi\",\n    \"ikr\": \"Ikaranggal\",\n    \"iks\": \"Inuit Sign Language\",\n    \"ikt\": \"Inuinnaqtun; Western Canadian Inuktitut\",\n    \"ikv\": \"Iku-Gora-Ankwa\",\n    \"ikw\": \"Ikwere\",\n    \"ikx\": \"Ik\",\n    \"ikz\": \"Ikizu\",\n    \"ila\": \"Ile Ape\",\n    \"ilb\": \"Ila\",\n    \"ilg\": \"Garig-Ilgar\",\n    \"ili\": \"Ili Turki\",\n    \"ilk\": \"Ilongot\",\n    \"ilm\": \"Iranun (Malaysia)\",\n    \"ilo\": \"Iloko\",\n    \"ilp\": \"Iranun (Philippines)\",\n    \"ils\": \"International Sign\",\n    \"ilu\": \"Ili'uun\",\n    \"ilv\": \"Ilue\",\n    \"ima\": \"Mala Malasar\",\n    \"imi\": \"Anamgura\",\n    \"iml\": \"Miluk\",\n    \"imn\": \"Imonda\",\n    \"imo\": \"Imbongu\",\n    \"imr\": \"Imroing\",\n    \"ims\": \"Marsian\",\n    \"imt\": \"Imotong\",\n    \"imy\": \"Milyan\",\n    \"inb\": \"Inga\",\n    \"inc\": \"Indic languages\",\n    \"ine\": \"Indo-European languages\",\n    \"ing\": \"Degexit'an\",\n    \"inh\": \"Ingush\",\n    \"inj\": \"Jungle Inga\",\n    \"inl\": \"Indonesian Sign Language\",\n    \"inm\": \"Minaean\",\n    \"inn\": \"Isinai\",\n    \"ino\": \"Inoke-Yate\",\n    \"inp\": \"Iñapari\",\n    \"ins\": \"Indian Sign Language\",\n    \"int\": \"Intha\",\n    \"inz\": \"Ineseño\",\n    \"io\": \"Ido\",\n    \"ior\": \"Inor\",\n    \"iou\": \"Tuma-Irumu\",\n    \"iow\": \"Iowa-Oto\",\n    \"ipi\": \"Ipili\",\n    \"ipo\": \"Ipiko\",\n    \"iqu\": \"Iquito\",\n    \"iqw\": \"Ikwo\",\n    \"ira\": \"Iranian languages\",\n    \"ire\": \"Iresim\",\n    \"irh\": \"Irarutu\",\n    \"iri\": \"Rigwe; Irigwe\",\n    \"irk\": \"Iraqw\",\n    \"irn\": \"Irántxe\",\n    \"iro\": \"Iroquoian languages\",\n    \"irr\": \"Ir\",\n    \"iru\": \"Irula\",\n    \"irx\": 
\"Kamberau\",\n    \"iry\": \"Iraya\",\n    \"is\": \"Icelandic\",\n    \"isa\": \"Isabi\",\n    \"isc\": \"Isconahua\",\n    \"isd\": \"Isnag\",\n    \"ise\": \"Italian Sign Language\",\n    \"isg\": \"Irish Sign Language\",\n    \"ish\": \"Esan\",\n    \"isi\": \"Nkem-Nkum\",\n    \"isk\": \"Ishkashimi\",\n    \"ism\": \"Masimasi\",\n    \"isn\": \"Isanzu\",\n    \"iso\": \"Isoko\",\n    \"isr\": \"Israeli Sign Language\",\n    \"ist\": \"Istriot\",\n    \"isu\": \"Isu (Menchum Division)\",\n    \"it\": \"Italian\",\n    \"itb\": \"Binongan Itneg\",\n    \"itc\": \"Italic languages\",\n    \"itd\": \"Southern Tidung\",\n    \"ite\": \"Itene\",\n    \"iti\": \"Inlaod Itneg\",\n    \"itk\": \"Judeo-Italian\",\n    \"itl\": \"Itelmen\",\n    \"itm\": \"Itu Mbon Uzo\",\n    \"ito\": \"Itonama\",\n    \"itr\": \"Iteri\",\n    \"its\": \"Isekiri\",\n    \"itt\": \"Maeng Itneg\",\n    \"itv\": \"Itawit\",\n    \"itw\": \"Ito\",\n    \"itx\": \"Itik\",\n    \"ity\": \"Moyadan Itneg\",\n    \"itz\": \"Itzá\",\n    \"iu\": \"Inuktitut\",\n    \"ium\": \"Iu Mien\",\n    \"ivb\": \"Ibatan\",\n    \"ivv\": \"Ivatan\",\n    \"iwk\": \"I-Wak\",\n    \"iwm\": \"Iwam\",\n    \"iwo\": \"Iwur\",\n    \"iws\": \"Sepik Iwam\",\n    \"ixc\": \"Ixcatec\",\n    \"ixl\": \"Ixil\",\n    \"iya\": \"Iyayu\",\n    \"iyo\": \"Mesaka\",\n    \"iyx\": \"Yaka (Congo)\",\n    \"izh\": \"Ingrian\",\n    \"izr\": \"Izere\",\n    \"izz\": \"Izii\",\n    \"ja\": \"Japanese\",\n    \"jaa\": \"Jamamadí\",\n    \"jab\": \"Hyam\",\n    \"jac\": \"Popti'; Jakalteko\",\n    \"jad\": \"Jahanka\",\n    \"jae\": \"Yabem\",\n    \"jaf\": \"Jara\",\n    \"jah\": \"Jah Hut\",\n    \"jaj\": \"Zazao\",\n    \"jak\": \"Jakun\",\n    \"jal\": \"Yalahatan\",\n    \"jam\": \"Jamaican Creole English\",\n    \"jan\": \"Jandai\",\n    \"jao\": \"Yanyuwa\",\n    \"jaq\": \"Yaqay\",\n    \"jas\": \"New Caledonian Javanese\",\n    \"jat\": \"Jakati\",\n    \"jau\": \"Yaur\",\n    \"jax\": \"Jambi Malay\",\n    \"jay\": 
\"Yan-nhangu; Nhangu\",\n    \"jaz\": \"Jawe\",\n    \"jbe\": \"Judeo-Berber\",\n    \"jbi\": \"Badjiri\",\n    \"jbj\": \"Arandai\",\n    \"jbk\": \"Barikewa\",\n    \"jbm\": \"Bijim\",\n    \"jbn\": \"Nafusi\",\n    \"jbo\": \"Lojban\",\n    \"jbr\": \"Jofotek-Bromnya\",\n    \"jbt\": \"Jabutí\",\n    \"jbu\": \"Jukun Takum\",\n    \"jbw\": \"Yawijibaya\",\n    \"jcs\": \"Jamaican Country Sign Language\",\n    \"jct\": \"Krymchak\",\n    \"jda\": \"Jad\",\n    \"jdg\": \"Jadgali\",\n    \"jdt\": \"Judeo-Tat\",\n    \"jeb\": \"Jebero\",\n    \"jee\": \"Jerung\",\n    \"jeh\": \"Jeh\",\n    \"jei\": \"Yei\",\n    \"jek\": \"Jeri Kuo\",\n    \"jel\": \"Yelmek\",\n    \"jen\": \"Dza\",\n    \"jer\": \"Jere\",\n    \"jet\": \"Manem\",\n    \"jeu\": \"Jonkor Bourmataguil\",\n    \"jgb\": \"Ngbee\",\n    \"jge\": \"Judeo-Georgian\",\n    \"jgk\": \"Gwak\",\n    \"jgo\": \"Ngomba\",\n    \"jhi\": \"Jehai\",\n    \"jhs\": \"Jhankot Sign Language\",\n    \"jia\": \"Jina\",\n    \"jib\": \"Jibu\",\n    \"jic\": \"Tol\",\n    \"jid\": \"Bu (Kaduna State)\",\n    \"jie\": \"Jilbe\",\n    \"jig\": \"Jingulu; Djingili\",\n    \"jih\": \"sTodsde; Shangzhai\",\n    \"jii\": \"Jiiddu\",\n    \"jil\": \"Jilim\",\n    \"jim\": \"Jimi (Cameroon)\",\n    \"jio\": \"Jiamao\",\n    \"jiq\": \"Guanyinqiao; Lavrung\",\n    \"jit\": \"Jita\",\n    \"jiu\": \"Youle Jinuo\",\n    \"jiv\": \"Shuar\",\n    \"jiy\": \"Buyuan Jinuo\",\n    \"jje\": \"Jejueo\",\n    \"jjr\": \"Bankal\",\n    \"jka\": \"Kaera\",\n    \"jkm\": \"Mobwa Karen\",\n    \"jko\": \"Kubo\",\n    \"jkp\": \"Paku Karen\",\n    \"jkr\": \"Koro (India)\",\n    \"jks\": \"Amami Koniya Sign Language\",\n    \"jku\": \"Labir\",\n    \"jle\": \"Ngile\",\n    \"jls\": \"Jamaican Sign Language\",\n    \"jma\": \"Dima\",\n    \"jmb\": \"Zumbun\",\n    \"jmc\": \"Machame\",\n    \"jmd\": \"Yamdena\",\n    \"jmi\": \"Jimi (Nigeria)\",\n    \"jml\": \"Jumli\",\n    \"jmn\": \"Makuri Naga\",\n    \"jmr\": \"Kamara\",\n    \"jms\": 
\"Mashi (Nigeria)\",\n    \"jmw\": \"Mouwase\",\n    \"jmx\": \"Western Juxtlahuaca Mixtec\",\n    \"jna\": \"Jangshung\",\n    \"jnd\": \"Jandavra\",\n    \"jng\": \"Yangman\",\n    \"jni\": \"Janji\",\n    \"jnj\": \"Yemsa\",\n    \"jnl\": \"Rawat\",\n    \"jns\": \"Jaunsari\",\n    \"job\": \"Joba\",\n    \"jod\": \"Wojenaka\",\n    \"jog\": \"Jogi\",\n    \"jor\": \"Jorá\",\n    \"jos\": \"Jordanian Sign Language\",\n    \"jow\": \"Jowulu\",\n    \"jpa\": \"Jewish Palestinian Aramaic\",\n    \"jpr\": \"Judeo-Persian\",\n    \"jpx\": \"Japanese (family)\",\n    \"jqr\": \"Jaqaru\",\n    \"jra\": \"Jarai\",\n    \"jrb\": \"Judeo-Arabic\",\n    \"jrr\": \"Jiru\",\n    \"jrt\": \"Jakattoe\",\n    \"jru\": \"Japrería\",\n    \"jsl\": \"Japanese Sign Language\",\n    \"jua\": \"Júma\",\n    \"jub\": \"Wannu\",\n    \"juc\": \"Jurchen\",\n    \"jud\": \"Worodougou\",\n    \"juh\": \"Hõne\",\n    \"jui\": \"Ngadjuri\",\n    \"juk\": \"Wapan\",\n    \"jul\": \"Jirel\",\n    \"jum\": \"Jumjum\",\n    \"jun\": \"Juang\",\n    \"juo\": \"Jiba\",\n    \"jup\": \"Hupdë\",\n    \"jur\": \"Jurúna\",\n    \"jus\": \"Jumla Sign Language\",\n    \"jut\": \"Jutish\",\n    \"juu\": \"Ju\",\n    \"juw\": \"Wãpha\",\n    \"juy\": \"Juray\",\n    \"jv\": \"Javanese\",\n    \"jvd\": \"Javindo\",\n    \"jvn\": \"Caribbean Javanese\",\n    \"jwi\": \"Jwira-Pepesa\",\n    \"jya\": \"Jiarong\",\n    \"jye\": \"Judeo-Yemeni Arabic\",\n    \"jyy\": \"Jaya\",\n    \"ka\": \"Georgian\",\n    \"kaa\": \"Kara-Kalpak; Karakalpak\",\n    \"kab\": \"Kabyle\",\n    \"kac\": \"Kachin; Jingpho\",\n    \"kad\": \"Adara\",\n    \"kae\": \"Ketangalan\",\n    \"kaf\": \"Katso\",\n    \"kag\": \"Kajaman\",\n    \"kah\": \"Kara (Central African Republic)\",\n    \"kai\": \"Karekare\",\n    \"kaj\": \"Jju\",\n    \"kak\": \"Kalanguya; Kayapa Kallahan\",\n    \"kam\": \"Kamba (Kenya)\",\n    \"kao\": \"Xaasongaxango\",\n    \"kap\": \"Bezhta\",\n    \"kaq\": \"Capanahua\",\n    \"kar\": \"Karen languages\",\n 
   \"kav\": \"Katukína\",\n    \"kaw\": \"Kawi\",\n    \"kax\": \"Kao\",\n    \"kay\": \"Kamayurá\",\n    \"kba\": \"Kalarko\",\n    \"kbb\": \"Kaxuiâna\",\n    \"kbc\": \"Kadiwéu\",\n    \"kbd\": \"Kabardian\",\n    \"kbe\": \"Kanju\",\n    \"kbg\": \"Khamba\",\n    \"kbh\": \"Camsá\",\n    \"kbi\": \"Kaptiau\",\n    \"kbj\": \"Kari\",\n    \"kbk\": \"Grass Koiari\",\n    \"kbl\": \"Kanembu\",\n    \"kbm\": \"Iwal\",\n    \"kbn\": \"Kare (Central African Republic)\",\n    \"kbo\": \"Keliko\",\n    \"kbp\": \"Kabiyè\",\n    \"kbq\": \"Kamano\",\n    \"kbr\": \"Kafa\",\n    \"kbs\": \"Kande\",\n    \"kbt\": \"Abadi\",\n    \"kbu\": \"Kabutra\",\n    \"kbv\": \"Dera (Indonesia)\",\n    \"kbw\": \"Kaiep\",\n    \"kbx\": \"Ap Ma\",\n    \"kby\": \"Manga Kanuri\",\n    \"kbz\": \"Duhwa\",\n    \"kca\": \"Khanty\",\n    \"kcb\": \"Kawacha\",\n    \"kcc\": \"Lubila\",\n    \"kcd\": \"Ngkâlmpw Kanum\",\n    \"kce\": \"Kaivi\",\n    \"kcf\": \"Ukaan\",\n    \"kcg\": \"Tyap\",\n    \"kch\": \"Vono\",\n    \"kci\": \"Kamantan\",\n    \"kcj\": \"Kobiana\",\n    \"kck\": \"Kalanga\",\n    \"kcl\": \"Kela (Papua New Guinea); Kala\",\n    \"kcm\": \"Gula (Central African Republic)\",\n    \"kcn\": \"Nubi\",\n    \"kco\": \"Kinalakna\",\n    \"kcp\": \"Kanga\",\n    \"kcq\": \"Kamo\",\n    \"kcr\": \"Katla\",\n    \"kcs\": \"Koenoem\",\n    \"kct\": \"Kaian\",\n    \"kcu\": \"Kami (Tanzania)\",\n    \"kcv\": \"Kete\",\n    \"kcw\": \"Kabwari\",\n    \"kcx\": \"Kachama-Ganjule\",\n    \"kcy\": \"Korandje\",\n    \"kcz\": \"Konongo\",\n    \"kda\": \"Worimi\",\n    \"kdc\": \"Kutu\",\n    \"kdd\": \"Yankunytjatjara\",\n    \"kde\": \"Makonde\",\n    \"kdf\": \"Mamusi\",\n    \"kdg\": \"Seba\",\n    \"kdh\": \"Tem\",\n    \"kdi\": \"Kumam\",\n    \"kdj\": \"Karamojong\",\n    \"kdk\": \"Numèè; Kwényi\",\n    \"kdl\": \"Tsikimba\",\n    \"kdm\": \"Kagoma\",\n    \"kdn\": \"Kunda\",\n    \"kdo\": \"Kordofanian languages\",\n    \"kdp\": \"Kaningdon-Nindem\",\n    \"kdq\": \"Koch\",\n   
 \"kdr\": \"Karaim\",\n    \"kdt\": \"Kuy\",\n    \"kdu\": \"Kadaru\",\n    \"kdw\": \"Koneraw\",\n    \"kdx\": \"Kam\",\n    \"kdy\": \"Keder; Keijar\",\n    \"kdz\": \"Kwaja\",\n    \"kea\": \"Kabuverdianu\",\n    \"keb\": \"Kélé\",\n    \"kec\": \"Keiga\",\n    \"ked\": \"Kerewe\",\n    \"kee\": \"Eastern Keres\",\n    \"kef\": \"Kpessi\",\n    \"keg\": \"Tese\",\n    \"keh\": \"Keak\",\n    \"kei\": \"Kei\",\n    \"kej\": \"Kadar\",\n    \"kek\": \"Kekchí\",\n    \"kel\": \"Kela (Democratic Republic of Congo)\",\n    \"kem\": \"Kemak\",\n    \"ken\": \"Kenyang\",\n    \"keo\": \"Kakwa\",\n    \"kep\": \"Kaikadi\",\n    \"keq\": \"Kamar\",\n    \"ker\": \"Kera\",\n    \"kes\": \"Kugbo\",\n    \"ket\": \"Ket\",\n    \"keu\": \"Akebu\",\n    \"kev\": \"Kanikkaran\",\n    \"kew\": \"West Kewa\",\n    \"kex\": \"Kukna\",\n    \"key\": \"Kupia\",\n    \"kez\": \"Kukele\",\n    \"kfa\": \"Kodava\",\n    \"kfb\": \"Northwestern Kolami\",\n    \"kfc\": \"Konda-Dora\",\n    \"kfd\": \"Korra Koraga\",\n    \"kfe\": \"Kota (India)\",\n    \"kff\": \"Koya\",\n    \"kfg\": \"Kudiya\",\n    \"kfh\": \"Kurichiya\",\n    \"kfi\": \"Kannada Kurumba\",\n    \"kfj\": \"Kemiehua\",\n    \"kfk\": \"Kinnauri\",\n    \"kfl\": \"Kung\",\n    \"kfm\": \"Khunsari\",\n    \"kfn\": \"Kuk\",\n    \"kfo\": \"Koro (Côte d'Ivoire)\",\n    \"kfp\": \"Korwa\",\n    \"kfq\": \"Korku\",\n    \"kfr\": \"Kachhi; Kutchi\",\n    \"kfs\": \"Bilaspuri\",\n    \"kft\": \"Kanjari\",\n    \"kfu\": \"Katkari\",\n    \"kfv\": \"Kurmukar\",\n    \"kfw\": \"Kharam Naga\",\n    \"kfx\": \"Kullu Pahari\",\n    \"kfy\": \"Kumaoni\",\n    \"kfz\": \"Koromfé\",\n    \"kg\": \"Kongo\",\n    \"kga\": \"Koyaga\",\n    \"kgb\": \"Kawe\",\n    \"kge\": \"Komering\",\n    \"kgf\": \"Kube\",\n    \"kgg\": \"Kusunda\",\n    \"kgi\": \"Selangor Sign Language\",\n    \"kgj\": \"Gamale Kham\",\n    \"kgk\": \"Kaiwá\",\n    \"kgl\": \"Kunggari\",\n    \"kgm\": \"Karipúna\",\n    \"kgn\": \"Karingani\",\n    \"kgo\": 
\"Krongo\",\n    \"kgp\": \"Kaingang\",\n    \"kgq\": \"Kamoro\",\n    \"kgr\": \"Abun\",\n    \"kgs\": \"Kumbainggar\",\n    \"kgt\": \"Somyev\",\n    \"kgu\": \"Kobol\",\n    \"kgv\": \"Karas\",\n    \"kgw\": \"Karon Dori\",\n    \"kgx\": \"Kamaru\",\n    \"kgy\": \"Kyerung\",\n    \"kha\": \"Khasi\",\n    \"khb\": \"Lü\",\n    \"khc\": \"Tukang Besi North\",\n    \"khd\": \"Bädi Kanum\",\n    \"khe\": \"Korowai\",\n    \"khf\": \"Khuen\",\n    \"khg\": \"Khams Tibetan\",\n    \"khh\": \"Kehu\",\n    \"khi\": \"Khoisan languages\",\n    \"khj\": \"Kuturmi\",\n    \"khk\": \"Halh Mongolian\",\n    \"khl\": \"Lusi\",\n    \"khn\": \"Khandesi\",\n    \"kho\": \"Khotanese; Sakan\",\n    \"khp\": \"Kapori; Kapauri\",\n    \"khq\": \"Koyra Chiini Songhay\",\n    \"khr\": \"Kharia\",\n    \"khs\": \"Kasua\",\n    \"kht\": \"Khamti\",\n    \"khu\": \"Nkhumbi\",\n    \"khv\": \"Khvarshi\",\n    \"khw\": \"Khowar\",\n    \"khx\": \"Kanu\",\n    \"khy\": \"Kele (Democratic Republic of Congo)\",\n    \"khz\": \"Keapara\",\n    \"ki\": \"Kikuyu; Gikuyu\",\n    \"kia\": \"Kim\",\n    \"kib\": \"Koalib\",\n    \"kic\": \"Kickapoo\",\n    \"kid\": \"Koshin\",\n    \"kie\": \"Kibet\",\n    \"kif\": \"Eastern Parbate Kham\",\n    \"kig\": \"Kimaama; Kimaghima\",\n    \"kih\": \"Kilmeri\",\n    \"kii\": \"Kitsai\",\n    \"kij\": \"Kilivila\",\n    \"kil\": \"Kariya\",\n    \"kim\": \"Karagas\",\n    \"kio\": \"Kiowa\",\n    \"kip\": \"Sheshi Kham\",\n    \"kiq\": \"Kosadle; Kosare\",\n    \"kis\": \"Kis\",\n    \"kit\": \"Agob\",\n    \"kiu\": \"Kirmanjki (individual language)\",\n    \"kiv\": \"Kimbu\",\n    \"kiw\": \"Northeast Kiwai\",\n    \"kix\": \"Khiamniungan Naga\",\n    \"kiy\": \"Kirikiri\",\n    \"kiz\": \"Kisi\",\n    \"kj\": \"Kuanyama; Kwanyama\",\n    \"kja\": \"Mlap\",\n    \"kjb\": \"Q'anjob'al; Kanjobal\",\n    \"kjc\": \"Coastal Konjo\",\n    \"kjd\": \"Southern Kiwai\",\n    \"kje\": \"Kisar\",\n    \"kjg\": \"Khmu\",\n    \"kjh\": \"Khakas\",\n    \"kji\": 
\"Zabana\",\n    \"kjj\": \"Khinalugh\",\n    \"kjk\": \"Highland Konjo\",\n    \"kjl\": \"Western Parbate Kham\",\n    \"kjm\": \"Kháng\",\n    \"kjn\": \"Kunjen\",\n    \"kjo\": \"Harijan Kinnauri\",\n    \"kjp\": \"Pwo Eastern Karen\",\n    \"kjq\": \"Western Keres\",\n    \"kjr\": \"Kurudu\",\n    \"kjs\": \"East Kewa\",\n    \"kjt\": \"Phrae Pwo Karen\",\n    \"kju\": \"Kashaya\",\n    \"kjv\": \"Kaikavian Literary Language\",\n    \"kjx\": \"Ramopa\",\n    \"kjy\": \"Erave\",\n    \"kjz\": \"Bumthangkha\",\n    \"kk\": \"Kazakh\",\n    \"kka\": \"Kakanda\",\n    \"kkb\": \"Kwerisa\",\n    \"kkc\": \"Odoodee\",\n    \"kkd\": \"Kinuku\",\n    \"kke\": \"Kakabe\",\n    \"kkf\": \"Kalaktang Monpa\",\n    \"kkg\": \"Mabaka Valley Kalinga\",\n    \"kkh\": \"Khün\",\n    \"kki\": \"Kagulu\",\n    \"kkj\": \"Kako\",\n    \"kkk\": \"Kokota\",\n    \"kkl\": \"Kosarek Yale\",\n    \"kkm\": \"Kiong\",\n    \"kkn\": \"Kon Keu\",\n    \"kko\": \"Karko\",\n    \"kkp\": \"Gugubera; Koko-Bera\",\n    \"kkq\": \"Kaeku\",\n    \"kkr\": \"Kir-Balar\",\n    \"kks\": \"Giiwo\",\n    \"kkt\": \"Koi\",\n    \"kku\": \"Tumi\",\n    \"kkv\": \"Kangean\",\n    \"kkw\": \"Teke-Kukuya\",\n    \"kkx\": \"Kohin\",\n    \"kky\": \"Guugu Yimidhirr; Guguyimidjir\",\n    \"kkz\": \"Kaska\",\n    \"kl\": \"Kalaallisut; Greenlandic\",\n    \"kla\": \"Klamath-Modoc\",\n    \"klb\": \"Kiliwa\",\n    \"klc\": \"Kolbila\",\n    \"kld\": \"Gamilaraay\",\n    \"kle\": \"Kulung (Nepal)\",\n    \"klf\": \"Kendeje\",\n    \"klg\": \"Tagakaulo\",\n    \"klh\": \"Weliki\",\n    \"kli\": \"Kalumpang\",\n    \"klj\": \"Khalaj\",\n    \"klk\": \"Kono (Nigeria)\",\n    \"kll\": \"Kagan Kalagan\",\n    \"klm\": \"Migum\",\n    \"kln\": \"Kalenjin\",\n    \"klo\": \"Kapya\",\n    \"klp\": \"Kamasa\",\n    \"klq\": \"Rumu\",\n    \"klr\": \"Khaling\",\n    \"kls\": \"Kalasha\",\n    \"klt\": \"Nukna\",\n    \"klu\": \"Klao\",\n    \"klv\": \"Maskelynes\",\n    \"klw\": \"Tado; Lindu\",\n    \"klx\": 
\"Koluwawa\",\n    \"kly\": \"Kalao\",\n    \"klz\": \"Kabola\",\n    \"km\": \"Khmer; Central Khmer\",\n    \"kma\": \"Konni\",\n    \"kmb\": \"Kimbundu\",\n    \"kmc\": \"Southern Dong\",\n    \"kmd\": \"Majukayang Kalinga\",\n    \"kme\": \"Bakole\",\n    \"kmf\": \"Kare (Papua New Guinea)\",\n    \"kmg\": \"Kâte\",\n    \"kmh\": \"Kalam\",\n    \"kmi\": \"Kami (Nigeria)\",\n    \"kmj\": \"Kumarbhag Paharia\",\n    \"kmk\": \"Limos Kalinga\",\n    \"kml\": \"Tanudan Kalinga\",\n    \"kmm\": \"Kom (India)\",\n    \"kmn\": \"Awtuw\",\n    \"kmo\": \"Kwoma\",\n    \"kmp\": \"Gimme\",\n    \"kmq\": \"Kwama\",\n    \"kmr\": \"Northern Kurdish\",\n    \"kms\": \"Kamasau\",\n    \"kmt\": \"Kemtuik\",\n    \"kmu\": \"Kanite\",\n    \"kmv\": \"Karipúna Creole French\",\n    \"kmw\": \"Komo (Democratic Republic of Congo)\",\n    \"kmx\": \"Waboda\",\n    \"kmy\": \"Koma\",\n    \"kmz\": \"Khorasani Turkish\",\n    \"kn\": \"Kannada\",\n    \"kna\": \"Dera (Nigeria)\",\n    \"knb\": \"Lubuagan Kalinga\",\n    \"knc\": \"Central Kanuri\",\n    \"knd\": \"Konda\",\n    \"kne\": \"Kankanaey\",\n    \"knf\": \"Mankanya\",\n    \"kng\": \"Koongo\",\n    \"kni\": \"Kanufi\",\n    \"knj\": \"Western Kanjobal\",\n    \"knk\": \"Kuranko\",\n    \"knl\": \"Keninjal\",\n    \"knm\": \"Kanamarí\",\n    \"knn\": \"Konkani (individual language)\",\n    \"kno\": \"Kono (Sierra Leone)\",\n    \"knp\": \"Kwanja\",\n    \"knq\": \"Kintaq\",\n    \"knr\": \"Kaningra\",\n    \"kns\": \"Kensiu\",\n    \"knt\": \"Panoan Katukína\",\n    \"knu\": \"Kono (Guinea)\",\n    \"knv\": \"Tabo\",\n    \"knw\": \"Kung-Ekoka\",\n    \"knx\": \"Kendayan; Salako\",\n    \"kny\": \"Kanyok\",\n    \"knz\": \"Kalamsé\",\n    \"ko\": \"Korean\",\n    \"koa\": \"Konomala\",\n    \"koc\": \"Kpati\",\n    \"kod\": \"Kodi\",\n    \"koe\": \"Kacipo-Bale Suri\",\n    \"kof\": \"Kubi\",\n    \"kog\": \"Cogui; Kogi\",\n    \"koh\": \"Koyo\",\n    \"koi\": \"Komi-Permyak\",\n    \"kok\": \"Konkani (macrolanguage)\",\n   
 \"kol\": \"Kol (Papua New Guinea)\",\n    \"koo\": \"Konzo\",\n    \"kop\": \"Waube\",\n    \"koq\": \"Kota (Gabon)\",\n    \"kos\": \"Kosraean\",\n    \"kot\": \"Lagwan\",\n    \"kou\": \"Koke\",\n    \"kov\": \"Kudu-Camo\",\n    \"kow\": \"Kugama\",\n    \"koy\": \"Koyukon\",\n    \"koz\": \"Korak\",\n    \"kpa\": \"Kutto\",\n    \"kpb\": \"Mullu Kurumba\",\n    \"kpc\": \"Curripaco\",\n    \"kpd\": \"Koba\",\n    \"kpe\": \"Kpelle\",\n    \"kpf\": \"Komba\",\n    \"kpg\": \"Kapingamarangi\",\n    \"kph\": \"Kplang\",\n    \"kpi\": \"Kofei\",\n    \"kpj\": \"Karajá\",\n    \"kpk\": \"Kpan\",\n    \"kpl\": \"Kpala\",\n    \"kpm\": \"Koho\",\n    \"kpn\": \"Kepkiriwát\",\n    \"kpo\": \"Ikposo\",\n    \"kpq\": \"Korupun-Sela\",\n    \"kpr\": \"Korafe-Yegha\",\n    \"kps\": \"Tehit\",\n    \"kpt\": \"Karata\",\n    \"kpu\": \"Kafoa\",\n    \"kpv\": \"Komi-Zyrian\",\n    \"kpw\": \"Kobon\",\n    \"kpx\": \"Mountain Koiali\",\n    \"kpy\": \"Koryak\",\n    \"kpz\": \"Kupsabiny\",\n    \"kqa\": \"Mum\",\n    \"kqb\": \"Kovai\",\n    \"kqc\": \"Doromu-Koki\",\n    \"kqd\": \"Koy Sanjaq Surat\",\n    \"kqe\": \"Kalagan\",\n    \"kqf\": \"Kakabai\",\n    \"kqg\": \"Khe\",\n    \"kqh\": \"Kisankasa\",\n    \"kqi\": \"Koitabu\",\n    \"kqj\": \"Koromira\",\n    \"kqk\": \"Kotafon Gbe\",\n    \"kql\": \"Kyenele\",\n    \"kqm\": \"Khisa\",\n    \"kqn\": \"Kaonde\",\n    \"kqo\": \"Eastern Krahn\",\n    \"kqp\": \"Kimré\",\n    \"kqq\": \"Krenak\",\n    \"kqr\": \"Kimaragang\",\n    \"kqs\": \"Northern Kissi\",\n    \"kqt\": \"Klias River Kadazan\",\n    \"kqu\": \"Seroa\",\n    \"kqv\": \"Okolod\",\n    \"kqw\": \"Kandas\",\n    \"kqx\": \"Mser\",\n    \"kqy\": \"Koorete\",\n    \"kqz\": \"Korana\",\n    \"kr\": \"Kanuri\",\n    \"kra\": \"Kumhali\",\n    \"krb\": \"Karkin\",\n    \"krc\": \"Karachay-Balkar\",\n    \"krd\": \"Kairui-Midiki\",\n    \"kre\": \"Panará\",\n    \"krf\": \"Koro (Vanuatu)\",\n    \"krh\": \"Kurama\",\n    \"kri\": \"Krio\",\n    \"krj\": 
\"Kinaray-A\",\n    \"krk\": \"Kerek\",\n    \"krl\": \"Karelian\",\n    \"krn\": \"Sapo\",\n    \"kro\": \"Kru languages\",\n    \"krp\": \"Korop\",\n    \"krr\": \"Krung\",\n    \"krs\": \"Gbaya (Sudan)\",\n    \"krt\": \"Tumari Kanuri\",\n    \"kru\": \"Kurukh\",\n    \"krv\": \"Kavet\",\n    \"krw\": \"Western Krahn\",\n    \"krx\": \"Karon\",\n    \"kry\": \"Kryts\",\n    \"krz\": \"Sota Kanum\",\n    \"ks\": \"Kashmiri\",\n    \"ksa\": \"Shuwa-Zamani\",\n    \"ksb\": \"Shambala\",\n    \"ksc\": \"Southern Kalinga\",\n    \"ksd\": \"Kuanua\",\n    \"kse\": \"Kuni\",\n    \"ksf\": \"Bafia\",\n    \"ksg\": \"Kusaghe\",\n    \"ksh\": \"Kölsch\",\n    \"ksi\": \"Krisa; I'saka\",\n    \"ksj\": \"Uare\",\n    \"ksk\": \"Kansa\",\n    \"ksl\": \"Kumalu\",\n    \"ksm\": \"Kumba\",\n    \"ksn\": \"Kasiguranin\",\n    \"kso\": \"Kofa\",\n    \"ksp\": \"Kaba\",\n    \"ksq\": \"Kwaami\",\n    \"ksr\": \"Borong\",\n    \"kss\": \"Southern Kisi\",\n    \"kst\": \"Winyé\",\n    \"ksu\": \"Khamyang\",\n    \"ksv\": \"Kusu\",\n    \"ksw\": \"S'gaw Karen\",\n    \"ksx\": \"Kedang\",\n    \"ksy\": \"Kharia Thar\",\n    \"ksz\": \"Kodaku\",\n    \"kta\": \"Katua\",\n    \"ktb\": \"Kambaata\",\n    \"ktc\": \"Kholok\",\n    \"ktd\": \"Kokata; Kukatha\",\n    \"kte\": \"Nubri\",\n    \"ktf\": \"Kwami\",\n    \"ktg\": \"Kalkutung\",\n    \"kth\": \"Karanga\",\n    \"kti\": \"North Muyu\",\n    \"ktj\": \"Plapo Krumen\",\n    \"ktk\": \"Kaniet\",\n    \"ktl\": \"Koroshi\",\n    \"ktm\": \"Kurti\",\n    \"ktn\": \"Karitiâna\",\n    \"kto\": \"Kuot\",\n    \"ktp\": \"Kaduo\",\n    \"ktq\": \"Katabaga\",\n    \"kts\": \"South Muyu\",\n    \"ktt\": \"Ketum\",\n    \"ktu\": \"Kituba (Democratic Republic of Congo)\",\n    \"ktv\": \"Eastern Katu\",\n    \"ktw\": \"Kato\",\n    \"ktx\": \"Kaxararí\",\n    \"kty\": \"Kango (Bas-Uélé District)\",\n    \"ktz\": \"Juǀʼhoan; Juǀʼhoansi\",\n    \"ku\": \"Kurdish\",\n    \"kub\": \"Kutep\",\n    \"kuc\": \"Kwinsu\",\n    \"kud\": \"'Auhelawa\",\n  
  \"kue\": \"Kuman (Papua New Guinea)\",\n    \"kuf\": \"Western Katu\",\n    \"kug\": \"Kupa\",\n    \"kuh\": \"Kushi\",\n    \"kui\": \"Kuikúro-Kalapálo; Kalapalo\",\n    \"kuj\": \"Kuria\",\n    \"kuk\": \"Kepo'\",\n    \"kul\": \"Kulere\",\n    \"kum\": \"Kumyk\",\n    \"kun\": \"Kunama\",\n    \"kuo\": \"Kumukio\",\n    \"kup\": \"Kunimaipa\",\n    \"kuq\": \"Karipuna\",\n    \"kus\": \"Kusaal\",\n    \"kut\": \"Kutenai\",\n    \"kuu\": \"Upper Kuskokwim\",\n    \"kuv\": \"Kur\",\n    \"kuw\": \"Kpagua\",\n    \"kux\": \"Kukatja\",\n    \"kuy\": \"Kuuku-Ya'u\",\n    \"kuz\": \"Kunza\",\n    \"kv\": \"Komi\",\n    \"kva\": \"Bagvalal\",\n    \"kvb\": \"Kubu\",\n    \"kvc\": \"Kove\",\n    \"kvd\": \"Kui (Indonesia)\",\n    \"kve\": \"Kalabakan\",\n    \"kvf\": \"Kabalai\",\n    \"kvg\": \"Kuni-Boazi\",\n    \"kvh\": \"Komodo\",\n    \"kvi\": \"Kwang\",\n    \"kvj\": \"Psikye\",\n    \"kvk\": \"Korean Sign Language\",\n    \"kvl\": \"Kayaw\",\n    \"kvm\": \"Kendem\",\n    \"kvn\": \"Border Kuna\",\n    \"kvo\": \"Dobel\",\n    \"kvp\": \"Kompane\",\n    \"kvq\": \"Geba Karen\",\n    \"kvr\": \"Kerinci\",\n    \"kvt\": \"Lahta Karen; Lahta\",\n    \"kvu\": \"Yinbaw Karen\",\n    \"kvv\": \"Kola\",\n    \"kvw\": \"Wersing\",\n    \"kvx\": \"Parkari Koli\",\n    \"kvy\": \"Yintale Karen; Yintale\",\n    \"kvz\": \"Tsakwambo; Tsaukambo\",\n    \"kw\": \"Cornish\",\n    \"kwa\": \"Dâw\",\n    \"kwb\": \"Kwa\",\n    \"kwc\": \"Likwala\",\n    \"kwd\": \"Kwaio\",\n    \"kwe\": \"Kwerba\",\n    \"kwf\": \"Kwara'ae\",\n    \"kwg\": \"Sara Kaba Deme\",\n    \"kwh\": \"Kowiai\",\n    \"kwi\": \"Awa-Cuaiquer\",\n    \"kwj\": \"Kwanga\",\n    \"kwk\": \"Kwakiutl\",\n    \"kwl\": \"Kofyar\",\n    \"kwm\": \"Kwambi\",\n    \"kwn\": \"Kwangali\",\n    \"kwo\": \"Kwomtari\",\n    \"kwp\": \"Kodia\",\n    \"kwr\": \"Kwer\",\n    \"kws\": \"Kwese\",\n    \"kwt\": \"Kwesten\",\n    \"kwu\": \"Kwakum\",\n    \"kwv\": \"Sara Kaba Náà\",\n    \"kww\": \"Kwinti\",\n    \"kwx\": 
\"Khirwar\",\n    \"kwy\": \"San Salvador Kongo\",\n    \"kwz\": \"Kwadi\",\n    \"kxa\": \"Kairiru\",\n    \"kxb\": \"Krobu\",\n    \"kxc\": \"Konso; Khonso\",\n    \"kxd\": \"Brunei\",\n    \"kxf\": \"Manumanaw Karen; Manumanaw\",\n    \"kxh\": \"Karo (Ethiopia)\",\n    \"kxi\": \"Keningau Murut\",\n    \"kxj\": \"Kulfa\",\n    \"kxk\": \"Zayein Karen\",\n    \"kxm\": \"Northern Khmer\",\n    \"kxn\": \"Kanowit-Tanjong Melanau\",\n    \"kxo\": \"Kanoé\",\n    \"kxp\": \"Wadiyara Koli\",\n    \"kxq\": \"Smärky Kanum\",\n    \"kxr\": \"Koro (Papua New Guinea)\",\n    \"kxs\": \"Kangjia\",\n    \"kxt\": \"Koiwat\",\n    \"kxv\": \"Kuvi\",\n    \"kxw\": \"Konai\",\n    \"kxx\": \"Likuba\",\n    \"kxy\": \"Kayong\",\n    \"kxz\": \"Kerewo\",\n    \"ky\": \"Kirghiz; Kyrgyz\",\n    \"kya\": \"Kwaya\",\n    \"kyb\": \"Butbut Kalinga\",\n    \"kyc\": \"Kyaka\",\n    \"kyd\": \"Karey\",\n    \"kye\": \"Krache\",\n    \"kyf\": \"Kouya\",\n    \"kyg\": \"Keyagana\",\n    \"kyh\": \"Karok\",\n    \"kyi\": \"Kiput\",\n    \"kyj\": \"Karao\",\n    \"kyk\": \"Kamayo\",\n    \"kyl\": \"Kalapuya\",\n    \"kym\": \"Kpatili\",\n    \"kyn\": \"Northern Binukidnon\",\n    \"kyo\": \"Kelon\",\n    \"kyp\": \"Kang\",\n    \"kyq\": \"Kenga\",\n    \"kyr\": \"Kuruáya\",\n    \"kys\": \"Baram Kayan\",\n    \"kyt\": \"Kayagar\",\n    \"kyu\": \"Western Kayah\",\n    \"kyv\": \"Kayort\",\n    \"kyw\": \"Kudmali\",\n    \"kyx\": \"Rapoisi\",\n    \"kyy\": \"Kambaira\",\n    \"kyz\": \"Kayabí\",\n    \"kza\": \"Western Karaboro\",\n    \"kzb\": \"Kaibobo\",\n    \"kzc\": \"Bondoukou Kulango\",\n    \"kzd\": \"Kadai\",\n    \"kze\": \"Kosena\",\n    \"kzf\": \"Da'a Kaili\",\n    \"kzg\": \"Kikai\",\n    \"kzi\": \"Kelabit\",\n    \"kzk\": \"Kazukuru\",\n    \"kzl\": \"Kayeli\",\n    \"kzm\": \"Kais\",\n    \"kzn\": \"Kokola\",\n    \"kzo\": \"Kaningi\",\n    \"kzp\": \"Kaidipang\",\n    \"kzq\": \"Kaike\",\n    \"kzr\": \"Karang\",\n    \"kzs\": \"Sugut Dusun\",\n    \"kzu\": \"Kayupulau\",\n   
 \"kzv\": \"Komyandaret\",\n    \"kzw\": \"Karirí-Xocó\",\n    \"kzx\": \"Kamarian\",\n    \"kzy\": \"Kango (Tshopo District)\",\n    \"kzz\": \"Kalabra\",\n    \"la\": \"Latin\",\n    \"laa\": \"Southern Subanen\",\n    \"lab\": \"Linear A\",\n    \"lac\": \"Lacandon\",\n    \"lad\": \"Ladino\",\n    \"lae\": \"Pattani\",\n    \"laf\": \"Lafofa\",\n    \"lag\": \"Langi\",\n    \"lah\": \"Lahnda\",\n    \"lai\": \"Lambya\",\n    \"laj\": \"Lango (Uganda)\",\n    \"lal\": \"Lalia\",\n    \"lam\": \"Lamba\",\n    \"lan\": \"Laru\",\n    \"lap\": \"Laka (Chad)\",\n    \"laq\": \"Qabiao\",\n    \"lar\": \"Larteh\",\n    \"las\": \"Lama (Togo)\",\n    \"lau\": \"Laba\",\n    \"law\": \"Lauje\",\n    \"lax\": \"Tiwa\",\n    \"lay\": \"Lama Bai\",\n    \"laz\": \"Aribwatsa\",\n    \"lb\": \"Luxembourgish; Letzeburgesch\",\n    \"lbb\": \"Label\",\n    \"lbc\": \"Lakkia\",\n    \"lbe\": \"Lak\",\n    \"lbf\": \"Tinani\",\n    \"lbg\": \"Laopang\",\n    \"lbi\": \"La'bi\",\n    \"lbj\": \"Ladakhi\",\n    \"lbk\": \"Central Bontok\",\n    \"lbl\": \"Libon Bikol\",\n    \"lbm\": \"Lodhi\",\n    \"lbn\": \"Rmeet\",\n    \"lbo\": \"Laven\",\n    \"lbq\": \"Wampar\",\n    \"lbr\": \"Lohorung\",\n    \"lbs\": \"Libyan Sign Language\",\n    \"lbt\": \"Lachi\",\n    \"lbu\": \"Labu\",\n    \"lbv\": \"Lavatbura-Lamusong\",\n    \"lbw\": \"Tolaki\",\n    \"lbx\": \"Lawangan\",\n    \"lby\": \"Lamalama; Lamu-Lamu\",\n    \"lbz\": \"Lardil\",\n    \"lcc\": \"Legenyem\",\n    \"lcd\": \"Lola\",\n    \"lce\": \"Loncong; Sekak\",\n    \"lcf\": \"Lubu\",\n    \"lch\": \"Luchazi\",\n    \"lcl\": \"Lisela\",\n    \"lcm\": \"Tungag\",\n    \"lcp\": \"Western Lawa\",\n    \"lcq\": \"Luhu\",\n    \"lcs\": \"Lisabata-Nuniali\",\n    \"lda\": \"Kla-Dan\",\n    \"ldb\": \"Dũya\",\n    \"ldd\": \"Luri\",\n    \"ldg\": \"Lenyima\",\n    \"ldh\": \"Lamja-Dengsa-Tola\",\n    \"ldi\": \"Laari\",\n    \"ldj\": \"Lemoro\",\n    \"ldk\": \"Leelau\",\n    \"ldl\": \"Kaan\",\n    \"ldm\": \"Landoma\",\n    
\"ldn\": \"Láadan\",\n    \"ldo\": \"Loo\",\n    \"ldp\": \"Tso\",\n    \"ldq\": \"Lufu\",\n    \"lea\": \"Lega-Shabunda\",\n    \"leb\": \"Lala-Bisa\",\n    \"lec\": \"Leco\",\n    \"led\": \"Lendu\",\n    \"lee\": \"Lyélé\",\n    \"lef\": \"Lelemi\",\n    \"leh\": \"Lenje\",\n    \"lei\": \"Lemio\",\n    \"lej\": \"Lengola\",\n    \"lek\": \"Leipon\",\n    \"lel\": \"Lele (Democratic Republic of Congo)\",\n    \"lem\": \"Nomaande\",\n    \"len\": \"Lenca\",\n    \"leo\": \"Leti (Cameroon)\",\n    \"lep\": \"Lepcha\",\n    \"leq\": \"Lembena\",\n    \"ler\": \"Lenkau\",\n    \"les\": \"Lese\",\n    \"let\": \"Lesing-Gelimi; Amio-Gelimi\",\n    \"leu\": \"Kara (Papua New Guinea)\",\n    \"lev\": \"Lamma\",\n    \"lew\": \"Ledo Kaili\",\n    \"lex\": \"Luang\",\n    \"ley\": \"Lemolang\",\n    \"lez\": \"Lezghian\",\n    \"lfa\": \"Lefa\",\n    \"lfn\": \"Lingua Franca Nova\",\n    \"lg\": \"Ganda; Luganda\",\n    \"lga\": \"Lungga\",\n    \"lgb\": \"Laghu\",\n    \"lgg\": \"Lugbara\",\n    \"lgh\": \"Laghuu\",\n    \"lgi\": \"Lengilu\",\n    \"lgk\": \"Lingarak; Neverver\",\n    \"lgl\": \"Wala\",\n    \"lgm\": \"Lega-Mwenga\",\n    \"lgn\": \"T'apo; Opuuo\",\n    \"lgo\": \"Lango (South Sudan)\",\n    \"lgq\": \"Logba\",\n    \"lgr\": \"Lengo\",\n    \"lgt\": \"Pahi\",\n    \"lgu\": \"Longgu\",\n    \"lgz\": \"Ligenza\",\n    \"lha\": \"Laha (Viet Nam)\",\n    \"lhh\": \"Laha (Indonesia)\",\n    \"lhi\": \"Lahu Shi\",\n    \"lhl\": \"Lahul Lohar\",\n    \"lhm\": \"Lhomi\",\n    \"lhn\": \"Lahanan\",\n    \"lhp\": \"Lhokpu\",\n    \"lhs\": \"Mlahsö\",\n    \"lht\": \"Lo-Toga\",\n    \"lhu\": \"Lahu\",\n    \"li\": \"Limburgan; Limburger; Limburgish\",\n    \"lia\": \"West-Central Limba\",\n    \"lib\": \"Likum\",\n    \"lic\": \"Hlai\",\n    \"lid\": \"Nyindrou\",\n    \"lie\": \"Likila\",\n    \"lif\": \"Limbu\",\n    \"lig\": \"Ligbi\",\n    \"lih\": \"Lihir\",\n    \"lij\": \"Ligurian\",\n    \"lik\": \"Lika\",\n    \"lil\": \"Lillooet\",\n    \"lio\": 
\"Liki\",\n    \"lip\": \"Sekpele\",\n    \"liq\": \"Libido\",\n    \"lir\": \"Liberian English\",\n    \"lis\": \"Lisu\",\n    \"liu\": \"Logorik\",\n    \"liv\": \"Liv\",\n    \"liw\": \"Col\",\n    \"lix\": \"Liabuku\",\n    \"liy\": \"Banda-Bambari\",\n    \"liz\": \"Libinza\",\n    \"lja\": \"Golpa\",\n    \"lje\": \"Rampi\",\n    \"lji\": \"Laiyolo\",\n    \"ljl\": \"Li'o\",\n    \"ljp\": \"Lampung Api\",\n    \"ljw\": \"Yirandali\",\n    \"ljx\": \"Yuru\",\n    \"lka\": \"Lakalei\",\n    \"lkb\": \"Kabras; Lukabaras\",\n    \"lkc\": \"Kucong\",\n    \"lkd\": \"Lakondê\",\n    \"lke\": \"Kenyi\",\n    \"lkh\": \"Lakha\",\n    \"lki\": \"Laki\",\n    \"lkj\": \"Remun\",\n    \"lkl\": \"Laeko-Libuat\",\n    \"lkm\": \"Kalaamaya\",\n    \"lkn\": \"Lakon; Vure\",\n    \"lko\": \"Khayo; Olukhayo\",\n    \"lkr\": \"Päri\",\n    \"lks\": \"Kisa; Olushisa\",\n    \"lkt\": \"Lakota\",\n    \"lku\": \"Kungkari\",\n    \"lky\": \"Lokoya\",\n    \"lla\": \"Lala-Roba\",\n    \"llb\": \"Lolo\",\n    \"llc\": \"Lele (Guinea)\",\n    \"lld\": \"Ladin\",\n    \"lle\": \"Lele (Papua New Guinea)\",\n    \"llf\": \"Hermit\",\n    \"llg\": \"Lole\",\n    \"llh\": \"Lamu\",\n    \"lli\": \"Teke-Laali\",\n    \"llj\": \"Ladji Ladji\",\n    \"llk\": \"Lelak\",\n    \"lll\": \"Lilau\",\n    \"llm\": \"Lasalimu\",\n    \"lln\": \"Lele (Chad)\",\n    \"llp\": \"North Efate\",\n    \"llq\": \"Lolak\",\n    \"lls\": \"Lithuanian Sign Language\",\n    \"llu\": \"Lau\",\n    \"llx\": \"Lauan\",\n    \"lma\": \"East Limba\",\n    \"lmb\": \"Merei\",\n    \"lmc\": \"Limilngan\",\n    \"lmd\": \"Lumun\",\n    \"lme\": \"Pévé\",\n    \"lmf\": \"South Lembata\",\n    \"lmg\": \"Lamogai\",\n    \"lmh\": \"Lambichhong\",\n    \"lmi\": \"Lombi\",\n    \"lmj\": \"West Lembata\",\n    \"lmk\": \"Lamkang\",\n    \"lml\": \"Hano\",\n    \"lmn\": \"Lambadi\",\n    \"lmo\": \"Lombard\",\n    \"lmp\": \"Limbum\",\n    \"lmq\": \"Lamatuka\",\n    \"lmr\": \"Lamalera\",\n    \"lmu\": \"Lamenu\",\n    
\"lmv\": \"Lomaiviti\",\n    \"lmw\": \"Lake Miwok\",\n    \"lmx\": \"Laimbue\",\n    \"lmy\": \"Lamboya\",\n    \"ln\": \"Lingala\",\n    \"lna\": \"Langbashe\",\n    \"lnb\": \"Mbalanhu\",\n    \"lnd\": \"Lundayeh; Lun Bawang\",\n    \"lng\": \"Langobardic\",\n    \"lnh\": \"Lanoh\",\n    \"lni\": \"Daantanai'\",\n    \"lnj\": \"Leningitij\",\n    \"lnl\": \"South Central Banda\",\n    \"lnm\": \"Langam\",\n    \"lnn\": \"Lorediakarkar\",\n    \"lns\": \"Lamnso'\",\n    \"lnu\": \"Longuda\",\n    \"lnw\": \"Lanima\",\n    \"lnz\": \"Lonzo\",\n    \"lo\": \"Lao\",\n    \"loa\": \"Loloda\",\n    \"lob\": \"Lobi\",\n    \"loc\": \"Inonhan\",\n    \"loe\": \"Saluan\",\n    \"lof\": \"Logol\",\n    \"log\": \"Logo\",\n    \"loh\": \"Narim\",\n    \"loi\": \"Loma (Côte d'Ivoire)\",\n    \"loj\": \"Lou\",\n    \"lok\": \"Loko\",\n    \"lol\": \"Mongo\",\n    \"lom\": \"Loma (Liberia)\",\n    \"lon\": \"Malawi Lomwe\",\n    \"loo\": \"Lombo\",\n    \"lop\": \"Lopa\",\n    \"loq\": \"Lobala\",\n    \"lor\": \"Téén\",\n    \"los\": \"Loniu\",\n    \"lot\": \"Otuho\",\n    \"lou\": \"Louisiana Creole\",\n    \"lov\": \"Lopi\",\n    \"low\": \"Tampias Lobu\",\n    \"lox\": \"Loun\",\n    \"loy\": \"Loke\",\n    \"loz\": \"Lozi\",\n    \"lpa\": \"Lelepa\",\n    \"lpe\": \"Lepki\",\n    \"lpn\": \"Long Phuri Naga\",\n    \"lpo\": \"Lipo\",\n    \"lpx\": \"Lopit\",\n    \"lqr\": \"Logir\",\n    \"lra\": \"Rara Bakati'\",\n    \"lrc\": \"Northern Luri\",\n    \"lre\": \"Laurentian\",\n    \"lrg\": \"Laragia\",\n    \"lri\": \"Marachi; Olumarachi\",\n    \"lrk\": \"Loarki\",\n    \"lrl\": \"Lari\",\n    \"lrm\": \"Marama; Olumarama\",\n    \"lrn\": \"Lorang\",\n    \"lro\": \"Laro\",\n    \"lrr\": \"Southern Yamphu\",\n    \"lrt\": \"Larantuka Malay\",\n    \"lrv\": \"Larevat\",\n    \"lrz\": \"Lemerig\",\n    \"lsa\": \"Lasgerdi\",\n    \"lsb\": \"Burundian Sign Language; Langue des Signes Burundaise\",\n    \"lsc\": \"Albarradas Sign Language; Lengua de señas Albarradas\",\n    
\"lsd\": \"Lishana Deni\",\n    \"lse\": \"Lusengo\",\n    \"lsh\": \"Lish\",\n    \"lsi\": \"Lashi\",\n    \"lsl\": \"Latvian Sign Language\",\n    \"lsm\": \"Saamia; Olusamia\",\n    \"lsn\": \"Tibetan Sign Language\",\n    \"lso\": \"Laos Sign Language\",\n    \"lsp\": \"Panamanian Sign Language; Lengua de Señas Panameñas\",\n    \"lsr\": \"Aruop\",\n    \"lss\": \"Lasi\",\n    \"lst\": \"Trinidad and Tobago Sign Language\",\n    \"lsv\": \"Sivia Sign Language\",\n    \"lsw\": \"Seychelles Sign Language; Lalang Siny Seselwa; Langue des Signes Seychelloise\",\n    \"lsy\": \"Mauritian Sign Language\",\n    \"lt\": \"Lithuanian\",\n    \"ltc\": \"Late Middle Chinese\",\n    \"ltg\": \"Latgalian\",\n    \"lth\": \"Thur\",\n    \"lti\": \"Leti (Indonesia)\",\n    \"ltn\": \"Latundê\",\n    \"lto\": \"Tsotso; Olutsotso\",\n    \"lts\": \"Tachoni; Lutachoni\",\n    \"ltu\": \"Latu\",\n    \"lu\": \"Luba-Katanga\",\n    \"lua\": \"Luba-Lulua\",\n    \"luc\": \"Aringa\",\n    \"lud\": \"Ludian\",\n    \"lue\": \"Luvale\",\n    \"luf\": \"Laua\",\n    \"lui\": \"Luiseno\",\n    \"luj\": \"Luna\",\n    \"luk\": \"Lunanakha\",\n    \"lul\": \"Olu'bo\",\n    \"lum\": \"Luimbi\",\n    \"lun\": \"Lunda\",\n    \"luo\": \"Luo (Kenya and Tanzania); Dholuo\",\n    \"lup\": \"Lumbu\",\n    \"luq\": \"Lucumi\",\n    \"lur\": \"Laura\",\n    \"lus\": \"Lushai\",\n    \"lut\": \"Lushootseed\",\n    \"luu\": \"Lumba-Yakkha\",\n    \"luv\": \"Luwati\",\n    \"luw\": \"Luo (Cameroon)\",\n    \"luy\": \"Luyia; Oluluyia\",\n    \"luz\": \"Southern Luri\",\n    \"lv\": \"Latvian\",\n    \"lva\": \"Maku'a\",\n    \"lvi\": \"Lavi\",\n    \"lvk\": \"Lavukaleve\",\n    \"lvs\": \"Standard Latvian\",\n    \"lvu\": \"Levuka\",\n    \"lwa\": \"Lwalu\",\n    \"lwe\": \"Lewo Eleng\",\n    \"lwg\": \"Wanga; Oluwanga\",\n    \"lwh\": \"White Lachi\",\n    \"lwl\": \"Eastern Lawa\",\n    \"lwm\": \"Laomian\",\n    \"lwo\": \"Luwo\",\n    \"lws\": \"Malawian Sign Language\",\n    \"lwt\": 
\"Lewotobi\",\n    \"lwu\": \"Lawu\",\n    \"lww\": \"Lewo\",\n    \"lxm\": \"Lakurumau\",\n    \"lya\": \"Layakha\",\n    \"lyg\": \"Lyngngam\",\n    \"lyn\": \"Luyana\",\n    \"lzh\": \"Literary Chinese\",\n    \"lzl\": \"Litzlitz\",\n    \"lzn\": \"Leinong Naga\",\n    \"lzz\": \"Laz\",\n    \"maa\": \"San Jerónimo Tecóatl Mazatec\",\n    \"mab\": \"Yutanduchi Mixtec\",\n    \"mad\": \"Madurese\",\n    \"mae\": \"Bo-Rukul\",\n    \"maf\": \"Mafa\",\n    \"mag\": \"Magahi\",\n    \"mai\": \"Maithili\",\n    \"maj\": \"Jalapa De Díaz Mazatec\",\n    \"mak\": \"Makasar\",\n    \"mam\": \"Mam\",\n    \"man\": \"Mandingo; Manding\",\n    \"map\": \"Austronesian languages\",\n    \"maq\": \"Chiquihuitlán Mazatec\",\n    \"mas\": \"Masai\",\n    \"mat\": \"San Francisco Matlatzinca\",\n    \"mau\": \"Huautla Mazatec\",\n    \"mav\": \"Sateré-Mawé\",\n    \"maw\": \"Mampruli\",\n    \"max\": \"North Moluccan Malay\",\n    \"maz\": \"Central Mazahua\",\n    \"mba\": \"Higaonon\",\n    \"mbb\": \"Western Bukidnon Manobo\",\n    \"mbc\": \"Macushi\",\n    \"mbd\": \"Dibabawon Manobo\",\n    \"mbe\": \"Molale\",\n    \"mbf\": \"Baba Malay\",\n    \"mbh\": \"Mangseng\",\n    \"mbi\": \"Ilianen Manobo\",\n    \"mbj\": \"Nadëb\",\n    \"mbk\": \"Malol\",\n    \"mbl\": \"Maxakalí\",\n    \"mbm\": \"Ombamba\",\n    \"mbn\": \"Macaguán\",\n    \"mbo\": \"Mbo (Cameroon)\",\n    \"mbp\": \"Malayo\",\n    \"mbq\": \"Maisin\",\n    \"mbr\": \"Nukak Makú\",\n    \"mbs\": \"Sarangani Manobo\",\n    \"mbt\": \"Matigsalug Manobo\",\n    \"mbu\": \"Mbula-Bwazza\",\n    \"mbv\": \"Mbulungish\",\n    \"mbw\": \"Maring\",\n    \"mbx\": \"Mari (East Sepik Province)\",\n    \"mby\": \"Memoni\",\n    \"mbz\": \"Amoltepec Mixtec\",\n    \"mca\": \"Maca\",\n    \"mcb\": \"Machiguenga\",\n    \"mcc\": \"Bitur\",\n    \"mcd\": \"Sharanahua\",\n    \"mce\": \"Itundujia Mixtec\",\n    \"mcf\": \"Matsés\",\n    \"mcg\": \"Mapoyo\",\n    \"mch\": \"Maquiritari\",\n    \"mci\": \"Mese\",\n    \"mcj\": 
\"Mvanip\",\n    \"mck\": \"Mbunda\",\n    \"mcl\": \"Macaguaje\",\n    \"mcm\": \"Malaccan Creole Portuguese\",\n    \"mcn\": \"Masana\",\n    \"mco\": \"Coatlán Mixe\",\n    \"mcp\": \"Makaa\",\n    \"mcq\": \"Ese\",\n    \"mcr\": \"Menya\",\n    \"mcs\": \"Mambai\",\n    \"mct\": \"Mengisa\",\n    \"mcu\": \"Cameroon Mambila\",\n    \"mcv\": \"Minanibai\",\n    \"mcw\": \"Mawa (Chad)\",\n    \"mcx\": \"Mpiemo\",\n    \"mcy\": \"South Watut\",\n    \"mcz\": \"Mawan\",\n    \"mda\": \"Mada (Nigeria)\",\n    \"mdb\": \"Morigi\",\n    \"mdc\": \"Male (Papua New Guinea)\",\n    \"mdd\": \"Mbum\",\n    \"mde\": \"Maba (Chad)\",\n    \"mdf\": \"Moksha\",\n    \"mdg\": \"Massalat\",\n    \"mdh\": \"Maguindanaon\",\n    \"mdi\": \"Mamvu\",\n    \"mdj\": \"Mangbetu\",\n    \"mdk\": \"Mangbutu\",\n    \"mdl\": \"Maltese Sign Language\",\n    \"mdm\": \"Mayogo\",\n    \"mdn\": \"Mbati\",\n    \"mdp\": \"Mbala\",\n    \"mdq\": \"Mbole\",\n    \"mdr\": \"Mandar\",\n    \"mds\": \"Maria (Papua New Guinea)\",\n    \"mdt\": \"Mbere\",\n    \"mdu\": \"Mboko\",\n    \"mdv\": \"Santa Lucía Monteverde Mixtec\",\n    \"mdw\": \"Mbosi\",\n    \"mdx\": \"Dizin\",\n    \"mdy\": \"Male (Ethiopia)\",\n    \"mdz\": \"Suruí Do Pará\",\n    \"mea\": \"Menka\",\n    \"meb\": \"Ikobi\",\n    \"mec\": \"Marra\",\n    \"med\": \"Melpa\",\n    \"mee\": \"Mengen\",\n    \"mef\": \"Megam\",\n    \"meh\": \"Southwestern Tlaxiaco Mixtec\",\n    \"mei\": \"Midob\",\n    \"mej\": \"Meyah\",\n    \"mek\": \"Mekeo\",\n    \"mel\": \"Central Melanau\",\n    \"mem\": \"Mangala\",\n    \"men\": \"Mende (Sierra Leone)\",\n    \"meo\": \"Kedah Malay\",\n    \"mep\": \"Miriwoong\",\n    \"meq\": \"Merey\",\n    \"mer\": \"Meru\",\n    \"mes\": \"Masmaje\",\n    \"met\": \"Mato\",\n    \"meu\": \"Motu\",\n    \"mev\": \"Mano\",\n    \"mew\": \"Maaka\",\n    \"mey\": \"Hassaniyya\",\n    \"mez\": \"Menominee\",\n    \"mfa\": \"Pattani Malay\",\n    \"mfb\": \"Bangka\",\n    \"mfc\": \"Mba\",\n    \"mfd\": 
\"Mendankwe-Nkwen\",\n    \"mfe\": \"Morisyen\",\n    \"mff\": \"Naki\",\n    \"mfg\": \"Mogofin\",\n    \"mfh\": \"Matal\",\n    \"mfi\": \"Wandala\",\n    \"mfj\": \"Mefele\",\n    \"mfk\": \"North Mofu\",\n    \"mfl\": \"Putai\",\n    \"mfm\": \"Marghi South\",\n    \"mfn\": \"Cross River Mbembe\",\n    \"mfo\": \"Mbe\",\n    \"mfp\": \"Makassar Malay\",\n    \"mfq\": \"Moba\",\n    \"mfr\": \"Marrithiyel\",\n    \"mfs\": \"Mexican Sign Language\",\n    \"mft\": \"Mokerang\",\n    \"mfu\": \"Mbwela\",\n    \"mfv\": \"Mandjak\",\n    \"mfw\": \"Mulaha\",\n    \"mfx\": \"Melo\",\n    \"mfy\": \"Mayo\",\n    \"mfz\": \"Mabaan\",\n    \"mg\": \"Malagasy\",\n    \"mga\": \"Middle Irish (900-1200)\",\n    \"mgb\": \"Mararit\",\n    \"mgc\": \"Morokodo\",\n    \"mgd\": \"Moru\",\n    \"mge\": \"Mango\",\n    \"mgf\": \"Maklew\",\n    \"mgg\": \"Mpumpong\",\n    \"mgh\": \"Makhuwa-Meetto\",\n    \"mgi\": \"Lijili\",\n    \"mgj\": \"Abureni\",\n    \"mgk\": \"Mawes\",\n    \"mgl\": \"Maleu-Kilenge\",\n    \"mgm\": \"Mambae\",\n    \"mgn\": \"Mbangi\",\n    \"mgo\": \"Meta'\",\n    \"mgp\": \"Eastern Magar\",\n    \"mgq\": \"Malila\",\n    \"mgr\": \"Mambwe-Lungu\",\n    \"mgs\": \"Manda (Tanzania)\",\n    \"mgt\": \"Mongol\",\n    \"mgu\": \"Mailu\",\n    \"mgv\": \"Matengo\",\n    \"mgw\": \"Matumbi\",\n    \"mgy\": \"Mbunga\",\n    \"mgz\": \"Mbugwe\",\n    \"mh\": \"Marshallese\",\n    \"mha\": \"Manda (India)\",\n    \"mhb\": \"Mahongwe\",\n    \"mhc\": \"Mocho\",\n    \"mhd\": \"Mbugu\",\n    \"mhe\": \"Besisi; Mah Meri\",\n    \"mhf\": \"Mamaa\",\n    \"mhg\": \"Margu\",\n    \"mhi\": \"Ma'di\",\n    \"mhj\": \"Mogholi\",\n    \"mhk\": \"Mungaka\",\n    \"mhl\": \"Mauwake\",\n    \"mhm\": \"Makhuwa-Moniga\",\n    \"mhn\": \"Mócheno\",\n    \"mho\": \"Mashi (Zambia)\",\n    \"mhp\": \"Balinese Malay\",\n    \"mhq\": \"Mandan\",\n    \"mhr\": \"Eastern Mari\",\n    \"mhs\": \"Buru (Indonesia)\",\n    \"mht\": \"Mandahuaca\",\n    \"mhu\": \"Digaro-Mishmi; Darang 
Deng\",\n    \"mhw\": \"Mbukushu\",\n    \"mhx\": \"Maru; Lhaovo\",\n    \"mhy\": \"Ma'anyan\",\n    \"mhz\": \"Mor (Mor Islands)\",\n    \"mi\": \"Maori\",\n    \"mia\": \"Miami\",\n    \"mib\": \"Atatláhuca Mixtec\",\n    \"mic\": \"Mi'kmaq; Micmac\",\n    \"mid\": \"Mandaic\",\n    \"mie\": \"Ocotepec Mixtec\",\n    \"mif\": \"Mofu-Gudur\",\n    \"mig\": \"San Miguel El Grande Mixtec\",\n    \"mih\": \"Chayuco Mixtec\",\n    \"mii\": \"Chigmecatitlán Mixtec\",\n    \"mij\": \"Abar; Mungbam\",\n    \"mik\": \"Mikasuki\",\n    \"mil\": \"Peñoles Mixtec\",\n    \"mim\": \"Alacatlatzala Mixtec\",\n    \"min\": \"Minangkabau\",\n    \"mio\": \"Pinotepa Nacional Mixtec\",\n    \"mip\": \"Apasco-Apoala Mixtec\",\n    \"miq\": \"Mískito\",\n    \"mir\": \"Isthmus Mixe\",\n    \"mit\": \"Southern Puebla Mixtec\",\n    \"miu\": \"Cacaloxtepec Mixtec\",\n    \"miw\": \"Akoye\",\n    \"mix\": \"Mixtepec Mixtec\",\n    \"miy\": \"Ayutla Mixtec\",\n    \"miz\": \"Coatzospan Mixtec\",\n    \"mjb\": \"Makalero\",\n    \"mjc\": \"San Juan Colorado Mixtec\",\n    \"mjd\": \"Northwest Maidu\",\n    \"mje\": \"Muskum\",\n    \"mjg\": \"Tu\",\n    \"mjh\": \"Mwera (Nyasa)\",\n    \"mji\": \"Kim Mun\",\n    \"mjj\": \"Mawak\",\n    \"mjk\": \"Matukar\",\n    \"mjl\": \"Mandeali\",\n    \"mjm\": \"Medebur\",\n    \"mjn\": \"Ma (Papua New Guinea)\",\n    \"mjo\": \"Malankuravan\",\n    \"mjp\": \"Malapandaram\",\n    \"mjq\": \"Malaryan\",\n    \"mjr\": \"Malavedan\",\n    \"mjs\": \"Miship\",\n    \"mjt\": \"Sauria Paharia\",\n    \"mju\": \"Manna-Dora\",\n    \"mjv\": \"Mannan\",\n    \"mjw\": \"Karbi\",\n    \"mjx\": \"Mahali\",\n    \"mjy\": \"Mahican\",\n    \"mjz\": \"Majhi\",\n    \"mk\": \"Macedonian\",\n    \"mka\": \"Mbre\",\n    \"mkb\": \"Mal Paharia\",\n    \"mkc\": \"Siliput\",\n    \"mke\": \"Mawchi\",\n    \"mkf\": \"Miya\",\n    \"mkg\": \"Mak (China)\",\n    \"mkh\": \"Mon-Khmer languages\",\n    \"mki\": \"Dhatki\",\n    \"mkj\": \"Mokilese\",\n    \"mkk\": 
\"Byep\",\n    \"mkl\": \"Mokole\",\n    \"mkm\": \"Moklen\",\n    \"mkn\": \"Kupang Malay\",\n    \"mko\": \"Mingang Doso\",\n    \"mkp\": \"Moikodi\",\n    \"mkq\": \"Bay Miwok\",\n    \"mkr\": \"Malas\",\n    \"mks\": \"Silacayoapan Mixtec\",\n    \"mkt\": \"Vamale\",\n    \"mku\": \"Konyanka Maninka\",\n    \"mkv\": \"Mafea\",\n    \"mkw\": \"Kituba (Congo)\",\n    \"mkx\": \"Kinamiging Manobo\",\n    \"mky\": \"East Makian\",\n    \"mkz\": \"Makasae\",\n    \"ml\": \"Malayalam\",\n    \"mla\": \"Malo\",\n    \"mlb\": \"Mbule\",\n    \"mlc\": \"Cao Lan\",\n    \"mle\": \"Manambu\",\n    \"mlf\": \"Mal\",\n    \"mlh\": \"Mape\",\n    \"mli\": \"Malimpung\",\n    \"mlj\": \"Miltu\",\n    \"mlk\": \"Ilwana; Kiwilwana\",\n    \"mll\": \"Malua Bay\",\n    \"mlm\": \"Mulam\",\n    \"mln\": \"Malango\",\n    \"mlo\": \"Mlomp\",\n    \"mlp\": \"Bargam\",\n    \"mlq\": \"Western Maninkakan\",\n    \"mlr\": \"Vame\",\n    \"mls\": \"Masalit\",\n    \"mlu\": \"To'abaita\",\n    \"mlv\": \"Motlav; Mwotlap\",\n    \"mlw\": \"Moloko\",\n    \"mlx\": \"Malfaxal; Naha'ai\",\n    \"mlz\": \"Malaynon\",\n    \"mma\": \"Mama\",\n    \"mmb\": \"Momina\",\n    \"mmc\": \"Michoacán Mazahua\",\n    \"mmd\": \"Maonan\",\n    \"mme\": \"Mae\",\n    \"mmf\": \"Mundat\",\n    \"mmg\": \"North Ambrym\",\n    \"mmh\": \"Mehináku\",\n    \"mmi\": \"Musar\",\n    \"mmj\": \"Majhwar\",\n    \"mmk\": \"Mukha-Dora\",\n    \"mml\": \"Man Met\",\n    \"mmm\": \"Maii\",\n    \"mmn\": \"Mamanwa\",\n    \"mmo\": \"Mangga Buang\",\n    \"mmp\": \"Siawi\",\n    \"mmq\": \"Musak\",\n    \"mmr\": \"Western Xiangxi Miao\",\n    \"mmt\": \"Malalamai\",\n    \"mmu\": \"Mmaala\",\n    \"mmv\": \"Miriti\",\n    \"mmw\": \"Emae\",\n    \"mmx\": \"Madak\",\n    \"mmy\": \"Migaama\",\n    \"mmz\": \"Mabaale\",\n    \"mn\": \"Mongolian\",\n    \"mna\": \"Mbula\",\n    \"mnb\": \"Muna\",\n    \"mnc\": \"Manchu\",\n    \"mnd\": \"Mondé\",\n    \"mne\": \"Naba\",\n    \"mnf\": \"Mundani\",\n    \"mng\": \"Eastern 
Mnong\",\n    \"mnh\": \"Mono (Democratic Republic of Congo)\",\n    \"mni\": \"Manipuri\",\n    \"mnj\": \"Munji\",\n    \"mnk\": \"Mandinka\",\n    \"mnl\": \"Tiale\",\n    \"mnm\": \"Mapena\",\n    \"mnn\": \"Southern Mnong\",\n    \"mno\": \"Manobo languages\",\n    \"mnp\": \"Min Bei Chinese\",\n    \"mnq\": \"Minriq\",\n    \"mnr\": \"Mono (USA)\",\n    \"mns\": \"Mansi\",\n    \"mnu\": \"Mer\",\n    \"mnv\": \"Rennell-Bellona\",\n    \"mnw\": \"Mon\",\n    \"mnx\": \"Manikion\",\n    \"mny\": \"Manyawa\",\n    \"mnz\": \"Moni\",\n    \"moa\": \"Mwan\",\n    \"moc\": \"Mocoví\",\n    \"mod\": \"Mobilian\",\n    \"moe\": \"Innu; Montagnais\",\n    \"mog\": \"Mongondow\",\n    \"moh\": \"Mohawk\",\n    \"moi\": \"Mboi\",\n    \"moj\": \"Monzombo\",\n    \"mok\": \"Morori\",\n    \"mom\": \"Mangue\",\n    \"moo\": \"Monom\",\n    \"mop\": \"Mopán Maya\",\n    \"moq\": \"Mor (Bomberai Peninsula)\",\n    \"mor\": \"Moro\",\n    \"mos\": \"Mossi\",\n    \"mot\": \"Barí\",\n    \"mou\": \"Mogum\",\n    \"mov\": \"Mohave\",\n    \"mow\": \"Moi (Congo)\",\n    \"mox\": \"Molima\",\n    \"moy\": \"Shekkacho\",\n    \"moz\": \"Mukulu; Gergiko\",\n    \"mpa\": \"Mpoto\",\n    \"mpb\": \"Malak Malak; Mullukmulluk\",\n    \"mpc\": \"Mangarrayi\",\n    \"mpd\": \"Machinere\",\n    \"mpe\": \"Majang\",\n    \"mpg\": \"Marba\",\n    \"mph\": \"Maung\",\n    \"mpi\": \"Mpade\",\n    \"mpj\": \"Martu Wangka; Wangkajunga\",\n    \"mpk\": \"Mbara (Chad)\",\n    \"mpl\": \"Middle Watut\",\n    \"mpm\": \"Yosondúa Mixtec\",\n    \"mpn\": \"Mindiri\",\n    \"mpo\": \"Miu\",\n    \"mpp\": \"Migabac\",\n    \"mpq\": \"Matís\",\n    \"mpr\": \"Vangunu\",\n    \"mps\": \"Dadibi\",\n    \"mpt\": \"Mian\",\n    \"mpu\": \"Makuráp\",\n    \"mpv\": \"Mungkip\",\n    \"mpw\": \"Mapidian\",\n    \"mpx\": \"Misima-Panaeati\",\n    \"mpy\": \"Mapia\",\n    \"mpz\": \"Mpi\",\n    \"mqa\": \"Maba (Indonesia)\",\n    \"mqb\": \"Mbuko\",\n    \"mqc\": \"Mangole\",\n    \"mqe\": \"Matepi\",\n    
\"mqf\": \"Momuna\",\n    \"mqg\": \"Kota Bangun Kutai Malay\",\n    \"mqh\": \"Tlazoyaltepec Mixtec\",\n    \"mqi\": \"Mariri\",\n    \"mqj\": \"Mamasa\",\n    \"mqk\": \"Rajah Kabunsuwan Manobo\",\n    \"mql\": \"Mbelime\",\n    \"mqm\": \"South Marquesan\",\n    \"mqn\": \"Moronene\",\n    \"mqo\": \"Modole\",\n    \"mqp\": \"Manipa\",\n    \"mqq\": \"Minokok\",\n    \"mqr\": \"Mander\",\n    \"mqs\": \"West Makian\",\n    \"mqt\": \"Mok\",\n    \"mqu\": \"Mandari\",\n    \"mqv\": \"Mosimo\",\n    \"mqw\": \"Murupi\",\n    \"mqx\": \"Mamuju\",\n    \"mqy\": \"Manggarai\",\n    \"mqz\": \"Pano\",\n    \"mr\": \"Marathi\",\n    \"mra\": \"Mlabri\",\n    \"mrb\": \"Marino\",\n    \"mrc\": \"Maricopa\",\n    \"mrd\": \"Western Magar\",\n    \"mre\": \"Martha's Vineyard Sign Language\",\n    \"mrf\": \"Elseng\",\n    \"mrg\": \"Mising\",\n    \"mrh\": \"Mara Chin\",\n    \"mrj\": \"Western Mari\",\n    \"mrk\": \"Hmwaveke\",\n    \"mrl\": \"Mortlockese\",\n    \"mrm\": \"Merlav; Mwerlap\",\n    \"mrn\": \"Cheke Holo\",\n    \"mro\": \"Mru\",\n    \"mrp\": \"Morouas\",\n    \"mrq\": \"North Marquesan\",\n    \"mrr\": \"Maria (India)\",\n    \"mrs\": \"Maragus\",\n    \"mrt\": \"Marghi Central\",\n    \"mru\": \"Mono (Cameroon)\",\n    \"mrv\": \"Mangareva\",\n    \"mrw\": \"Maranao\",\n    \"mrx\": \"Maremgi; Dineor\",\n    \"mry\": \"Mandaya\",\n    \"mrz\": \"Marind\",\n    \"ms\": \"Malay (macrolanguage)\",\n    \"msb\": \"Masbatenyo\",\n    \"msc\": \"Sankaran Maninka\",\n    \"msd\": \"Yucatec Maya Sign Language\",\n    \"mse\": \"Musey\",\n    \"msf\": \"Mekwei\",\n    \"msg\": \"Moraid\",\n    \"msh\": \"Masikoro Malagasy\",\n    \"msi\": \"Sabah Malay\",\n    \"msj\": \"Ma (Democratic Republic of Congo)\",\n    \"msk\": \"Mansaka\",\n    \"msl\": \"Molof; Poule\",\n    \"msm\": \"Agusan Manobo\",\n    \"msn\": \"Vurës\",\n    \"mso\": \"Mombum\",\n    \"msp\": \"Maritsauá\",\n    \"msq\": \"Caac\",\n    \"msr\": \"Mongolian Sign Language\",\n    \"mss\": 
\"West Masela\",\n    \"msu\": \"Musom\",\n    \"msv\": \"Maslam\",\n    \"msw\": \"Mansoanka\",\n    \"msx\": \"Moresada\",\n    \"msy\": \"Aruamu\",\n    \"msz\": \"Momare\",\n    \"mt\": \"Maltese\",\n    \"mta\": \"Cotabato Manobo\",\n    \"mtb\": \"Anyin Morofo\",\n    \"mtc\": \"Munit\",\n    \"mtd\": \"Mualang\",\n    \"mte\": \"Mono (Solomon Islands)\",\n    \"mtf\": \"Murik (Papua New Guinea)\",\n    \"mtg\": \"Una\",\n    \"mth\": \"Munggui\",\n    \"mti\": \"Maiwa (Papua New Guinea)\",\n    \"mtj\": \"Moskona\",\n    \"mtk\": \"Mbe'\",\n    \"mtl\": \"Montol\",\n    \"mtm\": \"Mator\",\n    \"mtn\": \"Matagalpa\",\n    \"mto\": \"Totontepec Mixe\",\n    \"mtp\": \"Wichí Lhamtés Nocten\",\n    \"mtq\": \"Muong\",\n    \"mtr\": \"Mewari\",\n    \"mts\": \"Yora\",\n    \"mtt\": \"Mota\",\n    \"mtu\": \"Tututepec Mixtec\",\n    \"mtv\": \"Asaro'o\",\n    \"mtw\": \"Southern Binukidnon\",\n    \"mtx\": \"Tidaá Mixtec\",\n    \"mty\": \"Nabi\",\n    \"mua\": \"Mundang\",\n    \"mub\": \"Mubi\",\n    \"muc\": \"Ajumbu\",\n    \"mud\": \"Mednyj Aleut\",\n    \"mue\": \"Media Lengua\",\n    \"mug\": \"Musgu\",\n    \"muh\": \"Mündü\",\n    \"mui\": \"Musi\",\n    \"muj\": \"Mabire\",\n    \"muk\": \"Mugom\",\n    \"mum\": \"Maiwala\",\n    \"mun\": \"Munda languages\",\n    \"muo\": \"Nyong\",\n    \"mup\": \"Malvi\",\n    \"muq\": \"Eastern Xiangxi Miao\",\n    \"mur\": \"Murle\",\n    \"mus\": \"Creek\",\n    \"mut\": \"Western Muria\",\n    \"muu\": \"Yaaku\",\n    \"muv\": \"Muthuvan\",\n    \"mux\": \"Bo-Ung\",\n    \"muy\": \"Muyang\",\n    \"muz\": \"Mursi\",\n    \"mva\": \"Manam\",\n    \"mvb\": \"Mattole\",\n    \"mvd\": \"Mamboru\",\n    \"mve\": \"Marwari (Pakistan)\",\n    \"mvf\": \"Peripheral Mongolian\",\n    \"mvg\": \"Yucuañe Mixtec\",\n    \"mvh\": \"Mulgi\",\n    \"mvi\": \"Miyako\",\n    \"mvk\": \"Mekmek\",\n    \"mvl\": \"Mbara (Australia)\",\n    \"mvn\": \"Minaveha\",\n    \"mvo\": \"Marovo\",\n    \"mvp\": \"Duri\",\n    \"mvq\": 
\"Moere\",\n    \"mvr\": \"Marau\",\n    \"mvs\": \"Massep\",\n    \"mvt\": \"Mpotovoro\",\n    \"mvu\": \"Marfa\",\n    \"mvv\": \"Tagal Murut\",\n    \"mvw\": \"Machinga\",\n    \"mvx\": \"Meoswar\",\n    \"mvy\": \"Indus Kohistani\",\n    \"mvz\": \"Mesqan\",\n    \"mwa\": \"Mwatebu\",\n    \"mwb\": \"Juwal\",\n    \"mwc\": \"Are\",\n    \"mwe\": \"Mwera (Chimwera)\",\n    \"mwf\": \"Murrinh-Patha\",\n    \"mwg\": \"Aiklep\",\n    \"mwh\": \"Mouk-Aria\",\n    \"mwi\": \"Labo; Ninde\",\n    \"mwk\": \"Kita Maninkakan\",\n    \"mwl\": \"Mirandese\",\n    \"mwm\": \"Sar\",\n    \"mwn\": \"Nyamwanga\",\n    \"mwo\": \"Central Maewo\",\n    \"mwp\": \"Kala Lagaw Ya\",\n    \"mwq\": \"Mün Chin\",\n    \"mwr\": \"Marwari\",\n    \"mws\": \"Mwimbi-Muthambi\",\n    \"mwt\": \"Moken\",\n    \"mwu\": \"Mittu\",\n    \"mwv\": \"Mentawai\",\n    \"mww\": \"Hmong Daw\",\n    \"mwz\": \"Moingi\",\n    \"mxa\": \"Northwest Oaxaca Mixtec\",\n    \"mxb\": \"Tezoatlán Mixtec\",\n    \"mxc\": \"Manyika\",\n    \"mxd\": \"Modang\",\n    \"mxe\": \"Mele-Fila\",\n    \"mxf\": \"Malgbe\",\n    \"mxg\": \"Mbangala\",\n    \"mxh\": \"Mvuba\",\n    \"mxi\": \"Mozarabic\",\n    \"mxj\": \"Miju-Mishmi; Geman Deng\",\n    \"mxk\": \"Monumbo\",\n    \"mxl\": \"Maxi Gbe\",\n    \"mxm\": \"Meramera\",\n    \"mxn\": \"Moi (Indonesia)\",\n    \"mxo\": \"Mbowe\",\n    \"mxp\": \"Tlahuitoltepec Mixe\",\n    \"mxq\": \"Juquila Mixe\",\n    \"mxr\": \"Murik (Malaysia)\",\n    \"mxs\": \"Huitepec Mixtec\",\n    \"mxt\": \"Jamiltepec Mixtec\",\n    \"mxu\": \"Mada (Cameroon)\",\n    \"mxv\": \"Metlatónoc Mixtec\",\n    \"mxw\": \"Namo\",\n    \"mxx\": \"Mahou; Mawukakan\",\n    \"mxy\": \"Southeastern Nochixtlán Mixtec\",\n    \"mxz\": \"Central Masela\",\n    \"my\": \"Burmese\",\n    \"myb\": \"Mbay\",\n    \"myc\": \"Mayeka\",\n    \"mye\": \"Myene\",\n    \"myf\": \"Bambassi\",\n    \"myg\": \"Manta\",\n    \"myh\": \"Makah\",\n    \"myj\": \"Mangayat\",\n    \"myk\": \"Mamara Senoufo\",\n    
\"myl\": \"Moma\",\n    \"mym\": \"Me'en\",\n    \"myn\": \"Mayan languages\",\n    \"myo\": \"Anfillo\",\n    \"myp\": \"Pirahã\",\n    \"myr\": \"Muniche\",\n    \"mys\": \"Mesmes\",\n    \"myu\": \"Mundurukú\",\n    \"myv\": \"Erzya\",\n    \"myw\": \"Muyuw\",\n    \"myx\": \"Masaaba\",\n    \"myy\": \"Macuna\",\n    \"myz\": \"Classical Mandaic\",\n    \"mza\": \"Santa María Zacatepec Mixtec\",\n    \"mzb\": \"Tumzabt\",\n    \"mzc\": \"Madagascar Sign Language\",\n    \"mzd\": \"Malimba\",\n    \"mze\": \"Morawa\",\n    \"mzg\": \"Monastic Sign Language\",\n    \"mzh\": \"Wichí Lhamtés Güisnay\",\n    \"mzi\": \"Ixcatlán Mazatec\",\n    \"mzj\": \"Manya\",\n    \"mzk\": \"Nigeria Mambila\",\n    \"mzl\": \"Mazatlán Mixe\",\n    \"mzm\": \"Mumuye\",\n    \"mzn\": \"Mazanderani\",\n    \"mzo\": \"Matipuhy\",\n    \"mzp\": \"Movima\",\n    \"mzq\": \"Mori Atas\",\n    \"mzr\": \"Marúbo\",\n    \"mzs\": \"Macanese\",\n    \"mzt\": \"Mintil\",\n    \"mzu\": \"Inapang\",\n    \"mzv\": \"Manza\",\n    \"mzw\": \"Deg\",\n    \"mzx\": \"Mawayana\",\n    \"mzy\": \"Mozambican Sign Language\",\n    \"mzz\": \"Maiadomu\",\n    \"na\": \"Nauru\",\n    \"naa\": \"Namla\",\n    \"nab\": \"Southern Nambikuára\",\n    \"nac\": \"Narak\",\n    \"nae\": \"Naka'ela\",\n    \"naf\": \"Nabak\",\n    \"nag\": \"Naga Pidgin\",\n    \"nah\": \"Nahuatl languages\",\n    \"nai\": \"North American Indian languages\",\n    \"naj\": \"Nalu\",\n    \"nak\": \"Nakanai\",\n    \"nal\": \"Nalik\",\n    \"nam\": \"Ngan'gityemerri\",\n    \"nan\": \"Min Nan Chinese\",\n    \"nao\": \"Naaba\",\n    \"nap\": \"Neapolitan\",\n    \"naq\": \"Khoekhoe; Nama (Namibia)\",\n    \"nar\": \"Iguta\",\n    \"nas\": \"Naasioi\",\n    \"nat\": \"Ca̱hungwa̱rya̱; Hungworo\",\n    \"naw\": \"Nawuri\",\n    \"nax\": \"Nakwi\",\n    \"nay\": \"Ngarrindjeri\",\n    \"naz\": \"Coatepec Nahuatl\",\n    \"nb\": \"Norwegian Bokmål\",\n    \"nba\": \"Nyemba\",\n    \"nbb\": \"Ndoe\",\n    \"nbc\": \"Chang Naga\",\n    
\"nbd\": \"Ngbinda\",\n    \"nbe\": \"Konyak Naga\",\n    \"nbg\": \"Nagarchal\",\n    \"nbh\": \"Ngamo\",\n    \"nbi\": \"Mao Naga\",\n    \"nbj\": \"Ngarinyman\",\n    \"nbk\": \"Nake\",\n    \"nbm\": \"Ngbaka Ma'bo\",\n    \"nbn\": \"Kuri\",\n    \"nbo\": \"Nkukoli\",\n    \"nbp\": \"Nnam\",\n    \"nbq\": \"Nggem\",\n    \"nbr\": \"Numana\",\n    \"nbs\": \"Namibian Sign Language\",\n    \"nbt\": \"Na\",\n    \"nbu\": \"Rongmei Naga\",\n    \"nbv\": \"Ngamambo\",\n    \"nbw\": \"Southern Ngbandi\",\n    \"nby\": \"Ningera\",\n    \"nca\": \"Iyo\",\n    \"ncb\": \"Central Nicobarese\",\n    \"ncc\": \"Ponam\",\n    \"ncd\": \"Nachering\",\n    \"nce\": \"Yale\",\n    \"ncf\": \"Notsi\",\n    \"ncg\": \"Nisga'a\",\n    \"nch\": \"Central Huasteca Nahuatl\",\n    \"nci\": \"Classical Nahuatl\",\n    \"ncj\": \"Northern Puebla Nahuatl\",\n    \"nck\": \"Na-kara\",\n    \"ncl\": \"Michoacán Nahuatl\",\n    \"ncm\": \"Nambo\",\n    \"ncn\": \"Nauna\",\n    \"nco\": \"Sibe\",\n    \"ncq\": \"Northern Katang\",\n    \"ncr\": \"Ncane\",\n    \"ncs\": \"Nicaraguan Sign Language\",\n    \"nct\": \"Chothe Naga\",\n    \"ncu\": \"Chumburung\",\n    \"ncx\": \"Central Puebla Nahuatl\",\n    \"ncz\": \"Natchez\",\n    \"nd\": \"North Ndebele\",\n    \"nda\": \"Ndasa\",\n    \"ndb\": \"Kenswei Nsei\",\n    \"ndc\": \"Ndau\",\n    \"ndd\": \"Nde-Nsele-Nta\",\n    \"ndf\": \"Nadruvian\",\n    \"ndg\": \"Ndengereko\",\n    \"ndh\": \"Ndali\",\n    \"ndi\": \"Samba Leko\",\n    \"ndj\": \"Ndamba\",\n    \"ndk\": \"Ndaka\",\n    \"ndl\": \"Ndolo\",\n    \"ndm\": \"Ndam\",\n    \"ndn\": \"Ngundi\",\n    \"ndp\": \"Ndo\",\n    \"ndq\": \"Ndombe\",\n    \"ndr\": \"Ndoola\",\n    \"nds\": \"Low German; Low Saxon\",\n    \"ndt\": \"Ndunga\",\n    \"ndu\": \"Dugun\",\n    \"ndv\": \"Ndut\",\n    \"ndw\": \"Ndobo\",\n    \"ndx\": \"Nduga\",\n    \"ndy\": \"Lutos\",\n    \"ndz\": \"Ndogo\",\n    \"ne\": \"Nepali (macrolanguage)\",\n    \"nea\": \"Eastern Ngad'a\",\n    \"neb\": \"Toura 
(Côte d'Ivoire)\",\n    \"nec\": \"Nedebang\",\n    \"ned\": \"Nde-Gbite\",\n    \"nee\": \"Nêlêmwa-Nixumwak\",\n    \"nef\": \"Nefamese\",\n    \"neg\": \"Negidal\",\n    \"neh\": \"Nyenkha\",\n    \"nei\": \"Neo-Hittite\",\n    \"nej\": \"Neko\",\n    \"nek\": \"Neku\",\n    \"nem\": \"Nemi\",\n    \"nen\": \"Nengone\",\n    \"neo\": \"Ná-Meo\",\n    \"neq\": \"North Central Mixe\",\n    \"ner\": \"Yahadian\",\n    \"nes\": \"Bhoti Kinnauri\",\n    \"net\": \"Nete\",\n    \"neu\": \"Neo\",\n    \"nev\": \"Nyaheun\",\n    \"new\": \"Newari; Nepal Bhasa\",\n    \"nex\": \"Neme\",\n    \"ney\": \"Neyo\",\n    \"nez\": \"Nez Perce\",\n    \"nfa\": \"Dhao\",\n    \"nfd\": \"Ahwai\",\n    \"nfl\": \"Ayiwo; Äiwoo\",\n    \"nfr\": \"Nafaanra\",\n    \"nfu\": \"Mfumte\",\n    \"ng\": \"Ndonga\",\n    \"nga\": \"Ngbaka\",\n    \"ngb\": \"Northern Ngbandi\",\n    \"ngc\": \"Ngombe (Democratic Republic of Congo)\",\n    \"ngd\": \"Ngando (Central African Republic)\",\n    \"nge\": \"Ngemba\",\n    \"ngf\": \"Trans-New Guinea languages\",\n    \"ngg\": \"Ngbaka Manza\",\n    \"ngh\": \"Nǁng\",\n    \"ngi\": \"Ngizim\",\n    \"ngj\": \"Ngie\",\n    \"ngk\": \"Dalabon\",\n    \"ngl\": \"Lomwe\",\n    \"ngm\": \"Ngatik Men's Creole\",\n    \"ngn\": \"Ngwo\",\n    \"ngp\": \"Ngulu\",\n    \"ngq\": \"Ngurimi; Ngoreme\",\n    \"ngr\": \"Engdewu\",\n    \"ngs\": \"Gvoko\",\n    \"ngt\": \"Kriang; Ngeq\",\n    \"ngu\": \"Guerrero Nahuatl\",\n    \"ngv\": \"Nagumi\",\n    \"ngw\": \"Ngwaba\",\n    \"ngx\": \"Nggwahyi\",\n    \"ngy\": \"Tibea\",\n    \"ngz\": \"Ngungwel\",\n    \"nha\": \"Nhanda\",\n    \"nhb\": \"Beng\",\n    \"nhc\": \"Tabasco Nahuatl\",\n    \"nhd\": \"Chiripá; Ava Guaraní\",\n    \"nhe\": \"Eastern Huasteca Nahuatl\",\n    \"nhf\": \"Nhuwala\",\n    \"nhg\": \"Tetelcingo Nahuatl\",\n    \"nhh\": \"Nahari\",\n    \"nhi\": \"Zacatlán-Ahuacatlán-Tepetzintla Nahuatl\",\n    \"nhk\": \"Isthmus-Cosoleacaque Nahuatl\",\n    \"nhm\": \"Morelos Nahuatl\",\n    \"nhn\": 
\"Central Nahuatl\",\n    \"nho\": \"Takuu\",\n    \"nhp\": \"Isthmus-Pajapan Nahuatl\",\n    \"nhq\": \"Huaxcaleca Nahuatl\",\n    \"nhr\": \"Naro\",\n    \"nht\": \"Ometepec Nahuatl\",\n    \"nhu\": \"Noone\",\n    \"nhv\": \"Temascaltepec Nahuatl\",\n    \"nhw\": \"Western Huasteca Nahuatl\",\n    \"nhx\": \"Isthmus-Mecayapan Nahuatl\",\n    \"nhy\": \"Northern Oaxaca Nahuatl\",\n    \"nhz\": \"Santa María La Alta Nahuatl\",\n    \"nia\": \"Nias\",\n    \"nib\": \"Nakame\",\n    \"nic\": \"Niger-Kordofanian languages\",\n    \"nid\": \"Ngandi\",\n    \"nie\": \"Niellim\",\n    \"nif\": \"Nek\",\n    \"nig\": \"Ngalakgan\",\n    \"nih\": \"Nyiha (Tanzania)\",\n    \"nii\": \"Nii\",\n    \"nij\": \"Ngaju\",\n    \"nik\": \"Southern Nicobarese\",\n    \"nil\": \"Nila\",\n    \"nim\": \"Nilamba\",\n    \"nin\": \"Ninzo\",\n    \"nio\": \"Nganasan\",\n    \"niq\": \"Nandi\",\n    \"nir\": \"Nimboran\",\n    \"nis\": \"Nimi\",\n    \"nit\": \"Southeastern Kolami\",\n    \"niu\": \"Niuean\",\n    \"niv\": \"Gilyak\",\n    \"niw\": \"Nimo\",\n    \"nix\": \"Hema\",\n    \"niy\": \"Ngiti\",\n    \"niz\": \"Ningil\",\n    \"nja\": \"Nzanyi\",\n    \"njb\": \"Nocte Naga\",\n    \"njd\": \"Ndonde Hamba\",\n    \"njh\": \"Lotha Naga\",\n    \"nji\": \"Gudanji\",\n    \"njj\": \"Njen\",\n    \"njl\": \"Njalgulgule\",\n    \"njm\": \"Angami Naga\",\n    \"njn\": \"Liangmai Naga\",\n    \"njo\": \"Ao Naga\",\n    \"njr\": \"Njerep\",\n    \"njs\": \"Nisa\",\n    \"njt\": \"Ndyuka-Trio Pidgin\",\n    \"nju\": \"Ngadjunmaya\",\n    \"njx\": \"Kunyi\",\n    \"njy\": \"Njyem\",\n    \"njz\": \"Nyishi\",\n    \"nka\": \"Nkoya\",\n    \"nkb\": \"Khoibu Naga\",\n    \"nkc\": \"Nkongho\",\n    \"nkd\": \"Koireng\",\n    \"nke\": \"Duke\",\n    \"nkf\": \"Inpui Naga\",\n    \"nkg\": \"Nekgini\",\n    \"nkh\": \"Khezha Naga\",\n    \"nki\": \"Thangal Naga\",\n    \"nkj\": \"Nakai\",\n    \"nkk\": \"Nokuku\",\n    \"nkm\": \"Namat\",\n    \"nkn\": \"Nkangala\",\n    \"nko\": \"Nkonya\",\n 
   \"nkp\": \"Niuatoputapu\",\n    \"nkq\": \"Nkami\",\n    \"nkr\": \"Nukuoro\",\n    \"nks\": \"North Asmat\",\n    \"nkt\": \"Nyika (Tanzania)\",\n    \"nku\": \"Bouna Kulango\",\n    \"nkv\": \"Nyika (Malawi and Zambia)\",\n    \"nkw\": \"Nkutu\",\n    \"nkx\": \"Nkoroo\",\n    \"nkz\": \"Nkari\",\n    \"nl\": \"Dutch; Flemish\",\n    \"nla\": \"Ngombale\",\n    \"nlc\": \"Nalca\",\n    \"nle\": \"East Nyala\",\n    \"nlg\": \"Gela\",\n    \"nli\": \"Grangali\",\n    \"nlj\": \"Nyali\",\n    \"nlk\": \"Ninia Yali\",\n    \"nll\": \"Nihali\",\n    \"nlm\": \"Mankiyali\",\n    \"nlo\": \"Ngul\",\n    \"nlq\": \"Lao Naga\",\n    \"nlu\": \"Nchumbulu\",\n    \"nlv\": \"Orizaba Nahuatl\",\n    \"nlw\": \"Walangama\",\n    \"nlx\": \"Nahali\",\n    \"nly\": \"Nyamal\",\n    \"nlz\": \"Nalögo\",\n    \"nma\": \"Maram Naga\",\n    \"nmb\": \"Big Nambas; V'ënen Taut\",\n    \"nmc\": \"Ngam\",\n    \"nmd\": \"Ndumu\",\n    \"nme\": \"Mzieme Naga\",\n    \"nmf\": \"Tangkhul Naga (India)\",\n    \"nmg\": \"Kwasio\",\n    \"nmh\": \"Monsang Naga\",\n    \"nmi\": \"Nyam\",\n    \"nmj\": \"Ngombe (Central African Republic)\",\n    \"nmk\": \"Namakura\",\n    \"nml\": \"Ndemli\",\n    \"nmm\": \"Manangba\",\n    \"nmn\": \"ǃXóõ\",\n    \"nmo\": \"Moyon Naga\",\n    \"nmp\": \"Nimanbur\",\n    \"nmq\": \"Nambya\",\n    \"nmr\": \"Nimbari\",\n    \"nms\": \"Letemboi\",\n    \"nmt\": \"Namonuito\",\n    \"nmu\": \"Northeast Maidu\",\n    \"nmv\": \"Ngamini\",\n    \"nmw\": \"Nimoa; Rifao\",\n    \"nmx\": \"Nama (Papua New Guinea)\",\n    \"nmy\": \"Namuyi\",\n    \"nmz\": \"Nawdm\",\n    \"nn\": \"Norwegian Nynorsk\",\n    \"nna\": \"Nyangumarta\",\n    \"nnb\": \"Nande\",\n    \"nnc\": \"Nancere\",\n    \"nnd\": \"West Ambae\",\n    \"nne\": \"Ngandyera\",\n    \"nnf\": \"Ngaing\",\n    \"nng\": \"Maring Naga\",\n    \"nnh\": \"Ngiemboon\",\n    \"nni\": \"North Nuaulu\",\n    \"nnj\": \"Nyangatom\",\n    \"nnk\": \"Nankina\",\n    \"nnl\": \"Northern Rengma Naga\",\n    
\"nnm\": \"Namia\",\n    \"nnn\": \"Ngete\",\n    \"nnp\": \"Wancho Naga\",\n    \"nnq\": \"Ngindo\",\n    \"nnr\": \"Narungga\",\n    \"nnt\": \"Nanticoke\",\n    \"nnu\": \"Dwang\",\n    \"nnv\": \"Nugunu (Australia)\",\n    \"nnw\": \"Southern Nuni\",\n    \"nny\": \"Nyangga\",\n    \"nnz\": \"Nda'nda'\",\n    \"no\": \"Norwegian\",\n    \"noa\": \"Woun Meu\",\n    \"noc\": \"Nuk\",\n    \"nod\": \"Northern Thai\",\n    \"noe\": \"Nimadi\",\n    \"nof\": \"Nomane\",\n    \"nog\": \"Nogai\",\n    \"noh\": \"Nomu\",\n    \"noi\": \"Noiri\",\n    \"noj\": \"Nonuya\",\n    \"nok\": \"Nooksack\",\n    \"nol\": \"Nomlaki\",\n    \"nom\": \"Nocamán\",\n    \"non\": \"Old Norse\",\n    \"nop\": \"Numanggang\",\n    \"noq\": \"Ngongo\",\n    \"nos\": \"Eastern Nisu\",\n    \"not\": \"Nomatsiguenga\",\n    \"nou\": \"Ewage-Notu\",\n    \"nov\": \"Novial\",\n    \"now\": \"Nyambo\",\n    \"noy\": \"Noy\",\n    \"noz\": \"Nayi\",\n    \"npa\": \"Nar Phu\",\n    \"npb\": \"Nupbikha\",\n    \"npg\": \"Ponyo-Gongwang Naga\",\n    \"nph\": \"Phom Naga\",\n    \"npi\": \"Nepali (individual language)\",\n    \"npl\": \"Southeastern Puebla Nahuatl\",\n    \"npn\": \"Mondropolon\",\n    \"npo\": \"Pochuri Naga\",\n    \"nps\": \"Nipsan\",\n    \"npu\": \"Puimei Naga\",\n    \"npx\": \"Noipx\",\n    \"npy\": \"Napu\",\n    \"nqg\": \"Southern Nago\",\n    \"nqk\": \"Kura Ede Nago\",\n    \"nql\": \"Ngendelengo\",\n    \"nqm\": \"Ndom\",\n    \"nqn\": \"Nen\",\n    \"nqo\": \"N'Ko; N’Ko\",\n    \"nqq\": \"Kyan-Karyaw Naga\",\n    \"nqt\": \"Nteng\",\n    \"nqy\": \"Akyaung Ari Naga\",\n    \"nr\": \"South Ndebele\",\n    \"nra\": \"Ngom\",\n    \"nrb\": \"Nara\",\n    \"nrc\": \"Noric\",\n    \"nre\": \"Southern Rengma Naga\",\n    \"nrf\": \"Jèrriais; Guernésiais\",\n    \"nrg\": \"Narango\",\n    \"nri\": \"Chokri Naga\",\n    \"nrk\": \"Ngarla\",\n    \"nrl\": \"Ngarluma\",\n    \"nrm\": \"Narom\",\n    \"nrn\": \"Norn\",\n    \"nrp\": \"North Picene\",\n    \"nrr\": \"Norra; 
Nora\",\n    \"nrt\": \"Northern Kalapuya\",\n    \"nru\": \"Narua\",\n    \"nrx\": \"Ngurmbur\",\n    \"nrz\": \"Lala\",\n    \"nsa\": \"Sangtam Naga\",\n    \"nsb\": \"Lower Nossob\",\n    \"nsc\": \"Nshi\",\n    \"nsd\": \"Southern Nisu\",\n    \"nse\": \"Nsenga\",\n    \"nsf\": \"Northwestern Nisu\",\n    \"nsg\": \"Ngasa\",\n    \"nsh\": \"Ngoshie\",\n    \"nsi\": \"Nigerian Sign Language\",\n    \"nsk\": \"Naskapi\",\n    \"nsl\": \"Norwegian Sign Language\",\n    \"nsm\": \"Sumi Naga\",\n    \"nsn\": \"Nehan\",\n    \"nso\": \"Pedi; Northern Sotho; Sepedi\",\n    \"nsp\": \"Nepalese Sign Language\",\n    \"nsq\": \"Northern Sierra Miwok\",\n    \"nsr\": \"Maritime Sign Language\",\n    \"nss\": \"Nali\",\n    \"nst\": \"Tase Naga\",\n    \"nsu\": \"Sierra Negra Nahuatl\",\n    \"nsv\": \"Southwestern Nisu\",\n    \"nsw\": \"Navut\",\n    \"nsx\": \"Nsongo\",\n    \"nsy\": \"Nasal\",\n    \"nsz\": \"Nisenan\",\n    \"ntd\": \"Northern Tidung\",\n    \"nte\": \"Nathembo\",\n    \"ntg\": \"Ngantangarra\",\n    \"nti\": \"Natioro\",\n    \"ntj\": \"Ngaanyatjarra\",\n    \"ntk\": \"Ikoma-Nata-Isenye\",\n    \"ntm\": \"Nateni\",\n    \"nto\": \"Ntomba\",\n    \"ntp\": \"Northern Tepehuan\",\n    \"ntr\": \"Delo\",\n    \"ntu\": \"Natügu\",\n    \"ntw\": \"Nottoway\",\n    \"ntx\": \"Tangkhul Naga (Myanmar)\",\n    \"nty\": \"Mantsi\",\n    \"ntz\": \"Natanzi\",\n    \"nua\": \"Yuanga\",\n    \"nub\": \"Nubian languages\",\n    \"nuc\": \"Nukuini\",\n    \"nud\": \"Ngala\",\n    \"nue\": \"Ngundu\",\n    \"nuf\": \"Nusu\",\n    \"nug\": \"Nungali\",\n    \"nuh\": \"Ndunda\",\n    \"nui\": \"Ngumbi\",\n    \"nuj\": \"Nyole\",\n    \"nuk\": \"Nuu-chah-nulth; Nuuchahnulth\",\n    \"nul\": \"Nusa Laut\",\n    \"num\": \"Niuafo'ou\",\n    \"nun\": \"Anong\",\n    \"nuo\": \"Nguôn\",\n    \"nup\": \"Nupe-Nupe-Tako\",\n    \"nuq\": \"Nukumanu\",\n    \"nur\": \"Nukuria\",\n    \"nus\": \"Nuer\",\n    \"nut\": \"Nung (Viet Nam)\",\n    \"nuu\": \"Ngbundu\",\n    \"nuv\": 
\"Northern Nuni\",\n    \"nuw\": \"Nguluwan\",\n    \"nux\": \"Mehek\",\n    \"nuy\": \"Nunggubuyu\",\n    \"nuz\": \"Tlamacazapa Nahuatl\",\n    \"nv\": \"Navajo; Navaho\",\n    \"nvh\": \"Nasarian\",\n    \"nvm\": \"Namiae\",\n    \"nvo\": \"Nyokon\",\n    \"nwa\": \"Nawathinehena\",\n    \"nwb\": \"Nyabwa\",\n    \"nwc\": \"Classical Newari; Classical Nepal Bhasa; Old Newari\",\n    \"nwe\": \"Ngwe\",\n    \"nwg\": \"Ngayawung\",\n    \"nwi\": \"Southwest Tanna\",\n    \"nwm\": \"Nyamusa-Molo\",\n    \"nwo\": \"Nauo\",\n    \"nwr\": \"Nawaru\",\n    \"nww\": \"Ndwewe\",\n    \"nwx\": \"Middle Newar\",\n    \"nwy\": \"Nottoway-Meherrin\",\n    \"nxa\": \"Nauete\",\n    \"nxd\": \"Ngando (Democratic Republic of Congo)\",\n    \"nxe\": \"Nage\",\n    \"nxg\": \"Ngad'a\",\n    \"nxi\": \"Nindi\",\n    \"nxk\": \"Koki Naga\",\n    \"nxl\": \"South Nuaulu\",\n    \"nxm\": \"Numidian\",\n    \"nxn\": \"Ngawun\",\n    \"nxo\": \"Ndambomo\",\n    \"nxq\": \"Naxi\",\n    \"nxr\": \"Ninggerum\",\n    \"nxx\": \"Nafri\",\n    \"ny\": \"Nyanja; Chewa; Chichewa\",\n    \"nyb\": \"Nyangbo\",\n    \"nyc\": \"Nyanga-li\",\n    \"nyd\": \"Nyore; Olunyole\",\n    \"nye\": \"Nyengo\",\n    \"nyf\": \"Giryama; Kigiryama\",\n    \"nyg\": \"Nyindu\",\n    \"nyh\": \"Nyikina\",\n    \"nyi\": \"Ama (Sudan)\",\n    \"nyj\": \"Nyanga\",\n    \"nyk\": \"Nyaneka\",\n    \"nyl\": \"Nyeu\",\n    \"nym\": \"Nyamwezi\",\n    \"nyn\": \"Nyankole\",\n    \"nyo\": \"Nyoro\",\n    \"nyp\": \"Nyang'i\",\n    \"nyq\": \"Nayini\",\n    \"nyr\": \"Nyiha (Malawi)\",\n    \"nys\": \"Nyungar\",\n    \"nyt\": \"Nyawaygi\",\n    \"nyu\": \"Nyungwe\",\n    \"nyv\": \"Nyulnyul\",\n    \"nyw\": \"Nyaw\",\n    \"nyx\": \"Nganyaywana\",\n    \"nyy\": \"Nyakyusa-Ngonde\",\n    \"nza\": \"Tigon Mbembe\",\n    \"nzb\": \"Njebi\",\n    \"nzd\": \"Nzadi\",\n    \"nzi\": \"Nzima\",\n    \"nzk\": \"Nzakara\",\n    \"nzm\": \"Zeme Naga\",\n    \"nzs\": \"New Zealand Sign Language\",\n    \"nzu\": \"Teke-Nzikou\",\n    
\"nzy\": \"Nzakambay\",\n    \"nzz\": \"Nanga Dama Dogon\",\n    \"oaa\": \"Orok\",\n    \"oac\": \"Oroch\",\n    \"oar\": \"Old Aramaic (up to 700 BCE); Ancient Aramaic (up to 700 BCE)\",\n    \"oav\": \"Old Avar\",\n    \"obi\": \"Obispeño\",\n    \"obk\": \"Southern Bontok\",\n    \"obl\": \"Oblo\",\n    \"obm\": \"Moabite\",\n    \"obo\": \"Obo Manobo\",\n    \"obr\": \"Old Burmese\",\n    \"obt\": \"Old Breton\",\n    \"obu\": \"Obulom\",\n    \"oc\": \"Occitan (post 1500)\",\n    \"oca\": \"Ocaina\",\n    \"och\": \"Old Chinese\",\n    \"ocm\": \"Old Cham\",\n    \"oco\": \"Old Cornish\",\n    \"ocu\": \"Atzingo Matlatzinca\",\n    \"oda\": \"Odut\",\n    \"odk\": \"Od\",\n    \"odt\": \"Old Dutch\",\n    \"odu\": \"Odual\",\n    \"ofo\": \"Ofo\",\n    \"ofs\": \"Old Frisian\",\n    \"ofu\": \"Efutop\",\n    \"ogb\": \"Ogbia\",\n    \"ogc\": \"Ogbah\",\n    \"oge\": \"Old Georgian\",\n    \"ogg\": \"Ogbogolo\",\n    \"ogo\": \"Khana\",\n    \"ogu\": \"Ogbronuagum\",\n    \"oht\": \"Old Hittite\",\n    \"ohu\": \"Old Hungarian\",\n    \"oia\": \"Oirata\",\n    \"oie\": \"Okolie\",\n    \"oin\": \"Inebu One\",\n    \"oj\": \"Ojibwa\",\n    \"ojb\": \"Northwestern Ojibwa\",\n    \"ojc\": \"Central Ojibwa\",\n    \"ojg\": \"Eastern Ojibwa\",\n    \"ojp\": \"Old Japanese\",\n    \"ojs\": \"Severn Ojibwa\",\n    \"ojv\": \"Ontong Java\",\n    \"ojw\": \"Western Ojibwa\",\n    \"oka\": \"Okanagan\",\n    \"okb\": \"Okobo\",\n    \"okc\": \"Kobo\",\n    \"okd\": \"Okodia\",\n    \"oke\": \"Okpe (Southwestern Edo)\",\n    \"okg\": \"Koko Babangk\",\n    \"okh\": \"Koresh-e Rostam\",\n    \"oki\": \"Okiek\",\n    \"okj\": \"Oko-Juwoi\",\n    \"okk\": \"Kwamtim One\",\n    \"okl\": \"Old Kentish Sign Language\",\n    \"okm\": \"Middle Korean (10th-16th cent.)\",\n    \"okn\": \"Oki-No-Erabu\",\n    \"oko\": \"Old Korean (3rd-9th cent.)\",\n    \"okr\": \"Kirike\",\n    \"oks\": \"Oko-Eni-Osayen\",\n    \"oku\": \"Oku\",\n    \"okv\": \"Orokaiva\",\n    \"okx\": \"Okpe 
(Northwestern Edo)\",\n    \"okz\": \"Old Khmer\",\n    \"ola\": \"Walungge\",\n    \"old\": \"Mochi\",\n    \"ole\": \"Olekha\",\n    \"olk\": \"Olkol\",\n    \"olm\": \"Oloma\",\n    \"olo\": \"Livvi\",\n    \"olr\": \"Olrat\",\n    \"olt\": \"Old Lithuanian\",\n    \"olu\": \"Kuvale\",\n    \"om\": \"Oromo\",\n    \"oma\": \"Omaha-Ponca\",\n    \"omb\": \"East Ambae\",\n    \"omc\": \"Mochica\",\n    \"omg\": \"Omagua\",\n    \"omi\": \"Omi\",\n    \"omk\": \"Omok\",\n    \"oml\": \"Ombo\",\n    \"omn\": \"Minoan\",\n    \"omo\": \"Utarmbung\",\n    \"omp\": \"Old Manipuri\",\n    \"omq\": \"Oto-Manguean languages\",\n    \"omr\": \"Old Marathi\",\n    \"omt\": \"Omotik\",\n    \"omu\": \"Omurano\",\n    \"omv\": \"Omotic languages\",\n    \"omw\": \"South Tairora\",\n    \"omx\": \"Old Mon\",\n    \"omy\": \"Old Malay\",\n    \"ona\": \"Ona\",\n    \"onb\": \"Lingao\",\n    \"one\": \"Oneida\",\n    \"ong\": \"Olo\",\n    \"oni\": \"Onin\",\n    \"onj\": \"Onjob\",\n    \"onk\": \"Kabore One\",\n    \"onn\": \"Onobasulu\",\n    \"ono\": \"Onondaga\",\n    \"onp\": \"Sartang\",\n    \"onr\": \"Northern One\",\n    \"ons\": \"Ono\",\n    \"ont\": \"Ontenu\",\n    \"onu\": \"Unua\",\n    \"onw\": \"Old Nubian\",\n    \"onx\": \"Onin Based Pidgin\",\n    \"ood\": \"Tohono O'odham\",\n    \"oog\": \"Ong\",\n    \"oon\": \"Önge\",\n    \"oor\": \"Oorlams\",\n    \"oos\": \"Old Ossetic\",\n    \"opa\": \"Okpamheri\",\n    \"opk\": \"Kopkaka\",\n    \"opm\": \"Oksapmin\",\n    \"opo\": \"Opao\",\n    \"opt\": \"Opata\",\n    \"opy\": \"Ofayé\",\n    \"or\": \"Oriya (macrolanguage); Odia (macrolanguage)\",\n    \"ora\": \"Oroha\",\n    \"orc\": \"Orma\",\n    \"ore\": \"Orejón\",\n    \"org\": \"Oring\",\n    \"orh\": \"Oroqen\",\n    \"orn\": \"Orang Kanaq\",\n    \"oro\": \"Orokolo\",\n    \"orr\": \"Oruma\",\n    \"ors\": \"Orang Seletar\",\n    \"ort\": \"Adivasi Oriya\",\n    \"oru\": \"Ormuri\",\n    \"orv\": \"Old Russian\",\n    \"orw\": \"Oro Win\",\n    
\"orx\": \"Oro\",\n    \"ory\": \"Odia (individual language); Oriya (individual language)\",\n    \"orz\": \"Ormu\",\n    \"os\": \"Ossetian; Ossetic\",\n    \"osa\": \"Osage\",\n    \"osc\": \"Oscan\",\n    \"osi\": \"Osing\",\n    \"osn\": \"Old Sundanese\",\n    \"oso\": \"Ososo\",\n    \"osp\": \"Old Spanish\",\n    \"ost\": \"Osatu\",\n    \"osu\": \"Southern One\",\n    \"osx\": \"Old Saxon\",\n    \"ota\": \"Ottoman Turkish (1500-1928)\",\n    \"otb\": \"Old Tibetan\",\n    \"otd\": \"Ot Danum\",\n    \"ote\": \"Mezquital Otomi\",\n    \"oti\": \"Oti\",\n    \"otk\": \"Old Turkish\",\n    \"otl\": \"Tilapa Otomi\",\n    \"otm\": \"Eastern Highland Otomi\",\n    \"otn\": \"Tenango Otomi\",\n    \"oto\": \"Otomian languages\",\n    \"otq\": \"Querétaro Otomi\",\n    \"otr\": \"Otoro\",\n    \"ots\": \"Estado de México Otomi\",\n    \"ott\": \"Temoaya Otomi\",\n    \"otu\": \"Otuke\",\n    \"otw\": \"Ottawa\",\n    \"otx\": \"Texcatepec Otomi\",\n    \"oty\": \"Old Tamil\",\n    \"otz\": \"Ixtenco Otomi\",\n    \"oua\": \"Tagargrent\",\n    \"oub\": \"Glio-Oubi\",\n    \"oue\": \"Oune\",\n    \"oui\": \"Old Uighur\",\n    \"oum\": \"Ouma\",\n    \"ovd\": \"Elfdalian; Övdalian\",\n    \"owi\": \"Owiniga\",\n    \"owl\": \"Old Welsh\",\n    \"oyb\": \"Oy\",\n    \"oyd\": \"Oyda\",\n    \"oym\": \"Wayampi\",\n    \"oyy\": \"Oya'oya\",\n    \"ozm\": \"Koonzime\",\n    \"pa\": \"Panjabi; Punjabi\",\n    \"paa\": \"Papuan languages\",\n    \"pab\": \"Parecís\",\n    \"pac\": \"Pacoh\",\n    \"pad\": \"Paumarí\",\n    \"pae\": \"Pagibete\",\n    \"paf\": \"Paranawát\",\n    \"pag\": \"Pangasinan\",\n    \"pah\": \"Tenharim\",\n    \"pai\": \"Pe\",\n    \"pak\": \"Parakanã\",\n    \"pal\": \"Pahlavi\",\n    \"pam\": \"Pampanga; Kapampangan\",\n    \"pao\": \"Northern Paiute\",\n    \"pap\": \"Papiamento\",\n    \"paq\": \"Parya\",\n    \"par\": \"Panamint; Timbisha\",\n    \"pas\": \"Papasena\",\n    \"pau\": \"Palauan\",\n    \"pav\": \"Pakaásnovos\",\n    \"paw\": 
\"Pawnee\",\n    \"pax\": \"Pankararé\",\n    \"pay\": \"Pech\",\n    \"paz\": \"Pankararú\",\n    \"pbb\": \"Páez\",\n    \"pbc\": \"Patamona\",\n    \"pbe\": \"Mezontla Popoloca\",\n    \"pbf\": \"Coyotepec Popoloca\",\n    \"pbg\": \"Paraujano\",\n    \"pbh\": \"E'ñapa Woromaipu\",\n    \"pbi\": \"Parkwa\",\n    \"pbl\": \"Mak (Nigeria)\",\n    \"pbm\": \"Puebla Mazatec\",\n    \"pbn\": \"Kpasam\",\n    \"pbo\": \"Papel\",\n    \"pbp\": \"Badyara\",\n    \"pbr\": \"Pangwa\",\n    \"pbs\": \"Central Pame\",\n    \"pbt\": \"Southern Pashto\",\n    \"pbu\": \"Northern Pashto\",\n    \"pbv\": \"Pnar\",\n    \"pby\": \"Pyu (Papua New Guinea)\",\n    \"pca\": \"Santa Inés Ahuatempan Popoloca\",\n    \"pcb\": \"Pear\",\n    \"pcc\": \"Bouyei\",\n    \"pcd\": \"Picard\",\n    \"pce\": \"Ruching Palaung\",\n    \"pcf\": \"Paliyan\",\n    \"pcg\": \"Paniya\",\n    \"pch\": \"Pardhan\",\n    \"pci\": \"Duruwa\",\n    \"pcj\": \"Parenga\",\n    \"pck\": \"Paite Chin\",\n    \"pcl\": \"Pardhi\",\n    \"pcm\": \"Nigerian Pidgin\",\n    \"pcn\": \"Piti\",\n    \"pcp\": \"Pacahuara\",\n    \"pcw\": \"Pyapun\",\n    \"pda\": \"Anam\",\n    \"pdc\": \"Pennsylvania German\",\n    \"pdi\": \"Pa Di\",\n    \"pdn\": \"Podena; Fedan\",\n    \"pdo\": \"Padoe\",\n    \"pdt\": \"Plautdietsch\",\n    \"pdu\": \"Kayan\",\n    \"pea\": \"Peranakan Indonesian\",\n    \"peb\": \"Eastern Pomo\",\n    \"ped\": \"Mala (Papua New Guinea)\",\n    \"pee\": \"Taje\",\n    \"pef\": \"Northeastern Pomo\",\n    \"peg\": \"Pengo\",\n    \"peh\": \"Bonan\",\n    \"pei\": \"Chichimeca-Jonaz\",\n    \"pej\": \"Northern Pomo\",\n    \"pek\": \"Penchal\",\n    \"pel\": \"Pekal\",\n    \"pem\": \"Phende\",\n    \"peo\": \"Old Persian (ca. 
600-400 B.C.)\",\n    \"pep\": \"Kunja\",\n    \"peq\": \"Southern Pomo\",\n    \"pes\": \"Iranian Persian\",\n    \"pev\": \"Pémono\",\n    \"pex\": \"Petats\",\n    \"pey\": \"Petjo\",\n    \"pez\": \"Eastern Penan\",\n    \"pfa\": \"Pááfang\",\n    \"pfe\": \"Pere\",\n    \"pfl\": \"Pfaelzisch\",\n    \"pga\": \"Sudanese Creole Arabic\",\n    \"pgd\": \"Gāndhārī\",\n    \"pgg\": \"Pangwali\",\n    \"pgi\": \"Pagi\",\n    \"pgk\": \"Rerep\",\n    \"pgl\": \"Primitive Irish\",\n    \"pgn\": \"Paelignian\",\n    \"pgs\": \"Pangseng\",\n    \"pgu\": \"Pagu\",\n    \"pgz\": \"Papua New Guinean Sign Language\",\n    \"pha\": \"Pa-Hng\",\n    \"phd\": \"Phudagi\",\n    \"phg\": \"Phuong\",\n    \"phh\": \"Phukha\",\n    \"phi\": \"Philippine languages\",\n    \"phj\": \"Pahari\",\n    \"phk\": \"Phake\",\n    \"phl\": \"Phalura; Palula\",\n    \"phm\": \"Phimbi\",\n    \"phn\": \"Phoenician\",\n    \"pho\": \"Phunoi\",\n    \"phq\": \"Phana'\",\n    \"phr\": \"Pahari-Potwari\",\n    \"pht\": \"Phu Thai\",\n    \"phu\": \"Phuan\",\n    \"phv\": \"Pahlavani\",\n    \"phw\": \"Phangduwali\",\n    \"pi\": \"Pali\",\n    \"pia\": \"Pima Bajo\",\n    \"pib\": \"Yine\",\n    \"pic\": \"Pinji\",\n    \"pid\": \"Piaroa\",\n    \"pie\": \"Piro\",\n    \"pif\": \"Pingelapese\",\n    \"pig\": \"Pisabo\",\n    \"pih\": \"Pitcairn-Norfolk\",\n    \"pij\": \"Pijao\",\n    \"pil\": \"Yom\",\n    \"pim\": \"Powhatan\",\n    \"pin\": \"Piame\",\n    \"pio\": \"Piapoco\",\n    \"pip\": \"Pero\",\n    \"pir\": \"Piratapuyo\",\n    \"pis\": \"Pijin\",\n    \"pit\": \"Pitta Pitta\",\n    \"piu\": \"Pintupi-Luritja\",\n    \"piv\": \"Pileni; Vaeakau-Taumako\",\n    \"piw\": \"Pimbwe\",\n    \"pix\": \"Piu\",\n    \"piy\": \"Piya-Kwonci\",\n    \"piz\": \"Pije\",\n    \"pjt\": \"Pitjantjatjara\",\n    \"pka\": \"Ardhamāgadhī Prākrit\",\n    \"pkb\": \"Pokomo; Kipfokomo\",\n    \"pkc\": \"Paekche\",\n    \"pkg\": \"Pak-Tong\",\n    \"pkh\": \"Pankhu\",\n    \"pkn\": \"Pakanha\",\n    \"pko\": 
\"Pökoot\",\n    \"pkp\": \"Pukapuka\",\n    \"pkr\": \"Attapady Kurumba\",\n    \"pks\": \"Pakistan Sign Language\",\n    \"pkt\": \"Maleng\",\n    \"pku\": \"Paku\",\n    \"pl\": \"Polish\",\n    \"pla\": \"Miani\",\n    \"plb\": \"Polonombauk\",\n    \"plc\": \"Central Palawano\",\n    \"pld\": \"Polari\",\n    \"ple\": \"Palu'e\",\n    \"plf\": \"Central Malayo-Polynesian languages\",\n    \"plg\": \"Pilagá\",\n    \"plh\": \"Paulohi\",\n    \"plj\": \"Polci\",\n    \"plk\": \"Kohistani Shina\",\n    \"pll\": \"Shwe Palaung\",\n    \"pln\": \"Palenquero\",\n    \"plo\": \"Oluta Popoluca\",\n    \"plq\": \"Palaic\",\n    \"plr\": \"Palaka Senoufo\",\n    \"pls\": \"San Marcos Tlacoyalco Popoloca; San Marcos Tlalcoyalco Popoloca\",\n    \"plt\": \"Plateau Malagasy\",\n    \"plu\": \"Palikúr\",\n    \"plv\": \"Southwest Palawano\",\n    \"plw\": \"Brooke's Point Palawano\",\n    \"ply\": \"Bolyu\",\n    \"plz\": \"Paluan\",\n    \"pma\": \"Paama\",\n    \"pmb\": \"Pambia\",\n    \"pmd\": \"Pallanganmiddang\",\n    \"pme\": \"Pwaamei\",\n    \"pmf\": \"Pamona\",\n    \"pmh\": \"Māhārāṣṭri Prākrit\",\n    \"pmi\": \"Northern Pumi\",\n    \"pmj\": \"Southern Pumi\",\n    \"pmk\": \"Pamlico\",\n    \"pml\": \"Lingua Franca\",\n    \"pmm\": \"Pomo\",\n    \"pmn\": \"Pam\",\n    \"pmo\": \"Pom\",\n    \"pmq\": \"Northern Pame\",\n    \"pmr\": \"Paynamar\",\n    \"pms\": \"Piemontese\",\n    \"pmt\": \"Tuamotuan\",\n    \"pmw\": \"Plains Miwok\",\n    \"pmx\": \"Poumei Naga\",\n    \"pmy\": \"Papuan Malay\",\n    \"pmz\": \"Southern Pame\",\n    \"pna\": \"Punan Bah-Biau\",\n    \"pnb\": \"Western Panjabi\",\n    \"pnc\": \"Pannei\",\n    \"pnd\": \"Mpinda\",\n    \"pne\": \"Western Penan\",\n    \"png\": \"Pangu; Pongu\",\n    \"pnh\": \"Penrhyn\",\n    \"pni\": \"Aoheng\",\n    \"pnj\": \"Pinjarup\",\n    \"pnk\": \"Paunaka\",\n    \"pnl\": \"Paleni\",\n    \"pnm\": \"Punan Batu 1\",\n    \"pnn\": \"Pinai-Hagahai\",\n    \"pno\": \"Panobo\",\n    \"pnp\": 
\"Pancana\",\n    \"pnq\": \"Pana (Burkina Faso)\",\n    \"pnr\": \"Panim\",\n    \"pns\": \"Ponosakan\",\n    \"pnt\": \"Pontic\",\n    \"pnu\": \"Jiongnai Bunu\",\n    \"pnv\": \"Pinigura\",\n    \"pnw\": \"Banyjima; Panytyima\",\n    \"pnx\": \"Phong-Kniang\",\n    \"pny\": \"Pinyin\",\n    \"pnz\": \"Pana (Central African Republic)\",\n    \"poc\": \"Poqomam\",\n    \"poe\": \"San Juan Atzingo Popoloca\",\n    \"pof\": \"Poke\",\n    \"pog\": \"Potiguára\",\n    \"poh\": \"Poqomchi'\",\n    \"poi\": \"Highland Popoluca\",\n    \"pok\": \"Pokangá\",\n    \"pom\": \"Southeastern Pomo\",\n    \"pon\": \"Pohnpeian\",\n    \"poo\": \"Central Pomo\",\n    \"pop\": \"Pwapwâ\",\n    \"poq\": \"Texistepec Popoluca\",\n    \"pos\": \"Sayula Popoluca\",\n    \"pot\": \"Potawatomi\",\n    \"pov\": \"Upper Guinea Crioulo\",\n    \"pow\": \"San Felipe Otlaltepec Popoloca\",\n    \"pox\": \"Polabian\",\n    \"poy\": \"Pogolo\",\n    \"poz\": \"Malayo-Polynesian languages\",\n    \"ppe\": \"Papi\",\n    \"ppi\": \"Paipai\",\n    \"ppk\": \"Uma\",\n    \"ppl\": \"Pipil; Nicarao\",\n    \"ppm\": \"Papuma\",\n    \"ppn\": \"Papapana\",\n    \"ppo\": \"Folopa\",\n    \"ppp\": \"Pelende\",\n    \"ppq\": \"Pei\",\n    \"pps\": \"San Luís Temalacayuca Popoloca\",\n    \"ppt\": \"Pare\",\n    \"ppu\": \"Papora\",\n    \"pqa\": \"Pa'a\",\n    \"pqe\": \"Eastern Malayo-Polynesian languages\",\n    \"pqm\": \"Malecite-Passamaquoddy\",\n    \"pqw\": \"Western Malayo-Polynesian languages\",\n    \"pra\": \"Prakrit languages\",\n    \"prc\": \"Parachi\",\n    \"prd\": \"Parsi-Dari\",\n    \"pre\": \"Principense\",\n    \"prf\": \"Paranan\",\n    \"prg\": \"Prussian\",\n    \"prh\": \"Porohanon\",\n    \"pri\": \"Paicî\",\n    \"prk\": \"Parauk\",\n    \"prl\": \"Peruvian Sign Language\",\n    \"prm\": \"Kibiri\",\n    \"prn\": \"Prasuni\",\n    \"pro\": \"Old Provençal (to 1500); Old Occitan (to 1500)\",\n    \"prp\": \"Parsi\",\n    \"prq\": \"Ashéninka Perené\",\n    \"prr\": \"Puri\",\n  
  \"prs\": \"Dari; Afghan Persian\",\n    \"prt\": \"Phai\",\n    \"pru\": \"Puragi\",\n    \"prw\": \"Parawen\",\n    \"prx\": \"Purik\",\n    \"prz\": \"Providencia Sign Language\",\n    \"ps\": \"Pushto; Pashto\",\n    \"psa\": \"Asue Awyu\",\n    \"psc\": \"Iranian Sign Language; Persian Sign Language\",\n    \"psd\": \"Plains Indian Sign Language\",\n    \"pse\": \"Central Malay\",\n    \"psg\": \"Penang Sign Language\",\n    \"psh\": \"Southwest Pashai; Southwest Pashayi\",\n    \"psi\": \"Southeast Pashai; Southeast Pashayi\",\n    \"psl\": \"Puerto Rican Sign Language\",\n    \"psm\": \"Pauserna\",\n    \"psn\": \"Panasuan\",\n    \"pso\": \"Polish Sign Language\",\n    \"psp\": \"Philippine Sign Language\",\n    \"psq\": \"Pasi\",\n    \"psr\": \"Portuguese Sign Language\",\n    \"pss\": \"Kaulong\",\n    \"pst\": \"Central Pashto\",\n    \"psu\": \"Sauraseni Prākrit\",\n    \"psw\": \"Port Sandwich\",\n    \"psy\": \"Piscataway\",\n    \"pt\": \"Portuguese\",\n    \"pta\": \"Pai Tavytera\",\n    \"pth\": \"Pataxó Hã-Ha-Hãe\",\n    \"pti\": \"Pindiini; Wangkatha\",\n    \"ptn\": \"Patani\",\n    \"pto\": \"Zo'é\",\n    \"ptp\": \"Patep\",\n    \"ptq\": \"Pattapu\",\n    \"ptr\": \"Piamatsina\",\n    \"ptt\": \"Enrekang\",\n    \"ptu\": \"Bambam\",\n    \"ptv\": \"Port Vato\",\n    \"ptw\": \"Pentlatch\",\n    \"pty\": \"Pathiya\",\n    \"pua\": \"Western Highland Purepecha\",\n    \"pub\": \"Purum\",\n    \"puc\": \"Punan Merap\",\n    \"pud\": \"Punan Aput\",\n    \"pue\": \"Puelche\",\n    \"puf\": \"Punan Merah\",\n    \"pug\": \"Phuie\",\n    \"pui\": \"Puinave\",\n    \"puj\": \"Punan Tubu\",\n    \"pum\": \"Puma\",\n    \"puo\": \"Puoc\",\n    \"pup\": \"Pulabu\",\n    \"puq\": \"Puquina\",\n    \"pur\": \"Puruborá\",\n    \"put\": \"Putoh\",\n    \"puu\": \"Punu\",\n    \"puw\": \"Puluwatese\",\n    \"pux\": \"Puare\",\n    \"puy\": \"Purisimeño\",\n    \"pwa\": \"Pawaia\",\n    \"pwb\": \"Panawa\",\n    \"pwg\": \"Gapapaiwa\",\n    \"pwi\": 
\"Patwin\",\n    \"pwm\": \"Molbog\",\n    \"pwn\": \"Paiwan\",\n    \"pwo\": \"Pwo Western Karen\",\n    \"pwr\": \"Powari\",\n    \"pww\": \"Pwo Northern Karen\",\n    \"pxm\": \"Quetzaltepec Mixe\",\n    \"pye\": \"Pye Krumen\",\n    \"pym\": \"Fyam\",\n    \"pyn\": \"Poyanáwa\",\n    \"pys\": \"Paraguayan Sign Language; Lengua de Señas del Paraguay\",\n    \"pyu\": \"Puyuma\",\n    \"pyx\": \"Pyu (Myanmar)\",\n    \"pyy\": \"Pyen\",\n    \"pzh\": \"Pazeh\",\n    \"pzn\": \"Jejara Naga; Para Naga\",\n    \"qu\": \"Quechua\",\n    \"qua\": \"Quapaw\",\n    \"qub\": \"Huallaga Huánuco Quechua\",\n    \"quc\": \"K'iche'; Quiché\",\n    \"qud\": \"Calderón Highland Quichua\",\n    \"quf\": \"Lambayeque Quechua\",\n    \"qug\": \"Chimborazo Highland Quichua\",\n    \"quh\": \"South Bolivian Quechua\",\n    \"qui\": \"Quileute\",\n    \"quk\": \"Chachapoyas Quechua\",\n    \"qul\": \"North Bolivian Quechua\",\n    \"qum\": \"Sipacapense\",\n    \"qun\": \"Quinault\",\n    \"qup\": \"Southern Pastaza Quechua\",\n    \"quq\": \"Quinqui\",\n    \"qur\": \"Yanahuanca Pasco Quechua\",\n    \"qus\": \"Santiago del Estero Quichua\",\n    \"quv\": \"Sacapulteco\",\n    \"quw\": \"Tena Lowland Quichua\",\n    \"qux\": \"Yauyos Quechua\",\n    \"quy\": \"Ayacucho Quechua\",\n    \"quz\": \"Cusco Quechua\",\n    \"qva\": \"Ambo-Pasco Quechua\",\n    \"qvc\": \"Cajamarca Quechua\",\n    \"qve\": \"Eastern Apurímac Quechua\",\n    \"qvh\": \"Huamalíes-Dos de Mayo Huánuco Quechua\",\n    \"qvi\": \"Imbabura Highland Quichua\",\n    \"qvj\": \"Loja Highland Quichua\",\n    \"qvl\": \"Cajatambo North Lima Quechua\",\n    \"qvm\": \"Margos-Yarowilca-Lauricocha Quechua\",\n    \"qvn\": \"North Junín Quechua\",\n    \"qvo\": \"Napo Lowland Quechua\",\n    \"qvp\": \"Pacaraos Quechua\",\n    \"qvs\": \"San Martín Quechua\",\n    \"qvw\": \"Huaylla Wanca Quechua\",\n    \"qvy\": \"Queyu\",\n    \"qvz\": \"Northern Pastaza Quichua\",\n    \"qwa\": \"Corongo Ancash Quechua\",\n    \"qwc\": 
\"Classical Quechua\",\n    \"qwe\": \"Quechuan (family)\",\n    \"qwh\": \"Huaylas Ancash Quechua\",\n    \"qwm\": \"Kuman (Russia)\",\n    \"qws\": \"Sihuas Ancash Quechua\",\n    \"qwt\": \"Kwalhioqua-Tlatskanai\",\n    \"qxa\": \"Chiquián Ancash Quechua\",\n    \"qxc\": \"Chincha Quechua\",\n    \"qxh\": \"Panao Huánuco Quechua\",\n    \"qxl\": \"Salasaca Highland Quichua\",\n    \"qxn\": \"Northern Conchucos Ancash Quechua\",\n    \"qxo\": \"Southern Conchucos Ancash Quechua\",\n    \"qxp\": \"Puno Quechua\",\n    \"qxq\": \"Qashqa'i\",\n    \"qxr\": \"Cañar Highland Quichua\",\n    \"qxs\": \"Southern Qiang\",\n    \"qxt\": \"Santa Ana de Tusi Pasco Quechua\",\n    \"qxu\": \"Arequipa-La Unión Quechua\",\n    \"qxw\": \"Jauja Wanca Quechua\",\n    \"qya\": \"Quenya\",\n    \"qyp\": \"Quiripi\",\n    \"raa\": \"Dungmali\",\n    \"rab\": \"Camling\",\n    \"rac\": \"Rasawa\",\n    \"rad\": \"Rade\",\n    \"raf\": \"Western Meohang\",\n    \"rag\": \"Logooli; Lulogooli\",\n    \"rah\": \"Rabha\",\n    \"rai\": \"Ramoaaina\",\n    \"raj\": \"Rajasthani\",\n    \"rak\": \"Tulu-Bohuai\",\n    \"ral\": \"Ralte\",\n    \"ram\": \"Canela\",\n    \"ran\": \"Riantana\",\n    \"rao\": \"Rao\",\n    \"rap\": \"Rapanui\",\n    \"raq\": \"Saam\",\n    \"rar\": \"Rarotongan; Cook Islands Maori\",\n    \"ras\": \"Tegali\",\n    \"rat\": \"Razajerdi\",\n    \"rau\": \"Raute\",\n    \"rav\": \"Sampang\",\n    \"raw\": \"Rawang\",\n    \"rax\": \"Rang\",\n    \"ray\": \"Rapa\",\n    \"raz\": \"Rahambuu\",\n    \"rbb\": \"Rumai Palaung\",\n    \"rbk\": \"Northern Bontok\",\n    \"rbl\": \"Miraya Bikol\",\n    \"rbp\": \"Barababaraba\",\n    \"rcf\": \"Réunion Creole French\",\n    \"rdb\": \"Rudbari\",\n    \"rea\": \"Rerau\",\n    \"reb\": \"Rembong\",\n    \"ree\": \"Rejang Kayan\",\n    \"reg\": \"Kara (Tanzania)\",\n    \"rei\": \"Reli\",\n    \"rej\": \"Rejang\",\n    \"rel\": \"Rendille\",\n    \"rem\": \"Remo\",\n    \"ren\": \"Rengao\",\n    \"rer\": \"Rer Bare\",\n    
\"res\": \"Reshe\",\n    \"ret\": \"Retta\",\n    \"rey\": \"Reyesano\",\n    \"rga\": \"Roria\",\n    \"rge\": \"Romano-Greek\",\n    \"rgk\": \"Rangkas\",\n    \"rgn\": \"Romagnol\",\n    \"rgr\": \"Resígaro\",\n    \"rgs\": \"Southern Roglai\",\n    \"rgu\": \"Ringgou\",\n    \"rhg\": \"Rohingya\",\n    \"rhp\": \"Yahang\",\n    \"ria\": \"Riang (India)\",\n    \"rib\": \"Bribri Sign Language\",\n    \"rif\": \"Tarifit\",\n    \"ril\": \"Riang Lang; Riang (Myanmar)\",\n    \"rim\": \"Nyaturu\",\n    \"rin\": \"Nungu\",\n    \"rir\": \"Ribun\",\n    \"rit\": \"Ritharrngu\",\n    \"riu\": \"Riung\",\n    \"rjg\": \"Rajong\",\n    \"rji\": \"Raji\",\n    \"rjs\": \"Rajbanshi\",\n    \"rka\": \"Kraol\",\n    \"rkb\": \"Rikbaktsa\",\n    \"rkh\": \"Rakahanga-Manihiki\",\n    \"rki\": \"Rakhine\",\n    \"rkm\": \"Marka\",\n    \"rkt\": \"Rangpuri; Kamta\",\n    \"rkw\": \"Arakwal\",\n    \"rm\": \"Romansh\",\n    \"rma\": \"Rama\",\n    \"rmb\": \"Rembarrnga\",\n    \"rmc\": \"Carpathian Romani\",\n    \"rmd\": \"Traveller Danish\",\n    \"rme\": \"Angloromani\",\n    \"rmf\": \"Kalo Finnish Romani\",\n    \"rmg\": \"Traveller Norwegian\",\n    \"rmh\": \"Murkim\",\n    \"rmi\": \"Lomavren\",\n    \"rmk\": \"Romkun\",\n    \"rml\": \"Baltic Romani\",\n    \"rmm\": \"Roma\",\n    \"rmn\": \"Balkan Romani\",\n    \"rmo\": \"Sinte Romani\",\n    \"rmp\": \"Rempi\",\n    \"rmq\": \"Caló\",\n    \"rms\": \"Romanian Sign Language\",\n    \"rmt\": \"Domari\",\n    \"rmu\": \"Tavringer Romani\",\n    \"rmv\": \"Romanova\",\n    \"rmw\": \"Welsh Romani\",\n    \"rmx\": \"Romam\",\n    \"rmy\": \"Vlax Romani\",\n    \"rmz\": \"Marma\",\n    \"rn\": \"Rundi\",\n    \"rnb\": \"Brunca Sign Language\",\n    \"rnd\": \"Ruund\",\n    \"rng\": \"Ronga\",\n    \"rnl\": \"Ranglong\",\n    \"rnn\": \"Roon\",\n    \"rnp\": \"Rongpo\",\n    \"rnr\": \"Nari Nari\",\n    \"rnw\": \"Rungwa\",\n    \"ro\": \"Romanian; Moldavian; Moldovan\",\n    \"roa\": \"Romance languages\",\n    \"rob\": 
\"Tae'\",\n    \"roc\": \"Cacgia Roglai\",\n    \"rod\": \"Rogo\",\n    \"roe\": \"Ronji\",\n    \"rof\": \"Rombo\",\n    \"rog\": \"Northern Roglai\",\n    \"rol\": \"Romblomanon\",\n    \"rom\": \"Romany\",\n    \"roo\": \"Rotokas\",\n    \"rop\": \"Kriol\",\n    \"ror\": \"Rongga\",\n    \"rou\": \"Runga\",\n    \"row\": \"Dela-Oenale\",\n    \"rpn\": \"Repanbitip\",\n    \"rpt\": \"Rapting\",\n    \"rri\": \"Ririo\",\n    \"rro\": \"Waima\",\n    \"rrt\": \"Arritinngithigh\",\n    \"rsb\": \"Romano-Serbian\",\n    \"rsk\": \"Ruthenian; Rusyn\",\n    \"rsl\": \"Russian Sign Language\",\n    \"rsm\": \"Miriwoong Sign Language\",\n    \"rsn\": \"Rwandan Sign Language\",\n    \"rtc\": \"Rungtu Chin\",\n    \"rth\": \"Ratahan\",\n    \"rtm\": \"Rotuman\",\n    \"rts\": \"Yurats\",\n    \"rtw\": \"Rathawi\",\n    \"ru\": \"Russian\",\n    \"rub\": \"Gungu\",\n    \"ruc\": \"Ruuli\",\n    \"rue\": \"Rusyn\",\n    \"ruf\": \"Luguru\",\n    \"rug\": \"Roviana\",\n    \"ruh\": \"Ruga\",\n    \"rui\": \"Rufiji\",\n    \"ruk\": \"Che\",\n    \"ruo\": \"Istro Romanian\",\n    \"rup\": \"Macedo-Romanian; Aromanian; Arumanian\",\n    \"ruq\": \"Megleno Romanian\",\n    \"rut\": \"Rutul\",\n    \"ruu\": \"Lanas Lobu\",\n    \"ruy\": \"Mala (Nigeria)\",\n    \"ruz\": \"Ruma\",\n    \"rw\": \"Kinyarwanda\",\n    \"rwa\": \"Rawo\",\n    \"rwk\": \"Rwa\",\n    \"rwl\": \"Ruwila\",\n    \"rwm\": \"Amba (Uganda)\",\n    \"rwo\": \"Rawa\",\n    \"rwr\": \"Marwari (India)\",\n    \"rxd\": \"Ngardi\",\n    \"rxw\": \"Karuwali; Garuwali\",\n    \"ryn\": \"Northern Amami-Oshima\",\n    \"rys\": \"Yaeyama\",\n    \"ryu\": \"Central Okinawan\",\n    \"rzh\": \"Rāziḥī\",\n    \"sa\": \"Sanskrit\",\n    \"saa\": \"Saba\",\n    \"sab\": \"Buglere\",\n    \"sac\": \"Meskwaki\",\n    \"sad\": \"Sandawe\",\n    \"sae\": \"Sabanê\",\n    \"saf\": \"Safaliba\",\n    \"sah\": \"Yakut\",\n    \"sai\": \"South American Indian languages\",\n    \"saj\": \"Sahu\",\n    \"sak\": \"Sake\",\n    \"sal\": 
\"Salishan languages\",\n    \"sam\": \"Samaritan Aramaic\",\n    \"sao\": \"Sause\",\n    \"saq\": \"Samburu\",\n    \"sar\": \"Saraveca\",\n    \"sas\": \"Sasak\",\n    \"sat\": \"Santali\",\n    \"sau\": \"Saleman\",\n    \"sav\": \"Saafi-Saafi\",\n    \"saw\": \"Sawi\",\n    \"sax\": \"Sa\",\n    \"say\": \"Saya\",\n    \"saz\": \"Saurashtra\",\n    \"sba\": \"Ngambay\",\n    \"sbb\": \"Simbo\",\n    \"sbc\": \"Kele (Papua New Guinea)\",\n    \"sbd\": \"Southern Samo\",\n    \"sbe\": \"Saliba\",\n    \"sbf\": \"Chabu; Shabo\",\n    \"sbg\": \"Seget\",\n    \"sbh\": \"Sori-Harengan\",\n    \"sbi\": \"Seti\",\n    \"sbj\": \"Surbakhal\",\n    \"sbk\": \"Safwa\",\n    \"sbl\": \"Botolan Sambal\",\n    \"sbm\": \"Sagala\",\n    \"sbn\": \"Sindhi Bhil\",\n    \"sbo\": \"Sabüm\",\n    \"sbp\": \"Sangu (Tanzania)\",\n    \"sbq\": \"Sileibi\",\n    \"sbr\": \"Sembakung Murut\",\n    \"sbs\": \"Subiya\",\n    \"sbt\": \"Kimki\",\n    \"sbu\": \"Stod Bhoti\",\n    \"sbv\": \"Sabine\",\n    \"sbw\": \"Simba\",\n    \"sbx\": \"Seberuang\",\n    \"sby\": \"Soli\",\n    \"sbz\": \"Sara Kaba\",\n    \"sc\": \"Sardinian\",\n    \"scb\": \"Chut\",\n    \"sce\": \"Dongxiang\",\n    \"scf\": \"San Miguel Creole French\",\n    \"scg\": \"Sanggau\",\n    \"sch\": \"Sakachep\",\n    \"sci\": \"Sri Lankan Creole Malay\",\n    \"sck\": \"Sadri\",\n    \"scl\": \"Shina\",\n    \"scn\": \"Sicilian\",\n    \"sco\": \"Scots\",\n    \"scp\": \"Hyolmo; Helambu Sherpa\",\n    \"scq\": \"Sa'och\",\n    \"scs\": \"North Slavey\",\n    \"sct\": \"Southern Katang\",\n    \"scu\": \"Shumcho\",\n    \"scv\": \"Sheni\",\n    \"scw\": \"Sha\",\n    \"scx\": \"Sicel\",\n    \"sd\": \"Sindhi\",\n    \"sda\": \"Toraja-Sa'dan\",\n    \"sdb\": \"Shabak\",\n    \"sdc\": \"Sassarese Sardinian\",\n    \"sde\": \"Surubu\",\n    \"sdf\": \"Sarli\",\n    \"sdg\": \"Savi\",\n    \"sdh\": \"Southern Kurdish\",\n    \"sdj\": \"Suundi\",\n    \"sdk\": \"Sos Kundi\",\n    \"sdl\": \"Saudi Arabian Sign Language\",\n 
   \"sdn\": \"Gallurese Sardinian\",\n    \"sdo\": \"Bukar-Sadung Bidayuh\",\n    \"sdp\": \"Sherdukpen\",\n    \"sdq\": \"Semandang\",\n    \"sdr\": \"Oraon Sadri\",\n    \"sds\": \"Sened\",\n    \"sdt\": \"Shuadit\",\n    \"sdu\": \"Sarudu\",\n    \"sdv\": \"Eastern Sudanic languages\",\n    \"sdx\": \"Sibu Melanau\",\n    \"sdz\": \"Sallands\",\n    \"se\": \"Northern Sami\",\n    \"sea\": \"Semai\",\n    \"seb\": \"Shempire Senoufo\",\n    \"sec\": \"Sechelt\",\n    \"sed\": \"Sedang\",\n    \"see\": \"Seneca\",\n    \"sef\": \"Cebaara Senoufo\",\n    \"seg\": \"Segeju\",\n    \"seh\": \"Sena\",\n    \"sei\": \"Seri\",\n    \"sej\": \"Sene\",\n    \"sek\": \"Sekani\",\n    \"sel\": \"Selkup\",\n    \"sem\": \"Semitic languages\",\n    \"sen\": \"Nanerigé Sénoufo\",\n    \"seo\": \"Suarmin\",\n    \"sep\": \"Sìcìté Sénoufo\",\n    \"seq\": \"Senara Sénoufo\",\n    \"ser\": \"Serrano\",\n    \"ses\": \"Koyraboro Senni Songhai\",\n    \"set\": \"Sentani\",\n    \"seu\": \"Serui-Laut\",\n    \"sev\": \"Nyarafolo Senoufo\",\n    \"sew\": \"Sewa Bay\",\n    \"sey\": \"Secoya\",\n    \"sez\": \"Senthang Chin\",\n    \"sfb\": \"Langue des signes de Belgique Francophone; French Belgian Sign Language\",\n    \"sfe\": \"Eastern Subanen\",\n    \"sfm\": \"Small Flowery Miao\",\n    \"sfs\": \"South African Sign Language\",\n    \"sfw\": \"Sehwi\",\n    \"sg\": \"Sango\",\n    \"sga\": \"Old Irish (to 900)\",\n    \"sgb\": \"Mag-antsi Ayta\",\n    \"sgc\": \"Kipsigis\",\n    \"sgd\": \"Surigaonon\",\n    \"sge\": \"Segai\",\n    \"sgg\": \"Swiss-German Sign Language\",\n    \"sgh\": \"Shughni\",\n    \"sgi\": \"Suga\",\n    \"sgj\": \"Surgujia\",\n    \"sgk\": \"Sangkong\",\n    \"sgm\": \"Singa\",\n    \"sgn\": \"Sign languages\",\n    \"sgp\": \"Singpho\",\n    \"sgr\": \"Sangisari\",\n    \"sgs\": \"Samogitian\",\n    \"sgt\": \"Brokpake\",\n    \"sgu\": \"Salas\",\n    \"sgw\": \"Sebat Bet Gurage\",\n    \"sgx\": \"Sierra Leone Sign Language\",\n    \"sgy\": 
\"Sanglechi\",\n    \"sgz\": \"Sursurunga\",\n    \"sh\": \"Serbo-Croatian\",\n    \"sha\": \"Shall-Zwall\",\n    \"shb\": \"Ninam\",\n    \"shc\": \"Sonde\",\n    \"shd\": \"Kundal Shahi\",\n    \"she\": \"Sheko\",\n    \"shg\": \"Shua\",\n    \"shh\": \"Shoshoni\",\n    \"shi\": \"Tachelhit\",\n    \"shj\": \"Shatt\",\n    \"shk\": \"Shilluk\",\n    \"shl\": \"Shendu\",\n    \"shm\": \"Shahrudi\",\n    \"shn\": \"Shan\",\n    \"sho\": \"Shanga\",\n    \"shp\": \"Shipibo-Conibo\",\n    \"shq\": \"Sala\",\n    \"shr\": \"Shi\",\n    \"shs\": \"Shuswap\",\n    \"sht\": \"Shasta\",\n    \"shu\": \"Chadian Arabic\",\n    \"shv\": \"Shehri\",\n    \"shw\": \"Shwai\",\n    \"shx\": \"She\",\n    \"shy\": \"Tachawit\",\n    \"shz\": \"Syenara Senoufo\",\n    \"si\": \"Sinhala; Sinhalese\",\n    \"sia\": \"Akkala Sami\",\n    \"sib\": \"Sebop\",\n    \"sid\": \"Sidamo\",\n    \"sie\": \"Simaa\",\n    \"sif\": \"Siamou\",\n    \"sig\": \"Paasaal\",\n    \"sih\": \"Zire; Sîshëë\",\n    \"sii\": \"Shom Peng\",\n    \"sij\": \"Numbami\",\n    \"sik\": \"Sikiana\",\n    \"sil\": \"Tumulung Sisaala\",\n    \"sim\": \"Mende (Papua New Guinea)\",\n    \"sio\": \"Siouan languages\",\n    \"sip\": \"Sikkimese\",\n    \"siq\": \"Sonia\",\n    \"sir\": \"Siri\",\n    \"sis\": \"Siuslaw\",\n    \"sit\": \"Sino-Tibetan languages\",\n    \"siu\": \"Sinagen\",\n    \"siv\": \"Sumariup\",\n    \"siw\": \"Siwai\",\n    \"six\": \"Sumau\",\n    \"siy\": \"Sivandi\",\n    \"siz\": \"Siwi\",\n    \"sja\": \"Epena\",\n    \"sjb\": \"Sajau Basap\",\n    \"sjd\": \"Kildin Sami\",\n    \"sje\": \"Pite Sami\",\n    \"sjg\": \"Assangori\",\n    \"sjk\": \"Kemi Sami\",\n    \"sjl\": \"Sajalong; Miji\",\n    \"sjm\": \"Mapun\",\n    \"sjn\": \"Sindarin\",\n    \"sjo\": \"Xibe\",\n    \"sjp\": \"Surjapuri\",\n    \"sjr\": \"Siar-Lak\",\n    \"sjs\": \"Senhaja De Srair\",\n    \"sjt\": \"Ter Sami\",\n    \"sju\": \"Ume Sami\",\n    \"sjw\": \"Shawnee\",\n    \"sk\": \"Slovak\",\n    \"ska\": 
\"Skagit\",\n    \"skb\": \"Saek\",\n    \"skc\": \"Ma Manda\",\n    \"skd\": \"Southern Sierra Miwok\",\n    \"ske\": \"Seke (Vanuatu)\",\n    \"skf\": \"Sakirabiá\",\n    \"skg\": \"Sakalava Malagasy\",\n    \"skh\": \"Sikule\",\n    \"ski\": \"Sika\",\n    \"skj\": \"Seke (Nepal)\",\n    \"skm\": \"Kutong\",\n    \"skn\": \"Kolibugan Subanon\",\n    \"sko\": \"Seko Tengah\",\n    \"skp\": \"Sekapan\",\n    \"skq\": \"Sininkere\",\n    \"skr\": \"Saraiki; Seraiki\",\n    \"sks\": \"Maia\",\n    \"skt\": \"Sakata\",\n    \"sku\": \"Sakao\",\n    \"skv\": \"Skou\",\n    \"skw\": \"Skepi Creole Dutch\",\n    \"skx\": \"Seko Padang\",\n    \"sky\": \"Sikaiana\",\n    \"skz\": \"Sekar\",\n    \"sl\": \"Slovenian\",\n    \"sla\": \"Slavic languages\",\n    \"slc\": \"Sáliba\",\n    \"sld\": \"Sissala\",\n    \"sle\": \"Sholaga\",\n    \"slf\": \"Swiss-Italian Sign Language\",\n    \"slg\": \"Selungai Murut\",\n    \"slh\": \"Southern Puget Sound Salish\",\n    \"sli\": \"Lower Silesian\",\n    \"slj\": \"Salumá\",\n    \"sll\": \"Salt-Yui\",\n    \"slm\": \"Pangutaran Sama\",\n    \"sln\": \"Salinan\",\n    \"slp\": \"Lamaholot\",\n    \"slq\": \"Salchuq\",\n    \"slr\": \"Salar\",\n    \"sls\": \"Singapore Sign Language\",\n    \"slt\": \"Sila\",\n    \"slu\": \"Selaru\",\n    \"slw\": \"Sialum\",\n    \"slx\": \"Salampasu\",\n    \"sly\": \"Selayar\",\n    \"slz\": \"Ma'ya\",\n    \"sm\": \"Samoan\",\n    \"sma\": \"Southern Sami\",\n    \"smb\": \"Simbari\",\n    \"smc\": \"Som\",\n    \"smf\": \"Auwe\",\n    \"smg\": \"Simbali\",\n    \"smh\": \"Samei\",\n    \"smi\": \"Sami languages\",\n    \"smj\": \"Lule Sami\",\n    \"smk\": \"Bolinao\",\n    \"sml\": \"Central Sama\",\n    \"smm\": \"Musasa\",\n    \"smn\": \"Inari Sami\",\n    \"smp\": \"Samaritan\",\n    \"smq\": \"Samo\",\n    \"smr\": \"Simeulue\",\n    \"sms\": \"Skolt Sami\",\n    \"smt\": \"Simte\",\n    \"smu\": \"Somray\",\n    \"smv\": \"Samvedi\",\n    \"smw\": \"Sumbawa\",\n    \"smx\": 
\"Samba\",\n    \"smy\": \"Semnani\",\n    \"smz\": \"Simeku\",\n    \"sn\": \"Shona\",\n    \"snc\": \"Sinaugoro\",\n    \"sne\": \"Bau Bidayuh\",\n    \"snf\": \"Noon\",\n    \"sng\": \"Sanga (Democratic Republic of Congo)\",\n    \"sni\": \"Sensi\",\n    \"snj\": \"Riverain Sango\",\n    \"snk\": \"Soninke\",\n    \"snl\": \"Sangil\",\n    \"snm\": \"Southern Ma'di\",\n    \"snn\": \"Siona\",\n    \"sno\": \"Snohomish\",\n    \"snp\": \"Siane\",\n    \"snq\": \"Sangu (Gabon)\",\n    \"snr\": \"Sihan\",\n    \"sns\": \"South West Bay; Nahavaq\",\n    \"snu\": \"Senggi; Viid\",\n    \"snv\": \"Sa'ban\",\n    \"snw\": \"Selee\",\n    \"snx\": \"Sam\",\n    \"sny\": \"Saniyo-Hiyewe\",\n    \"snz\": \"Kou\",\n    \"so\": \"Somali\",\n    \"soa\": \"Thai Song\",\n    \"sob\": \"Sobei\",\n    \"soc\": \"So (Democratic Republic of Congo)\",\n    \"sod\": \"Songoora\",\n    \"soe\": \"Songomeno\",\n    \"sog\": \"Sogdian\",\n    \"soh\": \"Aka\",\n    \"soi\": \"Sonha\",\n    \"soj\": \"Soi\",\n    \"sok\": \"Sokoro\",\n    \"sol\": \"Solos\",\n    \"son\": \"Songhai languages\",\n    \"soo\": \"Songo\",\n    \"sop\": \"Songe\",\n    \"soq\": \"Kanasi\",\n    \"sor\": \"Somrai\",\n    \"sos\": \"Seeku\",\n    \"sou\": \"Southern Thai\",\n    \"sov\": \"Sonsorol\",\n    \"sow\": \"Sowanda\",\n    \"sox\": \"Swo\",\n    \"soy\": \"Miyobe\",\n    \"soz\": \"Temi\",\n    \"spb\": \"Sepa (Indonesia)\",\n    \"spc\": \"Sapé\",\n    \"spd\": \"Saep\",\n    \"spe\": \"Sepa (Papua New Guinea)\",\n    \"spg\": \"Sian\",\n    \"spi\": \"Saponi\",\n    \"spk\": \"Sengo\",\n    \"spl\": \"Selepet\",\n    \"spm\": \"Akukem\",\n    \"spn\": \"Sanapaná\",\n    \"spo\": \"Spokane\",\n    \"spp\": \"Supyire Senoufo\",\n    \"spq\": \"Loreto-Ucayali Spanish\",\n    \"spr\": \"Saparua\",\n    \"sps\": \"Saposa\",\n    \"spt\": \"Spiti Bhoti\",\n    \"spu\": \"Sapuan\",\n    \"spv\": \"Sambalpuri; Kosli\",\n    \"spx\": \"South Picene\",\n    \"spy\": \"Sabaot\",\n    \"sq\": \"Albanian\",\n 
   \"sqa\": \"Shama-Sambuga\",\n    \"sqh\": \"Shau\",\n    \"sqj\": \"Albanian languages\",\n    \"sqk\": \"Albanian Sign Language\",\n    \"sqm\": \"Suma\",\n    \"sqn\": \"Susquehannock\",\n    \"sqo\": \"Sorkhei\",\n    \"sqq\": \"Sou\",\n    \"sqr\": \"Siculo Arabic\",\n    \"sqs\": \"Sri Lankan Sign Language\",\n    \"sqt\": \"Soqotri\",\n    \"squ\": \"Squamish\",\n    \"sqx\": \"Kufr Qassem Sign Language (KQSL)\",\n    \"sr\": \"Serbian\",\n    \"sra\": \"Saruga\",\n    \"srb\": \"Sora\",\n    \"src\": \"Logudorese Sardinian\",\n    \"sre\": \"Sara\",\n    \"srf\": \"Nafi\",\n    \"srg\": \"Sulod\",\n    \"srh\": \"Sarikoli\",\n    \"sri\": \"Siriano\",\n    \"srk\": \"Serudung Murut\",\n    \"srl\": \"Isirawa\",\n    \"srm\": \"Saramaccan\",\n    \"srn\": \"Sranan Tongo\",\n    \"sro\": \"Campidanese Sardinian\",\n    \"srq\": \"Sirionó\",\n    \"srr\": \"Serer\",\n    \"srs\": \"Sarsi\",\n    \"srt\": \"Sauri\",\n    \"sru\": \"Suruí\",\n    \"srv\": \"Southern Sorsoganon\",\n    \"srw\": \"Serua\",\n    \"srx\": \"Sirmauri\",\n    \"sry\": \"Sera\",\n    \"srz\": \"Shahmirzadi\",\n    \"ss\": \"Swati\",\n    \"ssa\": \"Nilo-Saharan languages\",\n    \"ssb\": \"Southern Sama\",\n    \"ssc\": \"Suba-Simbiti\",\n    \"ssd\": \"Siroi\",\n    \"sse\": \"Balangingi; Bangingih Sama\",\n    \"ssf\": \"Thao\",\n    \"ssg\": \"Seimat\",\n    \"ssh\": \"Shihhi Arabic\",\n    \"ssi\": \"Sansi\",\n    \"ssj\": \"Sausi\",\n    \"ssk\": \"Sunam\",\n    \"ssl\": \"Western Sisaala\",\n    \"ssm\": \"Semnam\",\n    \"ssn\": \"Waata\",\n    \"sso\": \"Sissano\",\n    \"ssp\": \"Spanish Sign Language\",\n    \"ssq\": \"So'a\",\n    \"ssr\": \"Swiss-French Sign Language\",\n    \"sss\": \"Sô\",\n    \"sst\": \"Sinasina\",\n    \"ssu\": \"Susuami\",\n    \"ssv\": \"Shark Bay\",\n    \"ssx\": \"Samberigi\",\n    \"ssy\": \"Saho\",\n    \"ssz\": \"Sengseng\",\n    \"st\": \"Southern Sotho\",\n    \"sta\": \"Settla\",\n    \"stb\": \"Northern Subanen\",\n    \"std\": 
\"Sentinel\",\n    \"ste\": \"Liana-Seti\",\n    \"stf\": \"Seta\",\n    \"stg\": \"Trieng\",\n    \"sth\": \"Shelta\",\n    \"sti\": \"Bulo Stieng\",\n    \"stj\": \"Matya Samo\",\n    \"stk\": \"Arammba\",\n    \"stl\": \"Stellingwerfs\",\n    \"stm\": \"Setaman\",\n    \"stn\": \"Owa\",\n    \"sto\": \"Stoney\",\n    \"stp\": \"Southeastern Tepehuan\",\n    \"stq\": \"Saterfriesisch\",\n    \"str\": \"Straits Salish\",\n    \"sts\": \"Shumashti\",\n    \"stt\": \"Budeh Stieng\",\n    \"stu\": \"Samtao\",\n    \"stv\": \"Silt'e\",\n    \"stw\": \"Satawalese\",\n    \"sty\": \"Siberian Tatar\",\n    \"su\": \"Sundanese\",\n    \"sua\": \"Sulka\",\n    \"sub\": \"Suku\",\n    \"suc\": \"Western Subanon\",\n    \"sue\": \"Suena\",\n    \"sug\": \"Suganga\",\n    \"sui\": \"Suki\",\n    \"suj\": \"Shubi\",\n    \"suk\": \"Sukuma\",\n    \"suo\": \"Bouni\",\n    \"suq\": \"Tirmaga-Chai Suri; Suri\",\n    \"sur\": \"Mwaghavul\",\n    \"sus\": \"Susu\",\n    \"sut\": \"Subtiaba\",\n    \"suv\": \"Puroik\",\n    \"suw\": \"Sumbwa\",\n    \"sux\": \"Sumerian\",\n    \"suy\": \"Suyá\",\n    \"suz\": \"Sunwar\",\n    \"sv\": \"Swedish\",\n    \"sva\": \"Svan\",\n    \"svb\": \"Ulau-Suain\",\n    \"svc\": \"Vincentian Creole English\",\n    \"sve\": \"Serili\",\n    \"svk\": \"Slovakian Sign Language\",\n    \"svm\": \"Slavomolisano\",\n    \"svs\": \"Savosavo\",\n    \"svx\": \"Skalvian\",\n    \"sw\": \"Swahili (macrolanguage)\",\n    \"swb\": \"Maore Comorian\",\n    \"swc\": \"Congo Swahili\",\n    \"swf\": \"Sere\",\n    \"swg\": \"Swabian\",\n    \"swh\": \"Swahili (individual language); Kiswahili\",\n    \"swi\": \"Sui\",\n    \"swj\": \"Sira\",\n    \"swk\": \"Malawi Sena\",\n    \"swl\": \"Swedish Sign Language\",\n    \"swm\": \"Samosa\",\n    \"swn\": \"Sawknah\",\n    \"swo\": \"Shanenawa\",\n    \"swp\": \"Suau\",\n    \"swq\": \"Sharwa\",\n    \"swr\": \"Saweru\",\n    \"sws\": \"Seluwasan\",\n    \"swt\": \"Sawila\",\n    \"swu\": \"Suwawa\",\n    \"swv\": 
\"Shekhawati\",\n    \"sww\": \"Sowa\",\n    \"swx\": \"Suruahá\",\n    \"swy\": \"Sarua\",\n    \"sxb\": \"Suba\",\n    \"sxc\": \"Sicanian\",\n    \"sxe\": \"Sighu\",\n    \"sxg\": \"Shuhi; Shixing\",\n    \"sxk\": \"Southern Kalapuya\",\n    \"sxl\": \"Selian\",\n    \"sxm\": \"Samre\",\n    \"sxn\": \"Sangir\",\n    \"sxo\": \"Sorothaptic\",\n    \"sxr\": \"Saaroa\",\n    \"sxs\": \"Sasaru\",\n    \"sxu\": \"Upper Saxon\",\n    \"sxw\": \"Saxwe Gbe\",\n    \"sya\": \"Siang\",\n    \"syb\": \"Central Subanen\",\n    \"syc\": \"Classical Syriac\",\n    \"syd\": \"Samoyedic languages\",\n    \"syi\": \"Seki\",\n    \"syk\": \"Sukur\",\n    \"syl\": \"Sylheti\",\n    \"sym\": \"Maya Samo\",\n    \"syn\": \"Senaya\",\n    \"syo\": \"Suoy\",\n    \"syr\": \"Syriac\",\n    \"sys\": \"Sinyar\",\n    \"syw\": \"Kagate\",\n    \"syx\": \"Samay\",\n    \"syy\": \"Al-Sayyid Bedouin Sign Language\",\n    \"sza\": \"Semelai\",\n    \"szb\": \"Ngalum\",\n    \"szc\": \"Semaq Beri\",\n    \"szd\": \"Seru\",\n    \"sze\": \"Seze\",\n    \"szg\": \"Sengele\",\n    \"szl\": \"Silesian\",\n    \"szn\": \"Sula\",\n    \"szp\": \"Suabo\",\n    \"szs\": \"Solomon Islands Sign Language\",\n    \"szv\": \"Isu (Fako Division)\",\n    \"szw\": \"Sawai\",\n    \"szy\": \"Sakizaya\",\n    \"ta\": \"Tamil\",\n    \"taa\": \"Lower Tanana\",\n    \"tab\": \"Tabassaran\",\n    \"tac\": \"Lowland Tarahumara\",\n    \"tad\": \"Tause\",\n    \"tae\": \"Tariana\",\n    \"taf\": \"Tapirapé\",\n    \"tag\": \"Tagoi\",\n    \"tai\": \"Tai languages\",\n    \"taj\": \"Eastern Tamang\",\n    \"tak\": \"Tala\",\n    \"tal\": \"Tal\",\n    \"tan\": \"Tangale\",\n    \"tao\": \"Yami\",\n    \"tap\": \"Taabwa\",\n    \"taq\": \"Tamasheq\",\n    \"tar\": \"Central Tarahumara\",\n    \"tas\": \"Tay Boi\",\n    \"tau\": \"Upper Tanana\",\n    \"tav\": \"Tatuyo\",\n    \"taw\": \"Tai\",\n    \"tax\": \"Tamki\",\n    \"tay\": \"Atayal\",\n    \"taz\": \"Tocho\",\n    \"tba\": \"Aikanã\",\n    \"tbc\": 
\"Takia\",\n    \"tbd\": \"Kaki Ae\",\n    \"tbe\": \"Tanimbili\",\n    \"tbf\": \"Mandara\",\n    \"tbg\": \"North Tairora\",\n    \"tbh\": \"Dharawal; Thurawal\",\n    \"tbi\": \"Gaam\",\n    \"tbj\": \"Tiang\",\n    \"tbk\": \"Calamian Tagbanwa\",\n    \"tbl\": \"Tboli\",\n    \"tbm\": \"Tagbu\",\n    \"tbn\": \"Barro Negro Tunebo\",\n    \"tbo\": \"Tawala\",\n    \"tbp\": \"Taworta; Diebroud\",\n    \"tbq\": \"Tibeto-Burman languages\",\n    \"tbr\": \"Tumtum\",\n    \"tbs\": \"Tanguat\",\n    \"tbt\": \"Tembo (Kitembo)\",\n    \"tbu\": \"Tubar\",\n    \"tbv\": \"Tobo\",\n    \"tbw\": \"Tagbanwa\",\n    \"tbx\": \"Kapin\",\n    \"tby\": \"Tabaru\",\n    \"tbz\": \"Ditammari\",\n    \"tca\": \"Ticuna\",\n    \"tcb\": \"Tanacross\",\n    \"tcc\": \"Datooga\",\n    \"tcd\": \"Tafi\",\n    \"tce\": \"Southern Tutchone\",\n    \"tcf\": \"Malinaltepec Me'phaa; Malinaltepec Tlapanec\",\n    \"tcg\": \"Tamagario\",\n    \"tch\": \"Turks And Caicos Creole English\",\n    \"tci\": \"Wára\",\n    \"tck\": \"Tchitchege\",\n    \"tcl\": \"Taman (Myanmar)\",\n    \"tcm\": \"Tanahmerah\",\n    \"tcn\": \"Tichurong\",\n    \"tco\": \"Taungyo\",\n    \"tcp\": \"Tawr Chin\",\n    \"tcq\": \"Kaiy\",\n    \"tcs\": \"Torres Strait Creole; Yumplatok\",\n    \"tct\": \"T'en\",\n    \"tcu\": \"Southeastern Tarahumara\",\n    \"tcw\": \"Tecpatlán Totonac\",\n    \"tcx\": \"Toda\",\n    \"tcy\": \"Tulu\",\n    \"tcz\": \"Thado Chin\",\n    \"tda\": \"Tagdal\",\n    \"tdb\": \"Panchpargania\",\n    \"tdc\": \"Emberá-Tadó\",\n    \"tdd\": \"Tai Nüa\",\n    \"tde\": \"Tiranige Diga Dogon\",\n    \"tdf\": \"Talieng\",\n    \"tdg\": \"Western Tamang\",\n    \"tdh\": \"Thulung\",\n    \"tdi\": \"Tomadino\",\n    \"tdj\": \"Tajio\",\n    \"tdk\": \"Tambas\",\n    \"tdl\": \"Sur\",\n    \"tdm\": \"Taruma\",\n    \"tdn\": \"Tondano\",\n    \"tdo\": \"Teme\",\n    \"tdq\": \"Tita\",\n    \"tdr\": \"Todrah\",\n    \"tds\": \"Doutai\",\n    \"tdt\": \"Tetun Dili\",\n    \"tdv\": \"Toro\",\n    
\"tdx\": \"Tandroy-Mahafaly Malagasy\",\n    \"tdy\": \"Tadyawan\",\n    \"te\": \"Telugu\",\n    \"tea\": \"Temiar\",\n    \"teb\": \"Tetete\",\n    \"tec\": \"Terik\",\n    \"ted\": \"Tepo Krumen\",\n    \"tee\": \"Huehuetla Tepehua\",\n    \"tef\": \"Teressa\",\n    \"teg\": \"Teke-Tege\",\n    \"teh\": \"Tehuelche\",\n    \"tei\": \"Torricelli\",\n    \"tek\": \"Ibali Teke\",\n    \"tem\": \"Timne\",\n    \"ten\": \"Tama (Colombia)\",\n    \"teo\": \"Teso\",\n    \"tep\": \"Tepecano\",\n    \"teq\": \"Temein\",\n    \"ter\": \"Tereno\",\n    \"tes\": \"Tengger\",\n    \"tet\": \"Tetum\",\n    \"teu\": \"Soo\",\n    \"tev\": \"Teor\",\n    \"tew\": \"Tewa (USA)\",\n    \"tex\": \"Tennet\",\n    \"tey\": \"Tulishi\",\n    \"tez\": \"Tetserret\",\n    \"tfi\": \"Tofin Gbe\",\n    \"tfn\": \"Tanaina\",\n    \"tfo\": \"Tefaro\",\n    \"tfr\": \"Teribe\",\n    \"tft\": \"Ternate\",\n    \"tg\": \"Tajik\",\n    \"tga\": \"Sagalla\",\n    \"tgb\": \"Tobilung\",\n    \"tgc\": \"Tigak\",\n    \"tgd\": \"Ciwogai\",\n    \"tge\": \"Eastern Gorkha Tamang\",\n    \"tgf\": \"Chalikha\",\n    \"tgh\": \"Tobagonian Creole English\",\n    \"tgi\": \"Lawunuia\",\n    \"tgj\": \"Tagin\",\n    \"tgn\": \"Tandaganon\",\n    \"tgo\": \"Sudest\",\n    \"tgp\": \"Tangoa\",\n    \"tgq\": \"Tring\",\n    \"tgr\": \"Tareng\",\n    \"tgs\": \"Nume\",\n    \"tgt\": \"Central Tagbanwa\",\n    \"tgu\": \"Tanggu\",\n    \"tgv\": \"Tingui-Boto\",\n    \"tgw\": \"Tagwana Senoufo\",\n    \"tgx\": \"Tagish\",\n    \"tgy\": \"Togoyo\",\n    \"tgz\": \"Tagalaka\",\n    \"th\": \"Thai\",\n    \"thd\": \"Kuuk Thaayorre; Thayore\",\n    \"the\": \"Chitwania Tharu\",\n    \"thf\": \"Thangmi\",\n    \"thh\": \"Northern Tarahumara\",\n    \"thi\": \"Tai Long\",\n    \"thk\": \"Tharaka; Kitharaka\",\n    \"thl\": \"Dangaura Tharu\",\n    \"thm\": \"Aheu\",\n    \"thn\": \"Thachanadan\",\n    \"thp\": \"Thompson\",\n    \"thq\": \"Kochila Tharu\",\n    \"thr\": \"Rana Tharu\",\n    \"ths\": \"Thakali\",\n   
 \"tht\": \"Tahltan\",\n    \"thu\": \"Thuri\",\n    \"thv\": \"Tahaggart Tamahaq\",\n    \"thy\": \"Tha\",\n    \"thz\": \"Tayart Tamajeq\",\n    \"ti\": \"Tigrinya\",\n    \"tia\": \"Tidikelt Tamazight\",\n    \"tic\": \"Tira\",\n    \"tif\": \"Tifal\",\n    \"tig\": \"Tigre\",\n    \"tih\": \"Timugon Murut\",\n    \"tii\": \"Tiene\",\n    \"tij\": \"Tilung\",\n    \"tik\": \"Tikar\",\n    \"til\": \"Tillamook\",\n    \"tim\": \"Timbe\",\n    \"tin\": \"Tindi\",\n    \"tio\": \"Teop\",\n    \"tip\": \"Trimuris\",\n    \"tiq\": \"Tiéfo\",\n    \"tis\": \"Masadiit Itneg\",\n    \"tit\": \"Tinigua\",\n    \"tiu\": \"Adasen\",\n    \"tiv\": \"Tiv\",\n    \"tiw\": \"Tiwi\",\n    \"tix\": \"Southern Tiwa\",\n    \"tiy\": \"Tiruray\",\n    \"tiz\": \"Tai Hongjin\",\n    \"tja\": \"Tajuasohn\",\n    \"tjg\": \"Tunjung\",\n    \"tji\": \"Northern Tujia\",\n    \"tjj\": \"Tjungundji\",\n    \"tjl\": \"Tai Laing\",\n    \"tjm\": \"Timucua\",\n    \"tjn\": \"Tonjon\",\n    \"tjo\": \"Temacine Tamazight\",\n    \"tjp\": \"Tjupany\",\n    \"tjs\": \"Southern Tujia\",\n    \"tju\": \"Tjurruru\",\n    \"tjw\": \"Djabwurrung\",\n    \"tk\": \"Turkmen\",\n    \"tka\": \"Truká\",\n    \"tkb\": \"Buksa\",\n    \"tkd\": \"Tukudede\",\n    \"tke\": \"Takwane\",\n    \"tkf\": \"Tukumanféd\",\n    \"tkg\": \"Tesaka Malagasy\",\n    \"tkl\": \"Tokelau\",\n    \"tkm\": \"Takelma\",\n    \"tkn\": \"Toku-No-Shima\",\n    \"tkp\": \"Tikopia\",\n    \"tkq\": \"Tee\",\n    \"tkr\": \"Tsakhur\",\n    \"tks\": \"Takestani\",\n    \"tkt\": \"Kathoriya Tharu\",\n    \"tku\": \"Upper Necaxa Totonac\",\n    \"tkv\": \"Mur Pano\",\n    \"tkw\": \"Teanu\",\n    \"tkx\": \"Tangko\",\n    \"tkz\": \"Takua\",\n    \"tl\": \"Tagalog\",\n    \"tla\": \"Southwestern Tepehuan\",\n    \"tlb\": \"Tobelo\",\n    \"tlc\": \"Yecuatla Totonac\",\n    \"tld\": \"Talaud\",\n    \"tlf\": \"Telefol\",\n    \"tlg\": \"Tofanma\",\n    \"tlh\": \"Klingon; tlhIngan Hol\",\n    \"tli\": \"Tlingit\",\n    \"tlj\": 
\"Talinga-Bwisi\",\n    \"tlk\": \"Taloki\",\n    \"tll\": \"Tetela\",\n    \"tlm\": \"Tolomako\",\n    \"tln\": \"Talondo'\",\n    \"tlo\": \"Talodi\",\n    \"tlp\": \"Filomena Mata-Coahuitlán Totonac\",\n    \"tlq\": \"Tai Loi\",\n    \"tlr\": \"Talise\",\n    \"tls\": \"Tambotalo\",\n    \"tlt\": \"Sou Nama; Teluti\",\n    \"tlu\": \"Tulehu\",\n    \"tlv\": \"Taliabu\",\n    \"tlx\": \"Khehek\",\n    \"tly\": \"Talysh\",\n    \"tma\": \"Tama (Chad)\",\n    \"tmb\": \"Katbol; Avava\",\n    \"tmc\": \"Tumak\",\n    \"tmd\": \"Haruai\",\n    \"tme\": \"Tremembé\",\n    \"tmf\": \"Toba-Maskoy\",\n    \"tmg\": \"Ternateño\",\n    \"tmh\": \"Tamashek\",\n    \"tmi\": \"Tutuba\",\n    \"tmj\": \"Samarokena\",\n    \"tmk\": \"Northwestern Tamang\",\n    \"tml\": \"Tamnim Citak\",\n    \"tmm\": \"Tai Thanh\",\n    \"tmn\": \"Taman (Indonesia)\",\n    \"tmo\": \"Temoq\",\n    \"tmq\": \"Tumleo\",\n    \"tmr\": \"Jewish Babylonian Aramaic (ca. 200-1200 CE)\",\n    \"tms\": \"Tima\",\n    \"tmt\": \"Tasmate\",\n    \"tmu\": \"Iau\",\n    \"tmv\": \"Tembo (Motembo)\",\n    \"tmw\": \"Temuan\",\n    \"tmy\": \"Tami\",\n    \"tmz\": \"Tamanaku\",\n    \"tn\": \"Tswana\",\n    \"tna\": \"Tacana\",\n    \"tnb\": \"Western Tunebo\",\n    \"tnc\": \"Tanimuca-Retuarã\",\n    \"tnd\": \"Angosturas Tunebo\",\n    \"tng\": \"Tobanga\",\n    \"tnh\": \"Maiani\",\n    \"tni\": \"Tandia\",\n    \"tnk\": \"Kwamera\",\n    \"tnl\": \"Lenakel\",\n    \"tnm\": \"Tabla\",\n    \"tnn\": \"North Tanna\",\n    \"tno\": \"Toromono\",\n    \"tnp\": \"Whitesands\",\n    \"tnq\": \"Taino\",\n    \"tnr\": \"Ménik\",\n    \"tns\": \"Tenis\",\n    \"tnt\": \"Tontemboan\",\n    \"tnu\": \"Tay Khang\",\n    \"tnv\": \"Tangchangya\",\n    \"tnw\": \"Tonsawang\",\n    \"tnx\": \"Tanema\",\n    \"tny\": \"Tongwe\",\n    \"tnz\": \"Ten'edn\",\n    \"to\": \"Tonga (Tonga Islands)\",\n    \"tob\": \"Toba\",\n    \"toc\": \"Coyutla Totonac\",\n    \"tod\": \"Toma\",\n    \"tof\": \"Gizrra\",\n    \"tog\": 
\"Tonga (Nyasa)\",\n    \"toh\": \"Gitonga\",\n    \"toi\": \"Tonga (Zambia)\",\n    \"toj\": \"Tojolabal\",\n    \"tok\": \"Toki Pona\",\n    \"tol\": \"Tolowa\",\n    \"tom\": \"Tombulu\",\n    \"too\": \"Xicotepec De Juárez Totonac\",\n    \"top\": \"Papantla Totonac\",\n    \"toq\": \"Toposa\",\n    \"tor\": \"Togbo-Vara Banda\",\n    \"tos\": \"Highland Totonac\",\n    \"tou\": \"Tho\",\n    \"tov\": \"Upper Taromi\",\n    \"tow\": \"Jemez\",\n    \"tox\": \"Tobian\",\n    \"toy\": \"Topoiyo\",\n    \"toz\": \"To\",\n    \"tpa\": \"Taupota\",\n    \"tpc\": \"Azoyú Me'phaa; Azoyú Tlapanec\",\n    \"tpe\": \"Tippera\",\n    \"tpf\": \"Tarpia\",\n    \"tpg\": \"Kula\",\n    \"tpi\": \"Tok Pisin\",\n    \"tpj\": \"Tapieté\",\n    \"tpk\": \"Tupinikin\",\n    \"tpl\": \"Tlacoapa Me'phaa; Tlacoapa Tlapanec\",\n    \"tpm\": \"Tampulma\",\n    \"tpn\": \"Tupinambá\",\n    \"tpo\": \"Tai Pao\",\n    \"tpp\": \"Pisaflores Tepehua\",\n    \"tpq\": \"Tukpa\",\n    \"tpr\": \"Tuparí\",\n    \"tpt\": \"Tlachichilco Tepehua\",\n    \"tpu\": \"Tampuan\",\n    \"tpv\": \"Tanapag\",\n    \"tpw\": \"Tupí\",\n    \"tpx\": \"Acatepec Me'phaa; Acatepec Tlapanec\",\n    \"tpy\": \"Trumai\",\n    \"tpz\": \"Tinputz\",\n    \"tqb\": \"Tembé\",\n    \"tql\": \"Lehali\",\n    \"tqm\": \"Turumsa\",\n    \"tqn\": \"Tenino\",\n    \"tqo\": \"Toaripi\",\n    \"tqp\": \"Tomoip\",\n    \"tqq\": \"Tunni\",\n    \"tqr\": \"Torona\",\n    \"tqt\": \"Western Totonac\",\n    \"tqu\": \"Touo\",\n    \"tqw\": \"Tonkawa\",\n    \"tr\": \"Turkish\",\n    \"tra\": \"Tirahi\",\n    \"trb\": \"Terebu\",\n    \"trc\": \"Copala Triqui\",\n    \"trd\": \"Turi\",\n    \"tre\": \"East Tarangan\",\n    \"trf\": \"Trinidadian Creole English\",\n    \"trg\": \"Lishán Didán\",\n    \"trh\": \"Turaka\",\n    \"tri\": \"Trió\",\n    \"trj\": \"Toram\",\n    \"trk\": \"Turkic languages\",\n    \"trl\": \"Traveller Scottish\",\n    \"trm\": \"Tregami\",\n    \"trn\": \"Trinitario\",\n    \"tro\": \"Tarao Naga\",\n    
\"trp\": \"Kok Borok\",\n    \"trq\": \"San Martín Itunyoso Triqui\",\n    \"trr\": \"Taushiro\",\n    \"trs\": \"Chicahuaxtla Triqui\",\n    \"trt\": \"Tunggare\",\n    \"tru\": \"Turoyo; Surayt\",\n    \"trv\": \"Sediq; Seediq; Taroko\",\n    \"trw\": \"Torwali\",\n    \"trx\": \"Tringgus-Sembaan Bidayuh\",\n    \"try\": \"Turung\",\n    \"trz\": \"Torá\",\n    \"ts\": \"Tsonga\",\n    \"tsa\": \"Tsaangi\",\n    \"tsb\": \"Tsamai\",\n    \"tsc\": \"Tswa\",\n    \"tsd\": \"Tsakonian\",\n    \"tse\": \"Tunisian Sign Language\",\n    \"tsg\": \"Tausug\",\n    \"tsh\": \"Tsuvan\",\n    \"tsi\": \"Tsimshian\",\n    \"tsj\": \"Tshangla\",\n    \"tsk\": \"Tseku\",\n    \"tsl\": \"Ts'ün-Lao\",\n    \"tsm\": \"Turkish Sign Language; Türk İşaret Dili\",\n    \"tsp\": \"Northern Toussian\",\n    \"tsq\": \"Thai Sign Language\",\n    \"tsr\": \"Akei\",\n    \"tss\": \"Taiwan Sign Language\",\n    \"tst\": \"Tondi Songway Kiini\",\n    \"tsu\": \"Tsou\",\n    \"tsv\": \"Tsogo\",\n    \"tsw\": \"Tsishingini\",\n    \"tsx\": \"Mubami\",\n    \"tsy\": \"Tebul Sign Language\",\n    \"tsz\": \"Purepecha\",\n    \"tt\": \"Tatar\",\n    \"tta\": \"Tutelo\",\n    \"ttb\": \"Gaa\",\n    \"ttc\": \"Tektiteko\",\n    \"ttd\": \"Tauade\",\n    \"tte\": \"Bwanabwana\",\n    \"ttf\": \"Tuotomb\",\n    \"ttg\": \"Tutong\",\n    \"tth\": \"Upper Ta'oih\",\n    \"tti\": \"Tobati\",\n    \"ttj\": \"Tooro\",\n    \"ttk\": \"Totoro\",\n    \"ttl\": \"Totela\",\n    \"ttm\": \"Northern Tutchone\",\n    \"ttn\": \"Towei\",\n    \"tto\": \"Lower Ta'oih\",\n    \"ttp\": \"Tombelala\",\n    \"ttq\": \"Tawallammat Tamajaq\",\n    \"ttr\": \"Tera\",\n    \"tts\": \"Northeastern Thai\",\n    \"ttt\": \"Muslim Tat\",\n    \"ttu\": \"Torau\",\n    \"ttv\": \"Titan\",\n    \"ttw\": \"Long Wat\",\n    \"tty\": \"Sikaritai\",\n    \"ttz\": \"Tsum\",\n    \"tua\": \"Wiarumus\",\n    \"tub\": \"Tübatulabal\",\n    \"tuc\": \"Mutu\",\n    \"tud\": \"Tuxá\",\n    \"tue\": \"Tuyuca\",\n    \"tuf\": \"Central 
Tunebo\",\n    \"tug\": \"Tunia\",\n    \"tuh\": \"Taulil\",\n    \"tui\": \"Tupuri\",\n    \"tuj\": \"Tugutil\",\n    \"tul\": \"Tula\",\n    \"tum\": \"Tumbuka\",\n    \"tun\": \"Tunica\",\n    \"tuo\": \"Tucano\",\n    \"tup\": \"Tupi languages\",\n    \"tuq\": \"Tedaga\",\n    \"tus\": \"Tuscarora\",\n    \"tut\": \"Altaic languages\",\n    \"tuu\": \"Tututni\",\n    \"tuv\": \"Turkana\",\n    \"tuw\": \"Tungus languages\",\n    \"tux\": \"Tuxináwa\",\n    \"tuy\": \"Tugen\",\n    \"tuz\": \"Turka\",\n    \"tva\": \"Vaghua\",\n    \"tvd\": \"Tsuvadi\",\n    \"tve\": \"Te'un\",\n    \"tvk\": \"Southeast Ambrym\",\n    \"tvl\": \"Tuvalu\",\n    \"tvm\": \"Tela-Masbuar\",\n    \"tvn\": \"Tavoyan\",\n    \"tvo\": \"Tidore\",\n    \"tvs\": \"Taveta\",\n    \"tvt\": \"Tutsa Naga\",\n    \"tvu\": \"Tunen\",\n    \"tvw\": \"Sedoa\",\n    \"tvx\": \"Taivoan\",\n    \"tvy\": \"Timor Pidgin\",\n    \"tw\": \"Twi\",\n    \"twa\": \"Twana\",\n    \"twb\": \"Western Tawbuid\",\n    \"twc\": \"Teshenawa\",\n    \"twd\": \"Twents\",\n    \"twe\": \"Tewa (Indonesia)\",\n    \"twf\": \"Northern Tiwa\",\n    \"twg\": \"Tereweng\",\n    \"twh\": \"Tai Dón\",\n    \"twl\": \"Tawara\",\n    \"twm\": \"Tawang Monpa\",\n    \"twn\": \"Twendi\",\n    \"two\": \"Tswapong\",\n    \"twp\": \"Ere\",\n    \"twq\": \"Tasawaq\",\n    \"twr\": \"Southwestern Tarahumara\",\n    \"twt\": \"Turiwára\",\n    \"twu\": \"Termanu\",\n    \"tww\": \"Tuwari\",\n    \"twx\": \"Tewe\",\n    \"twy\": \"Tawoyan\",\n    \"txa\": \"Tombonuo\",\n    \"txb\": \"Tokharian B\",\n    \"txc\": \"Tsetsaut\",\n    \"txe\": \"Totoli\",\n    \"txg\": \"Tangut\",\n    \"txh\": \"Thracian\",\n    \"txi\": \"Ikpeng\",\n    \"txj\": \"Tarjumo\",\n    \"txm\": \"Tomini\",\n    \"txn\": \"West Tarangan\",\n    \"txo\": \"Toto\",\n    \"txq\": \"Tii\",\n    \"txr\": \"Tartessian\",\n    \"txs\": \"Tonsea\",\n    \"txt\": \"Citak\",\n    \"txu\": \"Kayapó\",\n    \"txx\": \"Tatana\",\n    \"txy\": \"Tanosy Malagasy\",\n    
\"ty\": \"Tahitian\",\n    \"tya\": \"Tauya\",\n    \"tye\": \"Kyanga\",\n    \"tyh\": \"O'du\",\n    \"tyi\": \"Teke-Tsaayi\",\n    \"tyj\": \"Tai Do; Tai Yo\",\n    \"tyl\": \"Thu Lao\",\n    \"tyn\": \"Kombai\",\n    \"typ\": \"Thaypan\",\n    \"tyr\": \"Tai Daeng\",\n    \"tys\": \"Tày Sa Pa\",\n    \"tyt\": \"Tày Tac\",\n    \"tyu\": \"Kua\",\n    \"tyv\": \"Tuvinian\",\n    \"tyx\": \"Teke-Tyee\",\n    \"tyy\": \"Tiyaa\",\n    \"tyz\": \"Tày\",\n    \"tza\": \"Tanzanian Sign Language\",\n    \"tzh\": \"Tzeltal\",\n    \"tzj\": \"Tz'utujil\",\n    \"tzl\": \"Talossan\",\n    \"tzm\": \"Central Atlas Tamazight\",\n    \"tzn\": \"Tugun\",\n    \"tzo\": \"Tzotzil\",\n    \"tzx\": \"Tabriak\",\n    \"uam\": \"Uamué\",\n    \"uan\": \"Kuan\",\n    \"uar\": \"Tairuma\",\n    \"uba\": \"Ubang\",\n    \"ubi\": \"Ubi\",\n    \"ubl\": \"Buhi'non Bikol\",\n    \"ubr\": \"Ubir\",\n    \"ubu\": \"Umbu-Ungu\",\n    \"uby\": \"Ubykh\",\n    \"uda\": \"Uda\",\n    \"ude\": \"Udihe\",\n    \"udg\": \"Muduga\",\n    \"udi\": \"Udi\",\n    \"udj\": \"Ujir\",\n    \"udl\": \"Wuzlam\",\n    \"udm\": \"Udmurt\",\n    \"udu\": \"Uduk\",\n    \"ues\": \"Kioko\",\n    \"ufi\": \"Ufim\",\n    \"ug\": \"Uighur; Uyghur\",\n    \"uga\": \"Ugaritic\",\n    \"ugb\": \"Kuku-Ugbanh\",\n    \"uge\": \"Ughele\",\n    \"ugh\": \"Kubachi\",\n    \"ugn\": \"Ugandan Sign Language\",\n    \"ugo\": \"Ugong\",\n    \"ugy\": \"Uruguayan Sign Language\",\n    \"uha\": \"Uhami\",\n    \"uhn\": \"Damal\",\n    \"uis\": \"Uisai\",\n    \"uiv\": \"Iyive\",\n    \"uji\": \"Tanjijili\",\n    \"uk\": \"Ukrainian\",\n    \"uka\": \"Kaburi\",\n    \"ukg\": \"Ukuriguma\",\n    \"ukh\": \"Ukhwejo\",\n    \"uki\": \"Kui (India)\",\n    \"ukk\": \"Muak Sa-aak\",\n    \"ukl\": \"Ukrainian Sign Language\",\n    \"ukp\": \"Ukpe-Bayobiri\",\n    \"ukq\": \"Ukwa\",\n    \"uks\": \"Urubú-Kaapor Sign Language; Kaapor Sign Language\",\n    \"uku\": \"Ukue\",\n    \"ukv\": \"Kuku\",\n    \"ukw\": \"Ukwuani-Aboh-Ndoni\",\n    
\"uky\": \"Kuuk-Yak\",\n    \"ula\": \"Fungwa\",\n    \"ulb\": \"Ulukwumi\",\n    \"ulc\": \"Ulch\",\n    \"ule\": \"Lule\",\n    \"ulf\": \"Usku; Afra\",\n    \"uli\": \"Ulithian\",\n    \"ulk\": \"Meriam Mir\",\n    \"ull\": \"Ullatan\",\n    \"ulm\": \"Ulumanda'\",\n    \"uln\": \"Unserdeutsch\",\n    \"ulu\": \"Uma' Lung\",\n    \"ulw\": \"Ulwa\",\n    \"uma\": \"Umatilla\",\n    \"umb\": \"Umbundu\",\n    \"umc\": \"Marrucinian\",\n    \"umd\": \"Umbindhamu\",\n    \"umg\": \"Morrobalama; Umbuygamu\",\n    \"umi\": \"Ukit\",\n    \"umm\": \"Umon\",\n    \"umn\": \"Makyan Naga\",\n    \"umo\": \"Umotína\",\n    \"ump\": \"Umpila\",\n    \"umr\": \"Umbugarla\",\n    \"ums\": \"Pendau\",\n    \"umu\": \"Munsee\",\n    \"una\": \"North Watut\",\n    \"und\": \"Undetermined\",\n    \"une\": \"Uneme\",\n    \"ung\": \"Ngarinyin\",\n    \"uni\": \"Uni\",\n    \"unk\": \"Enawené-Nawé\",\n    \"unm\": \"Unami\",\n    \"unn\": \"Kurnai\",\n    \"unr\": \"Mundari\",\n    \"unu\": \"Unubahe\",\n    \"unx\": \"Munda\",\n    \"unz\": \"Unde Kaili\",\n    \"uon\": \"Kulon\",\n    \"upi\": \"Umeda\",\n    \"upv\": \"Uripiv-Wala-Rano-Atchin\",\n    \"ur\": \"Urdu\",\n    \"ura\": \"Urarina\",\n    \"urb\": \"Urubú-Kaapor; Kaapor\",\n    \"urc\": \"Urningangg\",\n    \"ure\": \"Uru\",\n    \"urf\": \"Uradhi\",\n    \"urg\": \"Urigina\",\n    \"urh\": \"Urhobo\",\n    \"uri\": \"Urim\",\n    \"urj\": \"Uralic languages\",\n    \"urk\": \"Urak Lawoi'\",\n    \"url\": \"Urali\",\n    \"urm\": \"Urapmin\",\n    \"urn\": \"Uruangnirin\",\n    \"uro\": \"Ura (Papua New Guinea)\",\n    \"urp\": \"Uru-Pa-In\",\n    \"urr\": \"Lehalurup; Löyöp\",\n    \"urt\": \"Urat\",\n    \"uru\": \"Urumi\",\n    \"urv\": \"Uruava\",\n    \"urw\": \"Sop\",\n    \"urx\": \"Urimo\",\n    \"ury\": \"Orya\",\n    \"urz\": \"Uru-Eu-Wau-Wau\",\n    \"usa\": \"Usarufa\",\n    \"ush\": \"Ushojo\",\n    \"usi\": \"Usui\",\n    \"usk\": \"Usaghade\",\n    \"usp\": \"Uspanteco\",\n    \"uss\": \"us-Saare\",\n   
 \"usu\": \"Uya\",\n    \"uta\": \"Otank\",\n    \"ute\": \"Ute-Southern Paiute\",\n    \"uth\": \"ut-Hun\",\n    \"utp\": \"Amba (Solomon Islands)\",\n    \"utr\": \"Etulo\",\n    \"utu\": \"Utu\",\n    \"uum\": \"Urum\",\n    \"uur\": \"Ura (Vanuatu)\",\n    \"uuu\": \"U\",\n    \"uve\": \"West Uvean; Fagauvea\",\n    \"uvh\": \"Uri\",\n    \"uvl\": \"Lote\",\n    \"uwa\": \"Kuku-Uwanh\",\n    \"uya\": \"Doko-Uyanga\",\n    \"uz\": \"Uzbek\",\n    \"uzn\": \"Northern Uzbek\",\n    \"uzs\": \"Southern Uzbek\",\n    \"vaa\": \"Vaagri Booli\",\n    \"vae\": \"Vale\",\n    \"vaf\": \"Vafsi\",\n    \"vag\": \"Vagla\",\n    \"vah\": \"Varhadi-Nagpuri\",\n    \"vai\": \"Vai\",\n    \"vaj\": \"Sekele; Northwestern ǃKung; Vasekele\",\n    \"val\": \"Vehes\",\n    \"vam\": \"Vanimo\",\n    \"van\": \"Valman\",\n    \"vao\": \"Vao\",\n    \"vap\": \"Vaiphei\",\n    \"var\": \"Huarijio\",\n    \"vas\": \"Vasavi\",\n    \"vau\": \"Vanuma\",\n    \"vav\": \"Varli\",\n    \"vay\": \"Wayu\",\n    \"vbb\": \"Southeast Babar\",\n    \"vbk\": \"Southwestern Bontok\",\n    \"ve\": \"Venda\",\n    \"vec\": \"Venetian\",\n    \"ved\": \"Veddah\",\n    \"vel\": \"Veluws\",\n    \"vem\": \"Vemgo-Mabas\",\n    \"veo\": \"Ventureño\",\n    \"vep\": \"Veps\",\n    \"ver\": \"Mom Jango\",\n    \"vgr\": \"Vaghri\",\n    \"vgt\": \"Vlaamse Gebarentaal; Flemish Sign Language\",\n    \"vi\": \"Vietnamese\",\n    \"vic\": \"Virgin Islands Creole English\",\n    \"vid\": \"Vidunda\",\n    \"vif\": \"Vili\",\n    \"vig\": \"Viemo\",\n    \"vil\": \"Vilela\",\n    \"vin\": \"Vinza\",\n    \"vis\": \"Vishavan\",\n    \"vit\": \"Viti\",\n    \"viv\": \"Iduna\",\n    \"vka\": \"Kariyarra\",\n    \"vkj\": \"Kujarge\",\n    \"vkk\": \"Kaur\",\n    \"vkl\": \"Kulisusu\",\n    \"vkm\": \"Kamakan\",\n    \"vkn\": \"Koro Nulu\",\n    \"vko\": \"Kodeoha\",\n    \"vkp\": \"Korlai Creole Portuguese\",\n    \"vkt\": \"Tenggarong Kutai Malay\",\n    \"vku\": \"Kurrama\",\n    \"vkz\": \"Koro Zuba\",\n    
\"vlp\": \"Valpei\",\n    \"vls\": \"Vlaams\",\n    \"vma\": \"Martuyhunira\",\n    \"vmb\": \"Barbaram\",\n    \"vmc\": \"Juxtlahuaca Mixtec\",\n    \"vmd\": \"Mudu Koraga\",\n    \"vme\": \"East Masela\",\n    \"vmf\": \"Mainfränkisch\",\n    \"vmg\": \"Lungalunga\",\n    \"vmh\": \"Maraghei\",\n    \"vmi\": \"Miwa\",\n    \"vmj\": \"Ixtayutla Mixtec\",\n    \"vmk\": \"Makhuwa-Shirima\",\n    \"vml\": \"Malgana\",\n    \"vmm\": \"Mitlatongo Mixtec\",\n    \"vmp\": \"Soyaltepec Mazatec\",\n    \"vmq\": \"Soyaltepec Mixtec\",\n    \"vmr\": \"Marenje\",\n    \"vms\": \"Moksela\",\n    \"vmu\": \"Muluridyi\",\n    \"vmv\": \"Valley Maidu\",\n    \"vmw\": \"Makhuwa\",\n    \"vmx\": \"Tamazola Mixtec\",\n    \"vmy\": \"Ayautla Mazatec\",\n    \"vmz\": \"Mazatlán Mazatec\",\n    \"vnk\": \"Vano; Lovono\",\n    \"vnm\": \"Vinmavis; Neve'ei\",\n    \"vnp\": \"Vunapu\",\n    \"vo\": \"Volapük\",\n    \"vor\": \"Voro\",\n    \"vot\": \"Votic\",\n    \"vra\": \"Vera'a\",\n    \"vro\": \"Võro\",\n    \"vrs\": \"Varisi\",\n    \"vrt\": \"Burmbar; Banam Bay\",\n    \"vsi\": \"Moldova Sign Language\",\n    \"vsl\": \"Venezuelan Sign Language\",\n    \"vsv\": \"Valencian Sign Language; Llengua de signes valenciana\",\n    \"vto\": \"Vitou\",\n    \"vum\": \"Vumbu\",\n    \"vun\": \"Vunjo\",\n    \"vut\": \"Vute\",\n    \"vwa\": \"Awa (China)\",\n    \"wa\": \"Walloon\",\n    \"waa\": \"Walla Walla\",\n    \"wab\": \"Wab\",\n    \"wac\": \"Wasco-Wishram\",\n    \"wad\": \"Wamesa; Wondama\",\n    \"wae\": \"Walser\",\n    \"waf\": \"Wakoná\",\n    \"wag\": \"Wa'ema\",\n    \"wah\": \"Watubela\",\n    \"wai\": \"Wares\",\n    \"waj\": \"Waffa\",\n    \"wak\": \"Wakashan languages\",\n    \"wal\": \"Wolaytta; Wolaitta\",\n    \"wam\": \"Wampanoag\",\n    \"wan\": \"Wan\",\n    \"wao\": \"Wappo\",\n    \"wap\": \"Wapishana\",\n    \"waq\": \"Wagiman\",\n    \"war\": \"Waray (Philippines)\",\n    \"was\": \"Washo\",\n    \"wat\": \"Kaninuwa\",\n    \"wau\": \"Waurá\",\n    \"wav\": 
\"Waka\",\n    \"waw\": \"Waiwai\",\n    \"wax\": \"Watam; Marangis\",\n    \"way\": \"Wayana\",\n    \"waz\": \"Wampur\",\n    \"wba\": \"Warao\",\n    \"wbb\": \"Wabo\",\n    \"wbe\": \"Waritai\",\n    \"wbf\": \"Wara\",\n    \"wbh\": \"Wanda\",\n    \"wbi\": \"Vwanji\",\n    \"wbj\": \"Alagwa\",\n    \"wbk\": \"Waigali\",\n    \"wbl\": \"Wakhi\",\n    \"wbm\": \"Wa\",\n    \"wbp\": \"Warlpiri\",\n    \"wbq\": \"Waddar\",\n    \"wbr\": \"Wagdi\",\n    \"wbs\": \"West Bengal Sign Language\",\n    \"wbt\": \"Warnman\",\n    \"wbv\": \"Wajarri\",\n    \"wbw\": \"Woi\",\n    \"wca\": \"Yanomámi\",\n    \"wci\": \"Waci Gbe\",\n    \"wdd\": \"Wandji\",\n    \"wdg\": \"Wadaginam\",\n    \"wdj\": \"Wadjiginy\",\n    \"wdk\": \"Wadikali\",\n    \"wdt\": \"Wendat\",\n    \"wdu\": \"Wadjigu\",\n    \"wdy\": \"Wadjabangayi\",\n    \"wea\": \"Wewaw\",\n    \"wec\": \"Wè Western\",\n    \"wed\": \"Wedau\",\n    \"weg\": \"Wergaia\",\n    \"weh\": \"Weh\",\n    \"wei\": \"Kiunum\",\n    \"wem\": \"Weme Gbe\",\n    \"wen\": \"Sorbian languages\",\n    \"weo\": \"Wemale\",\n    \"wep\": \"Westphalien\",\n    \"wer\": \"Weri\",\n    \"wes\": \"Cameroon Pidgin\",\n    \"wet\": \"Perai\",\n    \"weu\": \"Rawngtu Chin\",\n    \"wew\": \"Wejewa\",\n    \"wfg\": \"Yafi; Zorop\",\n    \"wga\": \"Wagaya\",\n    \"wgb\": \"Wagawaga\",\n    \"wgg\": \"Wangkangurru; Wangganguru\",\n    \"wgi\": \"Wahgi\",\n    \"wgo\": \"Waigeo\",\n    \"wgu\": \"Wirangu\",\n    \"wgy\": \"Warrgamay\",\n    \"wha\": \"Sou Upaa; Manusela\",\n    \"whg\": \"North Wahgi\",\n    \"whk\": \"Wahau Kenyah\",\n    \"whu\": \"Wahau Kayan\",\n    \"wib\": \"Southern Toussian\",\n    \"wic\": \"Wichita\",\n    \"wie\": \"Wik-Epa\",\n    \"wif\": \"Wik-Keyangan\",\n    \"wig\": \"Wik Ngathan\",\n    \"wih\": \"Wik-Me'anha\",\n    \"wii\": \"Minidien\",\n    \"wij\": \"Wik-Iiyanh\",\n    \"wik\": \"Wikalkan\",\n    \"wil\": \"Wilawila\",\n    \"wim\": \"Wik-Mungkan\",\n    \"win\": \"Ho-Chunk\",\n    \"wir\": 
\"Wiraféd\",\n    \"wiu\": \"Wiru\",\n    \"wiv\": \"Vitu\",\n    \"wiy\": \"Wiyot\",\n    \"wja\": \"Waja\",\n    \"wji\": \"Warji\",\n    \"wka\": \"Kw'adza\",\n    \"wkb\": \"Kumbaran\",\n    \"wkd\": \"Wakde; Mo\",\n    \"wkl\": \"Kalanadi\",\n    \"wkr\": \"Keerray-Woorroong\",\n    \"wku\": \"Kunduvadi\",\n    \"wkw\": \"Wakawaka\",\n    \"wky\": \"Wangkayutyuru\",\n    \"wla\": \"Walio\",\n    \"wlc\": \"Mwali Comorian\",\n    \"wle\": \"Wolane\",\n    \"wlg\": \"Kunbarlang\",\n    \"wlh\": \"Welaun\",\n    \"wli\": \"Waioli\",\n    \"wlk\": \"Wailaki\",\n    \"wll\": \"Wali (Sudan)\",\n    \"wlm\": \"Middle Welsh\",\n    \"wlo\": \"Wolio\",\n    \"wlr\": \"Wailapa\",\n    \"wls\": \"Wallisian\",\n    \"wlu\": \"Wuliwuli\",\n    \"wlv\": \"Wichí Lhamtés Vejoz\",\n    \"wlw\": \"Walak\",\n    \"wlx\": \"Wali (Ghana)\",\n    \"wly\": \"Waling\",\n    \"wma\": \"Mawa (Nigeria)\",\n    \"wmb\": \"Wambaya\",\n    \"wmc\": \"Wamas\",\n    \"wmd\": \"Mamaindé\",\n    \"wme\": \"Wambule\",\n    \"wmg\": \"Western Minyag\",\n    \"wmh\": \"Waima'a\",\n    \"wmi\": \"Wamin\",\n    \"wmm\": \"Maiwa (Indonesia)\",\n    \"wmn\": \"Waamwang\",\n    \"wmo\": \"Wom (Papua New Guinea)\",\n    \"wms\": \"Wambon\",\n    \"wmt\": \"Walmajarri\",\n    \"wmw\": \"Mwani\",\n    \"wmx\": \"Womo\",\n    \"wnb\": \"Wanambre\",\n    \"wnc\": \"Wantoat\",\n    \"wnd\": \"Wandarang\",\n    \"wne\": \"Waneci\",\n    \"wng\": \"Wanggom\",\n    \"wni\": \"Ndzwani Comorian\",\n    \"wnk\": \"Wanukaka\",\n    \"wnm\": \"Wanggamala\",\n    \"wnn\": \"Wunumara\",\n    \"wno\": \"Wano\",\n    \"wnp\": \"Wanap\",\n    \"wnu\": \"Usan\",\n    \"wnw\": \"Wintu\",\n    \"wny\": \"Wanyi; Waanyi\",\n    \"wo\": \"Wolof\",\n    \"woa\": \"Kuwema; Tyaraity\",\n    \"wob\": \"Wè Northern\",\n    \"woc\": \"Wogeo\",\n    \"wod\": \"Wolani\",\n    \"woe\": \"Woleaian\",\n    \"wof\": \"Gambian Wolof\",\n    \"wog\": \"Wogamusin\",\n    \"woi\": \"Kamang\",\n    \"wok\": \"Longto\",\n    \"wom\": \"Wom 
(Nigeria)\",\n    \"won\": \"Wongo\",\n    \"woo\": \"Manombai\",\n    \"wor\": \"Woria\",\n    \"wos\": \"Hanga Hundi\",\n    \"wow\": \"Wawonii\",\n    \"woy\": \"Weyto\",\n    \"wpc\": \"Maco\",\n    \"wrb\": \"Waluwarra; Warluwara\",\n    \"wrg\": \"Warungu; Gudjal\",\n    \"wrh\": \"Wiradjuri\",\n    \"wri\": \"Wariyangga\",\n    \"wrk\": \"Garrwa\",\n    \"wrl\": \"Warlmanpa\",\n    \"wrm\": \"Warumungu\",\n    \"wrn\": \"Warnang\",\n    \"wro\": \"Worrorra\",\n    \"wrp\": \"Waropen\",\n    \"wrr\": \"Wardaman\",\n    \"wrs\": \"Waris\",\n    \"wru\": \"Waru\",\n    \"wrv\": \"Waruna\",\n    \"wrw\": \"Gugu Warra\",\n    \"wrx\": \"Wae Rana\",\n    \"wry\": \"Merwari\",\n    \"wrz\": \"Waray (Australia)\",\n    \"wsa\": \"Warembori\",\n    \"wsg\": \"Adilabad Gondi\",\n    \"wsi\": \"Wusi\",\n    \"wsk\": \"Waskia\",\n    \"wsr\": \"Owenia\",\n    \"wss\": \"Wasa\",\n    \"wsu\": \"Wasu\",\n    \"wsv\": \"Wotapuri-Katarqalai\",\n    \"wtf\": \"Watiwa\",\n    \"wth\": \"Wathawurrung\",\n    \"wti\": \"Berta\",\n    \"wtk\": \"Watakataui\",\n    \"wtm\": \"Mewati\",\n    \"wtw\": \"Wotu\",\n    \"wua\": \"Wikngenchera\",\n    \"wub\": \"Wunambal\",\n    \"wud\": \"Wudu\",\n    \"wuh\": \"Wutunhua\",\n    \"wul\": \"Silimo\",\n    \"wum\": \"Wumbvu\",\n    \"wun\": \"Bungu\",\n    \"wur\": \"Wurrugu\",\n    \"wut\": \"Wutung\",\n    \"wuu\": \"Wu Chinese\",\n    \"wuv\": \"Wuvulu-Aua\",\n    \"wux\": \"Wulna\",\n    \"wuy\": \"Wauyai\",\n    \"wwa\": \"Waama\",\n    \"wwb\": \"Wakabunga\",\n    \"wwo\": \"Wetamut; Dorig\",\n    \"wwr\": \"Warrwa\",\n    \"www\": \"Wawa\",\n    \"wxa\": \"Waxianghua\",\n    \"wxw\": \"Wardandi\",\n    \"wyb\": \"Wangaaybuwan-Ngiyambaa\",\n    \"wyi\": \"Woiwurrung\",\n    \"wym\": \"Wymysorys\",\n    \"wyn\": \"Wyandot\",\n    \"wyr\": \"Wayoró\",\n    \"wyy\": \"Western Fijian\",\n    \"xaa\": \"Andalusian Arabic\",\n    \"xab\": \"Sambe\",\n    \"xac\": \"Kachari\",\n    \"xad\": \"Adai\",\n    \"xae\": \"Aequian\",\n    
\"xag\": \"Aghwan\",\n    \"xai\": \"Kaimbé\",\n    \"xaj\": \"Ararandewára\",\n    \"xak\": \"Máku\",\n    \"xal\": \"Kalmyk; Oirat\",\n    \"xam\": \"ǀXam\",\n    \"xan\": \"Xamtanga\",\n    \"xao\": \"Khao\",\n    \"xap\": \"Apalachee\",\n    \"xaq\": \"Aquitanian\",\n    \"xar\": \"Karami\",\n    \"xas\": \"Kamas\",\n    \"xat\": \"Katawixi\",\n    \"xau\": \"Kauwera\",\n    \"xav\": \"Xavánte\",\n    \"xaw\": \"Kawaiisu\",\n    \"xay\": \"Kayan Mahakam\",\n    \"xbb\": \"Lower Burdekin\",\n    \"xbc\": \"Bactrian\",\n    \"xbd\": \"Bindal\",\n    \"xbe\": \"Bigambal\",\n    \"xbg\": \"Bunganditj\",\n    \"xbi\": \"Kombio\",\n    \"xbj\": \"Birrpayi\",\n    \"xbm\": \"Middle Breton\",\n    \"xbn\": \"Kenaboi\",\n    \"xbo\": \"Bolgarian\",\n    \"xbp\": \"Bibbulman\",\n    \"xbr\": \"Kambera\",\n    \"xbw\": \"Kambiwá\",\n    \"xby\": \"Batjala; Batyala\",\n    \"xcb\": \"Cumbric\",\n    \"xcc\": \"Camunic\",\n    \"xce\": \"Celtiberian\",\n    \"xcg\": \"Cisalpine Gaulish\",\n    \"xch\": \"Chemakum; Chimakum\",\n    \"xcl\": \"Classical Armenian\",\n    \"xcm\": \"Comecrudo\",\n    \"xcn\": \"Cotoname\",\n    \"xco\": \"Chorasmian\",\n    \"xcr\": \"Carian\",\n    \"xct\": \"Classical Tibetan\",\n    \"xcu\": \"Curonian\",\n    \"xcv\": \"Chuvantsy\",\n    \"xcw\": \"Coahuilteco\",\n    \"xcy\": \"Cayuse\",\n    \"xda\": \"Darkinyung\",\n    \"xdc\": \"Dacian\",\n    \"xdk\": \"Dharuk\",\n    \"xdm\": \"Edomite\",\n    \"xdo\": \"Kwandu\",\n    \"xdq\": \"Kaitag\",\n    \"xdy\": \"Malayic Dayak\",\n    \"xeb\": \"Eblan\",\n    \"xed\": \"Hdi\",\n    \"xeg\": \"ǁXegwi\",\n    \"xel\": \"Kelo\",\n    \"xem\": \"Kembayan\",\n    \"xep\": \"Epi-Olmec\",\n    \"xer\": \"Xerénte\",\n    \"xes\": \"Kesawai\",\n    \"xet\": \"Xetá\",\n    \"xeu\": \"Keoru-Ahia\",\n    \"xfa\": \"Faliscan\",\n    \"xga\": \"Galatian\",\n    \"xgb\": \"Gbin\",\n    \"xgd\": \"Gudang\",\n    \"xgf\": \"Gabrielino-Fernandeño\",\n    \"xgg\": \"Goreng\",\n    \"xgi\": \"Garingbal\",\n    
\"xgl\": \"Galindan\",\n    \"xgm\": \"Dharumbal; Guwinmal\",\n    \"xgn\": \"Mongolian languages\",\n    \"xgr\": \"Garza\",\n    \"xgu\": \"Unggumi\",\n    \"xgw\": \"Guwa\",\n    \"xh\": \"Xhosa\",\n    \"xha\": \"Harami\",\n    \"xhc\": \"Hunnic\",\n    \"xhd\": \"Hadrami\",\n    \"xhe\": \"Khetrani\",\n    \"xhm\": \"Middle Khmer (1400 to 1850 CE)\",\n    \"xhr\": \"Hernican\",\n    \"xht\": \"Hattic\",\n    \"xhu\": \"Hurrian\",\n    \"xhv\": \"Khua\",\n    \"xib\": \"Iberian\",\n    \"xii\": \"Xiri\",\n    \"xil\": \"Illyrian\",\n    \"xin\": \"Xinca\",\n    \"xir\": \"Xiriâna\",\n    \"xis\": \"Kisan\",\n    \"xiv\": \"Indus Valley Language\",\n    \"xiy\": \"Xipaya\",\n    \"xjb\": \"Minjungbal\",\n    \"xjt\": \"Jaitmatang\",\n    \"xka\": \"Kalkoti\",\n    \"xkb\": \"Northern Nago\",\n    \"xkc\": \"Kho'ini\",\n    \"xkd\": \"Mendalam Kayan\",\n    \"xke\": \"Kereho\",\n    \"xkf\": \"Khengkha\",\n    \"xkg\": \"Kagoro\",\n    \"xki\": \"Kenyan Sign Language\",\n    \"xkj\": \"Kajali\",\n    \"xkk\": \"Kachok; Kaco'\",\n    \"xkl\": \"Mainstream Kenyah\",\n    \"xkn\": \"Kayan River Kayan\",\n    \"xko\": \"Kiorr\",\n    \"xkp\": \"Kabatei\",\n    \"xkq\": \"Koroni\",\n    \"xkr\": \"Xakriabá\",\n    \"xks\": \"Kumbewaha\",\n    \"xkt\": \"Kantosi\",\n    \"xku\": \"Kaamba\",\n    \"xkv\": \"Kgalagadi\",\n    \"xkw\": \"Kembra\",\n    \"xkx\": \"Karore\",\n    \"xky\": \"Uma' Lasan\",\n    \"xkz\": \"Kurtokha\",\n    \"xla\": \"Kamula\",\n    \"xlb\": \"Loup B\",\n    \"xlc\": \"Lycian\",\n    \"xld\": \"Lydian\",\n    \"xle\": \"Lemnian\",\n    \"xlg\": \"Ligurian (Ancient)\",\n    \"xli\": \"Liburnian\",\n    \"xln\": \"Alanic\",\n    \"xlo\": \"Loup A\",\n    \"xlp\": \"Lepontic\",\n    \"xls\": \"Lusitanian\",\n    \"xlu\": \"Cuneiform Luwian\",\n    \"xly\": \"Elymian\",\n    \"xma\": \"Mushungulu\",\n    \"xmb\": \"Mbonga\",\n    \"xmc\": \"Makhuwa-Marrevone\",\n    \"xmd\": \"Mbudum\",\n    \"xme\": \"Median\",\n    \"xmf\": \"Mingrelian\",\n    
\"xmg\": \"Mengaka\",\n    \"xmh\": \"Kugu-Muminh\",\n    \"xmj\": \"Majera\",\n    \"xmk\": \"Ancient Macedonian\",\n    \"xml\": \"Malaysian Sign Language\",\n    \"xmm\": \"Manado Malay\",\n    \"xmn\": \"Manichaean Middle Persian\",\n    \"xmo\": \"Morerebi\",\n    \"xmp\": \"Kuku-Mu'inh\",\n    \"xmq\": \"Kuku-Mangk\",\n    \"xmr\": \"Meroitic\",\n    \"xms\": \"Moroccan Sign Language\",\n    \"xmt\": \"Matbat\",\n    \"xmu\": \"Kamu\",\n    \"xmv\": \"Antankarana Malagasy; Tankarana Malagasy\",\n    \"xmw\": \"Tsimihety Malagasy\",\n    \"xmx\": \"Salawati; Maden\",\n    \"xmy\": \"Mayaguduna\",\n    \"xmz\": \"Mori Bawah\",\n    \"xna\": \"Ancient North Arabian\",\n    \"xnb\": \"Kanakanabu\",\n    \"xnd\": \"Na-Dene languages\",\n    \"xng\": \"Middle Mongolian\",\n    \"xnh\": \"Kuanhua\",\n    \"xni\": \"Ngarigu\",\n    \"xnj\": \"Ngoni (Tanzania)\",\n    \"xnk\": \"Nganakarti\",\n    \"xnm\": \"Ngumbarl\",\n    \"xnn\": \"Northern Kankanay\",\n    \"xno\": \"Anglo-Norman\",\n    \"xnq\": \"Ngoni (Mozambique)\",\n    \"xnr\": \"Kangri\",\n    \"xns\": \"Kanashi\",\n    \"xnt\": \"Narragansett\",\n    \"xnu\": \"Nukunul\",\n    \"xny\": \"Nyiyaparli\",\n    \"xnz\": \"Kenzi; Mattoki\",\n    \"xoc\": \"O'chi'chi'\",\n    \"xod\": \"Kokoda\",\n    \"xog\": \"Soga\",\n    \"xoi\": \"Kominimung\",\n    \"xok\": \"Xokleng\",\n    \"xom\": \"Komo (Sudan)\",\n    \"xon\": \"Konkomba\",\n    \"xoo\": \"Xukurú\",\n    \"xop\": \"Kopar\",\n    \"xor\": \"Korubo\",\n    \"xow\": \"Kowaki\",\n    \"xpa\": \"Pirriya\",\n    \"xpb\": \"Northeastern Tasmanian; Pyemmairrener\",\n    \"xpc\": \"Pecheneg\",\n    \"xpd\": \"Oyster Bay Tasmanian\",\n    \"xpe\": \"Liberia Kpelle\",\n    \"xpf\": \"Southeast Tasmanian; Nuenonne\",\n    \"xpg\": \"Phrygian\",\n    \"xph\": \"North Midlands Tasmanian; Tyerrenoterpanner\",\n    \"xpi\": \"Pictish\",\n    \"xpj\": \"Mpalitjanh\",\n    \"xpk\": \"Kulina Pano\",\n    \"xpl\": \"Port Sorell Tasmanian\",\n    \"xpm\": \"Pumpokol\",\n  
  \"xpn\": \"Kapinawá\",\n    \"xpo\": \"Pochutec\",\n    \"xpp\": \"Puyo-Paekche\",\n    \"xpq\": \"Mohegan-Pequot\",\n    \"xpr\": \"Parthian\",\n    \"xps\": \"Pisidian\",\n    \"xpt\": \"Punthamara\",\n    \"xpu\": \"Punic\",\n    \"xpv\": \"Northern Tasmanian; Tommeginne\",\n    \"xpw\": \"Northwestern Tasmanian; Peerapper\",\n    \"xpx\": \"Southwestern Tasmanian; Toogee\",\n    \"xpy\": \"Puyo\",\n    \"xpz\": \"Bruny Island Tasmanian\",\n    \"xqa\": \"Karakhanid\",\n    \"xqt\": \"Qatabanian\",\n    \"xra\": \"Krahô\",\n    \"xrb\": \"Eastern Karaboro\",\n    \"xrd\": \"Gundungurra\",\n    \"xre\": \"Kreye\",\n    \"xrg\": \"Minang\",\n    \"xri\": \"Krikati-Timbira\",\n    \"xrm\": \"Armazic\",\n    \"xrn\": \"Arin\",\n    \"xrr\": \"Raetic\",\n    \"xrt\": \"Aranama-Tamique\",\n    \"xru\": \"Marriammu\",\n    \"xrw\": \"Karawa\",\n    \"xsa\": \"Sabaean\",\n    \"xsb\": \"Sambal\",\n    \"xsc\": \"Scythian\",\n    \"xsd\": \"Sidetic\",\n    \"xse\": \"Sempan\",\n    \"xsh\": \"Shamang\",\n    \"xsi\": \"Sio\",\n    \"xsj\": \"Subi\",\n    \"xsl\": \"South Slavey\",\n    \"xsm\": \"Kasem\",\n    \"xsn\": \"Sanga (Nigeria)\",\n    \"xso\": \"Solano\",\n    \"xsp\": \"Silopi\",\n    \"xsq\": \"Makhuwa-Saka\",\n    \"xsr\": \"Sherpa\",\n    \"xss\": \"Assan\",\n    \"xsu\": \"Sanumá\",\n    \"xsv\": \"Sudovian\",\n    \"xsy\": \"Saisiyat\",\n    \"xta\": \"Alcozauca Mixtec\",\n    \"xtb\": \"Chazumba Mixtec\",\n    \"xtc\": \"Katcha-Kadugli-Miri\",\n    \"xtd\": \"Diuxi-Tilantongo Mixtec\",\n    \"xte\": \"Ketengban\",\n    \"xtg\": \"Transalpine Gaulish\",\n    \"xth\": \"Yitha Yitha\",\n    \"xti\": \"Sinicahua Mixtec\",\n    \"xtj\": \"San Juan Teita Mixtec\",\n    \"xtl\": \"Tijaltepec Mixtec\",\n    \"xtm\": \"Magdalena Peñasco Mixtec\",\n    \"xtn\": \"Northern Tlaxiaco Mixtec\",\n    \"xto\": \"Tokharian A\",\n    \"xtp\": \"San Miguel Piedras Mixtec\",\n    \"xtq\": \"Tumshuqese\",\n    \"xtr\": \"Early Tripuri\",\n    \"xts\": \"Sindihui 
Mixtec\",\n    \"xtt\": \"Tacahua Mixtec\",\n    \"xtu\": \"Cuyamecalco Mixtec\",\n    \"xtv\": \"Thawa\",\n    \"xtw\": \"Tawandê\",\n    \"xty\": \"Yoloxochitl Mixtec\",\n    \"xua\": \"Alu Kurumba\",\n    \"xub\": \"Betta Kurumba\",\n    \"xud\": \"Umiida\",\n    \"xug\": \"Kunigami\",\n    \"xuj\": \"Jennu Kurumba\",\n    \"xul\": \"Ngunawal; Nunukul\",\n    \"xum\": \"Umbrian\",\n    \"xun\": \"Unggaranggu\",\n    \"xuo\": \"Kuo\",\n    \"xup\": \"Upper Umpqua\",\n    \"xur\": \"Urartian\",\n    \"xut\": \"Kuthant\",\n    \"xuu\": \"Kxoe; Khwedam\",\n    \"xve\": \"Venetic\",\n    \"xvi\": \"Kamviri\",\n    \"xvn\": \"Vandalic\",\n    \"xvo\": \"Volscian\",\n    \"xvs\": \"Vestinian\",\n    \"xwa\": \"Kwaza\",\n    \"xwc\": \"Woccon\",\n    \"xwd\": \"Wadi Wadi\",\n    \"xwe\": \"Xwela Gbe\",\n    \"xwg\": \"Kwegu\",\n    \"xwj\": \"Wajuk\",\n    \"xwk\": \"Wangkumara\",\n    \"xwl\": \"Western Xwla Gbe\",\n    \"xwo\": \"Written Oirat\",\n    \"xwr\": \"Kwerba Mamberamo\",\n    \"xwt\": \"Wotjobaluk\",\n    \"xww\": \"Wemba Wemba\",\n    \"xxb\": \"Boro (Ghana)\",\n    \"xxk\": \"Ke'o\",\n    \"xxm\": \"Minkin\",\n    \"xxr\": \"Koropó\",\n    \"xxt\": \"Tambora\",\n    \"xya\": \"Yaygir\",\n    \"xyb\": \"Yandjibara\",\n    \"xyj\": \"Mayi-Yapi\",\n    \"xyk\": \"Mayi-Kulan\",\n    \"xyl\": \"Yalakalore\",\n    \"xyt\": \"Mayi-Thakurti\",\n    \"xyy\": \"Yorta Yorta\",\n    \"xzh\": \"Zhang-Zhung\",\n    \"xzm\": \"Zemgalian\",\n    \"xzp\": \"Ancient Zapotec\",\n    \"yaa\": \"Yaminahua\",\n    \"yab\": \"Yuhup\",\n    \"yac\": \"Pass Valley Yali\",\n    \"yad\": \"Yagua\",\n    \"yae\": \"Pumé\",\n    \"yaf\": \"Yaka (Democratic Republic of Congo)\",\n    \"yag\": \"Yámana\",\n    \"yah\": \"Yazgulyam\",\n    \"yai\": \"Yagnobi\",\n    \"yaj\": \"Banda-Yangere\",\n    \"yak\": \"Yakama\",\n    \"yal\": \"Yalunka\",\n    \"yam\": \"Yamba\",\n    \"yan\": \"Mayangna\",\n    \"yao\": \"Yao\",\n    \"yap\": \"Yapese\",\n    \"yaq\": \"Yaqui\",\n    \"yar\": 
\"Yabarana\",\n    \"yas\": \"Nugunu (Cameroon)\",\n    \"yat\": \"Yambeta\",\n    \"yau\": \"Yuwana\",\n    \"yav\": \"Yangben\",\n    \"yaw\": \"Yawalapití\",\n    \"yax\": \"Yauma\",\n    \"yay\": \"Agwagwune\",\n    \"yaz\": \"Lokaa\",\n    \"yba\": \"Yala\",\n    \"ybb\": \"Yemba\",\n    \"ybe\": \"West Yugur\",\n    \"ybh\": \"Yakha\",\n    \"ybi\": \"Yamphu\",\n    \"ybj\": \"Hasha\",\n    \"ybk\": \"Bokha\",\n    \"ybl\": \"Yukuben\",\n    \"ybm\": \"Yaben\",\n    \"ybn\": \"Yabaâna\",\n    \"ybo\": \"Yabong\",\n    \"ybx\": \"Yawiyo\",\n    \"yby\": \"Yaweyuha\",\n    \"ych\": \"Chesu\",\n    \"ycl\": \"Lolopo\",\n    \"ycn\": \"Yucuna\",\n    \"ycp\": \"Chepya\",\n    \"yda\": \"Yanda\",\n    \"ydd\": \"Eastern Yiddish\",\n    \"yde\": \"Yangum Dey\",\n    \"ydg\": \"Yidgha\",\n    \"ydk\": \"Yoidik\",\n    \"yea\": \"Ravula\",\n    \"yec\": \"Yeniche\",\n    \"yee\": \"Yimas\",\n    \"yei\": \"Yeni\",\n    \"yej\": \"Yevanic\",\n    \"yel\": \"Yela\",\n    \"yer\": \"Tarok\",\n    \"yes\": \"Nyankpa\",\n    \"yet\": \"Yetfa\",\n    \"yeu\": \"Yerukula\",\n    \"yev\": \"Yapunda\",\n    \"yey\": \"Yeyi\",\n    \"yga\": \"Malyangapa\",\n    \"ygi\": \"Yiningayi\",\n    \"ygl\": \"Yangum Gel\",\n    \"ygm\": \"Yagomi\",\n    \"ygp\": \"Gepo\",\n    \"ygr\": \"Yagaria\",\n    \"ygs\": \"Yolŋu Sign Language\",\n    \"ygu\": \"Yugul\",\n    \"ygw\": \"Yagwoia\",\n    \"yha\": \"Baha Buyang\",\n    \"yhd\": \"Judeo-Iraqi Arabic\",\n    \"yhl\": \"Hlepho Phowa\",\n    \"yhs\": \"Yan-nhaŋu Sign Language\",\n    \"yi\": \"Yiddish\",\n    \"yia\": \"Yinggarda\",\n    \"yif\": \"Ache\",\n    \"yig\": \"Wusa Nasu\",\n    \"yih\": \"Western Yiddish\",\n    \"yii\": \"Yidiny\",\n    \"yij\": \"Yindjibarndi\",\n    \"yik\": \"Dongshanba Lalo\",\n    \"yil\": \"Yindjilandji\",\n    \"yim\": \"Yimchungru Naga\",\n    \"yin\": \"Riang Lai; Yinchia\",\n    \"yip\": \"Pholo\",\n    \"yiq\": \"Miqie\",\n    \"yir\": \"North Awyu\",\n    \"yis\": \"Yis\",\n    \"yit\": 
\"Eastern Lalu\",\n    \"yiu\": \"Awu\",\n    \"yiv\": \"Northern Nisu\",\n    \"yix\": \"Axi Yi\",\n    \"yiz\": \"Azhe\",\n    \"yka\": \"Yakan\",\n    \"ykg\": \"Northern Yukaghir\",\n    \"yki\": \"Yoke\",\n    \"ykk\": \"Yakaikeke\",\n    \"ykl\": \"Khlula\",\n    \"ykm\": \"Kap\",\n    \"ykn\": \"Kua-nsi\",\n    \"yko\": \"Yasa\",\n    \"ykr\": \"Yekora\",\n    \"ykt\": \"Kathu\",\n    \"yku\": \"Kuamasi\",\n    \"yky\": \"Yakoma\",\n    \"yla\": \"Yaul\",\n    \"ylb\": \"Yaleba\",\n    \"yle\": \"Yele\",\n    \"ylg\": \"Yelogu\",\n    \"yli\": \"Angguruk Yali\",\n    \"yll\": \"Yil\",\n    \"ylm\": \"Limi\",\n    \"yln\": \"Langnian Buyang\",\n    \"ylo\": \"Naluo Yi\",\n    \"ylr\": \"Yalarnnga\",\n    \"ylu\": \"Aribwaung\",\n    \"yly\": \"Nyâlayu; Nyelâyu\",\n    \"ymb\": \"Yambes\",\n    \"ymc\": \"Southern Muji\",\n    \"ymd\": \"Muda\",\n    \"yme\": \"Yameo\",\n    \"ymg\": \"Yamongeri\",\n    \"ymh\": \"Mili\",\n    \"ymi\": \"Moji\",\n    \"ymk\": \"Makwe\",\n    \"yml\": \"Iamalele\",\n    \"ymm\": \"Maay\",\n    \"ymn\": \"Yamna; Sunum\",\n    \"ymo\": \"Yangum Mon\",\n    \"ymp\": \"Yamap\",\n    \"ymq\": \"Qila Muji\",\n    \"ymr\": \"Malasar\",\n    \"yms\": \"Mysian\",\n    \"ymx\": \"Northern Muji\",\n    \"ymz\": \"Muzi\",\n    \"yna\": \"Aluo\",\n    \"ynd\": \"Yandruwandha\",\n    \"yne\": \"Lang'e\",\n    \"yng\": \"Yango\",\n    \"ynk\": \"Naukan Yupik\",\n    \"ynl\": \"Yangulam\",\n    \"ynn\": \"Yana\",\n    \"yno\": \"Yong\",\n    \"ynq\": \"Yendang\",\n    \"yns\": \"Yansi\",\n    \"ynu\": \"Yahuna\",\n    \"yo\": \"Yoruba\",\n    \"yob\": \"Yoba\",\n    \"yog\": \"Yogad\",\n    \"yoi\": \"Yonaguni\",\n    \"yok\": \"Yokuts\",\n    \"yol\": \"Yola\",\n    \"yom\": \"Yombe\",\n    \"yon\": \"Yongkom\",\n    \"yot\": \"Yotti\",\n    \"yox\": \"Yoron\",\n    \"yoy\": \"Yoy\",\n    \"ypa\": \"Phala\",\n    \"ypb\": \"Labo Phowa\",\n    \"ypg\": \"Phola\",\n    \"yph\": \"Phupha\",\n    \"ypk\": \"Yupik languages\",\n    \"ypm\": 
\"Phuma\",\n    \"ypn\": \"Ani Phowa\",\n    \"ypo\": \"Alo Phola\",\n    \"ypp\": \"Phupa\",\n    \"ypz\": \"Phuza\",\n    \"yra\": \"Yerakai\",\n    \"yrb\": \"Yareba\",\n    \"yre\": \"Yaouré\",\n    \"yrk\": \"Nenets\",\n    \"yrl\": \"Nhengatu\",\n    \"yrm\": \"Yirrk-Mel\",\n    \"yrn\": \"Yerong\",\n    \"yro\": \"Yaroamë\",\n    \"yrs\": \"Yarsun\",\n    \"yrw\": \"Yarawata\",\n    \"yry\": \"Yarluyandi\",\n    \"ysc\": \"Yassic\",\n    \"ysd\": \"Samatao\",\n    \"ysg\": \"Sonaga\",\n    \"ysl\": \"Yugoslavian Sign Language\",\n    \"ysm\": \"Myanmar Sign Language\",\n    \"ysn\": \"Sani\",\n    \"yso\": \"Nisi (China)\",\n    \"ysp\": \"Southern Lolopo\",\n    \"ysr\": \"Sirenik Yupik\",\n    \"yss\": \"Yessan-Mayo\",\n    \"ysy\": \"Sanie\",\n    \"yta\": \"Talu\",\n    \"ytl\": \"Tanglang\",\n    \"ytp\": \"Thopho\",\n    \"ytw\": \"Yout Wam\",\n    \"yty\": \"Yatay\",\n    \"yua\": \"Yucateco; Yucatec Maya\",\n    \"yub\": \"Yugambal\",\n    \"yuc\": \"Yuchi\",\n    \"yud\": \"Judeo-Tripolitanian Arabic\",\n    \"yue\": \"Yue Chinese; Cantonese\",\n    \"yuf\": \"Havasupai-Walapai-Yavapai\",\n    \"yug\": \"Yug\",\n    \"yui\": \"Yurutí\",\n    \"yuj\": \"Karkar-Yuri\",\n    \"yuk\": \"Yuki\",\n    \"yul\": \"Yulu\",\n    \"yum\": \"Quechan\",\n    \"yun\": \"Bena (Nigeria)\",\n    \"yup\": \"Yukpa\",\n    \"yuq\": \"Yuqui\",\n    \"yur\": \"Yurok\",\n    \"yut\": \"Yopno\",\n    \"yuw\": \"Yau (Morobe Province)\",\n    \"yux\": \"Southern Yukaghir\",\n    \"yuy\": \"East Yugur\",\n    \"yuz\": \"Yuracare\",\n    \"yva\": \"Yawa\",\n    \"yvt\": \"Yavitero\",\n    \"ywa\": \"Kalou\",\n    \"ywg\": \"Yinhawangka\",\n    \"ywl\": \"Western Lalu\",\n    \"ywn\": \"Yawanawa\",\n    \"ywq\": \"Wuding-Luquan Yi\",\n    \"ywr\": \"Yawuru\",\n    \"ywt\": \"Xishanba Lalo; Central Lalo\",\n    \"ywu\": \"Wumeng Nasu\",\n    \"yww\": \"Yawarawarga\",\n    \"yxa\": \"Mayawali\",\n    \"yxg\": \"Yagara\",\n    \"yxl\": \"Yardliyawarra\",\n    \"yxm\": 
\"Yinwum\",\n    \"yxu\": \"Yuyu\",\n    \"yxy\": \"Yabula Yabula\",\n    \"yyr\": \"Yir Yoront\",\n    \"yyu\": \"Yau (Sandaun Province)\",\n    \"yyz\": \"Ayizi\",\n    \"yzg\": \"E'ma Buyang\",\n    \"yzk\": \"Zokhuo\",\n    \"za\": \"Zhuang; Chuang\",\n    \"zaa\": \"Sierra de Juárez Zapotec\",\n    \"zab\": \"Western Tlacolula Valley Zapotec; San Juan Guelavía Zapotec\",\n    \"zac\": \"Ocotlán Zapotec\",\n    \"zad\": \"Cajonos Zapotec\",\n    \"zae\": \"Yareni Zapotec\",\n    \"zaf\": \"Ayoquesco Zapotec\",\n    \"zag\": \"Zaghawa\",\n    \"zah\": \"Zangwal\",\n    \"zai\": \"Isthmus Zapotec\",\n    \"zaj\": \"Zaramo\",\n    \"zak\": \"Zanaki\",\n    \"zal\": \"Zauzou\",\n    \"zam\": \"Miahuatlán Zapotec\",\n    \"zao\": \"Ozolotepec Zapotec\",\n    \"zap\": \"Zapotec\",\n    \"zaq\": \"Aloápam Zapotec\",\n    \"zar\": \"Rincón Zapotec\",\n    \"zas\": \"Santo Domingo Albarradas Zapotec\",\n    \"zat\": \"Tabaa Zapotec\",\n    \"zau\": \"Zangskari\",\n    \"zav\": \"Yatzachi Zapotec\",\n    \"zaw\": \"Mitla Zapotec\",\n    \"zax\": \"Xadani Zapotec\",\n    \"zay\": \"Zayse-Zergulla; Zaysete\",\n    \"zaz\": \"Zari\",\n    \"zba\": \"Balaibalan\",\n    \"zbc\": \"Central Berawan\",\n    \"zbe\": \"East Berawan\",\n    \"zbl\": \"Blissymbols; Bliss; Blissymbolics\",\n    \"zbt\": \"Batui\",\n    \"zbu\": \"Bu (Bauchi State)\",\n    \"zbw\": \"West Berawan\",\n    \"zca\": \"Coatecas Altas Zapotec\",\n    \"zcd\": \"Las Delicias Zapotec\",\n    \"zch\": \"Central Hongshuihe Zhuang\",\n    \"zdj\": \"Ngazidja Comorian\",\n    \"zea\": \"Zeeuws\",\n    \"zeg\": \"Zenag\",\n    \"zeh\": \"Eastern Hongshuihe Zhuang\",\n    \"zen\": \"Zenaga\",\n    \"zga\": \"Kinga\",\n    \"zgb\": \"Guibei Zhuang\",\n    \"zgh\": \"Standard Moroccan Tamazight\",\n    \"zgm\": \"Minz Zhuang\",\n    \"zgn\": \"Guibian Zhuang\",\n    \"zgr\": \"Magori\",\n    \"zh\": \"Chinese\",\n    \"zhb\": \"Zhaba\",\n    \"zhd\": \"Dai Zhuang\",\n    \"zhi\": \"Zhire\",\n    \"zhn\": \"Nong 
Zhuang\",\n    \"zhw\": \"Zhoa\",\n    \"zhx\": \"Chinese (family)\",\n    \"zia\": \"Zia\",\n    \"zib\": \"Zimbabwe Sign Language\",\n    \"zik\": \"Zimakani\",\n    \"zil\": \"Zialo\",\n    \"zim\": \"Mesme\",\n    \"zin\": \"Zinza\",\n    \"ziw\": \"Zigula\",\n    \"ziz\": \"Zizilivakan\",\n    \"zka\": \"Kaimbulawa\",\n    \"zkb\": \"Koibal\",\n    \"zkd\": \"Kadu\",\n    \"zkg\": \"Koguryo\",\n    \"zkh\": \"Khorezmian\",\n    \"zkk\": \"Karankawa\",\n    \"zkn\": \"Kanan\",\n    \"zko\": \"Kott\",\n    \"zkp\": \"São Paulo Kaingáng\",\n    \"zkr\": \"Zakhring\",\n    \"zkt\": \"Kitan\",\n    \"zku\": \"Kaurna\",\n    \"zkv\": \"Krevinian\",\n    \"zkz\": \"Khazar\",\n    \"zla\": \"Zula\",\n    \"zle\": \"East Slavic languages\",\n    \"zlj\": \"Liujiang Zhuang\",\n    \"zlm\": \"Malay (individual language)\",\n    \"zln\": \"Lianshan Zhuang\",\n    \"zlq\": \"Liuqian Zhuang\",\n    \"zls\": \"South Slavic languages\",\n    \"zlw\": \"West Slavic languages\",\n    \"zma\": \"Manda (Australia)\",\n    \"zmb\": \"Zimba\",\n    \"zmc\": \"Margany\",\n    \"zmd\": \"Maridan\",\n    \"zme\": \"Mangerr\",\n    \"zmf\": \"Mfinu\",\n    \"zmg\": \"Marti Ke\",\n    \"zmh\": \"Makolkol\",\n    \"zmi\": \"Negeri Sembilan Malay\",\n    \"zmj\": \"Maridjabin\",\n    \"zmk\": \"Mandandanyi\",\n    \"zml\": \"Matngala\",\n    \"zmm\": \"Marimanindji; Marramaninyshi\",\n    \"zmn\": \"Mbangwe\",\n    \"zmo\": \"Molo\",\n    \"zmp\": \"Mpuono\",\n    \"zmq\": \"Mituku\",\n    \"zmr\": \"Maranunggu\",\n    \"zms\": \"Mbesa\",\n    \"zmt\": \"Maringarr\",\n    \"zmu\": \"Muruwari\",\n    \"zmv\": \"Mbariman-Gudhinma\",\n    \"zmw\": \"Mbo (Democratic Republic of Congo)\",\n    \"zmx\": \"Bomitaba\",\n    \"zmy\": \"Mariyedi\",\n    \"zmz\": \"Mbandja\",\n    \"zna\": \"Zan Gula\",\n    \"znd\": \"Zande languages\",\n    \"zne\": \"Zande (individual language)\",\n    \"zng\": \"Mang\",\n    \"znk\": \"Manangkari\",\n    \"zns\": \"Mangas\",\n    \"zoc\": \"Copainalá Zoque\",\n  
  \"zoh\": \"Chimalapa Zoque\",\n    \"zom\": \"Zou\",\n    \"zoo\": \"Asunción Mixtepec Zapotec\",\n    \"zoq\": \"Tabasco Zoque\",\n    \"zor\": \"Rayón Zoque\",\n    \"zos\": \"Francisco León Zoque\",\n    \"zpa\": \"Lachiguiri Zapotec\",\n    \"zpb\": \"Yautepec Zapotec\",\n    \"zpc\": \"Choapan Zapotec\",\n    \"zpd\": \"Southeastern Ixtlán Zapotec\",\n    \"zpe\": \"Petapa Zapotec\",\n    \"zpf\": \"San Pedro Quiatoni Zapotec\",\n    \"zpg\": \"Guevea De Humboldt Zapotec\",\n    \"zph\": \"Totomachapan Zapotec\",\n    \"zpi\": \"Santa María Quiegolani Zapotec\",\n    \"zpj\": \"Quiavicuzas Zapotec\",\n    \"zpk\": \"Tlacolulita Zapotec\",\n    \"zpl\": \"Lachixío Zapotec\",\n    \"zpm\": \"Mixtepec Zapotec\",\n    \"zpn\": \"Santa Inés Yatzechi Zapotec\",\n    \"zpo\": \"Amatlán Zapotec\",\n    \"zpp\": \"El Alto Zapotec\",\n    \"zpq\": \"Zoogocho Zapotec\",\n    \"zpr\": \"Santiago Xanica Zapotec\",\n    \"zps\": \"Coatlán Zapotec\",\n    \"zpt\": \"San Vicente Coatlán Zapotec\",\n    \"zpu\": \"Yalálag Zapotec\",\n    \"zpv\": \"Chichicapan Zapotec\",\n    \"zpw\": \"Zaniza Zapotec\",\n    \"zpx\": \"San Baltazar Loxicha Zapotec\",\n    \"zpy\": \"Mazaltepec Zapotec\",\n    \"zpz\": \"Texmelucan Zapotec\",\n    \"zqe\": \"Qiubei Zhuang\",\n    \"zra\": \"Kara (Korea)\",\n    \"zrg\": \"Mirgan\",\n    \"zrn\": \"Zerenkel\",\n    \"zro\": \"Záparo\",\n    \"zrp\": \"Zarphatic\",\n    \"zrs\": \"Mairasi\",\n    \"zsa\": \"Sarasira\",\n    \"zsk\": \"Kaskean\",\n    \"zsl\": \"Zambian Sign Language\",\n    \"zsm\": \"Standard Malay\",\n    \"zsr\": \"Southern Rincon Zapotec\",\n    \"zsu\": \"Sukurum\",\n    \"zte\": \"Elotepec Zapotec\",\n    \"ztg\": \"Xanaguía Zapotec\",\n    \"ztl\": \"Lapaguía-Guivini Zapotec\",\n    \"ztm\": \"San Agustín Mixtepec Zapotec\",\n    \"ztn\": \"Santa Catarina Albarradas Zapotec\",\n    \"ztp\": \"Loxicha Zapotec\",\n    \"ztq\": \"Quioquitani-Quierí Zapotec\",\n    \"zts\": \"Tilquiapan Zapotec\",\n    \"ztt\": \"Tejalapan 
Zapotec\",\n    \"ztu\": \"Güilá Zapotec\",\n    \"ztx\": \"Zaachila Zapotec\",\n    \"zty\": \"Yatee Zapotec\",\n    \"zu\": \"Zulu\",\n    \"zua\": \"Zeem\",\n    \"zuh\": \"Tokano\",\n    \"zum\": \"Kumzari\",\n    \"zun\": \"Zuni\",\n    \"zuy\": \"Zumaya\",\n    \"zwa\": \"Zay\",\n    \"zyb\": \"Yongbei Zhuang\",\n    \"zyg\": \"Yang Zhuang\",\n    \"zyj\": \"Youjiang Zhuang\",\n    \"zyn\": \"Yongnan Zhuang\",\n    \"zyp\": \"Zyphe Chin\",\n    \"zza\": \"Zaza; Dimili; Dimli (macrolanguage); Kirdki; Kirmanjki (macrolanguage); Zazaki\",\n    \"zzj\": \"Zuojiang Zhuang\"\n}"
  },
  {
    "path": "src/datasets/utils/resources/multilingualities.json",
    "content": "{\n  \"monolingual\": \"contains a single language\",\n  \"multilingual\": \"contains multiple languages\",\n  \"translation\": \"contains translated or aligned text\",\n  \"other\": \"other type of language distribution\"\n}\n"
  },
  {
    "path": "src/datasets/utils/resources/readme_structure.yaml",
    "content": "name: \"\" # Filename comes here\nallow_empty: false\nallow_empty_text: true\nsubsections:\n  - name: \"Dataset Card for X\" # First-level markdown heading\n    allow_empty: false\n    allow_empty_text: true\n    subsections:\n      - name: \"Table of Contents\"\n        allow_empty: false\n        allow_empty_text: false\n        subsections: null # meaning it should not be checked.\n      - name: \"Dataset Description\"\n        allow_empty: false\n        allow_empty_text: false\n        subsections:\n          - name: \"Dataset Summary\"\n            allow_empty: false\n            allow_empty_text: false\n            subsections: null\n          - name: \"Supported Tasks and Leaderboards\"\n            allow_empty: true\n            allow_empty_text: true\n            subsections: null\n          - name: Languages\n            allow_empty: true\n            allow_empty_text: true\n            subsections: null\n      - name: \"Dataset Structure\"\n        allow_empty: false\n        allow_empty_text: true\n        subsections:\n          - name: \"Data Instances\"\n            allow_empty: false\n            allow_empty_text: true\n            subsections: null\n          - name: \"Data Fields\"\n            allow_empty: false\n            allow_empty_text: true\n            subsections: null\n          - name: \"Data Splits\"\n            allow_empty: false\n            allow_empty_text: true\n            subsections: null\n      - name: \"Dataset Creation\"\n        allow_empty: false\n        allow_empty_text: true\n        subsections:\n          - name: \"Curation Rationale\"\n            allow_empty: true\n            allow_empty_text: true\n            subsections: null\n          - name: \"Source Data\"\n            allow_empty: false\n            allow_empty_text: true\n            subsections:\n              - name: \"Initial Data Collection and Normalization\"\n                allow_empty: true\n                allow_empty_text: 
true\n                subsections: null\n              - name: \"Who are the source language producers?\"\n                allow_empty: true\n                allow_empty_text: true\n                subsections: null\n          - name: \"Annotations\"\n            allow_empty: false\n            allow_empty_text: true\n            subsections:\n              - name: \"Annotation process\"\n                allow_empty: true\n                allow_empty_text: true\n                subsections: null\n              - name: \"Who are the annotators?\"\n                allow_empty: true\n                allow_empty_text: true\n                subsections: null\n          - name: \"Personal and Sensitive Information\"\n            allow_empty: true\n            allow_empty_text: true\n            subsections: null\n      - name: \"Considerations for Using the Data\"\n        allow_empty: true\n        allow_empty_text: true\n        subsections:\n          - name: \"Social Impact of Dataset\"\n            allow_empty: true\n            allow_empty_text: true\n            subsections: null\n          - name: \"Discussion of Biases\"\n            allow_empty: true\n            allow_empty_text: true\n            subsections: null\n          - name: \"Other Known Limitations\"\n            allow_empty: true\n            allow_empty_text: true\n            subsections: null\n      - name: \"Additional Information\"\n        allow_empty: true\n        allow_empty_text: true\n        subsections:\n          - name: \"Dataset Curators\"\n            allow_empty: true\n            allow_empty_text: true\n            subsections: null\n          - name: \"Licensing Information\"\n            allow_empty: true\n            allow_empty_text: true\n            subsections: null\n          - name: \"Citation Information\"\n            allow_empty: false\n            allow_empty_text: true\n            subsections: null\n          - name: \"Contributions\"\n            allow_empty: 
false\n            allow_empty_text: false\n            subsections: null\n"
  },
  {
    "path": "src/datasets/utils/resources/size_categories.json",
    "content": "[\n  \"unknown\",\n  \"n<1K\",\n  \"1K<n<10K\",\n  \"10K<n<100K\",\n  \"100K<n<1M\",\n  \"1M<n<10M\",\n  \"10M<n<100M\",\n  \"100M<n<1B\",\n  \"1B<n<10B\",\n  \"10B<n<100B\",\n  \"100B<n<1T\",\n  \"n>1T\"\n]\n"
  },
  {
    "path": "src/datasets/utils/sharding.py",
    "content": "import numpy as np\n\n\ndef _number_of_shards_in_gen_kwargs(gen_kwargs: dict) -> int:\n    \"\"\"Return the number of possible shards according to the input gen_kwargs\"\"\"\n    # Having lists of different sizes makes sharding ambigious, raise an error in this case\n    # until we decide how to define sharding without ambiguity for users\n    lists_lengths = {key: len(value) for key, value in gen_kwargs.items() if isinstance(value, list)}\n    if len(set(lists_lengths.values())) > 1:\n        raise RuntimeError(\n            \"Sharding is ambiguous for this dataset: \"\n            + \"we found several data sources lists of different lengths, and we don't know over which list we should parallelize:\\n\"\n            + \"\\n\".join(f\"\\t- key {key} has length {length}\" for key, length in lists_lengths.items())\n            + \"\\nTo fix this, check the 'gen_kwargs' and make sure to use lists only for data sources, \"\n            + \"and use tuples otherwise. In the end there should only be one single list, or several lists with the same length.\"\n        )\n    max_length = max(lists_lengths.values(), default=0)\n    return max(1, max_length)\n\n\ndef _distribute_shards(num_shards: int, max_num_jobs: int) -> list[range]:\n    \"\"\"\n    Get the range of shard indices per job.\n    If num_shards<max_num_jobs, then num_shards jobs are given a range of one shard.\n    The shards indices order is preserved: e.g. 
all the first shards are given to the first job.\n    Moreover all the jobs are given approximately the same number of shards.\n\n    Example:\n\n    ```python\n    >>> _distribute_shards(2, max_num_jobs=4)\n    [range(0, 1), range(1, 2)]\n    >>> _distribute_shards(10, max_num_jobs=3)\n    [range(0, 4), range(4, 7), range(7, 10)]\n    ```\n    \"\"\"\n    shards_indices_per_group = []\n    for group_idx in range(max_num_jobs):\n        num_shards_to_add = num_shards // max_num_jobs + (group_idx < (num_shards % max_num_jobs))\n        if num_shards_to_add == 0:\n            break\n        start = shards_indices_per_group[-1].stop if shards_indices_per_group else 0\n        shard_indices = range(start, start + num_shards_to_add)\n        shards_indices_per_group.append(shard_indices)\n    return shards_indices_per_group\n\n\ndef _split_gen_kwargs(gen_kwargs: dict, max_num_jobs: int) -> list[dict]:\n    \"\"\"Split the gen_kwargs into `max_num_job` gen_kwargs\"\"\"\n    # Having lists of different sizes makes sharding ambiguous, raise an error in this case\n    num_shards = _number_of_shards_in_gen_kwargs(gen_kwargs)\n    if num_shards == 1:\n        return [dict(gen_kwargs)]\n    else:\n        shard_indices_per_group = _distribute_shards(num_shards=num_shards, max_num_jobs=max_num_jobs)\n        return [\n            {\n                key: [value[shard_idx] for shard_idx in shard_indices_per_group[group_idx]]\n                if isinstance(value, list)\n                else value\n                for key, value in gen_kwargs.items()\n            }\n            for group_idx in range(len(shard_indices_per_group))\n        ]\n\n\ndef _merge_gen_kwargs(gen_kwargs_list: list[dict]) -> dict:\n    return {\n        key: [value for gen_kwargs in gen_kwargs_list for value in gen_kwargs[key]]\n        if isinstance(gen_kwargs_list[0][key], list)\n        else gen_kwargs_list[0][key]\n        for key in gen_kwargs_list[0]\n    }\n\n\ndef _shuffle_gen_kwargs(rng: 
np.random.Generator, gen_kwargs: dict) -> dict:\n    \"\"\"Return a shuffled copy of the input gen_kwargs\"\"\"\n    # We must shuffle all the lists, and lists of the same size must have the same shuffling.\n    # This way entangled lists of (shard, shard_metadata) are still in the right order.\n\n    # First, let's generate the shuffled indices per list size\n    list_sizes = {len(value) for value in gen_kwargs.values() if isinstance(value, list)}\n    indices_per_size = {}\n    for size in list_sizes:\n        indices_per_size[size] = list(range(size))\n        rng.shuffle(indices_per_size[size])\n    # Now let's copy the gen_kwargs and shuffle the lists based on their sizes\n    shuffled_kwargs = dict(gen_kwargs)\n    for key, value in shuffled_kwargs.items():\n        if isinstance(value, list):\n            shuffled_kwargs[key] = [value[i] for i in indices_per_size[len(value)]]\n    return shuffled_kwargs\n"
  },
  {
    "path": "src/datasets/utils/stratify.py",
    "content": "import numpy as np\n\n\ndef approximate_mode(class_counts, n_draws, rng):\n    \"\"\"Computes approximate mode of multivariate hypergeometric.\n    This is an approximation to the mode of the multivariate\n    hypergeometric given by class_counts and n_draws.\n    It shouldn't be off by more than one.\n    It is the mostly likely outcome of drawing n_draws many\n    samples from the population given by class_counts.\n    Args\n    ----------\n    class_counts : ndarray of int\n        Population per class.\n    n_draws : int\n        Number of draws (samples to draw) from the overall population.\n    rng : random state\n        Used to break ties.\n    Returns\n    -------\n    sampled_classes : ndarray of int\n        Number of samples drawn from each class.\n        np.sum(sampled_classes) == n_draws\n\n    \"\"\"\n    # this computes a bad approximation to the mode of the\n    # multivariate hypergeometric given by class_counts and n_draws\n    continuous = n_draws * class_counts / class_counts.sum()\n    # floored means we don't overshoot n_samples, but probably undershoot\n    floored = np.floor(continuous)\n    # we add samples according to how much \"left over\" probability\n    # they had, until we arrive at n_samples\n    need_to_add = int(n_draws - floored.sum())\n    if need_to_add > 0:\n        remainder = continuous - floored\n        values = np.sort(np.unique(remainder))[::-1]\n        # add according to remainder, but break ties\n        # randomly to avoid biases\n        for value in values:\n            (inds,) = np.where(remainder == value)\n            # if we need_to_add less than what's in inds\n            # we draw randomly from them.\n            # if we need to add more, we add them all and\n            # go to the next value\n            add_now = min(len(inds), need_to_add)\n            inds = rng.choice(inds, size=add_now, replace=False)\n            floored[inds] += 1\n            need_to_add -= add_now\n            if 
need_to_add == 0:\n                break\n    return floored.astype(np.int64)\n\n\ndef stratified_shuffle_split_generate_indices(y, n_train, n_test, rng, n_splits=10):\n    \"\"\"\n\n    Provides train/test indices to split data in train/test sets.\n    It is based on the StratifiedShuffleSplit implementation\n    of the scikit-learn library.\n\n    Args\n    ----------\n\n    n_train : int,\n        represents the absolute number of train samples.\n\n    n_test : int,\n        represents the absolute number of test samples.\n\n    random_state : int or RandomState instance, default=None\n        Controls the randomness of the training and testing indices produced.\n        Pass an int for reproducible output across multiple function calls.\n\n    n_splits : int, default=10\n        Number of re-shuffling & splitting iterations.\n    \"\"\"\n    classes, y_indices = np.unique(y, return_inverse=True)\n    n_classes = classes.shape[0]\n    class_counts = np.bincount(y_indices)\n    if np.min(class_counts) < 2:\n        raise ValueError(\"Minimum class count error\")\n    if n_train < n_classes:\n        raise ValueError(\n            \"The train_size = %d should be greater or equal to the number of classes = %d\" % (n_train, n_classes)\n        )\n    if n_test < n_classes:\n        raise ValueError(\n            \"The test_size = %d should be greater or equal to the number of classes = %d\" % (n_test, n_classes)\n        )\n    class_indices = np.split(np.argsort(y_indices, kind=\"mergesort\"), np.cumsum(class_counts)[:-1])\n    for _ in range(n_splits):\n        n_i = approximate_mode(class_counts, n_train, rng)\n        class_counts_remaining = class_counts - n_i\n        t_i = approximate_mode(class_counts_remaining, n_test, rng)\n\n        train = []\n        test = []\n\n        for i in range(n_classes):\n            permutation = rng.permutation(class_counts[i])\n            perm_indices_class_i = class_indices[i].take(permutation, mode=\"clip\")\n      
      train.extend(perm_indices_class_i[: n_i[i]])\n            test.extend(perm_indices_class_i[n_i[i] : n_i[i] + t_i[i]])\n        train = rng.permutation(train)\n        test = rng.permutation(test)\n\n        yield train, test\n"
  },
  {
    "path": "src/datasets/utils/tf_utils.py",
    "content": "# Copyright 2022 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\"\"\"TF-specific utils import.\"\"\"\n\nimport os\nimport warnings\nfrom functools import partial\nfrom math import ceil\nfrom uuid import uuid4\n\nimport numpy as np\nimport pyarrow as pa\nfrom multiprocess import get_context\n\n\ntry:\n    from multiprocess.shared_memory import SharedMemory\nexcept ImportError:\n    SharedMemory = None  # Version checks should prevent this being called on older Python versions\n\nfrom .. 
import config\n\n\ndef minimal_tf_collate_fn(features):\n    if isinstance(features, dict):  # case batch_size=None: nothing to collate\n        return features\n    elif config.TF_AVAILABLE:\n        import tensorflow as tf\n    else:\n        raise ImportError(\"Called a Tensorflow-specific function but Tensorflow is not installed.\")\n\n    first = features[0]\n    batch = {}\n    for k, v in first.items():\n        if isinstance(v, np.ndarray):\n            batch[k] = np.stack([f[k] for f in features])\n        elif isinstance(v, tf.Tensor):\n            batch[k] = tf.stack([f[k] for f in features])\n        else:\n            batch[k] = np.array([f[k] for f in features])\n    return batch\n\n\ndef minimal_tf_collate_fn_with_renaming(features):\n    batch = minimal_tf_collate_fn(features)\n    if \"label\" in batch:\n        batch[\"labels\"] = batch[\"label\"]\n        del batch[\"label\"]\n    return batch\n\n\ndef is_numeric_pa_type(pa_type):\n    if pa.types.is_list(pa_type):\n        return is_numeric_pa_type(pa_type.value_type)\n    return pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type) or pa.types.is_decimal(pa_type)\n\n\ndef np_get_batch(\n    indices, dataset, cols_to_retain, collate_fn, collate_fn_args, columns_to_np_types, return_dict=False\n):\n    if not isinstance(indices, np.ndarray):\n        indices = indices.numpy()\n\n    is_batched = True\n    # Optimization - if we're loading a sequential batch, do it with slicing instead of a list of indices\n    if isinstance(indices, np.integer):\n        batch = dataset[indices.item()]\n        is_batched = False\n    elif np.all(np.diff(indices) == 1):\n        batch = dataset[indices[0] : indices[-1] + 1]\n    elif isinstance(indices, np.ndarray):\n        batch = dataset[indices]\n    else:\n        raise RuntimeError(f\"Unexpected type for indices: {type(indices)}\")\n\n    if cols_to_retain is not None:\n        batch = {\n            key: value\n            for key, value in 
batch.items()\n            if key in cols_to_retain or key in (\"label\", \"label_ids\", \"labels\")\n        }\n\n    if is_batched:\n        actual_size = len(list(batch.values())[0])  # Get the length of one of the arrays, assume all same\n        # Our collators expect a list of dicts, not a dict of lists/arrays, so we invert\n        batch = [{key: value[i] for key, value in batch.items()} for i in range(actual_size)]\n    batch = collate_fn(batch, **collate_fn_args)\n\n    if return_dict:\n        out_batch = {}\n        for col, cast_dtype in columns_to_np_types.items():\n            # In case the collate_fn returns something strange\n            array = np.array(batch[col])\n            array = array.astype(cast_dtype)\n            out_batch[col] = array\n    else:\n        out_batch = []\n        for col, cast_dtype in columns_to_np_types.items():\n            # In case the collate_fn returns something strange\n            array = np.array(batch[col])\n            array = array.astype(cast_dtype)\n            out_batch.append(array)\n    return out_batch\n\n\ndef dataset_to_tf(\n    dataset,\n    cols_to_retain,\n    collate_fn,\n    collate_fn_args,\n    columns_to_np_types,\n    output_signature,\n    shuffle,\n    batch_size,\n    drop_remainder,\n):\n    \"\"\"Create a tf.data.Dataset from the underlying Dataset. This is a single-process method - the multiprocess\n    equivalent is multiprocess_dataset_to_tf.\n\n    Args:\n        dataset (`Dataset`): Dataset to wrap with tf.data.Dataset.\n        cols_to_retain (`List[str]`): Dataset column(s) to load in the\n            tf.data.Dataset. 
It is acceptable to include column names that are created by the `collate_fn` and\n            that do not exist in the original dataset.\n        collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate\n            lists of samples into a batch.\n        collate_fn_args (`Dict`): A  `dict` of keyword arguments to be passed to the\n            `collate_fn`. Can be empty.\n        columns_to_np_types (`Dict[str, np.dtype]`): A `dict` mapping column names to numpy dtypes.\n        output_signature (`Dict[str, tf.TensorSpec]`): A `dict` mapping column names to\n            `tf.TensorSpec` objects.\n        shuffle(`bool`): Shuffle the dataset order when loading. Recommended True for training, False for\n            validation/evaluation.\n        batch_size (`int`, default `None`): Size of batches to load from the dataset. Defaults to `None`, which implies that\n            the dataset won't be batched, but the returned dataset can be batched later with `tf_dataset.batch(batch_size)`.\n        drop_remainder(`bool`, default `None`): Drop the last incomplete batch when loading. 
If not provided,\n            defaults to the same setting as shuffle.\n\n    Returns:\n        `tf.data.Dataset`\n    \"\"\"\n    if config.TF_AVAILABLE:\n        import tensorflow as tf\n    else:\n        raise ImportError(\"Called a Tensorflow-specific function but Tensorflow is not installed.\")\n\n    # TODO Matt: When our minimum Python version is 3.8 or higher, we can delete all of this and move everything\n    #            to the NumPy multiprocessing path.\n    if hasattr(tf, \"random_index_shuffle\"):\n        random_index_shuffle = tf.random_index_shuffle\n    elif hasattr(tf.random.experimental, \"index_shuffle\"):\n        random_index_shuffle = tf.random.experimental.index_shuffle\n    else:\n        if len(dataset) > 10_000_000:\n            warnings.warn(\n                \"to_tf_dataset() can be memory-inefficient on versions of TensorFlow older than 2.9. \"\n                \"If you are iterating over a dataset with a very large number of samples, consider \"\n                \"upgrading to TF >= 2.9.\"\n            )\n        random_index_shuffle = None\n\n    getter_fn = partial(\n        np_get_batch,\n        dataset=dataset,\n        cols_to_retain=cols_to_retain,\n        collate_fn=collate_fn,\n        collate_fn_args=collate_fn_args,\n        columns_to_np_types=columns_to_np_types,\n        return_dict=False,\n    )\n\n    # This works because dictionaries always output in the same order\n    tout = [tf.dtypes.as_dtype(dtype) for dtype in columns_to_np_types.values()]\n\n    @tf.function(input_signature=[tf.TensorSpec(None, tf.int64)])\n    def fetch_function(indices):\n        output = tf.py_function(\n            getter_fn,\n            inp=[indices],\n            Tout=tout,\n        )\n        return {key: output[i] for i, key in enumerate(columns_to_np_types.keys())}\n\n    tf_dataset = tf.data.Dataset.range(len(dataset))\n\n    if shuffle and random_index_shuffle is not None:\n        base_seed = tf.fill((3,), value=tf.cast(-1, 
dtype=tf.int64))\n\n        def scan_random_index(state, index):\n            if tf.reduce_all(state == -1):\n                # This generates a new random seed once per epoch only,\n                # to ensure that we iterate over each sample exactly once per epoch\n                state = tf.random.uniform(shape=(3,), maxval=2**62, dtype=tf.int64)\n            shuffled_index = random_index_shuffle(index=index, seed=state, max_index=len(dataset) - 1)\n            return state, shuffled_index\n\n        tf_dataset = tf_dataset.scan(base_seed, scan_random_index)\n    elif shuffle:\n        tf_dataset = tf_dataset.shuffle(tf_dataset.cardinality())\n\n    if batch_size is not None:\n        tf_dataset = tf_dataset.batch(batch_size, drop_remainder=drop_remainder)\n\n    tf_dataset = tf_dataset.map(fetch_function)\n\n    if batch_size is not None:\n\n        def ensure_shapes(input_dict):\n            return {key: tf.ensure_shape(val, output_signature[key].shape) for key, val in input_dict.items()}\n\n    else:\n        # Ensure shape but remove batch dimension of output_signature[key].shape\n        def ensure_shapes(input_dict):\n            return {key: tf.ensure_shape(val, output_signature[key].shape[1:]) for key, val in input_dict.items()}\n\n    return tf_dataset.map(ensure_shapes)\n\n\nclass SharedMemoryContext:\n    # This is a context manager for creating shared memory that ensures cleanup happens even if a process is interrupted\n    # The process that creates shared memory is always the one responsible for unlinking it in the end\n    def __init__(self):\n        self.created_shms = []\n        self.opened_shms = []\n\n    def get_shm(self, name, size, create):\n        shm = SharedMemory(size=int(size), name=name, create=create)\n        if create:\n            # We only unlink the ones we created in this context\n            self.created_shms.append(shm)\n        else:\n            # If we didn't create it, we only close it when done, we don't unlink it\n   
         self.opened_shms.append(shm)\n        return shm\n\n    def get_array(self, name, shape, dtype, create):\n        shm = self.get_shm(name=name, size=np.prod(shape) * np.dtype(dtype).itemsize, create=create)\n        return np.ndarray(shape, dtype=dtype, buffer=shm.buf)\n\n    def __enter__(self):\n        return self\n\n    def __exit__(self, exc_type, exc_value, traceback):\n        for shm in self.created_shms:\n            shm.close()\n            shm.unlink()\n        for shm in self.opened_shms:\n            shm.close()\n\n\nclass NumpyMultiprocessingGenerator:\n    def __init__(\n        self,\n        dataset,\n        cols_to_retain,\n        collate_fn,\n        collate_fn_args,\n        columns_to_np_types,\n        output_signature,\n        shuffle,\n        batch_size,\n        drop_remainder,\n        num_workers,\n    ):\n        self.dataset = dataset\n        self.cols_to_retain = cols_to_retain\n        self.collate_fn = collate_fn\n        self.collate_fn_args = collate_fn_args\n        self.string_columns = [col for col, dtype in columns_to_np_types.items() if dtype is np.str_]\n        # Strings will be converted to arrays of single unicode chars, so that we can have a constant itemsize\n        self.columns_to_np_types = {\n            col: dtype if col not in self.string_columns else np.dtype(\"U1\")\n            for col, dtype in columns_to_np_types.items()\n        }\n        self.output_signature = output_signature\n        self.shuffle = shuffle\n        self.batch_size = batch_size\n        self.drop_remainder = drop_remainder\n        self.num_workers = num_workers\n        # Because strings are converted to characters, we need to add one extra dimension to the shape\n        self.columns_to_ranks = {\n            col: int(spec.shape.rank) if col not in self.string_columns else int(spec.shape.rank) + 1\n            for col, spec in output_signature.items()\n        }\n\n    def __iter__(self):\n        # Make sure we only spawn 
workers if they have work to do\n        num_workers = min(self.num_workers, int(ceil(len(self.dataset) / self.batch_size)))\n        # Do the shuffling in iter so that it's done at the start of each epoch\n        per_worker_batches, final_batch, final_batch_worker = self.distribute_batches(\n            self.dataset, self.batch_size, self.drop_remainder, num_workers, self.shuffle\n        )\n        ctx = get_context(\"spawn\")\n        names = []\n        shape_arrays = []\n        workers = []\n        array_ready_events = [ctx.Event() for _ in range(num_workers)]\n        array_loaded_events = [ctx.Event() for _ in range(num_workers)]\n\n        base_args = {\n            \"dataset\": self.dataset,\n            \"cols_to_retain\": self.cols_to_retain,\n            \"collate_fn\": self.collate_fn,\n            \"collate_fn_args\": self.collate_fn_args,\n            \"columns_to_np_types\": self.columns_to_np_types,\n            \"columns_to_ranks\": self.columns_to_ranks,\n            \"string_columns\": self.string_columns,\n        }\n        with SharedMemoryContext() as shm_ctx:\n            for i in range(num_workers):\n                worker_random_id = str(uuid4())\n                worker_name = f\"dw_{i}_{worker_random_id}\"[:10]\n                names.append(worker_name)\n\n                worker_shape_arrays = {\n                    col: shm_ctx.get_array(f\"{worker_name}_{col}_shape\", shape=(rank,), dtype=np.int64, create=True)\n                    for col, rank in self.columns_to_ranks.items()\n                }\n                shape_arrays.append(worker_shape_arrays)\n\n                worker_indices = per_worker_batches[i]\n                if i == final_batch_worker and final_batch is not None:\n                    final_batch_arg = final_batch\n                else:\n                    final_batch_arg = None\n                worker_kwargs = {\n                    \"worker_name\": worker_name,\n                    \"indices\": worker_indices,\n 
                   \"extra_batch\": final_batch_arg,\n                    \"array_ready_event\": array_ready_events[i],\n                    \"array_loaded_event\": array_loaded_events[i],\n                    **base_args,\n                }\n                worker = ctx.Process(target=self.worker_loop, kwargs=worker_kwargs, daemon=True)\n                worker.start()\n                workers.append(worker)\n\n            end_signal_received = False\n            while not end_signal_received:\n                for i in range(num_workers):\n                    if not array_ready_events[i].wait(timeout=60):\n                        raise TimeoutError(\"Data loading worker timed out!\")\n                    array_ready_events[i].clear()\n                    array_shapes = shape_arrays[i]\n                    if any(np.any(shape < 0) for shape in array_shapes.values()):\n                        # Child processes send negative array shapes to indicate\n                        # that no more data is going to be sent\n                        end_signal_received = True\n                        break\n                    # Matt: Because array shapes are variable we recreate the shared memory each iteration.\n                    #       I suspect repeatedly opening lots of shared memory is the bottleneck for the parent process.\n                    #       A future optimization, at the cost of some code complexity, could be to reuse shared memory\n                    #       between iterations, but this would require knowing in advance the maximum size, or having\n                    #       a system to only create a new memory block when a new maximum size is seen.\n                    #       Another potential optimization would be to figure out which memory copies are necessary,\n                    #       or whether we can yield objects straight out of shared memory.\n                    with SharedMemoryContext() as batch_shm_ctx:\n                        # This memory 
context only lasts long enough to copy everything out of the batch\n                        arrays = {\n                            col: batch_shm_ctx.get_array(\n                                f\"{names[i]}_{col}\",\n                                shape=shape,\n                                dtype=self.columns_to_np_types[col],\n                                create=False,\n                            )\n                            for col, shape in array_shapes.items()\n                        }\n                        # Copy everything out of shm because the memory\n                        # will be unlinked by the child process at some point\n                        arrays = {col: np.copy(arr) for col, arr in arrays.items()}\n                        # Now we convert any unicode char arrays to strings\n                        for string_col in self.string_columns:\n                            arrays[string_col] = (\n                                arrays[string_col].view(f\"U{arrays[string_col].shape[-1]}\").squeeze(-1)\n                            )\n                    yield arrays\n                    array_loaded_events[i].set()\n            # Now we just do some cleanup\n            # Shared memory is cleaned up by the context manager, so we just make sure workers finish\n            for worker in workers:\n                worker.join()\n\n    def __call__(self):\n        return self\n\n    @staticmethod\n    def worker_loop(\n        dataset,\n        cols_to_retain,\n        collate_fn,\n        collate_fn_args,\n        columns_to_np_types,\n        columns_to_ranks,\n        string_columns,\n        indices,\n        extra_batch,\n        worker_name,\n        array_ready_event,\n        array_loaded_event,\n    ):\n        os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"\n\n        if config.TF_AVAILABLE:\n            import tensorflow as tf\n        else:\n            raise ImportError(\"Called a Tensorflow-specific function but Tensorflow is not 
installed.\")\n\n        tf.config.set_visible_devices([], \"GPU\")  # Make sure workers don't try to allocate GPU memory\n\n        def send_batch_to_parent(indices):\n            batch = np_get_batch(\n                indices=indices,\n                dataset=dataset,\n                cols_to_retain=cols_to_retain,\n                collate_fn=collate_fn,\n                collate_fn_args=collate_fn_args,\n                columns_to_np_types=columns_to_np_types,\n                return_dict=True,\n            )\n\n            # Now begins the fun part where we start shovelling shared memory at the parent process\n            out_arrays = {}\n            with SharedMemoryContext() as batch_shm_ctx:\n                # The batch shared memory context exists only as long as it takes for the parent process\n                # to read everything, after which it cleans everything up again\n                for col, cast_dtype in columns_to_np_types.items():\n                    # Everything has to be np.array for this to work, even if the collate_fn is giving us tf.Tensor\n                    array = batch[col]\n                    if col in string_columns:\n                        # We can't send unicode arrays over shared memory, so we convert to single chars (\"U1\")\n                        # which have a fixed width of 4 bytes. 
The parent process will convert these back to strings.\n                        array = array.view(\"U1\").reshape(array.shape + (-1,))\n                    shape_arrays[col][:] = array.shape\n                    out_arrays[col] = batch_shm_ctx.get_array(\n                        f\"{worker_name}_{col}\", shape=array.shape, dtype=cast_dtype, create=True\n                    )\n                    out_arrays[col][:] = array\n\n                array_ready_event.set()\n                array_loaded_event.wait()\n                array_loaded_event.clear()\n\n        with SharedMemoryContext() as shm_ctx:\n            shape_arrays = {\n                col: shm_ctx.get_array(f\"{worker_name}_{col}_shape\", shape=(rank,), dtype=np.int64, create=False)\n                for col, rank in columns_to_ranks.items()\n            }\n\n            for batch in indices:\n                send_batch_to_parent(batch)\n            if extra_batch is not None:\n                send_batch_to_parent(extra_batch)\n            # Now we send a batsignal to the parent process that we're done\n            for col, array in shape_arrays.items():\n                array[:] = -1\n            array_ready_event.set()\n\n    @staticmethod\n    def distribute_batches(dataset, batch_size, drop_remainder, num_workers, shuffle):\n        indices = np.arange(len(dataset))\n        if shuffle:\n            np.random.shuffle(indices)\n        num_samples = len(indices)\n        # We distribute the batches so that reading from the workers in round-robin order yields the exact\n        # order specified in indices. 
This is only important when shuffle is False, but we do it regardless.\n        incomplete_batch_cutoff = num_samples - (num_samples % batch_size)\n        indices, last_incomplete_batch = np.split(indices, [incomplete_batch_cutoff])\n        if drop_remainder or len(last_incomplete_batch) == 0:\n            last_incomplete_batch = None\n\n        indices = indices.reshape(-1, batch_size)\n        num_batches = len(indices)\n        final_batches_cutoff = num_batches - (num_batches % num_workers)\n        indices, final_batches = np.split(indices, [final_batches_cutoff])\n        indices = indices.reshape(-1, num_workers, batch_size)\n\n        per_worker_indices = np.split(indices, indices.shape[1], axis=1)\n        per_worker_indices = [np.squeeze(worker_indices, 1) for worker_indices in per_worker_indices]\n        # Distribute the final batches to the first workers\n        for i in range(len(final_batches)):\n            # len(final_batches) can be zero, and is always less than num_workers\n            per_worker_indices[i] = np.concatenate([per_worker_indices[i], final_batches[i].reshape(1, -1)], axis=0)\n        # Add the last incomplete batch to the next worker, which might be the first worker\n        if last_incomplete_batch is not None:\n            incomplete_batch_worker_idx = len(final_batches)\n        else:\n            incomplete_batch_worker_idx = None\n        return per_worker_indices, last_incomplete_batch, incomplete_batch_worker_idx\n\n\ndef multiprocess_dataset_to_tf(\n    dataset,\n    cols_to_retain,\n    collate_fn,\n    collate_fn_args,\n    columns_to_np_types,\n    output_signature,\n    shuffle,\n    batch_size,\n    drop_remainder,\n    num_workers,\n):\n    \"\"\"Create a tf.data.Dataset from the underlying Dataset. 
This is a multi-process method - the single-process\n    equivalent is dataset_to_tf.\n\n    Args:\n        dataset (`Dataset`): Dataset to wrap with tf.data.Dataset.\n        cols_to_retain (`List[str]`): Dataset column(s) to load in the\n            tf.data.Dataset. It is acceptable to include column names that are created by the `collate_fn` and\n            that do not exist in the original dataset.\n        collate_fn(`Callable`): A function or callable object (such as a `DataCollator`) that will collate\n            lists of samples into a batch.\n        collate_fn_args (`Dict`): A  `dict` of keyword arguments to be passed to the\n            `collate_fn`. Can be empty.\n        columns_to_np_types (`Dict[str, np.dtype]`): A `dict` mapping column names to numpy dtypes.\n        output_signature (`Dict[str, tf.TensorSpec]`): A `dict` mapping column names to\n            `tf.TensorSpec` objects.\n        shuffle(`bool`): Shuffle the dataset order when loading. Recommended True for training, False for\n            validation/evaluation.\n        batch_size (`int`, default `None`): Size of batches to load from the dataset. Defaults to `None`, which implies that\n            the dataset won't be batched, but the returned dataset can be batched later with `tf_dataset.batch(batch_size)`.\n        drop_remainder(`bool`, default `None`): Drop the last incomplete batch when loading. If not provided,\n            defaults to the same setting as shuffle.\n        num_workers (`int`): Number of workers to use for loading the dataset. 
Should be >= 1.\n\n    Returns:\n        `tf.data.Dataset`\n    \"\"\"\n    if config.TF_AVAILABLE:\n        import tensorflow as tf\n    else:\n        raise ImportError(\"Called a Tensorflow-specific function but Tensorflow is not installed.\")\n\n    data_generator = NumpyMultiprocessingGenerator(\n        dataset=dataset,\n        cols_to_retain=cols_to_retain,\n        collate_fn=collate_fn,\n        collate_fn_args=collate_fn_args,\n        columns_to_np_types=columns_to_np_types,\n        output_signature=output_signature,\n        shuffle=shuffle,\n        batch_size=batch_size,\n        drop_remainder=drop_remainder,\n        num_workers=num_workers,\n    )\n\n    tf_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=output_signature)\n    if drop_remainder:\n        dataset_length = int(len(dataset) // batch_size)\n    else:\n        dataset_length = int(ceil(len(dataset) / batch_size))\n    return tf_dataset.apply(tf.data.experimental.assert_cardinality(dataset_length))\n"
  },
  {
    "path": "src/datasets/utils/tqdm.py",
    "content": "\"\"\"Utility helpers to handle progress bars in `datasets`.\n\nExample:\n    1. Use `datasets.utils.tqdm` as you would use `tqdm.tqdm` or `tqdm.auto.tqdm`.\n    2. To disable progress bars, either use `disable_progress_bars()` helper or set the\n       environment variable `HF_DATASETS_DISABLE_PROGRESS_BARS` to 1.\n    3. To re-enable progress bars, use `enable_progress_bars()`.\n    4. To check whether progress bars are disabled, use `are_progress_bars_disabled()`.\n\nNOTE: Environment variable `HF_DATASETS_DISABLE_PROGRESS_BARS` has the priority.\n\nExample:\n    ```py\n    from datasets.utils import (\n        are_progress_bars_disabled,\n        disable_progress_bars,\n        enable_progress_bars,\n        tqdm,\n    )\n\n    # Disable progress bars globally\n    disable_progress_bars()\n\n    # Use as normal `tqdm`\n    for _ in tqdm(range(5)):\n       do_something()\n\n    # Still not showing progress bars, as `disable=False` is overwritten to `True`.\n    for _ in tqdm(range(5), disable=False):\n       do_something()\n\n    are_progress_bars_disabled() # True\n\n    # Re-enable progress bars globally\n    enable_progress_bars()\n\n    # Progress bar will be shown !\n    for _ in tqdm(range(5)):\n       do_something()\n    ```\n\"\"\"\n\nimport os\nimport warnings\n\nfrom tqdm.auto import tqdm as old_tqdm\n\nfrom ..config import HF_DATASETS_DISABLE_PROGRESS_BARS\n\n\n# `HF_DATASETS_DISABLE_PROGRESS_BARS` is `Optional[bool]` while `_hf_datasets_progress_bars_disabled`\n# is a `bool`. 
If `HF_DATASETS_DISABLE_PROGRESS_BARS` is set to True or False, it has priority.\n# If `HF_DATASETS_DISABLE_PROGRESS_BARS` is None, it means the user has not set the\n# environment variable and is free to enable/disable progress bars programmatically.\n# TL;DR: env variable has priority over code.\n#\n# By default, progress bars are enabled.\n_hf_datasets_progress_bars_disabled: bool = HF_DATASETS_DISABLE_PROGRESS_BARS or False\n\n\ndef disable_progress_bars() -> None:\n    \"\"\"\n    Disable globally progress bars used in `datasets` except if `HF_DATASETS_DISABLE_PROGRESS_BARS` environment\n    variable has been set.\n\n    Use [`~utils.enable_progress_bars`] to re-enable them.\n    \"\"\"\n    if HF_DATASETS_DISABLE_PROGRESS_BARS is False:\n        warnings.warn(\n            \"Cannot disable progress bars: environment variable `HF_DATASETS_DISABLE_PROGRESS_BARS=0` is set and has\"\n            \" priority.\"\n        )\n        return\n    global _hf_datasets_progress_bars_disabled\n    _hf_datasets_progress_bars_disabled = True\n\n\ndef enable_progress_bars() -> None:\n    \"\"\"\n    Enable globally progress bars used in `datasets` except if `HF_DATASETS_DISABLE_PROGRESS_BARS` environment\n    variable has been set.\n\n    Use [`~utils.disable_progress_bars`] to disable them.\n    \"\"\"\n    if HF_DATASETS_DISABLE_PROGRESS_BARS is True:\n        warnings.warn(\n            \"Cannot enable progress bars: environment variable `HF_DATASETS_DISABLE_PROGRESS_BARS=1` is set and has\"\n            \" priority.\"\n        )\n        return\n    global _hf_datasets_progress_bars_disabled\n    _hf_datasets_progress_bars_disabled = False\n\n\ndef are_progress_bars_disabled() -> bool:\n    \"\"\"Return whether progress bars are globally disabled or not.\n\n    Progress bars used in `datasets` can be enabled or disabled globally using [`~utils.enable_progress_bars`]\n    and [`~utils.disable_progress_bars`] or by setting `HF_DATASETS_DISABLE_PROGRESS_BARS` as environment 
variable.\n    \"\"\"\n    global _hf_datasets_progress_bars_disabled\n    return _hf_datasets_progress_bars_disabled\n\n\nclass tqdm(old_tqdm):\n    \"\"\"\n    Class to override `disable` argument in case progress bars are globally disabled.\n\n    Taken from https://github.com/tqdm/tqdm/issues/619#issuecomment-619639324.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        if are_progress_bars_disabled():\n            kwargs[\"disable\"] = True\n        elif kwargs.get(\"disable\") is None and os.getenv(\"TQDM_POSITION\") == \"-1\":\n            # Force-enable progress bars in cloud environments when disable=None\n            kwargs[\"disable\"] = False\n        super().__init__(*args, **kwargs)\n\n    def __delattr__(self, attr: str) -> None:\n        \"\"\"Fix for https://github.com/huggingface/datasets/issues/6066\"\"\"\n        try:\n            super().__delattr__(attr)\n        except AttributeError:\n            if attr != \"_lock\":\n                raise\n\n\n# backward compatibility\nenable_progress_bar = enable_progress_bars\ndisable_progress_bar = disable_progress_bars\n\n\ndef is_progress_bar_enabled():\n    return not are_progress_bars_disabled()\n"
  },
  {
    "path": "src/datasets/utils/track.py",
    "content": "from collections.abc import Iterable, Iterator\n\n\nclass tracked_str(str):\n    origins = {}\n\n    def set_origin(self, origin: str):\n        if super().__repr__() not in self.origins:\n            self.origins[super().__repr__()] = origin\n\n    def get_origin(self):\n        return self.origins.get(super().__repr__(), str(self))\n\n    def __repr__(self) -> str:\n        if super().__repr__() not in self.origins or self.origins[super().__repr__()] == self:\n            return super().__repr__()\n        else:\n            return f\"{str(self)} (origin={self.origins[super().__repr__()]})\"\n\n\nclass tracked_list(list):\n    def __init__(self, *args, **kwargs) -> None:\n        super().__init__(*args, **kwargs)\n        self.last_item = None\n\n    def __iter__(self) -> Iterator:\n        for x in super().__iter__():\n            self.last_item = x\n            yield x\n        self.last_item = None\n\n    def __repr__(self) -> str:\n        if self.last_item is None:\n            return super().__repr__()\n        else:\n            return f\"{self.__class__.__name__}(current={self.last_item})\"\n\n\nclass TrackedIterableFromGenerator(Iterable):\n    \"\"\"Utility class to create an iterable from a generator function, in order to reset the generator when needed.\"\"\"\n\n    def __init__(self, generator, *args):\n        super().__init__()\n        self.generator = generator\n        self.args = args\n        self.last_item = None\n\n    def __iter__(self):\n        for x in self.generator(*self.args):\n            self.last_item = x\n            yield x\n        self.last_item = None\n\n    def __repr__(self) -> str:\n        if self.last_item is None:\n            return super().__repr__()\n        else:\n            return f\"{self.__class__.__name__}(current={self.last_item})\"\n\n    def __reduce__(self):\n        return (self.__class__, (self.generator, *self.args))\n"
  },
  {
    "path": "src/datasets/utils/typing.py",
    "content": "import os\nfrom typing import TypeVar, Union\n\n\nT = TypeVar(\"T\")\n\nListLike = Union[list[T], tuple[T, ...]]\nNestedDataStructureLike = Union[T, list[T], dict[str, T]]\nPathLike = Union[str, bytes, os.PathLike]\n"
  },
  {
    "path": "src/datasets/utils/version.py",
    "content": "# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Lint as: python3\n\"\"\"Version utils.\"\"\"\n\nimport dataclasses\nimport re\nfrom dataclasses import dataclass\nfrom functools import total_ordering\nfrom typing import Optional, Union\n\n\n_VERSION_REG = re.compile(r\"^(?P<major>\\d+)\" r\"\\.(?P<minor>\\d+)\" r\"\\.(?P<patch>\\d+)$\")\n\n\n@total_ordering\n@dataclass\nclass Version:\n    \"\"\"Dataset version `MAJOR.MINOR.PATCH`.\n\n    Args:\n        version_str (`str`):\n            The dataset version.\n        description (`str`):\n            A description of what is new in this version.\n        major (`str`):\n        minor (`str`):\n        patch (`str`):\n\n    Example:\n\n    ```py\n    >>> VERSION = datasets.Version(\"1.0.0\")\n    ```\n    \"\"\"\n\n    version_str: str\n    description: Optional[str] = None\n    major: Optional[Union[str, int]] = None\n    minor: Optional[Union[str, int]] = None\n    patch: Optional[Union[str, int]] = None\n\n    def __post_init__(self):\n        self.major, self.minor, self.patch = _str_to_version_tuple(self.version_str)\n\n    def __repr__(self):\n        return f\"{self.tuple[0]}.{self.tuple[1]}.{self.tuple[2]}\"\n\n    @property\n    def tuple(self):\n        return self.major, self.minor, self.patch\n\n    def _validate_operand(self, other):\n        if isinstance(other, str):\n            return 
Version(other)\n        elif isinstance(other, Version):\n            return other\n        raise TypeError(f\"{other} (type {type(other)}) cannot be compared to version.\")\n\n    def __eq__(self, other):\n        try:\n            other = self._validate_operand(other)\n        except (TypeError, ValueError):\n            return False\n        else:\n            return self.tuple == other.tuple\n\n    def __lt__(self, other):\n        other = self._validate_operand(other)\n        return self.tuple < other.tuple\n\n    def __hash__(self):\n        return hash(_version_tuple_to_str(self.tuple))\n\n    @classmethod\n    def from_dict(cls, dic):\n        field_names = {f.name for f in dataclasses.fields(cls)}\n        return cls(**{k: v for k, v in dic.items() if k in field_names})\n\n    def _to_yaml_string(self) -> str:\n        return self.version_str\n\n\ndef _str_to_version_tuple(version_str):\n    \"\"\"Return the tuple (major, minor, patch) version extracted from the str.\"\"\"\n    res = _VERSION_REG.match(version_str)\n    if not res:\n        raise ValueError(f\"Invalid version '{version_str}'. Format should be x.y.z with {{x,y,z}} being digits.\")\n    return tuple(int(v) for v in [res.group(\"major\"), res.group(\"minor\"), res.group(\"patch\")])\n\n\ndef _version_tuple_to_str(version_tuple):\n    \"\"\"Return the str version from the version tuple (major, minor, patch).\"\"\"\n    return \".\".join(str(v) for v in version_tuple)\n"
  },
  {
    "path": "templates/README.md",
    "content": "---\nTODO: \"Add YAML tags here. Delete these instructions and copy-paste the YAML tags obtained with the online tagging app: https://huggingface.co/spaces/huggingface/datasets-tagging\"\n---\n\n# Dataset Card for [Dataset Name]\n\n## Table of Contents\n- [Table of Contents](#table-of-contents)\n- [Dataset Description](#dataset-description)\n  - [Dataset Summary](#dataset-summary)\n  - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)\n  - [Languages](#languages)\n- [Dataset Structure](#dataset-structure)\n  - [Data Instances](#data-instances)\n  - [Data Fields](#data-fields)\n  - [Data Splits](#data-splits)\n- [Dataset Creation](#dataset-creation)\n  - [Curation Rationale](#curation-rationale)\n  - [Source Data](#source-data)\n  - [Annotations](#annotations)\n  - [Personal and Sensitive Information](#personal-and-sensitive-information)\n- [Considerations for Using the Data](#considerations-for-using-the-data)\n  - [Social Impact of Dataset](#social-impact-of-dataset)\n  - [Discussion of Biases](#discussion-of-biases)\n  - [Other Known Limitations](#other-known-limitations)\n- [Additional Information](#additional-information)\n  - [Dataset Curators](#dataset-curators)\n  - [Licensing Information](#licensing-information)\n  - [Citation Information](#citation-information)\n  - [Contributions](#contributions)\n\n## Dataset Description\n\n- **Homepage:**\n- **Repository:**\n- **Paper:**\n- **Leaderboard:**\n- **Point of Contact:**\n\n### Dataset Summary\n\n[More Information Needed]\n\n### Supported Tasks and Leaderboards\n\n[More Information Needed]\n\n### Languages\n\n[More Information Needed]\n\n## Dataset Structure\n\n### Data Instances\n\n[More Information Needed]\n\n### Data Fields\n\n[More Information Needed]\n\n### Data Splits\n\n[More Information Needed]\n\n## Dataset Creation\n\n### Curation Rationale\n\n[More Information Needed]\n\n### Source Data\n\n#### Initial Data Collection and Normalization\n\n[More Information 
Needed]\n\n#### Who are the source language producers?\n\n[More Information Needed]\n\n### Annotations\n\n#### Annotation process\n\n[More Information Needed]\n\n#### Who are the annotators?\n\n[More Information Needed]\n\n### Personal and Sensitive Information\n\n[More Information Needed]\n\n## Considerations for Using the Data\n\n### Social Impact of Dataset\n\n[More Information Needed]\n\n### Discussion of Biases\n\n[More Information Needed]\n\n### Other Known Limitations\n\n[More Information Needed]\n\n## Additional Information\n\n### Dataset Curators\n\n[More Information Needed]\n\n### Licensing Information\n\n[More Information Needed]\n\n### Citation Information\n\n[More Information Needed]\n\n### Contributions\n\nThanks to [@github-username](https://github.com/<github-username>) for adding this dataset.\n"
  },
  {
    "path": "templates/README_guide.md",
    "content": "---\nTODO: \"Add YAML tags here. Delete these instructions and copy-paste the YAML tags obtained with the online tagging app: https://huggingface.co/spaces/huggingface/datasets-tagging\"\nYAML tags: \"Find the full spec here: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1\"\n---\n\n# Dataset Card Creation Guide\n\n## Table of Contents\n- [Dataset Card Creation Guide](#dataset-card-creation-guide)\n  - [Table of Contents](#table-of-contents)\n  - [Dataset Description](#dataset-description)\n    - [Dataset Summary](#dataset-summary)\n    - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)\n    - [Languages](#languages)\n  - [Dataset Structure](#dataset-structure)\n    - [Data Instances](#data-instances)\n    - [Data Fields](#data-fields)\n    - [Data Splits](#data-splits)\n  - [Dataset Creation](#dataset-creation)\n    - [Curation Rationale](#curation-rationale)\n    - [Source Data](#source-data)\n      - [Initial Data Collection and Normalization](#initial-data-collection-and-normalization)\n      - [Who are the source language producers?](#who-are-the-source-language-producers)\n    - [Annotations](#annotations)\n      - [Annotation process](#annotation-process)\n      - [Who are the annotators?](#who-are-the-annotators)\n    - [Personal and Sensitive Information](#personal-and-sensitive-information)\n  - [Considerations for Using the Data](#considerations-for-using-the-data)\n    - [Social Impact of Dataset](#social-impact-of-dataset)\n    - [Discussion of Biases](#discussion-of-biases)\n    - [Other Known Limitations](#other-known-limitations)\n  - [Additional Information](#additional-information)\n    - [Dataset Curators](#dataset-curators)\n    - [Licensing Information](#licensing-information)\n    - [Citation Information](#citation-information)\n    - [Contributions](#contributions)\n\n## Dataset Description\n\n- **Homepage:** [Add homepage URL here if available (unless it's a GitHub 
repository)]()\n- **Repository:** [If the dataset is hosted on github or has a github homepage, add URL here]()\n- **Paper:** [If the dataset was introduced by a paper or there was a paper written describing the dataset, add URL here (landing page for Arxiv paper preferred)]()\n- **Leaderboard:** [If the dataset supports an active leaderboard, add link here]()\n- **Point of Contact:** [If known, name and email of at least one person the reader can contact for questions about the dataset.]()\n\n### Dataset Summary\n\nBriefly summarize the dataset, its intended use and the supported tasks. Give an overview of how and why the dataset was created. The summary should explicitly mention the languages present in the dataset (possibly in broad terms, e.g. *translations between several pairs of European languages*), and describe the domain, topic, or genre covered.\n\n### Supported Tasks and Leaderboards\n\nFor each of the tasks tagged for this dataset, give a brief description of the tag, metrics, and suggested models (with a link to their HuggingFace implementation if available). Give a similar description of tasks that were not covered by the structured tag set (replace the `task-category-tag` with an appropriate `other:other-task-name`).\n\n- `task-category-tag`: The dataset can be used to train a model for [TASK NAME], which consists in [TASK DESCRIPTION]. Success on this task is typically measured by achieving a *high/low* [metric name](https://huggingface.co/metrics/metric_name). The ([model name](https://huggingface.co/model_name) or [model class](https://huggingface.co/transformers/model_doc/model_class.html)) model currently achieves the following score. 
*[IF A LEADERBOARD IS AVAILABLE]:* This task has an active leaderboard which can be found at [leaderboard url]() and ranks models based on [metric name](https://huggingface.co/metrics/metric_name) while also reporting [other metric name](https://huggingface.co/metrics/other_metric_name).\n\n### Languages\n\nProvide a brief overview of the languages represented in the dataset. Describe relevant details about specifics of the language such as whether it is social media text, African American English,...\n\nWhen relevant, please provide [BCP-47 codes](https://tools.ietf.org/html/bcp47), which consist of a [primary language subtag](https://tools.ietf.org/html/bcp47#section-2.2.1), with a [script subtag](https://tools.ietf.org/html/bcp47#section-2.2.3) and/or [region subtag](https://tools.ietf.org/html/bcp47#section-2.2.4) if available.\n\n## Dataset Structure\n\n### Data Instances\n\nProvide a JSON-formatted example and brief description of a typical instance in the dataset. If available, provide a link to further examples.\n\n```\n{\n  'example_field': ...,\n  ...\n}\n```\n\nProvide any additional information that is not covered in the other sections about the data here. In particular describe any relationships between data points and if these relationships are made explicit.\n\n### Data Fields\n\nList and describe the fields present in the dataset. Mention their data type, and whether they are used as input or output in any of the tasks the dataset currently supports. If the data has span indices, describe their attributes, such as whether they are at the character level or word level, whether they are contiguous or not, etc. 
If the dataset contains example IDs, state whether they have an inherent meaning, such as a mapping to other datasets or pointing to relationships between data points.\n\n- `example_field`: description of `example_field`\n\nNote that the descriptions can be initialized with the **Show Markdown Data Fields** output of the [Datasets Tagging app](https://huggingface.co/spaces/huggingface/datasets-tagging), you will then only need to refine the generated descriptions.\n\n### Data Splits\n\nDescribe and name the splits in the dataset if there is more than one.\n\nDescribe any criteria for splitting the data, if used. If there are differences between the splits (e.g. if the training annotations are machine-generated and the dev and test ones are created by humans, or if different numbers of annotators contributed to each example), describe them here.\n\nProvide the sizes of each split. As appropriate, provide any descriptive statistics for the features, such as average length.  For example:\n\n|                         | train | validation | test |\n|-------------------------|------:|-----------:|-----:|\n| Input Sentences         |       |            |      |\n| Average Sentence Length |       |            |      |\n\n## Dataset Creation\n\n### Curation Rationale\n\nWhat need motivated the creation of this dataset? What are some of the reasons underlying the major choices involved in putting it together?\n\n### Source Data\n\nThis section describes the source data (e.g. news text and headlines, social media posts, translated sentences,...)\n\n#### Initial Data Collection and Normalization\n\nDescribe the data collection process. Describe any criteria for data selection or filtering. List any key words or search terms used. 
If possible, include runtime information for the collection process.\n\nIf data was collected from other pre-existing datasets, link to source here and to their [Hugging Face version](https://huggingface.co/datasets/dataset_name).\n\nIf the data was modified or normalized after being collected (e.g. if the data is word-tokenized), describe the process and the tools used.\n\n#### Who are the source language producers?\n\nState whether the data was produced by humans or machine generated. Describe the people or systems who originally created the data.\n\nIf available, include self-reported demographic or identity information for the source data creators, but avoid inferring this information. Instead state that this information is unknown. See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as variables, particularly gender.\n\nDescribe the conditions under which the data was created (for example, if the producers were crowdworkers, state what platform was used, or if the data was found, what website the data was found on). If compensation was provided, include that information here.\n\nDescribe other people represented or mentioned in the data. Where possible, link to references for the information.\n\n### Annotations\n\nIf the dataset contains annotations which are not part of the initial data collection, describe them in the following paragraphs.\n\n#### Annotation process\n\nIf applicable, describe the annotation process and any tools used, or state otherwise. Describe the amount of data annotated, if not all. Describe or reference annotation guidelines provided to the annotators. If available, provide interannotator statistics. 
Describe any annotation validation processes.\n\n#### Who are the annotators?\n\nIf annotations were collected for the source data (such as class labels or syntactic parses), state whether the annotations were produced by humans or machine generated.\n\nDescribe the people or systems who originally created the annotations and their selection criteria if applicable.\n\nIf available, include self-reported demographic or identity information for the annotators, but avoid inferring this information. Instead state that this information is unknown. See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as variables, particularly gender.\n\nDescribe the conditions under which the data was annotated (for example, if the annotators were crowdworkers, state what platform was used, or if the data was found, what website the data was found on). If compensation was provided, include that information here.\n\n### Personal and Sensitive Information\n\nState whether the dataset uses identity categories and, if so, how the information is used. Describe where this information comes from (i.e. self-reporting, collecting from profiles, inferring, etc.). See [Larson 2017](https://www.aclweb.org/anthology/W17-1601.pdf) for using identity categories as variables, particularly gender. State whether the data is linked to individuals and whether those individuals can be identified in the dataset, either directly or indirectly (i.e., in combination with other data).\n\nState whether the dataset contains other data that might be considered sensitive (e.g., data that reveals racial or ethnic origins, sexual orientations, religious beliefs, political opinions or union memberships, or locations; financial or health data; biometric or genetic data; forms of government identification, such as social security numbers; criminal history).  
\n\nIf efforts were made to anonymize the data, describe the anonymization process.\n\n## Considerations for Using the Data\n\n### Social Impact of Dataset\n\nPlease discuss some of the ways you believe the use of this dataset will impact society.\n\nThe statement should include both positive outlooks, such as outlining how technologies developed through its use may improve people's lives, and discuss the accompanying risks. These risks may range from making important decisions more opaque to people who are affected by the technology, to reinforcing existing harmful biases (whose specifics should be discussed in the next section), among other considerations.\n\nAlso describe in this section if the proposed dataset contains a low-resource or under-represented language. If this is the case or if this task has any impact on underserved communities, please elaborate here.\n\n### Discussion of Biases\n\nProvide descriptions of specific biases that are likely to be reflected in the data, and state whether any steps were taken to reduce their impact.\n\nFor Wikipedia text, see for example [Dinan et al 2020 on biases in Wikipedia (esp. Table 1)](https://huggingface.co/papers/2005.00614), or [Blodgett et al 2020](https://www.aclweb.org/anthology/2020.acl-main.485/) for a more general discussion of the topic.\n\nIf analyses have been run quantifying these biases, please add brief summaries and links to the studies here.\n\n### Other Known Limitations\n\nIf studies of the datasets have outlined other limitations of the dataset, such as annotation artifacts, please outline and cite them here.\n\n## Additional Information\n\n### Dataset Curators\n\nList the people involved in collecting the dataset and their affiliation(s). 
If funding information is known, include it here.\n\n### Licensing Information\n\nProvide the license and link to the license webpage if available.\n\n### Citation Information\n\nProvide the [BibTeX](http://www.bibtex.org/)-formatted reference for the dataset. For example:\n```\n@article{article_id,\n  author    = {Author List},\n  title     = {Dataset Paper Title},\n  journal   = {Publication Venue},\n  year      = {2525}\n}\n```\n\nIf the dataset has a [DOI](https://www.doi.org/), please provide it here.\n\n### Contributions\n\nThanks to [@github-username](https://github.com/<github-username>) for adding this dataset.\n"
  },
  {
    "path": "tests/__init__.py",
    "content": ""
  },
  {
    "path": "tests/_test_patching.py",
    "content": "# ruff: noqa: F401\n# This is the module that test_patching.py uses to test patch_submodule()\nimport os\nimport os as renamed_os\nfrom os import path\nfrom os import path as renamed_path\nfrom os.path import join\nfrom os.path import join as renamed_join\n\n\nopen = open  # we just need to have a builtin inside this module to test it properly\n"
  },
  {
    "path": "tests/commands/__init__.py",
    "content": ""
  },
  {
    "path": "tests/commands/conftest.py",
    "content": "import pytest\nfrom huggingface_hub import snapshot_download\n\n\n@pytest.fixture\ndef dataset_dir(tmp_path):\n    dataset_dir = tmp_path / \"test_command_dataset_dir\"\n    snapshot_download(\"hf-internal-testing/ner-jsonl\", repo_type=\"dataset\", local_dir=dataset_dir)\n    return str(dataset_dir)\n"
  },
  {
    "path": "tests/commands/test_test.py",
    "content": "import os\nfrom collections import namedtuple\n\nimport pytest\n\nfrom datasets import ClassLabel, Features, List, Value\nfrom datasets.commands.test import TestCommand\nfrom datasets.info import DatasetInfo, DatasetInfosDict\n\n\n_TestCommandArgs = namedtuple(\n    \"_TestCommandArgs\",\n    [\n        \"dataset\",\n        \"name\",\n        \"cache_dir\",\n        \"data_dir\",\n        \"all_configs\",\n        \"save_infos\",\n        \"ignore_verifications\",\n        \"force_redownload\",\n        \"clear_cache\",\n        \"num_proc\",\n    ],\n    defaults=[None, None, None, False, False, False, False, False, None],\n)\n\n\ndef is_1percent_close(source, target):\n    return (abs(source - target) / target) < 0.01\n\n\n@pytest.mark.integration\ndef test_test_command(dataset_dir):\n    args = _TestCommandArgs(dataset=dataset_dir, all_configs=True, save_infos=True)\n    test_command = TestCommand(*args)\n    test_command.run()\n    dataset_readme_path = os.path.join(dataset_dir, \"README.md\")\n    assert os.path.exists(dataset_readme_path)\n    dataset_infos = DatasetInfosDict.from_directory(dataset_dir)\n    expected_dataset_infos = DatasetInfosDict(\n        {\n            \"default\": DatasetInfo(\n                features=Features(\n                    {\n                        \"tokens\": List(Value(\"string\")),\n                        \"ner_tags\": List(\n                            ClassLabel(names=[\"O\", \"B-PER\", \"I-PER\", \"B-ORG\", \"I-ORG\", \"B-LOC\", \"I-LOC\"])\n                        ),\n                        \"langs\": List(Value(\"string\")),\n                        \"spans\": List(Value(\"string\")),\n                    }\n                ),\n                splits=[\n                    {\n                        \"name\": \"train\",\n                        \"num_bytes\": 2351563,\n                        \"num_examples\": 10000,\n                    },\n                    {\n                        \"name\": 
\"validation\",\n                        \"num_bytes\": 238418,\n                        \"num_examples\": 1000,\n                    },\n                ],\n                download_size=3940680,\n                dataset_size=2589981,\n            )\n        }\n    )\n    assert dataset_infos.keys() == expected_dataset_infos.keys()\n    for key in DatasetInfo._INCLUDED_INFO_IN_YAML:\n        result, expected = getattr(dataset_infos[\"default\"], key), getattr(expected_dataset_infos[\"default\"], key)\n        if key == \"num_bytes\":\n            assert is_1percent_close(result, expected)\n        elif key == \"splits\":\n            assert list(result) == list(expected)\n            for split in result:\n                assert result[split].name == expected[split].name\n                assert result[split].num_examples == expected[split].num_examples\n                assert is_1percent_close(result[split].num_bytes, expected[split].num_bytes)\n        else:\n            result == expected\n"
  },
  {
    "path": "tests/conftest.py",
    "content": "import pytest\n\nimport datasets\nimport datasets.config\n\n\n# Import fixture modules as plugins\npytest_plugins = [\"tests.fixtures.files\", \"tests.fixtures.hub\", \"tests.fixtures.fsspec\"]\n\n\ndef pytest_collection_modifyitems(config, items):\n    # Mark tests as \"unit\" by default if not marked as \"integration\" (or already marked as \"unit\")\n    for item in items:\n        if any(marker in item.keywords for marker in [\"integration\", \"unit\"]):\n            continue\n        item.add_marker(pytest.mark.unit)\n\n\n@pytest.fixture(autouse=True)\ndef set_test_cache_config(tmp_path_factory, monkeypatch):\n    # test_hf_cache_home = tmp_path_factory.mktemp(\"cache\")  # TODO: why a cache dir per test function does not work?\n    test_hf_cache_home = tmp_path_factory.getbasetemp() / \"cache\"\n    test_hf_datasets_cache = test_hf_cache_home / \"datasets\"\n    monkeypatch.setattr(\"datasets.config.HF_DATASETS_CACHE\", str(test_hf_datasets_cache))\n    test_downloaded_datasets_path = test_hf_datasets_cache / \"downloads\"\n    monkeypatch.setattr(\"datasets.config.DOWNLOADED_DATASETS_PATH\", str(test_downloaded_datasets_path))\n    test_extracted_datasets_path = test_hf_datasets_cache / \"downloads\" / \"extracted\"\n    monkeypatch.setattr(\"datasets.config.EXTRACTED_DATASETS_PATH\", str(test_extracted_datasets_path))\n\n    # used in dataset viewer, we may set it to true by default in the future\n    monkeypatch.setattr(\"datasets.config.SAVE_ORIGINAL_SHARD_LENGTHS\", True)\n\n\n@pytest.fixture(autouse=True)\ndef disable_implicit_token(monkeypatch):\n    monkeypatch.setattr(\"huggingface_hub.constants.HF_HUB_DISABLE_IMPLICIT_TOKEN\", True)\n\n\n@pytest.fixture(autouse=True, scope=\"session\")\ndef disable_tqdm_output():\n    datasets.disable_progress_bar()\n\n\n@pytest.fixture(autouse=True)\ndef set_update_download_counts_to_false(monkeypatch):\n    # don't take tests into account when counting downloads\n    
monkeypatch.setattr(\"datasets.config.HF_UPDATE_DOWNLOAD_COUNTS\", False)\n\n\n@pytest.fixture\ndef set_sqlalchemy_silence_uber_warning(monkeypatch):\n    # Required to suppress RemovedIn20Warning when feature(s) are not compatible with SQLAlchemy 2.0\n    # To be removed once SQLAlchemy 2.0 supported\n    try:\n        monkeypatch.setattr(\"sqlalchemy.util.deprecations.SILENCE_UBER_WARNING\", True)\n    except (ModuleNotFoundError, AttributeError):\n        pass\n\n\n@pytest.fixture(autouse=True, scope=\"session\")\ndef zero_time_out_for_remote_code():\n    datasets.config.TIME_OUT_REMOTE_CODE = 0\n"
  },
  {
    "path": "tests/distributed_scripts/run_torch_distributed.py",
    "content": "import os\nfrom argparse import ArgumentParser\nfrom typing import List\n\nimport torch.utils.data\n\nfrom datasets import Dataset, IterableDataset\nfrom datasets.distributed import split_dataset_by_node\n\n\nNUM_SHARDS = 4\nNUM_ITEMS_PER_SHARD = 3\n\n\nclass FailedTestError(RuntimeError):\n    pass\n\n\ndef gen(shards: List[str]):\n    for shard in shards:\n        for i in range(NUM_ITEMS_PER_SHARD):\n            yield {\"i\": i, \"shard\": shard}\n\n\ndef main():\n    rank = int(os.environ[\"RANK\"])\n    world_size = int(os.environ[\"WORLD_SIZE\"])\n\n    parser = ArgumentParser()\n    parser.add_argument(\"--streaming\", type=bool)\n    parser.add_argument(\"--local_rank\", type=int)\n    parser.add_argument(\"--num_workers\", type=int, default=0)\n    args = parser.parse_args()\n    streaming = args.streaming\n    num_workers = args.num_workers\n\n    gen_kwargs = {\"shards\": [f\"shard_{shard_idx}\" for shard_idx in range(NUM_SHARDS)]}\n    ds = IterableDataset.from_generator(gen, gen_kwargs=gen_kwargs)\n    if not streaming:\n        ds = Dataset.from_list(list(ds))\n\n    ds = split_dataset_by_node(ds, rank=rank, world_size=world_size)\n    dataloader = torch.utils.data.DataLoader(ds, num_workers=num_workers)\n\n    full_size = NUM_SHARDS * NUM_ITEMS_PER_SHARD\n    expected_local_size = full_size // world_size\n    expected_local_size += int(rank < (full_size % world_size))\n\n    local_size = sum(1 for _ in dataloader)\n    if local_size != expected_local_size:\n        raise FailedTestError(f\"local_size {local_size} != expected_local_size {expected_local_size}\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "tests/features/__init__.py",
    "content": ""
  },
  {
    "path": "tests/features/test_array_xd.py",
    "content": "import os\nimport random\nimport tempfile\nimport unittest\n\nimport numpy as np\nimport pandas as pd\nimport pyarrow as pa\nimport pytest\nfrom absl.testing import parameterized\n\nimport datasets\nfrom datasets.arrow_writer import ArrowWriter\nfrom datasets.features import Array2D, Array3D, Array4D, Array5D, Value\nfrom datasets.features.features import Array3DExtensionType, PandasArrayExtensionDtype, _ArrayXD\nfrom datasets.formatting.formatting import NumpyArrowExtractor, SimpleArrowExtractor\n\n\nSHAPE_TEST_1 = (30, 487)\nSHAPE_TEST_2 = (36, 1024)\nSHAPE_TEST_3 = (None, 100)\nSPEED_TEST_SHAPE = (100, 100)\nSPEED_TEST_N_EXAMPLES = 100\n\nDEFAULT_FEATURES = datasets.Features(\n    {\n        \"text\": Array2D(SHAPE_TEST_1, dtype=\"float32\"),\n        \"image\": Array2D(SHAPE_TEST_2, dtype=\"float32\"),\n        \"dynamic\": Array2D(SHAPE_TEST_3, dtype=\"float32\"),\n    }\n)\n\n\ndef generate_examples(features: dict, num_examples=100, seq_shapes=None):\n    dummy_data = []\n    seq_shapes = seq_shapes or {}\n    for i in range(num_examples):\n        example = {}\n        for col_id, (k, v) in enumerate(features.items()):\n            if isinstance(v, _ArrayXD):\n                if k == \"dynamic\":\n                    first_dim = random.randint(1, 3)\n                    data = np.random.rand(first_dim, *v.shape[1:]).astype(v.dtype)\n                else:\n                    data = np.random.rand(*v.shape).astype(v.dtype)\n            elif isinstance(v, datasets.Value):\n                data = \"foo\"\n            elif isinstance(v, datasets.Sequence):\n                while isinstance(v, datasets.Sequence):\n                    v = v.feature\n                shape = seq_shapes[k]\n                data = np.random.rand(*shape).astype(v.dtype)\n            example[k] = data\n            dummy_data.append((i, example))\n\n    return dummy_data\n\n\nclass ExtensionTypeCompatibilityTest(unittest.TestCase):\n    def 
test_array2d_nonspecific_shape(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            my_features = DEFAULT_FEATURES.copy()\n            with ArrowWriter(features=my_features, path=os.path.join(tmp_dir, \"beta.arrow\")) as writer:\n                for key, record in generate_examples(\n                    features=my_features,\n                    num_examples=1,\n                ):\n                    example = my_features.encode_example(record)\n                    writer.write(example)\n                num_examples, num_bytes = writer.finalize()\n            dataset = datasets.Dataset.from_file(os.path.join(tmp_dir, \"beta.arrow\"))\n            dataset.set_format(\"numpy\")\n            row = dataset[0]\n            first_shape = row[\"image\"].shape\n            second_shape = row[\"text\"].shape\n            self.assertTrue(first_shape is not None and second_shape is not None, \"need atleast 2 different shapes\")\n            self.assertEqual(len(first_shape), len(second_shape), \"both shapes are supposed to be equal length\")\n            self.assertNotEqual(first_shape, second_shape, \"shapes must not be the same\")\n            del dataset\n\n    def test_multiple_extensions_same_row(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            my_features = DEFAULT_FEATURES.copy()\n            with ArrowWriter(features=my_features, path=os.path.join(tmp_dir, \"beta.arrow\")) as writer:\n                for key, record in generate_examples(features=my_features, num_examples=1):\n                    example = my_features.encode_example(record)\n                    writer.write(example)\n                num_examples, num_bytes = writer.finalize()\n            dataset = datasets.Dataset.from_file(os.path.join(tmp_dir, \"beta.arrow\"))\n            dataset.set_format(\"numpy\")\n            row = dataset[0]\n            first_len = len(row[\"image\"].shape)\n            second_len = len(row[\"text\"].shape)\n            
third_len = len(row[\"dynamic\"].shape)\n            self.assertEqual(first_len, 2, \"use a sequence type if dim is  < 2\")\n            self.assertEqual(second_len, 2, \"use a sequence type if dim is  < 2\")\n            self.assertEqual(third_len, 2, \"use a sequence type if dim is  < 2\")\n            del dataset\n\n    def test_compatability_with_string_values(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            my_features = DEFAULT_FEATURES.copy()\n            my_features[\"image_id\"] = datasets.Value(\"string\")\n            with ArrowWriter(features=my_features, path=os.path.join(tmp_dir, \"beta.arrow\")) as writer:\n                for key, record in generate_examples(features=my_features, num_examples=1):\n                    example = my_features.encode_example(record)\n                    writer.write(example)\n                num_examples, num_bytes = writer.finalize()\n            dataset = datasets.Dataset.from_file(os.path.join(tmp_dir, \"beta.arrow\"))\n            self.assertIsInstance(dataset[0][\"image_id\"], str, \"image id must be of type string\")\n            del dataset\n\n    def test_extension_indexing(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            my_features = DEFAULT_FEATURES.copy()\n            my_features[\"explicit_ext\"] = Array2D((3, 3), dtype=\"float32\")\n            with ArrowWriter(features=my_features, path=os.path.join(tmp_dir, \"beta.arrow\")) as writer:\n                for key, record in generate_examples(features=my_features, num_examples=1):\n                    example = my_features.encode_example(record)\n                    writer.write(example)\n                num_examples, num_bytes = writer.finalize()\n            dataset = datasets.Dataset.from_file(os.path.join(tmp_dir, \"beta.arrow\"))\n            dataset.set_format(\"numpy\")\n            data = dataset[0][\"explicit_ext\"]\n            self.assertIsInstance(data, np.ndarray, \"indexed extension must 
return numpy.ndarray\")\n            del dataset\n\n\ndef get_array_feature_types():\n    shape_1 = [3] * 5\n    shape_2 = [3, 4, 5, 6, 7]\n    return [\n        {\n            \"testcase_name\": f\"{d}d\",\n            \"array_feature\": array_feature,\n            \"shape_1\": tuple(shape_1[:d]),\n            \"shape_2\": tuple(shape_2[:d]),\n        }\n        for d, array_feature in zip(range(2, 6), [Array2D, Array3D, Array4D, Array5D])\n    ]\n\n\n@parameterized.named_parameters(get_array_feature_types())\nclass ArrayXDTest(unittest.TestCase):\n    def get_features(self, array_feature, shape_1, shape_2):\n        return datasets.Features(\n            {\n                \"image\": array_feature(shape_1, dtype=\"float32\"),\n                \"source\": Value(\"string\"),\n                \"matrix\": array_feature(shape_2, dtype=\"float32\"),\n            }\n        )\n\n    def get_dict_example_0(self, shape_1, shape_2):\n        return {\n            \"image\": np.random.rand(*shape_1).astype(\"float32\"),\n            \"source\": \"foo\",\n            \"matrix\": np.random.rand(*shape_2).astype(\"float32\"),\n        }\n\n    def get_dict_example_1(self, shape_1, shape_2):\n        return {\n            \"image\": np.random.rand(*shape_1).astype(\"float32\"),\n            \"matrix\": np.random.rand(*shape_2).astype(\"float32\"),\n            \"source\": \"bar\",\n        }\n\n    def get_dict_examples(self, shape_1, shape_2):\n        return {\n            \"image\": np.random.rand(2, *shape_1).astype(\"float32\").tolist(),\n            \"source\": [\"foo\", \"bar\"],\n            \"matrix\": np.random.rand(2, *shape_2).astype(\"float32\").tolist(),\n        }\n\n    def _check_getitem_output_type(self, dataset, shape_1, shape_2, first_matrix):\n        matrix_column = dataset[\"matrix\"][:]\n        self.assertIsInstance(matrix_column, list)\n        self.assertIsInstance(matrix_column[0], list)\n        self.assertIsInstance(matrix_column[0][0], list)\n     
   self.assertTupleEqual(np.array(matrix_column).shape, (2, *shape_2))\n\n        matrix_field_of_first_example = dataset[0][\"matrix\"]\n        self.assertIsInstance(matrix_field_of_first_example, list)\n        self.assertIsInstance(matrix_field_of_first_example, list)\n        self.assertEqual(np.array(matrix_field_of_first_example).shape, shape_2)\n        np.testing.assert_array_equal(np.array(matrix_field_of_first_example), np.array(first_matrix))\n\n        matrix_field_of_first_two_examples = dataset[:2][\"matrix\"]\n        self.assertIsInstance(matrix_field_of_first_two_examples, list)\n        self.assertIsInstance(matrix_field_of_first_two_examples[0], list)\n        self.assertIsInstance(matrix_field_of_first_two_examples[0][0], list)\n        self.assertTupleEqual(np.array(matrix_field_of_first_two_examples).shape, (2, *shape_2))\n\n        with dataset.formatted_as(\"numpy\"):\n            self.assertTupleEqual(dataset[\"matrix\"][:].shape, (2, *shape_2))\n            self.assertEqual(dataset[0][\"matrix\"].shape, shape_2)\n            self.assertTupleEqual(dataset[:2][\"matrix\"].shape, (2, *shape_2))\n\n        with dataset.formatted_as(\"pandas\"):\n            self.assertIsInstance(dataset[\"matrix\"], pd.Series)\n            self.assertIsInstance(dataset[0][\"matrix\"], pd.Series)\n            self.assertIsInstance(dataset[:2][\"matrix\"], pd.Series)\n            self.assertTupleEqual(dataset[\"matrix\"].to_numpy().shape, (2, *shape_2))\n            self.assertTupleEqual(dataset[0][\"matrix\"].to_numpy().shape, (1, *shape_2))\n            self.assertTupleEqual(dataset[:2][\"matrix\"].to_numpy().shape, (2, *shape_2))\n\n    def test_write(self, array_feature, shape_1, shape_2):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            my_features = self.get_features(array_feature, shape_1, shape_2)\n            my_examples = [\n                (0, self.get_dict_example_0(shape_1, shape_2)),\n                (1, 
self.get_dict_example_1(shape_1, shape_2)),\n            ]\n            with ArrowWriter(features=my_features, path=os.path.join(tmp_dir, \"beta.arrow\")) as writer:\n                for key, record in my_examples:\n                    example = my_features.encode_example(record)\n                    writer.write(example)\n                num_examples, num_bytes = writer.finalize()\n            dataset = datasets.Dataset.from_file(os.path.join(tmp_dir, \"beta.arrow\"))\n            self._check_getitem_output_type(dataset, shape_1, shape_2, my_examples[0][1][\"matrix\"])\n            del dataset\n\n    def test_write_batch(self, array_feature, shape_1, shape_2):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            my_features = self.get_features(array_feature, shape_1, shape_2)\n            dict_examples = self.get_dict_examples(shape_1, shape_2)\n            dict_examples = my_features.encode_batch(dict_examples)\n            with ArrowWriter(features=my_features, path=os.path.join(tmp_dir, \"beta.arrow\")) as writer:\n                writer.write_batch(dict_examples)\n                num_examples, num_bytes = writer.finalize()\n            dataset = datasets.Dataset.from_file(os.path.join(tmp_dir, \"beta.arrow\"))\n            self._check_getitem_output_type(dataset, shape_1, shape_2, dict_examples[\"matrix\"][0])\n            del dataset\n\n    def test_from_dict(self, array_feature, shape_1, shape_2):\n        dict_examples = self.get_dict_examples(shape_1, shape_2)\n        dataset = datasets.Dataset.from_dict(\n            dict_examples, features=self.get_features(array_feature, shape_1, shape_2)\n        )\n        self._check_getitem_output_type(dataset, shape_1, shape_2, dict_examples[\"matrix\"][0])\n        del dataset\n\n\nclass ArrayXDDynamicTest(unittest.TestCase):\n    def get_one_col_dataset(self, first_dim_list, fixed_shape):\n        features = datasets.Features({\"image\": Array3D(shape=(None, *fixed_shape), dtype=\"float32\")})\n  
      dict_values = {\"image\": [np.random.rand(fdim, *fixed_shape).astype(\"float32\") for fdim in first_dim_list]}\n        dataset = datasets.Dataset.from_dict(dict_values, features=features)\n        return dataset\n\n    def get_two_col_datasset(self, first_dim_list, fixed_shape):\n        features = datasets.Features(\n            {\"image\": Array3D(shape=(None, *fixed_shape), dtype=\"float32\"), \"text\": Value(\"string\")}\n        )\n        dict_values = {\n            \"image\": [np.random.rand(fdim, *fixed_shape).astype(\"float32\") for fdim in first_dim_list],\n            \"text\": [\"text\" for _ in first_dim_list],\n        }\n        dataset = datasets.Dataset.from_dict(dict_values, features=features)\n        return dataset\n\n    def test_to_pylist(self):\n        fixed_shape = (2, 2)\n        first_dim_list = [1, 3, 10]\n        dataset = self.get_one_col_dataset(first_dim_list, fixed_shape)\n        arr_xd = SimpleArrowExtractor().extract_column(dataset._data)\n        self.assertIsInstance(arr_xd.type, Array3DExtensionType)\n        pylist = arr_xd.to_pylist()\n\n        for first_dim, single_arr in zip(first_dim_list, pylist):\n            self.assertIsInstance(single_arr, list)\n            self.assertTupleEqual(np.array(single_arr).shape, (first_dim, *fixed_shape))\n\n    def test_to_numpy(self):\n        fixed_shape = (2, 2)\n\n        # ragged\n        first_dim_list = [1, 3, 10]\n        dataset = self.get_one_col_dataset(first_dim_list, fixed_shape)\n        arr_xd = SimpleArrowExtractor().extract_column(dataset._data)\n        self.assertIsInstance(arr_xd.type, Array3DExtensionType)\n        # replace with arr_xd = arr_xd.combine_chunks() when 12.0.0 will be the minimal required PyArrow version\n        arr_xd = arr_xd.type.wrap_array(pa.concat_arrays([chunk.storage for chunk in arr_xd.chunks]))\n        numpy_arr = arr_xd.to_numpy()\n\n        self.assertIsInstance(numpy_arr, np.ndarray)\n        self.assertEqual(numpy_arr.dtype, 
object)\n        for first_dim, single_arr in zip(first_dim_list, numpy_arr):\n            self.assertIsInstance(single_arr, np.ndarray)\n            self.assertTupleEqual(single_arr.shape, (first_dim, *fixed_shape))\n\n        # non-ragged\n        first_dim_list = [4, 4, 4]\n        dataset = self.get_one_col_dataset(first_dim_list, fixed_shape)\n        arr_xd = SimpleArrowExtractor().extract_column(dataset._data)\n        self.assertIsInstance(arr_xd.type, Array3DExtensionType)\n        # replace with arr_xd = arr_xd.combine_chunks() when 12.0.0 will be the minimal required PyArrow version\n        arr_xd = arr_xd.type.wrap_array(pa.concat_arrays([chunk.storage for chunk in arr_xd.chunks]))\n        numpy_arr = arr_xd.to_numpy()\n\n        self.assertIsInstance(numpy_arr, np.ndarray)\n        self.assertNotEqual(numpy_arr.dtype, object)\n        for first_dim, single_arr in zip(first_dim_list, numpy_arr):\n            self.assertIsInstance(single_arr, np.ndarray)\n            self.assertTupleEqual(single_arr.shape, (first_dim, *fixed_shape))\n\n    def test_iter_dataset(self):\n        fixed_shape = (2, 2)\n        first_dim_list = [1, 3, 10]\n        dataset = self.get_one_col_dataset(first_dim_list, fixed_shape)\n\n        for first_dim, ds_row in zip(first_dim_list, dataset):\n            single_arr = ds_row[\"image\"]\n            self.assertIsInstance(single_arr, list)\n            self.assertTupleEqual(np.array(single_arr).shape, (first_dim, *fixed_shape))\n\n    def test_to_pandas(self):\n        fixed_shape = (2, 2)\n\n        # ragged\n        first_dim_list = [1, 3, 10]\n        dataset = self.get_one_col_dataset(first_dim_list, fixed_shape)\n        df = dataset.to_pandas()\n        self.assertEqual(type(df.image.dtype), PandasArrayExtensionDtype)\n        numpy_arr = df.image.to_numpy()\n\n        self.assertIsInstance(numpy_arr, np.ndarray)\n        self.assertEqual(numpy_arr.dtype, object)\n        for first_dim, single_arr in zip(first_dim_list, 
numpy_arr):\n            self.assertIsInstance(single_arr, np.ndarray)\n            self.assertTupleEqual(single_arr.shape, (first_dim, *fixed_shape))\n\n        # non-ragged\n        first_dim_list = [4, 4, 4]\n        dataset = self.get_one_col_dataset(first_dim_list, fixed_shape)\n        df = dataset.to_pandas()\n        self.assertEqual(type(df.image.dtype), PandasArrayExtensionDtype)\n        numpy_arr = df.image.to_numpy()\n\n        self.assertIsInstance(numpy_arr, np.ndarray)\n        self.assertNotEqual(numpy_arr.dtype, object)\n        for first_dim, single_arr in zip(first_dim_list, numpy_arr):\n            self.assertIsInstance(single_arr, np.ndarray)\n            self.assertTupleEqual(single_arr.shape, (first_dim, *fixed_shape))\n\n    def test_map_dataset(self):\n        fixed_shape = (2, 2)\n        first_dim_list = [1, 3, 10]\n        dataset = self.get_one_col_dataset(first_dim_list, fixed_shape)\n\n        dataset = dataset.map(lambda a: {\"image\": np.concatenate([a] * 2)}, input_columns=\"image\")\n\n        # check also if above function resulted with 2x bigger first dim\n        for first_dim, ds_row in zip(first_dim_list, dataset):\n            single_arr = ds_row[\"image\"]\n            self.assertIsInstance(single_arr, list)\n            self.assertTupleEqual(np.array(single_arr).shape, (first_dim * 2, *fixed_shape))\n\n\n@pytest.mark.parametrize(\"dtype, dummy_value\", [(\"int32\", 1), (\"bool\", True), (\"float64\", 1)])\ndef test_table_to_pandas(dtype, dummy_value):\n    features = datasets.Features({\"foo\": datasets.Array2D(dtype=dtype, shape=(2, 2))})\n    dataset = datasets.Dataset.from_dict({\"foo\": [[[dummy_value] * 2] * 2]}, features=features)\n    df = dataset._data.to_pandas()\n    assert isinstance(df.foo.dtype, PandasArrayExtensionDtype)\n    arr = df.foo.to_numpy()\n    np.testing.assert_equal(arr, np.array([[[dummy_value] * 2] * 2], dtype=np.dtype(dtype)))\n\n\n@pytest.mark.parametrize(\"dtype, dummy_value\", [(\"int32\", 
1), (\"bool\", True), (\"float64\", 1)])\ndef test_array_xd_numpy_arrow_extractor(dtype, dummy_value):\n    features = datasets.Features({\"foo\": datasets.Array2D(dtype=dtype, shape=(2, 2))})\n    dataset = datasets.Dataset.from_dict({\"foo\": [[[dummy_value] * 2] * 2]}, features=features)\n    arr = NumpyArrowExtractor().extract_column(dataset._data)\n    assert isinstance(arr, np.ndarray)\n    np.testing.assert_equal(arr, np.array([[[dummy_value] * 2] * 2], dtype=np.dtype(dtype)))\n\n\ndef test_array_xd_with_none():\n    # Fixed shape\n    features = datasets.Features({\"foo\": datasets.Array2D(dtype=\"int32\", shape=(2, 2))})\n    dummy_array = np.array([[1, 2], [3, 4]], dtype=\"int32\")\n    dataset = datasets.Dataset.from_dict({\"foo\": [dummy_array, None, dummy_array, None]}, features=features)\n    arr = NumpyArrowExtractor().extract_column(dataset._data)\n    assert isinstance(arr, np.ndarray) and arr.dtype == np.float64 and arr.shape == (4, 2, 2)\n    assert np.allclose(arr[0], dummy_array) and np.allclose(arr[2], dummy_array)\n    assert np.all(np.isnan(arr[1])) and np.all(np.isnan(arr[3]))  # broadcasted np.nan - use np.all\n\n    # Dynamic shape\n    features = datasets.Features({\"foo\": datasets.Array2D(dtype=\"int32\", shape=(None, 2))})\n    dummy_array = np.array([[1, 2], [3, 4]], dtype=\"int32\")\n    dataset = datasets.Dataset.from_dict({\"foo\": [dummy_array, None, dummy_array, None]}, features=features)\n    arr = NumpyArrowExtractor().extract_column(dataset._data)\n    assert isinstance(arr, np.ndarray) and arr.dtype == object and arr.shape == (4,)\n    np.testing.assert_equal(arr[0], dummy_array)\n    np.testing.assert_equal(arr[2], dummy_array)\n    assert np.isnan(arr[1]) and np.isnan(arr[3])  # a single np.nan value - np.all not needed\n\n\n@pytest.mark.parametrize(\"seq_type\", [\"no_sequence\", \"sequence\", \"sequence_of_sequence\"])\n@pytest.mark.parametrize(\n    \"dtype\",\n    [\n        \"bool\",\n        \"int8\",\n        
\"int16\",\n        \"int32\",\n        \"int64\",\n        \"uint8\",\n        \"uint16\",\n        \"uint32\",\n        \"uint64\",\n        \"float16\",\n        \"float32\",\n        \"float64\",\n    ],\n)\n@pytest.mark.parametrize(\"shape, feature_class\", [((2, 3), datasets.Array2D), ((2, 3, 4), datasets.Array3D)])\ndef test_array_xd_with_np(seq_type, dtype, shape, feature_class):\n    feature = feature_class(dtype=dtype, shape=shape)\n    data = np.zeros(shape, dtype=dtype)\n    expected = data.tolist()\n    if seq_type == \"sequence\":\n        feature = datasets.List(feature)\n        data = [data]\n        expected = [expected]\n    elif seq_type == \"sequence_of_sequence\":\n        feature = datasets.List(datasets.List(feature))\n        data = [[data]]\n        expected = [[expected]]\n    ds = datasets.Dataset.from_dict({\"col\": [data]}, features=datasets.Features({\"col\": feature}))\n    assert ds[0][\"col\"] == expected\n\n\n@pytest.mark.parametrize(\"with_none\", [False, True])\ndef test_dataset_map(with_none):\n    ds = datasets.Dataset.from_dict({\"path\": [\"path1\", \"path2\"]})\n\n    def process_data(batch):\n        batch = {\n            \"image\": [\n                np.array(\n                    [\n                        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],\n                        [[10, 20, 30], [40, 50, 60], [70, 80, 90]],\n                        [[100, 200, 300], [400, 500, 600], [700, 800, 900]],\n                    ]\n                )\n                for _ in batch[\"path\"]\n            ]\n        }\n        if with_none:\n            batch[\"image\"][0] = None\n        return batch\n\n    features = datasets.Features({\"image\": Array3D(dtype=\"int32\", shape=(3, 3, 3))})\n    processed_ds = ds.map(process_data, batched=True, remove_columns=ds.column_names, features=features)\n    assert processed_ds.shape == (2, 1)\n    with processed_ds.with_format(\"numpy\") as pds:\n        for i, example in enumerate(pds):\n            
assert \"image\" in example\n            assert isinstance(example[\"image\"], np.ndarray)\n            assert example[\"image\"].shape == (3, 3, 3)\n            if with_none and i == 0:\n                assert np.all(np.isnan(example[\"image\"]))\n"
  },
  {
    "path": "tests/features/test_audio.py",
    "content": "import os\nimport tarfile\nfrom itertools import product\nfrom pathlib import Path\n\nimport numpy as np\nimport pyarrow as pa\nimport pytest\n\nfrom datasets import Column, Dataset, concatenate_datasets, load_dataset\nfrom datasets.features import Audio, Features, List, Value\n\nfrom ..utils import require_torchcodec\n\n\n@pytest.fixture()\ndef tar_wav_path(shared_datadir, tmp_path_factory):\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    path = tmp_path_factory.mktemp(\"data\") / \"audio_data.wav.tar\"\n    with tarfile.TarFile(path, \"w\") as f:\n        f.add(audio_path, arcname=os.path.basename(audio_path))\n    return path\n\n\n@pytest.fixture()\ndef tar_mp3_path(shared_datadir, tmp_path_factory):\n    audio_path = str(shared_datadir / \"test_audio_44100.mp3\")\n    path = tmp_path_factory.mktemp(\"data\") / \"audio_data.mp3.tar\"\n    with tarfile.TarFile(path, \"w\") as f:\n        f.add(audio_path, arcname=os.path.basename(audio_path))\n    return path\n\n\ndef iter_archive(archive_path):\n    with tarfile.open(archive_path) as tar:\n        for tarinfo in tar:\n            file_path = tarinfo.name\n            file_obj = tar.extractfile(tarinfo)\n            yield file_path, file_obj\n\n\ndef test_audio_instantiation():\n    audio = Audio()\n    assert audio.sampling_rate is None\n    assert audio.id is None\n    assert audio.stream_index is None\n\n    assert audio.dtype == \"dict\"\n    assert audio.pa_type == pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})\n    assert audio._type == \"Audio\"\n\n\ndef test_audio_feature_type_to_arrow():\n    features = Features({\"audio\": Audio()})\n    assert features.arrow_schema == pa.schema({\"audio\": Audio().pa_type})\n    features = Features({\"struct_containing_an_audio\": {\"audio\": Audio()}})\n    assert features.arrow_schema == pa.schema({\"struct_containing_an_audio\": pa.struct({\"audio\": Audio().pa_type})})\n    features = 
Features({\"sequence_of_audios\": List(Audio())})\n    assert features.arrow_schema == pa.schema({\"sequence_of_audios\": pa.list_(Audio().pa_type)})\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\n    \"build_example\",\n    [\n        lambda audio_path: audio_path,\n        lambda audio_path: Path(audio_path),\n        lambda audio_path: open(audio_path, \"rb\").read(),\n        lambda audio_path: {\"path\": audio_path},\n        lambda audio_path: {\"path\": audio_path, \"bytes\": None},\n        lambda audio_path: {\"path\": audio_path, \"bytes\": open(audio_path, \"rb\").read()},\n        lambda audio_path: {\"path\": None, \"bytes\": open(audio_path, \"rb\").read()},\n        lambda audio_path: {\"bytes\": open(audio_path, \"rb\").read()},\n        lambda audio_path: {\"array\": np.array([0.1, 0.2, 0.3]), \"sampling_rate\": 16_000},\n    ],\n)\ndef test_audio_feature_encode_example(shared_datadir, build_example):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    audio = Audio()\n    encoded_example = audio.encode_example(build_example(audio_path))\n    assert isinstance(encoded_example, dict)\n    assert encoded_example.keys() == {\"bytes\", \"path\"}\n    assert encoded_example[\"bytes\"] is not None or encoded_example[\"path\"] is not None\n    decoded_example = audio.decode_example(encoded_example)\n    assert isinstance(decoded_example, AudioDecoder)\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\n    \"build_example\",\n    [\n        lambda audio_path: {\"path\": audio_path, \"sampling_rate\": 16_000},\n        lambda audio_path: {\"path\": audio_path, \"bytes\": None, \"sampling_rate\": 16_000},\n        lambda audio_path: {\"path\": audio_path, \"bytes\": open(audio_path, \"rb\").read(), \"sampling_rate\": 16_000},\n        lambda audio_path: {\"array\": np.array([0.1, 0.2, 0.3]), \"sampling_rate\": 16_000},\n    ],\n)\ndef 
test_audio_feature_encode_example_pcm(shared_datadir, build_example):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_16000.pcm\")\n    audio = Audio(sampling_rate=16_000)\n    encoded_example = audio.encode_example(build_example(audio_path))\n    assert isinstance(encoded_example, dict)\n    assert encoded_example.keys() == {\"bytes\", \"path\"}\n    assert encoded_example[\"bytes\"] is not None or encoded_example[\"path\"] is not None\n    decoded_example = audio.decode_example(encoded_example)\n    assert isinstance(decoded_example, AudioDecoder)\n\n\nsample_rates = [16_000, 48_000]\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\n    \"in_sample_rate,out_sample_rate\",\n    list(product(sample_rates, sample_rates)),\n)\ndef test_audio_feature_encode_example_audiodecoder(shared_datadir, in_sample_rate, out_sample_rate):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    audio = Audio(sampling_rate=out_sample_rate)\n    example = AudioDecoder(audio_path, sample_rate=in_sample_rate)\n    encoded_example = audio.encode_example(example)\n    assert isinstance(encoded_example, dict)\n    assert encoded_example.keys() == {\"bytes\", \"path\"}\n    assert encoded_example[\"bytes\"] is not None or encoded_example[\"path\"] is not None\n    decoded_example = audio.decode_example(encoded_example)\n    assert isinstance(decoded_example, AudioDecoder)\n\n\n@require_torchcodec\ndef test_audio_decode_example(shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    audio = Audio()\n    decoded_example = audio.decode_example(audio.encode_example(audio_path))\n    assert isinstance(decoded_example, AudioDecoder)\n    samples = decoded_example.get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (2, 202311)\n\n    with 
pytest.raises(RuntimeError):\n        Audio(decode=False).decode_example(audio_path)\n\n\n@require_torchcodec\ndef test_audio_resampling(shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    audio = Audio(sampling_rate=16000)\n    decoded_example = audio.decode_example(audio.encode_example(audio_path))\n    assert isinstance(decoded_example, AudioDecoder)\n    samples = decoded_example.get_all_samples()\n    assert samples.sample_rate == 16000\n    assert samples.data.shape == (2, 73401)\n\n\n@require_torchcodec\ndef test_audio_decode_example_mp3(shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.mp3\")\n    audio = Audio()\n    decoded_example = audio.decode_example(audio.encode_example(audio_path))\n    print(\"decoded_example\", decoded_example)\n    assert isinstance(decoded_example, AudioDecoder)\n    samples = decoded_example.get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (2, 110592)\n\n\n@require_torchcodec\ndef test_audio_decode_example_opus(shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_48000.opus\")\n    audio = Audio()\n    decoded_example = audio.decode_example(audio.encode_example(audio_path))\n    assert isinstance(decoded_example, AudioDecoder)\n    samples = decoded_example.get_all_samples()\n    assert samples.sample_rate == 48000\n    assert samples.data.shape == (1, 48000)\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\"sampling_rate\", [16_000, 48_000])\ndef test_audio_decode_example_pcm(shared_datadir, sampling_rate):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_16000.pcm\")\n    audio_input = {\"path\": audio_path, \"sampling_rate\": 16_000}\n    audio = Audio(sampling_rate=sampling_rate)\n    
decoded_example = audio.decode_example(audio.encode_example(audio_input))\n    assert isinstance(decoded_example, AudioDecoder)\n    samples = decoded_example.get_all_samples()\n    assert samples.sample_rate == sampling_rate\n    assert samples.data.shape == (1, 16208 * sampling_rate // 16_000)\n\n\n@require_torchcodec\ndef test_audio_resampling_mp3_different_sampling_rates(shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.mp3\")\n    audio_path2 = str(shared_datadir / \"test_audio_16000.mp3\")\n    audio = Audio(sampling_rate=48000)\n\n    decoded_example = audio.decode_example(audio.encode_example(audio_path))\n    assert isinstance(decoded_example, AudioDecoder)\n    samples = decoded_example.get_all_samples()\n    assert samples.sample_rate == 48000\n    assert samples.data.shape == (2, 120373)\n\n    decoded_example = audio.decode_example(audio.encode_example(audio_path2))\n    assert isinstance(decoded_example, AudioDecoder)\n    samples = decoded_example.get_all_samples()\n    assert samples.sample_rate == 48000\n    assert samples.data.shape == (2, 122688)\n\n\n@require_torchcodec\ndef test_backwards_compatibility(shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.mp3\")\n    audio_path2 = str(shared_datadir / \"test_audio_16000.mp3\")\n    audio = Audio(sampling_rate=48000)\n\n    decoded_example = audio.decode_example(audio.encode_example(audio_path))\n    assert isinstance(decoded_example, AudioDecoder)\n    samples = decoded_example.get_all_samples()\n    assert decoded_example[\"sampling_rate\"] == samples.sample_rate\n    assert decoded_example[\"array\"].ndim == 1  # mono\n    assert abs(decoded_example[\"array\"].shape[0] - samples.data.shape[1]) < 2  # can have off by one error\n\n    decoded_example = audio.decode_example(audio.encode_example(audio_path2))\n    assert isinstance(decoded_example, 
AudioDecoder)\n    samples = decoded_example.get_all_samples()\n    assert decoded_example[\"sampling_rate\"] == samples.sample_rate\n    assert decoded_example[\"array\"].ndim == 1  # mono\n    assert abs(decoded_example[\"array\"].shape[0] - samples.data.shape[1]) < 2  # can have off by one error\n\n\n@require_torchcodec\ndef test_dataset_with_audio_feature(shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    data = {\"audio\": [audio_path]}\n    features = Features({\"audio\": Audio()})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"audio\"}\n    assert isinstance(item[\"audio\"], AudioDecoder)\n    samples = item[\"audio\"].get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (2, 202311)\n    batch = dset[:1]\n    assert batch.keys() == {\"audio\"}\n    assert len(batch[\"audio\"]) == 1\n    assert isinstance(batch[\"audio\"][0], AudioDecoder)\n    samples = batch[\"audio\"][0].get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (2, 202311)\n    column = dset[\"audio\"]\n    assert len(column) == 1\n    assert isinstance(column[0], AudioDecoder)\n    samples = column[0].get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (2, 202311)\n\n\n@require_torchcodec\ndef test_dataset_with_audio_feature_tar_wav(tar_wav_path):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_filename = \"test_audio_44100.wav\"\n    data = {\"audio\": []}\n    for file_path, file_obj in iter_archive(tar_wav_path):\n        data[\"audio\"].append({\"path\": file_path, \"bytes\": file_obj.read()})\n        break\n    features = Features({\"audio\": Audio()})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"audio\"}\n    assert isinstance(item[\"audio\"], 
AudioDecoder)\n    samples = item[\"audio\"].get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (2, 202311)\n    assert item[\"audio\"].metadata.path == audio_filename\n    batch = dset[:1]\n    assert batch.keys() == {\"audio\"}\n    assert len(batch[\"audio\"]) == 1\n    assert isinstance(batch[\"audio\"][0], AudioDecoder)\n    samples = batch[\"audio\"][0].get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (2, 202311)\n    assert batch[\"audio\"][0].metadata.path == audio_filename\n    column = dset[\"audio\"]\n    assert len(column) == 1\n    assert isinstance(column[0], AudioDecoder)\n    samples = column[0].get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (2, 202311)\n\n\n@require_torchcodec\ndef test_dataset_with_audio_feature_tar_mp3(tar_mp3_path):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_filename = \"test_audio_44100.mp3\"\n    data = {\"audio\": []}\n    for file_path, file_obj in iter_archive(tar_mp3_path):\n        data[\"audio\"].append({\"path\": file_path, \"bytes\": file_obj.read()})\n        break\n    features = Features({\"audio\": Audio()})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"audio\"}\n    assert isinstance(item[\"audio\"], AudioDecoder)\n    samples = item[\"audio\"].get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (2, 110592)\n    assert item[\"audio\"].metadata.path == audio_filename\n    batch = dset[:1]\n    assert batch.keys() == {\"audio\"}\n    assert len(batch[\"audio\"]) == 1\n    assert isinstance(batch[\"audio\"][0], AudioDecoder)\n    samples = batch[\"audio\"][0].get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (2, 110592)\n    assert batch[\"audio\"][0].metadata.path == audio_filename\n    column = dset[\"audio\"]\n    assert 
len(column) == 1\n    assert isinstance(column[0], AudioDecoder)\n    samples = column[0].get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (2, 110592)\n\n\n@require_torchcodec\ndef test_dataset_with_audio_feature_with_none():\n    data = {\"audio\": [None]}\n    features = Features({\"audio\": Audio()})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"audio\"}\n    assert item[\"audio\"] is None\n    batch = dset[:1]\n    assert len(batch) == 1\n    assert batch.keys() == {\"audio\"}\n    assert isinstance(batch[\"audio\"], list) and all(item is None for item in batch[\"audio\"])\n    column = dset[\"audio\"]\n    assert len(column) == 1\n    assert isinstance(column, Column) and all(item is None for item in column)\n\n    # nested tests\n\n    data = {\"audio\": [[None]]}\n    features = Features({\"audio\": List(Audio())})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"audio\"}\n    assert all(i is None for i in item[\"audio\"])\n\n    data = {\"nested\": [{\"audio\": None}]}\n    features = Features({\"nested\": {\"audio\": Audio()}})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"nested\"}\n    assert item[\"nested\"].keys() == {\"audio\"}\n    assert item[\"nested\"][\"audio\"] is None\n\n\n@require_torchcodec\ndef test_resampling_at_loading_dataset_with_audio_feature(shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    data = {\"audio\": [audio_path]}\n    features = Features({\"audio\": Audio(sampling_rate=16000)})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"audio\"}\n    assert isinstance(item[\"audio\"], AudioDecoder)\n    samples = item[\"audio\"].get_all_samples()\n    assert samples.sample_rate == 
16000\n    assert samples.data.shape == (2, 73401)\n    batch = dset[:1]\n    assert batch.keys() == {\"audio\"}\n    assert len(batch[\"audio\"]) == 1\n    assert isinstance(batch[\"audio\"][0], AudioDecoder)\n    samples = batch[\"audio\"][0].get_all_samples()\n    assert samples.sample_rate == 16000\n    assert samples.data.shape == (2, 73401)\n    column = dset[\"audio\"]\n    assert len(column) == 1\n    assert isinstance(column[0], AudioDecoder)\n    samples = column[0].get_all_samples()\n    assert samples.sample_rate == 16000\n    assert samples.data.shape == (2, 73401)\n\n\n@require_torchcodec\ndef test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.mp3\")\n    data = {\"audio\": [audio_path]}\n    features = Features({\"audio\": Audio(sampling_rate=16000)})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"audio\"}\n    assert isinstance(item[\"audio\"], AudioDecoder)\n    samples = item[\"audio\"].get_all_samples()\n    assert samples.sample_rate == 16000\n    assert samples.data.shape == (2, 40124)\n    batch = dset[:1]\n    assert batch.keys() == {\"audio\"}\n    assert len(batch[\"audio\"]) == 1\n    assert isinstance(batch[\"audio\"][0], AudioDecoder)\n    samples = batch[\"audio\"][0].get_all_samples()\n    assert samples.sample_rate == 16000\n    assert samples.data.shape == (2, 40124)\n    column = dset[\"audio\"]\n    assert len(column) == 1\n    assert isinstance(column[0], AudioDecoder)\n    samples = column[0].get_all_samples()\n    assert samples.sample_rate == 16000\n    assert samples.data.shape == (2, 40124)\n\n\n@require_torchcodec\ndef test_resampling_after_loading_dataset_with_audio_feature(shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    data = {\"audio\": 
[audio_path]}\n    features = Features({\"audio\": Audio()})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    samples = item[\"audio\"].get_all_samples()\n    assert samples.sample_rate == 44100\n    dset = dset.cast_column(\"audio\", Audio(sampling_rate=16000))\n    item = dset[0]\n    assert item.keys() == {\"audio\"}\n    assert isinstance(item[\"audio\"], AudioDecoder)\n    samples = item[\"audio\"].get_all_samples()\n    assert samples.sample_rate == 16000\n    assert samples.data.shape == (2, 73401)\n    batch = dset[:1]\n    assert batch.keys() == {\"audio\"}\n    assert len(batch[\"audio\"]) == 1\n    assert isinstance(batch[\"audio\"][0], AudioDecoder)\n    samples = batch[\"audio\"][0].get_all_samples()\n    assert samples.sample_rate == 16000\n    assert samples.data.shape == (2, 73401)\n    column = dset[\"audio\"]\n    assert len(column) == 1\n    assert isinstance(column[0], AudioDecoder)\n    samples = column[0].get_all_samples()\n    assert samples.sample_rate == 16000\n    assert samples.data.shape == (2, 73401)\n\n\n@require_torchcodec\ndef test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.mp3\")\n    data = {\"audio\": [audio_path]}\n    features = Features({\"audio\": Audio()})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    samples = item[\"audio\"].get_all_samples()\n    assert samples.sample_rate == 44100\n    dset = dset.cast_column(\"audio\", Audio(sampling_rate=16000))\n    item = dset[0]\n    assert item.keys() == {\"audio\"}\n    assert isinstance(item[\"audio\"], AudioDecoder)\n    samples = item[\"audio\"].get_all_samples()\n    assert samples.sample_rate == 16000\n    assert samples.data.shape == (2, 40124)\n    batch = dset[:1]\n    assert batch.keys() == {\"audio\"}\n    assert len(batch[\"audio\"]) == 1\n    assert 
isinstance(batch[\"audio\"][0], AudioDecoder)\n    samples = batch[\"audio\"][0].get_all_samples()\n    assert samples.sample_rate == 16000\n    assert samples.data.shape == (2, 40124)\n    column = dset[\"audio\"]\n    assert len(column) == 1\n    assert isinstance(column[0], AudioDecoder)\n    samples = column[0].get_all_samples()\n    assert samples.sample_rate == 16000\n    assert samples.data.shape == (2, 40124)\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\n    \"build_data\",\n    [\n        lambda audio_path: {\"audio\": [audio_path]},\n        lambda audio_path: {\"audio\": [open(audio_path, \"rb\").read()]},\n        lambda audio_path: {\"audio\": [{\"path\": audio_path}]},\n        lambda audio_path: {\"audio\": [{\"path\": audio_path, \"bytes\": None}]},\n        lambda audio_path: {\"audio\": [{\"path\": audio_path, \"bytes\": open(audio_path, \"rb\").read()}]},\n        lambda audio_path: {\"audio\": [{\"path\": None, \"bytes\": open(audio_path, \"rb\").read()}]},\n        lambda audio_path: {\"audio\": [{\"bytes\": open(audio_path, \"rb\").read()}]},\n    ],\n)\ndef test_dataset_cast_to_audio_features(shared_datadir, build_data):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    data = build_data(audio_path)\n    dset = Dataset.from_dict(data)\n    item = dset.cast(Features({\"audio\": Audio()}))[0]\n    assert item.keys() == {\"audio\"}\n    assert isinstance(item[\"audio\"], AudioDecoder)\n    item = dset.cast_column(\"audio\", Audio())[0]\n    assert item.keys() == {\"audio\"}\n    assert isinstance(item[\"audio\"], AudioDecoder)\n\n\n@require_torchcodec\ndef test_dataset_concatenate_audio_features(shared_datadir):\n    # we use a different data structure between 1 and 2 to make sure they are compatible with each other\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    data1 = {\"audio\": [audio_path]}\n    dset1 = Dataset.from_dict(data1, 
features=Features({\"audio\": Audio()}))\n    data2 = {\"audio\": [{\"bytes\": open(audio_path, \"rb\").read()}]}\n    dset2 = Dataset.from_dict(data2, features=Features({\"audio\": Audio()}))\n    concatenated_dataset = concatenate_datasets([dset1, dset2])\n    assert len(concatenated_dataset) == len(dset1) + len(dset2)\n    assert (\n        concatenated_dataset[0][\"audio\"].get_all_samples().data.shape == dset1[0][\"audio\"].get_all_samples().data.shape\n    )\n    assert (\n        concatenated_dataset[1][\"audio\"].get_all_samples().data.shape == dset2[0][\"audio\"].get_all_samples().data.shape\n    )\n\n\n@require_torchcodec\ndef test_dataset_concatenate_nested_audio_features(shared_datadir):\n    # we use a different data structure between 1 and 2 to make sure they are compatible with each other\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    features = Features({\"list_of_structs_of_audios\": [{\"audio\": Audio()}]})\n    data1 = {\"list_of_structs_of_audios\": [[{\"audio\": audio_path}]]}\n    dset1 = Dataset.from_dict(data1, features=features)\n    data2 = {\"list_of_structs_of_audios\": [[{\"audio\": {\"bytes\": open(audio_path, \"rb\").read()}}]]}\n    dset2 = Dataset.from_dict(data2, features=features)\n    concatenated_dataset = concatenate_datasets([dset1, dset2])\n    assert len(concatenated_dataset) == len(dset1) + len(dset2)\n    assert (\n        concatenated_dataset[0][\"list_of_structs_of_audios\"][0][\"audio\"].get_all_samples().data.shape\n        == dset1[0][\"list_of_structs_of_audios\"][0][\"audio\"].get_all_samples().data.shape\n    )\n    assert (\n        concatenated_dataset[1][\"list_of_structs_of_audios\"][0][\"audio\"].get_all_samples().data.shape\n        == dset2[0][\"list_of_structs_of_audios\"][0][\"audio\"].get_all_samples().data.shape\n    )\n\n\n@require_torchcodec\ndef test_dataset_with_audio_feature_map_is_not_decoded(shared_datadir):\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n  
  data = {\"audio\": [audio_path], \"text\": [\"Hello\"]}\n    features = Features({\"audio\": Audio(), \"text\": Value(\"string\")})\n    dset = Dataset.from_dict(data, features=features)\n\n    expected_audio = features.encode_batch(data)[\"audio\"][0]\n    for item in dset.cast_column(\"audio\", Audio(decode=False)):\n        assert item.keys() == {\"audio\", \"text\"}\n        assert item == {\"audio\": expected_audio, \"text\": \"Hello\"}\n\n    def process_text(example):\n        example[\"text\"] = example[\"text\"] + \" World!\"\n        return example\n\n    processed_dset = dset.map(process_text)\n    for item in processed_dset.cast_column(\"audio\", Audio(decode=False)):\n        assert item.keys() == {\"audio\", \"text\"}\n        assert item == {\"audio\": expected_audio, \"text\": \"Hello World!\"}\n\n\n@require_torchcodec\ndef test_dataset_with_audio_feature_map_is_decoded(shared_datadir):\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    data = {\"audio\": [audio_path], \"text\": [\"Hello\"]}\n    features = Features({\"audio\": Audio(), \"text\": Value(\"string\")})\n    dset = Dataset.from_dict(data, features=features)\n\n    def process_audio_sampling_rate_by_example(example):\n        sample_rate = example[\"audio\"].get_all_samples().sample_rate\n        example[\"double_sampling_rate\"] = 2 * sample_rate\n        return example\n\n    decoded_dset = dset.map(process_audio_sampling_rate_by_example)\n    for item in decoded_dset.cast_column(\"audio\", Audio(decode=False)):\n        assert item.keys() == {\"audio\", \"text\", \"double_sampling_rate\"}\n        assert item[\"double_sampling_rate\"] == 88200\n\n    def process_audio_sampling_rate_by_batch(batch):\n        double_sampling_rates = []\n        for audio in batch[\"audio\"]:\n            double_sampling_rates.append(2 * audio.get_all_samples().sample_rate)\n        batch[\"double_sampling_rate\"] = double_sampling_rates\n        return batch\n\n    decoded_dset = 
dset.map(process_audio_sampling_rate_by_batch, batched=True)\n    for item in decoded_dset.cast_column(\"audio\", Audio(decode=False)):\n        assert item.keys() == {\"audio\", \"text\", \"double_sampling_rate\"}\n        assert item[\"double_sampling_rate\"] == 88200\n\n\n@require_torchcodec\ndef test_formatted_dataset_with_audio_feature(shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    data = {\"audio\": [audio_path, audio_path]}\n    features = Features({\"audio\": Audio()})\n    dset = Dataset.from_dict(data, features=features)\n    with dset.formatted_as(\"numpy\"):\n        item = dset[0]\n        assert item.keys() == {\"audio\"}\n        assert isinstance(item[\"audio\"], AudioDecoder)\n        samples = item[\"audio\"].get_all_samples()\n        assert samples.sample_rate == 44100\n        assert samples.data.shape == (2, 202311)\n        batch = dset[:1]\n        assert batch.keys() == {\"audio\"}\n        assert len(batch[\"audio\"]) == 1\n        assert isinstance(batch[\"audio\"][0], AudioDecoder)\n        samples = batch[\"audio\"][0].get_all_samples()\n        assert samples.sample_rate == 44100\n        assert samples.data.shape == (2, 202311)\n        column = dset[\"audio\"]\n        assert len(column) == 2\n        assert isinstance(column[0], AudioDecoder)\n        samples = column[0].get_all_samples()\n        assert samples.sample_rate == 44100\n        assert samples.data.shape == (2, 202311)\n\n    with dset.formatted_as(\"pandas\"):\n        item = dset[0]\n        assert item.shape == (1, 1)\n        assert item.columns == [\"audio\"]\n        assert isinstance(item[\"audio\"][0], AudioDecoder)\n        samples = item[\"audio\"][0].get_all_samples()\n        assert samples.sample_rate == 44100\n        assert samples.data.shape == (2, 202311)\n        batch = dset[:1]\n        assert batch.shape == (1, 1)\n        assert batch.columns == 
[\"audio\"]\n        assert isinstance(batch[\"audio\"][0], AudioDecoder)\n        samples = batch[\"audio\"][0].get_all_samples()\n        assert samples.sample_rate == 44100\n        assert samples.data.shape == (2, 202311)\n        column = dset[\"audio\"]\n        assert len(column) == 2\n        assert isinstance(column[0], AudioDecoder)\n        samples = column[0].get_all_samples()\n        assert samples.sample_rate == 44100\n        assert samples.data.shape == (2, 202311)\n\n\n@pytest.fixture\ndef jsonl_audio_dataset_path(shared_datadir, tmp_path_factory):\n    import json\n\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    data = [{\"audio\": audio_path, \"text\": \"Hello world!\"}]\n    path = str(tmp_path_factory.mktemp(\"data\") / \"audio_dataset.jsonl\")\n    with open(path, \"w\") as f:\n        for item in data:\n            f.write(json.dumps(item) + \"\\n\")\n    return path\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_load_dataset_with_audio_feature(streaming, jsonl_audio_dataset_path, shared_datadir):\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    data_files = jsonl_audio_dataset_path\n    features = Features({\"audio\": Audio(), \"text\": Value(\"string\")})\n    dset = load_dataset(\"json\", split=\"train\", data_files=data_files, features=features, streaming=streaming)\n    item = dset[0] if not streaming else next(iter(dset))\n    assert item.keys() == {\"audio\", \"text\"}\n    assert isinstance(item[\"audio\"], AudioDecoder)\n    samples = item[\"audio\"].get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (2, 202311)\n    assert item[\"audio\"].metadata.path == audio_path\n\n\n@require_torchcodec\n@pytest.mark.integration\ndef test_dataset_with_audio_feature_loaded_from_cache():\n    # load first time\n    ds = 
load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\")\n    # load from cache\n    ds = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n    assert isinstance(ds, Dataset)\n\n\n@require_torchcodec\ndef test_dataset_with_audio_feature_undecoded(shared_datadir):\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    data = {\"audio\": [audio_path]}\n    features = Features({\"audio\": Audio(decode=False)})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"audio\"}\n    assert item[\"audio\"] == {\"path\": audio_path, \"bytes\": None}\n    batch = dset[:1]\n    assert batch.keys() == {\"audio\"}\n    assert len(batch[\"audio\"]) == 1\n    assert batch[\"audio\"][0] == {\"path\": audio_path, \"bytes\": None}\n    column = dset[\"audio\"]\n    assert len(column) == 1\n    assert column[0] == {\"path\": audio_path, \"bytes\": None}\n\n\n@require_torchcodec\ndef test_formatted_dataset_with_audio_feature_undecoded(shared_datadir):\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    data = {\"audio\": [audio_path]}\n    features = Features({\"audio\": Audio(decode=False)})\n    dset = Dataset.from_dict(data, features=features)\n    with dset.formatted_as(\"numpy\"):\n        item = dset[0]\n        assert item.keys() == {\"audio\"}\n        assert item[\"audio\"] == {\"path\": audio_path, \"bytes\": None}\n        batch = dset[:1]\n        assert batch.keys() == {\"audio\"}\n        assert len(batch[\"audio\"]) == 1\n        assert batch[\"audio\"][0] == {\"path\": audio_path, \"bytes\": None}\n        column = dset[\"audio\"]\n        assert len(column) == 1\n        assert column[0] == {\"path\": audio_path, \"bytes\": None}\n\n    with dset.formatted_as(\"pandas\"):\n        item = dset[0]\n        assert item.shape == (1, 1)\n        assert item.columns == [\"audio\"]\n        assert item[\"audio\"][0] == {\"path\": 
audio_path, \"bytes\": None}\n        batch = dset[:1]\n        assert batch.shape == (1, 1)\n        assert batch.columns == [\"audio\"]\n        assert batch[\"audio\"][0] == {\"path\": audio_path, \"bytes\": None}\n        column = dset[\"audio\"]\n        assert len(column) == 1\n        assert column[0] == {\"path\": audio_path, \"bytes\": None}\n\n\n@require_torchcodec\ndef test_dataset_with_audio_feature_map_undecoded(shared_datadir):\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    data = {\"audio\": [audio_path]}\n    features = Features({\"audio\": Audio(decode=False)})\n    dset = Dataset.from_dict(data, features=features)\n\n    def assert_audio_example_undecoded(example):\n        assert example[\"audio\"] == {\"path\": audio_path, \"bytes\": None}\n\n    dset.map(assert_audio_example_undecoded)\n\n    def assert_audio_batch_undecoded(batch):\n        for audio in batch[\"audio\"]:\n            assert audio == {\"path\": audio_path, \"bytes\": None}\n\n    dset.map(assert_audio_batch_undecoded, batched=True)\n\n\ndef test_audio_embed_storage(shared_datadir):\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")\n    example = {\"bytes\": None, \"path\": audio_path}\n    storage = pa.array([example], type=pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()}))\n    embedded_storage = Audio().embed_storage(storage)\n    embedded_example = embedded_storage.to_pylist()[0]\n    assert embedded_example == {\"bytes\": open(audio_path, \"rb\").read(), \"path\": \"test_audio_44100.wav\"}\n\n\n@require_torchcodec\ndef test_audio_decode_example_opus_convert_to_stereo(shared_datadir):\n    # GH 7837\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_48000.opus\")  # mono file\n    audio = Audio(num_channels=2)\n    decoded_example = audio.decode_example(audio.encode_example(audio_path))\n    assert isinstance(decoded_example, AudioDecoder)\n    samples = 
decoded_example.get_all_samples()\n    assert samples.sample_rate == 48000\n    assert samples.data.shape == (2, 48000)\n\n\n@require_torchcodec\ndef test_audio_decode_example_opus_convert_to_mono(shared_datadir):\n    # GH 7837\n    from torchcodec.decoders import AudioDecoder\n\n    audio_path = str(shared_datadir / \"test_audio_44100.wav\")  # stereo file\n    audio = Audio(num_channels=1)\n    decoded_example = audio.decode_example(audio.encode_example(audio_path))\n    assert isinstance(decoded_example, AudioDecoder)\n    samples = decoded_example.get_all_samples()\n    assert samples.sample_rate == 44100\n    assert samples.data.shape == (1, 202311)\n"
  },
  {
    "path": "tests/features/test_features.py",
    "content": "import datetime\nfrom unittest import TestCase\nfrom unittest.mock import MagicMock, patch\n\nimport numpy as np\nimport pandas as pd\nimport pyarrow as pa\nimport pytest\n\nfrom datasets import Array2D\nfrom datasets.arrow_dataset import Column, Dataset\nfrom datasets.features import Audio, ClassLabel, Features, Image, Json, LargeList, List, Sequence, Value\nfrom datasets.features.features import (\n    _align_features,\n    _arrow_to_datasets_dtype,\n    _cast_to_python_objects,\n    _check_if_features_can_be_aligned,\n    _check_non_null_non_empty_recursive,\n    _is_null_feature,\n    _visit,\n    cast_to_python_objects,\n    decode_nested_example,\n    encode_nested_example,\n    generate_from_arrow_type,\n    generate_from_dict,\n    get_nested_type,\n    require_decoding,\n    require_storage_cast,\n    require_storage_embed,\n    string_to_arrow,\n)\nfrom datasets.features.translation import Translation, TranslationVariableLanguages\nfrom datasets.info import DatasetInfo\nfrom datasets.utils.py_utils import asdict\n\nfrom ..utils import require_jax, require_numpy1_on_windows, require_tf, require_torch\n\n\ndef list_with(item):\n    return [item]\n\n\nclass FeaturesTest(TestCase):\n    def test_from_arrow_schema_simple(self):\n        data = {\"a\": [{\"b\": {\"c\": \"text\"}}] * 10, \"foo\": [1] * 10}\n        original_features = Features({\"a\": {\"b\": {\"c\": Value(\"string\")}}, \"foo\": Value(\"int64\")})\n        dset = Dataset.from_dict(data, features=original_features)\n        new_features = dset.features\n        new_dset = Dataset.from_dict(data, features=new_features)\n        self.assertEqual(original_features.type, new_features.type)\n        self.assertDictEqual(dset[0], new_dset[0])\n        self.assertDictEqual(dset[:], new_dset[:])\n\n    def test_from_arrow_schema_with_sequence(self):\n        data = {\"a\": [{\"b\": {\"c\": [\"text\"]}}] * 10, \"foo\": [1] * 10}\n        original_features = Features({\"a\": {\"b\": 
{\"c\": List(Value(\"string\"))}}, \"foo\": Value(\"int64\")})\n        dset = Dataset.from_dict(data, features=original_features)\n        new_features = dset.features\n        new_dset = Dataset.from_dict(data, features=new_features)\n        self.assertEqual(original_features.type, new_features.type)\n        self.assertDictEqual(dset[0], new_dset[0])\n        self.assertDictEqual(dset[:], new_dset[:])\n\n    def test_string_to_arrow_bijection_for_primitive_types(self):\n        supported_pyarrow_datatypes = [\n            pa.time32(\"s\"),\n            pa.time64(\"us\"),\n            pa.timestamp(\"s\"),\n            pa.timestamp(\"ns\", tz=\"America/New_York\"),\n            pa.date32(),\n            pa.date64(),\n            pa.duration(\"s\"),\n            pa.decimal128(10, 2),\n            pa.decimal256(40, -3),\n            pa.string(),\n            pa.int32(),\n            pa.float64(),\n            pa.array([datetime.time(1, 1, 1)]).type,  # arrow type: DataType(time64[us])\n        ]\n        for dt in supported_pyarrow_datatypes:\n            self.assertEqual(dt, string_to_arrow(_arrow_to_datasets_dtype(dt)))\n\n        unsupported_pyarrow_datatypes = [pa.list_(pa.float64())]\n        for dt in unsupported_pyarrow_datatypes:\n            with self.assertRaises(ValueError):\n                string_to_arrow(_arrow_to_datasets_dtype(dt))\n\n        supported_datasets_dtypes = [\n            \"time32[s]\",\n            \"timestamp[ns]\",\n            \"timestamp[ns, tz=+07:30]\",\n            \"duration[us]\",\n            \"decimal128(30, -4)\",\n            \"int32\",\n            \"float64\",\n        ]\n        for sdt in supported_datasets_dtypes:\n            self.assertEqual(sdt, _arrow_to_datasets_dtype(string_to_arrow(sdt)))\n\n        unsupported_datasets_dtypes = [\n            \"time32[ns]\",\n            \"timestamp[blob]\",\n            \"timestamp[[ns]]\",\n            \"timestamp[ns, tz=[ns]]\",\n            \"duration[[us]]\",\n            
\"decimal20(30, -4)\",\n            \"int\",\n        ]\n        for sdt in unsupported_datasets_dtypes:\n            with self.assertRaises(ValueError):\n                string_to_arrow(sdt)\n\n    def test_categorical_one_way(self):\n        # Categorical types (aka dictionary types) need special handling as there isn't a bijection\n        categorical_type = pa.dictionary(pa.int32(), pa.string())\n\n        self.assertEqual(\"string\", _arrow_to_datasets_dtype(categorical_type))\n\n    def test_feature_named_type(self):\n        \"\"\"reference: issue #1110\"\"\"\n        features = Features({\"_type\": Value(\"string\")})\n        ds_info = DatasetInfo(features=features)\n        reloaded_features = Features.from_dict(asdict(ds_info)[\"features\"])\n        assert features == reloaded_features\n\n    def test_feature_named_self_as_kwarg(self):\n        \"\"\"reference: issue #5641\"\"\"\n        features = Features(self=Value(\"string\"))\n        ds_info = DatasetInfo(features=features)\n        reloaded_features = Features.from_dict(asdict(ds_info)[\"features\"])\n        assert features == reloaded_features\n\n    def test_class_label_feature_with_no_labels(self):\n        \"\"\"reference: issue #4681\"\"\"\n        features = Features({\"label\": ClassLabel(names=[])})\n        ds_info = DatasetInfo(features=features)\n        reloaded_features = Features.from_dict(asdict(ds_info)[\"features\"])\n        assert features == reloaded_features\n\n    def test_reorder_fields_as(self):\n        features = Features(\n            {\n                \"id\": Value(\"string\"),\n                \"document\": {\n                    \"title\": Value(\"string\"),\n                    \"url\": Value(\"string\"),\n                    \"html\": Value(\"string\"),\n                    \"tokens\": {\"token\": List(Value(\"string\")), \"is_html\": List(Value(\"bool\"))},\n                },\n                \"question\": {\n                    \"text\": Value(\"string\"),\n   
                 \"tokens\": List(Value(\"string\")),\n                },\n                \"annotations\": {\n                    \"id\": List(Value(\"string\")),\n                    \"long_answer\": List(\n                        {\n                            \"start_token\": Value(\"int64\"),\n                            \"end_token\": Value(\"int64\"),\n                            \"start_byte\": Value(\"int64\"),\n                            \"end_byte\": Value(\"int64\"),\n                        }\n                    ),\n                    \"short_answers\": List(\n                        {\n                            \"start_token\": List(Value(\"int64\")),\n                            \"end_token\": List(Value(\"int64\")),\n                            \"start_byte\": List(Value(\"int64\")),\n                            \"end_byte\": List(Value(\"int64\")),\n                            \"text\": List(Value(\"string\")),\n                        }\n                    ),\n                    \"yes_no_answer\": List(ClassLabel(names=[\"NO\", \"YES\"])),\n                },\n            }\n        )\n\n        other = Features(  # same but with a shuffled fields order\n            {\n                \"id\": Value(\"string\"),\n                \"document\": {\n                    \"tokens\": {\"token\": List(Value(\"string\")), \"is_html\": List(Value(\"bool\"))},\n                    \"title\": Value(\"string\"),\n                    \"url\": Value(\"string\"),\n                    \"html\": Value(\"string\"),\n                },\n                \"question\": {\n                    \"text\": Value(\"string\"),\n                    \"tokens\": List(Value(\"string\")),\n                },\n                \"annotations\": {\n                    \"yes_no_answer\": List(ClassLabel(names=[\"NO\", \"YES\"])),\n                    \"id\": List(Value(\"string\")),\n                    \"long_answer\": List(\n                        {\n                            
\"end_byte\": Value(\"int64\"),\n                            \"start_token\": Value(\"int64\"),\n                            \"end_token\": Value(\"int64\"),\n                            \"start_byte\": Value(\"int64\"),\n                        }\n                    ),\n                    \"short_answers\": List(\n                        {\n                            \"text\": List(Value(\"string\")),\n                            \"start_token\": List(Value(\"int64\")),\n                            \"end_token\": List(Value(\"int64\")),\n                            \"start_byte\": List(Value(\"int64\")),\n                            \"end_byte\": List(Value(\"int64\")),\n                        }\n                    ),\n                },\n            }\n        )\n\n        expected = Features(\n            {\n                \"id\": Value(\"string\"),\n                \"document\": {\n                    \"tokens\": {\"token\": List(Value(\"string\")), \"is_html\": List(Value(\"bool\"))},\n                    \"title\": Value(\"string\"),\n                    \"url\": Value(\"string\"),\n                    \"html\": Value(\"string\"),\n                },\n                \"question\": {\n                    \"text\": Value(\"string\"),\n                    \"tokens\": List(Value(\"string\")),\n                },\n                \"annotations\": {\n                    \"yes_no_answer\": List(ClassLabel(names=[\"NO\", \"YES\"])),\n                    \"id\": List(Value(\"string\")),\n                    \"long_answer\": List(\n                        {\n                            \"end_byte\": Value(\"int64\"),\n                            \"start_token\": Value(\"int64\"),\n                            \"end_token\": Value(\"int64\"),\n                            \"start_byte\": Value(\"int64\"),\n                        }\n                    ),\n                    \"short_answers\": List(\n                        {\n                            \"text\": 
List(Value(\"string\")),\n                            \"start_token\": List(Value(\"int64\")),\n                            \"end_token\": List(Value(\"int64\")),\n                            \"start_byte\": List(Value(\"int64\")),\n                            \"end_byte\": List(Value(\"int64\")),\n                        }\n                    ),\n                },\n            }\n        )\n\n        reordered_features = features.reorder_fields_as(other)\n        self.assertDictEqual(reordered_features, expected)\n        self.assertEqual(reordered_features.type, other.type)\n        self.assertEqual(reordered_features.type, expected.type)\n        self.assertNotEqual(reordered_features.type, features.type)\n\n    def test_flatten(self):\n        features = Features({\"foo\": {\"bar1\": Value(\"int32\"), \"bar2\": {\"foobar\": Value(\"string\")}}})\n        _features = features.copy()\n        flattened_features = features.flatten()\n        assert flattened_features == {\"foo.bar1\": Value(\"int32\"), \"foo.bar2.foobar\": Value(\"string\")}\n        assert features == _features, \"calling flatten shouldn't alter the current features\"\n\n    def test_flatten_with_sequence(self):\n        features = Features({\"foo\": {\"bar\": List({\"my_value\": Value(\"int32\")})}})\n        _features = features.copy()\n        flattened_features = features.flatten()\n        assert flattened_features == {\"foo.bar\": List({\"my_value\": Value(\"int32\")})}\n        assert features == _features, \"calling flatten shouldn't alter the current features\"\n\n    def test_features_dicts_are_synced(self):\n        def assert_features_dicts_are_synced(features: Features):\n            assert (\n                hasattr(features, \"_column_requires_decoding\")\n                and features.keys() == features._column_requires_decoding.keys()\n            )\n\n        features = Features({\"foo\": {\"bar\": List({\"my_value\": Value(\"int32\")})}})\n        
assert_features_dicts_are_synced(features)\n        features[\"barfoo\"] = Image()\n        assert_features_dicts_are_synced(features)\n        del features[\"barfoo\"]\n        assert_features_dicts_are_synced(features)\n        features.update({\"foobar\": Value(\"string\")})\n        assert_features_dicts_are_synced(features)\n        features.pop(\"foobar\")\n        assert_features_dicts_are_synced(features)\n        features.popitem()\n        assert_features_dicts_are_synced(features)\n        features.setdefault(\"xyz\", Value(\"bool\"))\n        assert_features_dicts_are_synced(features)\n        features.clear()\n        assert_features_dicts_are_synced(features)\n\n\ndef test_classlabel_init(tmp_path_factory):\n    names = [\"negative\", \"positive\"]\n    names_file = str(tmp_path_factory.mktemp(\"features\") / \"labels.txt\")\n    with open(names_file, \"w\", encoding=\"utf-8\") as f:\n        f.write(\"\\n\".join(names))\n    classlabel = ClassLabel(names=names)\n    assert classlabel.names == names and classlabel.num_classes == len(names)\n    classlabel = ClassLabel(names_file=names_file)\n    assert classlabel.names == names and classlabel.num_classes == len(names)\n    classlabel = ClassLabel(num_classes=len(names), names=names)\n    assert classlabel.names == names and classlabel.num_classes == len(names)\n    classlabel = ClassLabel(num_classes=len(names))\n    assert classlabel.names == [str(i) for i in range(len(names))] and classlabel.num_classes == len(names)\n    with pytest.raises(ValueError):\n        classlabel = ClassLabel(num_classes=len(names) + 1, names=names)\n    with pytest.raises(ValueError):\n        classlabel = ClassLabel(names=names, names_file=names_file)\n    with pytest.raises(ValueError):\n        classlabel = ClassLabel()\n    with pytest.raises(TypeError):\n        classlabel = ClassLabel(names=np.array(names))\n\n\ndef test_classlabel_str2int():\n    names = [\"negative\", \"positive\"]\n    classlabel = 
ClassLabel(names=names)\n    for label in names:\n        assert classlabel.str2int(label) == names.index(label)\n    with pytest.raises(ValueError):\n        classlabel.str2int(\"__bad_label_name__\")\n    with pytest.raises(ValueError):\n        classlabel.str2int(1)\n    with pytest.raises(ValueError):\n        classlabel.str2int(None)\n\n\ndef test_classlabel_int2str():\n    names = [\"negative\", \"positive\"]\n    classlabel = ClassLabel(names=names)\n    for i in range(len(names)):\n        assert classlabel.int2str(i) == names[i]\n    with pytest.raises(ValueError):\n        classlabel.int2str(len(names))\n    with pytest.raises(ValueError):\n        classlabel.int2str(-1)\n    with pytest.raises(ValueError):\n        classlabel.int2str(None)\n\n\ndef test_classlabel_cast_storage():\n    names = [\"negative\", \"positive\"]\n    classlabel = ClassLabel(names=names)\n    # from integers\n    arr = pa.array([0, 1, -1, -100], type=pa.int64())\n    result = classlabel.cast_storage(arr)\n    assert result.type == pa.int64()\n    assert result.to_pylist() == [0, 1, -1, -100]\n    arr = pa.array([0, 1, -1, -100], type=pa.int32())\n    result = classlabel.cast_storage(arr)\n    assert result.type == pa.int64()\n    assert result.to_pylist() == [0, 1, -1, -100]\n    arr = pa.array([3])\n    with pytest.raises(ValueError):\n        classlabel.cast_storage(arr)\n    # from strings\n    arr = pa.array([\"negative\", \"positive\"])\n    result = classlabel.cast_storage(arr)\n    assert result.type == pa.int64()\n    assert result.to_pylist() == [0, 1]\n    arr = pa.array([\"__label_that_doesnt_exist__\"])\n    with pytest.raises(ValueError):\n        classlabel.cast_storage(arr)\n    # from nulls\n    arr = pa.array([None])\n    result = classlabel.cast_storage(arr)\n    assert result.type == pa.int64()\n    assert result.to_pylist() == [None]\n    # from empty\n    arr = pa.array([], pa.int64())\n    result = classlabel.cast_storage(arr)\n    assert result.type == 
pa.int64()\n    assert result.to_pylist() == []\n    arr = pa.array([], pa.string())\n    result = classlabel.cast_storage(arr)\n    assert result.type == pa.int64()\n    assert result.to_pylist() == []\n\n\n@pytest.mark.parametrize(\"class_label_arg\", [\"names\", \"names_file\"])\ndef test_class_label_to_and_from_dict(class_label_arg, tmp_path_factory):\n    names = [\"negative\", \"positive\"]\n    names_file = str(tmp_path_factory.mktemp(\"features\") / \"labels.txt\")\n    with open(names_file, \"w\", encoding=\"utf-8\") as f:\n        f.write(\"\\n\".join(names))\n    if class_label_arg == \"names\":\n        class_label = ClassLabel(names=names)\n    elif class_label_arg == \"names_file\":\n        class_label = ClassLabel(names_file=names_file)\n    generated_class_label = generate_from_dict(asdict(class_label))\n    assert generated_class_label == class_label\n\n\n@pytest.mark.parametrize(\n    \"schema\",\n    [LargeList(Audio()), List(Audio())],\n)\ndef test_decode_nested_example_with_list_types(schema, monkeypatch):\n    mock_decode_example = MagicMock()\n    monkeypatch.setattr(Audio, \"decode_example\", mock_decode_example)\n    audio_example = {\"path\": \"dummy_audio_path\"}\n    _ = decode_nested_example(schema, [audio_example])\n    assert mock_decode_example.called\n    assert mock_decode_example.call_args.args[0] == audio_example\n\n\n@pytest.mark.parametrize(\n    \"schema\",\n    [List(ClassLabel(names=[\"a\", \"b\"])), LargeList(ClassLabel(names=[\"a\", \"b\"]))],\n)\ndef test_encode_nested_example_with_list_types(schema):\n    result = encode_nested_example(schema, [\"b\"])\n    assert result == [1]\n\n\n@pytest.mark.parametrize(\"inner_type\", [Value(\"int32\"), {\"subcolumn\": Value(\"int32\")}])\ndef test_encode_nested_example_sequence_with_none(inner_type):\n    schema = List(inner_type)\n    obj = None\n    result = encode_nested_example(schema, obj)\n    assert result is None\n\n\n@pytest.mark.parametrize(\n    \"features_dict, 
example, expected_encoded_example\",\n    [\n        ({\"col_1\": ClassLabel(names=[\"a\", \"b\"])}, {\"col_1\": \"b\"}, {\"col_1\": 1}),\n        ({\"col_1\": List(ClassLabel(names=[\"a\", \"b\"]))}, {\"col_1\": [\"b\"]}, {\"col_1\": [1]}),\n        ({\"col_1\": LargeList(ClassLabel(names=[\"a\", \"b\"]))}, {\"col_1\": [\"b\"]}, {\"col_1\": [1]}),\n        ({\"col_1\": List(ClassLabel(names=[\"a\", \"b\"]))}, {\"col_1\": [\"b\"]}, {\"col_1\": [1]}),\n    ],\n)\ndef test_encode_example(features_dict, example, expected_encoded_example):\n    features = Features(features_dict)\n    encoded_example = features.encode_example(example)\n    assert encoded_example == expected_encoded_example\n\n\ndef test_encode_batch_with_example_with_empty_first_elem():\n    features = Features(\n        {\n            \"x\": List(List(ClassLabel(names=[\"a\", \"b\"]))),\n        }\n    )\n    encoded_batch = features.encode_batch(\n        {\n            \"x\": [\n                [[\"a\"], [\"b\"]],\n                [[], [\"b\"]],\n            ]\n        }\n    )\n    assert encoded_batch == {\"x\": [[[0], [1]], [[], [1]]]}\n\n\ndef test_encode_column_dict_with_none():\n    features = Features(\n        {\n            \"x\": {\"a\": ClassLabel(names=[\"a\", \"b\"]), \"b\": Value(\"int32\")},\n        }\n    )\n    encoded_column = features.encode_column([{\"a\": \"a\", \"b\": 1}, None], \"x\")\n    assert encoded_column == [{\"a\": 0, \"b\": 1}, None]\n\n\n@pytest.mark.parametrize(\n    \"feature\",\n    [\n        Value(\"int32\"),\n        ClassLabel(num_classes=2),\n        Translation(languages=[\"en\", \"fr\"]),\n        TranslationVariableLanguages(languages=[\"en\", \"fr\"]),\n    ],\n)\ndef test_dataset_feature_with_none(feature):\n    data = {\"col\": [None]}\n    features = Features({\"col\": feature})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"col\"}\n    assert item[\"col\"] is None\n    batch = dset[:1]\n    
assert len(batch) == 1\n    assert batch.keys() == {\"col\"}\n    assert isinstance(batch[\"col\"], list) and all(item is None for item in batch[\"col\"])\n    column = dset[\"col\"]\n    assert len(column) == 1\n    assert isinstance(column, Column) and all(item is None for item in column)\n\n    # nested tests\n\n    data = {\"col\": [[None]]}\n    features = Features({\"col\": List(feature)})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"col\"}\n    assert all(i is None for i in item[\"col\"])\n\n    data = {\"nested\": [{\"col\": None}]}\n    features = Features({\"nested\": {\"col\": feature}})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"nested\"}\n    assert item[\"nested\"].keys() == {\"col\"}\n    assert item[\"nested\"][\"col\"] is None\n\n\ndef iternumpy(key1, value1, value2):\n    if value1.dtype != value2.dtype:  # check only for dtype\n        raise AssertionError(\n            f\"dtype of '{key1}' key for casted object: {value1.dtype} and expected object: {value2.dtype} not matching\"\n        )\n\n\ndef dict_diff(d1: dict, d2: dict):  # check if 2 dictionaries are equal\n    np.testing.assert_equal(d1, d2)  # sanity check if dict values are equal or not\n\n    for (k1, v1), (k2, v2) in zip(d1.items(), d2.items()):  # check if their values have same dtype or not\n        if isinstance(v1, dict):  # nested dictionary case\n            dict_diff(v1, v2)\n        elif isinstance(v1, np.ndarray):  # checks if dtype and value of np.ndarray is equal\n            iternumpy(k1, v1, v2)\n        elif isinstance(v1, list):\n            for element1, element2 in zip(v1, v2):  # iterates over all elements of list\n                if isinstance(element1, dict):\n                    dict_diff(element1, element2)\n                elif isinstance(element1, np.ndarray):\n                    iternumpy(k1, element1, element2)\n\n\nclass 
CastToPythonObjectsTest(TestCase):\n    def test_cast_to_python_objects_list(self):\n        obj = {\"col_1\": [{\"vec\": [1, 2, 3], \"txt\": \"foo\"}] * 3, \"col_2\": [[1, 2], [3, 4], [5, 6]]}\n        expected_obj = {\"col_1\": [{\"vec\": [1, 2, 3], \"txt\": \"foo\"}] * 3, \"col_2\": [[1, 2], [3, 4], [5, 6]]}\n        casted_obj = cast_to_python_objects(obj)\n        self.assertDictEqual(casted_obj, expected_obj)\n\n    def test_cast_to_python_objects_tuple(self):\n        obj = {\"col_1\": [{\"vec\": (1, 2, 3), \"txt\": \"foo\"}] * 3, \"col_2\": [(1, 2), (3, 4), (5, 6)]}\n        expected_obj = {\"col_1\": [{\"vec\": (1, 2, 3), \"txt\": \"foo\"}] * 3, \"col_2\": [(1, 2), (3, 4), (5, 6)]}\n        casted_obj = cast_to_python_objects(obj)\n        self.assertDictEqual(casted_obj, expected_obj)\n\n    def test_cast_to_python_or_numpy(self):\n        obj = {\"col_1\": [{\"vec\": np.arange(1, 4), \"txt\": \"foo\"}] * 3, \"col_2\": np.arange(1, 7).reshape(3, 2)}\n        expected_obj = {\n            \"col_1\": [{\"vec\": np.array([1, 2, 3]), \"txt\": \"foo\"}] * 3,\n            \"col_2\": np.array([[1, 2], [3, 4], [5, 6]]),\n        }\n        casted_obj = cast_to_python_objects(obj)\n        dict_diff(casted_obj, expected_obj)\n\n    def test_cast_to_python_objects_series(self):\n        obj = {\n            \"col_1\": pd.Series([{\"vec\": [1, 2, 3], \"txt\": \"foo\"}] * 3),\n            \"col_2\": pd.Series([[1, 2], [3, 4], [5, 6]]),\n        }\n        expected_obj = {\"col_1\": [{\"vec\": [1, 2, 3], \"txt\": \"foo\"}] * 3, \"col_2\": [[1, 2], [3, 4], [5, 6]]}\n        casted_obj = cast_to_python_objects(obj)\n        self.assertDictEqual(casted_obj, expected_obj)\n\n    def test_cast_to_python_objects_dataframe(self):\n        obj = pd.DataFrame({\"col_1\": [{\"vec\": [1, 2, 3], \"txt\": \"foo\"}] * 3, \"col_2\": [[1, 2], [3, 4], [5, 6]]})\n        expected_obj = {\"col_1\": [{\"vec\": [1, 2, 3], \"txt\": \"foo\"}] * 3, \"col_2\": [[1, 2], [3, 4], [5, 6]]}\n      
  casted_obj = cast_to_python_objects(obj)\n        self.assertDictEqual(casted_obj, expected_obj)\n\n    def test_cast_to_python_objects_pandas_timestamp(self):\n        obj = pd.Timestamp(2020, 1, 1)\n        expected_obj = obj.to_pydatetime()\n        casted_obj = cast_to_python_objects(obj)\n        self.assertEqual(casted_obj, expected_obj)\n        casted_obj = cast_to_python_objects(pd.Series([obj]))\n        self.assertListEqual(casted_obj, [expected_obj])\n        casted_obj = cast_to_python_objects(pd.DataFrame({\"a\": [obj]}))\n        self.assertDictEqual(casted_obj, {\"a\": [expected_obj]})\n\n    def test_cast_to_python_objects_pandas_timedelta(self):\n        obj = pd.Timedelta(seconds=1)\n        expected_obj = obj.to_pytimedelta()\n        casted_obj = cast_to_python_objects(obj)\n        self.assertEqual(casted_obj, expected_obj)\n        casted_obj = cast_to_python_objects(pd.Series([obj]))\n        self.assertListEqual(casted_obj, [expected_obj])\n        casted_obj = cast_to_python_objects(pd.DataFrame({\"a\": [obj]}))\n        self.assertDictEqual(casted_obj, {\"a\": [expected_obj]})\n\n    @require_numpy1_on_windows\n    @require_torch\n    def test_cast_to_python_objects_torch(self):\n        import torch\n\n        obj = {\n            \"col_1\": [{\"vec\": torch.tensor(np.arange(1, 4)), \"txt\": \"foo\"}] * 3,\n            \"col_2\": torch.tensor(np.arange(1, 7).reshape(3, 2)),\n        }\n        expected_obj = {\n            \"col_1\": [{\"vec\": np.array([1, 2, 3]), \"txt\": \"foo\"}] * 3,\n            \"col_2\": np.array([[1, 2], [3, 4], [5, 6]]),\n        }\n        casted_obj = cast_to_python_objects(obj)\n        dict_diff(casted_obj, expected_obj)\n\n    @require_tf\n    def test_cast_to_python_objects_tf(self):\n        import tensorflow as tf\n\n        obj = {\n            \"col_1\": [{\"vec\": tf.constant(np.arange(1, 4)), \"txt\": \"foo\"}] * 3,\n            \"col_2\": tf.constant(np.arange(1, 7).reshape(3, 2)),\n        }\n   
     expected_obj = {\n            \"col_1\": [{\"vec\": np.array([1, 2, 3]), \"txt\": \"foo\"}] * 3,\n            \"col_2\": np.array([[1, 2], [3, 4], [5, 6]]),\n        }\n        casted_obj = cast_to_python_objects(obj)\n        dict_diff(casted_obj, expected_obj)\n\n    @require_jax\n    def test_cast_to_python_objects_jax(self):\n        import jax.numpy as jnp\n\n        obj = {\n            \"col_1\": [{\"vec\": jnp.array(np.arange(1, 4)), \"txt\": \"foo\"}] * 3,\n            \"col_2\": jnp.array(np.arange(1, 7).reshape(3, 2)),\n        }\n        assert obj[\"col_2\"].dtype == jnp.int32\n        expected_obj = {\n            \"col_1\": [{\"vec\": np.array([1, 2, 3], dtype=np.int32), \"txt\": \"foo\"}] * 3,\n            \"col_2\": np.array([[1, 2], [3, 4], [5, 6]], dtype=np.int32),\n        }\n        casted_obj = cast_to_python_objects(obj)\n        dict_diff(casted_obj, expected_obj)\n\n    @patch(\"datasets.features.features._cast_to_python_objects\", side_effect=_cast_to_python_objects)\n    def test_dont_iterate_over_each_element_in_a_list(self, mocked_cast):\n        obj = {\"col_1\": [[1, 2], [3, 4], [5, 6]]}\n        cast_to_python_objects(obj)\n        self.assertEqual(mocked_cast.call_count, 4)  # 4 = depth of obj\n\n\nSIMPLE_FEATURES = [\n    Features(),\n    Features({\"a\": Value(\"int32\")}),\n    Features({\"a\": Value(\"int32\", id=\"my feature\")}),\n    Features({\"a\": Value(\"int32\"), \"b\": Value(\"float64\"), \"c\": Value(\"string\")}),\n]\n\nCUSTOM_FEATURES = [\n    Features({\"label\": ClassLabel(names=[\"negative\", \"positive\"])}),\n    Features({\"array\": Array2D(dtype=\"float32\", shape=(4, 4))}),\n    Features({\"image\": Image()}),\n    Features({\"audio\": Audio()}),\n    Features({\"image\": Image(decode=False)}),\n    Features({\"audio\": Audio(decode=False)}),\n    Features({\"translation\": Translation([\"en\", \"fr\"])}),\n    Features({\"translation\": TranslationVariableLanguages([\"en\", \"fr\"])}),\n    
Features({\"json\": Json()}),\n    Features({\"json\": Json(decode=False)}),\n]\n\nNESTED_FEATURES = [\n    Features({\"foo\": {}}),\n    Features({\"foo\": {\"bar\": Value(\"int32\")}}),\n    Features({\"foo\": {\"bar1\": Value(\"int32\"), \"bar2\": Value(\"float64\")}}),\n    Features({\"foo\": List(Value(\"int32\"))}),\n    Features({\"foo\": {\"bar\": List(Value(\"int32\"))}}),\n    Features({\"foo\": List({\"bar\": Value(\"int32\")})}),\n    Features({\"foo\": LargeList(Value(\"int32\"))}),\n    Features({\"foo\": LargeList({\"bar\": Value(\"int32\")})}),\n]\n\nNESTED_CUSTOM_FEATURES = [\n    Features({\"foo\": {\"bar\": ClassLabel(names=[\"negative\", \"positive\"])}}),\n    Features({\"foo\": List(ClassLabel(names=[\"negative\", \"positive\"]))}),\n    Features({\"foo\": List({\"bar\": ClassLabel(names=[\"negative\", \"positive\"])})}),\n    Features({\"foo\": LargeList(ClassLabel(names=[\"negative\", \"positive\"]))}),\n    Features({\"foo\": LargeList({\"bar\": ClassLabel(names=[\"negative\", \"positive\"])})}),\n]\n\n\n@pytest.mark.parametrize(\"features\", SIMPLE_FEATURES + CUSTOM_FEATURES + NESTED_FEATURES + NESTED_CUSTOM_FEATURES)\ndef test_features_to_dict_and_from_dict_round_trip(features: Features):\n    features_dict = features.to_dict()\n    assert isinstance(features_dict, dict)\n    reloaded = Features.from_dict(features_dict)\n    assert features == reloaded\n\n\n@pytest.mark.parametrize(\"features\", SIMPLE_FEATURES + CUSTOM_FEATURES + NESTED_FEATURES + NESTED_CUSTOM_FEATURES)\ndef test_features_to_yaml_list(features: Features):\n    features_yaml_list = features._to_yaml_list()\n    assert isinstance(features_yaml_list, list)\n    reloaded = Features._from_yaml_list(features_yaml_list)\n    assert features == reloaded\n\n\n@pytest.mark.parametrize(\n    \"features_dict, expected_features_dict\",\n    [\n        ({\"col\": [{\"sub_col\": Value(\"int32\")}]}, {\"col\": [{\"sub_col\": Value(\"int32\")}]}),\n        ({\"col\": 
LargeList({\"sub_col\": Value(\"int32\")})}, {\"col\": LargeList({\"sub_col\": Value(\"int32\")})}),\n        ({\"col\": {\"sub_col\": List(Value(\"int32\"))}}, {\"col.sub_col\": List(Value(\"int32\"))}),\n    ],\n)\ndef test_features_flatten_with_list_types(features_dict, expected_features_dict):\n    features = Features(features_dict)\n    flattened_features = features.flatten()\n    assert flattened_features == Features(expected_features_dict)\n\n\n@pytest.mark.parametrize(\n    \"deserialized_features_dict, expected_features_dict\",\n    [\n        (\n            {\"col\": {\"feature\": {\"dtype\": \"int32\", \"_type\": \"Value\"}, \"_type\": \"List\"}},\n            {\"col\": List(Value(\"int32\"))},\n        ),\n        (\n            {\"col\": {\"feature\": {\"dtype\": \"int32\", \"_type\": \"Value\"}, \"_type\": \"LargeList\"}},\n            {\"col\": LargeList(Value(\"int32\"))},\n        ),\n        (\n            {\"col\": {\"feature\": {\"sub_col\": {\"dtype\": \"int32\", \"_type\": \"Value\"}}, \"_type\": \"List\"}},\n            {\"col\": List({\"sub_col\": Value(\"int32\")})},\n        ),\n        (\n            {\"col\": {\"feature\": {\"sub_col\": {\"dtype\": \"int32\", \"_type\": \"Value\"}}, \"_type\": \"LargeList\"}},\n            {\"col\": LargeList({\"sub_col\": Value(\"int32\")})},\n        ),\n        (\n            {\"col\": {\"feature\": {\"sub_col\": {\"dtype\": \"int32\", \"_type\": \"Value\"}}, \"_type\": \"Sequence\"}},\n            {\"col\": {\"sub_col\": List(Value(\"int32\"))}},\n        ),\n    ],\n)\ndef test_features_from_dict_with_list_types(deserialized_features_dict, expected_features_dict):\n    features = Features.from_dict(deserialized_features_dict)\n    assert features == Features(expected_features_dict)\n\n\n@pytest.mark.parametrize(\n    \"deserialized_feature_dict, expected_feature\",\n    [\n        (\n            {\"feature\": {\"dtype\": \"int32\", \"_type\": \"Value\"}, \"_type\": \"LargeList\"},\n            
LargeList(Value(\"int32\")),\n        ),\n        (\n            {\"feature\": {\"dtype\": \"int32\", \"_type\": \"Value\"}, \"_type\": \"List\"},\n            List(Value(\"int32\")),\n        ),\n        (\n            {\"feature\": {\"sub_col\": {\"dtype\": \"int32\", \"_type\": \"Value\"}}, \"_type\": \"List\"},\n            List({\"sub_col\": Value(\"int32\")}),\n        ),\n        (\n            {\"feature\": {\"sub_col\": {\"dtype\": \"int32\", \"_type\": \"Value\"}}, \"_type\": \"LargeList\"},\n            LargeList({\"sub_col\": Value(\"int32\")}),\n        ),\n        (\n            {\"sub_col\": {\"feature\": {\"dtype\": \"int32\", \"_type\": \"Value\"}, \"_type\": \"List\"}},\n            {\"sub_col\": List(Value(\"int32\"))},\n        ),\n    ],\n)\ndef test_generate_from_dict_with_list_types(deserialized_feature_dict, expected_feature):\n    feature = generate_from_dict(deserialized_feature_dict)\n    assert feature == expected_feature\n\n\n@pytest.mark.parametrize(\n    \"features_dict, expected_features_yaml_list\",\n    [\n        ({\"col\": LargeList(Value(\"int32\"))}, [{\"name\": \"col\", \"large_list\": \"int32\"}]),\n        (\n            {\"col\": LargeList({\"sub_col\": Value(\"int32\")})},\n            [{\"name\": \"col\", \"large_list\": [{\"dtype\": \"int32\", \"name\": \"sub_col\"}]}],\n        ),\n    ],\n)\ndef test_features_to_yaml_list_with_large_list(features_dict, expected_features_yaml_list):\n    features = Features(features_dict)\n    features_yaml_list = features._to_yaml_list()\n    assert features_yaml_list == expected_features_yaml_list\n\n\n@pytest.mark.parametrize(\n    \"features_yaml_list, expected_features_dict\",\n    [\n        ([{\"name\": \"col\", \"large_list\": \"int32\"}], {\"col\": LargeList(Value(\"int32\"))}),\n        (\n            [{\"name\": \"col\", \"large_list\": [{\"dtype\": \"int32\", \"name\": \"sub_col\"}]}],\n            {\"col\": LargeList({\"sub_col\": Value(\"int32\")})},\n        ),\n    
],\n)\ndef test_features_from_yaml_list_with_large_list(features_yaml_list, expected_features_dict):\n    features = Features._from_yaml_list(features_yaml_list)\n    assert features == Features(expected_features_dict)\n\n\n@pytest.mark.parametrize(\"features\", SIMPLE_FEATURES + CUSTOM_FEATURES + NESTED_FEATURES + NESTED_CUSTOM_FEATURES)\ndef test_features_to_arrow_schema(features: Features):\n    arrow_schema = features.arrow_schema\n    assert isinstance(arrow_schema, pa.Schema)\n    reloaded = Features.from_arrow_schema(arrow_schema)\n    assert features == reloaded\n\n\nNESTED_COMPARISON = [\n    [\n        [Features({\"email\": Value(dtype=\"string\", id=None)}), Features({\"email\": Value(dtype=\"string\", id=None)})],\n        [Features({\"email\": Value(dtype=\"string\", id=None)}), Features({\"email\": Value(dtype=\"string\", id=None)})],\n    ],\n    [\n        [Features({\"email\": Value(dtype=\"string\", id=None)}), Features({\"email\": Value(dtype=\"null\", id=None)})],\n        [Features({\"email\": Value(dtype=\"string\", id=None)}), Features({\"email\": Value(dtype=\"string\", id=None)})],\n    ],\n    [\n        [\n            Features({\"speaker\": {\"email\": Value(dtype=\"string\", id=None)}}),\n            Features({\"speaker\": {\"email\": Value(dtype=\"string\", id=None)}}),\n        ],\n        [\n            Features({\"speaker\": {\"email\": Value(dtype=\"string\", id=None)}}),\n            Features({\"speaker\": {\"email\": Value(dtype=\"string\", id=None)}}),\n        ],\n    ],\n    [\n        [\n            Features({\"speaker\": {\"email\": Value(dtype=\"string\", id=None)}}),\n            Features({\"speaker\": {\"email\": Value(dtype=\"null\", id=None)}}),\n        ],\n        [\n            Features({\"speaker\": {\"email\": Value(dtype=\"string\", id=None)}}),\n            Features({\"speaker\": {\"email\": Value(dtype=\"string\", id=None)}}),\n        ],\n    ],\n    # List(Value(\"null\")) should be aligned with 
List(Struct(...))\n    [\n        [\n            Features({\"label\": List({\"type\": Value(\"string\"), \"score\": Value(\"float64\")})}),\n            Features({\"label\": List(Value(\"null\"))}),\n        ],\n        [\n            Features({\"label\": List({\"type\": Value(\"string\"), \"score\": Value(\"float64\")})}),\n            Features({\"label\": List({\"type\": Value(\"string\"), \"score\": Value(\"float64\")})}),\n        ],\n    ],\n    # List(Value(\"null\")) should be aligned with List(Value(\"string\"))\n    [\n        [\n            Features({\"tags\": List(Value(\"string\"))}),\n            Features({\"tags\": List(Value(\"null\"))}),\n        ],\n        [\n            Features({\"tags\": List(Value(\"string\"))}),\n            Features({\"tags\": List(Value(\"string\"))}),\n        ],\n    ],\n    # LargeList(Value(\"null\")) should be aligned with LargeList(Value(\"string\"))\n    [\n        [\n            Features({\"tags\": LargeList(Value(\"string\"))}),\n            Features({\"tags\": LargeList(Value(\"null\"))}),\n        ],\n        [\n            Features({\"tags\": LargeList(Value(\"string\"))}),\n            Features({\"tags\": LargeList(Value(\"string\"))}),\n        ],\n    ],\n    # Reversed order: null feature first, non-null second\n    [\n        [\n            Features({\"label\": List(Value(\"null\"))}),\n            Features({\"label\": List({\"type\": Value(\"string\"), \"score\": Value(\"float64\")})}),\n        ],\n        [\n            Features({\"label\": List({\"type\": Value(\"string\"), \"score\": Value(\"float64\")})}),\n            Features({\"label\": List({\"type\": Value(\"string\"), \"score\": Value(\"float64\")})}),\n        ],\n    ],\n]\n\n\n@pytest.mark.parametrize(\"features\", NESTED_COMPARISON)\ndef test_features_alignment(features: tuple[list[Features], list[Features]]):\n    inputs, expected = features\n    _check_if_features_can_be_aligned(inputs)  # Check that we can align, will raise otherwise.\n   
 assert _align_features(inputs) == expected\n\n\n@pytest.mark.parametrize(\"dtype\", [pa.int32, pa.string])\ndef test_features_from_arrow_schema_primitive_data_type(dtype):\n    schema = pa.schema([(\"column_name\", dtype())])\n    assert schema == Features.from_arrow_schema(schema).arrow_schema\n\n\n@pytest.mark.parametrize(\"scalar_dtype\", [pa.int32, pa.string])\n@pytest.mark.parametrize(\"list_dtype\", [pa.list_, pa.large_list])\ndef test_features_from_arrow_schema_list_data_type(list_dtype, scalar_dtype):\n    schema = pa.schema([(\"column_name\", list_dtype(scalar_dtype()))])\n    assert schema == Features.from_arrow_schema(schema).arrow_schema\n\n\n@pytest.mark.parametrize(\n    \"feature, other_feature\",\n    [\n        (List(Value(\"int64\")), List(Value(\"int64\"))),\n        (LargeList(Value(\"int64\")), LargeList(Value(\"int64\"))),\n        (List(Value(\"int64\")), List(Value(\"int64\"))),\n        (\n            List({\"sub_col_1\": Value(\"int64\"), \"sub_col_2\": Value(\"int64\")}),\n            List({\"sub_col_2\": Value(\"int64\"), \"sub_col_1\": Value(\"int64\")}),\n        ),\n        (\n            LargeList({\"sub_col_1\": Value(\"int64\"), \"sub_col_2\": Value(\"int64\")}),\n            LargeList({\"sub_col_2\": Value(\"int64\"), \"sub_col_1\": Value(\"int64\")}),\n        ),\n        (\n            {\"sub_col_1\": List(Value(\"int64\")), \"sub_col_2\": List(Value(\"int64\"))},\n            {\"sub_col_2\": List(Value(\"int64\")), \"sub_col_1\": List(Value(\"int64\"))},\n        ),\n    ],\n)\ndef test_features_reorder_fields_as_with_list_types(feature, other_feature):\n    features = Features({\"col\": feature})\n    other_features = Features({\"col\": other_feature})\n    new_features = features.reorder_fields_as(other_features)\n    assert new_features.type == other_features.type\n\n\n@pytest.mark.parametrize(\n    \"feature, expected_arrow_data_type\", [(Value(\"int64\"), pa.int64), (Value(\"string\"), pa.string)]\n)\ndef 
test_get_nested_type_with_scalar_feature(feature, expected_arrow_data_type):\n    arrow_data_type = get_nested_type(feature)\n    assert arrow_data_type == expected_arrow_data_type()\n\n\n@pytest.mark.parametrize(\n    \"scalar_feature, expected_arrow_primitive_data_type\", [(Value(\"int64\"), pa.int64), (Value(\"string\"), pa.string)]\n)\n@pytest.mark.parametrize(\n    \"list_feature, expected_arrow_nested_data_type\",\n    [(list_with, pa.list_), (LargeList, pa.large_list), (Sequence, pa.list_)],\n)\ndef test_get_nested_type_with_list_feature(\n    list_feature, expected_arrow_nested_data_type, scalar_feature, expected_arrow_primitive_data_type\n):\n    feature = list_feature(scalar_feature)\n    arrow_data_type = get_nested_type(feature)\n    assert arrow_data_type == expected_arrow_nested_data_type(expected_arrow_primitive_data_type())\n\n\n@pytest.mark.parametrize(\n    \"arrow_primitive_data_type, expected_feature\", [(pa.int32, Value(\"int32\")), (pa.string, Value(\"string\"))]\n)\ndef test_generate_from_arrow_type_with_arrow_primitive_data_type(arrow_primitive_data_type, expected_feature):\n    arrow_data_type = arrow_primitive_data_type()\n    feature = generate_from_arrow_type(arrow_data_type)\n    assert feature == expected_feature\n\n\n@pytest.mark.parametrize(\n    \"arrow_primitive_data_type, expected_scalar_feature\", [(pa.int32, Value(\"int32\")), (pa.string, Value(\"string\"))]\n)\n@pytest.mark.parametrize(\n    \"arrow_nested_data_type, expected_list_feature\", [(pa.list_, Sequence), (pa.large_list, LargeList)]\n)\ndef test_generate_from_arrow_type_with_arrow_nested_data_type(\n    arrow_nested_data_type, expected_list_feature, arrow_primitive_data_type, expected_scalar_feature\n):\n    arrow_data_type = arrow_nested_data_type(arrow_primitive_data_type())\n    feature = generate_from_arrow_type(arrow_data_type)\n    expected_feature = expected_list_feature(expected_scalar_feature)\n    assert feature == 
expected_feature\n\n\n@pytest.mark.parametrize(\n    \"schema\",\n    [[ClassLabel(names=[\"a\", \"b\"])], LargeList(ClassLabel(names=[\"a\", \"b\"])), List(ClassLabel(names=[\"a\", \"b\"]))],\n)\ndef test_check_non_null_non_empty_recursive_with_list_types(schema):\n    assert _check_non_null_non_empty_recursive([], schema) is False\n\n\n@pytest.mark.parametrize(\n    \"schema\",\n    [\n        [[ClassLabel(names=[\"a\", \"b\"])]],\n        LargeList(LargeList(ClassLabel(names=[\"a\", \"b\"]))),\n        List(List(ClassLabel(names=[\"a\", \"b\"]))),\n    ],\n)\ndef test_check_non_null_non_empty_recursive_with_nested_list_types(schema):\n    assert _check_non_null_non_empty_recursive([[]], schema) is False\n\n\n@pytest.mark.parametrize(\"feature\", [LargeList(Audio()), List(Audio())])\ndef test_require_decoding_with_list_types(feature):\n    assert require_decoding(feature)\n\n\n@pytest.mark.parametrize(\"feature\", [LargeList(Audio()), List(Audio())])\ndef test_require_storage_cast_with_list_types(feature):\n    assert require_storage_cast(feature)\n\n\n@pytest.mark.parametrize(\"feature\", [LargeList(Audio()), List(Audio())])\ndef test_require_storage_embed_with_list_types(feature):\n    assert require_storage_embed(feature)\n\n\n@pytest.mark.parametrize(\n    \"feature, expected\",\n    [(List(Value(\"int32\")), List(1)), (LargeList(Value(\"int32\")), LargeList(1)), (List(Value(\"int32\")), List(1))],\n)\ndef test_visit_with_list_types(feature, expected):\n    def func(x):\n        return 1 if isinstance(x, Value) else x\n\n    result = _visit(feature, func)\n    assert result == expected\n\n\n@pytest.mark.parametrize(\n    \"feature, expected\",\n    [\n        (Value(\"null\"), True),\n        (Value(\"string\"), False),\n        (Value(\"int64\"), False),\n        (List(Value(\"null\")), True),\n        (List(Value(\"string\")), False),\n        (List({\"a\": Value(\"string\")}), False),\n        (LargeList(Value(\"null\")), True),\n        
(LargeList(Value(\"string\")), False),\n        (LargeList({\"a\": Value(\"string\")}), False),\n        (ClassLabel(names=[\"a\", \"b\"]), False),\n    ],\n)\ndef test_is_null_feature(feature, expected):\n    assert _is_null_feature(feature) == expected\n"
  },
  {
    "path": "tests/features/test_image.py",
    "content": "import os\nimport shutil\nimport tarfile\nimport warnings\nfrom io import BytesIO\nfrom pathlib import Path\n\nimport numpy as np\nimport pandas as pd\nimport pyarrow as pa\nimport pytest\n\nfrom datasets import Column, Dataset, Features, Image, List, Value, concatenate_datasets, load_dataset\nfrom datasets.features.image import encode_np_array, image_to_bytes\n\nfrom ..utils import require_pil\n\n\n@pytest.fixture\ndef tar_jpg_path(shared_datadir, tmp_path_factory):\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    path = tmp_path_factory.mktemp(\"data\") / \"image_data.jpg.tar\"\n    with tarfile.TarFile(path, \"w\") as f:\n        f.add(image_path, arcname=os.path.basename(image_path))\n    return path\n\n\ndef iter_archive(archive_path):\n    with tarfile.open(archive_path) as tar:\n        for tarinfo in tar:\n            file_path = tarinfo.name\n            file_obj = tar.extractfile(tarinfo)\n            yield file_path, file_obj\n\n\ndef test_image_instantiation():\n    image = Image()\n    assert image.id is None\n    assert image.dtype == \"PIL.Image.Image\"\n    assert image.pa_type == pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()})\n    assert image._type == \"Image\"\n\n\ndef test_image_feature_type_to_arrow():\n    features = Features({\"image\": Image()})\n    assert features.arrow_schema == pa.schema({\"image\": Image().pa_type})\n    features = Features({\"struct_containing_an_image\": {\"image\": Image()}})\n    assert features.arrow_schema == pa.schema({\"struct_containing_an_image\": pa.struct({\"image\": Image().pa_type})})\n    features = Features({\"sequence_of_images\": List(Image())})\n    assert features.arrow_schema == pa.schema({\"sequence_of_images\": pa.list_(Image().pa_type)})\n\n\n@require_pil\n@pytest.mark.parametrize(\n    \"build_example\",\n    [\n        lambda image_path: image_path,\n        lambda image_path: Path(image_path),\n        lambda image_path: open(image_path, 
\"rb\").read(),\n        lambda image_path: {\"path\": image_path},\n        lambda image_path: {\"path\": image_path, \"bytes\": None},\n        lambda image_path: {\"path\": image_path, \"bytes\": open(image_path, \"rb\").read()},\n        lambda image_path: {\"path\": None, \"bytes\": open(image_path, \"rb\").read()},\n        lambda image_path: {\"bytes\": open(image_path, \"rb\").read()},\n    ],\n)\ndef test_image_feature_encode_example(shared_datadir, build_example):\n    import PIL.Image\n\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    image = Image()\n    encoded_example = image.encode_example(build_example(image_path))\n    assert isinstance(encoded_example, dict)\n    assert encoded_example.keys() == {\"bytes\", \"path\"}\n    assert encoded_example[\"bytes\"] is not None or encoded_example[\"path\"] is not None\n    decoded_example = image.decode_example(encoded_example)\n    assert isinstance(decoded_example, PIL.Image.Image)\n\n\n@require_pil\ndef test_image_decode_example(shared_datadir):\n    import PIL.Image\n\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    image = Image()\n    decoded_example = image.decode_example({\"path\": image_path, \"bytes\": None})\n\n    assert isinstance(decoded_example, PIL.Image.Image)\n    assert os.path.samefile(decoded_example.filename, image_path)\n    assert decoded_example.size == (640, 480)\n    assert decoded_example.mode == \"RGB\"\n\n    with pytest.raises(RuntimeError):\n        Image(decode=False).decode_example(image_path)\n\n\n@require_pil\ndef test_image_decode_example_with_exif_orientation_tag(shared_datadir):\n    import PIL.Image\n\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    buffer = BytesIO()\n    exif = PIL.Image.Exif()\n    exif[PIL.Image.ExifTags.Base.Orientation] = 8  # rotate the image for 90°\n    PIL.Image.open(image_path).save(buffer, format=\"JPEG\", exif=exif.tobytes())\n    image = Image()\n\n    decoded_example = 
image.decode_example({\"path\": None, \"bytes\": buffer.getvalue()})\n\n    assert isinstance(decoded_example, PIL.Image.Image)\n    assert decoded_example.size == (480, 640)  # rotated\n    assert decoded_example.mode == \"RGB\"\n\n\n@require_pil\ndef test_image_change_mode(shared_datadir):\n    import PIL.Image\n\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    image = Image(mode=\"YCbCr\")\n    decoded_example = image.decode_example({\"path\": image_path, \"bytes\": None})\n\n    assert isinstance(decoded_example, PIL.Image.Image)\n    assert not hasattr(decoded_example, \"filename\")  # changing the mode drops the filename\n    assert decoded_example.size == (640, 480)\n    assert decoded_example.mode == \"YCbCr\"\n\n\n@require_pil\ndef test_dataset_with_image_feature(shared_datadir):\n    import PIL.Image\n\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    data = {\"image\": [image_path]}\n    features = Features({\"image\": Image()})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"image\"}\n    assert isinstance(item[\"image\"], PIL.Image.Image)\n    assert os.path.samefile(item[\"image\"].filename, image_path)\n    assert item[\"image\"].format == \"JPEG\"\n    assert item[\"image\"].size == (640, 480)\n    assert item[\"image\"].mode == \"RGB\"\n    batch = dset[:1]\n    assert len(batch) == 1\n    assert batch.keys() == {\"image\"}\n    assert isinstance(batch[\"image\"], list) and all(isinstance(item, PIL.Image.Image) for item in batch[\"image\"])\n    assert os.path.samefile(batch[\"image\"][0].filename, image_path)\n    assert batch[\"image\"][0].format == \"JPEG\"\n    assert batch[\"image\"][0].size == (640, 480)\n    assert batch[\"image\"][0].mode == \"RGB\"\n    column = dset[\"image\"]\n    assert len(column) == 1\n    assert isinstance(column, Column) and all(isinstance(item, PIL.Image.Image) for item in column)\n    assert 
os.path.samefile(column[0].filename, image_path)\n    assert column[0].format == \"JPEG\"\n    assert column[0].size == (640, 480)\n    assert column[0].mode == \"RGB\"\n\n\n@require_pil\n@pytest.mark.parametrize(\"infer_feature\", [False, True])\ndef test_dataset_with_image_feature_from_pil_image(infer_feature, shared_datadir):\n    import PIL.Image\n\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    data = {\"image\": [PIL.Image.open(image_path)]}\n    features = Features({\"image\": Image()}) if not infer_feature else None\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"image\"}\n    assert isinstance(item[\"image\"], PIL.Image.Image)\n    assert os.path.samefile(item[\"image\"].filename, image_path)\n    assert item[\"image\"].format == \"JPEG\"\n    assert item[\"image\"].size == (640, 480)\n    assert item[\"image\"].mode == \"RGB\"\n    batch = dset[:1]\n    assert len(batch) == 1\n    assert batch.keys() == {\"image\"}\n    assert isinstance(batch[\"image\"], list) and all(isinstance(item, PIL.Image.Image) for item in batch[\"image\"])\n    assert os.path.samefile(batch[\"image\"][0].filename, image_path)\n    assert batch[\"image\"][0].format == \"JPEG\"\n    assert batch[\"image\"][0].size == (640, 480)\n    assert batch[\"image\"][0].mode == \"RGB\"\n    column = dset[\"image\"]\n    assert len(column) == 1\n    assert isinstance(column, Column) and all(isinstance(item, PIL.Image.Image) for item in column)\n    assert os.path.samefile(column[0].filename, image_path)\n    assert column[0].format == \"JPEG\"\n    assert column[0].size == (640, 480)\n    assert column[0].mode == \"RGB\"\n\n\n@require_pil\ndef test_dataset_with_image_feature_from_np_array():\n    import PIL.Image\n\n    image_array = np.arange(640 * 480, dtype=np.int32).reshape(480, 640)\n    data = {\"image\": [image_array]}\n    features = Features({\"image\": Image()})\n    dset = Dataset.from_dict(data, 
features=features)\n    item = dset[0]\n    assert item.keys() == {\"image\"}\n    assert isinstance(item[\"image\"], PIL.Image.Image)\n\n    np.testing.assert_array_equal(np.array(item[\"image\"]), image_array)\n    assert item[\"image\"].filename == \"\"\n    assert item[\"image\"].format in [\"PNG\", \"TIFF\"]\n    assert item[\"image\"].size == (640, 480)\n    batch = dset[:1]\n    assert len(batch) == 1\n    assert batch.keys() == {\"image\"}\n    assert isinstance(batch[\"image\"], list) and all(isinstance(item, PIL.Image.Image) for item in batch[\"image\"])\n    np.testing.assert_array_equal(np.array(batch[\"image\"][0]), image_array)\n    assert batch[\"image\"][0].filename == \"\"\n    assert batch[\"image\"][0].format in [\"PNG\", \"TIFF\"]\n    assert batch[\"image\"][0].size == (640, 480)\n    column = dset[\"image\"]\n    assert len(column) == 1\n    assert isinstance(column, Column) and all(isinstance(item, PIL.Image.Image) for item in column)\n    np.testing.assert_array_equal(np.array(column[0]), image_array)\n    assert column[0].filename == \"\"\n    assert column[0].format in [\"PNG\", \"TIFF\"]\n    assert column[0].size == (640, 480)\n\n\n@require_pil\ndef test_dataset_with_image_feature_tar_jpg(tar_jpg_path):\n    import PIL.Image\n\n    data = {\"image\": []}\n    for file_path, file_obj in iter_archive(tar_jpg_path):\n        data[\"image\"].append({\"path\": file_path, \"bytes\": file_obj.read()})\n        break\n\n    features = Features({\"image\": Image()})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"image\"}\n    assert isinstance(item[\"image\"], PIL.Image.Image)\n    assert item[\"image\"].filename == \"\"\n    assert item[\"image\"].format == \"JPEG\"\n    assert item[\"image\"].size == (640, 480)\n    assert item[\"image\"].mode == \"RGB\"\n    batch = dset[:1]\n    assert len(batch) == 1\n    assert batch.keys() == {\"image\"}\n    assert isinstance(batch[\"image\"], 
list) and all(isinstance(item, PIL.Image.Image) for item in batch[\"image\"])\n    assert batch[\"image\"][0].filename == \"\"\n    assert batch[\"image\"][0].format == \"JPEG\"\n    assert batch[\"image\"][0].size == (640, 480)\n    assert batch[\"image\"][0].mode == \"RGB\"\n    column = dset[\"image\"]\n    assert len(column) == 1\n    assert isinstance(column, Column) and all(isinstance(item, PIL.Image.Image) for item in column)\n    assert column[0].filename == \"\"\n    assert column[0].format == \"JPEG\"\n    assert column[0].size == (640, 480)\n    assert column[0].mode == \"RGB\"\n\n\n@require_pil\ndef test_dataset_with_image_feature_with_none():\n    data = {\"image\": [None]}\n    features = Features({\"image\": Image()})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"image\"}\n    assert item[\"image\"] is None\n    batch = dset[:1]\n    assert len(batch) == 1\n    assert batch.keys() == {\"image\"}\n    assert isinstance(batch[\"image\"], list) and all(item is None for item in batch[\"image\"])\n    column = dset[\"image\"]\n    assert len(column) == 1\n    assert isinstance(column, Column) and all(item is None for item in column)\n\n    # nested tests\n\n    data = {\"images\": [[None]]}\n    features = Features({\"images\": List(Image())})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"images\"}\n    assert all(i is None for i in item[\"images\"])\n\n    data = {\"nested\": [{\"image\": None}]}\n    features = Features({\"nested\": {\"image\": Image()}})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"nested\"}\n    assert item[\"nested\"].keys() == {\"image\"}\n    assert item[\"nested\"][\"image\"] is None\n\n\n@require_pil\n@pytest.mark.parametrize(\n    \"build_data\",\n    [\n        lambda image_path: {\"image\": [image_path]},\n        lambda image_path: {\"image\": 
[open(image_path, \"rb\").read()]},\n        lambda image_path: {\"image\": [{\"path\": image_path}]},\n        lambda image_path: {\"image\": [{\"path\": image_path, \"bytes\": None}]},\n        lambda image_path: {\"image\": [{\"path\": image_path, \"bytes\": open(image_path, \"rb\").read()}]},\n        lambda image_path: {\"image\": [{\"path\": None, \"bytes\": open(image_path, \"rb\").read()}]},\n        lambda image_path: {\"image\": [{\"bytes\": open(image_path, \"rb\").read()}]},\n    ],\n)\ndef test_dataset_cast_to_image_features(shared_datadir, build_data):\n    import PIL.Image\n\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    data = build_data(image_path)\n    dset = Dataset.from_dict(data)\n    item = dset.cast(Features({\"image\": Image()}))[0]\n    assert item.keys() == {\"image\"}\n    assert isinstance(item[\"image\"], PIL.Image.Image)\n    item = dset.cast_column(\"image\", Image())[0]\n    assert item.keys() == {\"image\"}\n    assert isinstance(item[\"image\"], PIL.Image.Image)\n\n\ndef test_dataset_cast_to_image_features_polars(shared_datadir):\n    import PIL.Image\n\n    pl = pytest.importorskip(\"polars\")\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    df = pl.DataFrame({\"image_path\": [image_path]})\n    dataset = Dataset.from_polars(df)\n    item = dataset.cast_column(\"image_path\", Image())[0]\n    assert item.keys() == {\"image_path\"}\n    assert isinstance(item[\"image_path\"], PIL.Image.Image)\n\n\n@require_pil\ndef test_dataset_concatenate_image_features(shared_datadir):\n    # we use a different data structure between 1 and 2 to make sure they are compatible with each other\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    data1 = {\"image\": [image_path]}\n    dset1 = Dataset.from_dict(data1, features=Features({\"image\": Image()}))\n    data2 = {\"image\": [{\"bytes\": open(image_path, \"rb\").read()}]}\n    dset2 = Dataset.from_dict(data2, features=Features({\"image\": 
Image()}))\n    concatenated_dataset = concatenate_datasets([dset1, dset2])\n    assert len(concatenated_dataset) == len(dset1) + len(dset2)\n    assert concatenated_dataset[0][\"image\"] == dset1[0][\"image\"]\n    assert concatenated_dataset[1][\"image\"] == dset2[0][\"image\"]\n\n\n@require_pil\ndef test_dataset_concatenate_nested_image_features(shared_datadir):\n    # we use a different data structure between 1 and 2 to make sure they are compatible with each other\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    features = Features({\"list_of_structs_of_images\": List({\"image\": Image()})})\n    data1 = {\"list_of_structs_of_images\": [[{\"image\": image_path}]]}\n    dset1 = Dataset.from_dict(data1, features=features)\n    data2 = {\"list_of_structs_of_images\": [[{\"image\": {\"bytes\": open(image_path, \"rb\").read()}}]]}\n    dset2 = Dataset.from_dict(data2, features=features)\n    concatenated_dataset = concatenate_datasets([dset1, dset2])\n    assert len(concatenated_dataset) == len(dset1) + len(dset2)\n    assert (\n        concatenated_dataset[0][\"list_of_structs_of_images\"][0][\"image\"]\n        == dset1[0][\"list_of_structs_of_images\"][0][\"image\"]\n    )\n    assert (\n        concatenated_dataset[1][\"list_of_structs_of_images\"][0][\"image\"]\n        == dset2[0][\"list_of_structs_of_images\"][0][\"image\"]\n    )\n\n\n@require_pil\ndef test_dataset_with_image_feature_map(shared_datadir):\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    data = {\"image\": [image_path], \"caption\": [\"cats sleeping\"]}\n    features = Features({\"image\": Image(), \"caption\": Value(\"string\")})\n    dset = Dataset.from_dict(data, features=features)\n\n    for item in dset.cast_column(\"image\", Image(decode=False)):\n        assert item.keys() == {\"image\", \"caption\"}\n        assert item == {\"image\": {\"path\": image_path, \"bytes\": None}, \"caption\": \"cats sleeping\"}\n\n    # no decoding\n\n    def 
process_caption(example):\n        example[\"caption\"] = \"Two \" + example[\"caption\"]\n        return example\n\n    processed_dset = dset.map(process_caption)\n    for item in processed_dset.cast_column(\"image\", Image(decode=False)):\n        assert item.keys() == {\"image\", \"caption\"}\n        assert item == {\"image\": {\"path\": image_path, \"bytes\": None}, \"caption\": \"Two cats sleeping\"}\n\n    # decoding example\n\n    def process_image_by_example(example):\n        example[\"mode\"] = example[\"image\"].mode\n        return example\n\n    decoded_dset = dset.map(process_image_by_example)\n    for item in decoded_dset.cast_column(\"image\", Image(decode=False)):\n        assert item.keys() == {\"image\", \"caption\", \"mode\"}\n        assert os.path.samefile(item[\"image\"][\"path\"], image_path)\n        assert item[\"caption\"] == \"cats sleeping\"\n        assert item[\"mode\"] == \"RGB\"\n\n    # decoding batch\n\n    def process_image_by_batch(batch):\n        batch[\"mode\"] = [image.mode for image in batch[\"image\"]]\n        return batch\n\n    decoded_dset = dset.map(process_image_by_batch, batched=True)\n    for item in decoded_dset.cast_column(\"image\", Image(decode=False)):\n        assert item.keys() == {\"image\", \"caption\", \"mode\"}\n        assert os.path.samefile(item[\"image\"][\"path\"], image_path)\n        assert item[\"caption\"] == \"cats sleeping\"\n        assert item[\"mode\"] == \"RGB\"\n\n\n@require_pil\ndef test_formatted_dataset_with_image_feature_map(shared_datadir):\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    pil_image = Image().decode_example({\"path\": image_path, \"bytes\": None})\n    data = {\"image\": [image_path], \"caption\": [\"cats sleeping\"]}\n    features = Features({\"image\": Image(), \"caption\": Value(\"string\")})\n\n    dset = Dataset.from_dict(data, features=features)\n    for item in dset.cast_column(\"image\", Image(decode=False)):\n        assert item.keys() == 
{\"image\", \"caption\"}\n        assert item == {\"image\": {\"path\": image_path, \"bytes\": None}, \"caption\": \"cats sleeping\"}\n\n    def process_image_by_example(example):\n        example[\"num_channels\"] = example[\"image\"].shape[-1]\n        return example\n\n    decoded_dset = dset.with_format(\"numpy\").map(process_image_by_example)\n    for item in decoded_dset.cast_column(\"image\", Image(decode=False)):\n        assert item.keys() == {\"image\", \"caption\", \"num_channels\"}\n        assert item[\"image\"] == encode_np_array(np.array(pil_image))\n        assert item[\"caption\"] == \"cats sleeping\"\n        assert item[\"num_channels\"] == 3\n\n    def process_image_by_batch(batch):\n        batch[\"num_channels\"] = [image.shape[-1] for image in batch[\"image\"]]\n        return batch\n\n    decoded_dset = dset.with_format(\"numpy\").map(process_image_by_batch, batched=True)\n    for item in decoded_dset.cast_column(\"image\", Image(decode=False)):\n        assert item.keys() == {\"image\", \"caption\", \"num_channels\"}\n        assert item[\"image\"] == encode_np_array(np.array(pil_image))\n        assert item[\"caption\"] == \"cats sleeping\"\n        assert item[\"num_channels\"] == 3\n\n\n@require_pil\ndef test_dataset_with_image_feature_map_change_image(shared_datadir):\n    import PIL.Image\n\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    pil_image = Image().decode_example({\"path\": image_path, \"bytes\": None})\n    data = {\"image\": [image_path]}\n    features = Features({\"image\": Image()})\n    dset = Dataset.from_dict(data, features=features)\n\n    for item in dset.cast_column(\"image\", Image(decode=False)):\n        assert item.keys() == {\"image\"}\n        assert item == {\n            \"image\": {\n                \"bytes\": None,\n                \"path\": image_path,\n            }\n        }\n\n    # return pil image\n\n    def process_image_resize_by_example(example):\n        example[\"image\"] = 
example[\"image\"].resize((100, 100))\n        return example\n\n    decoded_dset = dset.map(process_image_resize_by_example)\n    for item in decoded_dset.cast_column(\"image\", Image(decode=False)):\n        assert item.keys() == {\"image\"}\n        assert item == {\"image\": {\"bytes\": image_to_bytes(pil_image.resize((100, 100))), \"path\": None}}\n\n    def process_image_resize_by_batch(batch):\n        batch[\"image\"] = [image.resize((100, 100)) for image in batch[\"image\"]]\n        return batch\n\n    decoded_dset = dset.map(process_image_resize_by_batch, batched=True)\n    for item in decoded_dset.cast_column(\"image\", Image(decode=False)):\n        assert item.keys() == {\"image\"}\n        assert item == {\"image\": {\"bytes\": image_to_bytes(pil_image.resize((100, 100))), \"path\": None}}\n\n    # return np.ndarray (e.g. when using albumentations)\n\n    def process_image_resize_by_example_return_np_array(example):\n        example[\"image\"] = np.array(example[\"image\"].resize((100, 100)))\n        return example\n\n    decoded_dset = dset.map(process_image_resize_by_example_return_np_array)\n    for item in decoded_dset.cast_column(\"image\", Image(decode=False)):\n        assert item.keys() == {\"image\"}\n        assert item == {\n            \"image\": {\n                \"bytes\": image_to_bytes(PIL.Image.fromarray(np.array(pil_image.resize((100, 100))))),\n                \"path\": None,\n            }\n        }\n\n    def process_image_resize_by_batch_return_np_array(batch):\n        batch[\"image\"] = [np.array(image.resize((100, 100))) for image in batch[\"image\"]]\n        return batch\n\n    decoded_dset = dset.map(process_image_resize_by_batch_return_np_array, batched=True)\n    for item in decoded_dset.cast_column(\"image\", Image(decode=False)):\n        assert item.keys() == {\"image\"}\n        assert item == {\n            \"image\": {\n                \"bytes\": image_to_bytes(PIL.Image.fromarray(np.array(pil_image.resize((100, 
100))))),\n                \"path\": None,\n            }\n        }\n\n\n@require_pil\ndef test_formatted_dataset_with_image_feature(shared_datadir):\n    import PIL.Image\n\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    data = {\"image\": [image_path, image_path]}\n    features = Features({\"image\": Image()})\n    dset = Dataset.from_dict(data, features=features)\n    with dset.formatted_as(\"numpy\"):\n        item = dset[0]\n        assert item.keys() == {\"image\"}\n        assert isinstance(item[\"image\"], np.ndarray)\n        assert item[\"image\"].shape == (480, 640, 3)\n        batch = dset[:1]\n        assert batch.keys() == {\"image\"}\n        assert len(batch) == 1\n        assert isinstance(batch[\"image\"], np.ndarray)\n        assert batch[\"image\"].shape == (1, 480, 640, 3)\n        column = dset[\"image\"]\n        assert len(column) == 2\n        assert isinstance(column[:], np.ndarray)\n        assert column[:].shape == (2, 480, 640, 3)\n\n    with dset.formatted_as(\"pandas\"):\n        item = dset[0]\n        assert item.shape == (1, 1)\n        assert item.columns == [\"image\"]\n        assert isinstance(item[\"image\"][0], PIL.Image.Image)\n        assert os.path.samefile(item[\"image\"][0].filename, image_path)\n        assert item[\"image\"][0].format == \"JPEG\"\n        assert item[\"image\"][0].size == (640, 480)\n        assert item[\"image\"][0].mode == \"RGB\"\n        batch = dset[:1]\n        assert batch.shape == (1, 1)\n        assert batch.columns == [\"image\"]\n        assert isinstance(batch[\"image\"], pd.Series) and all(\n            isinstance(item, PIL.Image.Image) for item in batch[\"image\"]\n        )\n        assert os.path.samefile(batch[\"image\"][0].filename, image_path)\n        assert batch[\"image\"][0].format == \"JPEG\"\n        assert batch[\"image\"][0].size == (640, 480)\n        assert batch[\"image\"][0].mode == \"RGB\"\n        column = dset[\"image\"]\n        assert len(column) 
== 2\n        assert isinstance(column, pd.Series) and all(isinstance(item, PIL.Image.Image) for item in column)\n        assert os.path.samefile(column[0].filename, image_path)\n        assert column[0].format == \"JPEG\"\n        assert column[0].size == (640, 480)\n        assert column[0].mode == \"RGB\"\n\n\n@pytest.fixture\ndef img_dataset_dir(shared_datadir, tmp_path):\n    data_dir = tmp_path / \"dummy_img_dataset_data\"\n    data_dir.mkdir()\n    shutil.copy(str(shared_datadir / \"test_image_rgb.jpg\"), str(data_dir / \"image_rgb.jpg\"))\n    with open(data_dir / \"metadata.jsonl\", \"w\") as f:\n        f.write('{\"file_name\": \"image_rgb.jpg\", \"caption\": \"Two cats sleeping\"}\\n')\n    return str(data_dir)\n\n\n@require_pil\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_load_dataset_with_image_feature(shared_datadir, img_dataset_dir, streaming):\n    import PIL.Image\n\n    image_path = os.path.join(img_dataset_dir, \"image_rgb.jpg\")\n    dset = load_dataset(img_dataset_dir, split=\"train\", streaming=streaming)\n    item = dset[0] if not streaming else next(iter(dset))\n    assert item.keys() == {\"image\", \"caption\"}\n    assert isinstance(item[\"image\"], PIL.Image.Image)\n    assert os.path.samefile(item[\"image\"].filename, image_path)\n    assert item[\"image\"].format == \"JPEG\"\n    assert item[\"image\"].size == (640, 480)\n    assert item[\"image\"].mode == \"RGB\"\n\n\n@require_pil\ndef test_dataset_with_image_feature_undecoded(shared_datadir):\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    data = {\"image\": [image_path]}\n    features = Features({\"image\": Image(decode=False)})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"image\"}\n    assert item[\"image\"] == {\"path\": image_path, \"bytes\": None}\n    batch = dset[:1]\n    assert batch.keys() == {\"image\"}\n    assert len(batch[\"image\"]) == 1\n    assert batch[\"image\"][0] == 
{\"path\": image_path, \"bytes\": None}\n    column = dset[\"image\"]\n    assert len(column) == 1\n    assert column[0] == {\"path\": image_path, \"bytes\": None}\n\n\n@require_pil\ndef test_formatted_dataset_with_image_feature_undecoded(shared_datadir):\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    data = {\"image\": [image_path]}\n    features = Features({\"image\": Image(decode=False)})\n    dset = Dataset.from_dict(data, features=features)\n    with dset.formatted_as(\"numpy\"):\n        item = dset[0]\n        assert item.keys() == {\"image\"}\n        assert item[\"image\"] == {\"path\": image_path, \"bytes\": None}\n        batch = dset[:1]\n        assert batch.keys() == {\"image\"}\n        assert len(batch[\"image\"]) == 1\n        assert batch[\"image\"][0] == {\"path\": image_path, \"bytes\": None}\n        column = dset[\"image\"]\n        assert len(column) == 1\n        assert column[0] == {\"path\": image_path, \"bytes\": None}\n\n    with dset.formatted_as(\"pandas\"):\n        item = dset[0]\n        assert item.shape == (1, 1)\n        assert item.columns == [\"image\"]\n        assert item[\"image\"][0] == {\"path\": image_path, \"bytes\": None}\n        batch = dset[:1]\n        assert batch.shape == (1, 1)\n        assert batch.columns == [\"image\"]\n        assert batch[\"image\"][0] == {\"path\": image_path, \"bytes\": None}\n        column = dset[\"image\"]\n        assert len(column) == 1\n        assert column[0] == {\"path\": image_path, \"bytes\": None}\n\n\n@require_pil\ndef test_dataset_with_image_feature_map_undecoded(shared_datadir):\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    data = {\"image\": [image_path]}\n    features = Features({\"image\": Image(decode=False)})\n    dset = Dataset.from_dict(data, features=features)\n\n    def assert_image_example_undecoded(example):\n        assert example[\"image\"] == {\"path\": image_path, \"bytes\": None}\n\n    
dset.map(assert_image_example_undecoded)\n\n    def assert_image_batch_undecoded(batch):\n        for image in batch[\"image\"]:\n            assert image == {\"path\": image_path, \"bytes\": None}\n\n    dset.map(assert_image_batch_undecoded, batched=True)\n\n\n@require_pil\ndef test_image_embed_storage(shared_datadir):\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    example = {\"bytes\": None, \"path\": image_path}\n    storage = pa.array([example], type=pa.struct({\"bytes\": pa.binary(), \"path\": pa.string()}))\n    embedded_storage = Image().embed_storage(storage)\n    embedded_example = embedded_storage.to_pylist()[0]\n    assert embedded_example == {\"bytes\": open(image_path, \"rb\").read(), \"path\": \"test_image_rgb.jpg\"}\n\n\n@require_pil\n@pytest.mark.parametrize(\n    \"array, dtype_cast, expected_image_format\",\n    [\n        (np.arange(16).reshape(4, 4).astype(np.uint8), \"exact_match\", \"PNG\"),\n        (np.arange(16).reshape(4, 4).astype(np.uint16), \"exact_match\", \"TIFF\"),\n        (np.arange(16).reshape(4, 4).astype(np.int64), \"downcast->|i4\", \"TIFF\"),\n        (np.arange(16).reshape(4, 4).astype(np.complex128), \"error\", None),\n        (np.arange(16).reshape(2, 2, 4).astype(np.uint8), \"exact_match\", \"PNG\"),\n        (np.arange(16).reshape(2, 2, 4), \"downcast->|u1\", \"PNG\"),\n        (np.arange(16).reshape(2, 2, 4).astype(np.float64), \"error\", None),\n    ],\n)\ndef test_encode_np_array(array, dtype_cast, expected_image_format):\n    if dtype_cast.startswith(\"downcast\"):\n        _, dest_dtype = dtype_cast.split(\"->\")\n        dest_dtype = np.dtype(dest_dtype)\n        with pytest.warns(UserWarning, match=f\"Downcasting array dtype.+{dest_dtype}.+\"):\n            encoded_image = Image().encode_example(array)\n    elif dtype_cast == \"error\":\n        with pytest.raises(TypeError):\n            Image().encode_example(array)\n        return\n    else:  # exact_match (no warnings are raised)\n        
with warnings.catch_warnings():\n            warnings.simplefilter(\"error\")\n            encoded_image = Image().encode_example(array)\n\n    assert isinstance(encoded_image, dict)\n    assert encoded_image.keys() == {\"path\", \"bytes\"}\n    assert encoded_image[\"path\"] is None\n    assert encoded_image[\"bytes\"] is not None and isinstance(encoded_image[\"bytes\"], bytes)\n    decoded_image = Image().decode_example(encoded_image)\n    assert decoded_image.format == expected_image_format\n    np.testing.assert_array_equal(np.array(decoded_image), array)\n"
  },
  {
    "path": "tests/features/test_nifti.py",
    "content": "## taken from: https://github.com/yarikoptic/nitest-balls1/blob/2cd07d86e2cc2d3c612d5d4d659daccd7a58f126/NIFTI/T1.nii.gz\n\nfrom pathlib import Path\n\nimport pyarrow as pa\nimport pytest\n\nfrom datasets import Dataset, Features, Nifti, load_dataset\nfrom datasets.features.nifti import encode_nibabel_image\n\nfrom ..utils import require_nibabel\n\n\n@require_nibabel\n@pytest.mark.parametrize(\"nifti_file\", [\"test_nifti.nii\", \"test_nifti.nii.gz\"])\n@pytest.mark.parametrize(\n    \"build_example\",\n    [\n        lambda nifti_path: nifti_path,\n        lambda nifti_path: Path(nifti_path),\n        lambda nifti_path: open(nifti_path, \"rb\").read(),\n        lambda nifti_path: {\"path\": nifti_path},\n        lambda nifti_path: {\"path\": nifti_path, \"bytes\": None},\n        lambda nifti_path: {\"path\": nifti_path, \"bytes\": open(nifti_path, \"rb\").read()},\n        lambda nifti_path: {\"path\": None, \"bytes\": open(nifti_path, \"rb\").read()},\n        lambda nifti_path: {\"bytes\": open(nifti_path, \"rb\").read()},\n    ],\n)\ndef test_nifti_feature_encode_example(shared_datadir, nifti_file, build_example):\n    import nibabel\n\n    nifti_path = str(shared_datadir / nifti_file)\n    nifti = Nifti()\n    encoded_example = nifti.encode_example(build_example(nifti_path))\n    assert isinstance(encoded_example, dict)\n    assert encoded_example.keys() == {\"bytes\", \"path\"}\n    assert encoded_example[\"bytes\"] is not None or encoded_example[\"path\"] is not None\n    decoded_example = nifti.decode_example(encoded_example)\n    assert isinstance(decoded_example, nibabel.nifti1.Nifti1Image)\n\n\n@require_nibabel\n@pytest.mark.parametrize(\"nifti_file\", [\"test_nifti.nii\", \"test_nifti.nii.gz\"])\ndef test_dataset_with_nifti_feature(shared_datadir, nifti_file):\n    import nibabel\n\n    nifti_path = str(shared_datadir / nifti_file)\n    data = {\"nifti\": [nifti_path]}\n    features = Features({\"nifti\": Nifti()})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"nifti\"}\n    assert isinstance(item[\"nifti\"], nibabel.nifti1.Nifti1Image)\n    batch = dset[:1]\n    assert len(batch) == 1\n    assert batch.keys() == {\"nifti\"}\n    assert isinstance(batch[\"nifti\"], list) and all(\n        isinstance(item, nibabel.nifti1.Nifti1Image) for item in batch[\"nifti\"]\n    )\n    column = dset[\"nifti\"]\n    assert len(column) == 1\n    assert all(isinstance(item, nibabel.nifti1.Nifti1Image) for item in column)\n\n    # from bytes\n    with open(nifti_path, \"rb\") as f:\n        data = {\"nifti\": [f.read()]}\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"nifti\"}\n    assert isinstance(item[\"nifti\"], nibabel.nifti1.Nifti1Image)\n\n\n@require_nibabel\ndef test_encode_nibabel_image(shared_datadir):\n    import nibabel\n\n    nifti_path = str(shared_datadir / \"test_nifti.nii\")\n    img = nibabel.load(nifti_path)\n    encoded_example = encode_nibabel_image(img)\n    nifti = Nifti()\n    assert isinstance(encoded_example, dict)\n    assert encoded_example.keys() == {\"bytes\", \"path\"}\n    assert encoded_example[\"path\"] is not None and encoded_example[\"bytes\"] is None\n    decoded_example = nifti.decode_example(encoded_example)\n    assert isinstance(decoded_example, nibabel.nifti1.Nifti1Image)\n\n    # test bytes only\n    img.file_map = None\n    encoded_example_bytes = encode_nibabel_image(img)\n    assert isinstance(encoded_example_bytes, dict)\n    assert encoded_example_bytes[\"bytes\"] is not None and encoded_example_bytes[\"path\"] is None\n    # this cannot be converted back from bytes (yet)\n\n\n@require_nibabel\ndef test_embed_storage(shared_datadir):\n    from io import BytesIO\n\n    import nibabel as nib\n\n    nifti_path = str(shared_datadir / \"test_nifti.nii\")\n    img = nib.load(nifti_path)\n    nifti = Nifti()\n\n    bytes_array = pa.array([None], type=pa.binary())\n    path_array = pa.array([nifti_path], type=pa.string())\n    storage = pa.StructArray.from_arrays([bytes_array, path_array], [\"bytes\", \"path\"])\n\n    embedded_storage = nifti.embed_storage(storage)\n\n    embedded_bytes = embedded_storage[0][\"bytes\"].as_py()\n    # Fail fast with a clear assertion if nothing was embedded, before trying to parse the bytes.\n    assert embedded_bytes is not None\n\n    bio = BytesIO(embedded_bytes)\n    fh = nib.FileHolder(fileobj=bio)\n    nifti_img = nib.Nifti1Image.from_file_map({\"header\": fh, \"image\": fh})\n\n    assert nifti_img.header == img.header\n    assert (nifti_img.affine == img.affine).all()\n    assert (nifti_img.get_fdata() == img.get_fdata()).all()\n\n\n@require_nibabel\ndef test_load_zipped_file_locally(shared_datadir):\n    import nibabel as nib\n\n    nifti_path = str(shared_datadir / \"test_nifti.nii.gz\")\n\n    ds = load_dataset(\"niftifolder\", data_files=nifti_path)\n    assert isinstance(ds[\"train\"][0][\"nifti\"], nib.nifti1.Nifti1Image)\n\n\n@require_nibabel\ndef test_nifti_lazy_loading(shared_datadir):\n    import nibabel as nib\n    import numpy as np\n\n    nifti_path = str(shared_datadir / \"test_nifti.nii.gz\")\n    nifti = Nifti()\n    encoded_example = nifti.encode_example(nifti_path)\n    decoded_example = nifti.decode_example(encoded_example)\n\n    # Verify that the data object is an ArrayProxy (lazy) and not a numpy array (dense)\n    assert nib.is_proxy(decoded_example.dataobj)\n    assert not isinstance(decoded_example.dataobj, np.ndarray)\n\n    # Verify that we can still access the data\n    data = decoded_example.get_fdata()\n    assert data.shape == (80, 80, 10)\n"
  },
  {
    "path": "tests/features/test_pdf.py",
    "content": "from pathlib import Path\n\nimport pytest\n\nfrom datasets import Column, Dataset, Features, Pdf\n\nfrom ..utils import require_pdfplumber\n\n\n@require_pdfplumber\n@pytest.mark.parametrize(\n    \"build_example\",\n    [\n        lambda pdf_path: pdf_path,\n        lambda pdf_path: Path(pdf_path),\n        lambda pdf_path: open(pdf_path, \"rb\").read(),\n        lambda pdf_path: {\"path\": pdf_path},\n        lambda pdf_path: {\"path\": pdf_path, \"bytes\": None},\n        lambda pdf_path: {\"path\": pdf_path, \"bytes\": open(pdf_path, \"rb\").read()},\n        lambda pdf_path: {\"path\": None, \"bytes\": open(pdf_path, \"rb\").read()},\n        lambda pdf_path: {\"bytes\": open(pdf_path, \"rb\").read()},\n    ],\n)\ndef test_pdf_feature_encode_example(shared_datadir, build_example):\n    import pdfplumber\n\n    pdf_path = str(shared_datadir / \"test_pdf.pdf\")\n    pdf = Pdf()\n    encoded_example = pdf.encode_example(build_example(pdf_path))\n    assert isinstance(encoded_example, dict)\n    assert encoded_example.keys() == {\"bytes\", \"path\"}\n    assert encoded_example[\"bytes\"] is not None or encoded_example[\"path\"] is not None\n    decoded_example = pdf.decode_example(encoded_example)\n    assert isinstance(decoded_example, pdfplumber.pdf.PDF)\n\n\n@require_pdfplumber\ndef test_dataset_with_pdf_feature(shared_datadir):\n    import pdfplumber\n\n    pdf_path = str(shared_datadir / \"test_pdf.pdf\")\n    data = {\"pdf\": [pdf_path]}\n    features = Features({\"pdf\": Pdf()})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"pdf\"}\n    assert isinstance(item[\"pdf\"], pdfplumber.pdf.PDF)\n    batch = dset[:1]\n    assert len(batch) == 1\n    assert batch.keys() == {\"pdf\"}\n    assert isinstance(batch[\"pdf\"], list) and all(isinstance(item, pdfplumber.pdf.PDF) for item in batch[\"pdf\"])\n    column = dset[\"pdf\"]\n    assert len(column) == 1\n    # Selecting a single column yields a lazy Column (not a list), same as in the video feature tests.\n    assert isinstance(column, Column) and all(isinstance(item, pdfplumber.pdf.PDF) for item in column)\n\n    # from bytes\n    with open(pdf_path, \"rb\") as f:\n        data = {\"pdf\": [f.read()]}\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"pdf\"}\n    assert isinstance(item[\"pdf\"], pdfplumber.pdf.PDF)\n"
  },
  {
    "path": "tests/features/test_video.py",
    "content": "from pathlib import Path\n\nimport pytest\n\nfrom datasets import Column, Dataset, Features, Value, Video, load_dataset\n\nfrom ..utils import require_torchcodec\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\n    \"build_example\",\n    [\n        lambda video_path: video_path,\n        lambda video_path: Path(video_path),\n        lambda video_path: open(video_path, \"rb\").read(),\n        lambda video_path: {\"path\": video_path},\n        lambda video_path: {\"path\": video_path, \"bytes\": None},\n        lambda video_path: {\"path\": video_path, \"bytes\": open(video_path, \"rb\").read()},\n        lambda video_path: {\"path\": None, \"bytes\": open(video_path, \"rb\").read()},\n        lambda video_path: {\"bytes\": open(video_path, \"rb\").read()},\n    ],\n)\ndef test_video_feature_encode_example(shared_datadir, build_example):\n    from torchcodec.decoders import VideoDecoder\n\n    video_path = str(shared_datadir / \"test_video_66x50.mov\")\n    video = Video()\n    encoded_example = video.encode_example(build_example(video_path))\n    assert isinstance(encoded_example, dict)\n    assert encoded_example.keys() == {\"bytes\", \"path\"}\n    assert encoded_example[\"bytes\"] is not None or encoded_example[\"path\"] is not None\n    decoded_example = video.decode_example(encoded_example)\n    assert isinstance(decoded_example, VideoDecoder)\n\n\n@require_torchcodec\ndef test_dataset_with_video_feature(shared_datadir):\n    import torch\n    from torchcodec.decoders import VideoDecoder\n\n    video_path = str(shared_datadir / \"test_video_66x50.mov\")\n    data = {\"video\": [video_path]}\n    features = Features({\"video\": Video()})\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"video\"}\n    assert isinstance(item[\"video\"], VideoDecoder)\n    assert item[\"video\"].get_frame_at(0).data.shape == (3, 50, 66)\n    assert isinstance(item[\"video\"].get_frame_at(0).data, torch.Tensor)\n    batch = dset[:1]\n    assert len(batch) == 1\n    assert batch.keys() == {\"video\"}\n    assert isinstance(batch[\"video\"], list) and all(isinstance(item, VideoDecoder) for item in batch[\"video\"])\n    assert batch[\"video\"][0].get_frame_at(0).data.shape == (3, 50, 66)\n    assert isinstance(batch[\"video\"][0].get_frame_at(0).data, torch.Tensor)\n    column = dset[\"video\"]\n    assert len(column) == 1\n\n    assert isinstance(column, Column) and all(isinstance(item, VideoDecoder) for item in column)\n    assert next(iter(column)).get_frame_at(0).data.shape == (3, 50, 66)\n    assert isinstance(next(iter(column)).get_frame_at(0).data, torch.Tensor)\n\n    # from bytes\n    with open(video_path, \"rb\") as f:\n        data = {\"video\": [f.read()]}\n    dset = Dataset.from_dict(data, features=features)\n    item = dset[0]\n    assert item.keys() == {\"video\"}\n    assert isinstance(item[\"video\"], VideoDecoder)\n    assert item[\"video\"].get_frame_at(0).data.shape == (3, 50, 66)\n    assert isinstance(item[\"video\"].get_frame_at(0).data, torch.Tensor)\n\n\n@require_torchcodec\ndef test_dataset_with_video_map_and_formatted(shared_datadir):\n    from torchcodec.decoders import VideoDecoder\n\n    video_path = str(shared_datadir / \"test_video_66x50.mov\")\n    data = {\"video\": [video_path]}\n    features = Features({\"video\": Video()})\n    dset = Dataset.from_dict(data, features=features)\n    dset = dset.map(lambda x: x).with_format(\"numpy\")\n    example = dset[0]\n    assert isinstance(example[\"video\"], VideoDecoder)\n    # assert isinstance(example[\"video\"][0], np.ndarray)\n\n    # from bytes\n    with open(video_path, \"rb\") as f:\n        data = {\"video\": [f.read()]}\n    dset = Dataset.from_dict(data, features=features)\n    dset = dset.map(lambda x: x).with_format(\"numpy\")\n    example = dset[0]\n    assert isinstance(example[\"video\"], VideoDecoder)\n    # assert isinstance(example[\"video\"][0], np.ndarray)\n\n\n# Dataset casting and mapping\n@require_torchcodec\ndef test_dataset_with_video_feature_map_is_decoded(shared_datadir):\n    video_path = str(shared_datadir / \"test_video_66x50.mov\")\n    data = {\"video\": [video_path], \"text\": [\"Hello\"]}\n    features = Features({\"video\": Video(), \"text\": Value(\"string\")})\n    dset = Dataset.from_dict(data, features=features)\n\n    # Reading `.metadata` inside map() only works if the video column is decoded before the function is called.\n    def double_begin_stream_seconds_by_example(example):\n        begin_stream_seconds = example[\"video\"].metadata.begin_stream_seconds\n        example[\"double_begin_stream_seconds\"] = 2 * begin_stream_seconds\n        return example\n\n    decoded_dset = dset.map(double_begin_stream_seconds_by_example)\n    for item in decoded_dset.cast_column(\"video\", Video(decode=False)):\n        assert item.keys() == {\"video\", \"text\", \"double_begin_stream_seconds\"}\n        assert item[\"double_begin_stream_seconds\"] == 0.0\n\n    def double_begin_stream_seconds_by_batch(batch):\n        batch[\"double_begin_stream_seconds\"] = [\n            2 * video.metadata.begin_stream_seconds for video in batch[\"video\"]\n        ]\n        return batch\n\n    decoded_dset = dset.map(double_begin_stream_seconds_by_batch, batched=True)\n    for item in decoded_dset.cast_column(\"video\", Video(decode=False)):\n        assert item.keys() == {\"video\", \"text\", \"double_begin_stream_seconds\"}\n        assert item[\"double_begin_stream_seconds\"] == 0.0\n\n\n@pytest.fixture\ndef jsonl_video_dataset_path(shared_datadir, tmp_path_factory):\n    import json\n\n    video_path = str(shared_datadir / \"test_video_66x50.mov\")\n    data = [{\"video\": video_path, \"text\": \"Hello world!\"}]\n    path = str(tmp_path_factory.mktemp(\"data\") / \"video_dataset.jsonl\")\n    with open(path, \"w\") as f:\n        for item in data:\n            f.write(json.dumps(item) + \"\\n\")\n    return path\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_load_dataset_with_video_feature(streaming, jsonl_video_dataset_path, shared_datadir):\n    from torchcodec.decoders import VideoDecoder\n\n    video_path = str(shared_datadir / \"test_video_66x50.mov\")\n    data_files = jsonl_video_dataset_path\n    features = Features({\"video\": Video(), \"text\": Value(\"string\")})\n    dset = load_dataset(\"json\", split=\"train\", data_files=data_files, features=features, streaming=streaming)\n    item = dset[0] if not streaming else next(iter(dset))\n    assert item.keys() == {\"video\", \"text\"}\n    assert isinstance(item[\"video\"], VideoDecoder)\n    assert item[\"video\"].get_frame_at(0).data.shape == (3, 50, 66)\n    assert item[\"video\"].metadata.path == video_path\n"
  },
  {
    "path": "tests/fixtures/__init__.py",
    "content": ""
  },
  {
    "path": "tests/fixtures/files.py",
    "content": "import contextlib\nimport csv\nimport json\nimport os\nimport sqlite3\nimport tarfile\nimport textwrap\nimport zipfile\n\nimport pandas as pd\nimport pyarrow as pa\nimport pyarrow.parquet as pq\nimport pytest\n\nimport datasets\nimport datasets.config\n\n\n# dataset + arrow_file\n\n\n@pytest.fixture(scope=\"session\")\ndef dataset():\n    n = 10\n    features = datasets.Features(\n        {\n            \"tokens\": datasets.List(datasets.Value(\"string\")),\n            \"labels\": datasets.List(datasets.ClassLabel(names=[\"negative\", \"positive\"])),\n            \"answers\": {\n                \"text\": datasets.List(datasets.Value(\"string\")),\n                \"answer_start\": datasets.List(datasets.Value(\"int32\")),\n            },\n            \"id\": datasets.Value(\"int64\"),\n        }\n    )\n    dataset = datasets.Dataset.from_dict(\n        {\n            \"tokens\": [[\"foo\"] * 5] * n,\n            \"labels\": [[1] * 5] * n,\n            \"answers\": [{\"answer_start\": [97], \"text\": [\"1976\"]}] * 10,\n            \"id\": list(range(n)),\n        },\n        features=features,\n    )\n    return dataset\n\n\n@pytest.fixture(scope=\"session\")\ndef arrow_file(tmp_path_factory, dataset):\n    filename = str(tmp_path_factory.mktemp(\"data\") / \"file.arrow\")\n    dataset.map(cache_file_name=filename)\n    return filename\n\n\n# FILE_CONTENT + files\n\n\nFILE_CONTENT = \"\"\"\\\n    Text data.\n    Second line of data.\"\"\"\n\n\n@pytest.fixture(scope=\"session\")\ndef text_file_content():\n    return FILE_CONTENT\n\n\n@pytest.fixture(scope=\"session\")\ndef text_file(tmp_path_factory):\n    filename = tmp_path_factory.mktemp(\"data\") / \"file.txt\"\n    data = FILE_CONTENT\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return filename\n\n\n@pytest.fixture(scope=\"session\")\ndef bz2_file(tmp_path_factory):\n    import bz2\n\n    path = tmp_path_factory.mktemp(\"data\") / \"file.txt.bz2\"\n    data = 
bytes(FILE_CONTENT, \"utf-8\")\n    with bz2.open(path, \"wb\") as f:\n        f.write(data)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef gz_file(tmp_path_factory):\n    import gzip\n\n    path = str(tmp_path_factory.mktemp(\"data\") / \"file.txt.gz\")\n    data = bytes(FILE_CONTENT, \"utf-8\")\n    with gzip.open(path, \"wb\") as f:\n        f.write(data)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef lz4_file(tmp_path_factory):\n    if datasets.config.LZ4_AVAILABLE:\n        import lz4.frame\n\n        path = tmp_path_factory.mktemp(\"data\") / \"file.txt.lz4\"\n        data = bytes(FILE_CONTENT, \"utf-8\")\n        with lz4.frame.open(path, \"wb\") as f:\n            f.write(data)\n        return path\n\n\n@pytest.fixture(scope=\"session\")\ndef seven_zip_file(tmp_path_factory, text_file):\n    if datasets.config.PY7ZR_AVAILABLE:\n        import py7zr\n\n        path = tmp_path_factory.mktemp(\"data\") / \"file.txt.7z\"\n        with py7zr.SevenZipFile(path, \"w\") as archive:\n            archive.write(text_file, arcname=os.path.basename(text_file))\n        return path\n\n\n@pytest.fixture(scope=\"session\")\ndef tar_file(tmp_path_factory, text_file):\n    import tarfile\n\n    path = tmp_path_factory.mktemp(\"data\") / \"file.txt.tar\"\n    with tarfile.TarFile(path, \"w\") as f:\n        f.add(text_file, arcname=os.path.basename(text_file))\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef xz_file(tmp_path_factory):\n    import lzma\n\n    path = tmp_path_factory.mktemp(\"data\") / \"file.txt.xz\"\n    data = bytes(FILE_CONTENT, \"utf-8\")\n    with lzma.open(path, \"wb\") as f:\n        f.write(data)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef zip_file(tmp_path_factory, text_file):\n    import zipfile\n\n    path = tmp_path_factory.mktemp(\"data\") / \"file.txt.zip\"\n    with zipfile.ZipFile(path, \"w\") as f:\n        f.write(text_file, arcname=os.path.basename(text_file))\n    return 
path\n\n\n@pytest.fixture(scope=\"session\")\ndef zstd_file(tmp_path_factory):\n    if datasets.config.ZSTANDARD_AVAILABLE:\n        import zstandard as zstd\n\n        path = tmp_path_factory.mktemp(\"data\") / \"file.txt.zst\"\n        data = bytes(FILE_CONTENT, \"utf-8\")\n        with zstd.open(path, \"wb\") as f:\n            f.write(data)\n        return path\n\n\n# xml_file\n\n\n@pytest.fixture(scope=\"session\")\ndef xml_file(tmp_path_factory):\n    filename = tmp_path_factory.mktemp(\"data\") / \"file.xml\"\n    data = textwrap.dedent(\n        \"\"\"\\\n    <?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n    <tmx version=\"1.4\">\n      <header segtype=\"sentence\" srclang=\"ca\" />\n      <body>\n        <tu>\n          <tuv xml:lang=\"ca\"><seg>Contingut 1</seg></tuv>\n          <tuv xml:lang=\"en\"><seg>Content 1</seg></tuv>\n        </tu>\n        <tu>\n          <tuv xml:lang=\"ca\"><seg>Contingut 2</seg></tuv>\n          <tuv xml:lang=\"en\"><seg>Content 2</seg></tuv>\n        </tu>\n        <tu>\n          <tuv xml:lang=\"ca\"><seg>Contingut 3</seg></tuv>\n          <tuv xml:lang=\"en\"><seg>Content 3</seg></tuv>\n        </tu>\n        <tu>\n          <tuv xml:lang=\"ca\"><seg>Contingut 4</seg></tuv>\n          <tuv xml:lang=\"en\"><seg>Content 4</seg></tuv>\n        </tu>\n        <tu>\n          <tuv xml:lang=\"ca\"><seg>Contingut 5</seg></tuv>\n          <tuv xml:lang=\"en\"><seg>Content 5</seg></tuv>\n        </tu>\n      </body>\n    </tmx>\"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return filename\n\n\nDATA = [\n    {\"col_1\": \"0\", \"col_2\": 0, \"col_3\": 0.0},\n    {\"col_1\": \"1\", \"col_2\": 1, \"col_3\": 1.0},\n    {\"col_1\": \"2\", \"col_2\": 2, \"col_3\": 2.0},\n    {\"col_1\": \"3\", \"col_2\": 3, \"col_3\": 3.0},\n]\nDATA2 = [\n    {\"col_1\": \"4\", \"col_2\": 4, \"col_3\": 4.0},\n    {\"col_1\": \"5\", \"col_2\": 5, \"col_3\": 5.0},\n]\nDATA_DICT_OF_LISTS = {\n    \"col_1\": [\"0\", \"1\", 
\"2\", \"3\"],\n    \"col_2\": [0, 1, 2, 3],\n    \"col_3\": [0.0, 1.0, 2.0, 3.0],\n}\n\nDATA_312 = [\n    {\"col_3\": 0.0, \"col_1\": \"0\", \"col_2\": 0},\n    {\"col_3\": 1.0, \"col_1\": \"1\", \"col_2\": 1},\n]\n\nDATA_STR = [\n    {\"col_1\": \"s0\", \"col_2\": 0, \"col_3\": 0.0},\n    {\"col_1\": \"s1\", \"col_2\": 1, \"col_3\": 1.0},\n    {\"col_1\": \"s2\", \"col_2\": 2, \"col_3\": 2.0},\n    {\"col_1\": \"s3\", \"col_2\": 3, \"col_3\": 3.0},\n]\n\nDATA_MISSING_FIELDS = [\n    {\"col_1\": 1, \"col_2\": 2},\n    {\"col_1\": 1, \"col_3\": 3},\n]\n\nDATA_MIXED_TYPES = [\n    {\"col_1\": 1, \"col_2\": {\"a\": \"a\"}, \"col_3\": [{\"x\": \"x\"}]},\n    {\"col_1\": \"one\", \"col_2\": {\"b\": \"b\"}, \"col_3\": [{\"y\": \"y\"}]},\n    {\"col_1\": None, \"col_2\": None, \"col_3\": [None]},\n]\n\n\n@pytest.fixture(scope=\"session\")\ndef dataset_dict():\n    return DATA_DICT_OF_LISTS\n\n\n@pytest.fixture(scope=\"session\")\ndef arrow_path(tmp_path_factory):\n    dataset = datasets.Dataset.from_dict(DATA_DICT_OF_LISTS)\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset.arrow\")\n    dataset.map(cache_file_name=path)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef sqlite_path(tmp_path_factory):\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset.sqlite\")\n    with contextlib.closing(sqlite3.connect(path)) as con:\n        cur = con.cursor()\n        cur.execute(\"CREATE TABLE dataset(col_1 text, col_2 int, col_3 real)\")\n        for item in DATA:\n            cur.execute(\"INSERT INTO dataset(col_1, col_2, col_3) VALUES (?, ?, ?)\", tuple(item.values()))\n        con.commit()\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef csv_path(tmp_path_factory):\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset.csv\")\n    with open(path, \"w\", newline=\"\") as f:\n        writer = csv.DictWriter(f, fieldnames=[\"col_1\", \"col_2\", \"col_3\"])\n        writer.writeheader()\n        for item in DATA:\n            
writer.writerow(item)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef csv2_path(tmp_path_factory):\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset2.csv\")\n    with open(path, \"w\", newline=\"\") as f:\n        writer = csv.DictWriter(f, fieldnames=[\"col_1\", \"col_2\", \"col_3\"])\n        writer.writeheader()\n        for item in DATA:\n            writer.writerow(item)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef bz2_csv_path(csv_path, tmp_path_factory):\n    import bz2\n\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset.csv.bz2\"\n    with open(csv_path, \"rb\") as f:\n        data = f.read()\n    # data = bytes(FILE_CONTENT, \"utf-8\")\n    with bz2.open(path, \"wb\") as f:\n        f.write(data)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef zip_csv_path(csv_path, csv2_path, tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"zip_csv_path\") / \"csv-dataset.zip\"\n    with zipfile.ZipFile(path, \"w\") as f:\n        f.write(csv_path, arcname=os.path.basename(csv_path))\n        f.write(csv2_path, arcname=os.path.basename(csv2_path))\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef zip_uppercase_csv_path(csv_path, csv2_path, tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset.csv.zip\"\n    with zipfile.ZipFile(path, \"w\") as f:\n        f.write(csv_path, arcname=os.path.basename(csv_path.replace(\".csv\", \".CSV\")))\n        f.write(csv2_path, arcname=os.path.basename(csv2_path.replace(\".csv\", \".CSV\")))\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef zip_csv_with_dir_path(csv_path, csv2_path, tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset_with_dir.csv.zip\"\n    with zipfile.ZipFile(path, \"w\") as f:\n        f.write(csv_path, arcname=os.path.join(\"main_dir\", os.path.basename(csv_path)))\n        f.write(csv2_path, arcname=os.path.join(\"main_dir\", os.path.basename(csv2_path)))\n    return 
path\n\n\n@pytest.fixture(scope=\"session\")\ndef parquet_path(tmp_path_factory):\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset.parquet\")\n    schema = pa.schema(\n        {\n            \"col_1\": pa.string(),\n            \"col_2\": pa.int64(),\n            \"col_3\": pa.float64(),\n        }\n    )\n    with open(path, \"wb\") as f:\n        writer = pq.ParquetWriter(f, schema=schema)\n        pa_table = pa.Table.from_pydict({k: [DATA[i][k] for i in range(len(DATA))] for k in DATA[0]}, schema=schema)\n        writer.write_table(pa_table)\n        writer.close()\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef geoparquet_path(tmp_path_factory):\n    df = pd.read_parquet(path=\"https://github.com/opengeospatial/geoparquet/raw/v1.0.0/examples/example.parquet\")\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset.geoparquet\")\n    df.to_parquet(path=path)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef json_list_of_dicts_path(tmp_path_factory):\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset.json\")\n    data = {\"data\": DATA}\n    with open(path, \"w\") as f:\n        json.dump(data, f)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef json_dict_of_lists_path(tmp_path_factory):\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset.json\")\n    data = {\"data\": DATA_DICT_OF_LISTS}\n    with open(path, \"w\") as f:\n        json.dump(data, f)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef jsonl_path(tmp_path_factory):\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset.jsonl\")\n    with open(path, \"w\") as f:\n        for item in DATA:\n            f.write(json.dumps(item) + \"\\n\")\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef jsonl2_path(tmp_path_factory):\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset2.jsonl\")\n    with open(path, \"w\") as f:\n        for item in DATA:\n            f.write(json.dumps(item) + 
\"\\n\")\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef jsonl_312_path(tmp_path_factory):\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset_312.jsonl\")\n    with open(path, \"w\") as f:\n        for item in DATA_312:\n            f.write(json.dumps(item) + \"\\n\")\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef jsonl_str_path(tmp_path_factory):\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset-str.jsonl\")\n    with open(path, \"w\") as f:\n        for item in DATA_STR:\n            f.write(json.dumps(item) + \"\\n\")\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef jsonl_missing_fields_path(tmp_path_factory):\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset-missing-fields.jsonl\")\n    with open(path, \"w\") as f:\n        for item in DATA_MISSING_FIELDS:\n            f.write(json.dumps(item) + \"\\n\")\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef jsonl_mixed_types_path(tmp_path_factory):\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset-mixed-types.jsonl\")\n    with open(path, \"w\") as f:\n        for item in DATA_MIXED_TYPES:\n            f.write(json.dumps(item) + \"\\n\")\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef text_gz_path(tmp_path_factory, text_path):\n    import gzip\n\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset.txt.gz\")\n    with open(text_path, \"rb\") as orig_file:\n        with gzip.open(path, \"wb\") as zipped_file:\n            zipped_file.writelines(orig_file)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef jsonl_gz_path(tmp_path_factory, jsonl_path):\n    import gzip\n\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset.jsonl.gz\")\n    with open(jsonl_path, \"rb\") as orig_file:\n        with gzip.open(path, \"wb\") as zipped_file:\n            zipped_file.writelines(orig_file)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef zip_jsonl_path(jsonl_path, 
jsonl2_path, tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset.jsonl.zip\"\n    with zipfile.ZipFile(path, \"w\") as f:\n        f.write(jsonl_path, arcname=os.path.basename(jsonl_path))\n        f.write(jsonl2_path, arcname=os.path.basename(jsonl2_path))\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef zip_nested_jsonl_path(zip_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset_nested.jsonl.zip\"\n    with zipfile.ZipFile(path, \"w\") as f:\n        f.write(zip_jsonl_path, arcname=os.path.join(\"nested\", os.path.basename(zip_jsonl_path)))\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef zip_jsonl_with_dir_path(jsonl_path, jsonl2_path, tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset_with_dir.jsonl.zip\"\n    with zipfile.ZipFile(path, \"w\") as f:\n        f.write(jsonl_path, arcname=os.path.join(\"main_dir\", os.path.basename(jsonl_path)))\n        f.write(jsonl2_path, arcname=os.path.join(\"main_dir\", os.path.basename(jsonl2_path)))\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef tar_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset.jsonl.tar\"\n    with tarfile.TarFile(path, \"w\") as f:\n        f.add(jsonl_path, arcname=os.path.basename(jsonl_path))\n        f.add(jsonl2_path, arcname=os.path.basename(jsonl2_path))\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef tar_nested_jsonl_path(tar_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset_nested.jsonl.tar\"\n    with tarfile.TarFile(path, \"w\") as f:\n        f.add(tar_jsonl_path, arcname=os.path.join(\"nested\", os.path.basename(tar_jsonl_path)))\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef text_path(tmp_path_factory):\n    data = [\"0\", \"1\", \"2\", \"3\"]\n    path = 
str(tmp_path_factory.mktemp(\"data\") / \"dataset.txt\")\n    with open(path, \"w\") as f:\n        for item in data:\n            f.write(item + \"\\n\")\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef text2_path(tmp_path_factory):\n    data = [\"0\", \"1\", \"2\", \"3\"]\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset2.txt\")\n    with open(path, \"w\") as f:\n        for item in data:\n            f.write(item + \"\\n\")\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef text_dir(tmp_path_factory):\n    data = [\"0\", \"1\", \"2\", \"3\"]\n    path = tmp_path_factory.mktemp(\"data_text_dir\") / \"dataset.txt\"\n    with open(path, \"w\") as f:\n        for item in data:\n            f.write(item + \"\\n\")\n    return path.parent\n\n\n@pytest.fixture(scope=\"session\")\ndef text_dir_with_unsupported_extension(tmp_path_factory):\n    data = [\"0\", \"1\", \"2\", \"3\"]\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset.abc\"\n    with open(path, \"w\") as f:\n        for item in data:\n            f.write(item + \"\\n\")\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef zip_text_path(text_path, text2_path, tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset.text.zip\"\n    with zipfile.ZipFile(path, \"w\") as f:\n        f.write(text_path, arcname=os.path.basename(text_path))\n        f.write(text2_path, arcname=os.path.basename(text2_path))\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef zip_text_with_dir_path(text_path, text2_path, tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset_with_dir.text.zip\"\n    with zipfile.ZipFile(path, \"w\") as f:\n        f.write(text_path, arcname=os.path.join(\"main_dir\", os.path.basename(text_path)))\n        f.write(text2_path, arcname=os.path.join(\"main_dir\", os.path.basename(text2_path)))\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef zip_unsupported_ext_path(text_path, text2_path, 
tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset.ext.zip\"\n    with zipfile.ZipFile(path, \"w\") as f:\n        f.write(text_path, arcname=os.path.basename(\"unsupported.ext\"))\n        f.write(text2_path, arcname=os.path.basename(\"unsupported_2.ext\"))\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef text_path_with_unicode_new_lines(tmp_path_factory):\n    text = \"\\n\".join([\"First\", \"Second\\u2029with Unicode new line\", \"Third\"])\n    path = str(tmp_path_factory.mktemp(\"data\") / \"dataset_with_unicode_new_lines.txt\")\n    with open(path, \"w\", encoding=\"utf-8\") as f:\n        f.write(text)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef image_file():\n    return os.path.join(\"tests\", \"features\", \"data\", \"test_image_rgb.jpg\")\n\n\n@pytest.fixture(scope=\"session\")\ndef audio_file():\n    return os.path.join(\"tests\", \"features\", \"data\", \"test_audio_44100.wav\")\n\n\n@pytest.fixture(scope=\"session\")\ndef audio_file_44100():\n    return os.path.join(\"tests\", \"features\", \"data\", \"test_audio_44100.mp3\")\n\n\n@pytest.fixture(scope=\"session\")\ndef audio_file_16000():\n    return os.path.join(\"tests\", \"features\", \"data\", \"test_audio_16000.mp3\")\n\n\n@pytest.fixture(scope=\"session\")\ndef tensor_file(tmp_path_factory):\n    import torch\n\n    path = tmp_path_factory.mktemp(\"data\") / \"tensor.pth\"\n    with open(path, \"wb\") as f:\n        torch.save(torch.ones(128), f)\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef zip_image_path(image_file, tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"data\") / \"dataset.img.zip\"\n    with zipfile.ZipFile(path, \"w\") as f:\n        f.write(image_file, arcname=os.path.basename(image_file))\n        f.write(image_file, arcname=os.path.basename(image_file).replace(\".jpg\", \"2.jpg\"))\n    return path\n\n\n@pytest.fixture(scope=\"session\")\ndef data_dir_with_hidden_files(tmp_path_factory):\n    
data_dir = tmp_path_factory.mktemp(\"data_dir\")\n\n    (data_dir / \"subdir\").mkdir()\n    with open(data_dir / \"subdir\" / \"train.txt\", \"w\") as f:\n        f.write(\"foo\\n\" * 10)\n    with open(data_dir / \"subdir\" / \"test.txt\", \"w\") as f:\n        f.write(\"bar\\n\" * 10)\n    # hidden file\n    with open(data_dir / \"subdir\" / \".test.txt\", \"w\") as f:\n        f.write(\"bar\\n\" * 10)\n\n    # hidden directory\n    (data_dir / \".subdir\").mkdir()\n    with open(data_dir / \".subdir\" / \"train.txt\", \"w\") as f:\n        f.write(\"foo\\n\" * 10)\n    with open(data_dir / \".subdir\" / \"test.txt\", \"w\") as f:\n        f.write(\"bar\\n\" * 10)\n\n    return data_dir\n"
  },
  {
    "path": "tests/fixtures/fsspec.py",
    "content": "import posixpath\nfrom pathlib import Path\nfrom unittest.mock import patch\n\nimport pytest\nfrom fsspec.implementations.local import AbstractFileSystem, LocalFileSystem, stringify_path\nfrom fsspec.registry import _registry as _fsspec_registry\n\n\nclass MockFileSystem(AbstractFileSystem):\n    protocol = \"mock\"\n\n    def __init__(self, *args, local_root_dir, **kwargs):\n        super().__init__()\n        self._fs = LocalFileSystem(*args, **kwargs)\n        self.local_root_dir = Path(local_root_dir).resolve().as_posix() + \"/\"\n\n    def mkdir(self, path, *args, **kwargs):\n        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))\n        return self._fs.mkdir(path, *args, **kwargs)\n\n    def makedirs(self, path, *args, **kwargs):\n        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))\n        return self._fs.makedirs(path, *args, **kwargs)\n\n    def rmdir(self, path):\n        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))\n        return self._fs.rmdir(path)\n\n    def ls(self, path, detail=True, *args, **kwargs):\n        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))\n        out = self._fs.ls(path, detail=detail, *args, **kwargs)\n        if detail:\n            return [{**info, \"name\": info[\"name\"][len(self.local_root_dir) :]} for info in out]\n        else:\n            return [name[len(self.local_root_dir) :] for name in out]\n\n    def info(self, path, *args, **kwargs):\n        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))\n        out = dict(self._fs.info(path, *args, **kwargs))\n        out[\"name\"] = out[\"name\"][len(self.local_root_dir) :]\n        return out\n\n    def cp_file(self, path1, path2, *args, **kwargs):\n        path1 = posixpath.join(self.local_root_dir, self._strip_protocol(path1))\n        path2 = posixpath.join(self.local_root_dir, self._strip_protocol(path2))\n        return 
self._fs.cp_file(path1, path2, *args, **kwargs)\n\n    def rm_file(self, path, *args, **kwargs):\n        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))\n        return self._fs.rm_file(path, *args, **kwargs)\n\n    def rm(self, path, *args, **kwargs):\n        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))\n        return self._fs.rm(path, *args, **kwargs)\n\n    def _open(self, path, *args, **kwargs):\n        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))\n        return self._fs._open(path, *args, **kwargs)\n\n    def created(self, path):\n        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))\n        return self._fs.created(path)\n\n    def modified(self, path):\n        path = posixpath.join(self.local_root_dir, self._strip_protocol(path))\n        return self._fs.modified(path)\n\n    @classmethod\n    def _strip_protocol(cls, path):\n        path = stringify_path(path)\n        if path.startswith(\"mock://\"):\n            path = path[7:]\n        return path\n\n\nclass TmpDirFileSystem(MockFileSystem):\n    protocol = \"tmp\"\n    tmp_dir = None\n\n    def __init__(self, *args, **kwargs):\n        assert self.tmp_dir is not None, \"TmpDirFileSystem.tmp_dir is not set\"\n        super().__init__(*args, **kwargs, local_root_dir=self.tmp_dir, auto_mkdir=True)\n\n    @classmethod\n    def _strip_protocol(cls, path):\n        path = stringify_path(path)\n        if path.startswith(\"tmp://\"):\n            path = path[6:]\n        return path\n\n\n@pytest.fixture\ndef mock_fsspec():\n    _fsspec_registry[\"mock\"] = MockFileSystem\n    _fsspec_registry[\"tmp\"] = TmpDirFileSystem\n    yield\n    del _fsspec_registry[\"mock\"]\n    del _fsspec_registry[\"tmp\"]\n\n\n@pytest.fixture\ndef mockfs(tmp_path_factory, mock_fsspec):\n    local_fs_dir = tmp_path_factory.mktemp(\"mockfs\")\n    return MockFileSystem(local_root_dir=local_fs_dir, 
auto_mkdir=True)\n\n\n@pytest.fixture\ndef tmpfs(tmp_path_factory, mock_fsspec):\n    tmp_fs_dir = tmp_path_factory.mktemp(\"tmpfs\")\n    with patch.object(TmpDirFileSystem, \"tmp_dir\", tmp_fs_dir):\n        yield TmpDirFileSystem()\n        TmpDirFileSystem.clear_instance_cache()\n"
  },
  {
    "path": "tests/fixtures/hub.py",
    "content": "import os\nimport time\nimport uuid\nfrom contextlib import contextmanager\nfrom typing import Optional\n\nimport pytest\nfrom huggingface_hub.hf_api import HfApi\nfrom huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError\nfrom huggingface_hub.utils._headers import _http_user_agent\nfrom packaging import version\n\nfrom datasets import config\n\n\nif config.HF_HUB_VERSION >= version.parse(\"1.6.0\"):\n    from huggingface_hub.errors import BucketNotFoundError\n\nelse:\n    BucketNotFoundError = None\n\nCI_HUB_USER = \"__DUMMY_TRANSFORMERS_USER__\"\nCI_HUB_USER_FULL_NAME = \"Dummy User\"\nCI_HUB_USER_TOKEN = \"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\"\n\nCI_HUB_ENDPOINT = \"https://hub-ci.huggingface.co\"\nCI_HUB_DATASETS_URL = CI_HUB_ENDPOINT + \"/datasets/{repo_id}/resolve/{revision}/{path}\"\nCI_HFH_HUGGINGFACE_CO_URL_TEMPLATE = CI_HUB_ENDPOINT + \"/{repo_id}/resolve/{revision}/{filename}\"\n\n\n@pytest.fixture\ndef ci_hub_config(monkeypatch):\n    monkeypatch.setattr(\"datasets.config.HF_ENDPOINT\", CI_HUB_ENDPOINT)\n    monkeypatch.setattr(\"datasets.config.HUB_DATASETS_URL\", CI_HUB_DATASETS_URL)\n    monkeypatch.setattr(\"huggingface_hub.constants.HUGGINGFACE_CO_URL_TEMPLATE\", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE)\n    try:\n        # for backward compatibility with huggingface_hub 0.x\n        monkeypatch.setattr(\n            \"huggingface_hub.file_download.HUGGINGFACE_CO_URL_TEMPLATE\", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE\n        )\n    except AttributeError:\n        pass\n    old_environ = dict(os.environ)\n    os.environ[\"HF_ENDPOINT\"] = CI_HUB_ENDPOINT\n    yield\n    os.environ.clear()\n    os.environ.update(old_environ)\n\n\n@pytest.fixture\ndef set_ci_hub_access_token(ci_hub_config, monkeypatch):\n    # Enable implicit token\n    monkeypatch.setattr(\"huggingface_hub.constants.HF_HUB_DISABLE_IMPLICIT_TOKEN\", False)\n    old_environ = dict(os.environ)\n    os.environ[\"HF_TOKEN\"] = CI_HUB_USER_TOKEN\n    
os.environ[\"HF_HUB_DISABLE_IMPLICIT_TOKEN\"] = \"0\"\n    yield\n    os.environ.clear()\n    os.environ.update(old_environ)\n\n\ndef _http_ci_user_agent(*args, **kwargs):\n    ua = _http_user_agent(*args, **kwargs)\n    return ua + os.environ.get(\"CI_HEADERS\", \"\")\n\n\n@pytest.fixture(autouse=True)\ndef set_hf_ci_headers(monkeypatch):\n    old_environ = dict(os.environ)\n    os.environ[\"TRANSFORMERS_IS_CI\"] = \"1\"\n    monkeypatch.setattr(\"huggingface_hub.utils._headers._http_user_agent\", _http_ci_user_agent)\n    yield\n    os.environ.clear()\n    os.environ.update(old_environ)\n\n\n@pytest.fixture(scope=\"session\")\ndef hf_api():\n    return HfApi(endpoint=CI_HUB_ENDPOINT)\n\n\n@pytest.fixture(scope=\"session\")\ndef hf_token():\n    yield CI_HUB_USER_TOKEN\n\n\n@pytest.fixture\ndef cleanup_repo(hf_api: HfApi):\n    def _cleanup_repo(repo_id):\n        hf_api.delete_repo(repo_id, token=CI_HUB_USER_TOKEN, repo_type=\"dataset\")\n\n    return _cleanup_repo\n\n\n@pytest.fixture\ndef cleanup_bucket(hf_api: HfApi):\n    def _cleanup_bucket(bucket_id):\n        hf_api.delete_bucket(bucket_id, token=CI_HUB_USER_TOKEN)\n\n    return _cleanup_bucket\n\n\n@pytest.fixture\ndef temporary_repo(cleanup_repo):\n    @contextmanager\n    def _temporary_repo(repo_id: Optional[str] = None):\n        repo_id = repo_id or f\"{CI_HUB_USER}/test-dataset-{uuid.uuid4().hex[:6]}-{int(time.time() * 10e3)}\"\n        try:\n            yield repo_id\n        finally:\n            try:\n                cleanup_repo(repo_id)\n            except RepositoryNotFoundError:\n                pass\n\n    return _temporary_repo\n\n\n@pytest.fixture\ndef temporary_bucket(cleanup_bucket):\n    @contextmanager\n    def _temporary_bucket(bucket_id: Optional[str] = None):\n        bucket_id = bucket_id or f\"{CI_HUB_USER}/test-bucket-{uuid.uuid4().hex[:6]}-{int(time.time() * 10e3)}\"\n        try:\n            yield bucket_id\n        finally:\n            try:\n                
cleanup_bucket(bucket_id)\n            except BucketNotFoundError:\n                pass\n\n    return _temporary_bucket\n\n\n@pytest.fixture(scope=\"session\")\ndef _hf_gated_dataset_repo_txt_data(hf_api: HfApi, hf_token, text_file_content):\n    repo_name = f\"repo_txt_data-{int(time.time() * 10e6)}\"\n    repo_id = f\"{CI_HUB_USER}/{repo_name}\"\n    hf_api.create_repo(repo_id, token=hf_token, repo_type=\"dataset\")\n    hf_api.upload_file(\n        token=hf_token,\n        path_or_fileobj=text_file_content.encode(),\n        path_in_repo=\"data/text_data.txt\",\n        repo_id=repo_id,\n        repo_type=\"dataset\",\n    )\n    hf_api.update_repo_settings(repo_id, token=hf_token, repo_type=\"dataset\", gated=\"auto\")\n    yield repo_id\n    try:\n        hf_api.delete_repo(repo_id, token=hf_token, repo_type=\"dataset\")\n    except (HfHubHTTPError, ValueError):  # catch http error and token invalid error\n        pass\n\n\n@pytest.fixture()\ndef hf_gated_dataset_repo_txt_data(_hf_gated_dataset_repo_txt_data, ci_hub_config):\n    return _hf_gated_dataset_repo_txt_data\n\n\n@pytest.fixture(scope=\"session\")\ndef hf_private_dataset_repo_txt_data_(hf_api: HfApi, hf_token, text_file_content):\n    repo_name = f\"repo_txt_data-{int(time.time() * 10e6)}\"\n    repo_id = f\"{CI_HUB_USER}/{repo_name}\"\n    hf_api.create_repo(repo_id, token=hf_token, repo_type=\"dataset\", private=True)\n    hf_api.upload_file(\n        token=hf_token,\n        path_or_fileobj=text_file_content.encode(),\n        path_in_repo=\"data/text_data.txt\",\n        repo_id=repo_id,\n        repo_type=\"dataset\",\n    )\n    yield repo_id\n    try:\n        hf_api.delete_repo(repo_id, token=hf_token, repo_type=\"dataset\")\n    except (HfHubHTTPError, ValueError):  # catch http error and token invalid error\n        pass\n\n\n@pytest.fixture()\ndef hf_private_dataset_repo_txt_data(hf_private_dataset_repo_txt_data_, ci_hub_config):\n    return 
hf_private_dataset_repo_txt_data_\n\n\n@pytest.fixture(scope=\"session\")\ndef hf_private_dataset_repo_zipped_txt_data_(hf_api: HfApi, hf_token, zip_csv_with_dir_path):\n    repo_name = f\"repo_zipped_txt_data-{int(time.time() * 10e6)}\"\n    repo_id = f\"{CI_HUB_USER}/{repo_name}\"\n    hf_api.create_repo(repo_id, token=hf_token, repo_type=\"dataset\", private=True)\n    hf_api.upload_file(\n        token=hf_token,\n        path_or_fileobj=str(zip_csv_with_dir_path),\n        path_in_repo=\"data.zip\",\n        repo_id=repo_id,\n        repo_type=\"dataset\",\n    )\n    yield repo_id\n    try:\n        hf_api.delete_repo(repo_id, token=hf_token, repo_type=\"dataset\")\n    except (HfHubHTTPError, ValueError):  # catch http error and token invalid error\n        pass\n\n\n@pytest.fixture()\ndef hf_private_dataset_repo_zipped_txt_data(hf_private_dataset_repo_zipped_txt_data_, ci_hub_config):\n    return hf_private_dataset_repo_zipped_txt_data_\n\n\n@pytest.fixture(scope=\"session\")\ndef hf_private_dataset_repo_zipped_img_data_(hf_api: HfApi, hf_token, zip_image_path):\n    repo_name = f\"repo_zipped_img_data-{int(time.time() * 10e6)}\"\n    repo_id = f\"{CI_HUB_USER}/{repo_name}\"\n    hf_api.create_repo(repo_id, token=hf_token, repo_type=\"dataset\", private=True)\n    hf_api.upload_file(\n        token=hf_token,\n        path_or_fileobj=str(zip_image_path),\n        path_in_repo=\"data.zip\",\n        repo_id=repo_id,\n        repo_type=\"dataset\",\n    )\n    yield repo_id\n    try:\n        hf_api.delete_repo(repo_id, token=hf_token, repo_type=\"dataset\")\n    except (HfHubHTTPError, ValueError):  # catch http error and token invalid error\n        pass\n\n\n@pytest.fixture()\ndef hf_private_dataset_repo_zipped_img_data(hf_private_dataset_repo_zipped_img_data_, ci_hub_config):\n    return hf_private_dataset_repo_zipped_img_data_\n"
  },
  {
    "path": "tests/io/__init__.py",
    "content": ""
  },
  {
    "path": "tests/io/test_csv.py",
    "content": "import csv\nimport os\n\nimport fsspec\nimport pytest\n\nfrom datasets import Dataset, DatasetDict, Features, NamedSplit, Value\nfrom datasets.io.csv import CsvDatasetReader, CsvDatasetWriter\n\nfrom ..utils import assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases\n\n\ndef _check_csv_dataset(dataset, expected_features):\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 4\n    assert dataset.num_columns == 3\n    assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_dataset_from_csv_keep_in_memory(keep_in_memory, csv_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = CsvDatasetReader(csv_path, cache_dir=cache_dir, keep_in_memory=keep_in_memory).read()\n    _check_csv_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_dataset_from_csv_features(features, csv_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    # CSV file loses col_1 string dtype information: default now is \"int64\" instead of \"string\"\n    default_expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n 
   features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = CsvDatasetReader(csv_path, features=features, cache_dir=cache_dir).read()\n    _check_csv_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_dataset_from_csv_split(split, csv_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = CsvDatasetReader(csv_path, cache_dir=cache_dir, split=split).read()\n    _check_csv_dataset(dataset, expected_features)\n    assert dataset.split == split if split else \"train\"\n\n\n@pytest.mark.parametrize(\"path_type\", [str, list])\ndef test_dataset_from_csv_path_type(path_type, csv_path, tmp_path):\n    if issubclass(path_type, str):\n        path = csv_path\n    elif issubclass(path_type, list):\n        path = [csv_path]\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = CsvDatasetReader(path, cache_dir=cache_dir).read()\n    _check_csv_dataset(dataset, expected_features)\n\n\ndef _check_csv_datasetdict(dataset_dict, expected_features, splits=(\"train\",)):\n    assert isinstance(dataset_dict, DatasetDict)\n    for split in splits:\n        dataset = dataset_dict[split]\n        assert dataset.num_rows == 4\n        assert dataset.num_columns == 3\n        assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n        for feature, expected_dtype in expected_features.items():\n            assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_csv_datasetdict_reader_keep_in_memory(keep_in_memory, csv_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"int64\", 
\"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = CsvDatasetReader({\"train\": csv_path}, cache_dir=cache_dir, keep_in_memory=keep_in_memory).read()\n    _check_csv_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_csv_datasetdict_reader_features(features, csv_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    # CSV file loses col_1 string dtype information: default now is \"int64\" instead of \"string\"\n    default_expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = CsvDatasetReader({\"train\": csv_path}, features=features, cache_dir=cache_dir).read()\n    _check_csv_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_csv_datasetdict_reader_split(split, csv_path, tmp_path):\n    if split:\n        path = {split: csv_path}\n    else:\n        path = {\"train\": csv_path, \"test\": csv_path}\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = CsvDatasetReader(path, cache_dir=cache_dir).read()\n    _check_csv_datasetdict(dataset, expected_features, splits=list(path.keys()))\n    assert 
all(dataset[split].split == split for split in path.keys())\n\n\ndef iter_csv_file(csv_path):\n    with open(csv_path, encoding=\"utf-8\") as csvfile:\n        yield from csv.reader(csvfile)\n\n\ndef test_dataset_to_csv(csv_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    output_csv = os.path.join(cache_dir, \"tmp.csv\")\n    dataset = CsvDatasetReader({\"train\": csv_path}, cache_dir=cache_dir).read()\n    CsvDatasetWriter(dataset[\"train\"], output_csv, num_proc=1).write()\n\n    original_csv = iter_csv_file(csv_path)\n    expected_csv = iter_csv_file(output_csv)\n\n    for row1, row2 in zip(original_csv, expected_csv):\n        assert row1 == row2\n\n\ndef test_dataset_to_csv_multiproc(csv_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    output_csv = os.path.join(cache_dir, \"tmp.csv\")\n    dataset = CsvDatasetReader({\"train\": csv_path}, cache_dir=cache_dir).read()\n    CsvDatasetWriter(dataset[\"train\"], output_csv, num_proc=2).write()\n\n    original_csv = iter_csv_file(csv_path)\n    expected_csv = iter_csv_file(output_csv)\n\n    for row1, row2 in zip(original_csv, expected_csv):\n        assert row1 == row2\n\n\ndef test_dataset_to_csv_invalidproc(csv_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    output_csv = os.path.join(cache_dir, \"tmp.csv\")\n    dataset = CsvDatasetReader({\"train\": csv_path}, cache_dir=cache_dir).read()\n    with pytest.raises(ValueError):\n        CsvDatasetWriter(dataset[\"train\"], output_csv, num_proc=0)\n\n\ndef test_dataset_to_csv_fsspec(dataset, mockfs):\n    dataset_path = \"mock://my_dataset.csv\"\n    writer = CsvDatasetWriter(dataset, dataset_path, storage_options=mockfs.storage_options)\n    assert writer.write() > 0\n    assert mockfs.isfile(dataset_path)\n\n    with fsspec.open(dataset_path, \"rb\", **mockfs.storage_options) as f:\n        assert f.read()\n"
  },
  {
    "path": "tests/io/test_json.py",
    "content": "import io\nimport json\n\nimport fsspec\nimport pytest\n\nfrom datasets import Dataset, DatasetDict, Features, Json, List, NamedSplit, Value\nfrom datasets.io.json import JsonDatasetReader, JsonDatasetWriter\n\nfrom ..fixtures.files import DATA_MIXED_TYPES\nfrom ..utils import assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases\n\n\ndef _check_json_dataset(dataset, expected_features):\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 4\n    assert dataset.num_columns == 3\n    assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_dataset_from_json_keep_in_memory(keep_in_memory, jsonl_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = JsonDatasetReader(jsonl_path, cache_dir=cache_dir, keep_in_memory=keep_in_memory).read()\n    _check_json_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_dataset_from_json_features(features, jsonl_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        
Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = JsonDatasetReader(jsonl_path, features=features, cache_dir=cache_dir).read()\n    _check_json_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_3\": \"float64\", \"col_1\": \"string\", \"col_2\": \"int64\"},\n    ],\n)\ndef test_dataset_from_json_with_unsorted_column_names(features, jsonl_312_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"col_3\": \"float64\", \"col_1\": \"string\", \"col_2\": \"int64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = JsonDatasetReader(jsonl_312_path, features=features, cache_dir=cache_dir).read()\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 2\n    assert dataset.num_columns == 3\n    assert dataset.column_names == [\"col_3\", \"col_1\", \"col_2\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\ndef test_dataset_from_json_with_mismatched_features(jsonl_312_path, tmp_path):\n    # jsonl_312_path features are {\"col_3\": \"float64\", \"col_1\": \"string\", \"col_2\": \"int64\"}\n    features = {\"col_2\": \"int64\", \"col_3\": \"float64\", \"col_1\": \"string\"}\n    expected_features = features.copy()\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    cache_dir = tmp_path / \"cache\"\n    dataset = JsonDatasetReader(jsonl_312_path, features=features, cache_dir=cache_dir).read()\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 2\n    assert dataset.num_columns == 3\n    
assert dataset.column_names == [\"col_2\", \"col_3\", \"col_1\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\ndef test_dataset_from_json_with_missing_fields(jsonl_missing_fields_path, tmp_path):\n    expected_features = {\"col_1\": Value(\"int64\"), \"col_2\": Value(\"int64\"), \"col_3\": Value(\"int64\")}\n\n    cache_dir = tmp_path / \"cache\"\n    dataset = JsonDatasetReader(jsonl_missing_fields_path, cache_dir=cache_dir).read()\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 2\n    assert dataset.num_columns == 3\n    assert dataset.features == expected_features\n    assert list(dataset) == [\n        {\"col_1\": 1, \"col_2\": 2, \"col_3\": None},\n        {\"col_1\": 1, \"col_2\": None, \"col_3\": 3},\n    ]\n\n\ndef test_dataset_from_json_with_mixed_types(jsonl_mixed_types_path, tmp_path):\n    expected_features = {\"col_1\": Json(), \"col_2\": Json(), \"col_3\": List(Json())}\n\n    cache_dir = tmp_path / \"cache\"\n    dataset = JsonDatasetReader(jsonl_mixed_types_path, cache_dir=cache_dir).read()\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 3\n    assert dataset.num_columns == 3\n    assert dataset.features == expected_features\n    assert list(dataset) == DATA_MIXED_TYPES\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_dataset_from_json_split(split, jsonl_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = JsonDatasetReader(jsonl_path, cache_dir=cache_dir, split=split).read()\n    _check_json_dataset(dataset, expected_features)\n    assert dataset.split == split if split else \"train\"\n\n\n@pytest.mark.parametrize(\"path_type\", [str, list])\ndef test_dataset_from_json_path_type(path_type, jsonl_path, tmp_path):\n    if issubclass(path_type, str):\n  
      path = jsonl_path\n    elif issubclass(path_type, list):\n        path = [jsonl_path]\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = JsonDatasetReader(path, cache_dir=cache_dir).read()\n    _check_json_dataset(dataset, expected_features)\n\n\ndef _check_json_datasetdict(dataset_dict, expected_features, splits=(\"train\",)):\n    assert isinstance(dataset_dict, DatasetDict)\n    for split in splits:\n        dataset = dataset_dict[split]\n        assert dataset.num_rows == 4\n        assert dataset.num_columns == 3\n        assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n        for feature, expected_dtype in expected_features.items():\n            assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_datasetdict_from_json_keep_in_memory(keep_in_memory, jsonl_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = JsonDatasetReader({\"train\": jsonl_path}, cache_dir=cache_dir, keep_in_memory=keep_in_memory).read()\n    _check_json_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_datasetdict_from_json_features(features, jsonl_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", 
\"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = JsonDatasetReader({\"train\": jsonl_path}, features=features, cache_dir=cache_dir).read()\n    _check_json_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_datasetdict_from_json_splits(split, jsonl_path, tmp_path):\n    if split:\n        path = {split: jsonl_path}\n    else:\n        split = \"train\"\n        path = {\"train\": jsonl_path, \"test\": jsonl_path}\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = JsonDatasetReader(path, cache_dir=cache_dir).read()\n    _check_json_datasetdict(dataset, expected_features, splits=list(path.keys()))\n    assert all(dataset[split].split == split for split in path.keys())\n\n\ndef load_json(buffer):\n    return json.load(buffer)\n\n\ndef load_json_lines(buffer):\n    return [json.loads(line) for line in buffer]\n\n\nclass TestJsonDatasetWriter:\n    @pytest.mark.parametrize(\"lines, load_json_function\", [(True, load_json_lines), (False, load_json)])\n    def test_dataset_to_json_lines(self, lines, load_json_function, dataset):\n        with io.BytesIO() as buffer:\n            JsonDatasetWriter(dataset, buffer, lines=lines).write()\n            buffer.seek(0)\n            exported_content = load_json_function(buffer)\n        assert isinstance(exported_content, list)\n        assert isinstance(exported_content[0], dict)\n        assert len(exported_content) == 10\n\n    @pytest.mark.parametrize(\n        \"orient, container, keys, len_at\",\n        [\n            (\"records\", list, {\"tokens\", \"labels\", \"answers\", \"id\"}, None),\n            (\"split\", dict, 
{\"columns\", \"data\"}, \"data\"),\n            (\"index\", dict, set(\"0123456789\"), None),\n            (\"columns\", dict, {\"tokens\", \"labels\", \"answers\", \"id\"}, \"tokens\"),\n            (\"values\", list, None, None),\n            (\"table\", dict, {\"schema\", \"data\"}, \"data\"),\n        ],\n    )\n    def test_dataset_to_json_orient(self, orient, container, keys, len_at, dataset):\n        with io.BytesIO() as buffer:\n            JsonDatasetWriter(dataset, buffer, lines=False, orient=orient).write()\n            buffer.seek(0)\n            exported_content = load_json(buffer)\n        assert isinstance(exported_content, container)\n        if keys:\n            if container is dict:\n                assert exported_content.keys() == keys\n            else:\n                assert exported_content[0].keys() == keys\n        else:\n            assert not hasattr(exported_content, \"keys\") and not hasattr(exported_content[0], \"keys\")\n        if len_at:\n            assert len(exported_content[len_at]) == 10\n        else:\n            assert len(exported_content) == 10\n\n    @pytest.mark.parametrize(\"lines, load_json_function\", [(True, load_json_lines), (False, load_json)])\n    def test_dataset_to_json_lines_multiproc(self, lines, load_json_function, dataset):\n        with io.BytesIO() as buffer:\n            JsonDatasetWriter(dataset, buffer, lines=lines, num_proc=2).write()\n            buffer.seek(0)\n            exported_content = load_json_function(buffer)\n        assert isinstance(exported_content, list)\n        assert isinstance(exported_content[0], dict)\n        assert len(exported_content) == 10\n\n    @pytest.mark.parametrize(\n        \"orient, container, keys, len_at\",\n        [\n            (\"records\", list, {\"tokens\", \"labels\", \"answers\", \"id\"}, None),\n            (\"split\", dict, {\"columns\", \"data\"}, \"data\"),\n            (\"index\", dict, set(\"0123456789\"), None),\n            (\"columns\", dict, 
{\"tokens\", \"labels\", \"answers\", \"id\"}, \"tokens\"),\n            (\"values\", list, None, None),\n            (\"table\", dict, {\"schema\", \"data\"}, \"data\"),\n        ],\n    )\n    def test_dataset_to_json_orient_multiproc(self, orient, container, keys, len_at, dataset):\n        with io.BytesIO() as buffer:\n            JsonDatasetWriter(dataset, buffer, lines=False, orient=orient, num_proc=2).write()\n            buffer.seek(0)\n            exported_content = load_json(buffer)\n        assert isinstance(exported_content, container)\n        if keys:\n            if container is dict:\n                assert exported_content.keys() == keys\n            else:\n                assert exported_content[0].keys() == keys\n        else:\n            assert not hasattr(exported_content, \"keys\") and not hasattr(exported_content[0], \"keys\")\n        if len_at:\n            assert len(exported_content[len_at]) == 10\n        else:\n            assert len(exported_content) == 10\n\n    def test_dataset_to_json_orient_invalidproc(self, dataset):\n        with pytest.raises(ValueError):\n            with io.BytesIO() as buffer:\n                JsonDatasetWriter(dataset, buffer, num_proc=0)\n\n    @pytest.mark.parametrize(\"compression, extension\", [(\"gzip\", \"gz\"), (\"bz2\", \"bz2\"), (\"xz\", \"xz\")])\n    def test_dataset_to_json_compression(self, shared_datadir, tmp_path_factory, extension, compression, dataset):\n        path = tmp_path_factory.mktemp(\"data\") / f\"test.json.{extension}\"\n        original_path = str(shared_datadir / f\"test_file.json.{extension}\")\n        JsonDatasetWriter(dataset, path, compression=compression).write()\n\n        with fsspec.open(path, \"rb\", compression=\"infer\") as f:\n            exported_content = f.read()\n        with fsspec.open(original_path, \"rb\", compression=\"infer\") as f:\n            original_content = f.read()\n        assert exported_content == original_content\n\n    def 
test_dataset_to_json_fsspec(self, dataset, mockfs):\n        dataset_path = \"mock://my_dataset.json\"\n        writer = JsonDatasetWriter(dataset, dataset_path, storage_options=mockfs.storage_options)\n        assert writer.write() > 0\n        assert mockfs.isfile(dataset_path)\n\n        with fsspec.open(dataset_path, \"rb\", **mockfs.storage_options) as f:\n            assert f.read()\n"
  },
  {
    "path": "tests/io/test_parquet.py",
    "content": "import json\nimport unittest.mock\n\nimport fsspec\nimport pyarrow.parquet as pq\nimport pytest\n\nimport datasets.config\nfrom datasets import Audio, Dataset, DatasetDict, Features, IterableDatasetDict, Json, List, NamedSplit, Value, config\nfrom datasets.arrow_writer import get_arrow_writer_batch_size_from_features\nfrom datasets.features.image import Image\nfrom datasets.info import DatasetInfo\nfrom datasets.io.parquet import ParquetDatasetReader, ParquetDatasetWriter\n\nfrom ..utils import assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases\n\n\nSTRING_FROM_PANDAS = \"large_string\" if datasets.config.PANDAS_VERSION.major >= 3 else \"string\"\n\n\ndef _check_parquet_dataset(dataset, expected_features):\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 4\n    assert dataset.num_columns == 3\n    assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_dataset_from_parquet_keep_in_memory(keep_in_memory, parquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = ParquetDatasetReader(parquet_path, cache_dir=cache_dir, keep_in_memory=keep_in_memory).read()\n    _check_parquet_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": 
\"float32\"},\n    ],\n)\ndef test_dataset_from_parquet_features(features, parquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = ParquetDatasetReader(parquet_path, features=features, cache_dir=cache_dir).read()\n    _check_parquet_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_dataset_from_parquet_split(split, parquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = ParquetDatasetReader(parquet_path, cache_dir=cache_dir, split=split).read()\n    _check_parquet_dataset(dataset, expected_features)\n    assert dataset.split == split if split else \"train\"\n\n\n@pytest.mark.parametrize(\"path_type\", [str, list])\ndef test_dataset_from_parquet_path_type(path_type, parquet_path, tmp_path):\n    if issubclass(path_type, str):\n        path = parquet_path\n    elif issubclass(path_type, list):\n        path = [parquet_path]\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = ParquetDatasetReader(path, cache_dir=cache_dir).read()\n    _check_parquet_dataset(dataset, expected_features)\n\n\ndef test_parquet_read_geoparquet(geoparquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    dataset = ParquetDatasetReader(path_or_paths=geoparquet_path, cache_dir=cache_dir).read()\n\n    expected_features = {\n        \"pop_est\": \"float64\",\n        \"continent\": STRING_FROM_PANDAS,\n        \"name\": STRING_FROM_PANDAS,\n    
    \"gdp_md_est\": \"int64\",\n        \"geometry\": \"binary\",\n    }\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 5\n    assert dataset.num_columns == 6\n    assert dataset.column_names == [\"pop_est\", \"continent\", \"name\", \"iso_a3\", \"gdp_md_est\", \"geometry\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\ndef test_parquet_read_filters(parquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    filters = [(\"col_2\", \"==\", 1)]\n    dataset = ParquetDatasetReader(path_or_paths=parquet_path, cache_dir=cache_dir, filters=filters).read()\n\n    assert isinstance(dataset, Dataset)\n    assert all(example[\"col_2\"] == 1 for example in dataset)\n    assert dataset.num_rows == 1\n\n\ndef _check_parquet_datasetdict(dataset_dict, expected_features, splits=(\"train\",)):\n    assert isinstance(dataset_dict, (DatasetDict, IterableDatasetDict))\n    for split in splits:\n        dataset = dataset_dict[split]\n        assert len(list(dataset)) == 4\n        assert dataset.features is not None\n        assert set(dataset.features) == set(expected_features)\n        for feature, expected_dtype in expected_features.items():\n            assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_parquet_datasetdict_reader_keep_in_memory(keep_in_memory, parquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = ParquetDatasetReader(\n            {\"train\": parquet_path}, cache_dir=cache_dir, keep_in_memory=keep_in_memory\n        ).read()\n    _check_parquet_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"streaming\", [False, 
True])\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_parquet_datasetdict_reader_features(streaming, features, parquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = ParquetDatasetReader(\n        {\"train\": parquet_path}, features=features, cache_dir=cache_dir, streaming=streaming\n    ).read()\n    _check_parquet_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"streaming\", [False, True])\n@pytest.mark.parametrize(\"columns\", [None, [\"col_1\"]])\n@pytest.mark.parametrize(\"pass_features\", [False, True])\n@pytest.mark.parametrize(\"pass_info\", [False, True])\ndef test_parquet_datasetdict_reader_columns(streaming, columns, pass_features, pass_info, parquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n\n    default_expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    info = (\n        DatasetInfo(features=Features({feature: Value(dtype) for feature, dtype in default_expected_features.items()}))\n        if pass_info\n        else None\n    )\n\n    expected_features = (\n        {col: default_expected_features[col] for col in columns} if columns else default_expected_features\n    )\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in 
expected_features.items()}) if pass_features else None\n    )\n\n    dataset = ParquetDatasetReader(\n        {\"train\": parquet_path},\n        columns=columns,\n        features=features,\n        info=info,\n        cache_dir=cache_dir,\n        streaming=streaming,\n    ).read()\n    _check_parquet_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_parquet_datasetdict_reader_split(split, parquet_path, tmp_path):\n    if split:\n        path = {split: parquet_path}\n    else:\n        split = \"train\"\n        path = {\"train\": parquet_path, \"test\": parquet_path}\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = ParquetDatasetReader(path, cache_dir=cache_dir).read()\n    _check_parquet_datasetdict(dataset, expected_features, splits=list(path.keys()))\n    assert all(dataset[split].split == split for split in path.keys())\n\n\ndef test_parquet_write(dataset, tmp_path):\n    writer = ParquetDatasetWriter(dataset, tmp_path / \"foo.parquet\")\n    assert writer.write() > 0\n    pf = pq.ParquetFile(tmp_path / \"foo.parquet\")\n    output_table = pf.read()\n    assert dataset.data.table == output_table\n\n\ndef test_parquet_write_uses_content_defined_chunking(dataset, tmp_path):\n    assert config.DEFAULT_CDC_OPTIONS == {\n        \"min_chunk_size\": 256 * 1024,  # 256 KiB\n        \"max_chunk_size\": 1024 * 1024,  # 1 MiB\n        \"norm_level\": 0,\n    }\n\n    with unittest.mock.patch(\"pyarrow.parquet.ParquetWriter\") as MockWriter:\n        writer = ParquetDatasetWriter(dataset, tmp_path / \"foo.parquet\")\n        writer.write()\n        assert MockWriter.call_count == 1\n        _, kwargs = MockWriter.call_args\n        # Save or check the arguments as needed\n        assert \"use_content_defined_chunking\" in kwargs\n        assert 
kwargs[\"use_content_defined_chunking\"] == config.DEFAULT_CDC_OPTIONS\n\n\ndef test_parquet_writer_persist_cdc_options_as_metadata(dataset, tmp_path):\n    def write_and_get_metadata(**kwargs):\n        # write the dataset to parquet with the default CDC options\n        writer = ParquetDatasetWriter(dataset, tmp_path / \"foo.parquet\", **kwargs)\n        assert writer.write() > 0\n\n        # read the parquet KV metadata\n        metadata = pq.read_metadata(tmp_path / \"foo.parquet\")\n        key_value_metadata = metadata.metadata\n\n        return key_value_metadata\n\n    # by default no arguments are passed, same as passing True using the default options\n    for key_value_metadata in [write_and_get_metadata(), write_and_get_metadata(use_content_defined_chunking=True)]:\n        assert b\"content_defined_chunking\" in key_value_metadata\n        json_encoded_options = key_value_metadata[b\"content_defined_chunking\"].decode(\"utf-8\")\n        assert json.loads(json_encoded_options) == config.DEFAULT_CDC_OPTIONS\n\n    # passing False disables the content defined chunking and doesn't persist the options in metadata\n    key_value_metadata = write_and_get_metadata(use_content_defined_chunking=False)\n    assert b\"content_defined_chunking\" not in key_value_metadata\n\n    # passing custom options, using the custom options\n    custom_cdc_options = {\n        \"min_chunk_size\": 128 * 1024,  # 128 KiB\n        \"max_chunk_size\": 512 * 1024,  # 512 KiB\n        \"norm_level\": 1,\n    }\n    key_value_metadata = write_and_get_metadata(use_content_defined_chunking=custom_cdc_options)\n    assert b\"content_defined_chunking\" in key_value_metadata\n    json_encoded_options = key_value_metadata[b\"content_defined_chunking\"].decode(\"utf-8\")\n    assert json.loads(json_encoded_options) == custom_cdc_options\n\n\ndef test_dataset_to_parquet_keeps_features(shared_datadir, tmp_path):\n    image_path = str(shared_datadir / \"test_image_rgb.jpg\")\n    data = 
{\"image\": [image_path]}\n    features = Features({\"image\": Image()})\n    dataset = Dataset.from_dict(data, features=features)\n    writer = ParquetDatasetWriter(dataset, tmp_path / \"foo.parquet\")\n    assert writer.write() > 0\n\n    reloaded_dataset = Dataset.from_parquet(str(tmp_path / \"foo.parquet\"))\n    assert dataset.features == reloaded_dataset.features\n\n    reloaded_iterable_dataset = ParquetDatasetReader(str(tmp_path / \"foo.parquet\"), streaming=True).read()\n    assert dataset.features == reloaded_iterable_dataset.features\n\n\ndef test_dataset_to_parquet_json_for_empty_struct(shared_datadir, tmp_path):\n    data = {\"empty_struct\": [{}]}\n    features = Features({\"empty_struct\": Json()})\n    dataset = Dataset.from_dict(data, features=features)\n    writer = ParquetDatasetWriter(dataset, tmp_path / \"foo.parquet\")\n    assert writer.write() > 0\n\n    reloaded_dataset = Dataset.from_parquet(str(tmp_path / \"foo.parquet\"))\n    assert dataset.features == reloaded_dataset.features\n    assert dataset[0] == {\"empty_struct\": {}}\n\n    reloaded_iterable_dataset = ParquetDatasetReader(str(tmp_path / \"foo.parquet\"), streaming=True).read()\n    assert dataset.features == reloaded_iterable_dataset.features\n    assert next(iter(dataset)) == {\"empty_struct\": {}}\n\n\n@pytest.mark.parametrize(\n    \"feature, expected\",\n    [\n        (Features({\"foo\": Value(\"int32\")}), None),\n        (Features({\"image\": Image(), \"foo\": Value(\"int32\")}), config.ARROW_RECORD_BATCH_SIZE_FOR_IMAGE_DATASETS),\n        (Features({\"nested\": List(Audio())}), config.ARROW_RECORD_BATCH_SIZE_FOR_AUDIO_DATASETS),\n    ],\n)\ndef test_get_arrow_writer_batch_size_from_features(feature, expected):\n    assert get_arrow_writer_batch_size_from_features(feature) == expected\n\n\ndef test_dataset_to_parquet_fsspec(dataset, mockfs):\n    dataset_path = \"mock://my_dataset.csv\"\n    writer = ParquetDatasetWriter(dataset, dataset_path, 
storage_options=mockfs.storage_options)\n    assert writer.write() > 0\n    assert mockfs.isfile(dataset_path)\n\n    with fsspec.open(dataset_path, \"rb\", **mockfs.storage_options) as f:\n        assert f.read()\n"
  },
  {
    "path": "tests/io/test_sql.py",
    "content": "import contextlib\nimport os\nimport sqlite3\n\nimport pytest\n\nimport datasets.config\nfrom datasets import Dataset, Features, Value\nfrom datasets.io.sql import SqlDatasetReader, SqlDatasetWriter\n\nfrom ..utils import assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases, require_sqlalchemy\n\n\nSTRING_FROM_PANDAS = \"large_string\" if datasets.config.PANDAS_VERSION.major >= 3 else \"string\"\n\n\ndef _check_sql_dataset(dataset, expected_features):\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 4\n    assert dataset.num_columns == 3\n    assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\n@require_sqlalchemy\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_dataset_from_sql_keep_in_memory(keep_in_memory, sqlite_path, tmp_path, set_sqlalchemy_silence_uber_warning):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": STRING_FROM_PANDAS, \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = SqlDatasetReader(\n            \"dataset\", \"sqlite:///\" + sqlite_path, cache_dir=cache_dir, keep_in_memory=keep_in_memory\n        ).read()\n    _check_sql_dataset(dataset, expected_features)\n\n\n@require_sqlalchemy\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_dataset_from_sql_features(features, sqlite_path, tmp_path, set_sqlalchemy_silence_uber_warning):\n    
cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"col_1\": STRING_FROM_PANDAS, \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = SqlDatasetReader(\"dataset\", \"sqlite:///\" + sqlite_path, features=features, cache_dir=cache_dir).read()\n    _check_sql_dataset(dataset, expected_features)\n\n\ndef iter_sql_file(sqlite_path):\n    with contextlib.closing(sqlite3.connect(sqlite_path)) as con:\n        cur = con.cursor()\n        cur.execute(\"SELECT * FROM dataset\")\n        for row in cur:\n            yield row\n\n\n@require_sqlalchemy\ndef test_dataset_to_sql(sqlite_path, tmp_path, set_sqlalchemy_silence_uber_warning):\n    cache_dir = tmp_path / \"cache\"\n    output_sqlite_path = os.path.join(cache_dir, \"tmp.sql\")\n    dataset = SqlDatasetReader(\"dataset\", \"sqlite:///\" + sqlite_path, cache_dir=cache_dir).read()\n    SqlDatasetWriter(dataset, \"dataset\", \"sqlite:///\" + output_sqlite_path, num_proc=1).write()\n\n    original_sql = iter_sql_file(sqlite_path)\n    expected_sql = iter_sql_file(output_sqlite_path)\n\n    for row1, row2 in zip(original_sql, expected_sql):\n        assert row1 == row2\n\n\n@require_sqlalchemy\ndef test_dataset_to_sql_multiproc(sqlite_path, tmp_path, set_sqlalchemy_silence_uber_warning):\n    cache_dir = tmp_path / \"cache\"\n    output_sqlite_path = os.path.join(cache_dir, \"tmp.sql\")\n    dataset = SqlDatasetReader(\"dataset\", \"sqlite:///\" + sqlite_path, cache_dir=cache_dir).read()\n    SqlDatasetWriter(dataset, \"dataset\", \"sqlite:///\" + output_sqlite_path, num_proc=2).write()\n\n    original_sql = iter_sql_file(sqlite_path)\n    expected_sql = iter_sql_file(output_sqlite_path)\n\n    for row1, row2 in zip(original_sql, expected_sql):\n        assert row1 == 
row2\n\n\n@require_sqlalchemy\ndef test_dataset_to_sql_invalidproc(sqlite_path, tmp_path, set_sqlalchemy_silence_uber_warning):\n    cache_dir = tmp_path / \"cache\"\n    output_sqlite_path = os.path.join(cache_dir, \"tmp.sql\")\n    dataset = SqlDatasetReader(\"dataset\", \"sqlite:///\" + sqlite_path, cache_dir=cache_dir).read()\n    with pytest.raises(ValueError):\n        SqlDatasetWriter(dataset, \"dataset\", \"sqlite:///\" + output_sqlite_path, num_proc=0).write()\n"
  },
  {
    "path": "tests/io/test_text.py",
    "content": "import pytest\n\nfrom datasets import Dataset, DatasetDict, Features, NamedSplit, Value\nfrom datasets.io.text import TextDatasetReader\n\nfrom ..utils import assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases\n\n\ndef _check_text_dataset(dataset, expected_features):\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 4\n    assert dataset.num_columns == 1\n    assert dataset.column_names == [\"text\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_dataset_from_text_keep_in_memory(keep_in_memory, text_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"text\": \"string\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = TextDatasetReader(text_path, cache_dir=cache_dir, keep_in_memory=keep_in_memory).read()\n    _check_text_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"text\": \"string\"},\n        {\"text\": \"int32\"},\n        {\"text\": \"float32\"},\n    ],\n)\ndef test_dataset_from_text_features(features, text_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"text\": \"string\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = TextDatasetReader(text_path, features=features, cache_dir=cache_dir).read()\n    _check_text_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_dataset_from_text_split(split, text_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    
expected_features = {\"text\": \"string\"}\n    dataset = TextDatasetReader(text_path, cache_dir=cache_dir, split=split).read()\n    _check_text_dataset(dataset, expected_features)\n    assert dataset.split == split if split else \"train\"\n\n\n@pytest.mark.parametrize(\"path_type\", [str, list])\ndef test_dataset_from_text_path_type(path_type, text_path, tmp_path):\n    if issubclass(path_type, str):\n        path = text_path\n    elif issubclass(path_type, list):\n        path = [text_path]\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"text\": \"string\"}\n    dataset = TextDatasetReader(path, cache_dir=cache_dir).read()\n    _check_text_dataset(dataset, expected_features)\n\n\ndef _check_text_datasetdict(dataset_dict, expected_features, splits=(\"train\",)):\n    assert isinstance(dataset_dict, DatasetDict)\n    for split in splits:\n        dataset = dataset_dict[split]\n        assert dataset.num_rows == 4\n        assert dataset.num_columns == 1\n        assert dataset.column_names == [\"text\"]\n        for feature, expected_dtype in expected_features.items():\n            assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_datasetdict_from_text_keep_in_memory(keep_in_memory, text_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"text\": \"string\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = TextDatasetReader({\"train\": text_path}, cache_dir=cache_dir, keep_in_memory=keep_in_memory).read()\n    _check_text_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"text\": \"string\"},\n        {\"text\": \"int32\"},\n        {\"text\": \"float32\"},\n    ],\n)\ndef test_datasetdict_from_text_features(features, text_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    # Text files 
have a single \"text\" column whose default dtype is \"string\"\n    default_expected_features = {\"text\": \"string\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = TextDatasetReader({\"train\": text_path}, features=features, cache_dir=cache_dir).read()\n    _check_text_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_datasetdict_from_text_split(split, text_path, tmp_path):\n    if split:\n        path = {split: text_path}\n    else:\n        split = \"train\"\n        path = {\"train\": text_path, \"test\": text_path}\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"text\": \"string\"}\n    dataset = TextDatasetReader(path, cache_dir=cache_dir).read()\n    _check_text_datasetdict(dataset, expected_features, splits=list(path.keys()))\n    assert all(dataset[split].split == split for split in path.keys())\n"
  },
  {
    "path": "tests/packaged_modules/__init__.py",
    "content": ""
  },
  {
    "path": "tests/packaged_modules/test_arrow.py",
    "content": "import pyarrow as pa\nimport pytest\n\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesList\nfrom datasets.packaged_modules.arrow.arrow import Arrow, ArrowConfig\n\n\n@pytest.fixture\ndef arrow_file_streaming_format(tmp_path):\n    filename = tmp_path / \"stream.arrow\"\n    testdata = [[1, 1, 1], [0, 100, 6], [1, 90, 900]]\n\n    schema = pa.schema([pa.field(\"input_ids\", pa.list_(pa.int32()))])\n    array = pa.array(testdata, type=pa.list_(pa.int32()))\n    table = pa.Table.from_arrays([array], schema=schema)\n    with open(filename, \"wb\") as f:\n        with pa.ipc.new_stream(f, schema) as writer:\n            writer.write_table(table)\n    return str(filename)\n\n\n@pytest.fixture\ndef arrow_file_file_format(tmp_path):\n    filename = tmp_path / \"file.arrow\"\n    testdata = [[1, 1, 1], [0, 100, 6], [1, 90, 900]]\n\n    schema = pa.schema([pa.field(\"input_ids\", pa.list_(pa.int32()))])\n    array = pa.array(testdata, type=pa.list_(pa.int32()))\n    table = pa.Table.from_arrays([array], schema=schema)\n    with open(filename, \"wb\") as f:\n        with pa.ipc.new_file(f, schema) as writer:\n            writer.write_table(table)\n    return str(filename)\n\n\n@pytest.mark.parametrize(\n    \"file_fixture, config_kwargs\",\n    [\n        (\"arrow_file_streaming_format\", {}),\n        (\"arrow_file_file_format\", {}),\n    ],\n)\ndef test_arrow_generate_tables(file_fixture, config_kwargs, request):\n    arrow = Arrow(**config_kwargs)\n    generator = arrow._generate_tables([request.getfixturevalue(file_fixture)])\n    pa_table = pa.concat_tables([table for _, table in generator])\n\n    expected = {\"input_ids\": [[1, 1, 1], [0, 100, 6], [1, 90, 900]]}\n    assert pa_table.to_pydict() == expected\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = 
ArrowConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = ArrowConfig(name=\"name\", data_files=data_files)\n"
  },
  {
    "path": "tests/packaged_modules/test_audiofolder.py",
    "content": "import shutil\nimport textwrap\n\nimport numpy as np\nimport pytest\n\nfrom datasets import Audio, ClassLabel, Features\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesDict, DataFilesList, get_data_patterns\nfrom datasets.download.streaming_download_manager import StreamingDownloadManager\nfrom datasets.packaged_modules.audiofolder.audiofolder import AudioFolder, AudioFolderConfig\n\nfrom ..utils import require_torchcodec\n\n\n@pytest.fixture\ndef cache_dir(tmp_path):\n    return str(tmp_path / \"audiofolder_cache_dir\")\n\n\n@pytest.fixture\ndef data_files_with_labels_no_metadata(tmp_path, audio_file):\n    data_dir = tmp_path / \"data_files_with_labels_no_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    subdir_class_0 = data_dir / \"fr\"\n    subdir_class_0.mkdir(parents=True, exist_ok=True)\n    subdir_class_1 = data_dir / \"uk\"\n    subdir_class_1.mkdir(parents=True, exist_ok=True)\n\n    audio_filename = subdir_class_0 / \"audio_fr.wav\"\n    shutil.copyfile(audio_file, audio_filename)\n    audio_filename2 = subdir_class_1 / \"audio_uk.wav\"\n    shutil.copyfile(audio_file, audio_filename2)\n\n    data_files_with_labels_no_metadata = DataFilesDict.from_patterns(\n        get_data_patterns(str(data_dir)), data_dir.as_posix()\n    )\n\n    return data_files_with_labels_no_metadata\n\n\n@pytest.fixture\ndef audio_file_with_metadata(tmp_path, audio_file):\n    audio_filename = tmp_path / \"audio_file.wav\"\n    shutil.copyfile(audio_file, audio_filename)\n    audio_metadata_filename = tmp_path / \"metadata.jsonl\"\n    audio_metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"audio_file.wav\", \"text\": \"Audio transcription\"}\n        \"\"\"\n    )\n    with open(audio_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(audio_metadata)\n    return str(audio_filename), str(audio_metadata_filename)\n\n\n@pytest.fixture\ndef 
data_files_with_one_split_and_metadata(tmp_path, audio_file):\n    data_dir = tmp_path / \"audiofolder_data_dir_with_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    subdir = data_dir / \"subdir\"\n    subdir.mkdir(parents=True, exist_ok=True)\n\n    audio_filename = data_dir / \"audio_file.wav\"\n    shutil.copyfile(audio_file, audio_filename)\n    audio_filename2 = data_dir / \"audio_file2.wav\"\n    shutil.copyfile(audio_file, audio_filename2)\n    audio_filename3 = subdir / \"audio_file3.wav\"  # in subdir\n    shutil.copyfile(audio_file, audio_filename3)\n\n    audio_metadata_filename = data_dir / \"metadata.jsonl\"\n    audio_metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"audio_file.wav\", \"text\": \"First audio transcription\"}\n        {\"file_name\": \"audio_file2.wav\", \"text\": \"Second audio transcription\"}\n        {\"file_name\": \"subdir/audio_file3.wav\", \"text\": \"Third audio transcription (in subdir)\"}\n        \"\"\"\n    )\n    with open(audio_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(audio_metadata)\n    data_files_with_one_split_and_metadata = DataFilesDict.from_patterns(\n        get_data_patterns(str(data_dir)), data_dir.as_posix()\n    )\n    assert len(data_files_with_one_split_and_metadata) == 1\n    assert len(data_files_with_one_split_and_metadata[\"train\"]) == 4\n    return data_files_with_one_split_and_metadata\n\n\n@pytest.fixture(params=[\"jsonl\", \"csv\"])\ndef data_files_with_two_splits_and_metadata(request, tmp_path, audio_file):\n    data_dir = tmp_path / \"audiofolder_data_dir_with_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    train_dir = data_dir / \"train\"\n    train_dir.mkdir(parents=True, exist_ok=True)\n    test_dir = data_dir / \"test\"\n    test_dir.mkdir(parents=True, exist_ok=True)\n\n    audio_filename = train_dir / \"audio_file.wav\"  # train audio\n    shutil.copyfile(audio_file, audio_filename)\n    audio_filename2 = 
train_dir / \"audio_file2.wav\"  # train audio\n    shutil.copyfile(audio_file, audio_filename2)\n    audio_filename3 = test_dir / \"audio_file3.wav\"  # test audio\n    shutil.copyfile(audio_file, audio_filename3)\n\n    train_audio_metadata_filename = train_dir / f\"metadata.{request.param}\"\n    audio_metadata = (\n        textwrap.dedent(\n            \"\"\"\\\n        {\"file_name\": \"audio_file.wav\", \"text\": \"First train audio transcription\"}\n        {\"file_name\": \"audio_file2.wav\", \"text\": \"Second train audio transcription\"}\n        \"\"\"\n        )\n        if request.param == \"jsonl\"\n        else textwrap.dedent(\n            \"\"\"\\\n        file_name,text\n        audio_file.wav,First train audio transcription\n        audio_file2.wav,Second train audio transcription\n        \"\"\"\n        )\n    )\n    with open(train_audio_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(audio_metadata)\n    test_audio_metadata_filename = test_dir / f\"metadata.{request.param}\"\n    audio_metadata = (\n        textwrap.dedent(\n            \"\"\"\\\n        {\"file_name\": \"audio_file3.wav\", \"text\": \"Test audio transcription\"}\n        \"\"\"\n        )\n        if request.param == \"jsonl\"\n        else textwrap.dedent(\n            \"\"\"\\\n        file_name,text\n        audio_file3.wav,Test audio transcription\n        \"\"\"\n        )\n    )\n    with open(test_audio_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(audio_metadata)\n    data_files_with_two_splits_and_metadata = DataFilesDict.from_patterns(\n        get_data_patterns(str(data_dir)), data_dir.as_posix()\n    )\n    assert len(data_files_with_two_splits_and_metadata) == 2\n    assert len(data_files_with_two_splits_and_metadata[\"train\"]) == 3\n    assert len(data_files_with_two_splits_and_metadata[\"test\"]) == 2\n    return data_files_with_two_splits_and_metadata\n\n\n@pytest.fixture\ndef data_files_with_zip_archives(tmp_path, 
audio_file_44100, audio_file_16000):\n    data_dir = tmp_path / \"audiofolder_data_dir_with_zip_archives\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    archive_dir = data_dir / \"archive\"\n    archive_dir.mkdir(parents=True, exist_ok=True)\n    subdir = archive_dir / \"subdir\"\n    subdir.mkdir(parents=True, exist_ok=True)\n\n    audio_filename = archive_dir / \"audio_file.mp3\"\n    shutil.copyfile(audio_file_44100, audio_filename)\n    audio_filename2 = subdir / \"audio_file2.mp3\"  # in subdir\n    shutil.copyfile(audio_file_16000, audio_filename2)\n\n    audio_metadata_filename = archive_dir / \"metadata.jsonl\"\n    audio_metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"audio_file.mp3\", \"text\": \"First audio transcription\"}\n        {\"file_name\": \"subdir/audio_file2.mp3\", \"text\": \"Second audio transcription (in subdir)\"}\n        \"\"\"\n    )\n\n    with open(audio_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(audio_metadata)\n\n    shutil.make_archive(str(archive_dir), \"zip\", archive_dir)\n    shutil.rmtree(str(archive_dir))\n\n    data_files_with_zip_archives = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n\n    assert len(data_files_with_zip_archives) == 1\n    assert len(data_files_with_zip_archives[\"train\"]) == 1\n    return data_files_with_zip_archives\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = AudioFolderConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = AudioFolderConfig(name=\"name\", data_files=data_files)\n\n\n@require_torchcodec\n# check that labels are inferred correctly from dir 
names\ndef test_generate_examples_with_labels(data_files_with_labels_no_metadata, cache_dir):\n    # there are no metadata.jsonl files in this test case\n    audiofolder = AudioFolder(data_files=data_files_with_labels_no_metadata, cache_dir=cache_dir, drop_labels=False)\n    audiofolder.download_and_prepare()\n    assert audiofolder.info.features == Features({\"audio\": Audio(), \"label\": ClassLabel(names=[\"fr\", \"uk\"])})\n    dataset = list(audiofolder.as_dataset()[\"train\"])\n    label_feature = audiofolder.info.features[\"label\"]\n\n    assert dataset[0][\"label\"] == label_feature._str2int[\"fr\"]\n    assert dataset[1][\"label\"] == label_feature._str2int[\"uk\"]\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\"drop_metadata\", [None, True, False])\n@pytest.mark.parametrize(\"drop_labels\", [None, True, False])\ndef test_generate_examples_drop_labels(data_files_with_labels_no_metadata, drop_metadata, drop_labels):\n    audiofolder = AudioFolder(\n        drop_metadata=drop_metadata, drop_labels=drop_labels, data_files=data_files_with_labels_no_metadata\n    )\n    gen_kwargs = audiofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    # removing the labels explicitly requires drop_labels=True\n    assert gen_kwargs[\"add_labels\"] is not bool(drop_labels)\n    assert gen_kwargs[\"add_metadata\"] is False  # metadata files is not present in this case\n    generator = audiofolder._generate_examples(**gen_kwargs)\n    if not drop_labels:\n        assert all(\n            example.keys() == {\"audio\", \"label\"} and all(val is not None for val in example.values())\n            for _, example in generator\n        )\n    else:\n        assert all(\n            example.keys() == {\"audio\"} and all(val is not None for val in example.values())\n            for _, example in generator\n        )\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\"drop_metadata\", [None, True, False])\n@pytest.mark.parametrize(\"drop_labels\", [None, True, 
False])\ndef test_generate_examples_drop_metadata(audio_file_with_metadata, drop_metadata, drop_labels):\n    audio_file, audio_metadata_file = audio_file_with_metadata\n    audiofolder = AudioFolder(\n        drop_metadata=drop_metadata, drop_labels=drop_labels, data_files={\"train\": [audio_file, audio_metadata_file]}\n    )\n    gen_kwargs = audiofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    # since the dataset has metadata, removing the metadata explicitly requires drop_metadata=True\n    assert gen_kwargs[\"add_metadata\"] is not bool(drop_metadata)\n    # since the dataset has metadata, adding the labels explicitly requires drop_labels=False\n    assert gen_kwargs[\"add_labels\"] is False\n    generator = audiofolder._generate_examples(**gen_kwargs)\n    expected_columns = {\"audio\"}\n    if gen_kwargs[\"add_metadata\"]:\n        expected_columns.add(\"text\")\n    if gen_kwargs[\"add_labels\"]:\n        expected_columns.add(\"label\")\n    result = [example for _, example in generator]\n    assert len(result) == 1\n    example = result[0]\n    assert example.keys() == expected_columns\n    for column in expected_columns:\n        assert example[column] is not None\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_data_files_with_metadata_and_single_split(streaming, cache_dir, data_files_with_one_split_and_metadata):\n    data_files = data_files_with_one_split_and_metadata\n    audiofolder = AudioFolder(data_files=data_files, cache_dir=cache_dir)\n    audiofolder.download_and_prepare()\n    datasets = audiofolder.as_streaming_dataset() if streaming else audiofolder.as_dataset()\n    for split, data_files in data_files.items():\n        expected_num_of_audios = len(data_files) - 1  # don't count the metadata file\n        assert split in datasets\n        dataset = list(datasets[split])\n        assert len(dataset) == expected_num_of_audios\n        # make sure each sample has its own audio 
and metadata\n        assert len({example[\"audio\"].metadata.path for example in dataset}) == expected_num_of_audios\n        assert len({example[\"text\"] for example in dataset}) == expected_num_of_audios\n        assert all(example[\"text\"] is not None for example in dataset)\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_data_files_with_metadata_and_multiple_splits(streaming, cache_dir, data_files_with_two_splits_and_metadata):\n    data_files = data_files_with_two_splits_and_metadata\n    audiofolder = AudioFolder(data_files=data_files, cache_dir=cache_dir)\n    audiofolder.download_and_prepare()\n    datasets = audiofolder.as_streaming_dataset() if streaming else audiofolder.as_dataset()\n    for split, data_files in data_files.items():\n        expected_num_of_audios = len(data_files) - 1  # don't count the metadata file\n        assert split in datasets\n        dataset = list(datasets[split])\n        assert len(dataset) == expected_num_of_audios\n        # make sure each sample has its own audio and metadata\n        assert len({example[\"audio\"].metadata.path for example in dataset}) == expected_num_of_audios\n        assert len({example[\"text\"] for example in dataset}) == expected_num_of_audios\n        assert all(example[\"text\"] is not None for example in dataset)\n\n\n@require_torchcodec\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_with_zip_archives):\n    audiofolder = AudioFolder(data_files=data_files_with_zip_archives, cache_dir=cache_dir)\n    audiofolder.download_and_prepare()\n    datasets = audiofolder.as_streaming_dataset() if streaming else audiofolder.as_dataset()\n    for split, data_files in data_files_with_zip_archives.items():\n        num_of_archives = len(data_files)  # the metadata file is inside the archive\n        expected_num_of_audios = 2 * num_of_archives\n        assert split in 
datasets\n        dataset = list(datasets[split])\n        assert len(dataset) == expected_num_of_audios\n        # make sure each sample has its own audio (all arrays are different) and metadata\n        assert (\n            sum(\n                np.array_equal(\n                    dataset[0][\"audio\"].get_all_samples().data.numpy(), example[\"audio\"].get_all_samples().data.numpy()\n                )\n                for example in dataset[1:]\n            )\n            == 0\n        )\n        assert len({example[\"text\"] for example in dataset}) == expected_num_of_audios\n        assert all(example[\"text\"] is not None for example in dataset)\n\n\n@require_torchcodec\ndef test_data_files_with_wrong_metadata_file_name(cache_dir, tmp_path, audio_file):\n    data_dir = tmp_path / \"data_dir_with_bad_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    shutil.copyfile(audio_file, data_dir / \"audio_file.wav\")\n    audio_metadata_filename = data_dir / \"bad_metadata.jsonl\"  # bad file\n    audio_metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"audio_file.wav\", \"text\": \"Audio transcription\"}\n        \"\"\"\n    )\n    with open(audio_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(audio_metadata)\n\n    data_files_with_bad_metadata = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n    audiofolder = AudioFolder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir)\n    audiofolder.download_and_prepare()\n    dataset = audiofolder.as_dataset(split=\"train\")\n    # check that there are no metadata, since the metadata file name doesn't have the right name\n    assert \"text\" not in dataset.column_names\n\n\n@require_torchcodec\ndef test_data_files_with_custom_audio_file_name_column_in_metadata_file(cache_dir, tmp_path, audio_file):\n    data_dir = tmp_path / \"data_dir_with_custom_file_name_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    
shutil.copyfile(audio_file, data_dir / \"audio_file.wav\")\n    audio_metadata_filename = data_dir / \"metadata.jsonl\"\n    audio_metadata = textwrap.dedent(  # with custom column \"speech_file_name\" instead of \"file_name\"\n        \"\"\"\\\n        {\"speech_file_name\": \"audio_file.wav\", \"text\": \"Audio transcription\"}\n        \"\"\"\n    )\n    with open(audio_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(audio_metadata)\n\n    data_files_with_bad_metadata = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n    audiofolder = AudioFolder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir)\n    audiofolder.download_and_prepare()\n    dataset = audiofolder.as_dataset(split=\"train\")\n    assert \"speech\" in dataset.features\n    assert \"speech_file_name\" not in dataset.features\n\n\n@require_torchcodec\ndef test_data_files_with_with_metadata_in_different_formats(cache_dir, tmp_path, audio_file):\n    data_dir = tmp_path / \"data_dir_with_metadata_in_different_format\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    shutil.copyfile(audio_file, data_dir / \"audio_file.wav\")\n    audio_metadata_filename_jsonl = data_dir / \"metadata.jsonl\"\n    audio_metadata_jsonl = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"audio_file.wav\", \"text\": \"Audio transcription\"}\n        \"\"\"\n    )\n    with open(audio_metadata_filename_jsonl, \"w\", encoding=\"utf-8\") as f:\n        f.write(audio_metadata_jsonl)\n    audio_metadata_filename_csv = data_dir / \"metadata.csv\"\n    audio_metadata_csv = textwrap.dedent(\n        \"\"\"\\\n        file_name,text\n        audio_file.wav,Audio transcription\n        \"\"\"\n    )\n    with open(audio_metadata_filename_csv, \"w\", encoding=\"utf-8\") as f:\n        f.write(audio_metadata_csv)\n\n    data_files_with_bad_metadata = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n    audiofolder = 
AudioFolder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir)\n    with pytest.raises(ValueError) as exc_info:\n        audiofolder.download_and_prepare()\n    assert \"metadata files with different extensions\" in str(exc_info.value)\n"
  },
  {
    "path": "tests/packaged_modules/test_cache.py",
    "content": "from pathlib import Path\n\nimport pytest\n\nfrom datasets import load_dataset\nfrom datasets.packaged_modules.cache.cache import Cache\n\n\nSAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA = \"hf-internal-testing/audiofolder_single_config_in_metadata\"\nSAMPLE_DATASET_TWO_CONFIG_IN_METADATA = \"hf-internal-testing/audiofolder_two_configs_in_metadata\"\nSAMPLE_DATASET_CAPITAL_LETTERS_IN_NAME = \"hf-internal-testing/DatasetWithCapitalLetters\"\n\n\ndef test_cache(text_dir: Path, tmp_path: Path):\n    cache_dir = tmp_path / \"test_cache\"\n    ds = load_dataset(str(text_dir), cache_dir=str(cache_dir))\n    hash = Path(ds[\"train\"].cache_files[0][\"filename\"]).parts[-2]\n    cache = Cache(cache_dir=str(cache_dir), dataset_name=text_dir.name, hash=hash)\n    reloaded = cache.as_dataset()\n    assert list(ds) == list(reloaded)\n    assert list(ds[\"train\"]) == list(reloaded[\"train\"])\n\n\ndef test_cache_streaming(text_dir: Path, tmp_path: Path):\n    cache_dir = tmp_path / \"test_cache_streaming\"\n    ds = load_dataset(str(text_dir), cache_dir=str(cache_dir))\n    hash = Path(ds[\"train\"].cache_files[0][\"filename\"]).parts[-2]\n    cache = Cache(cache_dir=str(cache_dir), dataset_name=text_dir.name, hash=hash)\n    reloaded = cache.as_streaming_dataset()\n    assert list(ds) == list(reloaded)\n    assert list(ds[\"train\"]) == list(reloaded[\"train\"])\n\n\ndef test_cache_auto_hash(text_dir: Path, tmp_path: Path):\n    cache_dir = tmp_path / \"test_cache_auto_hash\"\n    ds = load_dataset(str(text_dir), cache_dir=str(cache_dir))\n    cache = Cache(cache_dir=str(cache_dir), dataset_name=text_dir.name, version=\"auto\", hash=\"auto\")\n    reloaded = cache.as_dataset()\n    assert list(ds) == list(reloaded)\n    assert list(ds[\"train\"]) == list(reloaded[\"train\"])\n\n\ndef test_cache_auto_hash_with_custom_config(text_dir: Path, tmp_path: Path):\n    cache_dir = tmp_path / \"test_cache_auto_hash_with_custom_config\"\n    ds = load_dataset(str(text_dir), 
sample_by=\"paragraph\", cache_dir=str(cache_dir))\n    another_ds = load_dataset(str(text_dir), cache_dir=str(cache_dir))\n    cache = Cache(\n        cache_dir=str(cache_dir), dataset_name=text_dir.name, version=\"auto\", hash=\"auto\", sample_by=\"paragraph\"\n    )\n    another_cache = Cache(cache_dir=str(cache_dir), dataset_name=text_dir.name, version=\"auto\", hash=\"auto\")\n    assert cache.config_id.endswith(\"paragraph\")\n    assert not another_cache.config_id.endswith(\"paragraph\")\n    reloaded = cache.as_dataset()\n    another_reloaded = another_cache.as_dataset()\n    assert list(ds) == list(reloaded)\n    assert list(ds[\"train\"]) == list(reloaded[\"train\"])\n    assert list(another_ds) == list(another_reloaded)\n    assert list(another_ds[\"train\"]) == list(another_reloaded[\"train\"])\n\n\ndef test_cache_missing(text_dir: Path, tmp_path: Path):\n    cache_dir = tmp_path / \"test_cache_missing\"\n    load_dataset(str(text_dir), cache_dir=str(cache_dir))\n    Cache(cache_dir=str(cache_dir), dataset_name=text_dir.name, version=\"auto\", hash=\"auto\").download_and_prepare()\n    with pytest.raises(ValueError):\n        Cache(cache_dir=str(cache_dir), dataset_name=\"missing\", version=\"auto\", hash=\"auto\").download_and_prepare()\n    with pytest.raises(ValueError):\n        Cache(cache_dir=str(cache_dir), dataset_name=text_dir.name, hash=\"missing\").download_and_prepare()\n    with pytest.raises(ValueError):\n        Cache(\n            cache_dir=str(cache_dir), dataset_name=text_dir.name, config_name=\"missing\", version=\"auto\", hash=\"auto\"\n        ).download_and_prepare()\n\n\n@pytest.mark.integration\ndef test_cache_multi_configs(tmp_path: Path):\n    cache_dir = tmp_path / \"test_cache_multi_configs\"\n    repo_id = SAMPLE_DATASET_TWO_CONFIG_IN_METADATA\n    dataset_name = repo_id.split(\"/\")[-1]\n    config_name = \"v1\"\n    ds = load_dataset(repo_id, config_name, cache_dir=str(cache_dir))\n    cache = Cache(\n        
cache_dir=str(cache_dir),\n        dataset_name=dataset_name,\n        repo_id=repo_id,\n        config_name=config_name,\n        version=\"auto\",\n        hash=\"auto\",\n    )\n    reloaded = cache.as_dataset()\n    assert list(ds) == list(reloaded)\n    assert len(ds[\"train\"]) == len(reloaded[\"train\"])\n    with pytest.raises(ValueError) as excinfo:\n        Cache(\n            cache_dir=str(cache_dir),\n            dataset_name=dataset_name,\n            repo_id=repo_id,\n            config_name=\"missing\",\n            version=\"auto\",\n            hash=\"auto\",\n        )\n    assert config_name in str(excinfo.value)\n\n\n@pytest.mark.integration\ndef test_cache_single_config(tmp_path: Path):\n    cache_dir = tmp_path / \"test_cache_single_config\"\n    repo_id = SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA\n    dataset_name = repo_id.split(\"/\")[-1]\n    config_name = \"custom\"\n    ds = load_dataset(repo_id, cache_dir=str(cache_dir))\n    cache = Cache(cache_dir=str(cache_dir), dataset_name=dataset_name, repo_id=repo_id, version=\"auto\", hash=\"auto\")\n    reloaded = cache.as_dataset()\n    assert list(ds) == list(reloaded)\n    assert len(ds[\"train\"]) == len(reloaded[\"train\"])\n    cache = Cache(\n        cache_dir=str(cache_dir),\n        dataset_name=dataset_name,\n        config_name=config_name,\n        repo_id=repo_id,\n        version=\"auto\",\n        hash=\"auto\",\n    )\n    reloaded = cache.as_dataset()\n    assert list(ds) == list(reloaded)\n    assert len(ds[\"train\"]) == len(reloaded[\"train\"])\n    with pytest.raises(ValueError) as excinfo:\n        Cache(\n            cache_dir=str(cache_dir),\n            dataset_name=dataset_name,\n            repo_id=repo_id,\n            config_name=\"missing\",\n            version=\"auto\",\n            hash=\"auto\",\n        )\n    assert config_name in str(excinfo.value)\n\n\n@pytest.mark.integration\ndef test_cache_capital_letters(tmp_path: Path):\n    cache_dir = tmp_path / 
\"test_cache_capital_letters\"\n    repo_id = SAMPLE_DATASET_CAPITAL_LETTERS_IN_NAME\n    dataset_name = repo_id.split(\"/\")[-1]\n    ds = load_dataset(repo_id, cache_dir=str(cache_dir))\n    cache = Cache(cache_dir=str(cache_dir), dataset_name=dataset_name, repo_id=repo_id, version=\"auto\", hash=\"auto\")\n    reloaded = cache.as_dataset()\n    assert list(ds) == list(reloaded)\n    assert len(ds[\"train\"]) == len(reloaded[\"train\"])\n    cache = Cache(\n        cache_dir=str(cache_dir),\n        dataset_name=dataset_name,\n        repo_id=repo_id,\n        version=\"auto\",\n        hash=\"auto\",\n    )\n    reloaded = cache.as_dataset()\n    assert list(ds) == list(reloaded)\n    assert len(ds[\"train\"]) == len(reloaded[\"train\"])\n"
  },
  {
    "path": "tests/packaged_modules/test_csv.py",
    "content": "import os\nimport textwrap\n\nimport pyarrow as pa\nimport pytest\n\nfrom datasets import ClassLabel, Features, Image\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesList\nfrom datasets.packaged_modules.csv.csv import Csv, CsvConfig\n\nfrom ..utils import require_pil\n\n\n@pytest.fixture\ndef csv_file(tmp_path):\n    filename = tmp_path / \"file.csv\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        header1,header2\n        1,2\n        10,20\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\n@pytest.fixture\ndef malformed_csv_file(tmp_path):\n    filename = tmp_path / \"malformed_file.csv\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        header1,header2\n        1,2\n        10,20,\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\n@pytest.fixture\ndef csv_file_with_image(tmp_path, image_file):\n    filename = tmp_path / \"csv_with_image.csv\"\n    data = textwrap.dedent(\n        f\"\"\"\\\n        image\n        {image_file}\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\n@pytest.fixture\ndef csv_file_with_label(tmp_path):\n    filename = tmp_path / \"csv_with_label.csv\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        label\n        good\n        bad\n        good\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\n@pytest.fixture\ndef csv_file_with_int_list(tmp_path):\n    filename = tmp_path / \"csv_with_int_list.csv\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        int_list\n        1 2 3\n        4 5 6\n        7 8 9\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with 
pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = CsvConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = CsvConfig(name=\"name\", data_files=data_files)\n\n\ndef test_csv_generate_tables_raises_error_with_malformed_csv(csv_file, malformed_csv_file, caplog):\n    csv = Csv()\n    base_files = [csv_file, malformed_csv_file]\n    files_iterables = [[file] for file in base_files]\n    generator = csv._generate_tables(base_files=base_files, files_iterables=files_iterables)\n    with pytest.raises(ValueError, match=\"Error tokenizing data\"):\n        for _ in generator:\n            pass\n    assert any(\n        record.levelname == \"ERROR\"\n        and \"Failed to read file\" in record.message\n        and os.path.basename(malformed_csv_file) in record.message\n        for record in caplog.records\n    )\n\n\n@require_pil\ndef test_csv_cast_image(csv_file_with_image):\n    with open(csv_file_with_image, encoding=\"utf-8\") as f:\n        image_file = f.read().splitlines()[1]\n    csv = Csv(encoding=\"utf-8\", features=Features({\"image\": Image()}))\n    base_files = [csv_file_with_image]\n    files_iterables = [[file] for file in base_files]\n    generator = csv._generate_tables(base_files=base_files, files_iterables=files_iterables)\n    pa_table = pa.concat_tables([table for _, table in generator])\n    assert pa_table.schema.field(\"image\").type == Image()()\n    generated_content = pa_table.to_pydict()[\"image\"]\n    assert generated_content == [{\"path\": image_file, \"bytes\": None}]\n\n\ndef test_csv_cast_label(csv_file_with_label):\n    with open(csv_file_with_label, encoding=\"utf-8\") as f:\n        labels = f.read().splitlines()[1:]\n    csv = Csv(encoding=\"utf-8\", 
features=Features({\"label\": ClassLabel(names=[\"good\", \"bad\"])}))\n    base_files = [csv_file_with_label]\n    files_iterables = [[file] for file in base_files]\n    generator = csv._generate_tables(base_files=base_files, files_iterables=files_iterables)\n    pa_table = pa.concat_tables([table for _, table in generator])\n    assert pa_table.schema.field(\"label\").type == ClassLabel(names=[\"good\", \"bad\"])()\n    generated_content = pa_table.to_pydict()[\"label\"]\n    assert generated_content == [ClassLabel(names=[\"good\", \"bad\"]).str2int(label) for label in labels]\n\n\ndef test_csv_convert_int_list(csv_file_with_int_list):\n    csv = Csv(encoding=\"utf-8\", sep=\",\", converters={\"int_list\": lambda x: [int(i) for i in x.split()]})\n    base_files = [csv_file_with_int_list]\n    files_iterables = [[file] for file in base_files]\n    generator = csv._generate_tables(base_files=base_files, files_iterables=files_iterables)\n    pa_table = pa.concat_tables([table for _, table in generator])\n    assert pa.types.is_list(pa_table.schema.field(\"int_list\").type)\n    generated_content = pa_table.to_pydict()[\"int_list\"]\n    assert generated_content == [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n"
  },
  {
    "path": "tests/packaged_modules/test_folder_based_builder.py",
    "content": "import importlib\nimport shutil\nimport textwrap\n\nimport pytest\n\nfrom datasets import ClassLabel, DownloadManager\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesDict, DataFilesList, get_data_patterns\nfrom datasets.download.streaming_download_manager import StreamingDownloadManager\nfrom datasets.packaged_modules.folder_based_builder.folder_based_builder import (\n    FolderBasedBuilder,\n    FolderBasedBuilderConfig,\n)\n\n\nremote_files = [\n    \"https://huggingface.co/datasets/hf-internal-testing/textfolder/resolve/main/hallo.txt\",\n    \"https://huggingface.co/datasets/hf-internal-testing/textfolder/resolve/main/hello.txt\",\n    \"https://huggingface.co/datasets/hf-internal-testing/textfolder/resolve/main/class1/bonjour.txt\",\n    \"https://huggingface.co/datasets/hf-internal-testing/textfolder/resolve/main/class1/bonjour2.txt\",\n]\n\n\nclass DummyFeature:\n    pass\n\n\nclass DummyFolderBasedBuilder(FolderBasedBuilder):\n    BASE_FEATURE = DummyFeature\n    BASE_COLUMN_NAME = \"base\"\n    BUILDER_CONFIG_CLASS = FolderBasedBuilderConfig\n    EXTENSIONS = [\".txt\"]\n\n\n@pytest.fixture\ndef cache_dir(tmp_path):\n    return str(tmp_path / \"autofolder_cache_dir\")\n\n\n@pytest.fixture\ndef auto_text_file(text_file):\n    return str(text_file)\n\n\n@pytest.fixture\ndef data_files_with_labels_no_metadata(tmp_path, auto_text_file):\n    data_dir = tmp_path / \"data_files_with_labels_no_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    subdir_class_0 = data_dir / \"class0\"\n    subdir_class_0.mkdir(parents=True, exist_ok=True)\n    subdir_class_1 = data_dir / \"class1\"\n    subdir_class_1.mkdir(parents=True, exist_ok=True)\n\n    filename = subdir_class_0 / \"file0.txt\"\n    shutil.copyfile(auto_text_file, filename)\n    filename2 = subdir_class_1 / \"file1.txt\"\n    shutil.copyfile(auto_text_file, filename2)\n\n    data_files_with_labels_no_metadata = 
DataFilesDict.from_patterns(\n        get_data_patterns(str(data_dir)), data_dir.as_posix()\n    )\n\n    return data_files_with_labels_no_metadata\n\n\n@pytest.fixture\ndef data_files_with_different_levels_no_metadata(tmp_path, auto_text_file):\n    data_dir = tmp_path / \"data_files_with_different_levels\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    subdir_class_0 = data_dir / \"class0\"\n    subdir_class_0.mkdir(parents=True, exist_ok=True)\n    subdir_class_1 = data_dir / \"subdir\" / \"class1\"\n    subdir_class_1.mkdir(parents=True, exist_ok=True)\n\n    filename = subdir_class_0 / \"file0.txt\"\n    shutil.copyfile(auto_text_file, filename)\n    filename2 = subdir_class_1 / \"file1.txt\"\n    shutil.copyfile(auto_text_file, filename2)\n\n    data_files_with_different_levels = DataFilesDict.from_patterns(\n        get_data_patterns(str(data_dir)), data_dir.as_posix()\n    )\n\n    return data_files_with_different_levels\n\n\n@pytest.fixture\ndef data_files_with_one_label_no_metadata(tmp_path, auto_text_file):\n    # only one label found = all files in a single dir/in a root dir\n    data_dir = tmp_path / \"data_files_with_one_label\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n\n    filename = data_dir / \"file0.txt\"\n    shutil.copyfile(auto_text_file, filename)\n    filename2 = data_dir / \"file1.txt\"\n    shutil.copyfile(auto_text_file, filename2)\n\n    data_files_with_one_label = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n\n    return data_files_with_one_label\n\n\n@pytest.fixture\ndef files_with_labels_and_duplicated_label_key_in_metadata(tmp_path, auto_text_file):\n    data_dir = tmp_path / \"files_with_labels_and_label_key_in_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    subdir_class_0 = data_dir / \"class0\"\n    subdir_class_0.mkdir(parents=True, exist_ok=True)\n    subdir_class_1 = data_dir / \"class1\"\n    subdir_class_1.mkdir(parents=True, exist_ok=True)\n\n    
filename = subdir_class_0 / \"file_class0.txt\"\n    shutil.copyfile(auto_text_file, filename)\n    filename2 = subdir_class_1 / \"file_class1.txt\"\n    shutil.copyfile(auto_text_file, filename2)\n\n    metadata_filename = tmp_path / data_dir / \"metadata.jsonl\"\n    metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"class0/file_class0.txt\", \"additional_feature\": \"First dummy file\", \"label\": \"CLASS_0\"}\n        {\"file_name\": \"class1/file_class1.txt\", \"additional_feature\": \"Second dummy file\", \"label\": \"CLASS_1\"}\n        \"\"\"\n    )\n    with open(metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(metadata)\n\n    return str(filename), str(filename2), str(metadata_filename)\n\n\n@pytest.fixture\ndef file_with_metadata(tmp_path, text_file):\n    filename = tmp_path / \"file.txt\"\n    shutil.copyfile(text_file, filename)\n    metadata_filename = tmp_path / \"metadata.jsonl\"\n    metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"file.txt\", \"additional_feature\": \"Dummy file\"}\n        \"\"\"\n    )\n    with open(metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(metadata)\n    return str(filename), str(metadata_filename)\n\n\n@pytest.fixture\ndef data_files_with_one_split_and_metadata(tmp_path, auto_text_file):\n    data_dir = tmp_path / \"autofolder_data_dir_with_metadata_one_split\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    subdir = data_dir / \"subdir\"\n    subdir.mkdir(parents=True, exist_ok=True)\n\n    filename = data_dir / \"file.txt\"\n    shutil.copyfile(auto_text_file, filename)\n    filename2 = data_dir / \"file2.txt\"\n    shutil.copyfile(auto_text_file, filename2)\n    filename3 = subdir / \"file3.txt\"  # in subdir\n    shutil.copyfile(auto_text_file, filename3)\n\n    metadata_filename = data_dir / \"metadata.jsonl\"\n    metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"file.txt\", \"additional_feature\": 
\"Dummy file\"}\n        {\"file_name\": \"file2.txt\", \"additional_feature\": \"Second dummy file\"}\n        {\"file_name\": \"./subdir/file3.txt\", \"additional_feature\": \"Third dummy file\"}\n        \"\"\"\n    )\n    with open(metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(metadata)\n    data_files_with_one_split_and_metadata = DataFilesDict.from_patterns(\n        get_data_patterns(str(data_dir)), data_dir.as_posix()\n    )\n    assert len(data_files_with_one_split_and_metadata) == 1\n    assert len(data_files_with_one_split_and_metadata[\"train\"]) == 4\n    return data_files_with_one_split_and_metadata\n\n\n@pytest.fixture\ndef data_files_with_two_splits_and_metadata(tmp_path, auto_text_file):\n    data_dir = tmp_path / \"autofolder_data_dir_with_metadata_two_splits\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    train_dir = data_dir / \"train\"\n    train_dir.mkdir(parents=True, exist_ok=True)\n    test_dir = data_dir / \"test\"\n    test_dir.mkdir(parents=True, exist_ok=True)\n\n    filename = train_dir / \"file.txt\"  # train\n    shutil.copyfile(auto_text_file, filename)\n    filename2 = train_dir / \"file2.txt\"  # train\n    shutil.copyfile(auto_text_file, filename2)\n    filename3 = test_dir / \"file3.txt\"  # test\n    shutil.copyfile(auto_text_file, filename3)\n\n    train_metadata_filename = train_dir / \"metadata.jsonl\"\n    train_metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"file.txt\", \"additional_feature\": \"Train dummy file\"}\n        {\"file_name\": \"file2.txt\", \"additional_feature\": \"Second train dummy file\"}\n        \"\"\"\n    )\n    with open(train_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(train_metadata)\n    test_metadata_filename = test_dir / \"metadata.jsonl\"\n    test_metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"file3.txt\", \"additional_feature\": \"Test dummy file\"}\n        \"\"\"\n    )\n    with 
open(test_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(test_metadata)\n    data_files_with_two_splits_and_metadata = DataFilesDict.from_patterns(\n        get_data_patterns(str(data_dir)), data_dir.as_posix()\n    )\n    assert len(data_files_with_two_splits_and_metadata) == 2\n    assert len(data_files_with_two_splits_and_metadata[\"train\"]) == 3\n    assert len(data_files_with_two_splits_and_metadata[\"test\"]) == 2\n    return data_files_with_two_splits_and_metadata\n\n\n@pytest.fixture\ndef data_files_with_zip_archives(tmp_path, auto_text_file):\n    data_dir = tmp_path / \"autofolder_data_dir_with_zip_archives\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    archive_dir = data_dir / \"archive\"\n    archive_dir.mkdir(parents=True, exist_ok=True)\n    subdir = archive_dir / \"subdir\"\n    subdir.mkdir(parents=True, exist_ok=True)\n\n    filename = archive_dir / \"file.txt\"\n    shutil.copyfile(auto_text_file, filename)\n    filename2 = subdir / \"file2.txt\"  # in subdir\n    shutil.copyfile(auto_text_file, filename2)\n\n    metadata_filename = archive_dir / \"metadata.jsonl\"\n    metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"file.txt\", \"additional_feature\": \"Dummy file\"}\n        {\"file_name\": \"subdir/file2.txt\", \"additional_feature\": \"Second dummy file\"}\n        \"\"\"\n    )\n    with open(metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(metadata)\n\n    shutil.make_archive(archive_dir, \"zip\", archive_dir)\n    shutil.rmtree(str(archive_dir))\n\n    data_files_with_zip_archives = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n\n    assert len(data_files_with_zip_archives) == 1\n    assert len(data_files_with_zip_archives[\"train\"]) == 1\n    return data_files_with_zip_archives\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = 
FolderBasedBuilderConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = FolderBasedBuilderConfig(name=\"name\", data_files=data_files)\n\n\ndef test_inferring_labels_from_data_dirs(data_files_with_labels_no_metadata, cache_dir):\n    autofolder = DummyFolderBasedBuilder(\n        data_files=data_files_with_labels_no_metadata, cache_dir=cache_dir, drop_labels=False\n    )\n    gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    assert autofolder.info.features[\"label\"] == ClassLabel(names=[\"class0\", \"class1\"])\n    generator = autofolder._generate_examples(**gen_kwargs)\n    assert all(example[\"label\"] in {\"class0\", \"class1\"} for _, example in generator)\n\n\ndef test_default_folder_builder_not_usable(data_files_with_labels_no_metadata, cache_dir):\n    # builder would try to access non-existing attributes of a default `BuilderConfig` class\n    # as a custom one is not provided\n    with pytest.raises(AttributeError):\n        _ = FolderBasedBuilder(\n            data_files=data_files_with_labels_no_metadata,\n            cache_dir=cache_dir,\n        )\n\n\n# test that AutoFolder is extended for streaming when its child class is instantiated:\n# see line 115 in src/datasets/streaming.py\ndef test_streaming_patched():\n    _ = DummyFolderBasedBuilder(data_dir=\".\")\n    module = importlib.import_module(FolderBasedBuilder.__module__)\n    assert hasattr(module, \"_patched_for_streaming\")\n    assert module._patched_for_streaming\n\n\n@pytest.mark.parametrize(\"drop_metadata\", [None, True, False])\n@pytest.mark.parametrize(\"drop_labels\", [None, True, False])\ndef test_generate_examples_drop_labels(\n    data_files_with_labels_no_metadata, auto_text_file, 
drop_metadata, drop_labels, cache_dir\n):\n    autofolder = DummyFolderBasedBuilder(\n        data_files=data_files_with_labels_no_metadata,\n        drop_metadata=drop_metadata,\n        drop_labels=drop_labels,\n        cache_dir=cache_dir,\n    )\n    gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    # removing labels explicitly requires drop_labels=True\n    assert gen_kwargs[\"add_labels\"] is not bool(drop_labels)\n    assert gen_kwargs[\"add_metadata\"] is False\n    generator = autofolder._generate_examples(**gen_kwargs)\n    if not drop_labels:\n        assert all(\n            example.keys() == {\"base\", \"label\"} and all(val is not None for val in example.values())\n            for _, example in generator\n        )\n    else:\n        assert all(\n            example.keys() == {\"base\"} and all(val is not None for val in example.values()) for _, example in generator\n        )\n\n\n@pytest.mark.parametrize(\"drop_metadata\", [None, True, False])\n@pytest.mark.parametrize(\"drop_labels\", [None, True, False])\ndef test_generate_examples_drop_metadata(file_with_metadata, drop_metadata, drop_labels, cache_dir):\n    file, metadata_file = file_with_metadata\n    autofolder = DummyFolderBasedBuilder(\n        data_files=[file, metadata_file],\n        drop_metadata=drop_metadata,\n        drop_labels=drop_labels,\n        cache_dir=cache_dir,\n    )\n    gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    # since the dataset has metadata, removing the metadata explicitly requires drop_metadata=True\n    assert gen_kwargs[\"add_metadata\"] is not bool(drop_metadata)\n    # since the dataset has metadata, adding the labels explicitly requires drop_labels=False\n    assert gen_kwargs[\"add_labels\"] is False\n    generator = autofolder._generate_examples(**gen_kwargs)\n    expected_columns = {\"base\"}\n    if gen_kwargs[\"add_metadata\"]:\n        
expected_columns.add(\"additional_feature\")\n    if gen_kwargs[\"add_labels\"]:\n        expected_columns.add(\"label\")\n    result = [example for _, example in generator]\n    assert len(result) == 1\n    example = result[0]\n    assert example.keys() == expected_columns\n    for column in expected_columns:\n        assert example[column] is not None\n\n\n@pytest.mark.parametrize(\"remote\", [True, False])\n@pytest.mark.parametrize(\"drop_labels\", [None, True, False])\ndef test_data_files_with_different_levels_no_metadata(\n    data_files_with_different_levels_no_metadata, drop_labels, remote, cache_dir\n):\n    data_files = remote_files if remote else data_files_with_different_levels_no_metadata\n    autofolder = DummyFolderBasedBuilder(\n        data_files=data_files,\n        cache_dir=cache_dir,\n        drop_labels=drop_labels,\n    )\n    gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    generator = autofolder._generate_examples(**gen_kwargs)\n    if drop_labels is not False:\n        # with None (default) we should drop labels if files are on different levels in dir structure\n        assert \"label\" not in autofolder.info.features\n        assert all(example.keys() == {\"base\"} for _, example in generator)\n    else:\n        assert \"label\" in autofolder.info.features\n        assert isinstance(autofolder.info.features[\"label\"], ClassLabel)\n        assert all(example.keys() == {\"base\", \"label\"} for _, example in generator)\n\n\n@pytest.mark.parametrize(\"remote\", [False, True])\n@pytest.mark.parametrize(\"drop_labels\", [None, True, False])\ndef test_data_files_with_one_label_no_metadata(data_files_with_one_label_no_metadata, drop_labels, remote, cache_dir):\n    data_files = remote_files[:2] if remote else data_files_with_one_label_no_metadata\n    autofolder = DummyFolderBasedBuilder(\n        data_files=data_files,\n        cache_dir=cache_dir,\n        drop_labels=drop_labels,\n    )\n    gen_kwargs 
= autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    generator = autofolder._generate_examples(**gen_kwargs)\n    if drop_labels is not False:\n        # with None (default) we should drop labels if only one label is found (=if there is a single dir)\n        assert \"label\" not in autofolder.info.features\n        assert all(example.keys() == {\"base\"} for _, example in generator)\n    else:\n        assert \"label\" in autofolder.info.features\n        assert isinstance(autofolder.info.features[\"label\"], ClassLabel)\n        assert all(example.keys() == {\"base\", \"label\"} for _, example in generator)\n\n\n@pytest.mark.parametrize(\"streaming\", [False, True])\n@pytest.mark.parametrize(\"n_splits\", [1, 2])\ndef test_data_files_with_metadata_and_splits(\n    streaming, cache_dir, n_splits, data_files_with_one_split_and_metadata, data_files_with_two_splits_and_metadata\n):\n    data_files = data_files_with_one_split_and_metadata if n_splits == 1 else data_files_with_two_splits_and_metadata\n    autofolder = DummyFolderBasedBuilder(\n        data_files=data_files,\n        cache_dir=cache_dir,\n    )\n    download_manager = StreamingDownloadManager() if streaming else DownloadManager()\n    generated_splits = autofolder._split_generators(download_manager)\n    for (split, files), generated_split in zip(data_files.items(), generated_splits):\n        assert split == generated_split.name\n        expected_num_of_examples = len(files) - 1\n        generated_examples = list(autofolder._generate_examples(**generated_split.gen_kwargs))\n        assert len(generated_examples) == expected_num_of_examples\n        assert len({example[\"base\"] for _, example in generated_examples}) == expected_num_of_examples\n        assert len({example[\"additional_feature\"] for _, example in generated_examples}) == expected_num_of_examples\n        assert all(example[\"additional_feature\"] is not None for _, example in 
generated_examples)\n\n\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_with_zip_archives):\n    autofolder = DummyFolderBasedBuilder(data_files=data_files_with_zip_archives, cache_dir=cache_dir)\n    download_manager = StreamingDownloadManager() if streaming else DownloadManager()\n    generated_splits = autofolder._split_generators(download_manager)\n    for (split, files), generated_split in zip(data_files_with_zip_archives.items(), generated_splits):\n        assert split == generated_split.name\n        num_of_archives = len(files)\n        expected_num_of_examples = 2 * num_of_archives\n        generated_examples = list(autofolder._generate_examples(**generated_split.gen_kwargs))\n        assert len(generated_examples) == expected_num_of_examples\n        assert len({example[\"base\"] for _, example in generated_examples}) == expected_num_of_examples\n        assert len({example[\"additional_feature\"] for _, example in generated_examples}) == expected_num_of_examples\n        assert all(example[\"additional_feature\"] is not None for _, example in generated_examples)\n\n\ndef test_data_files_with_wrong_metadata_file_name(cache_dir, tmp_path, auto_text_file):\n    data_dir = tmp_path / \"data_dir_with_bad_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    shutil.copyfile(auto_text_file, data_dir / \"file.txt\")\n    metadata_filename = data_dir / \"bad_metadata.jsonl\"  # bad file\n    metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"file.txt\", \"additional_feature\": \"Dummy file\"}\n        \"\"\"\n    )\n    with open(metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(metadata)\n\n    data_files_with_bad_metadata = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n    autofolder = DummyFolderBasedBuilder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir)\n    
gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    generator = autofolder._generate_examples(**gen_kwargs)\n    assert all(\"additional_feature\" not in example for _, example in generator)\n\n\ndef test_data_files_with_custom_file_name_column_in_metadata_file(cache_dir, tmp_path, auto_text_file):\n    data_dir = tmp_path / \"data_dir_with_custom_file_name_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    shutil.copyfile(auto_text_file, data_dir / \"file.txt\")\n    metadata_filename = data_dir / \"metadata.jsonl\"\n    metadata = textwrap.dedent(  # with custom column \"text_file_name\" instead of \"file_name\"\n        \"\"\"\\\n        {\"text_file_name\": \"file.txt\", \"additional_feature\": \"Dummy file\"}\n        \"\"\"\n    )\n    with open(metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(metadata)\n\n    data_files_with_bad_metadata = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n    autofolder = DummyFolderBasedBuilder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir)\n    gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    generator = autofolder._generate_examples(**gen_kwargs)\n    assert all(\"text\" in example and \"text_file_name\" not in example for _, example in generator)\n\n\ndef test_data_files_with_custom_file_names_column_in_metadata_file_large_string_list(\n    cache_dir, tmp_path, auto_text_file\n):\n    import pyarrow as pa\n    import pyarrow.parquet as pq\n\n    data_dir = tmp_path / \"data_dir_with_custom_file_names_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    shutil.copyfile(auto_text_file, data_dir / \"file.txt\")\n    metadata_filename = data_dir / \"metadata.parquet\"\n    pq.write_table(\n        pa.Table.from_arrays(\n            [\n                pa.array([[\"file.txt\"]], type=pa.list_(pa.large_string())),\n                pa.array([\"Dummy file\"], 
type=pa.large_string()),\n            ],\n            names=[\"text_file_names\", \"additional_feature\"],\n        ),\n        metadata_filename,\n    )\n\n    data_files_with_metadata = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n    autofolder = DummyFolderBasedBuilder(data_files=data_files_with_metadata, cache_dir=cache_dir)\n    gen_kwargs = autofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    generator = autofolder._generate_examples(**gen_kwargs)\n    examples = [example for _, example in generator]\n    assert len(examples) == 1\n    assert \"text\" in examples[0] and \"text_file_names\" not in examples[0]\n    assert len(examples[0][\"text\"]) == 1 and examples[0][\"text\"][0].endswith(\"file.txt\")\n"
  },
  {
    "path": "tests/packaged_modules/test_hdf5.py",
    "content": "import h5py\nimport numpy as np\nimport pytest\n\nfrom datasets import Array2D, Array3D, Array4D, Features, List, Value, load_dataset\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesList\nfrom datasets.exceptions import DatasetGenerationError\nfrom datasets.packaged_modules.hdf5.hdf5 import HDF5, HDF5Config\n\n\n@pytest.fixture\ndef hdf5_file(tmp_path):\n    \"\"\"Create a basic HDF5 file with numeric datasets.\"\"\"\n    filename = tmp_path / \"basic.h5\"\n    n_rows = 5\n\n    with h5py.File(filename, \"w\") as f:\n        f.create_dataset(\"int32\", data=np.arange(n_rows, dtype=np.int32))\n        f.create_dataset(\"float32\", data=np.arange(n_rows, dtype=np.float32) / 10.0)\n        f.create_dataset(\"bool\", data=np.array([True, False, True, False, True]))\n\n    return str(filename)\n\n\n@pytest.fixture\ndef hdf5_file_with_groups(tmp_path):\n    \"\"\"Create an HDF5 file with nested groups.\"\"\"\n    filename = tmp_path / \"nested.h5\"\n    n_rows = 3\n\n    with h5py.File(filename, \"w\") as f:\n        f.create_dataset(\"root_data\", data=np.arange(n_rows, dtype=np.int32))\n        grp = f.create_group(\"group1\")\n        grp.create_dataset(\"group_data\", data=np.arange(n_rows, dtype=np.float32))\n        subgrp = grp.create_group(\"subgroup\")\n        subgrp.create_dataset(\"sub_data\", data=np.arange(n_rows, dtype=np.int64))\n\n    return str(filename)\n\n\n@pytest.fixture\ndef hdf5_file_with_arrays(tmp_path):\n    \"\"\"Create an HDF5 file with multi-dimensional arrays.\"\"\"\n    filename = tmp_path / \"arrays.h5\"\n    n_rows = 4\n\n    with h5py.File(filename, \"w\") as f:\n        # 2D array (should become Array2D)\n        f.create_dataset(\"matrix_2d\", data=np.random.randn(n_rows, 3, 4).astype(np.float32))\n        # 3D array (should become Array3D)\n        f.create_dataset(\"tensor_3d\", data=np.random.randn(n_rows, 2, 3, 4).astype(np.float64))\n        # 4D array (should become 
Array4D)\n        f.create_dataset(\"tensor_4d\", data=np.random.randn(n_rows, 2, 3, 4, 5).astype(np.float32))\n        # 5D array (should become Array5D)\n        f.create_dataset(\"tensor_5d\", data=np.random.randn(n_rows, 2, 3, 4, 5, 6).astype(np.float64))\n        # 1D array (should become Value)\n        f.create_dataset(\"vector_1d\", data=np.random.randn(n_rows, 10).astype(np.float32))\n\n    return str(filename)\n\n\n@pytest.fixture\ndef hdf5_file_with_different_dtypes(tmp_path):\n    \"\"\"Create an HDF5 file with various numeric dtypes.\"\"\"\n    filename = tmp_path / \"dtypes.h5\"\n    n_rows = 3\n\n    with h5py.File(filename, \"w\") as f:\n        f.create_dataset(\"int8\", data=np.arange(n_rows, dtype=np.int8))\n        f.create_dataset(\"int16\", data=np.arange(n_rows, dtype=np.int16))\n        f.create_dataset(\"int64\", data=np.arange(n_rows, dtype=np.int64))\n        f.create_dataset(\"uint8\", data=np.arange(n_rows, dtype=np.uint8))\n        f.create_dataset(\"uint16\", data=np.arange(n_rows, dtype=np.uint16))\n        f.create_dataset(\"uint32\", data=np.arange(n_rows, dtype=np.uint32))\n        f.create_dataset(\"uint64\", data=np.arange(n_rows, dtype=np.uint64))\n        f.create_dataset(\"float16\", data=np.arange(n_rows, dtype=np.float16) / 10.0)\n        f.create_dataset(\"float64\", data=np.arange(n_rows, dtype=np.float64) / 10.0)\n        f.create_dataset(\"bytes\", data=np.array([b\"row_%d\" % i for i in range(n_rows)], dtype=\"S10\"))\n\n    return str(filename)\n\n\n@pytest.fixture\ndef hdf5_file_with_vlen_arrays(tmp_path):\n    \"\"\"Create an HDF5 file with variable-length arrays using HDF5's vlen_dtype.\"\"\"\n    filename = tmp_path / \"vlen.h5\"\n    n_rows = 4\n\n    with h5py.File(filename, \"w\") as f:\n        # Variable-length arrays of different sizes using vlen_dtype\n        vlen_arrays = [[1, 2, 3], [4, 5], [6, 7, 8, 9], [10]]\n        # Create variable-length int dataset using vlen_dtype\n        dt = 
h5py.vlen_dtype(np.dtype(\"int32\"))\n        dset = f.create_dataset(\"vlen_ints\", (n_rows,), dtype=dt)\n        for i, arr in enumerate(vlen_arrays):\n            dset[i] = arr\n\n        # Mixed types (some empty arrays) - use variable-length with empty arrays\n        mixed_data = [\n            [1, 2, 3],\n            [],  # Empty array\n            [4, 5],\n            [6],\n        ]\n        dt_mixed = h5py.vlen_dtype(np.dtype(\"int32\"))\n        dset_mixed = f.create_dataset(\"mixed_data\", (n_rows,), dtype=dt_mixed)\n        for i, arr in enumerate(mixed_data):\n            dset_mixed[i] = arr\n\n    return str(filename)\n\n\n@pytest.fixture\ndef hdf5_file_with_variable_length_strings(tmp_path):\n    \"\"\"Create an HDF5 file with variable-length string datasets.\"\"\"\n    filename = tmp_path / \"var_strings.h5\"\n    n_rows = 4\n\n    with h5py.File(filename, \"w\") as f:\n        # Variable-length string dataset\n        var_strings = [\"short\", \"medium length string\", \"very long string with many characters\", \"tiny\"]\n        # Create variable-length string dataset using vlen_dtype\n        dt = h5py.vlen_dtype(str)\n        dset = f.create_dataset(\"var_strings\", (n_rows,), dtype=dt)\n        for i, s in enumerate(var_strings):\n            dset[i] = s\n\n        # Variable-length bytes dataset\n        var_bytes = [b\"short\", b\"medium length bytes\", b\"very long bytes with many characters\", b\"tiny\"]\n        dt_bytes = h5py.vlen_dtype(bytes)\n        dset_bytes = f.create_dataset(\"var_bytes\", (n_rows,), dtype=dt_bytes)\n        for i, b in enumerate(var_bytes):\n            dset_bytes[i] = b\n\n    return str(filename)\n\n\n@pytest.fixture\ndef hdf5_file_with_complex_data(tmp_path):\n    \"\"\"Create an HDF5 file with complex number datasets.\"\"\"\n    filename = tmp_path / \"complex.h5\"\n\n    with h5py.File(filename, \"w\") as f:\n        # Complex numbers\n        complex_data = np.array([1 + 2j, 3 + 4j, 5 + 6j, 7 + 8j], 
dtype=np.complex64)\n        f.create_dataset(\"complex_64\", data=complex_data)\n\n        # Complex double precision\n        complex_double = np.array([1.5 + 2.5j, 3.5 + 4.5j, 5.5 + 6.5j, 7.5 + 8.5j], dtype=np.complex128)\n        f.create_dataset(\"complex_128\", data=complex_double)\n\n        # Complex array\n        complex_array = np.array(\n            [[1 + 2j, 3 + 4j], [5 + 6j, 7 + 8j], [9 + 10j, 11 + 12j], [13 + 14j, 15 + 16j]], dtype=np.complex64\n        )\n        f.create_dataset(\"complex_array\", data=complex_array)\n\n    return str(filename)\n\n\n@pytest.fixture\ndef hdf5_file_with_compound_data(tmp_path):\n    \"\"\"Create an HDF5 file with compound/structured datasets.\"\"\"\n    filename = tmp_path / \"compound.h5\"\n\n    with h5py.File(filename, \"w\") as f:\n        # Simple compound type\n        dt_simple = np.dtype([(\"x\", \"i4\"), (\"y\", \"f8\")])\n        compound_simple = np.array([(1, 2.5), (3, 4.5), (5, 6.5)], dtype=dt_simple)\n        f.create_dataset(\"simple_compound\", data=compound_simple)\n\n        # Compound type with complex numbers\n        dt_complex = np.dtype([(\"real\", \"f4\"), (\"imag\", \"f4\")])\n        compound_complex = np.array([(1.0, 2.0), (3.0, 4.0), (5.0, 6.0)], dtype=dt_complex)\n        f.create_dataset(\"complex_compound\", data=compound_complex)\n\n        # Nested compound type\n        dt_nested = np.dtype([(\"position\", [(\"x\", \"i4\"), (\"y\", \"i4\")]), (\"velocity\", [(\"vx\", \"f4\"), (\"vy\", \"f4\")])])\n        compound_nested = np.array([((1, 2), (1.5, 2.5)), ((3, 4), (3.5, 4.5)), ((5, 6), (5.5, 6.5))], dtype=dt_nested)\n        f.create_dataset(\"nested_compound\", data=compound_nested)\n\n    return str(filename)\n\n\n@pytest.fixture\ndef hdf5_file_with_compound_complex_arrays(tmp_path):\n    \"\"\"Create an HDF5 file with compound datasets containing complex arrays.\"\"\"\n    filename = tmp_path / \"compound_complex_arrays.h5\"\n\n    with h5py.File(filename, \"w\") as f:\n        # 
Compound type with complex arrays\n        dt_complex_arrays = np.dtype(\n            [\n                (\"position\", [(\"x\", \"i4\"), (\"y\", \"i4\")]),\n                (\"complex_field\", \"c8\"),\n                (\"complex_array\", \"c8\", (2, 3)),\n                (\"nested_complex\", [(\"real\", \"f4\"), (\"imag\", \"f4\")]),\n            ]\n        )\n\n        # Create data with complex numbers\n        compound_data = np.array(\n            [\n                (\n                    (1, 2),\n                    1.0 + 2.0j,\n                    [[1.0 + 2.0j, 3.0 + 4.0j, 5.0 + 6.0j], [7.0 + 8.0j, 9.0 + 10.0j, 11.0 + 12.0j]],\n                    (1.5, 2.5),\n                ),\n                (\n                    (3, 4),\n                    3.0 + 4.0j,\n                    [[13.0 + 14.0j, 15.0 + 16.0j, 17.0 + 18.0j], [19.0 + 20.0j, 21.0 + 22.0j, 23.0 + 24.0j]],\n                    (3.5, 4.5),\n                ),\n                (\n                    (5, 6),\n                    5.0 + 6.0j,\n                    [[25.0 + 26.0j, 27.0 + 28.0j, 29.0 + 30.0j], [31.0 + 32.0j, 33.0 + 34.0j, 35.0 + 36.0j]],\n                    (5.5, 6.5),\n                ),\n            ],\n            dtype=dt_complex_arrays,\n        )\n\n        f.create_dataset(\"compound_with_complex\", data=compound_data)\n\n    return str(filename)\n\n\n@pytest.fixture\ndef hdf5_file_with_mismatched_lengths(tmp_path):\n    \"\"\"Create an HDF5 file with datasets of different lengths (should raise error).\"\"\"\n    filename = tmp_path / \"mismatched.h5\"\n\n    with h5py.File(filename, \"w\") as f:\n        f.create_dataset(\"data1\", data=np.arange(5, dtype=np.int32))\n        # Dataset with 3 rows (mismatched)\n        f.create_dataset(\"data2\", data=np.arange(3, dtype=np.int32))\n        f.create_dataset(\"data3\", data=np.random.randn(5, 3, 4).astype(np.float32))\n        f.create_dataset(\"data4\", data=np.arange(5, dtype=np.float64) / 10.0)\n        
f.create_dataset(\"data5\", data=np.array([True, False, True, False, True]))\n        var_strings = [\"short\", \"medium length\", \"very long string\", \"tiny\", \"another string\"]\n        dt = h5py.vlen_dtype(str)\n        dset = f.create_dataset(\"data6\", (5,), dtype=dt)\n        for i, s in enumerate(var_strings):\n            dset[i] = s\n\n    return str(filename)\n\n\n@pytest.fixture\ndef hdf5_file_with_zero_dimensions(tmp_path):\n    \"\"\"Create an HDF5 file with zero dimensions (should be handled gracefully).\"\"\"\n    filename = tmp_path / \"zero_dims.h5\"\n\n    with h5py.File(filename, \"w\") as f:\n        # Create a dataset with a zero dimension\n        f.create_dataset(\"zero_dim\", data=np.zeros((3, 0, 2), dtype=np.float32))\n        # Create a dataset with zero in the middle dimension\n        f.create_dataset(\"zero_middle\", data=np.zeros((3, 0), dtype=np.int32))\n        # Create a dataset with zero in the last dimension\n        f.create_dataset(\"zero_last\", data=np.zeros((3, 2, 0), dtype=np.float64))\n\n    return str(filename)\n\n\n@pytest.fixture\ndef empty_hdf5_file(tmp_path):\n    \"\"\"Create an HDF5 file with no datasets (should warn and skip).\"\"\"\n    filename = tmp_path / \"empty.h5\"\n\n    with h5py.File(filename, \"w\") as f:\n        # Create only groups, no datasets\n        f.create_group(\"empty_group\")\n        grp = f.create_group(\"another_group\")\n        grp.create_group(\"subgroup\")\n\n    return str(filename)\n\n\n@pytest.fixture\ndef hdf5_file_with_mixed_data_types(tmp_path):\n    \"\"\"Create an HDF5 file with mixed data types in the same file.\"\"\"\n    filename = tmp_path / \"mixed.h5\"\n    n_rows = 3\n\n    with h5py.File(filename, \"w\") as f:\n        # Regular numeric data\n        f.create_dataset(\"regular_int\", data=np.arange(n_rows, dtype=np.int32))\n        f.create_dataset(\"regular_float\", data=np.arange(n_rows, dtype=np.float32))\n\n        # Complex data\n        complex_data = 
np.array([1 + 2j, 3 + 4j, 5 + 6j], dtype=np.complex64)\n        f.create_dataset(\"complex_data\", data=complex_data)\n\n        # Compound data\n        dt_compound = np.dtype([(\"x\", \"i4\"), (\"y\", \"f8\")])\n        compound_data = np.array([(1, 2.5), (3, 4.5), (5, 6.5)], dtype=dt_compound)\n        f.create_dataset(\"compound_data\", data=compound_data)\n\n    return str(filename)\n\n\ndef test_config_raises_when_invalid_name():\n    \"\"\"Test that invalid config names raise an error.\"\"\"\n    with pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = HDF5Config(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files):\n    \"\"\"Test that invalid data_files parameter raises an error.\"\"\"\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = HDF5Config(name=\"name\", data_files=data_files)\n\n\ndef test_hdf5_basic_functionality(hdf5_file):\n    \"\"\"Test basic HDF5 loading with simple numeric datasets.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file], split=\"train\")\n\n    assert \"int32\" in dataset.column_names\n    assert \"float32\" in dataset.column_names\n    assert \"bool\" in dataset.column_names\n\n    assert np.asarray(dataset.data[\"int32\"]).dtype == np.int32\n    assert np.asarray(dataset.data[\"float32\"]).dtype == np.float32\n    assert np.asarray(dataset.data[\"bool\"]).dtype == np.bool_\n\n    assert dataset[\"int32\"] == [0, 1, 2, 3, 4]\n    float32_data = dataset[\"float32\"]\n    expected_float32 = [0.0, 0.1, 0.2, 0.3, 0.4]\n    np.testing.assert_allclose(float32_data, expected_float32, rtol=1e-6)\n\n\ndef test_hdf5_nested_groups(hdf5_file_with_groups):\n    \"\"\"Test HDF5 loading with nested groups.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_groups], split=\"train\")\n\n    
expected_columns = {\"root_data\", \"group1\"}\n    assert set(dataset.column_names) == expected_columns\n\n    # Check data\n    root_data = dataset[\"root_data\"]\n    group1_data = dataset[\"group1\"]\n    assert root_data == [0, 1, 2]\n    assert group1_data == [\n        {\"group_data\": 0.0, \"subgroup\": {\"sub_data\": 0}},\n        {\"group_data\": 1.0, \"subgroup\": {\"sub_data\": 1}},\n        {\"group_data\": 2.0, \"subgroup\": {\"sub_data\": 2}},\n    ]\n\n\ndef test_hdf5_multi_dimensional_arrays(hdf5_file_with_arrays):\n    \"\"\"Test HDF5 loading with multi-dimensional arrays.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_arrays], split=\"train\")\n\n    expected_columns = {\"matrix_2d\", \"tensor_3d\", \"tensor_4d\", \"tensor_5d\", \"vector_1d\"}\n    assert set(dataset.column_names) == expected_columns\n\n    # Check shapes\n    matrix_2d = dataset[\"matrix_2d\"]\n    assert len(matrix_2d) == 4  # 4 rows\n    assert len(matrix_2d[0]) == 3  # 3 rows in each matrix\n    assert len(matrix_2d[0][0]) == 4  # 4 columns in each matrix\n\n\ndef test_hdf5_vlen_arrays(hdf5_file_with_vlen_arrays):\n    \"\"\"Test HDF5 loading with variable-length arrays (int32).\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_vlen_arrays], split=\"train\")\n\n    expected_columns = {\"vlen_ints\", \"mixed_data\"}\n    assert set(dataset.column_names) == expected_columns\n\n    # Check vlen_ints data\n    vlen_ints = dataset[\"vlen_ints\"]\n    assert len(vlen_ints) == 4\n    assert vlen_ints[0] == [1, 2, 3]\n    assert vlen_ints[1] == [4, 5]\n    assert vlen_ints[2] == [6, 7, 8, 9]\n    assert vlen_ints[3] == [10]\n\n    # Check mixed_data (with None values)\n    mixed_data = dataset[\"mixed_data\"]\n    assert len(mixed_data) == 4\n    assert mixed_data[0] == [1, 2, 3]\n    assert mixed_data[1] == []  # Empty array instead of None\n    assert mixed_data[2] == [4, 5]\n    assert mixed_data[3] == [6]\n\n\ndef 
test_hdf5_variable_length_strings(hdf5_file_with_variable_length_strings):\n    \"\"\"Test HDF5 loading with variable-length string datasets.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_variable_length_strings], split=\"train\")\n    expected_columns = {\"var_strings\", \"var_bytes\"}\n    assert set(dataset.column_names) == expected_columns\n\n    # Check variable-length strings (converted to strings for usability)\n    var_strings = dataset[\"var_strings\"]\n    assert len(var_strings) == 4\n    assert var_strings[0] == \"short\"\n    assert var_strings[1] == \"medium length string\"\n    assert var_strings[2] == \"very long string with many characters\"\n    assert var_strings[3] == \"tiny\"\n\n    # Check variable-length bytes (converted to strings for usability)\n    var_bytes = dataset[\"var_bytes\"]\n    assert len(var_bytes) == 4\n    assert var_bytes[0] == \"short\"\n    assert var_bytes[1] == \"medium length bytes\"\n    assert var_bytes[2] == \"very long bytes with many characters\"\n    assert var_bytes[3] == \"tiny\"\n\n\ndef test_hdf5_different_dtypes(hdf5_file_with_different_dtypes):\n    \"\"\"Test HDF5 loading with various numeric dtypes.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_different_dtypes], split=\"train\")\n    expected_columns = {\"int8\", \"int16\", \"int64\", \"uint8\", \"uint16\", \"uint32\", \"uint64\", \"float16\", \"float64\", \"bytes\"}\n    assert set(dataset.column_names) == expected_columns\n\n    # Check specific dtypes\n    int8_data = dataset[\"int8\"]\n    assert int8_data == [0, 1, 2]\n\n    bytes_data = dataset[\"bytes\"]\n    assert bytes_data == [b\"row_0\", b\"row_1\", b\"row_2\"]\n\n\ndef test_hdf5_batch_processing(hdf5_file):\n    \"\"\"Test HDF5 loading with custom batch size.\"\"\"\n    config = HDF5Config(batch_size=2)\n    hdf5 = HDF5()\n    hdf5.config = config\n    generator = hdf5._generate_tables([hdf5_file])\n\n    tables = list(generator)\n    # Should 
have 3 batches: [0,1], [2,3], [4]\n    assert len(tables) == 3\n\n    # Check first batch\n    _, first_batch = tables[0]\n    assert len(first_batch) == 2\n\n    # Check last batch\n    _, last_batch = tables[2]\n    assert len(last_batch) == 1\n\n\ndef test_hdf5_column_filtering(hdf5_file_with_groups):\n    \"\"\"Test HDF5 loading with column filtering.\"\"\"\n    features = Features({\"root_data\": Value(\"int32\"), \"group1\": Features({\"group_data\": Value(\"float32\")})})\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_groups], split=\"train\", features=features)\n\n    expected_columns = {\"root_data\", \"group1\"}\n    assert set(dataset.column_names) == expected_columns\n\n    # Check that subgroup is filtered out\n    group1_data = dataset[\"group1\"]\n    assert group1_data == [\n        {\"group_data\": 0.0},\n        {\"group_data\": 1.0},\n        {\"group_data\": 2.0},\n    ]\n\n\ndef test_hdf5_feature_specification(hdf5_file):\n    \"\"\"Test HDF5 loading with explicit feature specification.\"\"\"\n    features = Features({\"int32\": Value(\"int32\"), \"float32\": Value(\"float64\"), \"bool\": Value(\"bool\")})\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file], split=\"train\", features=features)\n\n    # Check that features are properly cast\n    assert np.asarray(dataset.data[\"float32\"]).dtype == np.float64\n    assert np.asarray(dataset.data[\"int32\"]).dtype == np.int32\n    assert np.asarray(dataset.data[\"bool\"]).dtype == np.bool_\n\n\ndef test_hdf5_mismatched_lengths_error(hdf5_file_with_mismatched_lengths):\n    \"\"\"Test that mismatched dataset lengths raise an error.\"\"\"\n    with pytest.raises(DatasetGenerationError) as exc_info:\n        load_dataset(\"hdf5\", data_files=[hdf5_file_with_mismatched_lengths], split=\"train\")\n\n    assert isinstance(exc_info.value.__cause__, ValueError)\n    assert \"3 but expected 5\" in str(exc_info.value.__cause__)\n\n\ndef 
test_hdf5_zero_dimensions_handling(hdf5_file_with_zero_dimensions, caplog):\n    \"\"\"Test that zero dimensions are handled gracefully.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_zero_dimensions], split=\"train\")\n\n    expected_columns = {\"zero_dim\", \"zero_middle\", \"zero_last\"}\n    assert set(dataset.column_names) == expected_columns\n\n    # Check that the data is loaded (should be empty arrays)\n    zero_dim_data = dataset[\"zero_dim\"]\n    assert len(zero_dim_data) == 3  # 3 rows\n    assert all(len(row) == 0 for row in zero_dim_data)  # Each row is empty\n\n    # Check that shape info is lost\n    assert all(isinstance(col, List) and col.length == -1 for col in dataset.features.values())\n\n    # Check for the warnings\n    assert (\n        len(\n            [\n                record.message\n                for record in caplog.records\n                if record.levelname == \"WARNING\" and \"dimension with size 0\" in record.message\n            ]\n        )\n        == 3\n    )\n\n\ndef test_hdf5_empty_file_warning(empty_hdf5_file, hdf5_file_with_arrays, caplog):\n    \"\"\"Test that empty files (no datasets) are skipped with a warning.\"\"\"\n    load_dataset(\"hdf5\", data_files=[hdf5_file_with_arrays, empty_hdf5_file], split=\"train\")\n\n    # Check that warning was logged\n    assert any(\n        record.levelname == \"WARNING\" and \"contains no data, skipping\" in record.message for record in caplog.records\n    )\n\n\ndef test_hdf5_feature_inference(hdf5_file_with_arrays):\n    \"\"\"Test automatic feature inference from HDF5 datasets.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_arrays], split=\"train\")\n\n    # Check that features were inferred\n    assert dataset.features is not None\n\n    # Check specific feature types\n    features = dataset.features\n    # (n_rows, 3, 4) -> Array2D with shape (3, 4)\n    assert isinstance(features[\"matrix_2d\"], Array2D)\n    assert 
features[\"matrix_2d\"].shape == (3, 4)\n    # (n_rows, 2, 3, 4) -> Array3D with shape (2, 3, 4)\n    assert isinstance(features[\"tensor_3d\"], Array3D)\n    assert features[\"tensor_3d\"].shape == (2, 3, 4)\n    # (n_rows, 2, 3, 4, 5) -> Array4D with shape (2, 3, 4, 5)\n    assert isinstance(features[\"tensor_4d\"], Array4D)\n    assert features[\"tensor_4d\"].shape == (2, 3, 4, 5)\n    # (n_rows, 10) -> List of length 10\n    assert isinstance(features[\"vector_1d\"], List)\n    assert features[\"vector_1d\"].length == 10\n\n\ndef test_hdf5_vlen_feature_inference(hdf5_file_with_vlen_arrays):\n    \"\"\"Test automatic feature inference from variable-length HDF5 datasets.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_vlen_arrays], split=\"train\")\n\n    # Check that features were inferred\n    assert dataset.features is not None\n\n    # Check specific feature types for variable-length arrays\n    features = dataset.features\n    # Variable-length arrays should become List features by default (for small datasets)\n    assert isinstance(features[\"vlen_ints\"], List)\n    assert isinstance(features[\"mixed_data\"], List)\n\n    # Check that the inner feature types are correct\n    assert isinstance(features[\"vlen_ints\"].feature, Value)\n    assert features[\"vlen_ints\"].feature.dtype == \"int32\"\n    assert isinstance(features[\"mixed_data\"].feature, Value)\n    assert features[\"mixed_data\"].feature.dtype == \"int32\"\n\n\ndef test_hdf5_variable_string_feature_inference(hdf5_file_with_variable_length_strings):\n    \"\"\"Test automatic feature inference from variable-length string datasets.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_variable_length_strings], split=\"train\")\n\n    # Check that features were inferred\n    assert dataset.features is not None\n\n    # Check specific feature types for variable-length strings\n    features = dataset.features\n    # Variable-length strings should become 
Value(\"string\") features\n    assert isinstance(features[\"var_strings\"], Value)\n    assert isinstance(features[\"var_bytes\"], Value)\n\n    # Check that the feature types are correct\n    assert features[\"var_strings\"].dtype == \"string\"\n    assert features[\"var_bytes\"].dtype == \"string\"\n\n\ndef test_hdf5_invalid_features(hdf5_file_with_arrays):\n    \"\"\"Test that invalid features raise an error.\"\"\"\n    features = Features({\"fakefeature\": Value(\"int32\")})\n    with pytest.raises(ValueError):\n        load_dataset(\"hdf5\", data_files=[hdf5_file_with_arrays], split=\"train\", features=features)\n\n    # try with one valid and one invalid feature\n    features = Features({\"matrix_2d\": Array2D(shape=(3, 4), dtype=\"float32\"), \"fakefeature\": Value(\"int32\")})\n    with pytest.raises(DatasetGenerationError):\n        load_dataset(\"hdf5\", data_files=[hdf5_file_with_arrays], split=\"train\", features=features)\n\n\ndef test_hdf5_no_data_files_error():\n    \"\"\"Test that missing data_files raises an error.\"\"\"\n    config = HDF5Config(name=\"test\", data_files=None)\n    hdf5 = HDF5()\n    hdf5.config = config\n\n    with pytest.raises(ValueError, match=\"At least one data file must be specified\"):\n        hdf5._split_generators(None)\n\n\ndef test_hdf5_complex_numbers(hdf5_file_with_complex_data):\n    \"\"\"Test HDF5 loading with complex number datasets.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_complex_data], split=\"train\")\n\n    # Check that complex numbers are represented as nested Features\n    expected_columns = {\n        \"complex_64\",\n        \"complex_128\",\n        \"complex_array\",\n    }\n    assert set(dataset.column_names) == expected_columns\n\n    # Check complex_64 data\n    complex_64_data = dataset[\"complex_64\"]\n    assert len(complex_64_data) == 4\n    assert complex_64_data[0] == {\"real\": 1.0, \"imag\": 2.0}\n    assert complex_64_data[1] == {\"real\": 3.0, \"imag\": 
4.0}\n    assert complex_64_data[2] == {\"real\": 5.0, \"imag\": 6.0}\n    assert complex_64_data[3] == {\"real\": 7.0, \"imag\": 8.0}\n\n    assert np.asarray(dataset.data[\"complex_64\"].flatten()[0]).dtype == np.float32\n    assert np.asarray(dataset.data[\"complex_64\"].flatten()[1]).dtype == np.float32\n    assert (np.asarray(dataset.data[\"complex_64\"].flatten()[0]) == np.array([1, 3, 5, 7], dtype=np.float32)).all()\n    assert (np.asarray(dataset.data[\"complex_64\"].flatten()[1]) == np.array([2, 4, 6, 8], dtype=np.float32)).all()\n\n    assert np.asarray(dataset.data[\"complex_128\"].flatten()[0]).dtype == np.float64\n    assert np.asarray(dataset.data[\"complex_128\"].flatten()[1]).dtype == np.float64\n    assert (\n        np.asarray(dataset.data[\"complex_128\"].flatten()[0]) == np.array([1.5, 3.5, 5.5, 7.5], dtype=np.float64)\n    ).all()\n    assert (\n        np.asarray(dataset.data[\"complex_128\"].flatten()[1]) == np.array([2.5, 4.5, 6.5, 8.5], dtype=np.float64)\n    ).all()\n\n\ndef test_hdf5_compound_types(hdf5_file_with_compound_data):\n    \"\"\"Test HDF5 loading with compound/structured datasets.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_compound_data], split=\"train\")\n\n    # Check that compound types are represented as nested structures\n    expected_columns = {\n        \"simple_compound\",\n        \"complex_compound\",\n        \"nested_compound\",\n    }\n    assert set(dataset.column_names) == expected_columns\n\n    # Check simple compound data\n    simple_compound_data = dataset[\"simple_compound\"]\n    assert len(simple_compound_data) == 3\n    assert simple_compound_data[0] == {\"x\": 1, \"y\": 2.5}\n    assert simple_compound_data[1] == {\"x\": 3, \"y\": 4.5}\n    assert simple_compound_data[2] == {\"x\": 5, \"y\": 6.5}\n\n\ndef test_hdf5_feature_inference_complex(hdf5_file_with_complex_data):\n    \"\"\"Test automatic feature inference for complex datasets.\"\"\"\n    dataset = 
load_dataset(\"hdf5\", data_files=[hdf5_file_with_complex_data], split=\"train\")\n\n    # Check that features were inferred correctly\n    assert dataset.features is not None\n    features = dataset.features\n\n    # Check complex number features\n    assert \"complex_64\" in features\n    # Complex features are represented as dict, not Features object\n    assert isinstance(features[\"complex_64\"], dict)\n    assert features[\"complex_64\"][\"real\"] == Value(\"float32\")\n    assert features[\"complex_64\"][\"imag\"] == Value(\"float32\")\n\n\ndef test_hdf5_feature_inference_compound(hdf5_file_with_compound_data):\n    \"\"\"Test automatic feature inference for compound datasets.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_compound_data], split=\"train\")\n\n    # Check that features were inferred correctly\n    assert dataset.features is not None\n    features = dataset.features\n\n    # Check compound type features\n    assert \"simple_compound\" in features\n    # Compound features are represented as dict, not Features object\n    assert isinstance(features[\"simple_compound\"], dict)\n    assert features[\"simple_compound\"][\"x\"] == Value(\"int32\")\n    assert features[\"simple_compound\"][\"y\"] == Value(\"float64\")\n\n\ndef test_hdf5_mixed_data_types(hdf5_file_with_mixed_data_types):\n    \"\"\"Test HDF5 loading with mixed data types in the same file.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_mixed_data_types], split=\"train\")\n\n    # Check all expected columns are present\n    expected_columns = {\n        \"regular_int\",\n        \"regular_float\",\n        \"complex_data\",\n        \"compound_data\",\n    }\n    assert set(dataset.column_names) == expected_columns\n\n    # Check data types\n    assert dataset[\"regular_int\"] == [0, 1, 2]\n    assert len(dataset[\"complex_data\"]) == 3\n    assert len(dataset[\"compound_data\"]) == 3\n\n\ndef 
test_hdf5_mismatched_lengths_with_column_filtering(hdf5_file_with_mismatched_lengths):\n    \"\"\"Test that mismatched dataset lengths are ignored when the mismatched dataset is excluded via columns config.\"\"\"\n    # Test 1: Include only the first dataset\n    features = Features({\"data1\": Value(\"int32\")})\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_mismatched_lengths], split=\"train\", features=features)\n\n    # Should work without error since we're only including the first dataset\n    expected_columns = {\"data1\"}\n    assert set(dataset.column_names) == expected_columns\n    assert \"data2\" not in dataset.column_names\n\n    # Check the data\n    data1_values = dataset[\"data1\"]\n    assert data1_values == [0, 1, 2, 3, 4]\n\n    # Test 2: Include multiple compatible datasets (all with 5 rows)\n    features = Features(\n        {\n            \"data1\": Value(\"int32\"),\n            \"data3\": Array2D(shape=(3, 4), dtype=\"float32\"),\n            \"data4\": Value(\"float64\"),\n            \"data5\": Value(\"bool\"),\n            \"data6\": Value(\"string\"),\n        }\n    )\n    dataset2 = load_dataset(\"hdf5\", data_files=[hdf5_file_with_mismatched_lengths], split=\"train\", features=features)\n\n    # Should work without error since we're excluding the mismatched dataset\n    expected_columns2 = {\"data1\", \"data3\", \"data4\", \"data5\", \"data6\"}\n    assert set(dataset2.column_names) == expected_columns2\n    assert \"data2\" not in dataset2.column_names\n\n    # Check data types and values\n    assert dataset2[\"data1\"] == [0, 1, 2, 3, 4]  # int32\n    assert len(dataset2[\"data3\"]) == 5  # Array2D\n    assert len(dataset2[\"data3\"][0]) == 3  # 3 rows in each 2D array\n    assert len(dataset2[\"data3\"][0][0]) == 4  # 4 columns in each 2D array\n    np.testing.assert_allclose(dataset2[\"data4\"], [0.0, 0.1, 0.2, 0.3, 0.4], rtol=1e-6)  # float64\n    assert dataset2[\"data5\"] == [True, False, True, False, True]  # 
boolean\n    assert dataset2[\"data6\"] == [\n        \"short\",\n        \"medium length\",\n        \"very long string\",\n        \"tiny\",\n        \"another string\",\n    ]  # vlen string\n\n\ndef test_hdf5_compound_with_complex_arrays(hdf5_file_with_compound_complex_arrays):\n    \"\"\"Test HDF5 loading with compound datasets containing complex arrays.\"\"\"\n    dataset = load_dataset(\"hdf5\", data_files=[hdf5_file_with_compound_complex_arrays], split=\"train\")\n\n    # Check that compound types with complex arrays are represented as nested structures\n    expected_columns = {\"compound_with_complex\"}\n    assert set(dataset.column_names) == expected_columns\n\n    # Check compound data with complex arrays\n    compound_data = dataset[\"compound_with_complex\"]\n    assert len(compound_data) == 3\n\n    # Check first row\n    first_row = compound_data[0]\n    assert first_row[\"position\"][\"x\"] == 1\n    assert first_row[\"position\"][\"y\"] == 2\n\n    # Check complex field (should be represented as real/imag structure)\n    assert first_row[\"complex_field\"][\"real\"] == 1.0\n    assert first_row[\"complex_field\"][\"imag\"] == 2.0\n\n    # Check complex array (should be represented as nested real/imag structures)\n    complex_array = first_row[\"complex_array\"]\n    assert len(complex_array[\"real\"]) == 2  # 2 rows\n    assert len(complex_array[\"real\"][0]) == 3  # 3 columns\n\n    # Check first element of complex array\n    assert complex_array[\"real\"][0][0] == 1.0\n    assert complex_array[\"imag\"][0][0] == 2.0\n\n    # Check nested complex field\n    assert first_row[\"nested_complex\"][\"real\"] == 1.5\n    assert first_row[\"nested_complex\"][\"imag\"] == 2.5\n\n\ndef test_hdf5_feature_inference_compound_complex_arrays(hdf5_file_with_compound_complex_arrays):\n    \"\"\"Test automatic feature inference for compound datasets with complex arrays.\"\"\"\n    dataset = load_dataset(\"hdf5\", 
data_files=[hdf5_file_with_compound_complex_arrays], split=\"train\")\n\n    # Check that features were inferred correctly\n    assert dataset.features is not None\n    features = dataset.features\n\n    # Check compound type features with complex arrays\n    assert \"compound_with_complex\" in features\n\n    # Check nested structure\n    compound_features = features[\"compound_with_complex\"]\n    assert \"position\" in compound_features\n    assert \"complex_field\" in compound_features\n    assert \"complex_array\" in compound_features\n    assert \"nested_complex\" in compound_features\n\n    # Check position field (nested compound)\n    assert compound_features[\"position\"][\"x\"] == Value(\"int32\")\n    assert compound_features[\"position\"][\"y\"] == Value(\"int32\")\n\n    # Check complex field (should be real/imag structure)\n    assert compound_features[\"complex_field\"][\"real\"] == Value(\"float32\")\n    assert compound_features[\"complex_field\"][\"imag\"] == Value(\"float32\")\n\n    # Check complex array (should be nested real/imag structures)\n    assert compound_features[\"complex_array\"][\"real\"] == Array2D(shape=(2, 3), dtype=\"float32\")\n    assert compound_features[\"complex_array\"][\"imag\"] == Array2D(shape=(2, 3), dtype=\"float32\")\n\n    # Check nested complex field\n    assert compound_features[\"nested_complex\"][\"real\"] == Value(\"float32\")\n    assert compound_features[\"nested_complex\"][\"imag\"] == Value(\"float32\")\n"
  },
  {
    "path": "tests/packaged_modules/test_imagefolder.py",
    "content": "import shutil\nimport textwrap\n\nimport numpy as np\nimport pytest\n\nfrom datasets import ClassLabel, Features, Image\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesDict, DataFilesList, get_data_patterns\nfrom datasets.download.streaming_download_manager import StreamingDownloadManager\nfrom datasets.packaged_modules.imagefolder.imagefolder import ImageFolder, ImageFolderConfig\n\nfrom ..utils import require_pil\n\n\n@pytest.fixture\ndef cache_dir(tmp_path):\n    return str(tmp_path / \"imagefolder_cache_dir\")\n\n\n@pytest.fixture\ndef data_files_with_labels_no_metadata(tmp_path, image_file):\n    data_dir = tmp_path / \"data_files_with_labels_no_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    subdir_class_0 = data_dir / \"cat\"\n    subdir_class_0.mkdir(parents=True, exist_ok=True)\n    subdir_class_1 = data_dir / \"dog\"\n    subdir_class_1.mkdir(parents=True, exist_ok=True)\n\n    image_filename = subdir_class_0 / \"image_cat.jpg\"\n    shutil.copyfile(image_file, image_filename)\n    image_filename2 = subdir_class_1 / \"image_dog.jpg\"\n    shutil.copyfile(image_file, image_filename2)\n\n    data_files_with_labels_no_metadata = DataFilesDict.from_patterns(\n        get_data_patterns(str(data_dir)), data_dir.as_posix()\n    )\n\n    return data_files_with_labels_no_metadata\n\n\n@pytest.fixture\ndef image_files_with_labels_and_duplicated_label_key_in_metadata(tmp_path, image_file):\n    data_dir = tmp_path / \"image_files_with_labels_and_label_key_in_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    subdir_class_0 = data_dir / \"cat\"\n    subdir_class_0.mkdir(parents=True, exist_ok=True)\n    subdir_class_1 = data_dir / \"dog\"\n    subdir_class_1.mkdir(parents=True, exist_ok=True)\n\n    image_filename = subdir_class_0 / \"image_cat.jpg\"\n    shutil.copyfile(image_file, image_filename)\n    image_filename2 = subdir_class_1 / \"image_dog.jpg\"\n    
shutil.copyfile(image_file, image_filename2)\n\n    image_metadata_filename = tmp_path / data_dir / \"metadata.jsonl\"\n    image_metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"cat/image_cat.jpg\", \"caption\": \"Nice image of a cat\", \"label\": \"Cat\"}\n        {\"file_name\": \"dog/image_dog.jpg\", \"caption\": \"Nice image of a dog\", \"label\": \"Dog\"}\n        \"\"\"\n    )\n    with open(image_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(image_metadata)\n\n    return str(image_filename), str(image_filename2), str(image_metadata_filename)\n\n\n@pytest.fixture\ndef image_file_with_metadata(tmp_path, image_file):\n    image_filename = tmp_path / \"image_rgb.jpg\"\n    shutil.copyfile(image_file, image_filename)\n    image_metadata_filename = tmp_path / \"metadata.jsonl\"\n    image_metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"image_rgb.jpg\", \"caption\": \"Nice image\"}\n        \"\"\"\n    )\n    with open(image_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(image_metadata)\n    return str(image_filename), str(image_metadata_filename)\n\n\n@pytest.fixture\ndef image_files_with_metadata_that_misses_one_image(tmp_path, image_file):\n    image_filename = tmp_path / \"image_rgb.jpg\"\n    shutil.copyfile(image_file, image_filename)\n    image_filename2 = tmp_path / \"image_rgb2.jpg\"\n    shutil.copyfile(image_file, image_filename2)\n    image_metadata_filename = tmp_path / \"metadata.jsonl\"\n    image_metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"image_rgb.jpg\", \"caption\": \"Nice image\"}\n        \"\"\"\n    )\n    with open(image_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(image_metadata)\n    return str(image_filename), str(image_filename2), str(image_metadata_filename)\n\n\n@pytest.fixture(params=[\"jsonl\", \"csv\"])\ndef data_files_with_one_split_and_metadata(request, tmp_path, image_file):\n    
data_dir = tmp_path / \"imagefolder_data_dir_with_metadata_one_split\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    subdir = data_dir / \"subdir\"\n    subdir.mkdir(parents=True, exist_ok=True)\n\n    image_filename = data_dir / \"image_rgb.jpg\"\n    shutil.copyfile(image_file, image_filename)\n    image_filename2 = data_dir / \"image_rgb2.jpg\"\n    shutil.copyfile(image_file, image_filename2)\n    image_filename3 = subdir / \"image_rgb3.jpg\"  # in subdir\n    shutil.copyfile(image_file, image_filename3)\n\n    image_metadata_filename = data_dir / f\"metadata.{request.param}\"\n    image_metadata = (\n        textwrap.dedent(\n            \"\"\"\\\n        {\"file_name\": \"image_rgb.jpg\", \"caption\": \"Nice image\"}\n        {\"file_name\": \"image_rgb2.jpg\", \"caption\": \"Nice second image\"}\n        {\"file_name\": \"subdir/image_rgb3.jpg\", \"caption\": \"Nice third image\"}\n        \"\"\"\n        )\n        if request.param == \"jsonl\"\n        else textwrap.dedent(\n            \"\"\"\\\n        file_name,caption\n        image_rgb.jpg,Nice image\n        image_rgb2.jpg,Nice second image\n        subdir/image_rgb3.jpg,Nice third image\n        \"\"\"\n        )\n    )\n    with open(image_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(image_metadata)\n    data_files_with_one_split_and_metadata = DataFilesDict.from_patterns(\n        get_data_patterns(str(data_dir)), data_dir.as_posix()\n    )\n    assert len(data_files_with_one_split_and_metadata) == 1\n    assert len(data_files_with_one_split_and_metadata[\"train\"]) == 4\n    return data_files_with_one_split_and_metadata\n\n\n@pytest.fixture(params=[\"jsonl\", \"csv\"])\ndef data_files_with_two_splits_and_metadata(request, tmp_path, image_file):\n    data_dir = tmp_path / \"imagefolder_data_dir_with_metadata_two_splits\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    train_dir = data_dir / \"train\"\n    train_dir.mkdir(parents=True, exist_ok=True)\n    
test_dir = data_dir / \"test\"\n    test_dir.mkdir(parents=True, exist_ok=True)\n\n    image_filename = train_dir / \"image_rgb.jpg\"  # train image\n    shutil.copyfile(image_file, image_filename)\n    image_filename2 = train_dir / \"image_rgb2.jpg\"  # train image\n    shutil.copyfile(image_file, image_filename2)\n    image_filename3 = test_dir / \"image_rgb3.jpg\"  # test image\n    shutil.copyfile(image_file, image_filename3)\n\n    train_image_metadata_filename = train_dir / f\"metadata.{request.param}\"\n    image_metadata = (\n        textwrap.dedent(\n            \"\"\"\\\n        {\"file_name\": \"image_rgb.jpg\", \"caption\": \"Nice train image\"}\n        {\"file_name\": \"image_rgb2.jpg\", \"caption\": \"Nice second train image\"}\n        \"\"\"\n        )\n        if request.param == \"jsonl\"\n        else textwrap.dedent(\n            \"\"\"\\\n        file_name,caption\n        image_rgb.jpg,Nice train image\n        image_rgb2.jpg,Nice second train image\n        \"\"\"\n        )\n    )\n    with open(train_image_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(image_metadata)\n    test_image_metadata_filename = test_dir / f\"metadata.{request.param}\"\n    image_metadata = (\n        textwrap.dedent(\n            \"\"\"\\\n        {\"file_name\": \"image_rgb3.jpg\", \"caption\": \"Nice test image\"}\n        \"\"\"\n        )\n        if request.param == \"jsonl\"\n        else textwrap.dedent(\n            \"\"\"\\\n        file_name,caption\n        image_rgb3.jpg,Nice test image\n        \"\"\"\n        )\n    )\n    with open(test_image_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(image_metadata)\n    data_files_with_two_splits_and_metadata = DataFilesDict.from_patterns(\n        get_data_patterns(str(data_dir)), data_dir.as_posix()\n    )\n    assert len(data_files_with_two_splits_and_metadata) == 2\n    assert len(data_files_with_two_splits_and_metadata[\"train\"]) == 3\n    assert 
len(data_files_with_two_splits_and_metadata[\"test\"]) == 2\n    return data_files_with_two_splits_and_metadata\n\n\n@pytest.fixture\ndef data_files_with_zip_archives(tmp_path, image_file):\n    from PIL import Image, ImageOps\n\n    data_dir = tmp_path / \"imagefolder_data_dir_with_zip_archives\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    archive_dir = data_dir / \"archive\"\n    archive_dir.mkdir(parents=True, exist_ok=True)\n    subdir = archive_dir / \"subdir\"\n    subdir.mkdir(parents=True, exist_ok=True)\n\n    image_filename = archive_dir / \"image_rgb.jpg\"\n    shutil.copyfile(image_file, image_filename)\n    image_filename2 = subdir / \"image_rgb2.jpg\"  # in subdir\n    # make sure they're two different images\n    # Indeed we won't be able to compare the image.filename, since the archive is not extracted in streaming mode\n    ImageOps.flip(Image.open(image_file)).save(image_filename2)\n\n    image_metadata_filename = archive_dir / \"metadata.jsonl\"\n    image_metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"image_rgb.jpg\", \"caption\": \"Nice image\"}\n        {\"file_name\": \"subdir/image_rgb2.jpg\", \"caption\": \"Nice second image\"}\n        \"\"\"\n    )\n    with open(image_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(image_metadata)\n\n    shutil.make_archive(archive_dir, \"zip\", archive_dir)\n    shutil.rmtree(str(archive_dir))\n\n    data_files_with_zip_archives = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n\n    assert len(data_files_with_zip_archives) == 1\n    assert len(data_files_with_zip_archives[\"train\"]) == 1\n    return data_files_with_zip_archives\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = ImageFolderConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], 
DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = ImageFolderConfig(name=\"name\", data_files=data_files)\n\n\n@require_pil\n# check that labels are inferred correctly from dir names\ndef test_generate_examples_with_labels(data_files_with_labels_no_metadata, cache_dir):\n    # there are no metadata.jsonl files in this test case\n    imagefolder = ImageFolder(data_files=data_files_with_labels_no_metadata, cache_dir=cache_dir, drop_labels=False)\n    imagefolder.download_and_prepare()\n    assert imagefolder.info.features == Features({\"image\": Image(), \"label\": ClassLabel(names=[\"cat\", \"dog\"])})\n    dataset = list(imagefolder.as_dataset()[\"train\"])\n    label_feature = imagefolder.info.features[\"label\"]\n\n    assert dataset[0][\"label\"] == label_feature._str2int[\"cat\"]\n    assert dataset[1][\"label\"] == label_feature._str2int[\"dog\"]\n\n\n@require_pil\n@pytest.mark.parametrize(\"drop_metadata\", [None, True, False])\n@pytest.mark.parametrize(\"drop_labels\", [None, True, False])\ndef test_generate_examples_drop_labels(data_files_with_labels_no_metadata, drop_metadata, drop_labels):\n    imagefolder = ImageFolder(\n        drop_metadata=drop_metadata, drop_labels=drop_labels, data_files=data_files_with_labels_no_metadata\n    )\n    gen_kwargs = imagefolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    # removing the labels explicitly requires drop_labels=True\n    assert gen_kwargs[\"add_labels\"] is not bool(drop_labels)\n    assert gen_kwargs[\"add_metadata\"] is False\n    generator = imagefolder._generate_examples(**gen_kwargs)\n    if not drop_labels:\n        assert all(\n            example.keys() == {\"image\", \"label\"} and all(val is not None for val in example.values())\n            for _, example in generator\n        )\n    else:\n        assert all(\n            
example.keys() == {\"image\"} and all(val is not None for val in example.values())\n            for _, example in generator\n        )\n\n\n@require_pil\n@pytest.mark.parametrize(\"drop_metadata\", [None, True, False])\n@pytest.mark.parametrize(\"drop_labels\", [None, True, False])\ndef test_generate_examples_drop_metadata(image_file_with_metadata, drop_metadata, drop_labels):\n    image_file, image_metadata_file = image_file_with_metadata\n    imagefolder = ImageFolder(\n        drop_metadata=drop_metadata, drop_labels=drop_labels, data_files={\"train\": [image_file, image_metadata_file]}\n    )\n    gen_kwargs = imagefolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    # since the dataset has metadata, removing the metadata explicitly requires drop_metadata=True\n    assert gen_kwargs[\"add_metadata\"] is not bool(drop_metadata)\n    # since the dataset has metadata, adding the labels explicitly requires drop_labels=False\n    assert gen_kwargs[\"add_labels\"] is False\n    generator = imagefolder._generate_examples(**gen_kwargs)\n    expected_columns = {\"image\"}\n    if gen_kwargs[\"add_metadata\"]:\n        expected_columns.add(\"caption\")\n    if gen_kwargs[\"add_labels\"]:\n        expected_columns.add(\"label\")\n    result = [example for _, example in generator]\n    assert len(result) == 1\n    example = result[0]\n    assert example.keys() == expected_columns\n    for column in expected_columns:\n        assert example[column] is not None\n\n\n@require_pil\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_data_files_with_metadata_and_single_split(streaming, cache_dir, data_files_with_one_split_and_metadata):\n    data_files = data_files_with_one_split_and_metadata\n    imagefolder = ImageFolder(data_files=data_files, cache_dir=cache_dir)\n    imagefolder.download_and_prepare()\n    datasets = imagefolder.as_streaming_dataset() if streaming else imagefolder.as_dataset()\n    for split, data_files in 
data_files.items():\n        expected_num_of_images = len(data_files) - 1  # don't count the metadata file\n        assert split in datasets\n        dataset = list(datasets[split])\n        assert len(dataset) == expected_num_of_images\n        # make sure each sample has its own image and metadata\n        assert len({example[\"image\"].filename for example in dataset}) == expected_num_of_images\n        assert len({example[\"caption\"] for example in dataset}) == expected_num_of_images\n        assert all(example[\"caption\"] is not None for example in dataset)\n\n\n@require_pil\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_data_files_with_metadata_and_multiple_splits(streaming, cache_dir, data_files_with_two_splits_and_metadata):\n    data_files = data_files_with_two_splits_and_metadata\n    imagefolder = ImageFolder(data_files=data_files, cache_dir=cache_dir)\n    imagefolder.download_and_prepare()\n    datasets = imagefolder.as_streaming_dataset() if streaming else imagefolder.as_dataset()\n    for split, data_files in data_files.items():\n        expected_num_of_images = len(data_files) - 1  # don't count the metadata file\n        assert split in datasets\n        dataset = list(datasets[split])\n        assert len(dataset) == expected_num_of_images\n        # make sure each sample has its own image and metadata\n        assert len({example[\"image\"].filename for example in dataset}) == expected_num_of_images\n        assert len({example[\"caption\"] for example in dataset}) == expected_num_of_images\n        assert all(example[\"caption\"] is not None for example in dataset)\n\n\n@require_pil\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_with_zip_archives):\n    imagefolder = ImageFolder(data_files=data_files_with_zip_archives, cache_dir=cache_dir)\n    imagefolder.download_and_prepare()\n    datasets = imagefolder.as_streaming_dataset() if 
streaming else imagefolder.as_dataset()\n    for split, data_files in data_files_with_zip_archives.items():\n        num_of_archives = len(data_files)  # the metadata file is inside the archive\n        expected_num_of_images = 2 * num_of_archives\n        assert split in datasets\n        dataset = list(datasets[split])\n        assert len(dataset) == expected_num_of_images\n        # make sure each sample has its own image and metadata\n        assert len({np.array(example[\"image\"])[0, 0, 0] for example in dataset}) == expected_num_of_images\n        assert len({example[\"caption\"] for example in dataset}) == expected_num_of_images\n        assert all(example[\"caption\"] is not None for example in dataset)\n\n\n@require_pil\ndef test_data_files_with_wrong_metadata_file_name(cache_dir, tmp_path, image_file):\n    data_dir = tmp_path / \"data_dir_with_bad_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    shutil.copyfile(image_file, data_dir / \"image_rgb.jpg\")\n    image_metadata_filename = data_dir / \"bad_metadata.jsonl\"  # bad file\n    image_metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"image_rgb.jpg\", \"caption\": \"Nice image\"}\n        \"\"\"\n    )\n    with open(image_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(image_metadata)\n\n    data_files_with_bad_metadata = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n    imagefolder = ImageFolder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir)\n    imagefolder.download_and_prepare()\n    dataset = imagefolder.as_dataset(split=\"train\")\n    # check that there are no metadata, since the metadata file name doesn't have the right name\n    assert \"caption\" not in dataset.column_names\n\n\n@require_pil\ndef test_data_files_with_custom_image_file_name_column_in_metadata_file(cache_dir, tmp_path, image_file):\n    data_dir = tmp_path / \"data_dir_with_custom_file_name_metadata\"\n    
data_dir.mkdir(parents=True, exist_ok=True)\n    shutil.copyfile(image_file, data_dir / \"image_rgb.jpg\")\n    image_metadata_filename = data_dir / \"metadata.jsonl\"\n    image_metadata = textwrap.dedent(  # with custom column \"picture_file_name\" instead of \"file_name\"\n        \"\"\"\\\n        {\"picture_file_name\": \"image_rgb.jpg\", \"caption\": \"Nice image\"}\n        \"\"\"\n    )\n    with open(image_metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(image_metadata)\n\n    data_files_with_bad_metadata = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n    imagefolder = ImageFolder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir)\n    imagefolder.download_and_prepare()\n    dataset = imagefolder.as_dataset(split=\"train\")\n    assert \"picture\" in dataset.features\n    assert \"picture_file_name\" not in dataset.features\n\n\n@require_pil\ndef test_data_files_with_with_metadata_in_different_formats(cache_dir, tmp_path, image_file):\n    data_dir = tmp_path / \"data_dir_with_metadata_in_different_format\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    shutil.copyfile(image_file, data_dir / \"image_rgb.jpg\")\n    image_metadata_filename_jsonl = data_dir / \"metadata.jsonl\"\n    image_metadata_jsonl = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"image_rgb.jpg\", \"caption\": \"Nice image\"}\n        \"\"\"\n    )\n    with open(image_metadata_filename_jsonl, \"w\", encoding=\"utf-8\") as f:\n        f.write(image_metadata_jsonl)\n    image_metadata_filename_csv = data_dir / \"metadata.csv\"\n    image_metadata_csv = textwrap.dedent(\n        \"\"\"\\\n        file_name,caption\n        image_rgb.jpg,Nice image\n        \"\"\"\n    )\n    with open(image_metadata_filename_csv, \"w\", encoding=\"utf-8\") as f:\n        f.write(image_metadata_csv)\n\n    data_files_with_bad_metadata = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n    
imagefolder = ImageFolder(data_files=data_files_with_bad_metadata, cache_dir=cache_dir)\n    with pytest.raises(ValueError) as exc_info:\n        imagefolder.download_and_prepare()\n    assert \"metadata files with different extensions\" in str(exc_info.value)\n"
  },
  {
    "path": "tests/packaged_modules/test_json.py",
    "content": "import json\nimport textwrap\n\nimport pyarrow as pa\nimport pytest\n\nfrom datasets import Features, Value\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesList\nfrom datasets.packaged_modules.json.json import Json, JsonConfig\n\n\n@pytest.fixture\ndef jsonl_file(tmp_path):\n    filename = tmp_path / \"file.jsonl\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        {\"col_1\": -1}\n        {\"col_1\": 1, \"col_2\": 2}\n        {\"col_1\": 10, \"col_2\": 20}\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\n# ndjson format is no longer maintained (see: https://github.com/ndjson/ndjson-spec/issues/35#issuecomment-1285673417)\n@pytest.fixture\ndef ndjson_file(tmp_path):\n    filename = tmp_path / \"file.ndjson\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        {\"col_1\": -1}\n        {\"col_1\": 1, \"col_2\": 2}\n        {\"col_1\": 10, \"col_2\": 20}\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\n@pytest.fixture\ndef jsonl_file_utf16_encoded(tmp_path):\n    filename = tmp_path / \"file_utf16_encoded.jsonl\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        {\"col_1\": -1}\n        {\"col_1\": 1, \"col_2\": 2}\n        {\"col_1\": 10, \"col_2\": 20}\n        \"\"\"\n    )\n    with open(filename, \"w\", encoding=\"utf-16\") as f:\n        f.write(data)\n    return str(filename)\n\n\n@pytest.fixture\ndef json_file_with_list_of_dicts(tmp_path):\n    filename = tmp_path / \"file_with_list_of_dicts.json\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        [\n            {\"col_1\": -1},\n            {\"col_1\": 1, \"col_2\": 2},\n            {\"col_1\": 10, \"col_2\": 20}\n        ]\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\n@pytest.fixture\ndef json_file_with_list_of_strings(tmp_path):\n   
 filename = tmp_path / \"file_with_list_of_strings.json\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        [\n            \"First text.\",\n            \"Second text.\",\n            \"Third text.\"\n        ]\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\n@pytest.fixture\ndef json_file_with_list_of_dicts_field(tmp_path):\n    filename = tmp_path / \"file_with_list_of_dicts_field.json\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        {\n            \"field1\": 1,\n            \"field2\": \"aabb\",\n            \"field3\": [\n                {\"col_1\": -1},\n                {\"col_1\": 1, \"col_2\": 2},\n                {\"col_1\": 10, \"col_2\": 20}\n            ]\n        }\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\n@pytest.fixture\ndef json_file_with_list_of_strings_field(tmp_path):\n    path = tmp_path / \"file.json\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        {\n            \"field1\": 1,\n            \"field2\": \"aabb\",\n            \"field3\": [\n                \"First text.\",\n                \"Second text.\",\n                \"Third text.\"\n            ]\n        }\n        \"\"\"\n    )\n    with open(path, \"w\") as f:\n        f.write(data)\n    return str(path)\n\n\n@pytest.fixture\ndef json_file_with_dict_of_lists_field(tmp_path):\n    path = tmp_path / \"file.json\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        {\n            \"field1\": 1,\n            \"field2\": \"aabb\",\n            \"field3\": {\n                \"col_1\": [-1, 1, 10],\n                \"col_2\": [null, 2, 20]\n            }\n        }\n        \"\"\"\n    )\n    with open(path, \"w\") as f:\n        f.write(data)\n    return str(path)\n\n\n@pytest.fixture\ndef json_file_with_list_of_dicts_with_sorted_columns(tmp_path):\n    path = tmp_path / \"file.json\"\n    data = textwrap.dedent(\n        
\"\"\"\\\n        [\n            {\"ID\": 0, \"Language\": \"Language-0\", \"Topic\": \"Topic-0\"},\n            {\"ID\": 1, \"Language\": \"Language-1\", \"Topic\": \"Topic-1\"},\n            {\"ID\": 2, \"Language\": \"Language-2\", \"Topic\": \"Topic-2\"}\n        ]\n        \"\"\"\n    )\n    with open(path, \"w\") as f:\n        f.write(data)\n    return str(path)\n\n\n@pytest.fixture\ndef json_file_with_list_of_dicts_with_sorted_columns_field(tmp_path):\n    path = tmp_path / \"file.json\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        {\n            \"field1\": 1,\n            \"field2\": \"aabb\",\n            \"field3\": [\n                {\"ID\": 0, \"Language\": \"Language-0\", \"Topic\": \"Topic-0\"},\n                {\"ID\": 1, \"Language\": \"Language-1\", \"Topic\": \"Topic-1\"},\n                {\"ID\": 2, \"Language\": \"Language-2\", \"Topic\": \"Topic-2\"}\n            ]\n        }\n        \"\"\"\n    )\n    with open(path, \"w\") as f:\n        f.write(data)\n    return str(path)\n\n\n@pytest.fixture\ndef jsonl_file_with_mix_of_str_and_int(tmp_path):\n    filename = tmp_path / \"file.jsonl\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        {\"col_1\": -1}\n        {\"col_1\": 1}\n        {\"col_1\": \"foo\"}\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\n@pytest.fixture\ndef jsonl_file_with_dicts_of_varying_keys(tmp_path):\n    filename = tmp_path / \"file.jsonl\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        {\"col_1\": {\"a\": 0}}\n        {\"col_1\": {\"b\": 0}}\n        {\"col_1\": {\"c\": 0}}\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\n@pytest.fixture\ndef jsonl_file_with_lists_of_dicts_of_varying_keys(tmp_path):\n    filename = tmp_path / \"file.jsonl\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        {\"col_1\": [{\"a\": 0}, {\"b\": 0}]}\n        {\"col_1\": 
[{\"c\": 0}, {\"d\": 0}]}\n        \"\"\"\n    )\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\n_messages = [\n    {\"role\": \"user\", \"content\": \"Turn on the living room lights and play my electronic music playlist.\"},\n    {\n        \"role\": \"assistant\",\n        \"tool_calls\": [\n            {\n                \"type\": \"function\",\n                \"function\": {\"name\": \"control_light\", \"arguments\": {\"room\": \"living room\", \"state\": \"on\"}},\n            },\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"play_music\",\n                    \"arguments\": {\n                        \"playlist\": \"electronic\"\n                    },  # mixed-type here since keys [\"playlist\"] and [\"room\", \"state\"] are different\n                },\n            },\n        ],\n    },\n    {\"role\": \"tool\", \"name\": \"control_light\", \"content\": \"The lights in the living room are now on.\"},\n    {\"role\": \"tool\", \"name\": \"play_music\", \"content\": \"The music is now playing.\"},\n    {\"role\": \"assistant\", \"content\": \"Done!\"},\n]\n\nEXPECTED_SIMPLE = {\"col_1\": [-1, 1, 10], \"col_2\": [None, 2, 20]}\nEXPECTED_LIST_OF_STRINGS = {\"text\": [\"First text.\", \"Second text.\", \"Third text.\"]}\nEXPECTED_MIX = {\"col_1\": [-1, 1, \"foo\"]}\nEXPECTED_DICTS_WITH_VARYING_KEYS = {\"col_1\": [{\"a\": 0}, {\"b\": 0}, {\"c\": 0}]}\nEXPECTED_LISTS_OF_DICTS_WITH_VARYING_KEYS = {\"col_1\": [[{\"a\": 0}, {\"b\": 0}], [{\"c\": 0}, {\"d\": 0}]]}\nEXPECTED_MESSAGES = {\"messages\": [_messages]}\n\n\n@pytest.fixture\ndef jsonl_file_with_messages(tmp_path):\n    filename = tmp_path / \"file.jsonl\"\n    data = json.dumps({\"messages\": _messages})\n    with open(filename, \"w\") as f:\n        f.write(data)\n    return str(filename)\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with pytest.raises(InvalidConfigName, 
match=\"Bad characters\"):\n        _ = JsonConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = JsonConfig(name=\"name\", data_files=data_files)\n\n\n@pytest.mark.parametrize(\n    \"file_fixture, config_kwargs, expected\",\n    [\n        (\"jsonl_file\", {}, EXPECTED_SIMPLE),\n        (\"ndjson_file\", {}, EXPECTED_SIMPLE),\n        (\"jsonl_file_utf16_encoded\", {\"encoding\": \"utf-16\"}, EXPECTED_SIMPLE),\n        (\"json_file_with_list_of_dicts\", {}, EXPECTED_SIMPLE),\n        (\"json_file_with_list_of_dicts_field\", {\"field\": \"field3\"}, EXPECTED_SIMPLE),\n        (\"json_file_with_list_of_strings\", {}, EXPECTED_LIST_OF_STRINGS),\n        (\"json_file_with_list_of_strings_field\", {\"field\": \"field3\"}, EXPECTED_LIST_OF_STRINGS),\n        (\"json_file_with_dict_of_lists_field\", {\"field\": \"field3\"}, EXPECTED_SIMPLE),\n        (\"jsonl_file_with_mix_of_str_and_int\", {}, EXPECTED_MIX),\n        (\"jsonl_file_with_dicts_of_varying_keys\", {}, EXPECTED_DICTS_WITH_VARYING_KEYS),\n        (\"jsonl_file_with_lists_of_dicts_of_varying_keys\", {}, EXPECTED_LISTS_OF_DICTS_WITH_VARYING_KEYS),\n        (\"jsonl_file_with_messages\", {}, EXPECTED_MESSAGES),\n    ],\n)\ndef test_json_generate_tables(file_fixture, config_kwargs, expected, request):\n    json = Json(**config_kwargs)\n    base_files = [request.getfixturevalue(file_fixture)]\n    files_iterables = [[file] for file in base_files]\n    generator = json._generate_tables(base_files=base_files, files_iterables=files_iterables)\n    pa_table = pa.concat_tables([table for _, table in generator])\n    out = Features.from_arrow_schema(pa_table.schema).decode_batch(pa_table.to_pydict())\n    assert out == expected\n\n\n@pytest.mark.parametrize(\n  
  \"file_fixture, config_kwargs\",\n    [\n        (\n            \"jsonl_file\",\n            {\"features\": Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"int64\"), \"missing_col\": Value(\"string\")})},\n        ),\n        (\n            \"json_file_with_list_of_dicts\",\n            {\"features\": Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"int64\"), \"missing_col\": Value(\"string\")})},\n        ),\n        (\n            \"json_file_with_list_of_dicts_field\",\n            {\n                \"field\": \"field3\",\n                \"features\": Features(\n                    {\"col_1\": Value(\"int64\"), \"col_2\": Value(\"int64\"), \"missing_col\": Value(\"string\")}\n                ),\n            },\n        ),\n    ],\n)\ndef test_json_generate_tables_with_missing_features(file_fixture, config_kwargs, request):\n    json = Json(**config_kwargs)\n    base_files = [request.getfixturevalue(file_fixture)]\n    files_iterables = [[file] for file in base_files]\n    generator = json._generate_tables(base_files=base_files, files_iterables=files_iterables)\n    pa_table = pa.concat_tables([table for _, table in generator])\n    assert pa_table.to_pydict() == {\"col_1\": [-1, 1, 10], \"col_2\": [None, 2, 20], \"missing_col\": [None, None, None]}\n\n\n@pytest.mark.parametrize(\n    \"file_fixture, config_kwargs\",\n    [\n        (\"json_file_with_list_of_dicts_with_sorted_columns\", {}),\n        (\"json_file_with_list_of_dicts_with_sorted_columns_field\", {\"field\": \"field3\"}),\n    ],\n)\ndef test_json_generate_tables_with_sorted_columns(file_fixture, config_kwargs, request):\n    json = Json(**config_kwargs)\n    base_files = [request.getfixturevalue(file_fixture)]\n    files_iterables = [[file] for file in base_files]\n    generator = json._generate_tables(base_files=base_files, files_iterables=files_iterables)\n    pa_table = pa.concat_tables([table for _, table in generator])\n    assert pa_table.column_names == [\"ID\", 
\"Language\", \"Topic\"]\n"
  },
  {
    "path": "tests/packaged_modules/test_lance.py",
    "content": "import lance\nimport numpy as np\nimport pyarrow as pa\nimport pytest\n\nfrom datasets import load_dataset\n\n\n@pytest.fixture\ndef lance_dataset(tmp_path) -> str:\n    data = pa.table(\n        {\n            \"id\": pa.array([1, 2, 3, 4]),\n            \"value\": pa.array([10.0, 20.0, 30.0, 40.0]),\n            \"text\": pa.array([\"a\", \"b\", \"c\", \"d\"]),\n            \"vector\": pa.FixedSizeListArray.from_arrays(pa.array([0.1] * 16, pa.float32()), list_size=4),\n        }\n    )\n    dataset_path = tmp_path / \"test_dataset.lance\"\n    lance.write_dataset(data, dataset_path)\n    return str(dataset_path)\n\n\n@pytest.fixture\ndef lance_hf_dataset(tmp_path) -> str:\n    data = pa.table(\n        {\n            \"id\": pa.array([1, 2, 3, 4]),\n            \"value\": pa.array([10.0, 20.0, 30.0, 40.0]),\n            \"text\": pa.array([\"a\", \"b\", \"c\", \"d\"]),\n            \"vector\": pa.FixedSizeListArray.from_arrays(pa.array([0.1] * 16, pa.float32()), list_size=4),\n        }\n    )\n    dataset_dir = tmp_path / \"data\" / \"train.lance\"\n    dataset_dir.parent.mkdir(parents=True, exist_ok=True)\n    lance.write_dataset(data, dataset_dir)\n    lance.write_dataset(data[:2], tmp_path / \"data\" / \"test.lance\")\n\n    with open(tmp_path / \"README.md\", \"w\") as f:\n        f.write(\"\"\"---\nsize_categories:\n- 1M<n<10M\nsource_datasets:\n- lance_test\n---\n    # Test Lance Dataset\\n\\n\n    # My Markdown is fancier\\n\n\"\"\")\n\n    return str(tmp_path)\n\n\ndef test_load_lance_dataset(lance_dataset):\n    dataset_dict = load_dataset(lance_dataset)\n    assert \"train\" in dataset_dict.keys()\n\n    dataset = dataset_dict[\"train\"]\n    assert \"id\" in dataset.column_names\n    assert \"value\" in dataset.column_names\n    assert \"text\" in dataset.column_names\n    assert \"vector\" in dataset.column_names\n    ids = dataset[\"id\"]\n    assert ids == [1, 2, 3, 4]\n\n\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef 
test_load_hf_dataset(lance_hf_dataset, streaming):\n    dataset_dict = load_dataset(lance_hf_dataset, columns=[\"id\", \"text\"], streaming=streaming)\n    assert \"train\" in dataset_dict.keys()\n    assert \"test\" in dataset_dict.keys()\n    dataset = dataset_dict[\"train\"]\n\n    assert \"id\" in dataset.column_names\n    assert \"text\" in dataset.column_names\n    assert \"value\" not in dataset.column_names\n    assert \"vector\" not in dataset.column_names\n    ids = list(dataset[\"id\"])\n    assert ids == [1, 2, 3, 4]\n    text = list(dataset[\"text\"])\n    assert text == [\"a\", \"b\", \"c\", \"d\"]\n    assert \"value\" not in dataset.column_names\n\n\ndef test_load_vectors(lance_hf_dataset):\n    dataset_dict = load_dataset(lance_hf_dataset, columns=[\"vector\"])\n    assert \"train\" in dataset_dict.keys()\n    dataset = dataset_dict[\"train\"]\n\n    assert \"vector\" in dataset.column_names\n    vectors = dataset.data[\"vector\"].combine_chunks().values.to_numpy(zero_copy_only=False)\n    assert np.allclose(vectors, np.full(16, 0.1))\n\n\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_load_lance_streaming_modes(lance_hf_dataset, streaming):\n    \"\"\"Test loading Lance dataset in both streaming and non-streaming modes.\"\"\"\n    from datasets import IterableDataset\n\n    ds = load_dataset(lance_hf_dataset, split=\"train\", streaming=streaming)\n    if streaming:\n        assert isinstance(ds, IterableDataset)\n        items = list(ds)\n    else:\n        items = list(ds)\n    assert len(items) == 4\n    assert all(\"id\" in item for item in items)\n"
  },
  {
    "path": "tests/packaged_modules/test_pandas.py",
    "content": "import pytest\n\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesList\nfrom datasets.packaged_modules.pandas.pandas import PandasConfig\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = PandasConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = PandasConfig(name=\"name\", data_files=data_files)\n"
  },
  {
    "path": "tests/packaged_modules/test_parquet.py",
    "content": "import pytest\n\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesList\nfrom datasets.packaged_modules.parquet.parquet import ParquetConfig\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = ParquetConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = ParquetConfig(name=\"name\", data_files=data_files)\n"
  },
  {
    "path": "tests/packaged_modules/test_spark.py",
    "content": "from unittest.mock import patch\n\nimport numpy as np\nimport pyspark\nimport pytest\n\nfrom datasets import Features, Image, IterableDataset\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesList\nfrom datasets.packaged_modules.spark.spark import (\n    Spark,\n    SparkConfig,\n    SparkExamplesIterable,\n    _generate_iterable_examples,\n)\n\nfrom ..utils import (\n    require_dill_gt_0_3_2,\n    require_not_windows,\n)\n\n\ndef _get_expected_row_ids_and_row_dicts_for_partition_order(df, partition_order):\n    expected_row_ids_and_row_dicts = []\n    for part_id in partition_order:\n        partition = df.where(f\"SPARK_PARTITION_ID() = {part_id}\").collect()\n        for row_idx, row in enumerate(partition):\n            expected_row_ids_and_row_dicts.append(((part_id, row_idx), row.asDict()))\n    return expected_row_ids_and_row_dicts\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = SparkConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = SparkConfig(name=\"name\", data_files=data_files)\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\ndef test_repartition_df_if_needed():\n    spark = pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    df = spark.range(100).repartition(1)\n    spark_builder = Spark(df)\n    # The id ints will be converted to Pyarrow int64s, so each row will be 8 bytes. 
Setting a max_shard_size of 16 means\n    # that each partition can hold 2 rows.\n    spark_builder._repartition_df_if_needed(max_shard_size=16)\n    # Given that the dataframe has 100 rows and each partition has 2 rows, we expect 50 partitions.\n    assert spark_builder.df.rdd.getNumPartitions() == 50\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\ndef test_generate_iterable_examples():\n    spark = pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    df = spark.range(10).repartition(2)\n    partition_order = [1, 0]\n    iterator = _generate_iterable_examples(df, partition_order)  # Reverse the partitions.\n    expected_row_ids_and_row_dicts = _get_expected_row_ids_and_row_dicts_for_partition_order(df, partition_order)\n\n    for i, (row_id, row_dict) in enumerate(iterator):\n        expected_row_id, expected_row_dict = expected_row_ids_and_row_dicts[i]\n        assert row_id == expected_row_id\n        assert row_dict == expected_row_dict\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\ndef test_spark_examples_iterable():\n    spark = pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    df = spark.range(10).repartition(1)\n    it = SparkExamplesIterable(df)\n    assert it.num_shards == 1\n    for i, (row_key, row_dict) in enumerate(it):\n        assert row_key == (0, i)\n        assert row_dict == {\"id\": i}\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\ndef test_spark_examples_iterable_shuffle():\n    spark = pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    df = spark.range(30).repartition(3)\n    # Mock the generator so that shuffle reverses the partition indices.\n    with patch(\"numpy.random.Generator\") as generator_mock:\n        generator_mock.shuffle.side_effect = lambda x: x.reverse()\n        expected_row_ids_and_row_dicts = _get_expected_row_ids_and_row_dicts_for_partition_order(df, [2, 1, 0])\n\n        shuffled_it 
= SparkExamplesIterable(df).shuffle_data_sources(generator_mock)\n        assert shuffled_it.num_shards == 3\n        for i, (row_id, row_dict) in enumerate(shuffled_it):\n            expected_row_id, expected_row_dict = expected_row_ids_and_row_dicts[i]\n            assert row_id == expected_row_id\n            assert row_dict == expected_row_dict\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\ndef test_spark_examples_iterable_shard():\n    spark = pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    df = spark.range(20).repartition(4)\n\n    # Partitions 0 and 2\n    shard_it_1 = SparkExamplesIterable(df).shard_data_sources(index=0, num_shards=2, contiguous=False)\n    assert shard_it_1.num_shards == 2\n    expected_row_ids_and_row_dicts_1 = _get_expected_row_ids_and_row_dicts_for_partition_order(df, [0, 2])\n    for i, (row_id, row_dict) in enumerate(shard_it_1):\n        expected_row_id, expected_row_dict = expected_row_ids_and_row_dicts_1[i]\n        assert row_id == expected_row_id\n        assert row_dict == expected_row_dict\n\n    # Partitions 1 and 3\n    shard_it_2 = SparkExamplesIterable(df).shard_data_sources(index=1, num_shards=2, contiguous=False)\n    assert shard_it_2.num_shards == 2\n    expected_row_ids_and_row_dicts_2 = _get_expected_row_ids_and_row_dicts_for_partition_order(df, [1, 3])\n    for i, (row_id, row_dict) in enumerate(shard_it_2):\n        expected_row_id, expected_row_dict = expected_row_ids_and_row_dicts_2[i]\n        assert row_id == expected_row_id\n        assert row_dict == expected_row_dict\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\ndef test_repartition_df_if_needed_max_num_df_rows():\n    spark = pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    df = spark.range(100).repartition(1)\n    spark_builder = Spark(df)\n    # Choose a small max_shard_size for maximum partitioning.\n    
spark_builder._repartition_df_if_needed(max_shard_size=1)\n    # The new number of partitions should not be greater than the number of rows.\n    assert spark_builder.df.rdd.getNumPartitions() == 100\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\ndef test_iterable_image_features():\n    spark = pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    img_bytes = np.zeros((10, 10, 3), dtype=np.uint8).tobytes()\n    data = [(img_bytes,)]\n    df = spark.createDataFrame(data, \"image: binary\")\n    features = Features({\"image\": Image(decode=False)})\n    dset = IterableDataset.from_spark(df, features=features)\n    item = next(iter(dset))\n    assert item.keys() == {\"image\"}\n    assert item == {\"image\": {\"path\": None, \"bytes\": img_bytes}}\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\ndef test_iterable_image_features_decode():\n    from io import BytesIO\n\n    import PIL.Image\n\n    spark = pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    img = PIL.Image.fromarray(np.zeros((10, 10, 3), dtype=np.uint8), \"RGB\")\n    buffer = BytesIO()\n    img.save(buffer, format=\"PNG\")\n    img_bytes = bytes(buffer.getvalue())\n    data = [(img_bytes,)]\n    df = spark.createDataFrame(data, \"image: binary\")\n    features = Features({\"image\": Image()})\n    dset = IterableDataset.from_spark(df, features=features)\n    item = next(iter(dset))\n    assert item.keys() == {\"image\"}\n    assert isinstance(item[\"image\"], PIL.Image.Image)\n"
  },
  {
    "path": "tests/packaged_modules/test_sql.py",
    "content": "import pytest\n\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesList\nfrom datasets.packaged_modules.sql.sql import SqlConfig\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = SqlConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = SqlConfig(name=\"name\", data_files=data_files)\n"
  },
  {
    "path": "tests/packaged_modules/test_text.py",
    "content": "import textwrap\n\nimport pyarrow as pa\nimport pytest\n\nfrom datasets import Features, Image\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesList\nfrom datasets.packaged_modules.text.text import Text, TextConfig\n\nfrom ..utils import require_pil\n\n\n@pytest.fixture\ndef text_file(tmp_path):\n    filename = tmp_path / \"text.txt\"\n    data = textwrap.dedent(\n        \"\"\"\\\n        Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.\n        Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.\n        Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.\n        Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n\n        Second paragraph:\n        Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.\n        Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.\n        Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.\n        Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n        \"\"\"\n    )\n    with open(filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(data)\n    return str(filename)\n\n\n@pytest.fixture\ndef text_file_with_image(tmp_path, image_file):\n    filename = tmp_path / \"text_with_image.txt\"\n    with open(filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(image_file)\n    return str(filename)\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = 
TextConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = TextConfig(name=\"name\", data_files=data_files)\n\n\n@pytest.mark.parametrize(\"keep_linebreaks\", [True, False])\ndef test_text_linebreaks(text_file, keep_linebreaks):\n    with open(text_file, encoding=\"utf-8\") as f:\n        expected_content = f.read().splitlines(keepends=keep_linebreaks)\n    text = Text(keep_linebreaks=keep_linebreaks, encoding=\"utf-8\")\n    generator = text._generate_tables(base_files=[text_file], files_iterables=[[text_file]])\n    generated_content = pa.concat_tables([table for _, table in generator]).to_pydict()[\"text\"]\n    assert generated_content == expected_content\n\n\n@require_pil\ndef test_text_cast_image(text_file_with_image):\n    with open(text_file_with_image, encoding=\"utf-8\") as f:\n        image_file = f.read().splitlines()[0]\n    text = Text(encoding=\"utf-8\", features=Features({\"image\": Image()}))\n    generator = text._generate_tables(base_files=[text_file_with_image], files_iterables=[[text_file_with_image]])\n    pa_table = pa.concat_tables([table for _, table in generator])\n    assert pa_table.schema.field(\"image\").type == Image()()\n    generated_content = pa_table.to_pydict()[\"image\"]\n    assert generated_content == [{\"path\": image_file, \"bytes\": None}]\n\n\n@pytest.mark.parametrize(\"sample_by\", [\"line\", \"paragraph\", \"document\"])\ndef test_text_sample_by(sample_by, text_file):\n    with open(text_file, encoding=\"utf-8\") as f:\n        expected_content = f.read()\n    if sample_by == \"line\":\n        expected_content = expected_content.splitlines()\n    elif sample_by == \"paragraph\":\n        expected_content = expected_content.split(\"\\n\\n\")\n    elif sample_by == 
\"document\":\n        expected_content = [expected_content]\n    text = Text(sample_by=sample_by, encoding=\"utf-8\", chunksize=100)\n    generator = text._generate_tables(base_files=[text_file], files_iterables=[[text_file]])\n    generated_content = pa.concat_tables([table for _, table in generator]).to_pydict()[\"text\"]\n    assert generated_content == expected_content\n"
  },
  {
    "path": "tests/packaged_modules/test_videofolder.py",
    "content": "import shutil\nimport textwrap\nfrom pathlib import Path\n\nimport pytest\n\nfrom datasets import ClassLabel, DownloadManager, Features, Video\nfrom datasets.builder import InvalidConfigName\nfrom datasets.data_files import DataFilesDict, DataFilesList, get_data_patterns\nfrom datasets.download.streaming_download_manager import StreamingDownloadManager\nfrom datasets.packaged_modules.videofolder.videofolder import VideoFolder, VideoFolderConfig\n\n\n@pytest.fixture\ndef cache_dir(tmp_path):\n    return str(tmp_path / \"videofolder_cache_dir\")\n\n\n@pytest.fixture\ndef video_file_path():\n    return Path(__file__).resolve().parents[1] / \"features\" / \"data\" / \"test_video_66x50.mov\"\n\n\n@pytest.fixture\ndef data_files_with_labels_no_metadata(tmp_path, video_file_path):\n    data_dir = tmp_path / \"data_files_with_labels_no_metadata\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    subdir_class_0 = data_dir / \"cat\"\n    subdir_class_0.mkdir(parents=True, exist_ok=True)\n    subdir_class_1 = data_dir / \"dog\"\n    subdir_class_1.mkdir(parents=True, exist_ok=True)\n\n    video_filename = subdir_class_0 / \"video_cat.mov\"\n    shutil.copyfile(video_file_path, video_filename)\n    video_filename2 = subdir_class_1 / \"video_dog.mov\"\n    shutil.copyfile(video_file_path, video_filename2)\n\n    data_files = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n    return data_files\n\n\n@pytest.fixture\ndef video_file_with_metadata(tmp_path, video_file_path):\n    video_filename = tmp_path / \"video.mov\"\n    shutil.copyfile(video_file_path, video_filename)\n    metadata_filename = tmp_path / \"metadata.jsonl\"\n    metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"video.mov\", \"caption\": \"A short video\"}\n        \"\"\"\n    )\n    with open(metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(metadata)\n    return str(video_filename), 
str(metadata_filename)\n\n\n@pytest.fixture\ndef data_files_with_zip_archives(tmp_path, video_file_path):\n    data_dir = tmp_path / \"videofolder_data_dir_with_zip_archives\"\n    data_dir.mkdir(parents=True, exist_ok=True)\n    archive_dir = data_dir / \"archive\"\n    archive_dir.mkdir(parents=True, exist_ok=True)\n    subdir = archive_dir / \"subdir\"\n    subdir.mkdir(parents=True, exist_ok=True)\n\n    video_filename = archive_dir / \"video.mov\"\n    shutil.copyfile(video_file_path, video_filename)\n    video_filename2 = subdir / \"video2.mov\"\n    shutil.copyfile(video_file_path, video_filename2)\n\n    metadata_filename = archive_dir / \"metadata.jsonl\"\n    metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"video.mov\", \"caption\": \"First video\"}\n        {\"file_name\": \"subdir/video2.mov\", \"caption\": \"Second video\"}\n        \"\"\"\n    )\n    with open(metadata_filename, \"w\", encoding=\"utf-8\") as f:\n        f.write(metadata)\n\n    shutil.make_archive(archive_dir, \"zip\", archive_dir)\n    shutil.rmtree(str(archive_dir))\n\n    data_files = DataFilesDict.from_patterns(get_data_patterns(str(data_dir)), data_dir.as_posix())\n    assert len(data_files) == 1\n    assert len(data_files[\"train\"]) == 1\n    return data_files\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = VideoFolderConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = VideoFolderConfig(name=\"name\", data_files=data_files)\n\n\ndef test_generate_examples_with_labels(data_files_with_labels_no_metadata, cache_dir):\n    videofolder = VideoFolder(data_files=data_files_with_labels_no_metadata, 
cache_dir=cache_dir, drop_labels=False)\n    gen_kwargs = videofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    assert videofolder.info.features == Features({\"video\": Video(), \"label\": ClassLabel(names=[\"cat\", \"dog\"])})\n    generator = videofolder._generate_examples(**gen_kwargs)\n    assert all(example[\"label\"] in {\"cat\", \"dog\"} for _, example in generator)\n\n\ndef test_generate_examples_with_metadata(video_file_with_metadata, cache_dir):\n    video_file, metadata_file = video_file_with_metadata\n    videofolder = VideoFolder(data_files=[video_file, metadata_file], cache_dir=cache_dir)\n    gen_kwargs = videofolder._split_generators(StreamingDownloadManager())[0].gen_kwargs\n    generated_examples = [example for _, example in videofolder._generate_examples(**gen_kwargs)]\n    assert len(generated_examples) == 1\n    assert generated_examples[0].keys() == {\"video\", \"caption\"}\n    assert generated_examples[0][\"video\"].endswith(\"video.mov\")\n    assert generated_examples[0][\"caption\"] == \"A short video\"\n\n\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_with_zip_archives):\n    videofolder = VideoFolder(data_files=data_files_with_zip_archives, cache_dir=cache_dir)\n    download_manager = StreamingDownloadManager() if streaming else DownloadManager()\n    generated_splits = videofolder._split_generators(download_manager)\n    for (split, files), generated_split in zip(data_files_with_zip_archives.items(), generated_splits):\n        assert split == generated_split.name\n        num_of_archives = len(files)\n        expected_num_of_examples = 2 * num_of_archives\n        generated_examples = list(videofolder._generate_examples(**generated_split.gen_kwargs))\n        assert len(generated_examples) == expected_num_of_examples\n        assert len({example[\"video\"] for _, example in generated_examples}) == 
expected_num_of_examples\n        assert len({example[\"caption\"] for _, example in generated_examples}) == expected_num_of_examples\n"
  },
  {
    "path": "tests/packaged_modules/test_webdataset.py",
    "content": "import json\nimport tarfile\nfrom pathlib import Path\n\nimport pytest\n\nfrom datasets import Audio, DownloadManager, Features, Image, List, Value, Video\nfrom datasets.packaged_modules.webdataset.webdataset import WebDataset\n\nfrom ..utils import (\n    require_numpy1_on_windows,\n    require_pil,\n    require_torch,\n    require_torchcodec,\n)\n\n\n@pytest.fixture\ndef gzipped_text_wds_file(tmp_path, text_gz_path):\n    filename = tmp_path / \"file.tar\"\n    num_examples = 3\n    with tarfile.open(str(filename), \"w\") as f:\n        for example_idx in range(num_examples):\n            f.add(text_gz_path, f\"{example_idx:05d}.txt.gz\")\n    return str(filename)\n\n\n@pytest.fixture\ndef image_wds_file(tmp_path, image_file):\n    json_file = tmp_path / \"data.json\"\n    filename = tmp_path / \"file.tar\"\n    num_examples = 3\n    with json_file.open(\"w\", encoding=\"utf-8\") as f:\n        f.write(json.dumps({\"caption\": \"this is an image\"}))\n    with tarfile.open(str(filename), \"w\") as f:\n        for example_idx in range(num_examples):\n            f.add(json_file, f\"{example_idx:05d}.json\")\n            f.add(image_file, f\"{example_idx:05d}.jpg\")\n    return str(filename)\n\n\n@pytest.fixture\ndef upper_lower_case_file(tmp_path):\n    tar_path = tmp_path / \"file.tar\"\n    num_examples = 3\n    variants = [\n        (\"INFO1\", \"json\"),\n        (\"info2\", \"json\"),\n        (\"info3\", \"JSON\"),\n        (\"info3\", \"json\"),  # should probably remove if testing on a case insensitive filesystem\n    ]\n    with tarfile.open(tar_path, \"w\") as tar:\n        for example_idx in range(num_examples):\n            example_name = f\"{example_idx:05d}_{'a' if example_idx % 2 else 'A'}\"\n            for tag, ext in variants:\n                caption_path = tmp_path / f\"{example_name}.{tag}.{ext}\"\n                caption_text = {\"caption\": f\"caption for {example_name}.{tag}.{ext}\"}\n                
caption_path.write_text(json.dumps(caption_text), encoding=\"utf-8\")\n                tar.add(caption_path, arcname=f\"{example_name}.{tag}.{ext}\")\n    return str(tar_path)\n\n\n@pytest.fixture\ndef audio_wds_file(tmp_path, audio_file):\n    json_file = tmp_path / \"data.json\"\n    filename = tmp_path / \"file.tar\"\n    num_examples = 3\n    with json_file.open(\"w\", encoding=\"utf-8\") as f:\n        f.write(json.dumps({\"transcript\": \"this is a transcript\"}))\n    with tarfile.open(str(filename), \"w\") as f:\n        for example_idx in range(num_examples):\n            f.add(json_file, f\"{example_idx:05d}.json\")\n            f.add(audio_file, f\"{example_idx:05d}.wav\")\n    return str(filename)\n\n\n@pytest.fixture\ndef video_wds_file(tmp_path):\n    json_file = tmp_path / \"data.json\"\n    filename = tmp_path / \"file.tar\"\n    video_file = Path(__file__).resolve().parents[1] / \"features\" / \"data\" / \"test_video_66x50.mov\"\n    num_examples = 3\n    with json_file.open(\"w\", encoding=\"utf-8\") as f:\n        f.write(json.dumps({\"caption\": \"this is a video\"}))\n    with tarfile.open(str(filename), \"w\") as f:\n        for example_idx in range(num_examples):\n            f.add(json_file, f\"{example_idx:05d}.json\")\n            f.add(video_file, f\"{example_idx:05d}.mov\")\n    return str(filename)\n\n\n@pytest.fixture\ndef bad_wds_file(tmp_path, image_file, text_file):\n    json_file = tmp_path / \"data.json\"\n    filename = tmp_path / \"bad_file.tar\"\n    with json_file.open(\"w\", encoding=\"utf-8\") as f:\n        f.write(json.dumps({\"caption\": \"this is an image\"}))\n    with tarfile.open(str(filename), \"w\") as f:\n        f.add(image_file)\n        f.add(json_file)\n    return str(filename)\n\n\n@pytest.fixture\ndef tensor_wds_file(tmp_path, tensor_file):\n    json_file = tmp_path / \"data.json\"\n    filename = tmp_path / \"file.tar\"\n    num_examples = 3\n    with json_file.open(\"w\", encoding=\"utf-8\") as f:\n        
f.write(json.dumps({\"text\": \"this is a text\"}))\n    with tarfile.open(str(filename), \"w\") as f:\n        for example_idx in range(num_examples):\n            f.add(json_file, f\"{example_idx:05d}.json\")\n            f.add(tensor_file, f\"{example_idx:05d}.pth\")\n    return str(filename)\n\n\n@require_pil\ndef test_gzipped_text_webdataset(gzipped_text_wds_file, text_path):\n    data_files = {\"train\": [gzipped_text_wds_file]}\n    webdataset = WebDataset(data_files=data_files)\n    split_generators = webdataset._split_generators(DownloadManager())\n    assert webdataset.info.features == Features(\n        {\n            \"__key__\": Value(\"string\"),\n            \"__url__\": Value(\"string\"),\n            \"txt.gz\": Value(\"string\"),\n        }\n    )\n    assert len(split_generators) == 1\n    split_generator = split_generators[0]\n    assert split_generator.name == \"train\"\n    generator = webdataset._generate_examples(**split_generator.gen_kwargs)\n    _, examples = zip(*generator)\n    assert len(examples) == 3\n    assert isinstance(examples[0][\"txt.gz\"], str)\n    with open(text_path, \"r\") as f:\n        assert examples[0][\"txt.gz\"].replace(\"\\r\\n\", \"\\n\") == f.read().replace(\"\\r\\n\", \"\\n\")\n\n\n@require_pil\ndef test_image_webdataset(image_wds_file):\n    import PIL.Image\n\n    data_files = {\"train\": [image_wds_file]}\n    webdataset = WebDataset(data_files=data_files)\n    split_generators = webdataset._split_generators(DownloadManager())\n    assert webdataset.info.features == Features(\n        {\n            \"__key__\": Value(\"string\"),\n            \"__url__\": Value(\"string\"),\n            \"json\": {\"caption\": Value(\"string\")},\n            \"jpg\": Image(),\n        }\n    )\n    assert len(split_generators) == 1\n    split_generator = split_generators[0]\n    assert split_generator.name == \"train\"\n    generator = webdataset._generate_examples(**split_generator.gen_kwargs)\n    _, examples = 
zip(*generator)\n    assert len(examples) == 3\n    assert isinstance(examples[0][\"json\"], dict)\n    assert isinstance(examples[0][\"json\"][\"caption\"], str)\n    assert isinstance(examples[0][\"jpg\"], dict)  # keep encoded to avoid unecessary copies\n    encoded = webdataset.info.features.encode_example(examples[0])\n    decoded = webdataset.info.features.decode_example(encoded)\n    assert isinstance(decoded[\"json\"], dict)\n    assert isinstance(decoded[\"json\"][\"caption\"], str)\n    assert isinstance(decoded[\"jpg\"], PIL.Image.Image)\n\n\ndef test_upper_lower_case(upper_lower_case_file):\n    variants = [\n        (\"INFO1\", \"json\"),\n        (\"info2\", \"json\"),\n        (\"info3\", \"JSON\"),\n        (\"info3\", \"json\"),\n    ]\n\n    data_files = {\"train\": [upper_lower_case_file]}\n    webdataset = WebDataset(data_files=data_files)\n    split_generators = webdataset._split_generators(DownloadManager())\n\n    variant_keys = [f\"{tag}.{ext}\" for tag, ext in variants]\n    assert webdataset.info.features == Features(\n        {\n            \"__key__\": Value(\"string\"),\n            \"__url__\": Value(\"string\"),\n            **{k: {\"caption\": Value(\"string\")} for k in variant_keys},\n        }\n    )\n\n    assert len(split_generators) == 1\n    split_generator = split_generators[0]\n    assert split_generator.name == \"train\"\n    generator = webdataset._generate_examples(**split_generator.gen_kwargs)\n    _, examples = zip(*generator)\n\n    assert len(examples) == 3\n    for example_idx, example in enumerate(examples):\n        example_name = example[\"__key__\"]\n        expected_example_name = f\"{example_idx:05d}_{'a' if example_idx % 2 else 'A'}\"\n\n        assert example_name == expected_example_name\n        for key in variant_keys:\n            assert isinstance(example[key], dict)\n            assert example[key][\"caption\"] == f\"caption for {example_name}.{key}\"\n\n        encoded = 
webdataset.info.features.encode_example(example)\n        decoded = webdataset.info.features.decode_example(encoded)\n        for key in variant_keys:\n            assert decoded[key][\"caption\"] == example[key][\"caption\"]\n\n\n@require_pil\ndef test_image_webdataset_missing_keys(image_wds_file):\n    import PIL.Image\n\n    data_files = {\"train\": [image_wds_file]}\n    features = Features(\n        {\n            \"__key__\": Value(\"string\"),\n            \"__url__\": Value(\"string\"),\n            \"json\": {\"caption\": Value(\"string\")},\n            \"jpg\": Image(),\n            \"jpeg\": Image(),  # additional field\n            \"txt\": Value(\"string\"),  # additional field\n        }\n    )\n    webdataset = WebDataset(data_files=data_files, features=features)\n    split_generators = webdataset._split_generators(DownloadManager())\n    assert webdataset.info.features == features\n    split_generator = split_generators[0]\n    assert split_generator.name == \"train\"\n    generator = webdataset._generate_examples(**split_generator.gen_kwargs)\n    _, example = next(iter(generator))\n    encoded = webdataset.info.features.encode_example(example)\n    decoded = webdataset.info.features.decode_example(encoded)\n    assert isinstance(decoded[\"json\"], dict)\n    assert isinstance(decoded[\"json\"][\"caption\"], str)\n    assert isinstance(decoded[\"jpg\"], PIL.Image.Image)\n    assert decoded[\"jpeg\"] is None\n    assert decoded[\"txt\"] is None\n\n\n@require_torchcodec\ndef test_audio_webdataset(audio_wds_file):\n    from torchcodec.decoders import AudioDecoder\n\n    data_files = {\"train\": [audio_wds_file]}\n    webdataset = WebDataset(data_files=data_files)\n    split_generators = webdataset._split_generators(DownloadManager())\n    assert webdataset.info.features == Features(\n        {\n            \"__key__\": Value(\"string\"),\n            \"__url__\": Value(\"string\"),\n            \"json\": {\"transcript\": Value(\"string\")},\n         
   \"wav\": Audio(),\n        }\n    )\n    assert len(split_generators) == 1\n    split_generator = split_generators[0]\n    assert split_generator.name == \"train\"\n    generator = webdataset._generate_examples(**split_generator.gen_kwargs)\n    _, examples = zip(*generator)\n    assert len(examples) == 3\n    assert isinstance(examples[0][\"json\"], dict)\n    assert isinstance(examples[0][\"json\"][\"transcript\"], str)\n    assert isinstance(examples[0][\"wav\"], dict)\n    assert isinstance(examples[0][\"wav\"][\"bytes\"], bytes)  # keep encoded to avoid unecessary copies\n    encoded = webdataset.info.features.encode_example(examples[0])\n    decoded = webdataset.info.features.decode_example(encoded)\n    assert isinstance(decoded[\"json\"], dict)\n    assert isinstance(decoded[\"json\"][\"transcript\"], str)\n    assert isinstance(decoded[\"wav\"], AudioDecoder)\n\n\ndef test_video_webdataset(video_wds_file):\n    data_files = {\"train\": [video_wds_file]}\n    webdataset = WebDataset(data_files=data_files)\n    split_generators = webdataset._split_generators(DownloadManager())\n    assert webdataset.info.features == Features(\n        {\n            \"__key__\": Value(\"string\"),\n            \"__url__\": Value(\"string\"),\n            \"json\": {\"caption\": Value(\"string\")},\n            \"mov\": Video(),\n        }\n    )\n    assert len(split_generators) == 1\n    split_generator = split_generators[0]\n    assert split_generator.name == \"train\"\n    generator = webdataset._generate_examples(**split_generator.gen_kwargs)\n    _, examples = zip(*generator)\n    assert len(examples) == 3\n    assert isinstance(examples[0][\"json\"], dict)\n    assert isinstance(examples[0][\"json\"][\"caption\"], str)\n    assert isinstance(examples[0][\"mov\"], bytes)\n\n\ndef test_webdataset_errors_on_bad_file(bad_wds_file):\n    data_files = {\"train\": [bad_wds_file]}\n    webdataset = WebDataset(data_files=data_files)\n    with pytest.raises(ValueError):\n     
   webdataset._split_generators(DownloadManager())\n\n\n@require_pil\ndef test_webdataset_with_features(image_wds_file):\n    import PIL.Image\n\n    data_files = {\"train\": [image_wds_file]}\n    features = Features(\n        {\n            \"__key__\": Value(\"string\"),\n            \"__url__\": Value(\"string\"),\n            \"json\": {\"caption\": Value(\"string\"), \"additional_field\": Value(\"int64\")},\n            \"jpg\": Image(),\n        }\n    )\n    webdataset = WebDataset(data_files=data_files, features=features)\n    split_generators = webdataset._split_generators(DownloadManager())\n    assert webdataset.info.features == features\n    split_generator = split_generators[0]\n    assert split_generator.name == \"train\"\n    generator = webdataset._generate_examples(**split_generator.gen_kwargs)\n    _, example = next(iter(generator))\n    encoded = webdataset.info.features.encode_example(example)\n    decoded = webdataset.info.features.decode_example(encoded)\n    assert decoded[\"json\"][\"additional_field\"] is None\n    assert isinstance(decoded[\"json\"], dict)\n    assert isinstance(decoded[\"json\"][\"caption\"], str)\n    assert isinstance(decoded[\"jpg\"], PIL.Image.Image)\n\n\n@require_numpy1_on_windows\n@require_torch\ndef test_tensor_webdataset(tensor_wds_file):\n    import torch\n\n    data_files = {\"train\": [tensor_wds_file]}\n    webdataset = WebDataset(data_files=data_files)\n    split_generators = webdataset._split_generators(DownloadManager())\n    assert webdataset.info.features == Features(\n        {\n            \"__key__\": Value(\"string\"),\n            \"__url__\": Value(\"string\"),\n            \"json\": {\"text\": Value(\"string\")},\n            \"pth\": List(Value(\"float32\")),\n        }\n    )\n    assert len(split_generators) == 1\n    split_generator = split_generators[0]\n    assert split_generator.name == \"train\"\n    generator = webdataset._generate_examples(**split_generator.gen_kwargs)\n    _, examples = 
zip(*generator)\n    assert len(examples) == 3\n    assert isinstance(examples[0][\"json\"], dict)\n    assert isinstance(examples[0][\"json\"][\"text\"], str)\n    assert isinstance(examples[0][\"pth\"], torch.Tensor)  # keep encoded to avoid unnecessary copies\n    encoded = webdataset.info.features.encode_example(examples[0])\n    decoded = webdataset.info.features.decode_example(encoded)\n    assert isinstance(decoded[\"json\"], dict)\n    assert isinstance(decoded[\"json\"][\"text\"], str)\n    assert isinstance(decoded[\"pth\"], list)\n"
  },
  {
    "path": "tests/test_arrow_dataset.py",
    "content": "import asyncio\nimport contextlib\nimport copy\nimport itertools\nimport json\nimport os\nimport pickle\nimport re\nimport sys\nimport tempfile\nimport time\nfrom functools import partial\nfrom pathlib import Path\nfrom unittest import TestCase\nfrom unittest.mock import MagicMock, patch\n\nimport numpy as np\nimport numpy.testing as npt\nimport pandas as pd\nimport pyarrow as pa\nimport pytest\nfrom absl.testing import parameterized\nfrom fsspec.core import strip_protocol\nfrom packaging import version\n\nimport datasets.arrow_dataset\nimport datasets.config\nfrom datasets import concatenate_datasets, interleave_datasets, load_from_disk\nfrom datasets.arrow_dataset import Dataset, transmit_format, update_metadata_with_features\nfrom datasets.dataset_dict import DatasetDict\nfrom datasets.features import (\n    Array2D,\n    Array3D,\n    ClassLabel,\n    Features,\n    Image,\n    Json,\n    LargeList,\n    List,\n    Translation,\n    TranslationVariableLanguages,\n    Value,\n)\nfrom datasets.info import DatasetInfo\nfrom datasets.iterable_dataset import IterableDataset\nfrom datasets.splits import NamedSplit\nfrom datasets.table import ConcatenationTable, InMemoryTable, MemoryMappedTable\nfrom datasets.utils.logging import INFO, get_logger\nfrom datasets.utils.py_utils import temp_seed\n\nfrom .utils import (\n    assert_arrow_memory_doesnt_increase,\n    assert_arrow_memory_increases,\n    require_dill_gt_0_3_2,\n    require_jax,\n    require_not_windows,\n    require_numpy1_on_windows,\n    require_pil,\n    require_polars,\n    require_pyspark,\n    require_sqlalchemy,\n    require_tf,\n    require_torch,\n    require_transformers,\n    set_current_working_directory_to_temp_dir,\n)\n\n\nclass PickableMagicMock(MagicMock):\n    def __reduce__(self):\n        return MagicMock, ()\n\n\nclass Unpicklable:\n    def __init__(self, **kwargs):\n        for key, value in kwargs.items():\n            setattr(self, key, value)\n\n    def 
__getstate__(self):\n        raise pickle.PicklingError()\n\n\ndef picklable_map_function(x):\n    return {\"id\": int(x[\"filename\"].split(\"_\")[-1])}\n\n\ndef picklable_map_function_with_indices(x, i):\n    return {\"id\": i}\n\n\ndef picklable_map_function_with_rank(x, r):\n    return {\"rank\": r}\n\n\ndef picklable_map_function_with_indices_and_rank(x, i, r):\n    return {\"id\": i, \"rank\": r}\n\n\ndef picklable_filter_function(x):\n    return int(x[\"filename\"].split(\"_\")[-1]) < 10\n\n\ndef picklable_filter_function_with_rank(x, r):\n    return r == 0\n\n\ndef assert_arrow_metadata_are_synced_with_dataset_features(dataset: Dataset):\n    assert dataset.data.schema.metadata is not None\n    assert b\"huggingface\" in dataset.data.schema.metadata\n    metadata = json.loads(dataset.data.schema.metadata[b\"huggingface\"].decode())\n    assert \"info\" in metadata\n    features = DatasetInfo.from_dict(metadata[\"info\"]).features\n    assert features is not None\n    assert features == dataset.features\n    assert features == Features.from_arrow_schema(dataset.data.schema)\n    assert list(features) == dataset.data.column_names\n    assert list(features) == list(dataset.features)\n\n\nIN_MEMORY_PARAMETERS = [\n    {\"testcase_name\": name, \"in_memory\": im} for im, name in [(True, \"in_memory\"), (False, \"on_disk\")]\n]\n\nSTRING_FROM_PANDAS = \"large_string\" if datasets.config.PANDAS_VERSION.major >= 3 else \"string\"\n\n\n@parameterized.named_parameters(IN_MEMORY_PARAMETERS)\nclass BaseDatasetTest(TestCase):\n    @pytest.fixture(autouse=True)\n    def inject_fixtures(self, caplog, set_sqlalchemy_silence_uber_warning):\n        self._caplog = caplog\n\n    def _create_dummy_dataset(\n        self,\n        in_memory: bool,\n        tmp_dir: str,\n        multiple_columns=False,\n        array_features=False,\n        nested_features=False,\n        int_to_float=False,\n    ) -> Dataset:\n        assert int(multiple_columns) + int(array_features) + 
int(nested_features) < 2\n        if multiple_columns:\n            data = {\"col_1\": [3, 2, 1, 0], \"col_2\": [\"a\", \"b\", \"c\", \"d\"], \"col_3\": [False, True, False, True]}\n            dset = Dataset.from_dict(data)\n        elif array_features:\n            data = {\n                \"col_1\": [[[True, False], [False, True]]] * 4,  # 2D\n                \"col_2\": [[[[\"a\", \"b\"], [\"c\", \"d\"]], [[\"e\", \"f\"], [\"g\", \"h\"]]]] * 4,  # 3D array\n                \"col_3\": [[3, 2, 1, 0]] * 4,  # List\n            }\n            features = Features(\n                {\n                    \"col_1\": Array2D(shape=(2, 2), dtype=\"bool\"),\n                    \"col_2\": Array3D(shape=(2, 2, 2), dtype=\"string\"),\n                    \"col_3\": List(Value(\"int64\")),\n                }\n            )\n            dset = Dataset.from_dict(data, features=features)\n        elif nested_features:\n            data = {\"nested\": [{\"a\": i, \"x\": i * 10, \"c\": i * 100} for i in range(1, 11)]}\n            features = Features({\"nested\": {\"a\": Value(\"int64\"), \"x\": Value(\"int64\"), \"c\": Value(\"int64\")}})\n            dset = Dataset.from_dict(data, features=features)\n        elif int_to_float:\n            data = {\n                \"text\": [\"text1\", \"text2\", \"text3\", \"text4\"],\n                \"labels\": [[1, 1, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 1, 1], [0, 0, 0, 1, 0]],\n            }\n            dset = Dataset.from_dict(data)\n        else:\n            dset = Dataset.from_dict({\"filename\": [\"my_name-train\" + \"_\" + str(x) for x in np.arange(30).tolist()]})\n        if not in_memory:\n            dset = self._to(in_memory, tmp_dir, dset)\n        return dset\n\n    def _to(self, in_memory, tmp_dir, *datasets):\n        if in_memory:\n            datasets = [dataset.map(keep_in_memory=True) for dataset in datasets]\n        else:\n            start = 0\n            while os.path.isfile(os.path.join(tmp_dir, 
f\"dataset{start}.arrow\")):\n                start += 1\n            datasets = [\n                dataset.map(cache_file_name=os.path.join(tmp_dir, f\"dataset{start + i}.arrow\"))\n                for i, dataset in enumerate(datasets)\n            ]\n        return datasets if len(datasets) > 1 else datasets[0]\n\n    def test_dummy_dataset(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                self.assertEqual(dset[0][\"filename\"], \"my_name-train_0\")\n                self.assertEqual(dset[\"filename\"][0], \"my_name-train_0\")\n\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                self.assertDictEqual(\n                    dset.features,\n                    Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"string\"), \"col_3\": Value(\"bool\")}),\n                )\n                self.assertEqual(dset[0][\"col_1\"], 3)\n                self.assertEqual(dset[\"col_1\"][0], 3)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, array_features=True) as dset:\n                self.assertDictEqual(\n                    dset.features,\n                    Features(\n                        {\n                            \"col_1\": Array2D(shape=(2, 2), dtype=\"bool\"),\n                            \"col_2\": Array3D(shape=(2, 2, 2), dtype=\"string\"),\n                            \"col_3\": List(Value(\"int64\")),\n                        }\n                    ),\n                )\n                self.assertEqual(dset[0][\"col_2\"], [[[\"a\", \"b\"], [\"c\", \"d\"]], [[\"e\", \"f\"], [\"g\", \"h\"]]])\n                self.assertEqual(dset[\"col_2\"][0], [[[\"a\", \"b\"], [\"c\", \"d\"]], [[\"e\", \"f\"], [\"g\", 
\"h\"]]])\n\n    def test_dataset_getitem(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                self.assertEqual(dset[0][\"filename\"], \"my_name-train_0\")\n                self.assertEqual(dset[\"filename\"][0], \"my_name-train_0\")\n\n                self.assertEqual(dset[-1][\"filename\"], \"my_name-train_29\")\n                self.assertEqual(dset[\"filename\"][-1], \"my_name-train_29\")\n\n                self.assertListEqual(dset[:2][\"filename\"], [\"my_name-train_0\", \"my_name-train_1\"])\n                self.assertListEqual(dset[\"filename\"][:2], [\"my_name-train_0\", \"my_name-train_1\"])\n\n                self.assertEqual(dset[:-1][\"filename\"][-1], \"my_name-train_28\")\n                self.assertEqual(dset[\"filename\"][:-1][-1], \"my_name-train_28\")\n\n                self.assertListEqual(dset[[0, -1]][\"filename\"], [\"my_name-train_0\", \"my_name-train_29\"])\n                self.assertListEqual(dset[range(0, -2, -1)][\"filename\"], [\"my_name-train_0\", \"my_name-train_29\"])\n                self.assertListEqual(dset[np.array([0, -1])][\"filename\"], [\"my_name-train_0\", \"my_name-train_29\"])\n                self.assertListEqual(dset[pd.Series([0, -1])][\"filename\"], [\"my_name-train_0\", \"my_name-train_29\"])\n\n                with dset.select(range(2)) as dset_subset:\n                    self.assertListEqual(dset_subset[-1:][\"filename\"], [\"my_name-train_1\"])\n                    self.assertListEqual(dset_subset[\"filename\"][-1:], [\"my_name-train_1\"])\n\n    def test_dummy_dataset_deepcopy(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir).select(range(10)) as dset:\n                with assert_arrow_memory_doesnt_increase():\n                    dset2 = copy.deepcopy(dset)\n                # don't copy the underlying arrow 
data using memory\n                self.assertEqual(len(dset2), 10)\n                self.assertDictEqual(dset2.features, Features({\"filename\": Value(\"string\")}))\n                self.assertEqual(dset2[0][\"filename\"], \"my_name-train_0\")\n                self.assertEqual(dset2[\"filename\"][0], \"my_name-train_0\")\n                del dset2\n\n    def test_dummy_dataset_pickle(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            tmp_file = os.path.join(tmp_dir, \"dset.pt\")\n\n            with self._create_dummy_dataset(in_memory, tmp_dir).select(range(0, 10, 2)) as dset:\n                with open(tmp_file, \"wb\") as f:\n                    pickle.dump(dset, f)\n\n            with open(tmp_file, \"rb\") as f:\n                with pickle.load(f) as dset:\n                    self.assertEqual(len(dset), 5)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertEqual(dset[0][\"filename\"], \"my_name-train_0\")\n                    self.assertEqual(dset[\"filename\"][0], \"my_name-train_0\")\n\n            with self._create_dummy_dataset(in_memory, tmp_dir).select(\n                range(0, 10, 2), indices_cache_file_name=os.path.join(tmp_dir, \"ind.arrow\")\n            ) as dset:\n                if not in_memory:\n                    dset._data.table = Unpicklable()\n                dset._indices.table = Unpicklable()\n                with open(tmp_file, \"wb\") as f:\n                    pickle.dump(dset, f)\n\n            with open(tmp_file, \"rb\") as f:\n                with pickle.load(f) as dset:\n                    self.assertEqual(len(dset), 5)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertEqual(dset[0][\"filename\"], \"my_name-train_0\")\n                    self.assertEqual(dset[\"filename\"][0], \"my_name-train_0\")\n\n    def 
test_dummy_dataset_serialize(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with set_current_working_directory_to_temp_dir():\n                with self._create_dummy_dataset(in_memory, tmp_dir).select(range(10)) as dset:\n                    dataset_path = \"my_dataset\"  # rel path\n                    dset.save_to_disk(dataset_path)\n\n                with Dataset.load_from_disk(dataset_path) as dset:\n                    self.assertEqual(len(dset), 10)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertEqual(dset[0][\"filename\"], \"my_name-train_0\")\n                    self.assertEqual(dset[\"filename\"][0], \"my_name-train_0\")\n                    expected = dset.to_dict()\n\n            with self._create_dummy_dataset(in_memory, tmp_dir).select(range(10)) as dset:\n                dataset_path = os.path.join(tmp_dir, \"my_dataset\")  # abs path\n                dset.save_to_disk(dataset_path)\n\n            with Dataset.load_from_disk(dataset_path) as dset:\n                self.assertEqual(len(dset), 10)\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                self.assertEqual(dset[0][\"filename\"], \"my_name-train_0\")\n                self.assertEqual(dset[\"filename\"][0], \"my_name-train_0\")\n\n            with self._create_dummy_dataset(in_memory, tmp_dir).select(\n                range(10), indices_cache_file_name=os.path.join(tmp_dir, \"ind.arrow\")\n            ) as dset:\n                with assert_arrow_memory_doesnt_increase():\n                    dset.save_to_disk(dataset_path)\n\n            with Dataset.load_from_disk(dataset_path) as dset:\n                self.assertEqual(len(dset), 10)\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                self.assertEqual(dset[0][\"filename\"], 
\"my_name-train_0\")\n                self.assertEqual(dset[\"filename\"][0], \"my_name-train_0\")\n\n            with self._create_dummy_dataset(in_memory, tmp_dir, nested_features=True) as dset:\n                with assert_arrow_memory_doesnt_increase():\n                    dset.save_to_disk(dataset_path)\n\n            with Dataset.load_from_disk(dataset_path) as dset:\n                self.assertEqual(len(dset), 10)\n                self.assertDictEqual(\n                    dset.features,\n                    Features({\"nested\": {\"a\": Value(\"int64\"), \"x\": Value(\"int64\"), \"c\": Value(\"int64\")}}),\n                )\n                self.assertDictEqual(dset[0][\"nested\"], {\"a\": 1, \"c\": 100, \"x\": 10})\n                self.assertDictEqual(dset[\"nested\"][0], {\"a\": 1, \"c\": 100, \"x\": 10})\n\n            with self._create_dummy_dataset(in_memory, tmp_dir).select(range(10)) as dset:\n                with assert_arrow_memory_doesnt_increase():\n                    dset.save_to_disk(dataset_path, num_shards=4)\n\n            with Dataset.load_from_disk(dataset_path) as dset:\n                self.assertEqual(len(dset), 10)\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                self.assertDictEqual(dset.to_dict(), expected)\n                self.assertEqual(len(dset.cache_files), 4)\n\n            with self._create_dummy_dataset(in_memory, tmp_dir).select(range(10)) as dset:\n                with assert_arrow_memory_doesnt_increase():\n                    dset.save_to_disk(dataset_path, num_proc=2)\n\n            with Dataset.load_from_disk(dataset_path) as dset:\n                self.assertEqual(len(dset), 10)\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                self.assertDictEqual(dset.to_dict(), expected)\n                self.assertEqual(len(dset.cache_files), 2)\n\n            with 
self._create_dummy_dataset(in_memory, tmp_dir).select(range(10)) as dset:\n                with assert_arrow_memory_doesnt_increase():\n                    dset.save_to_disk(dataset_path, num_shards=7, num_proc=2)\n\n            with Dataset.load_from_disk(dataset_path) as dset:\n                self.assertEqual(len(dset), 10)\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                self.assertDictEqual(dset.to_dict(), expected)\n                self.assertEqual(len(dset.cache_files), 7)\n\n            with self._create_dummy_dataset(in_memory, tmp_dir).select(range(10)) as dset:\n                with assert_arrow_memory_doesnt_increase():\n                    max_shard_size = dset._estimate_nbytes() // 2 + 1\n                    dset.save_to_disk(dataset_path, max_shard_size=max_shard_size)\n\n            with Dataset.load_from_disk(dataset_path) as dset:\n                self.assertEqual(len(dset), 10)\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                self.assertDictEqual(dset.to_dict(), expected)\n                self.assertEqual(len(dset.cache_files), 2)\n\n    def test_dummy_dataset_load_from_disk(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir).select(range(10)) as dset:\n                dataset_path = os.path.join(tmp_dir, \"my_dataset\")\n                dset.save_to_disk(dataset_path)\n\n            with load_from_disk(dataset_path) as dset:\n                self.assertEqual(len(dset), 10)\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                self.assertEqual(dset[0][\"filename\"], \"my_name-train_0\")\n                self.assertEqual(dset[\"filename\"][0], \"my_name-train_0\")\n\n    def test_restore_saved_format(self, in_memory):\n        with tempfile.TemporaryDirectory() as 
tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset.set_format(type=\"numpy\", columns=[\"col_1\"], output_all_columns=True)\n                dataset_path = os.path.join(tmp_dir, \"my_dataset\")\n                dset.save_to_disk(dataset_path)\n\n                with load_from_disk(dataset_path) as loaded_dset:\n                    self.assertEqual(dset.format, loaded_dset.format)\n\n    def test_set_format_numpy_multiple_columns(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                fingerprint = dset._fingerprint\n                dset.set_format(type=\"numpy\", columns=[\"col_1\"])\n                self.assertEqual(len(dset[0]), 1)\n                self.assertIsInstance(dset[0][\"col_1\"], np.int64)\n                self.assertEqual(dset[0][\"col_1\"].item(), 3)\n                self.assertIsInstance(dset[\"col_1\"][:], np.ndarray)\n                self.assertListEqual(list(dset[\"col_1\"][:].shape), [4])\n                np.testing.assert_array_equal(dset[\"col_1\"][:], np.array([3, 2, 1, 0]))\n                self.assertNotEqual(dset._fingerprint, fingerprint)\n\n                dset.reset_format()\n                with dset.formatted_as(type=\"numpy\", columns=[\"col_1\"]):\n                    self.assertEqual(len(dset[0]), 1)\n                    self.assertIsInstance(dset[0][\"col_1\"], np.int64)\n                    self.assertEqual(dset[0][\"col_1\"].item(), 3)\n                    self.assertIsInstance(dset[\"col_1\"][:], np.ndarray)\n                    self.assertListEqual(list(dset[\"col_1\"][:].shape), [4])\n                    np.testing.assert_array_equal(dset[\"col_1\"], np.array([3, 2, 1, 0]))\n\n                self.assertEqual(dset.format[\"type\"], None)\n                self.assertEqual(dset.format[\"format_kwargs\"], {})\n               
 self.assertEqual(dset.format[\"columns\"], dset.column_names)\n                self.assertEqual(dset.format[\"output_all_columns\"], False)\n\n                dset.set_format(type=\"numpy\", columns=[\"col_1\"], output_all_columns=True)\n                self.assertEqual(len(dset[0]), 3)\n                self.assertIsInstance(dset[0][\"col_2\"], str)\n                self.assertEqual(dset[0][\"col_2\"], \"a\")\n\n                dset.set_format(type=\"numpy\", columns=[\"col_1\", \"col_2\"])\n                self.assertEqual(len(dset[0]), 2)\n                self.assertIsInstance(dset[0][\"col_2\"], np.str_)\n                self.assertEqual(dset[0][\"col_2\"].item(), \"a\")\n\n    @require_numpy1_on_windows\n    @require_torch\n    def test_set_format_torch(self, in_memory):\n        import torch\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset.set_format(type=\"torch\", columns=[\"col_1\"])\n                self.assertEqual(len(dset[0]), 1)\n                self.assertIsInstance(dset[0][\"col_1\"], torch.Tensor)\n                self.assertIsInstance(dset[\"col_1\"][:], torch.Tensor)\n                self.assertListEqual(list(dset[0][\"col_1\"].shape), [])\n                self.assertEqual(dset[0][\"col_1\"].item(), 3)\n\n                dset.set_format(type=\"torch\", columns=[\"col_1\"], output_all_columns=True)\n                self.assertEqual(len(dset[0]), 3)\n                self.assertIsInstance(dset[0][\"col_2\"], str)\n                self.assertEqual(dset[0][\"col_2\"], \"a\")\n\n                dset.set_format(type=\"torch\")\n                self.assertEqual(len(dset[0]), 3)\n                self.assertIsInstance(dset[0][\"col_1\"], torch.Tensor)\n                self.assertIsInstance(dset[\"col_1\"][:], torch.Tensor)\n                self.assertListEqual(list(dset[0][\"col_1\"].shape), [])\n                
self.assertEqual(dset[0][\"col_1\"].item(), 3)\n                self.assertIsInstance(dset[0][\"col_2\"], str)\n                self.assertEqual(dset[0][\"col_2\"], \"a\")\n                self.assertIsInstance(dset[0][\"col_3\"], torch.Tensor)\n                self.assertIsInstance(dset[\"col_3\"][:], torch.Tensor)\n                self.assertListEqual(list(dset[0][\"col_3\"].shape), [])\n\n    @require_tf\n    def test_set_format_tf(self, in_memory):\n        import tensorflow as tf\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset.set_format(type=\"tensorflow\", columns=[\"col_1\"])\n                self.assertEqual(len(dset[0]), 1)\n                self.assertIsInstance(dset[0][\"col_1\"], tf.Tensor)\n                self.assertListEqual(list(dset[0][\"col_1\"].shape), [])\n                self.assertEqual(dset[0][\"col_1\"].numpy().item(), 3)\n\n                dset.set_format(type=\"tensorflow\", columns=[\"col_1\"], output_all_columns=True)\n                self.assertEqual(len(dset[0]), 3)\n                self.assertIsInstance(dset[0][\"col_2\"], str)\n                self.assertEqual(dset[0][\"col_2\"], \"a\")\n\n                dset.set_format(type=\"tensorflow\", columns=[\"col_1\", \"col_2\"])\n                self.assertEqual(len(dset[0]), 2)\n                self.assertEqual(dset[0][\"col_2\"].numpy().decode(\"utf-8\"), \"a\")\n\n    def test_set_format_pandas(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset.set_format(type=\"pandas\", columns=[\"col_1\"])\n                self.assertEqual(len(dset[0].columns), 1)\n                self.assertIsInstance(dset[0], pd.DataFrame)\n                self.assertListEqual(list(dset[0].shape), [1, 1])\n                
self.assertEqual(dset[0][\"col_1\"].item(), 3)\n\n                dset.set_format(type=\"pandas\", columns=[\"col_1\", \"col_2\"])\n                self.assertEqual(len(dset[0].columns), 2)\n                self.assertEqual(dset[0][\"col_2\"].item(), \"a\")\n\n    @require_polars\n    def test_set_format_polars(self, in_memory):\n        import polars as pl\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset.set_format(type=\"polars\", columns=[\"col_1\"])\n                self.assertEqual(len(dset[0].columns), 1)\n                self.assertIsInstance(dset[0], pl.DataFrame)\n                self.assertListEqual(list(dset[0].shape), [1, 1])\n                self.assertEqual(dset[0][\"col_1\"].item(), 3)\n\n                dset.set_format(type=\"polars\", columns=[\"col_1\", \"col_2\"])\n                self.assertEqual(len(dset[0].columns), 2)\n                self.assertEqual(dset[0][\"col_2\"].item(), \"a\")\n\n    def test_set_transform(self, in_memory):\n        def transform(batch):\n            return {k: [str(i).upper() for i in v] for k, v in batch.items()}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset.set_transform(transform=transform, columns=[\"col_1\"])\n                self.assertEqual(dset.format[\"type\"], \"custom\")\n                self.assertEqual(len(dset[0].keys()), 1)\n                self.assertEqual(dset[0][\"col_1\"], \"3\")\n                self.assertEqual(dset[:2][\"col_1\"], [\"3\", \"2\"])\n                self.assertEqual(dset[\"col_1\"][:2], [\"3\", \"2\"])\n\n                prev_format = dset.format\n                dset.set_format(**dset.format)\n                self.assertEqual(prev_format, dset.format)\n\n                dset.set_transform(transform=transform, 
columns=[\"col_1\", \"col_2\"])\n                self.assertEqual(len(dset[0].keys()), 2)\n                self.assertEqual(dset[0][\"col_2\"], \"A\")\n\n    def test_transmit_format(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                transform = datasets.arrow_dataset.transmit_format(lambda x: x)\n                # make sure identity transform doesn't apply unnecessary format\n                self.assertEqual(dset._fingerprint, transform(dset)._fingerprint)\n                dset.set_format(**dset.format)\n                self.assertEqual(dset._fingerprint, transform(dset)._fingerprint)\n                # check lists comparisons\n                dset.set_format(columns=[\"col_1\"])\n                self.assertEqual(dset._fingerprint, transform(dset)._fingerprint)\n                dset.set_format(columns=[\"col_1\", \"col_2\"])\n                self.assertEqual(dset._fingerprint, transform(dset)._fingerprint)\n                dset.set_format(\"numpy\", columns=[\"col_1\", \"col_2\"])\n                self.assertEqual(dset._fingerprint, transform(dset)._fingerprint)\n\n    def test_cast(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                features = dset.features\n                features[\"col_1\"] = Value(\"float64\")\n                features = Features({k: features[k] for k in list(features)[::-1]})\n                fingerprint = dset._fingerprint\n                # TODO: with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase():\n                with dset.cast(features) as casted_dset:\n                    self.assertEqual(casted_dset.num_columns, 3)\n                    self.assertEqual(casted_dset.features[\"col_1\"], Value(\"float64\"))\n                    
self.assertIsInstance(casted_dset[0][\"col_1\"], float)\n                    self.assertNotEqual(casted_dset._fingerprint, fingerprint)\n                    self.assertNotEqual(casted_dset, dset)\n                    assert_arrow_metadata_are_synced_with_dataset_features(casted_dset)\n\n    def test_class_encode_column(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                with self.assertRaises(ValueError):\n                    dset.class_encode_column(column=\"does not exist\")\n\n                with dset.class_encode_column(\"col_1\") as casted_dset:\n                    self.assertIsInstance(casted_dset.features[\"col_1\"], ClassLabel)\n                    self.assertListEqual(casted_dset.features[\"col_1\"].names, [\"0\", \"1\", \"2\", \"3\"])\n                    self.assertListEqual(casted_dset[\"col_1\"][:], [3, 2, 1, 0])\n                    self.assertNotEqual(casted_dset._fingerprint, dset._fingerprint)\n                    self.assertNotEqual(casted_dset, dset)\n                    assert_arrow_metadata_are_synced_with_dataset_features(casted_dset)\n\n                with dset.class_encode_column(\"col_2\") as casted_dset:\n                    self.assertIsInstance(casted_dset.features[\"col_2\"], ClassLabel)\n                    self.assertListEqual(casted_dset.features[\"col_2\"].names, [\"a\", \"b\", \"c\", \"d\"])\n                    self.assertListEqual(casted_dset[\"col_2\"][:], [0, 1, 2, 3])\n                    self.assertNotEqual(casted_dset._fingerprint, dset._fingerprint)\n                    self.assertNotEqual(casted_dset, dset)\n                    assert_arrow_metadata_are_synced_with_dataset_features(casted_dset)\n\n                with dset.class_encode_column(\"col_3\") as casted_dset:\n                    self.assertIsInstance(casted_dset.features[\"col_3\"], ClassLabel)\n                    
self.assertListEqual(casted_dset.features[\"col_3\"].names, [\"False\", \"True\"])\n                    self.assertListEqual(casted_dset[\"col_3\"][:], [0, 1, 0, 1])\n                    self.assertNotEqual(casted_dset._fingerprint, dset._fingerprint)\n                    self.assertNotEqual(casted_dset, dset)\n                    assert_arrow_metadata_are_synced_with_dataset_features(casted_dset)\n\n            # Test raises if feature is an array / sequence\n            with self._create_dummy_dataset(in_memory, tmp_dir, array_features=True) as dset:\n                for column in dset.column_names:\n                    with self.assertRaises(ValueError):\n                        dset.class_encode_column(column)\n\n    def test_remove_columns(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                fingerprint = dset._fingerprint\n                with dset.remove_columns(column_names=\"col_1\") as new_dset:\n                    self.assertEqual(new_dset.num_columns, 2)\n                    self.assertListEqual(list(new_dset.column_names), [\"col_2\", \"col_3\"])\n                    self.assertNotEqual(new_dset._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(new_dset)\n\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                with dset.remove_columns(column_names=[\"col_1\", \"col_2\", \"col_3\"]) as new_dset:\n                    self.assertEqual(new_dset.num_columns, 0)\n                    self.assertNotEqual(new_dset._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(new_dset)\n\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset._format_columns = [\"col_1\", \"col_2\", \"col_3\"]\n                with 
dset.remove_columns(column_names=[\"col_1\"]) as new_dset:\n                    self.assertListEqual(new_dset._format_columns, [\"col_2\", \"col_3\"])\n                    self.assertEqual(new_dset.num_columns, 2)\n                    self.assertListEqual(list(new_dset.column_names), [\"col_2\", \"col_3\"])\n                    self.assertNotEqual(new_dset._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(new_dset)\n\n    def test_rename_column(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                fingerprint = dset._fingerprint\n                with dset.rename_column(original_column_name=\"col_1\", new_column_name=\"new_name\") as new_dset:\n                    self.assertEqual(new_dset.num_columns, 3)\n                    self.assertListEqual(list(new_dset.column_names), [\"new_name\", \"col_2\", \"col_3\"])\n                    self.assertListEqual(list(dset.column_names), [\"col_1\", \"col_2\", \"col_3\"])\n                    self.assertNotEqual(new_dset._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(new_dset)\n\n    def test_rename_columns(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                fingerprint = dset._fingerprint\n                with dset.rename_columns({\"col_1\": \"new_name\"}) as new_dset:\n                    self.assertEqual(new_dset.num_columns, 3)\n                    self.assertListEqual(list(new_dset.column_names), [\"new_name\", \"col_2\", \"col_3\"])\n                    self.assertListEqual(list(dset.column_names), [\"col_1\", \"col_2\", \"col_3\"])\n                    self.assertNotEqual(new_dset._fingerprint, fingerprint)\n\n                with 
dset.rename_columns({\"col_1\": \"new_name\", \"col_2\": \"new_name2\"}) as new_dset:\n                    self.assertEqual(new_dset.num_columns, 3)\n                    self.assertListEqual(list(new_dset.column_names), [\"new_name\", \"new_name2\", \"col_3\"])\n                    self.assertListEqual(list(dset.column_names), [\"col_1\", \"col_2\", \"col_3\"])\n                    self.assertNotEqual(new_dset._fingerprint, fingerprint)\n\n                # Original column not in dataset\n                with self.assertRaises(ValueError):\n                    dset.rename_columns({\"not_there\": \"new_name\"})\n\n                # Empty new name\n                with self.assertRaises(ValueError):\n                    dset.rename_columns({\"col_1\": \"\"})\n\n                # Duplicates\n                with self.assertRaises(ValueError):\n                    dset.rename_columns({\"col_1\": \"new_name\", \"col_2\": \"new_name\"})\n\n    def test_select_columns(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                fingerprint = dset._fingerprint\n                with dset.select_columns(column_names=[]) as new_dset:\n                    self.assertEqual(new_dset.num_columns, 0)\n                    self.assertListEqual(list(new_dset.column_names), [])\n                    self.assertNotEqual(new_dset._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(new_dset)\n\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                fingerprint = dset._fingerprint\n                with dset.select_columns(column_names=\"col_1\") as new_dset:\n                    self.assertEqual(new_dset.num_columns, 1)\n                    self.assertListEqual(list(new_dset.column_names), [\"col_1\"])\n                    
self.assertNotEqual(new_dset._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(new_dset)\n\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                with dset.select_columns(column_names=[\"col_1\", \"col_2\", \"col_3\"]) as new_dset:\n                    self.assertEqual(new_dset.num_columns, 3)\n                    self.assertListEqual(list(new_dset.column_names), [\"col_1\", \"col_2\", \"col_3\"])\n                    self.assertNotEqual(new_dset._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(new_dset)\n\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                with dset.select_columns(column_names=[\"col_3\", \"col_2\", \"col_1\"]) as new_dset:\n                    self.assertEqual(new_dset.num_columns, 3)\n                    self.assertListEqual(list(new_dset.column_names), [\"col_3\", \"col_2\", \"col_1\"])\n                    self.assertNotEqual(new_dset._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(new_dset)\n\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset._format_columns = [\"col_1\", \"col_2\", \"col_3\"]\n                with dset.select_columns(column_names=[\"col_1\"]) as new_dset:\n                    self.assertListEqual(new_dset._format_columns, [\"col_1\"])\n                    self.assertEqual(new_dset.num_columns, 1)\n                    self.assertListEqual(list(new_dset.column_names), [\"col_1\"])\n                    self.assertNotEqual(new_dset._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(new_dset)\n\n    def test_concatenate(self, in_memory):\n        data1, data2, data3 = {\"id\": [0, 1, 2]}, {\"id\": [3, 4, 5]}, {\"id\": [6, 7]}\n        
info1 = DatasetInfo(description=\"Dataset1\")\n        info2 = DatasetInfo(description=\"Dataset2\")\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dset1, dset2, dset3 = (\n                Dataset.from_dict(data1, info=info1),\n                Dataset.from_dict(data2, info=info2),\n                Dataset.from_dict(data3),\n            )\n            dset1, dset2, dset3 = self._to(in_memory, tmp_dir, dset1, dset2, dset3)\n\n            with concatenate_datasets([dset1, dset2, dset3]) as dset_concat:\n                self.assertTupleEqual((len(dset1), len(dset2), len(dset3)), (3, 3, 2))\n                self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))\n                self.assertListEqual(dset_concat[\"id\"][:], [0, 1, 2, 3, 4, 5, 6, 7])\n                self.assertEqual(len(dset_concat.cache_files), 0 if in_memory else 3)\n                self.assertEqual(dset_concat.info.description, \"Dataset1\\n\\nDataset2\")\n            del dset1, dset2, dset3\n\n    def test_concatenate_formatted(self, in_memory):\n        data1, data2, data3 = {\"id\": [0, 1, 2]}, {\"id\": [3, 4, 5]}, {\"id\": [6, 7]}\n        info1 = DatasetInfo(description=\"Dataset1\")\n        info2 = DatasetInfo(description=\"Dataset2\")\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dset1, dset2, dset3 = (\n                Dataset.from_dict(data1, info=info1),\n                Dataset.from_dict(data2, info=info2),\n                Dataset.from_dict(data3),\n            )\n            dset1, dset2, dset3 = self._to(in_memory, tmp_dir, dset1, dset2, dset3)\n\n            dset1.set_format(\"numpy\")\n            with concatenate_datasets([dset1, dset2, dset3]) as dset_concat:\n                self.assertEqual(dset_concat.format[\"type\"], None)\n            dset2.set_format(\"numpy\")\n            dset3.set_format(\"numpy\")\n            with concatenate_datasets([dset1, dset2, dset3]) as dset_concat:\n                
self.assertEqual(dset_concat.format[\"type\"], \"numpy\")\n            del dset1, dset2, dset3\n\n    def test_concatenate_with_indices(self, in_memory):\n        data1, data2, data3 = {\"id\": [0, 1, 2] * 2}, {\"id\": [3, 4, 5] * 2}, {\"id\": [6, 7, 8]}\n        info1 = DatasetInfo(description=\"Dataset1\")\n        info2 = DatasetInfo(description=\"Dataset2\")\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dset1, dset2, dset3 = (\n                Dataset.from_dict(data1, info=info1),\n                Dataset.from_dict(data2, info=info2),\n                Dataset.from_dict(data3),\n            )\n            dset1, dset2, dset3 = self._to(in_memory, tmp_dir, dset1, dset2, dset3)\n            dset1, dset2, dset3 = dset1.select([2, 1, 0]), dset2.select([2, 1, 0]), dset3\n\n            with concatenate_datasets([dset3, dset2, dset1]) as dset_concat:\n                self.assertTupleEqual((len(dset1), len(dset2), len(dset3)), (3, 3, 3))\n                self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))\n                self.assertListEqual(dset_concat[\"id\"][:], [6, 7, 8, 5, 4, 3, 2, 1, 0])\n                # in_memory = False:\n                # 3 cache files for the dset_concat._data table\n                # no cache file for the indices because it's in memory\n                # in_memory = True:\n                # no cache files since both dset_concat._data and dset_concat._indices are in memory\n                self.assertEqual(len(dset_concat.cache_files), 0 if in_memory else 3)\n                self.assertEqual(dset_concat.info.description, \"Dataset2\\n\\nDataset1\")\n\n            dset1 = dset1.rename_columns({\"id\": \"id1\"})\n            dset2 = dset2.rename_columns({\"id\": \"id2\"})\n            dset3 = dset3.rename_columns({\"id\": \"id3\"})\n            with concatenate_datasets([dset1, dset2, dset3], axis=1) as dset_concat:\n                self.assertTupleEqual((len(dset1), len(dset2), len(dset3)), (3, 3, 
3))\n                self.assertEqual(len(dset_concat), len(dset1))\n                self.assertListEqual(dset_concat[\"id1\"][:], [2, 1, 0])\n                self.assertListEqual(dset_concat[\"id2\"][:], [5, 4, 3])\n                self.assertListEqual(dset_concat[\"id3\"][:], [6, 7, 8])\n                # in_memory = False:\n                # 3 cache files for the dset_concat._data table\n                # no cache file for the indices because it's None\n                # in_memory = True:\n                # no cache files since dset_concat._data is in memory and dset_concat._indices is None\n                self.assertEqual(len(dset_concat.cache_files), 0 if in_memory else 3)\n                self.assertIsNone(dset_concat._indices)\n                self.assertEqual(dset_concat.info.description, \"Dataset1\\n\\nDataset2\")\n\n            with concatenate_datasets([dset1], axis=1) as dset_concat:\n                self.assertEqual(len(dset_concat), len(dset1))\n                self.assertListEqual(dset_concat[\"id1\"][:], [2, 1, 0])\n                # in_memory = False:\n                # 1 cache file for the dset_concat._data table\n                # no cache file for the indices because it's in memory\n                # in_memory = True:\n                # no cache files since both dset_concat._data and dset_concat._indices are in memory\n                self.assertEqual(len(dset_concat.cache_files), 0 if in_memory else 1)\n                self.assertTrue(dset_concat._indices == dset1._indices)\n                self.assertEqual(dset_concat.info.description, \"Dataset1\")\n            del dset1, dset2, dset3\n\n    def test_concatenate_with_indices_from_disk(self, in_memory):\n        data1, data2, data3 = {\"id\": [0, 1, 2] * 2}, {\"id\": [3, 4, 5] * 2}, {\"id\": [6, 7]}\n        info1 = DatasetInfo(description=\"Dataset1\")\n        info2 = DatasetInfo(description=\"Dataset2\")\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dset1, dset2, 
dset3 = (\n                Dataset.from_dict(data1, info=info1),\n                Dataset.from_dict(data2, info=info2),\n                Dataset.from_dict(data3),\n            )\n            dset1, dset2, dset3 = self._to(in_memory, tmp_dir, dset1, dset2, dset3)\n            dset1, dset2, dset3 = (\n                dset1.select([2, 1, 0], indices_cache_file_name=os.path.join(tmp_dir, \"i1.arrow\")),\n                dset2.select([2, 1, 0], indices_cache_file_name=os.path.join(tmp_dir, \"i2.arrow\")),\n                dset3.select([1, 0], indices_cache_file_name=os.path.join(tmp_dir, \"i3.arrow\")),\n            )\n\n            with concatenate_datasets([dset3, dset2, dset1]) as dset_concat:\n                self.assertTupleEqual((len(dset1), len(dset2), len(dset3)), (3, 3, 2))\n                self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))\n                self.assertListEqual(dset_concat[\"id\"][:], [7, 6, 5, 4, 3, 2, 1, 0])\n                # in_memory = False:\n                # 3 cache files for the dset_concat._data table, and 1 for the dset_concat._indices_table\n                # There is only 1 for the indices tables (i1.arrow)\n                # Indeed, the others are brought to memory since an offset is applied to them.\n                # in_memory = True:\n                # 1 cache file for i1.arrow since both dset_concat._data and dset_concat._indices are in memory\n                self.assertEqual(len(dset_concat.cache_files), 1 if in_memory else 3 + 1)\n                self.assertEqual(dset_concat.info.description, \"Dataset2\\n\\nDataset1\")\n            del dset1, dset2, dset3\n\n    def test_concatenate_pickle(self, in_memory):\n        data1, data2, data3 = {\"id\": [0, 1, 2] * 2}, {\"id\": [3, 4, 5] * 2}, {\"id\": [6, 7], \"foo\": [\"bar\", \"bar\"]}\n        info1 = DatasetInfo(description=\"Dataset1\")\n        info2 = DatasetInfo(description=\"Dataset2\")\n        with tempfile.TemporaryDirectory() as tmp_dir:\n      
      dset1, dset2, dset3 = (\n                Dataset.from_dict(data1, info=info1),\n                Dataset.from_dict(data2, info=info2),\n                Dataset.from_dict(data3),\n            )\n            schema = dset1.data.schema\n            # mix from in-memory and on-disk datasets\n            dset1, dset2 = self._to(in_memory, tmp_dir, dset1, dset2)\n            dset3 = self._to(not in_memory, tmp_dir, dset3)\n            dset1, dset2, dset3 = (\n                dset1.select(\n                    [2, 1, 0],\n                    keep_in_memory=in_memory,\n                    indices_cache_file_name=os.path.join(tmp_dir, \"i1.arrow\") if not in_memory else None,\n                ),\n                dset2.select(\n                    [2, 1, 0],\n                    keep_in_memory=in_memory,\n                    indices_cache_file_name=os.path.join(tmp_dir, \"i2.arrow\") if not in_memory else None,\n                ),\n                dset3.select(\n                    [1, 0],\n                    keep_in_memory=in_memory,\n                    indices_cache_file_name=os.path.join(tmp_dir, \"i3.arrow\") if not in_memory else None,\n                ),\n            )\n\n            dset3 = dset3.rename_column(\"foo\", \"new_foo\")\n            dset3 = dset3.remove_columns(\"new_foo\")\n            if in_memory:\n                dset3._data.table = Unpicklable(schema=schema)\n            else:\n                dset1._data.table, dset2._data.table = Unpicklable(schema=schema), Unpicklable(schema=schema)\n            dset1, dset2, dset3 = (pickle.loads(pickle.dumps(d)) for d in (dset1, dset2, dset3))\n            with concatenate_datasets([dset3, dset2, dset1]) as dset_concat:\n                if not in_memory:\n                    dset_concat._data.table = Unpicklable(schema=schema)\n                with pickle.loads(pickle.dumps(dset_concat)) as dset_concat:\n                    self.assertTupleEqual((len(dset1), len(dset2), len(dset3)), (3, 3, 2))\n            
        self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))\n                    self.assertListEqual(dset_concat[\"id\"][:], [7, 6, 5, 4, 3, 2, 1, 0])\n                    # in_memory = True: 1 cache file for dset3\n                    # in_memory = False: 2 caches files for dset1 and dset2, and 1 cache file for i1.arrow\n                    self.assertEqual(len(dset_concat.cache_files), 1 if in_memory else 2 + 1)\n                    self.assertEqual(dset_concat.info.description, \"Dataset2\\n\\nDataset1\")\n            del dset1, dset2, dset3\n\n    def test_repeat(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                repeated_dset = dset.repeat(3)\n                column_values_dict = {col: dset[col] for col in dset.column_names}\n                for col, single_values in column_values_dict.items():\n                    self.assertListEqual(repeated_dset[col][:], single_values[:] * 3)\n                del repeated_dset\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                with pytest.raises(ValueError):\n                    dset.repeat(None)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                repeated_dset = dset.repeat(0)\n                self.assertEqual(len(repeated_dset), 0)\n                del repeated_dset\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                repeated_dset = dset.repeat(-1)\n                self.assertEqual(len(repeated_dset), 0)\n                del repeated_dset\n\n    def test_flatten(self, in_memory):\n        with 
tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict(\n                {\"a\": [{\"b\": {\"c\": [\"text\"]}}] * 10, \"foo\": [1] * 10},\n                features=Features({\"a\": {\"b\": {\"c\": List(Value(\"string\"))}}, \"foo\": Value(\"int64\")}),\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    fingerprint = dset._fingerprint\n                    with dset.flatten() as dset:\n                        self.assertListEqual(sorted(dset.column_names), [\"a.b.c\", \"foo\"])\n                        self.assertListEqual(sorted(dset.features.keys()), [\"a.b.c\", \"foo\"])\n                        self.assertDictEqual(\n                            dset.features, Features({\"a.b.c\": List(Value(\"string\")), \"foo\": Value(\"int64\")})\n                        )\n                        self.assertNotEqual(dset._fingerprint, fingerprint)\n                        assert_arrow_metadata_are_synced_with_dataset_features(dset)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict(\n                {\"a\": [{\"en\": \"Thank you\", \"fr\": \"Merci\"}] * 10, \"foo\": [1] * 10},\n                features=Features({\"a\": Translation(languages=[\"en\", \"fr\"]), \"foo\": Value(\"int64\")}),\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    fingerprint = dset._fingerprint\n                    with dset.flatten() as dset:\n                        self.assertListEqual(sorted(dset.column_names), [\"a.en\", \"a.fr\", \"foo\"])\n                        self.assertListEqual(sorted(dset.features.keys()), [\"a.en\", \"a.fr\", \"foo\"])\n                        self.assertDictEqual(\n                            dset.features,\n                            Features({\"a.en\": Value(\"string\"), \"a.fr\": Value(\"string\"), \"foo\": Value(\"int64\")}),\n                        )\n                        
self.assertNotEqual(dset._fingerprint, fingerprint)\n                        assert_arrow_metadata_are_synced_with_dataset_features(dset)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict(\n                {\"a\": [{\"en\": \"the cat\", \"fr\": [\"le chat\", \"la chatte\"], \"de\": \"die katze\"}] * 10, \"foo\": [1] * 10},\n                features=Features(\n                    {\n                        \"a\": TranslationVariableLanguages(languages=[\"en\", \"fr\", \"de\"]),\n                        \"foo\": Value(\"int64\"),\n                    }\n                ),\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    fingerprint = dset._fingerprint\n                    with dset.flatten() as dset:\n                        self.assertListEqual(sorted(dset.column_names), [\"a.language\", \"a.translation\", \"foo\"])\n                        self.assertListEqual(sorted(dset.features.keys()), [\"a.language\", \"a.translation\", \"foo\"])\n                        self.assertDictEqual(\n                            dset.features,\n                            Features(\n                                {\n                                    \"a.language\": List(Value(\"string\")),\n                                    \"a.translation\": List(Value(\"string\")),\n                                    \"foo\": Value(\"int64\"),\n                                }\n                            ),\n                        )\n                        self.assertNotEqual(dset._fingerprint, fingerprint)\n                        assert_arrow_metadata_are_synced_with_dataset_features(dset)\n\n    @require_pil\n    def test_flatten_complex_image(self, in_memory):\n        # decoding turned on\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict(\n                {\"a\": [np.arange(4 * 4 * 3, dtype=np.uint8).reshape(4, 4, 3)] * 10, \"foo\": [1] * 
10},\n                features=Features({\"a\": Image(), \"foo\": Value(\"int64\")}),\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    fingerprint = dset._fingerprint\n                    with dset.flatten() as dset:\n                        self.assertListEqual(sorted(dset.column_names), [\"a\", \"foo\"])\n                        self.assertListEqual(sorted(dset.features.keys()), [\"a\", \"foo\"])\n                        self.assertDictEqual(dset.features, Features({\"a\": Image(), \"foo\": Value(\"int64\")}))\n                        self.assertNotEqual(dset._fingerprint, fingerprint)\n                        assert_arrow_metadata_are_synced_with_dataset_features(dset)\n\n        # decoding turned on + nesting\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict(\n                {\"a\": [{\"b\": np.arange(4 * 4 * 3, dtype=np.uint8).reshape(4, 4, 3)}] * 10, \"foo\": [1] * 10},\n                features=Features({\"a\": {\"b\": Image()}, \"foo\": Value(\"int64\")}),\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    fingerprint = dset._fingerprint\n                    with dset.flatten() as dset:\n                        self.assertListEqual(sorted(dset.column_names), [\"a.b\", \"foo\"])\n                        self.assertListEqual(sorted(dset.features.keys()), [\"a.b\", \"foo\"])\n                        self.assertDictEqual(dset.features, Features({\"a.b\": Image(), \"foo\": Value(\"int64\")}))\n                        self.assertNotEqual(dset._fingerprint, fingerprint)\n                        assert_arrow_metadata_are_synced_with_dataset_features(dset)\n\n        # decoding turned off\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict(\n                {\"a\": [np.arange(4 * 4 * 3, dtype=np.uint8).reshape(4, 4, 3)] * 10, \"foo\": [1] * 10},\n                
features=Features({\"a\": Image(decode=False), \"foo\": Value(\"int64\")}),\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    fingerprint = dset._fingerprint\n                    with dset.flatten() as dset:\n                        self.assertListEqual(sorted(dset.column_names), [\"a.bytes\", \"a.path\", \"foo\"])\n                        self.assertListEqual(sorted(dset.features.keys()), [\"a.bytes\", \"a.path\", \"foo\"])\n                        self.assertDictEqual(\n                            dset.features,\n                            Features({\"a.bytes\": Value(\"binary\"), \"a.path\": Value(\"string\"), \"foo\": Value(\"int64\")}),\n                        )\n                        self.assertNotEqual(dset._fingerprint, fingerprint)\n                        assert_arrow_metadata_are_synced_with_dataset_features(dset)\n\n        # decoding turned off + nesting\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict(\n                {\"a\": [{\"b\": np.arange(4 * 4 * 3, dtype=np.uint8).reshape(4, 4, 3)}] * 10, \"foo\": [1] * 10},\n                features=Features({\"a\": {\"b\": Image(decode=False)}, \"foo\": Value(\"int64\")}),\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    fingerprint = dset._fingerprint\n                    with dset.flatten() as dset:\n                        self.assertListEqual(sorted(dset.column_names), [\"a.b.bytes\", \"a.b.path\", \"foo\"])\n                        self.assertListEqual(sorted(dset.features.keys()), [\"a.b.bytes\", \"a.b.path\", \"foo\"])\n                        self.assertDictEqual(\n                            dset.features,\n                            Features(\n                                {\n                                    \"a.b.bytes\": Value(\"binary\"),\n                                    \"a.b.path\": Value(\"string\"),\n               
                     \"foo\": Value(\"int64\"),\n                                }\n                            ),\n                        )\n                        self.assertNotEqual(dset._fingerprint, fingerprint)\n                        assert_arrow_metadata_are_synced_with_dataset_features(dset)\n\n    def test_map(self, in_memory):\n        # standard\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                fingerprint = dset._fingerprint\n                with dset.map(\n                    lambda x: {\"name\": x[\"filename\"][:-2], \"id\": int(x[\"filename\"].split(\"_\")[-1])}\n                ) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"name\": Value(\"string\"), \"id\": Value(\"int64\")}),\n                    )\n                    self.assertListEqual(dset_test[\"id\"][:], list(range(30)))\n                    self.assertNotEqual(dset_test._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test)\n\n        # no transform\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                fingerprint = dset._fingerprint\n                with dset.map(lambda x: None) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertEqual(dset_test._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test)\n\n        # with indices\n        with 
tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(\n                    lambda x, i: {\"name\": x[\"filename\"][:-2], \"id\": i}, with_indices=True\n                ) as dset_test_with_indices:\n                    self.assertEqual(len(dset_test_with_indices), 30)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test_with_indices.features,\n                        Features({\"filename\": Value(\"string\"), \"name\": Value(\"string\"), \"id\": Value(\"int64\")}),\n                    )\n                    self.assertListEqual(dset_test_with_indices[\"id\"][:], list(range(30)))\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test_with_indices)\n\n        # interrupted\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n\n                def func(x, i):\n                    if i == 4:\n                        raise KeyboardInterrupt()\n                    return {\"name\": x[\"filename\"][:-2], \"id\": i}\n\n                tmp_file = os.path.join(tmp_dir, \"test.arrow\")\n                self.assertRaises(\n                    KeyboardInterrupt,\n                    dset.map,\n                    function=func,\n                    with_indices=True,\n                    cache_file_name=tmp_file,\n                    writer_batch_size=2,\n                )\n                self.assertFalse(os.path.exists(tmp_file))\n                with dset.map(\n                    lambda x, i: {\"name\": x[\"filename\"][:-2], \"id\": i},\n                    with_indices=True,\n                    cache_file_name=tmp_file,\n                    writer_batch_size=2,\n                ) as dset_test_with_indices:\n                    
self.assertTrue(os.path.exists(tmp_file))\n                    self.assertEqual(len(dset_test_with_indices), 30)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test_with_indices.features,\n                        Features({\"filename\": Value(\"string\"), \"name\": Value(\"string\"), \"id\": Value(\"int64\")}),\n                    )\n                    self.assertListEqual(dset_test_with_indices[\"id\"][:], list(range(30)))\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test_with_indices)\n\n        # formatted\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset.set_format(\"numpy\", columns=[\"col_1\"])\n                with dset.map(lambda x: {\"col_1_plus_one\": x[\"col_1\"] + 1}) as dset_test:\n                    self.assertEqual(len(dset_test), 4)\n                    self.assertEqual(dset_test.format[\"type\"], \"numpy\")\n                    self.assertIsInstance(dset_test[\"col_1\"][:], np.ndarray)\n                    self.assertIsInstance(dset_test[\"col_1_plus_one\"][:], np.ndarray)\n                    self.assertListEqual(sorted(dset_test[0].keys()), [\"col_1\", \"col_1_plus_one\"])\n                    self.assertListEqual(sorted(dset_test.column_names), [\"col_1\", \"col_1_plus_one\", \"col_2\", \"col_3\"])\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test)\n        # casting int labels to float labels\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, int_to_float=True) as dset:\n\n                def _preprocess(examples):\n                    result = {\"labels\": [list(map(float, labels)) for labels in examples[\"labels\"]]}\n                    return 
result\n\n                with dset.map(\n                    _preprocess, remove_columns=[\"labels\", \"text\"], batched=True, try_original_type=True\n                ) as dset_test:\n                    for labels in dset_test[\"labels\"]:\n                        for label in labels:\n                            self.assertIsInstance(label, int)\n\n                with dset.map(\n                    _preprocess, remove_columns=[\"labels\", \"text\"], batched=True, try_original_type=False\n                ) as dset_test:\n                    for labels in dset_test[\"labels\"]:\n                        for label in labels:\n                            self.assertIsInstance(label, float)\n\n    def test_map_multiprocessing(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:  # standard\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                fingerprint = dset._fingerprint\n                with dset.map(picklable_map_function, num_proc=2) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"id\": Value(\"int64\")}),\n                    )\n                    self.assertEqual(len(dset_test.cache_files), 0 if in_memory else 2)\n                    if not in_memory:\n                        self.assertIn(\"_of_00002.arrow\", dset_test.cache_files[0][\"filename\"])\n                    self.assertListEqual(dset_test[\"id\"][:], list(range(30)))\n                    self.assertNotEqual(dset_test._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test)\n\n        with 
tempfile.TemporaryDirectory() as tmp_dir:  # num_proc > num rows\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                fingerprint = dset._fingerprint\n                with dset.select([0, 1], keep_in_memory=True).map(picklable_map_function, num_proc=10) as dset_test:\n                    self.assertEqual(len(dset_test), 2)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"id\": Value(\"int64\")}),\n                    )\n                    self.assertEqual(len(dset_test.cache_files), 0 if in_memory else 2)\n                    self.assertListEqual(dset_test[\"id\"][:], list(range(2)))\n                    self.assertNotEqual(dset_test._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:  # with_indices\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                fingerprint = dset._fingerprint\n                with dset.map(picklable_map_function_with_indices, num_proc=3, with_indices=True) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"id\": Value(\"int64\")}),\n                    )\n                    self.assertEqual(len(dset_test.cache_files), 0 if in_memory else 3)\n                    self.assertListEqual(dset_test[\"id\"][:], list(range(30)))\n                    
self.assertNotEqual(dset_test._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:  # with_rank\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                fingerprint = dset._fingerprint\n                with dset.map(picklable_map_function_with_rank, num_proc=3, with_rank=True) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"rank\": Value(\"int64\")}),\n                    )\n                    self.assertEqual(len(dset_test.cache_files), 0 if in_memory else 3)\n                    self.assertListEqual(dset_test[\"rank\"][:], [0] * 10 + [1] * 10 + [2] * 10)\n                    self.assertNotEqual(dset_test._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:  # with_indices AND with_rank\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                fingerprint = dset._fingerprint\n                with dset.map(\n                    picklable_map_function_with_indices_and_rank, num_proc=3, with_indices=True, with_rank=True\n                ) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"id\": Value(\"int64\"), \"rank\": Value(\"int64\")}),\n                    )\n                    
self.assertEqual(len(dset_test.cache_files), 0 if in_memory else 3)\n                    self.assertListEqual(dset_test[\"id\"][:], list(range(30)))\n                    self.assertListEqual(dset_test[\"rank\"][:], [0] * 10 + [1] * 10 + [2] * 10)\n                    self.assertNotEqual(dset_test._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:  # new_fingerprint\n            new_fingerprint = \"foobar\"\n            invalid_new_fingerprint = \"foobar/hey\"\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                fingerprint = dset._fingerprint\n                self.assertRaises(\n                    ValueError, dset.map, picklable_map_function, num_proc=2, new_fingerprint=invalid_new_fingerprint\n                )\n                with dset.map(picklable_map_function, num_proc=2, new_fingerprint=new_fingerprint) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"id\": Value(\"int64\")}),\n                    )\n                    self.assertEqual(len(dset_test.cache_files), 0 if in_memory else 2)\n                    self.assertListEqual(dset_test[\"id\"][:], list(range(30)))\n                    self.assertNotEqual(dset_test._fingerprint, fingerprint)\n                    self.assertEqual(dset_test._fingerprint, new_fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test)\n                    file_names = sorted(Path(cache_file[\"filename\"]).name for cache_file in dset_test.cache_files)\n                    for i, file_name in enumerate(file_names):\n                        
self.assertIn(new_fingerprint + f\"_{i:05d}\", file_name)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:  # lambda (requires multiprocess from pathos)\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                fingerprint = dset._fingerprint\n                with dset.map(lambda x: {\"id\": int(x[\"filename\"].split(\"_\")[-1])}, num_proc=2) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"id\": Value(\"int64\")}),\n                    )\n                    self.assertEqual(len(dset_test.cache_files), 0 if in_memory else 2)\n                    self.assertListEqual(dset_test[\"id\"][:], list(range(30)))\n                    self.assertNotEqual(dset_test._fingerprint, fingerprint)\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test)\n\n    def test_map_new_features(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                features = Features({\"filename\": Value(\"string\"), \"label\": ClassLabel(names=[\"positive\", \"negative\"])})\n                with dset.map(\n                    lambda x, i: {\"label\": i % 2}, with_indices=True, features=features\n                ) as dset_test_with_indices:\n                    self.assertEqual(len(dset_test_with_indices), 30)\n                    self.assertDictEqual(\n                        dset_test_with_indices.features,\n                        features,\n                    )\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test_with_indices)\n\n    def test_map_batched(self, in_memory):\n        def map_batched(example):\n    
        return {\"filename_new\": [x + \"_extension\" for x in example[\"filename\"]]}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(map_batched, batched=True) as dset_test_batched:\n                    self.assertEqual(len(dset_test_batched), 30)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test_batched.features,\n                        Features({\"filename\": Value(\"string\"), \"filename_new\": Value(\"string\")}),\n                    )\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test_batched)\n\n        # change batch size and drop the last batch\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                batch_size = 4\n                with dset.map(\n                    map_batched, batched=True, batch_size=batch_size, drop_last_batch=True\n                ) as dset_test_batched:\n                    self.assertEqual(len(dset_test_batched), 30 // batch_size * batch_size)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test_batched.features,\n                        Features({\"filename\": Value(\"string\"), \"filename_new\": Value(\"string\")}),\n                    )\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test_batched)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.formatted_as(\"numpy\", columns=[\"filename\"]):\n                    with dset.map(map_batched, batched=True) as dset_test_batched:\n                     
   self.assertEqual(len(dset_test_batched), 30)\n                        self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                        self.assertDictEqual(\n                            dset_test_batched.features,\n                            Features({\"filename\": Value(\"string\"), \"filename_new\": Value(\"string\")}),\n                        )\n                        assert_arrow_metadata_are_synced_with_dataset_features(dset_test_batched)\n\n        def map_batched_with_indices(example, idx):\n            return {\"filename_new\": [x + \"_extension_\" + str(idx) for x in example[\"filename\"]]}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(\n                    map_batched_with_indices, batched=True, with_indices=True\n                ) as dset_test_with_indices_batched:\n                    self.assertEqual(len(dset_test_with_indices_batched), 30)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test_with_indices_batched.features,\n                        Features({\"filename\": Value(\"string\"), \"filename_new\": Value(\"string\")}),\n                    )\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test_with_indices_batched)\n\n        # check that remove_columns works even if the function modifies its input in-place\n        def map_batched_modifying_inputs_inplace(example):\n            result = {\"filename_new\": [x + \"_extension\" for x in example[\"filename\"]]}\n            del example[\"filename\"]\n            return result\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(\n                    
map_batched_modifying_inputs_inplace, batched=True, remove_columns=\"filename\"\n                ) as dset_test_modifying_inputs_inplace:\n                    self.assertEqual(len(dset_test_modifying_inputs_inplace), 30)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(\n                        dset_test_modifying_inputs_inplace.features,\n                        Features({\"filename_new\": Value(\"string\")}),\n                    )\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset_test_modifying_inputs_inplace)\n\n    def test_map_nested(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict({\"field\": [\"a\", \"b\"]}) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    with dset.map(lambda example: {\"otherfield\": {\"capital\": example[\"field\"].capitalize()}}) as dset:\n                        with dset.map(lambda example: {\"otherfield\": {\"append_x\": example[\"field\"] + \"x\"}}) as dset:\n                            self.assertEqual(dset[0], {\"field\": \"a\", \"otherfield\": {\"append_x\": \"ax\"}})\n\n    def test_map_return_example_as_dict_value(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict({\"en\": [\"aa\", \"bb\"], \"fr\": [\"cc\", \"dd\"]}) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    with dset.map(lambda example: {\"translation\": example}) as dset:\n                        self.assertEqual(dset[0], {\"en\": \"aa\", \"fr\": \"cc\", \"translation\": {\"en\": \"aa\", \"fr\": \"cc\"}})\n\n    def test_map_fn_kwargs(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict({\"id\": range(10)}) as dset:\n                with self._to(in_memory, tmp_dir, dset) as 
dset:\n                    fn_kwargs = {\"offset\": 3}\n                    with dset.map(\n                        lambda example, offset: {\"id+offset\": example[\"id\"] + offset}, fn_kwargs=fn_kwargs\n                    ) as mapped_dset:\n                        assert mapped_dset[\"id+offset\"] == list(range(3, 13))\n                    with dset.map(\n                        lambda id, offset: {\"id+offset\": id + offset}, fn_kwargs=fn_kwargs, input_columns=\"id\"\n                    ) as mapped_dset:\n                        assert mapped_dset[\"id+offset\"] == list(range(3, 13))\n                    with dset.map(\n                        lambda id, i, offset: {\"id+offset\": i + offset},\n                        fn_kwargs=fn_kwargs,\n                        input_columns=\"id\",\n                        with_indices=True,\n                    ) as mapped_dset:\n                        assert mapped_dset[\"id+offset\"] == list(range(3, 13))\n\n    def test_map_caching(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            self._caplog.clear()\n            with self._caplog.at_level(INFO, logger=get_logger().name):\n                with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                    with patch(\n                        \"datasets.arrow_dataset.Dataset._map_single\",\n                        autospec=Dataset._map_single,\n                        side_effect=Dataset._map_single,\n                    ) as mock_map_single:\n                        with dset.map(lambda x: {\"foo\": \"bar\"}) as dset_test1:\n                            dset_test1_data_files = list(dset_test1.cache_files)\n                        self.assertEqual(mock_map_single.call_count, 1)\n                        with dset.map(lambda x: {\"foo\": \"bar\"}) as dset_test2:\n                            self.assertEqual(dset_test1_data_files, dset_test2.cache_files)\n                            
self.assertEqual(len(dset_test2.cache_files), 1 - int(in_memory))\n                            self.assertTrue((\"Loading cached processed dataset\" in self._caplog.text) ^ in_memory)\n                        self.assertEqual(mock_map_single.call_count, 2 if in_memory else 1)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            self._caplog.clear()\n            with self._caplog.at_level(INFO, logger=get_logger().name):\n                with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                    with dset.map(lambda x: {\"foo\": \"bar\"}) as dset_test1:\n                        dset_test1_data_files = list(dset_test1.cache_files)\n                    with dset.map(lambda x: {\"foo\": \"bar\"}, load_from_cache_file=False) as dset_test2:\n                        self.assertEqual(dset_test1_data_files, dset_test2.cache_files)\n                        self.assertEqual(len(dset_test2.cache_files), 1 - int(in_memory))\n                        self.assertNotIn(\"Loading cached processed dataset\", self._caplog.text)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            self._caplog.clear()\n            with self._caplog.at_level(INFO, logger=get_logger().name):\n                with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                    with dset.map(lambda x: {\"foo\": \"bar\"}, num_proc=2) as dset_test1:\n                        dset_test1_data_files = list(dset_test1.cache_files)\n                    with dset.map(lambda x: {\"foo\": \"bar\"}, num_proc=2) as dset_test2:\n                        self.assertEqual(dset_test1_data_files, dset_test2.cache_files)\n                        self.assertTrue(\n                            (len(re.findall(\"Loading cached processed dataset\", self._caplog.text)) == 1) ^ in_memory\n                        )\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            self._caplog.clear()\n            with self._caplog.at_level(INFO, 
logger=get_logger().name):\n                with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                    with dset.map(lambda x: {\"foo\": \"bar\"}, num_proc=2) as dset_test1:\n                        dset_test1_data_files = list(dset_test1.cache_files)\n                    with dset.map(lambda x: {\"foo\": \"bar\"}, num_proc=2, load_from_cache_file=False) as dset_test2:\n                        self.assertEqual(dset_test1_data_files, dset_test2.cache_files)\n                        self.assertEqual(len(dset_test2.cache_files), (1 - int(in_memory)) * 2)\n                        self.assertNotIn(\"Loading cached processed dataset\", self._caplog.text)\n\n        if not in_memory:\n            try:\n                self._caplog.clear()\n                with tempfile.TemporaryDirectory() as tmp_dir:\n                    with self._caplog.at_level(INFO, logger=get_logger().name):\n                        with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                            datasets.disable_caching()\n                            with dset.map(lambda x: {\"foo\": \"bar\"}) as dset_test1:\n                                with dset.map(lambda x: {\"foo\": \"bar\"}) as dset_test2:\n                                    self.assertNotEqual(dset_test1.cache_files, dset_test2.cache_files)\n                                    self.assertEqual(len(dset_test1.cache_files), 1)\n                                    self.assertEqual(len(dset_test2.cache_files), 1)\n                                    self.assertNotIn(\"Loading cached processed dataset\", self._caplog.text)\n                                    # make sure the arrow files are going to be removed\n                                    self.assertIn(\n                                        Path(tempfile.gettempdir()),\n                                        Path(dset_test1.cache_files[0][\"filename\"]).parents,\n                                    )\n                                 
   self.assertIn(\n                                        Path(tempfile.gettempdir()),\n                                        Path(dset_test2.cache_files[0][\"filename\"]).parents,\n                                    )\n            finally:\n                datasets.enable_caching()\n\n    def test_suffix_template_format(self, in_memory):\n        with (\n            tempfile.TemporaryDirectory() as tmp_dir,\n            self._caplog.at_level(INFO, logger=get_logger().name),\n            self._create_dummy_dataset(in_memory, tmp_dir) as dset,\n            self.assertRaises(ValueError) as e,\n            dset.map(lambda x: {\"foo\": \"bar\"}, suffix_template=\"_{}_of_{}\"),\n        ):\n            self.assertIn(\n                \"suffix_template must contain exactly the fields 'rank' and 'num_proc', got: \",\n                e.exception.args[0],\n            )\n\n    def test_cache_file_name_no_ext_raises_error(self, in_memory):\n        with (\n            tempfile.TemporaryDirectory() as tmp_dir,\n            self._caplog.at_level(INFO, logger=get_logger().name),\n            self._create_dummy_dataset(in_memory, tmp_dir) as dset,\n            self.assertRaises(ValueError) as e,\n            dset.map(lambda x: {\"foo\": \"bar\"}, cache_file_name=os.path.join(tmp_dir, \"train\")),\n        ):\n            self.assertIn(\"Expected cache_file_name to have an extension, but got: \", e.exception.args[0])\n\n    def test_map_caching_reuses_cache_with_different_num_proc(self, in_memory):\n        for dset_test1_num_proc, dset_test2_num_proc in [(1, 2), (2, 1)]:\n            with (\n                tempfile.TemporaryDirectory() as tmp_dir,\n                self._caplog.at_level(INFO, logger=get_logger().name),\n                self._create_dummy_dataset(in_memory, tmp_dir) as dset,\n            ):\n                # cannot mock _map_single here because mock objects aren't picklable\n                # see: https://github.com/python/cpython/issues/100090\n             
   self._caplog.clear()\n                with dset.map(lambda x: {\"foo\": \"bar\"}, num_proc=dset_test1_num_proc) as dset_test1:\n                    dset_test1_data_files = list(dset_test1.cache_files)\n                    self.assertFalse(\"Loading cached processed dataset\" in self._caplog.text)\n\n                self._caplog.clear()\n                with dset.map(lambda x: {\"foo\": \"bar\"}, num_proc=dset_test2_num_proc) as dset_test2:\n                    self.assertEqual(dset_test1_data_files, dset_test2.cache_files)\n                    self.assertEqual(len(dset_test2.cache_files), 0 if in_memory else dset_test1_num_proc)\n                    self.assertTrue((\"Loading cached processed dataset\" in self._caplog.text) ^ in_memory)\n\n    def test_map_caching_partial_remap(self, in_memory):\n        with (\n            tempfile.TemporaryDirectory() as tmp_dir,\n            self._caplog.at_level(INFO, logger=get_logger().name),\n            self._create_dummy_dataset(in_memory, tmp_dir) as dset,\n        ):\n            # cannot mock _map_single here because mock objects aren't picklable\n            # see: https://github.com/python/cpython/issues/100090\n            self._caplog.clear()\n            dset_test1_num_proc = 4\n            with dset.map(lambda x: {\"foo\": \"bar\"}, num_proc=dset_test1_num_proc) as dset_test1:\n                dset_test1_data_files = list(dset_test1.cache_files)\n                self.assertFalse(\"Loading cached processed dataset\" in self._caplog.text)\n\n            num_files_to_delete = 2\n            expected_msg = (\n                f\"Reprocessing {num_files_to_delete}/{dset_test1_num_proc} shards because some of them \"\n                \"were missing from the cache.\"\n            )\n            for cache_file in dset_test1_data_files[num_files_to_delete:]:\n                os.remove(cache_file[\"filename\"])\n\n            self._caplog.clear()\n            dset_test2_num_proc = None\n            with dset.map(lambda x: 
{\"foo\": \"bar\"}, num_proc=dset_test2_num_proc) as dset_test2:\n                self.assertEqual(dset_test1_data_files, dset_test2.cache_files)\n                self.assertEqual(len(dset_test2.cache_files), 0 if in_memory else dset_test1_num_proc)\n                self.assertTrue((expected_msg in self._caplog.text) ^ in_memory)\n                self.assertFalse(f\"Spawning {dset_test1_num_proc} processes\" in self._caplog.text)\n                self.assertFalse(f\"Spawning {dset_test2_num_proc} processes\" in self._caplog.text)\n\n            for cache_file in dset_test1_data_files[num_files_to_delete:]:\n                os.remove(cache_file[\"filename\"])\n\n            self._caplog.clear()\n            dset_test2_num_proc = 1\n            with dset.map(lambda x: {\"foo\": \"bar\"}, num_proc=dset_test2_num_proc) as dset_test2:\n                self.assertEqual(dset_test1_data_files, dset_test2.cache_files)\n                self.assertEqual(len(dset_test2.cache_files), 0 if in_memory else dset_test1_num_proc)\n                self.assertTrue((expected_msg in self._caplog.text) ^ in_memory)\n                self.assertFalse(f\"Spawning {dset_test1_num_proc} process\" in self._caplog.text)\n                self.assertTrue(f\"Spawning {dset_test2_num_proc} process\" in self._caplog.text)\n\n            for cache_file in dset_test1_data_files[num_files_to_delete:]:\n                os.remove(cache_file[\"filename\"])\n\n            self._caplog.clear()\n            dset_test3_num_proc = 3\n            with dset.map(lambda x: {\"foo\": \"bar\"}, num_proc=dset_test3_num_proc) as dset_test3:\n                self.assertEqual(dset_test1_data_files, dset_test3.cache_files)\n                self.assertEqual(len(dset_test3.cache_files), 0 if in_memory else dset_test1_num_proc)\n                self.assertTrue((expected_msg in self._caplog.text) ^ in_memory)\n                self.assertTrue(f\"Spawning {dset_test3_num_proc} processes\" in self._caplog.text)\n\n    def 
test_map_return_pa_table(self, in_memory):\n        def func_return_single_row_pa_table(x):\n            return pa.table({\"id\": [0], \"text\": [\"a\"]})\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(func_return_single_row_pa_table) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"id\": Value(\"int64\"), \"text\": Value(\"string\")}),\n                    )\n                    self.assertEqual(dset_test[0][\"id\"], 0)\n                    self.assertEqual(dset_test[0][\"text\"], \"a\")\n\n        # Batched\n        def func_return_single_row_pa_table_batched(x):\n            batch_size = len(x[next(iter(x))])\n            return pa.table({\"id\": [0] * batch_size, \"text\": [\"a\"] * batch_size})\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(func_return_single_row_pa_table_batched, batched=True) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"id\": Value(\"int64\"), \"text\": Value(\"string\")}),\n                    )\n                    self.assertEqual(dset_test[0][\"id\"], 0)\n                    self.assertEqual(dset_test[0][\"text\"], \"a\")\n\n        # Error when returning a table with more than one row in the non-batched mode\n        def func_return_multi_row_pa_table(x):\n            return pa.table({\"id\": [0, 1], \"text\": [\"a\", \"b\"]})\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                self.assertRaises(ValueError, dset.map, 
func_return_multi_row_pa_table)\n\n        # arrow formatted dataset\n        def func_return_table_from_expression(t):\n            import pyarrow.dataset as pds\n\n            return pds.dataset(t).to_table(\n                columns={\"new_column\": pds.field(\"\")._call(\"ascii_capitalize\", [pds.field(\"filename\")])}\n            )\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.with_format(\"arrow\").map(func_return_table_from_expression, batched=True) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"new_column\": Value(\"string\")}),\n                    )\n                    self.assertEqual(dset_test.with_format(None)[0][\"new_column\"], dset[0][\"filename\"].capitalize())\n\n    def test_map_return_pd_dataframe(self, in_memory):\n        def func_return_single_row_pd_dataframe(x):\n            return pd.DataFrame({\"id\": [0], \"text\": [\"a\"]})\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(func_return_single_row_pd_dataframe) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"id\": Value(\"int64\"), \"text\": Value(STRING_FROM_PANDAS)}),\n                    )\n                    self.assertEqual(dset_test[0][\"id\"], 0)\n                    self.assertEqual(dset_test[0][\"text\"], \"a\")\n\n        # Batched\n        def func_return_single_row_pd_dataframe_batched(x):\n            batch_size = len(x[next(iter(x))])\n            return pd.DataFrame({\"id\": [0] * batch_size, \"text\": [\"a\"] * batch_size})\n\n        with 
tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(func_return_single_row_pd_dataframe_batched, batched=True) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"id\": Value(\"int64\"), \"text\": Value(STRING_FROM_PANDAS)}),\n                    )\n                    self.assertEqual(dset_test[0][\"id\"], 0)\n                    self.assertEqual(dset_test[0][\"text\"], \"a\")\n\n        # Error when returning a table with more than one row in the non-batched mode\n        def func_return_multi_row_pd_dataframe(x):\n            return pd.DataFrame({\"id\": [0, 1], \"text\": [\"a\", \"b\"]})\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                self.assertRaises(ValueError, dset.map, func_return_multi_row_pd_dataframe)\n\n    @require_polars\n    def test_map_return_pl_dataframe(self, in_memory):\n        import polars as pl\n\n        def func_return_single_row_pl_dataframe(x):\n            return pl.DataFrame({\"id\": [0], \"text\": [\"a\"]})\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(func_return_single_row_pl_dataframe) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"id\": Value(\"int64\"), \"text\": Value(\"large_string\")}),\n                    )\n                    self.assertEqual(dset_test[0][\"id\"], 0)\n                    self.assertEqual(dset_test[0][\"text\"], \"a\")\n\n        # Batched\n        def func_return_single_row_pl_dataframe_batched(x):\n          
  batch_size = len(x[next(iter(x))])\n            return pl.DataFrame({\"id\": [0] * batch_size, \"text\": [\"a\"] * batch_size})\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(func_return_single_row_pl_dataframe_batched, batched=True) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"id\": Value(\"int64\"), \"text\": Value(\"large_string\")}),\n                    )\n                    self.assertEqual(dset_test[0][\"id\"], 0)\n                    self.assertEqual(dset_test[0][\"text\"], \"a\")\n\n        # Error when returning a table with more than one row in the non-batched mode\n        def func_return_multi_row_pl_dataframe(x):\n            return pl.DataFrame({\"id\": [0, 1], \"text\": [\"a\", \"b\"]})\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                self.assertRaises(ValueError, dset.map, func_return_multi_row_pl_dataframe)\n\n    @require_numpy1_on_windows\n    @require_torch\n    def test_map_torch(self, in_memory):\n        import torch\n\n        def func(example):\n            return {\"tensor\": torch.tensor([1.0, 2, 3])}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(func) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"tensor\": List(Value(\"float32\"))}),\n                    )\n                    self.assertListEqual(dset_test[0][\"tensor\"], [1, 2, 3])\n\n    @require_tf\n    def 
test_map_tf(self, in_memory):\n        import tensorflow as tf\n\n        def func(example):\n            return {\"tensor\": tf.constant([1.0, 2, 3])}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(func) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"tensor\": List(Value(\"float32\"))}),\n                    )\n                    self.assertListEqual(dset_test[0][\"tensor\"], [1, 2, 3])\n\n    @require_jax\n    def test_map_jax(self, in_memory):\n        import jax.numpy as jnp\n\n        def func(example):\n            return {\"tensor\": jnp.asarray([1.0, 2, 3])}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(func) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"tensor\": List(Value(\"float32\"))}),\n                    )\n                    self.assertListEqual(dset_test[0][\"tensor\"], [1, 2, 3])\n\n    def test_map_numpy(self, in_memory):\n        def func(example):\n            return {\"tensor\": np.array([1.0, 2, 3])}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(func) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"tensor\": List(Value(\"float64\"))}),\n                    
)\n                    self.assertListEqual(dset_test[0][\"tensor\"], [1, 2, 3])\n\n    @require_numpy1_on_windows\n    @require_torch\n    def test_map_tensor_batched(self, in_memory):\n        import torch\n\n        def func(batch):\n            return {\"tensor\": torch.tensor([[1.0, 2, 3]] * len(batch[\"filename\"]))}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(func, batched=True) as dset_test:\n                    self.assertEqual(len(dset_test), 30)\n                    self.assertDictEqual(\n                        dset_test.features,\n                        Features({\"filename\": Value(\"string\"), \"tensor\": List(Value(\"float32\"))}),\n                    )\n                    self.assertListEqual(dset_test[0][\"tensor\"], [1, 2, 3])\n\n    def test_map_input_columns(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                with dset.map(lambda col_1: {\"label\": col_1 % 2}, input_columns=\"col_1\") as mapped_dset:\n                    self.assertEqual(mapped_dset[0].keys(), {\"col_1\", \"col_2\", \"col_3\", \"label\"})\n                    self.assertEqual(\n                        mapped_dset.features,\n                        Features(\n                            {\n                                \"col_1\": Value(\"int64\"),\n                                \"col_2\": Value(\"string\"),\n                                \"col_3\": Value(\"bool\"),\n                                \"label\": Value(\"int64\"),\n                            }\n                        ),\n                    )\n\n    def test_map_remove_columns(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with 
dset.map(lambda x, i: {\"name\": x[\"filename\"][:-2], \"id\": i}, with_indices=True) as dset:\n                    self.assertTrue(\"id\" in dset[0])\n                    self.assertDictEqual(\n                        dset.features,\n                        Features({\"filename\": Value(\"string\"), \"name\": Value(\"string\"), \"id\": Value(\"int64\")}),\n                    )\n                    assert_arrow_metadata_are_synced_with_dataset_features(dset)\n                    with dset.map(lambda x: x, remove_columns=[\"id\"]) as mapped_dset:\n                        self.assertTrue(\"id\" not in mapped_dset[0])\n                        self.assertDictEqual(\n                            mapped_dset.features, Features({\"filename\": Value(\"string\"), \"name\": Value(\"string\")})\n                        )\n                        assert_arrow_metadata_are_synced_with_dataset_features(mapped_dset)\n                        with mapped_dset.with_format(\"numpy\", columns=mapped_dset.column_names) as mapped_dset:\n                            with mapped_dset.map(\n                                lambda x: {\"name\": 1}, remove_columns=mapped_dset.column_names\n                            ) as mapped_dset:\n                                self.assertTrue(\"filename\" not in mapped_dset[0])\n                                self.assertTrue(\"name\" in mapped_dset[0])\n                                self.assertDictEqual(mapped_dset.features, Features({\"name\": Value(dtype=\"int64\")}))\n                                assert_arrow_metadata_are_synced_with_dataset_features(mapped_dset)\n                    # empty dataset\n                    columns_names = dset.column_names\n                    with dset.select([]) as empty_dset:\n                        self.assertEqual(len(empty_dset), 0)\n                        with empty_dset.map(lambda x: {}, remove_columns=columns_names[0]) as mapped_dset:\n                            self.assertListEqual(columns_names[1:], 
mapped_dset.column_names)\n                            assert_arrow_metadata_are_synced_with_dataset_features(mapped_dset)\n\n    def test_map_stateful_callable(self, in_memory):\n        # be sure that the state of the map callable is unaffected\n        # before processing the dataset examples\n\n        class ExampleCounter:\n            def __init__(self, batched=False):\n                self.batched = batched\n                # state\n                self.cnt = 0\n\n            def __call__(self, example):\n                if self.batched:\n                    self.cnt += len(example)\n                else:\n                    self.cnt += 1\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                ex_cnt = ExampleCounter()\n                dset.map(ex_cnt)\n                self.assertEqual(ex_cnt.cnt, len(dset))\n\n                ex_cnt = ExampleCounter(batched=True)\n                dset.map(ex_cnt)\n                self.assertEqual(ex_cnt.cnt, len(dset))\n\n    @require_not_windows\n    def test_map_crash_subprocess(self, in_memory):\n        # be sure that a crash in one of the subprocess will not\n        # hang dataset.map() call forever\n\n        def do_crash(row):\n            import os\n\n            os.kill(os.getpid(), 9)\n            return row\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with pytest.raises(RuntimeError) as excinfo:\n                    dset.map(do_crash, num_proc=2)\n                assert str(excinfo.value) == (\n                    \"One of the subprocesses has abruptly died during map operation.\"\n                    \"To debug the error, disable multiprocessing.\"\n                )\n\n    def test_map_on_mixed_types(self, in_memory):\n        mixed_data = {\n            \"mixed_type\": [-1, 1, \"foo\"],\n            
\"mix_struct_and_non_struct\": [{\"a\": 0}, [0]],\n            \"mixed_dict_keys\": [{\"a\": 0}, {\"b\": 0}, {\"c\": 0}],\n            \"mixed_dict_keys2\": [[{\"a\": 0}, {\"b\": 0}], [{\"c\": 0}, {\"d\": 0}]],\n            \"messages\": _messages,\n        }\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.map(\n                    lambda x: mixed_data, on_mixed_types=\"use_json\", remove_columns=dset.column_names\n                ) as dset:\n                    self.assertDictEqual(dset[0], mixed_data)\n\n    def test_filter(self, in_memory):\n        # keep only first five examples\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                fingerprint = dset._fingerprint\n                with dset.filter(lambda x, i: i < 5, with_indices=True) as dset_filter_first_five:\n                    self.assertEqual(len(dset_filter_first_five), 5)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(dset_filter_first_five.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertNotEqual(dset_filter_first_five._fingerprint, fingerprint)\n\n        # filter filenames with even id at the end + formatted\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                dset.set_format(\"numpy\")\n                fingerprint = dset._fingerprint\n                with dset.filter(lambda x: int(x[\"filename\"][-1]) % 2 == 0) as dset_filter_even_num:\n                    self.assertEqual(len(dset_filter_even_num), 15)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    
self.assertDictEqual(dset_filter_even_num.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertNotEqual(dset_filter_even_num._fingerprint, fingerprint)\n                    self.assertEqual(dset_filter_even_num.format[\"type\"], \"numpy\")\n\n    def test_filter_with_indices_mapping(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dset = Dataset.from_dict({\"col\": [0, 1, 2]})\n            with self._to(in_memory, tmp_dir, dset) as dset:\n                with dset.filter(lambda x: x[\"col\"] > 0) as dset:\n                    self.assertListEqual(dset[\"col\"][:], [1, 2])\n                    with dset.filter(lambda x: x[\"col\"] < 2) as dset:\n                        self.assertListEqual(dset[\"col\"][:], [1])\n\n    def test_filter_empty(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                self.assertIsNone(dset._indices, None)\n\n                tmp_file = os.path.join(tmp_dir, \"test.arrow\")\n                with dset.filter(lambda _: False, cache_file_name=tmp_file) as dset:\n                    self.assertEqual(len(dset), 0)\n                    self.assertIsNotNone(dset._indices, None)\n\n                    tmp_file_2 = os.path.join(tmp_dir, \"test_2.arrow\")\n                    with dset.filter(lambda _: False, cache_file_name=tmp_file_2) as dset2:\n                        self.assertEqual(len(dset2), 0)\n                        self.assertEqual(dset._indices, dset2._indices)\n\n    def test_filter_batched(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dset = Dataset.from_dict({\"col\": [0, 1, 2]})\n            with self._to(in_memory, tmp_dir, dset) as dset:\n                with dset.filter(lambda x: [i > 0 for i in x[\"col\"]], batched=True) as dset:\n                    self.assertListEqual(dset[\"col\"][:], [1, 2])\n              
      with dset.filter(lambda x: [i < 2 for i in x[\"col\"]], batched=True) as dset:\n                        self.assertListEqual(dset[\"col\"][:], [1])\n\n    def test_filter_input_columns(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dset = Dataset.from_dict({\"col_1\": [0, 1, 2], \"col_2\": [\"a\", \"b\", \"c\"]})\n            with self._to(in_memory, tmp_dir, dset) as dset:\n                with dset.filter(lambda x: x > 0, input_columns=[\"col_1\"]) as filtered_dset:\n                    self.assertListEqual(filtered_dset.column_names, dset.column_names)\n                    self.assertListEqual(filtered_dset[\"col_1\"][:], [1, 2])\n                    self.assertListEqual(filtered_dset[\"col_2\"][:], [\"b\", \"c\"])\n\n    def test_filter_fn_kwargs(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict({\"id\": range(10)}) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    fn_kwargs = {\"max_offset\": 3}\n                    with dset.filter(\n                        lambda example, max_offset: example[\"id\"] < max_offset, fn_kwargs=fn_kwargs\n                    ) as filtered_dset:\n                        assert len(filtered_dset) == 3\n                    with dset.filter(\n                        lambda id, max_offset: id < max_offset, fn_kwargs=fn_kwargs, input_columns=\"id\"\n                    ) as filtered_dset:\n                        assert len(filtered_dset) == 3\n                    with dset.filter(\n                        lambda id, i, max_offset: i < max_offset,\n                        fn_kwargs=fn_kwargs,\n                        input_columns=\"id\",\n                        with_indices=True,\n                    ) as filtered_dset:\n                        assert len(filtered_dset) == 3\n\n    def test_filter_multiprocessing(self, in_memory):\n        with tempfile.TemporaryDirectory() as 
tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                fingerprint = dset._fingerprint\n                with dset.filter(picklable_filter_function, num_proc=2) as dset_filter_first_ten:\n                    self.assertEqual(len(dset_filter_first_ten), 10)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(dset_filter_first_ten.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertEqual(len(dset_filter_first_ten.cache_files), 0 if in_memory else 2)\n                    self.assertNotEqual(dset_filter_first_ten._fingerprint, fingerprint)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:  # with_rank\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                fingerprint = dset._fingerprint\n                with dset.filter(\n                    picklable_filter_function_with_rank, num_proc=2, with_rank=True\n                ) as dset_filter_first_rank:\n                    self.assertEqual(len(dset_filter_first_rank), min(len(dset) // 2, len(dset)))\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(dset_filter_first_rank.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertEqual(len(dset_filter_first_rank.cache_files), 0 if in_memory else 2)\n                    self.assertNotEqual(dset_filter_first_rank._fingerprint, fingerprint)\n\n    def test_filter_caching(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            self._caplog.clear()\n            with self._caplog.at_level(INFO, logger=get_logger().name):\n                with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                    with dset.filter(lambda x, i: i < 5, with_indices=True) as dset_filter_first_five1:\n                 
       dset_test1_data_files = list(dset_filter_first_five1.cache_files)\n                    with dset.filter(lambda x, i: i < 5, with_indices=True) as dset_filter_first_five2:\n                        self.assertEqual(dset_test1_data_files, dset_filter_first_five2.cache_files)\n                        self.assertEqual(len(dset_filter_first_five2.cache_files), 0 if in_memory else 2)\n                        self.assertTrue((\"Loading cached processed dataset\" in self._caplog.text) ^ in_memory)\n\n    def test_keep_features_after_transform_specified(self, in_memory):\n        features = Features(\n            {\n                \"tokens\": List(Value(\"string\")),\n                \"labels\": List(ClassLabel(names=[\"negative\", \"positive\"])),\n            }\n        )\n\n        def invert_labels(x):\n            return {\"labels\": [(1 - label) for label in x[\"labels\"]]}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict(\n                {\"tokens\": [[\"foo\"] * 5] * 10, \"labels\": [[1] * 5] * 10}, features=features\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    with dset.map(invert_labels, features=features) as inverted_dset:\n                        self.assertEqual(inverted_dset.features.type, features.type)\n                        self.assertDictEqual(inverted_dset.features, features)\n                        assert_arrow_metadata_are_synced_with_dataset_features(inverted_dset)\n\n    def test_keep_features_after_transform_unspecified(self, in_memory):\n        features = Features(\n            {\n                \"tokens\": List(Value(\"string\")),\n                \"labels\": List(ClassLabel(names=[\"negative\", \"positive\"])),\n            }\n        )\n\n        def invert_labels(x):\n            return {\"labels\": [(1 - label) for label in x[\"labels\"]]}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with 
Dataset.from_dict(\n                {\"tokens\": [[\"foo\"] * 5] * 10, \"labels\": [[1] * 5] * 10}, features=features\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    with dset.map(invert_labels) as inverted_dset:\n                        self.assertEqual(inverted_dset.features.type, features.type)\n                        self.assertDictEqual(inverted_dset.features, features)\n                        assert_arrow_metadata_are_synced_with_dataset_features(inverted_dset)\n\n    def test_keep_features_after_transform_to_file(self, in_memory):\n        features = Features(\n            {\n                \"tokens\": List(Value(\"string\")),\n                \"labels\": List(ClassLabel(names=[\"negative\", \"positive\"])),\n            }\n        )\n\n        def invert_labels(x):\n            return {\"labels\": [(1 - label) for label in x[\"labels\"]]}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict(\n                {\"tokens\": [[\"foo\"] * 5] * 10, \"labels\": [[1] * 5] * 10}, features=features\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    tmp_file = os.path.join(tmp_dir, \"test.arrow\")\n                    dset.map(invert_labels, cache_file_name=tmp_file)\n                    with Dataset.from_file(tmp_file) as inverted_dset:\n                        self.assertEqual(inverted_dset.features.type, features.type)\n                        self.assertDictEqual(inverted_dset.features, features)\n\n    def test_keep_features_after_transform_to_memory(self, in_memory):\n        features = Features(\n            {\n                \"tokens\": List(Value(\"string\")),\n                \"labels\": List(ClassLabel(names=[\"negative\", \"positive\"])),\n            }\n        )\n\n        def invert_labels(x):\n            return {\"labels\": [(1 - label) for label in x[\"labels\"]]}\n\n        with 
tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict(\n                {\"tokens\": [[\"foo\"] * 5] * 10, \"labels\": [[1] * 5] * 10}, features=features\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    with dset.map(invert_labels, keep_in_memory=True) as inverted_dset:\n                        self.assertEqual(inverted_dset.features.type, features.type)\n                        self.assertDictEqual(inverted_dset.features, features)\n\n    def test_keep_features_after_loading_from_cache(self, in_memory):\n        features = Features(\n            {\n                \"tokens\": List(Value(\"string\")),\n                \"labels\": List(ClassLabel(names=[\"negative\", \"positive\"])),\n            }\n        )\n\n        def invert_labels(x):\n            return {\"labels\": [(1 - label) for label in x[\"labels\"]]}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict(\n                {\"tokens\": [[\"foo\"] * 5] * 10, \"labels\": [[1] * 5] * 10}, features=features\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    tmp_file1 = os.path.join(tmp_dir, \"test1.arrow\")\n                    tmp_file2 = os.path.join(tmp_dir, \"test2.arrow\")\n                    # TODO: Why mapped twice?\n                    inverted_dset = dset.map(invert_labels, cache_file_name=tmp_file1)\n                    inverted_dset = dset.map(invert_labels, cache_file_name=tmp_file2)\n                    self.assertGreater(len(inverted_dset.cache_files), 0)\n                    self.assertEqual(inverted_dset.features.type, features.type)\n                    self.assertDictEqual(inverted_dset.features, features)\n                    del inverted_dset\n\n    def test_keep_features_with_new_features(self, in_memory):\n        features = Features(\n            {\n                \"tokens\": 
List(Value(\"string\")),\n                \"labels\": List(ClassLabel(names=[\"negative\", \"positive\"])),\n            }\n        )\n\n        def invert_labels(x):\n            return {\"labels\": [(1 - label) for label in x[\"labels\"]], \"labels2\": x[\"labels\"]}\n\n        expected_features = Features(\n            {\n                \"tokens\": List(Value(\"string\")),\n                \"labels\": List(ClassLabel(names=[\"negative\", \"positive\"])),\n                \"labels2\": List(Value(\"int64\")),\n            }\n        )\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with Dataset.from_dict(\n                {\"tokens\": [[\"foo\"] * 5] * 10, \"labels\": [[1] * 5] * 10}, features=features\n            ) as dset:\n                with self._to(in_memory, tmp_dir, dset) as dset:\n                    with dset.map(invert_labels) as inverted_dset:\n                        self.assertEqual(inverted_dset.features.type, expected_features.type)\n                        self.assertDictEqual(inverted_dset.features, expected_features)\n                        assert_arrow_metadata_are_synced_with_dataset_features(inverted_dset)\n\n    def test_select(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                # select every two example\n                indices = list(range(0, len(dset), 2))\n                tmp_file = os.path.join(tmp_dir, \"test.arrow\")\n                fingerprint = dset._fingerprint\n                with dset.select(indices, indices_cache_file_name=tmp_file) as dset_select_even:\n                    self.assertIsNotNone(dset_select_even._indices)  # an indices mapping is created\n                    self.assertTrue(os.path.exists(tmp_file))\n                    self.assertEqual(len(dset_select_even), 15)\n                    for row in dset_select_even:\n                        
self.assertEqual(int(row[\"filename\"][-1]) % 2, 0)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(dset_select_even.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertNotEqual(dset_select_even._fingerprint, fingerprint)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                indices = list(range(0, len(dset)))\n                with dset.select(indices) as dset_select_all:\n                    # no indices mapping, since the indices are contiguous\n                    # (in this case the arrow table is simply sliced, which is more efficient)\n                    self.assertIsNone(dset_select_all._indices)\n                    self.assertEqual(len(dset_select_all), len(dset))\n                    self.assertListEqual(list(dset_select_all), list(dset))\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(dset_select_all.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertNotEqual(dset_select_all._fingerprint, fingerprint)\n                indices = range(0, len(dset))\n                with dset.select(indices) as dset_select_all:\n                    # same but with range\n                    self.assertIsNone(dset_select_all._indices)\n                    self.assertEqual(len(dset_select_all), len(dset))\n                    self.assertListEqual(list(dset_select_all), list(dset))\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(dset_select_all.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertNotEqual(dset_select_all._fingerprint, fingerprint)\n\n        with tempfile.TemporaryDirectory() as 
tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                bad_indices = list(range(5))\n                bad_indices[-1] = len(dset) + 10  # out of bounds\n                tmp_file = os.path.join(tmp_dir, \"test.arrow\")\n                self.assertRaises(\n                    Exception,\n                    dset.select,\n                    indices=bad_indices,\n                    indices_cache_file_name=tmp_file,\n                    writer_batch_size=2,\n                )\n                self.assertFalse(os.path.exists(tmp_file))\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                indices = iter(range(len(dset)))  # iterator of contiguous indices\n                with dset.select(indices) as dset_select_all:\n                    # no indices mapping, since the indices are contiguous\n                    self.assertIsNone(dset_select_all._indices)\n                    self.assertEqual(len(dset_select_all), len(dset))\n                indices = reversed(range(len(dset)))  # iterator of not contiguous indices\n                tmp_file = os.path.join(tmp_dir, \"test.arrow\")\n                with dset.select(indices, indices_cache_file_name=tmp_file) as dset_select_all:\n                    # new indices mapping, since the indices are not contiguous\n                    self.assertIsNotNone(dset_select_all._indices)\n                    self.assertEqual(len(dset_select_all), len(dset))\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                bad_indices = list(range(5))\n                bad_indices[3] = \"foo\"  # wrong type\n                tmp_file = os.path.join(tmp_dir, \"test.arrow\")\n                self.assertRaises(\n                    Exception,\n                    dset.select,\n                    indices=bad_indices,\n       
             indices_cache_file_name=tmp_file,\n                    writer_batch_size=2,\n                )\n                self.assertFalse(os.path.exists(tmp_file))\n                dset.set_format(\"numpy\")\n                with dset.select(\n                    range(5),\n                    indices_cache_file_name=tmp_file,\n                    writer_batch_size=2,\n                ) as dset_select_five:\n                    self.assertIsNone(dset_select_five._indices)\n                    self.assertEqual(len(dset_select_five), 5)\n                    self.assertEqual(dset_select_five.format[\"type\"], \"numpy\")\n                    for i, row in enumerate(dset_select_five):\n                        self.assertEqual(int(row[\"filename\"][-1]), i)\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(dset_select_five.features, Features({\"filename\": Value(\"string\")}))\n\n    def test_select_then_map(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.select([0]) as d1:\n                    with d1.map(lambda x: {\"id\": int(x[\"filename\"].split(\"_\")[-1])}) as d1:\n                        self.assertEqual(d1[0][\"id\"], 0)\n                with dset.select([1]) as d2:\n                    with d2.map(lambda x: {\"id\": int(x[\"filename\"].split(\"_\")[-1])}) as d2:\n                        self.assertEqual(d2[0][\"id\"], 1)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                with dset.select([0], indices_cache_file_name=os.path.join(tmp_dir, \"i1.arrow\")) as d1:\n                    with d1.map(lambda x: {\"id\": int(x[\"filename\"].split(\"_\")[-1])}) as d1:\n                        self.assertEqual(d1[0][\"id\"], 0)\n                with 
dset.select([1], indices_cache_file_name=os.path.join(tmp_dir, \"i2.arrow\")) as d2:\n                    with d2.map(lambda x: {\"id\": int(x[\"filename\"].split(\"_\")[-1])}) as d2:\n                        self.assertEqual(d2[0][\"id\"], 1)\n\n    def test_pickle_after_many_transforms_on_disk(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                self.assertEqual(len(dset.cache_files), 0 if in_memory else 1)\n                with dset.rename_column(\"filename\", \"file\") as dset:\n                    self.assertListEqual(dset.column_names, [\"file\"])\n                    with dset.select(range(5)) as dset:\n                        self.assertEqual(len(dset), 5)\n                        with dset.map(lambda x: {\"id\": int(x[\"file\"][-1])}) as dset:\n                            self.assertListEqual(sorted(dset.column_names), [\"file\", \"id\"])\n                            with dset.rename_column(\"id\", \"number\") as dset:\n                                self.assertListEqual(sorted(dset.column_names), [\"file\", \"number\"])\n                                with dset.select([1, 0]) as dset:\n                                    self.assertEqual(dset[0][\"file\"], \"my_name-train_1\")\n                                    self.assertEqual(dset[0][\"number\"], 1)\n\n                                    self.assertEqual(dset._indices[\"indices\"].to_pylist(), [1, 0])\n                                    if not in_memory:\n                                        self.assertIn(\n                                            (\"rename_columns\", ([\"file\", \"number\"],), {}),\n                                            dset._data.replays,\n                                        )\n                                    if not in_memory:\n                                        dset._data.table = Unpicklable()  # check that we don't pickle the entire table\n\n  
                                  pickled = pickle.dumps(dset)\n                                    with pickle.loads(pickled) as loaded:\n                                        self.assertEqual(loaded[0][\"file\"], \"my_name-train_1\")\n                                        self.assertEqual(loaded[0][\"number\"], 1)\n\n    def test_shuffle(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                tmp_file = os.path.join(tmp_dir, \"test.arrow\")\n                fingerprint = dset._fingerprint\n\n                with dset.shuffle(seed=1234, keep_in_memory=True) as dset_shuffled:\n                    self.assertEqual(len(dset_shuffled), 30)\n                    self.assertEqual(dset_shuffled[0][\"filename\"], \"my_name-train_28\")\n                    self.assertEqual(dset_shuffled[2][\"filename\"], \"my_name-train_10\")\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(dset_shuffled.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertNotEqual(dset_shuffled._fingerprint, fingerprint)\n\n                with dset.shuffle(seed=1234, indices_cache_file_name=tmp_file) as dset_shuffled:\n                    self.assertEqual(len(dset_shuffled), 30)\n                    self.assertEqual(dset_shuffled[0][\"filename\"], \"my_name-train_28\")\n                    self.assertEqual(dset_shuffled[2][\"filename\"], \"my_name-train_10\")\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(dset_shuffled.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertNotEqual(dset_shuffled._fingerprint, fingerprint)\n\n                    # Reproducibility\n                    tmp_file = os.path.join(tmp_dir, \"test_2.arrow\")\n        
            with dset.shuffle(seed=1234, indices_cache_file_name=tmp_file) as dset_shuffled_2:\n                        self.assertSequenceEqual(dset_shuffled[\"filename\"], dset_shuffled_2[\"filename\"])\n\n                # Compatible with temp_seed\n                with temp_seed(42), dset.shuffle() as d1:\n                    with temp_seed(42), dset.shuffle() as d2, dset.shuffle() as d3:\n                        self.assertSequenceEqual(d1[\"filename\"], d2[\"filename\"])\n                        self.assertEqual(d1._fingerprint, d2._fingerprint)\n                        self.assertNotEqual(d3[\"filename\"], d2[\"filename\"])\n                        self.assertNotEqual(d3._fingerprint, d2._fingerprint)\n\n    def test_sort(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            # Sort on a single key\n            with self._create_dummy_dataset(in_memory=in_memory, tmp_dir=tmp_dir) as dset:\n                # Keep only 10 examples\n                tmp_file = os.path.join(tmp_dir, \"test.arrow\")\n                with dset.select(range(10), indices_cache_file_name=tmp_file) as dset:\n                    tmp_file = os.path.join(tmp_dir, \"test_2.arrow\")\n                    with dset.shuffle(seed=1234, indices_cache_file_name=tmp_file) as dset:\n                        self.assertEqual(len(dset), 10)\n                        self.assertEqual(dset[0][\"filename\"], \"my_name-train_8\")\n                        self.assertEqual(dset[1][\"filename\"], \"my_name-train_9\")\n                        # Sort\n                        tmp_file = os.path.join(tmp_dir, \"test_3.arrow\")\n                        fingerprint = dset._fingerprint\n                        with dset.sort(\"filename\", indices_cache_file_name=tmp_file) as dset_sorted:\n                            for i, row in enumerate(dset_sorted):\n                                self.assertEqual(int(row[\"filename\"][-1]), i)\n                            
self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                            self.assertDictEqual(dset_sorted.features, Features({\"filename\": Value(\"string\")}))\n                            self.assertNotEqual(dset_sorted._fingerprint, fingerprint)\n                            # Sort reversed\n                            tmp_file = os.path.join(tmp_dir, \"test_4.arrow\")\n                            fingerprint = dset._fingerprint\n                            with dset.sort(\"filename\", indices_cache_file_name=tmp_file, reverse=True) as dset_sorted:\n                                for i, row in enumerate(dset_sorted):\n                                    self.assertEqual(int(row[\"filename\"][-1]), len(dset_sorted) - 1 - i)\n                                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                                self.assertDictEqual(dset_sorted.features, Features({\"filename\": Value(\"string\")}))\n                                self.assertNotEqual(dset_sorted._fingerprint, fingerprint)\n                            # formatted\n                            dset.set_format(\"numpy\")\n                            with dset.sort(\"filename\") as dset_sorted_formatted:\n                                self.assertEqual(dset_sorted_formatted.format[\"type\"], \"numpy\")\n            # Sort on multiple keys\n            with self._create_dummy_dataset(in_memory=in_memory, tmp_dir=tmp_dir, multiple_columns=True) as dset:\n                tmp_file = os.path.join(tmp_dir, \"test_5.arrow\")\n                fingerprint = dset._fingerprint\n                # Throw error when reverse is a list of bools that does not match the length of column_names\n                with pytest.raises(ValueError):\n                    dset.sort([\"col_1\", \"col_2\", \"col_3\"], reverse=[False])\n                with dset.shuffle(seed=1234, indices_cache_file_name=tmp_file) as dset:\n                 
   # Sort\n                    with dset.sort([\"col_1\", \"col_2\", \"col_3\"], reverse=[False, True, False]) as dset_sorted:\n                        for i, row in enumerate(dset_sorted):\n                            self.assertEqual(row[\"col_1\"], i)\n                        self.assertDictEqual(\n                            dset.features,\n                            Features(\n                                {\n                                    \"col_1\": Value(\"int64\"),\n                                    \"col_2\": Value(\"string\"),\n                                    \"col_3\": Value(\"bool\"),\n                                }\n                            ),\n                        )\n                        self.assertDictEqual(\n                            dset_sorted.features,\n                            Features(\n                                {\n                                    \"col_1\": Value(\"int64\"),\n                                    \"col_2\": Value(\"string\"),\n                                    \"col_3\": Value(\"bool\"),\n                                }\n                            ),\n                        )\n                        self.assertNotEqual(dset_sorted._fingerprint, fingerprint)\n                        # Sort reversed\n                        with dset.sort([\"col_1\", \"col_2\", \"col_3\"], reverse=[True, False, True]) as dset_sorted:\n                            for i, row in enumerate(dset_sorted):\n                                self.assertEqual(row[\"col_1\"], len(dset_sorted) - 1 - i)\n                            self.assertDictEqual(\n                                dset.features,\n                                Features(\n                                    {\n                                        \"col_1\": Value(\"int64\"),\n                                        \"col_2\": Value(\"string\"),\n                                        \"col_3\": Value(\"bool\"),\n                            
        }\n                                ),\n                            )\n                            self.assertDictEqual(\n                                dset_sorted.features,\n                                Features(\n                                    {\n                                        \"col_1\": Value(\"int64\"),\n                                        \"col_2\": Value(\"string\"),\n                                        \"col_3\": Value(\"bool\"),\n                                    }\n                                ),\n                            )\n                            self.assertNotEqual(dset_sorted._fingerprint, fingerprint)\n                            # formatted\n                            dset.set_format(\"numpy\")\n                            with dset.sort(\n                                [\"col_1\", \"col_2\", \"col_3\"], reverse=[False, True, False]\n                            ) as dset_sorted_formatted:\n                                self.assertEqual(dset_sorted_formatted.format[\"type\"], \"numpy\")\n\n    def test_to_csv(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            # File path argument\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                file_path = os.path.join(tmp_dir, \"test_path.csv\")\n                bytes_written = dset.to_csv(path_or_buf=file_path)\n\n                self.assertTrue(os.path.isfile(file_path))\n                self.assertEqual(bytes_written, os.path.getsize(file_path))\n                csv_dset = pd.read_csv(file_path)\n\n                self.assertEqual(csv_dset.shape, dset.shape)\n                self.assertListEqual(list(csv_dset.columns), list(dset.column_names))\n\n            # File buffer argument\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                file_path = os.path.join(tmp_dir, \"test_buffer.csv\")\n                
with open(file_path, \"wb+\") as buffer:\n                    bytes_written = dset.to_csv(path_or_buf=buffer)\n\n                self.assertTrue(os.path.isfile(file_path))\n                self.assertEqual(bytes_written, os.path.getsize(file_path))\n                csv_dset = pd.read_csv(file_path)\n\n                self.assertEqual(csv_dset.shape, dset.shape)\n                self.assertListEqual(list(csv_dset.columns), list(dset.column_names))\n\n            # After a select/shuffle transform\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset = dset.select(range(0, len(dset), 2)).shuffle()\n                file_path = os.path.join(tmp_dir, \"test_path.csv\")\n                bytes_written = dset.to_csv(path_or_buf=file_path)\n\n                self.assertTrue(os.path.isfile(file_path))\n                self.assertEqual(bytes_written, os.path.getsize(file_path))\n                csv_dset = pd.read_csv(file_path)\n\n                self.assertEqual(csv_dset.shape, dset.shape)\n                self.assertListEqual(list(csv_dset.columns), list(dset.column_names))\n\n            # With array features\n            with self._create_dummy_dataset(in_memory, tmp_dir, array_features=True) as dset:\n                file_path = os.path.join(tmp_dir, \"test_path.csv\")\n                bytes_written = dset.to_csv(path_or_buf=file_path)\n\n                self.assertTrue(os.path.isfile(file_path))\n                self.assertEqual(bytes_written, os.path.getsize(file_path))\n                csv_dset = pd.read_csv(file_path)\n\n                self.assertEqual(csv_dset.shape, dset.shape)\n                self.assertListEqual(list(csv_dset.columns), list(dset.column_names))\n\n    def test_to_dict(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                # Full\n                
dset_to_dict = dset.to_dict()\n                self.assertIsInstance(dset_to_dict, dict)\n                self.assertListEqual(sorted(dset_to_dict.keys()), sorted(dset.column_names))\n\n                for col_name in dset.column_names:\n                    self.assertLessEqual(len(dset_to_dict[col_name]), len(dset))\n\n                # With index mapping\n                with dset.select([1, 0, 3]) as dset:\n                    dset_to_dict = dset.to_dict()\n                    self.assertIsInstance(dset_to_dict, dict)\n                    self.assertEqual(len(dset_to_dict), 3)\n                    self.assertListEqual(sorted(dset_to_dict.keys()), sorted(dset.column_names))\n\n                    for col_name in dset.column_names:\n                        self.assertIsInstance(dset_to_dict[col_name], list)\n                        self.assertEqual(len(dset_to_dict[col_name]), len(dset))\n\n    def test_to_list(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset_to_list = dset.to_list()\n                self.assertIsInstance(dset_to_list, list)\n                for row in dset_to_list:\n                    self.assertIsInstance(row, dict)\n                    self.assertListEqual(sorted(row.keys()), sorted(dset.column_names))\n\n                # With index mapping\n                with dset.select([1, 0, 3]) as dset:\n                    dset_to_list = dset.to_list()\n                    self.assertIsInstance(dset_to_list, list)\n                    self.assertEqual(len(dset_to_list), 3)\n                    for row in dset_to_list:\n                        self.assertIsInstance(row, dict)\n                        self.assertListEqual(sorted(row.keys()), sorted(dset.column_names))\n\n    def test_to_pandas(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            # Batched\n            with 
self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                batch_size = dset.num_rows - 1\n                to_pandas_generator = dset.to_pandas(batched=True, batch_size=batch_size)\n\n                for batch in to_pandas_generator:\n                    self.assertIsInstance(batch, pd.DataFrame)\n                    self.assertListEqual(sorted(batch.columns), sorted(dset.column_names))\n                    for col_name in dset.column_names:\n                        self.assertLessEqual(len(batch[col_name]), batch_size)\n\n                # Full\n                dset_to_pandas = dset.to_pandas()\n                self.assertIsInstance(dset_to_pandas, pd.DataFrame)\n                self.assertListEqual(sorted(dset_to_pandas.columns), sorted(dset.column_names))\n                for col_name in dset.column_names:\n                    self.assertEqual(len(dset_to_pandas[col_name]), len(dset))\n\n                # With index mapping\n                with dset.select([1, 0, 3]) as dset:\n                    dset_to_pandas = dset.to_pandas()\n                    self.assertIsInstance(dset_to_pandas, pd.DataFrame)\n                    self.assertEqual(len(dset_to_pandas), 3)\n                    self.assertListEqual(sorted(dset_to_pandas.columns), sorted(dset.column_names))\n\n                    for col_name in dset.column_names:\n                        self.assertEqual(len(dset_to_pandas[col_name]), dset.num_rows)\n\n    @require_polars\n    def test_to_polars(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            # Batched\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                batch_size = dset.num_rows - 1\n                to_polars_generator = dset.to_polars(batched=True, batch_size=batch_size)\n\n                for batch in to_polars_generator:\n                    self.assertIsInstance(batch, sys.modules[\"polars\"].DataFrame)\n            
        self.assertListEqual(sorted(batch.columns), sorted(dset.column_names))\n                    for col_name in dset.column_names:\n                        self.assertLessEqual(len(batch[col_name]), batch_size)\n                    del batch\n\n                # Full\n                dset_to_polars = dset.to_polars()\n                self.assertIsInstance(dset_to_polars, sys.modules[\"polars\"].DataFrame)\n                self.assertListEqual(sorted(dset_to_polars.columns), sorted(dset.column_names))\n                for col_name in dset.column_names:\n                    self.assertEqual(len(dset_to_polars[col_name]), len(dset))\n\n                # With index mapping\n                with dset.select([1, 0, 3]) as dset:\n                    dset_to_polars = dset.to_polars()\n                    self.assertIsInstance(dset_to_polars, sys.modules[\"polars\"].DataFrame)\n                    self.assertEqual(len(dset_to_polars), 3)\n                    self.assertListEqual(sorted(dset_to_polars.columns), sorted(dset.column_names))\n\n                    for col_name in dset.column_names:\n                        self.assertEqual(len(dset_to_polars[col_name]), dset.num_rows)\n\n    def test_to_parquet(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            # File path argument\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                file_path = os.path.join(tmp_dir, \"test_path.parquet\")\n                dset.to_parquet(path_or_buf=file_path)\n\n                self.assertTrue(os.path.isfile(file_path))\n                # self.assertEqual(bytes_written, os.path.getsize(file_path))  # because of compression, the number of bytes doesn't match\n                parquet_dset = pd.read_parquet(file_path)\n\n                self.assertEqual(parquet_dset.shape, dset.shape)\n                self.assertListEqual(list(parquet_dset.columns), list(dset.column_names))\n\n            # File 
buffer argument\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                file_path = os.path.join(tmp_dir, \"test_buffer.parquet\")\n                with open(file_path, \"wb+\") as buffer:\n                    dset.to_parquet(path_or_buf=buffer)\n\n                self.assertTrue(os.path.isfile(file_path))\n                # self.assertEqual(bytes_written, os.path.getsize(file_path))  # because of compression, the number of bytes doesn't match\n                parquet_dset = pd.read_parquet(file_path)\n\n                self.assertEqual(parquet_dset.shape, dset.shape)\n                self.assertListEqual(list(parquet_dset.columns), list(dset.column_names))\n\n            # After a select/shuffle transform\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset = dset.select(range(0, len(dset), 2)).shuffle()\n                file_path = os.path.join(tmp_dir, \"test_path.parquet\")\n                dset.to_parquet(path_or_buf=file_path)\n\n                self.assertTrue(os.path.isfile(file_path))\n                # self.assertEqual(bytes_written, os.path.getsize(file_path))  # because of compression, the number of bytes doesn't match\n                parquet_dset = pd.read_parquet(file_path)\n\n                self.assertEqual(parquet_dset.shape, dset.shape)\n                self.assertListEqual(list(parquet_dset.columns), list(dset.column_names))\n\n            # With array features\n            with self._create_dummy_dataset(in_memory, tmp_dir, array_features=True) as dset:\n                file_path = os.path.join(tmp_dir, \"test_path.parquet\")\n                dset.to_parquet(path_or_buf=file_path)\n\n                self.assertTrue(os.path.isfile(file_path))\n                # self.assertEqual(bytes_written, os.path.getsize(file_path))  # because of compression, the number of bytes doesn't match\n                parquet_dset = 
pd.read_parquet(file_path)\n\n                self.assertEqual(parquet_dset.shape, dset.shape)\n                self.assertListEqual(list(parquet_dset.columns), list(dset.column_names))\n\n    @require_sqlalchemy\n    def test_to_sql(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            # Destination specified as database URI string\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                file_path = os.path.join(tmp_dir, \"test_path.sqlite\")\n                _ = dset.to_sql(\"data\", \"sqlite:///\" + file_path)\n\n                self.assertTrue(os.path.isfile(file_path))\n                sql_dset = pd.read_sql(\"data\", \"sqlite:///\" + file_path)\n\n                self.assertEqual(sql_dset.shape, dset.shape)\n                self.assertListEqual(list(sql_dset.columns), list(dset.column_names))\n\n            # Destination specified as sqlite3 connection\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                import sqlite3\n\n                file_path = os.path.join(tmp_dir, \"test_path.sqlite\")\n                with contextlib.closing(sqlite3.connect(file_path)) as con:\n                    _ = dset.to_sql(\"data\", con, if_exists=\"replace\")\n\n                self.assertTrue(os.path.isfile(file_path))\n                sql_dset = pd.read_sql(\"data\", \"sqlite:///\" + file_path)\n\n                self.assertEqual(sql_dset.shape, dset.shape)\n                self.assertListEqual(list(sql_dset.columns), list(dset.column_names))\n\n            # Test writing to a database in chunks\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                file_path = os.path.join(tmp_dir, \"test_path.sqlite\")\n                _ = dset.to_sql(\"data\", \"sqlite:///\" + file_path, batch_size=1, if_exists=\"replace\")\n\n                
self.assertTrue(os.path.isfile(file_path))\n                sql_dset = pd.read_sql(\"data\", \"sqlite:///\" + file_path)\n\n                self.assertEqual(sql_dset.shape, dset.shape)\n                self.assertListEqual(list(sql_dset.columns), list(dset.column_names))\n\n            # After a select/shuffle transform\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset = dset.select(range(0, len(dset), 2)).shuffle()\n                file_path = os.path.join(tmp_dir, \"test_path.sqlite\")\n                _ = dset.to_sql(\"data\", \"sqlite:///\" + file_path, if_exists=\"replace\")\n\n                self.assertTrue(os.path.isfile(file_path))\n                sql_dset = pd.read_sql(\"data\", \"sqlite:///\" + file_path)\n\n                self.assertEqual(sql_dset.shape, dset.shape)\n                self.assertListEqual(list(sql_dset.columns), list(dset.column_names))\n\n            # With array features\n            if datasets.config.PANDAS_VERSION.major >= 3:\n                # Pandas 3 can't save and reload string data\n                # pandas/_libs/lib.pyx:732: in pandas._libs.lib.ensure_string_array\n                # E   UnicodeDecodeError: 'utf-8' codec can't decode byte 0x98 in position 0: invalid start byte\n                # pandas/_libs/lib.pyx:846: UnicodeDecodeError\n                return\n            with self._create_dummy_dataset(in_memory, tmp_dir, array_features=True) as dset:\n                file_path = os.path.join(tmp_dir, \"test_path.sqlite\")\n                _ = dset.to_sql(\"data\", \"sqlite:///\" + file_path, if_exists=\"replace\")\n\n                self.assertTrue(os.path.isfile(file_path))\n                sql_dset = pd.read_sql(\"data\", \"sqlite:///\" + file_path)\n\n                self.assertEqual(sql_dset.shape, dset.shape)\n                self.assertListEqual(list(sql_dset.columns), list(dset.column_names))\n\n    def test_train_test_split(self, in_memory):\n   
     with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                fingerprint = dset._fingerprint\n                dset_dict = dset.train_test_split(test_size=10, shuffle=False)\n                self.assertListEqual(list(dset_dict.keys()), [\"train\", \"test\"])\n                dset_train = dset_dict[\"train\"]\n                dset_test = dset_dict[\"test\"]\n\n                self.assertEqual(len(dset_train), 20)\n                self.assertEqual(len(dset_test), 10)\n                self.assertEqual(dset_train[0][\"filename\"], \"my_name-train_0\")\n                self.assertEqual(dset_train[-1][\"filename\"], \"my_name-train_19\")\n                self.assertEqual(dset_test[0][\"filename\"], \"my_name-train_20\")\n                self.assertEqual(dset_test[-1][\"filename\"], \"my_name-train_29\")\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                self.assertDictEqual(dset_train.features, Features({\"filename\": Value(\"string\")}))\n                self.assertDictEqual(dset_test.features, Features({\"filename\": Value(\"string\")}))\n                self.assertNotEqual(dset_train._fingerprint, fingerprint)\n                self.assertNotEqual(dset_test._fingerprint, fingerprint)\n                self.assertNotEqual(dset_train._fingerprint, dset_test._fingerprint)\n\n                dset_dict = dset.train_test_split(test_size=0.5, shuffle=False)\n                self.assertListEqual(list(dset_dict.keys()), [\"train\", \"test\"])\n                dset_train = dset_dict[\"train\"]\n                dset_test = dset_dict[\"test\"]\n\n                self.assertEqual(len(dset_train), 15)\n                self.assertEqual(len(dset_test), 15)\n                self.assertEqual(dset_train[0][\"filename\"], \"my_name-train_0\")\n                self.assertEqual(dset_train[-1][\"filename\"], \"my_name-train_14\")\n                
self.assertEqual(dset_test[0][\"filename\"], \"my_name-train_15\")\n                self.assertEqual(dset_test[-1][\"filename\"], \"my_name-train_29\")\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                self.assertDictEqual(dset_train.features, Features({\"filename\": Value(\"string\")}))\n                self.assertDictEqual(dset_test.features, Features({\"filename\": Value(\"string\")}))\n\n                dset_dict = dset.train_test_split(train_size=10, shuffle=False)\n                self.assertListEqual(list(dset_dict.keys()), [\"train\", \"test\"])\n                dset_train = dset_dict[\"train\"]\n                dset_test = dset_dict[\"test\"]\n\n                self.assertEqual(len(dset_train), 10)\n                self.assertEqual(len(dset_test), 20)\n                self.assertEqual(dset_train[0][\"filename\"], \"my_name-train_0\")\n                self.assertEqual(dset_train[-1][\"filename\"], \"my_name-train_9\")\n                self.assertEqual(dset_test[0][\"filename\"], \"my_name-train_10\")\n                self.assertEqual(dset_test[-1][\"filename\"], \"my_name-train_29\")\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                self.assertDictEqual(dset_train.features, Features({\"filename\": Value(\"string\")}))\n                self.assertDictEqual(dset_test.features, Features({\"filename\": Value(\"string\")}))\n\n                dset.set_format(\"numpy\")\n                dset_dict = dset.train_test_split(train_size=10, seed=42)\n                self.assertListEqual(list(dset_dict.keys()), [\"train\", \"test\"])\n                dset_train = dset_dict[\"train\"]\n                dset_test = dset_dict[\"test\"]\n\n                self.assertEqual(len(dset_train), 10)\n                self.assertEqual(len(dset_test), 20)\n                self.assertEqual(dset_train.format[\"type\"], \"numpy\")\n                
self.assertEqual(dset_test.format[\"type\"], \"numpy\")\n                self.assertNotEqual(dset_train[0][\"filename\"].item(), \"my_name-train_0\")\n                self.assertNotEqual(dset_train[-1][\"filename\"].item(), \"my_name-train_9\")\n                self.assertNotEqual(dset_test[0][\"filename\"].item(), \"my_name-train_10\")\n                self.assertNotEqual(dset_test[-1][\"filename\"].item(), \"my_name-train_29\")\n                self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                self.assertDictEqual(dset_train.features, Features({\"filename\": Value(\"string\")}))\n                self.assertDictEqual(dset_test.features, Features({\"filename\": Value(\"string\")}))\n                del dset_test, dset_train, dset_dict  # DatasetDict\n\n    def test_shard(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n            tmp_file = os.path.join(tmp_dir, \"test.arrow\")\n            with dset.select(range(10), indices_cache_file_name=tmp_file) as dset:\n                self.assertEqual(len(dset), 10)\n                # Shard non-contiguous\n                tmp_file_1 = os.path.join(tmp_dir, \"test_1.arrow\")\n                fingerprint = dset._fingerprint\n                with dset.shard(\n                    num_shards=8, index=1, contiguous=False, indices_cache_file_name=tmp_file_1\n                ) as dset_sharded:\n                    self.assertEqual(2, len(dset_sharded))\n                    self.assertEqual([\"my_name-train_1\", \"my_name-train_9\"], dset_sharded[\"filename\"])\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(dset_sharded.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertNotEqual(dset_sharded._fingerprint, fingerprint)\n                # Shard contiguous\n                
tmp_file_2 = os.path.join(tmp_dir, \"test_2.arrow\")\n                with dset.shard(\n                    num_shards=3, index=0, contiguous=True, indices_cache_file_name=tmp_file_2\n                ) as dset_sharded_contiguous:\n                    self.assertEqual([f\"my_name-train_{i}\" for i in (0, 1, 2, 3)], dset_sharded_contiguous[\"filename\"])\n                    self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\")}))\n                    self.assertDictEqual(dset_sharded_contiguous.features, Features({\"filename\": Value(\"string\")}))\n                    # Test lengths of sharded contiguous\n                    self.assertEqual(\n                        [4, 3, 3],\n                        [\n                            len(dset.shard(3, index=i, contiguous=True, indices_cache_file_name=tmp_file_2 + str(i)))\n                            for i in range(3)\n                        ],\n                    )\n                # formatted\n                dset.set_format(\"numpy\")\n                with dset.shard(num_shards=3, index=0) as dset_sharded_formatted:\n                    self.assertEqual(dset_sharded_formatted.format[\"type\"], \"numpy\")\n\n    def test_flatten_indices(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                self.assertIsNone(dset._indices)\n\n                tmp_file = os.path.join(tmp_dir, \"test.arrow\")\n                with dset.select(range(0, 10, 2), indices_cache_file_name=tmp_file) as dset:\n                    self.assertEqual(len(dset), 5)\n\n                    self.assertIsNotNone(dset._indices)\n\n                    tmp_file_2 = os.path.join(tmp_dir, \"test_2.arrow\")\n                    fingerprint = dset._fingerprint\n                    dset.set_format(\"numpy\")\n                    with dset.flatten_indices(cache_file_name=tmp_file_2) as dset:\n                        
self.assertEqual(len(dset), 5)\n                        self.assertEqual(len(dset.data), len(dset))\n                        self.assertIsNone(dset._indices)\n                        self.assertNotEqual(dset._fingerprint, fingerprint)\n                        self.assertEqual(dset.format[\"type\"], \"numpy\")\n                        # Test unique works\n                        dset.unique(dset.column_names[0])\n                        assert_arrow_metadata_are_synced_with_dataset_features(dset)\n\n        # Empty indices mapping\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir) as dset:\n                self.assertIsNone(dset._indices, None)\n\n                tmp_file = os.path.join(tmp_dir, \"test.arrow\")\n                with dset.filter(lambda _: False, cache_file_name=tmp_file) as dset:\n                    self.assertEqual(len(dset), 0)\n\n                    self.assertIsNotNone(dset._indices, None)\n\n                    tmp_file_2 = os.path.join(tmp_dir, \"test_2.arrow\")\n                    fingerprint = dset._fingerprint\n                    dset.set_format(\"numpy\")\n                    with dset.flatten_indices(cache_file_name=tmp_file_2) as dset:\n                        self.assertEqual(len(dset), 0)\n                        self.assertEqual(len(dset.data), len(dset))\n                        self.assertIsNone(dset._indices, None)\n                        self.assertNotEqual(dset._fingerprint, fingerprint)\n                        self.assertEqual(dset.format[\"type\"], \"numpy\")\n                        # Test unique works\n                        dset.unique(dset.column_names[0])\n                        assert_arrow_metadata_are_synced_with_dataset_features(dset)\n\n    @require_tf\n    @require_torch\n    def test_format_vectors(self, in_memory):\n        import numpy as np\n        import tensorflow as tf\n        import torch\n\n        with (\n            
tempfile.TemporaryDirectory() as tmp_dir,\n            self._create_dummy_dataset(in_memory, tmp_dir) as dset,\n            dset.map(lambda ex, i: {\"vec\": np.ones(3) * i}, with_indices=True) as dset,\n        ):\n            columns = dset.column_names\n\n            self.assertIsNotNone(dset[0])\n            self.assertIsNotNone(dset[:2])\n            for col in columns:\n                self.assertIsInstance(dset[0][col], (str, list))\n                self.assertIsInstance(dset[:2][col], list)\n            self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\"), \"vec\": List(Value(\"float64\"))}))\n\n            dset.set_format(\"tensorflow\")\n            self.assertIsNotNone(dset[0])\n            self.assertIsNotNone(dset[:2])\n            for col in columns:\n                self.assertIsInstance(dset[0][col], (tf.Tensor, tf.RaggedTensor))\n                self.assertIsInstance(dset[:2][col], (tf.Tensor, tf.RaggedTensor))\n                self.assertIsInstance(dset[col][:2], (tf.Tensor, tf.RaggedTensor))\n            self.assertTupleEqual(tuple(dset[:2][\"vec\"].shape), (2, 3))\n            self.assertTupleEqual(tuple(dset[\"vec\"][:2].shape), (2, 3))\n\n            dset.set_format(\"numpy\")\n            self.assertIsNotNone(dset[0])\n            self.assertIsNotNone(dset[:2])\n            self.assertIsInstance(dset[0][\"filename\"], np.str_)\n            self.assertIsInstance(dset[:2][\"filename\"], np.ndarray)\n            self.assertIsInstance(dset[\"filename\"][:], np.ndarray)\n            self.assertIsInstance(dset[0][\"vec\"], np.ndarray)\n            self.assertIsInstance(dset[:2][\"vec\"], np.ndarray)\n            self.assertIsInstance(dset[\"vec\"][:2], np.ndarray)\n            self.assertTupleEqual(dset[:2][\"vec\"].shape, (2, 3))\n            self.assertTupleEqual(dset[\"vec\"][:2].shape, (2, 3))\n\n            dset.set_format(\"torch\", columns=[\"vec\"])\n            self.assertIsNotNone(dset[0])\n            
self.assertIsNotNone(dset[:2])\n            # torch.Tensor is only for numerical columns\n            self.assertIsInstance(dset[0][\"vec\"], torch.Tensor)\n            self.assertIsInstance(dset[:2][\"vec\"], torch.Tensor)\n            self.assertIsInstance(dset[\"vec\"][:2], torch.Tensor)\n            self.assertTupleEqual(dset[:2][\"vec\"].shape, (2, 3))\n            self.assertTupleEqual(dset[\"vec\"][:2].shape, (2, 3))\n\n    @require_tf\n    @require_torch\n    def test_format_ragged_vectors(self, in_memory):\n        import numpy as np\n        import tensorflow as tf\n        import torch\n\n        with (\n            tempfile.TemporaryDirectory() as tmp_dir,\n            self._create_dummy_dataset(in_memory, tmp_dir) as dset,\n            dset.map(lambda ex, i: {\"vec\": np.ones(3 + i) * i}, with_indices=True) as dset,\n        ):\n            columns = dset.column_names\n\n            self.assertIsNotNone(dset[0])\n            self.assertIsNotNone(dset[:2])\n            for col in columns:\n                self.assertIsInstance(dset[0][col], (str, list))\n                self.assertIsInstance(dset[:2][col], list)\n            self.assertDictEqual(dset.features, Features({\"filename\": Value(\"string\"), \"vec\": List(Value(\"float64\"))}))\n\n            dset.set_format(\"tensorflow\")\n            self.assertIsNotNone(dset[0])\n            self.assertIsNotNone(dset[:2])\n            for col in columns:\n                self.assertIsInstance(dset[0][col], tf.Tensor)\n                self.assertIsInstance(dset[:2][col], tf.RaggedTensor if col == \"vec\" else tf.Tensor)\n                self.assertIsInstance(dset[col][:2], tf.RaggedTensor if col == \"vec\" else tf.Tensor)\n            # dim is None for ragged vectors in tensorflow\n            self.assertListEqual(dset[:2][\"vec\"].shape.as_list(), [2, None])\n            self.assertListEqual(dset[\"vec\"][:2].shape.as_list(), [2, None])\n\n            dset.set_format(\"numpy\")\n            
self.assertIsNotNone(dset[0])\n            self.assertIsNotNone(dset[:2])\n            self.assertIsInstance(dset[0][\"filename\"], np.str_)\n            self.assertIsInstance(dset[:2][\"filename\"], np.ndarray)\n            self.assertIsInstance(dset[\"filename\"][:2], np.ndarray)\n            self.assertIsInstance(dset[0][\"vec\"], np.ndarray)\n            self.assertIsInstance(dset[:2][\"vec\"], np.ndarray)\n            self.assertIsInstance(dset[\"vec\"][:], np.ndarray)\n            # array is flat for ragged vectors in numpy\n            self.assertTupleEqual(dset[:2][\"vec\"].shape, (2,))\n            self.assertTupleEqual(dset[\"vec\"][:2].shape, (2,))\n\n            dset.set_format(\"torch\")\n            self.assertIsNotNone(dset[0])\n            self.assertIsNotNone(dset[:2])\n            self.assertIsInstance(dset[0][\"filename\"], str)\n            self.assertIsInstance(dset[:2][\"filename\"], list)\n            self.assertIsInstance(dset[\"filename\"][:2], list)\n            self.assertIsInstance(dset[0][\"vec\"], torch.Tensor)\n            self.assertIsInstance(dset[:2][\"vec\"][0], torch.Tensor)\n            self.assertIsInstance(dset[\"vec\"][0], torch.Tensor)\n            # pytorch doesn't support ragged tensors, so we should have lists\n            self.assertIsInstance(dset[:2][\"vec\"], list)\n            self.assertIsInstance(dset[:2][\"vec\"][0], torch.Tensor)\n            self.assertIsInstance(dset[\"vec\"][:2], list)\n            self.assertIsInstance(dset[\"vec\"][0], torch.Tensor)\n\n    @require_tf\n    @require_torch\n    def test_format_nested(self, in_memory):\n        import numpy as np\n        import tensorflow as tf\n        import torch\n\n        with (\n            tempfile.TemporaryDirectory() as tmp_dir,\n            self._create_dummy_dataset(in_memory, tmp_dir) as dset,\n            dset.map(lambda ex: {\"nested\": [{\"foo\": np.ones(3)}] * len(ex[\"filename\"])}, batched=True) as dset,\n        ):\n            
self.assertDictEqual(\n                dset.features, Features({\"filename\": Value(\"string\"), \"nested\": {\"foo\": List(Value(\"float64\"))}})\n            )\n\n            dset.set_format(\"tensorflow\")\n            self.assertIsNotNone(dset[0])\n            self.assertIsInstance(dset[0][\"nested\"][\"foo\"], (tf.Tensor, tf.RaggedTensor))\n            self.assertIsNotNone(dset[:2])\n            self.assertIsInstance(dset[:2][\"nested\"][0][\"foo\"], (tf.Tensor, tf.RaggedTensor))\n            self.assertIsInstance(dset[\"nested\"][0][\"foo\"], (tf.Tensor, tf.RaggedTensor))\n\n            dset.set_format(\"numpy\")\n            self.assertIsNotNone(dset[0])\n            self.assertIsInstance(dset[0][\"nested\"][\"foo\"], np.ndarray)\n            self.assertIsNotNone(dset[:2])\n            self.assertIsInstance(dset[:2][\"nested\"][0][\"foo\"], np.ndarray)\n            self.assertIsInstance(dset[\"nested\"][0][\"foo\"], np.ndarray)\n\n            dset.set_format(\"torch\", columns=\"nested\")\n            self.assertIsNotNone(dset[0])\n            self.assertIsInstance(dset[0][\"nested\"][\"foo\"], torch.Tensor)\n            self.assertIsNotNone(dset[:2])\n            self.assertIsInstance(dset[:2][\"nested\"][0][\"foo\"], torch.Tensor)\n            self.assertIsInstance(dset[\"nested\"][0][\"foo\"], torch.Tensor)\n\n    def test_format_pandas(self, in_memory):\n        import pandas as pd\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset.set_format(\"pandas\")\n                self.assertIsInstance(dset[0], pd.DataFrame)\n                self.assertIsInstance(dset[:2], pd.DataFrame)\n                self.assertIsInstance(dset[\"col_1\"], pd.Series)\n\n    @require_polars\n    def test_format_polars(self, in_memory):\n        import polars as pl\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with 
self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                dset.set_format(\"polars\")\n                self.assertIsInstance(dset[0], pl.DataFrame)\n                self.assertIsInstance(dset[:2], pl.DataFrame)\n                self.assertIsInstance(dset[\"col_1\"], pl.Series)\n\n    def test_transmit_format_single(self, in_memory):\n        @transmit_format\n        def my_single_transform(self, return_factory, *args, **kwargs):\n            return return_factory()\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            return_factory = partial(\n                self._create_dummy_dataset, in_memory=in_memory, tmp_dir=tmp_dir, multiple_columns=True\n            )\n            with return_factory() as dset:\n                dset.set_format(\"numpy\", columns=[\"col_1\"])\n                prev_format = dset.format\n                with my_single_transform(dset, return_factory) as transformed_dset:\n                    self.assertDictEqual(transformed_dset.format, prev_format)\n\n    def test_transmit_format_dict(self, in_memory):\n        @transmit_format\n        def my_split_transform(self, return_factory, *args, **kwargs):\n            return DatasetDict({\"train\": return_factory()})\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            return_factory = partial(\n                self._create_dummy_dataset, in_memory=in_memory, tmp_dir=tmp_dir, multiple_columns=True\n            )\n            with return_factory() as dset:\n                dset.set_format(\"numpy\", columns=[\"col_1\"])\n                prev_format = dset.format\n                transformed_dset = my_split_transform(dset, return_factory)[\"train\"]\n                self.assertDictEqual(transformed_dset.format, prev_format)\n\n                del transformed_dset  # DatasetDict\n\n    def test_with_format(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with 
self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                with dset.with_format(\"numpy\", columns=[\"col_1\"]) as dset2:\n                    dset.set_format(\"numpy\", columns=[\"col_1\"])\n                    self.assertDictEqual(dset.format, dset2.format)\n                    self.assertEqual(dset._fingerprint, dset2._fingerprint)\n                    # dset.reset_format()\n                    # self.assertNotEqual(dset.format, dset2.format)\n                    # self.assertNotEqual(dset._fingerprint, dset2._fingerprint)\n\n    def test_with_transform(self, in_memory):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:\n                transform = lambda x: {\"foo\": x[\"col_1\"]}  # noqa: E731\n                with dset.with_transform(transform, columns=[\"col_1\"]) as dset2:\n                    dset.set_transform(transform, columns=[\"col_1\"])\n                    self.assertDictEqual(dset.format, dset2.format)\n                    self.assertEqual(dset._fingerprint, dset2._fingerprint)\n                    dset.reset_format()\n                    self.assertNotEqual(dset.format, dset2.format)\n                    self.assertNotEqual(dset._fingerprint, dset2._fingerprint)\n\n    @require_tf\n    def test_tf_dataset_conversion(self, in_memory):\n        tmp_dir = tempfile.TemporaryDirectory()\n        for num_workers in [0, 1, 2]:\n            if num_workers > 0 and sys.platform == \"win32\" and not in_memory:\n                continue  # This test hangs on the Py3.10 test worker, but it runs fine locally on my Windows machine\n            with self._create_dummy_dataset(in_memory, tmp_dir.name, array_features=True) as dset:\n                tf_dataset = dset.to_tf_dataset(columns=\"col_3\", batch_size=2, num_workers=num_workers)\n                batch = next(iter(tf_dataset))\n                
self.assertEqual(batch.shape.as_list(), [2, 4])\n                self.assertEqual(batch.dtype.name, \"int64\")\n            with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:\n                tf_dataset = dset.to_tf_dataset(columns=\"col_1\", batch_size=2, num_workers=num_workers)\n                batch = next(iter(tf_dataset))\n                self.assertEqual(batch.shape.as_list(), [2])\n                self.assertEqual(batch.dtype.name, \"int64\")\n            with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:\n                # Check that it works with all default options (except batch_size because the dummy dataset only has 4)\n                tf_dataset = dset.to_tf_dataset(batch_size=2, num_workers=num_workers)\n                batch = next(iter(tf_dataset))\n                self.assertEqual(batch[\"col_1\"].shape.as_list(), [2])\n                self.assertEqual(batch[\"col_2\"].shape.as_list(), [2])\n                self.assertEqual(batch[\"col_1\"].dtype.name, \"int64\")\n                self.assertEqual(batch[\"col_2\"].dtype.name, \"string\")  # Assert that we're converting strings properly\n            with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:\n                # Check that when we use a transform that creates a new column from existing column values\n                # but don't load the old columns that the new column depends on in the final dataset,\n                # that they're still kept around long enough to be used in the transform\n                transform_dset = dset.with_transform(\n                    lambda x: {\"new_col\": [val * 2 for val in x[\"col_1\"]], \"col_1\": x[\"col_1\"]}\n                )\n                tf_dataset = transform_dset.to_tf_dataset(columns=\"new_col\", batch_size=2, num_workers=num_workers)\n                batch = next(iter(tf_dataset))\n                
self.assertEqual(batch.shape.as_list(), [2])\n                self.assertEqual(batch.dtype.name, \"int64\")\n                del transform_dset\n        del tf_dataset  # For correct cleanup\n\n    @require_tf\n    def test_tf_index_reshuffling(self, in_memory):\n        # This test checks that when we do two epochs over a tf.data.Dataset from to_tf_dataset\n        # that we get a different shuffle order each time\n        # It also checks that when we aren't shuffling, that the dataset order is fully preserved\n        # even when loading is split across multiple workers\n        data = {\"col_1\": list(range(20))}\n        for num_workers in [0, 1, 2, 3]:\n            with Dataset.from_dict(data) as dset:\n                tf_dataset = dset.to_tf_dataset(batch_size=10, shuffle=True, num_workers=num_workers)\n                indices = []\n                for batch in tf_dataset:\n                    indices.append(batch[\"col_1\"])\n                indices = np.concatenate([arr.numpy() for arr in indices])\n                second_indices = []\n                for batch in tf_dataset:\n                    second_indices.append(batch[\"col_1\"])\n                second_indices = np.concatenate([arr.numpy() for arr in second_indices])\n                self.assertFalse(np.array_equal(indices, second_indices))\n                self.assertEqual(len(indices), len(np.unique(indices)))\n                self.assertEqual(len(second_indices), len(np.unique(second_indices)))\n\n                tf_dataset = dset.to_tf_dataset(batch_size=1, shuffle=False, num_workers=num_workers)\n                for i, batch in enumerate(tf_dataset):\n                    # Assert that the unshuffled order is fully preserved even when multiprocessing\n                    self.assertEqual(i, batch[\"col_1\"].numpy())\n\n    @require_tf\n    def test_tf_label_renaming(self, in_memory):\n        # Protect TF-specific imports in here\n        import tensorflow as tf\n\n        from 
datasets.utils.tf_utils import minimal_tf_collate_fn_with_renaming\n\n        tmp_dir = tempfile.TemporaryDirectory()\n        with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:\n            with dset.rename_columns({\"col_1\": \"features\", \"col_2\": \"label\"}) as new_dset:\n                tf_dataset = new_dset.to_tf_dataset(collate_fn=minimal_tf_collate_fn_with_renaming, batch_size=4)\n                batch = next(iter(tf_dataset))\n                self.assertTrue(\"labels\" in batch and \"features\" in batch)\n\n                tf_dataset = new_dset.to_tf_dataset(\n                    columns=[\"features\", \"labels\"], collate_fn=minimal_tf_collate_fn_with_renaming, batch_size=4\n                )\n                batch = next(iter(tf_dataset))\n                self.assertTrue(\"labels\" in batch and \"features\" in batch)\n\n                tf_dataset = new_dset.to_tf_dataset(\n                    columns=[\"features\", \"label\"], collate_fn=minimal_tf_collate_fn_with_renaming, batch_size=4\n                )\n                batch = next(iter(tf_dataset))\n                self.assertTrue(\"labels\" in batch and \"features\" in batch)  # Assert renaming was handled correctly\n\n                tf_dataset = new_dset.to_tf_dataset(\n                    columns=[\"features\"],\n                    label_cols=[\"labels\"],\n                    collate_fn=minimal_tf_collate_fn_with_renaming,\n                    batch_size=4,\n                )\n                batch = next(iter(tf_dataset))\n                self.assertEqual(len(batch), 2)\n                # Assert that we don't have any empty entries here\n                self.assertTrue(isinstance(batch[0], tf.Tensor) and isinstance(batch[1], tf.Tensor))\n\n                tf_dataset = new_dset.to_tf_dataset(\n                    columns=[\"features\"],\n                    label_cols=[\"label\"],\n                    collate_fn=minimal_tf_collate_fn_with_renaming,\n      
              batch_size=4,\n                )\n                batch = next(iter(tf_dataset))\n                self.assertEqual(len(batch), 2)\n                # Assert that we don't have any empty entries here\n                self.assertTrue(isinstance(batch[0], tf.Tensor) and isinstance(batch[1], tf.Tensor))\n\n                tf_dataset = new_dset.to_tf_dataset(\n                    columns=[\"features\"],\n                    collate_fn=minimal_tf_collate_fn_with_renaming,\n                    batch_size=4,\n                )\n                batch = next(iter(tf_dataset))\n                # Assert that labels didn't creep in when we don't ask for them\n                # just because the collate_fn added them\n                self.assertTrue(isinstance(batch, tf.Tensor))\n\n        del tf_dataset  # For correct cleanup\n\n    @require_tf\n    def test_tf_dataset_options(self, in_memory):\n        tmp_dir = tempfile.TemporaryDirectory()\n        # Test that batch_size option works as expected\n        with self._create_dummy_dataset(in_memory, tmp_dir.name, array_features=True) as dset:\n            tf_dataset = dset.to_tf_dataset(columns=\"col_3\", batch_size=2)\n            batch = next(iter(tf_dataset))\n            self.assertEqual(batch.shape.as_list(), [2, 4])\n            self.assertEqual(batch.dtype.name, \"int64\")\n        # Test that batch_size=None (optional) works as expected\n        with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:\n            tf_dataset = dset.to_tf_dataset(columns=\"col_3\", batch_size=None)\n            single_example = next(iter(tf_dataset))\n            self.assertEqual(single_example.shape.as_list(), [])\n            self.assertEqual(single_example.dtype.name, \"int64\")\n            # Assert that we can batch it with `tf.data.Dataset.batch` method\n            batched_dataset = tf_dataset.batch(batch_size=2)\n            batch = next(iter(batched_dataset))\n            
self.assertEqual(batch.shape.as_list(), [2])\n            self.assertEqual(batch.dtype.name, \"int64\")\n        # Test that batching a batch_size=None dataset produces the same results as using batch_size arg\n        with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:\n            batch_size = 2\n            tf_dataset_no_batch = dset.to_tf_dataset(columns=\"col_3\")\n            tf_dataset_batch = dset.to_tf_dataset(columns=\"col_3\", batch_size=batch_size)\n            self.assertEqual(tf_dataset_no_batch.element_spec, tf_dataset_batch.unbatch().element_spec)\n            self.assertEqual(tf_dataset_no_batch.cardinality(), tf_dataset_batch.cardinality() * batch_size)\n            for batch_1, batch_2 in zip(tf_dataset_no_batch.batch(batch_size=batch_size), tf_dataset_batch):\n                self.assertEqual(batch_1.shape, batch_2.shape)\n                self.assertEqual(batch_1.dtype, batch_2.dtype)\n                self.assertListEqual(batch_1.numpy().tolist(), batch_2.numpy().tolist())\n        # Test that requesting label_cols works as expected\n        with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:\n            tf_dataset = dset.to_tf_dataset(columns=\"col_1\", label_cols=[\"col_2\", \"col_3\"], batch_size=4)\n            batch = next(iter(tf_dataset))\n            self.assertEqual(len(batch), 2)\n            self.assertEqual(set(batch[1].keys()), {\"col_2\", \"col_3\"})\n            self.assertEqual(batch[0].dtype.name, \"int64\")\n            # Assert data comes out as expected and isn't shuffled\n            self.assertEqual(batch[0].numpy().tolist(), [3, 2, 1, 0])\n            self.assertEqual(batch[1][\"col_2\"].numpy().tolist(), [b\"a\", b\"b\", b\"c\", b\"d\"])\n            self.assertEqual(batch[1][\"col_3\"].numpy().tolist(), [0, 1, 0, 1])\n        # Check that incomplete batches are dropped if requested\n        with self._create_dummy_dataset(in_memory, tmp_dir.name, 
multiple_columns=True) as dset:\n            tf_dataset = dset.to_tf_dataset(columns=\"col_1\", batch_size=3)\n            tf_dataset_with_drop = dset.to_tf_dataset(columns=\"col_1\", batch_size=3, drop_remainder=True)\n            self.assertEqual(len(tf_dataset), 2)  # One batch of 3 and one batch of 1\n            self.assertEqual(len(tf_dataset_with_drop), 1)  # Incomplete batch of 1 is dropped\n        # Test that `NotImplementedError` is raised `batch_size` is None and `num_workers` is > 0\n        with self._create_dummy_dataset(in_memory, tmp_dir.name, multiple_columns=True) as dset:\n            with self.assertRaisesRegex(\n                NotImplementedError, \"`batch_size` must be specified when using multiple workers\"\n            ):\n                dset.to_tf_dataset(columns=\"col_1\", batch_size=None, num_workers=2)\n        del tf_dataset  # For correct cleanup\n        del tf_dataset_with_drop\n\n\n_messages = [\n    {\"role\": \"user\", \"content\": \"Turn on the living room lights and play my electronic music playlist.\"},\n    {\n        \"role\": \"assistant\",\n        \"tool_calls\": [\n            {\n                \"type\": \"function\",\n                \"function\": {\"name\": \"control_light\", \"arguments\": {\"room\": \"living room\", \"state\": \"on\"}},\n            },\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"play_music\",\n                    \"arguments\": {\n                        \"playlist\": \"electronic\"\n                    },  # mixed-type here since keys [\"playlist\"] and [\"room\", \"state\"] are different\n                },\n            },\n        ],\n    },\n    {\"role\": \"tool\", \"name\": \"control_light\", \"content\": \"The lights in the living room are now on.\"},\n    {\"role\": \"tool\", \"name\": \"play_music\", \"content\": \"The music is now playing.\"},\n    {\"role\": \"assistant\", \"content\": 
\"Done!\"},\n]\n\n\nclass MiscellaneousDatasetTest(TestCase):\n    def test_from_pandas(self):\n        data = {\"col_1\": [3, 2, 1, 0], \"col_2\": [\"a\", \"b\", \"c\", \"d\"]}\n        df = pd.DataFrame.from_dict(data)\n        with Dataset.from_pandas(df) as dset:\n            self.assertSequenceEqual(dset[\"col_1\"], data[\"col_1\"])\n            self.assertSequenceEqual(dset[\"col_2\"], data[\"col_2\"])\n            self.assertListEqual(list(dset.features.keys()), [\"col_1\", \"col_2\"])\n            self.assertDictEqual(\n                dset.features, Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(STRING_FROM_PANDAS)})\n            )\n\n        features = Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"string\")})\n        with Dataset.from_pandas(df, features=features) as dset:\n            self.assertSequenceEqual(dset[\"col_1\"], data[\"col_1\"])\n            self.assertSequenceEqual(dset[\"col_2\"], data[\"col_2\"])\n            self.assertListEqual(list(dset.features.keys()), [\"col_1\", \"col_2\"])\n            self.assertDictEqual(dset.features, Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"string\")}))\n\n        features = Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"string\")})\n        with Dataset.from_pandas(df, features=features, info=DatasetInfo(features=features)) as dset:\n            self.assertSequenceEqual(dset[\"col_1\"], data[\"col_1\"])\n            self.assertSequenceEqual(dset[\"col_2\"], data[\"col_2\"])\n            self.assertListEqual(list(dset.features.keys()), [\"col_1\", \"col_2\"])\n            self.assertDictEqual(dset.features, Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"string\")}))\n\n        features = Features({\"col_1\": List(Value(\"string\")), \"col_2\": Value(\"string\")})\n        self.assertRaises(TypeError, Dataset.from_pandas, df, features=features)\n\n    @require_polars\n    def test_from_polars(self):\n        import polars as pl\n\n        data = 
{\"col_1\": [3, 2, 1, 0], \"col_2\": [\"a\", \"b\", \"c\", \"d\"]}\n        df = pl.from_dict(data)\n        with Dataset.from_polars(df) as dset:\n            self.assertSequenceEqual(dset[\"col_1\"], data[\"col_1\"])\n            self.assertSequenceEqual(dset[\"col_2\"], data[\"col_2\"])\n            self.assertListEqual(list(dset.features.keys()), [\"col_1\", \"col_2\"])\n            self.assertDictEqual(dset.features, Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"large_string\")}))\n\n        features = Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"large_string\")})\n        with Dataset.from_polars(df, features=features) as dset:\n            self.assertSequenceEqual(dset[\"col_1\"], data[\"col_1\"])\n            self.assertSequenceEqual(dset[\"col_2\"], data[\"col_2\"])\n            self.assertListEqual(list(dset.features.keys()), [\"col_1\", \"col_2\"])\n            self.assertDictEqual(dset.features, Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"large_string\")}))\n\n        features = Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"large_string\")})\n        with Dataset.from_polars(df, features=features, info=DatasetInfo(features=features)) as dset:\n            self.assertSequenceEqual(dset[\"col_1\"], data[\"col_1\"])\n            self.assertSequenceEqual(dset[\"col_2\"], data[\"col_2\"])\n            self.assertListEqual(list(dset.features.keys()), [\"col_1\", \"col_2\"])\n            self.assertDictEqual(dset.features, Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"large_string\")}))\n\n        features = Features({\"col_1\": List(Value(\"string\")), \"col_2\": Value(\"large_string\")})\n        self.assertRaises(TypeError, Dataset.from_polars, df, features=features)\n\n    def test_from_dict(self):\n        data = {\"col_1\": [3, 2, 1, 0], \"col_2\": [\"a\", \"b\", \"c\", \"d\"], \"col_3\": pa.array([True, False, True, False])}\n        with Dataset.from_dict(data) as dset:\n            
self.assertSequenceEqual(dset[\"col_1\"], data[\"col_1\"])\n            self.assertSequenceEqual(dset[\"col_2\"], data[\"col_2\"])\n            self.assertSequenceEqual(dset[\"col_3\"], data[\"col_3\"].to_pylist())\n            self.assertListEqual(list(dset.features.keys()), [\"col_1\", \"col_2\", \"col_3\"])\n            self.assertDictEqual(\n                dset.features, Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"string\"), \"col_3\": Value(\"bool\")})\n            )\n\n        features = Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"string\"), \"col_3\": Value(\"bool\")})\n        with Dataset.from_dict(data, features=features) as dset:\n            self.assertSequenceEqual(dset[\"col_1\"], data[\"col_1\"])\n            self.assertSequenceEqual(dset[\"col_2\"], data[\"col_2\"])\n            self.assertSequenceEqual(dset[\"col_3\"], data[\"col_3\"].to_pylist())\n            self.assertListEqual(list(dset.features.keys()), [\"col_1\", \"col_2\", \"col_3\"])\n            self.assertDictEqual(\n                dset.features, Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"string\"), \"col_3\": Value(\"bool\")})\n            )\n\n        features = Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"string\"), \"col_3\": Value(\"bool\")})\n        with Dataset.from_dict(data, features=features, info=DatasetInfo(features=features)) as dset:\n            self.assertSequenceEqual(dset[\"col_1\"], data[\"col_1\"])\n            self.assertSequenceEqual(dset[\"col_2\"], data[\"col_2\"])\n            self.assertSequenceEqual(dset[\"col_3\"], data[\"col_3\"].to_pylist())\n            self.assertListEqual(list(dset.features.keys()), [\"col_1\", \"col_2\", \"col_3\"])\n            self.assertDictEqual(\n                dset.features, Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"string\"), \"col_3\": Value(\"bool\")})\n            )\n\n        features = Features({\"col_1\": Value(\"string\"), \"col_2\": 
Value(\"string\"), \"col_3\": Value(\"int32\")})\n        with Dataset.from_dict(data, features=features) as dset:\n            # the integers are converted to strings\n            self.assertSequenceEqual(dset[\"col_1\"], [str(x) for x in data[\"col_1\"]])\n            self.assertSequenceEqual(dset[\"col_2\"], data[\"col_2\"])\n            self.assertSequenceEqual(dset[\"col_3\"], [int(x) for x in data[\"col_3\"].to_pylist()])\n            self.assertListEqual(list(dset.features.keys()), [\"col_1\", \"col_2\", \"col_3\"])\n            self.assertDictEqual(\n                dset.features, Features({\"col_1\": Value(\"string\"), \"col_2\": Value(\"string\"), \"col_3\": Value(\"int32\")})\n            )\n\n        features = Features({\"col_1\": Value(\"int64\"), \"col_2\": Value(\"int64\"), \"col_3\": Value(\"bool\")})\n        self.assertRaises(ValueError, Dataset.from_dict, data, features=features)\n\n    def test_from_dict_on_mixed_types(self):\n        data = {\"col_1\": [-1, 1, \"foo\"]}\n        with Dataset.from_dict(data, on_mixed_types=\"use_json\") as dset:\n            self.assertEqual(dset[:], data)\n        data = {\"col_1\": [{\"a\": 0}, [0]]}\n        with Dataset.from_dict(data, on_mixed_types=\"use_json\") as dset:\n            self.assertEqual(dset[:], data)\n        data = {\"col_1\": [{\"a\": 0}, {\"b\": 0}, {\"c\": 0}]}\n        with Dataset.from_dict(data, on_mixed_types=\"use_json\") as dset:\n            self.assertEqual(dset[:], data)\n        data = {\"col_1\": [[{\"a\": 0}, {\"b\": 0}], [{\"c\": 0}, {\"d\": 0}]]}\n        with Dataset.from_dict(data, on_mixed_types=\"use_json\") as dset:\n            self.assertEqual(dset[:], data)\n        data = {\"messages\": [_messages]}\n        with Dataset.from_dict(data, on_mixed_types=\"use_json\") as dset:\n            self.assertEqual(dset[:], data)\n        data = {\"empty_struct\": [{}]}\n        with Dataset.from_dict(data, on_mixed_types=\"use_json\") as dset:\n            
self.assertEqual(dset[:], data)\n            self.assertEqual(dset.features[\"empty_struct\"], Json())\n\n    def test_concatenate_mixed_memory_and_disk(self):\n        data1, data2, data3 = {\"id\": [0, 1, 2]}, {\"id\": [3, 4, 5]}, {\"id\": [6, 7]}\n        info1 = DatasetInfo(description=\"Dataset1\")\n        info2 = DatasetInfo(description=\"Dataset2\")\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with (\n                Dataset.from_dict(data1, info=info1).map(cache_file_name=os.path.join(tmp_dir, \"d1.arrow\")) as dset1,\n                Dataset.from_dict(data2, info=info2).map(cache_file_name=os.path.join(tmp_dir, \"d2.arrow\")) as dset2,\n                Dataset.from_dict(data3) as dset3,\n            ):\n                with concatenate_datasets([dset1, dset2, dset3]) as concatenated_dset:\n                    self.assertEqual(len(concatenated_dset), len(dset1) + len(dset2) + len(dset3))\n                    self.assertSequenceEqual(concatenated_dset[\"id\"], dset1[\"id\"][:] + dset2[\"id\"][:] + dset3[\"id\"][:])\n\n    @require_transformers\n    @pytest.mark.integration\n    def test_set_format_encode(self):\n        from transformers import BertTokenizer\n\n        tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n\n        def encode(batch):\n            return tokenizer(batch[\"text\"], padding=\"longest\", return_tensors=\"np\")\n\n        with Dataset.from_dict({\"text\": [\"hello there\", \"foo\"]}) as dset:\n            dset.set_transform(transform=encode)\n            self.assertEqual(str(dset[:2]), str(encode({\"text\": [\"hello there\", \"foo\"]})))\n\n    @require_tf\n    def test_tf_string_encoding(self):\n        data = {\"col_1\": [\"á\", \"é\", \"í\", \"ó\", \"ú\"], \"col_2\": [\"à\", \"è\", \"ì\", \"ò\", \"ù\"]}\n        with Dataset.from_dict(data) as dset:\n            tf_dset_wo_batch = dset.to_tf_dataset(columns=[\"col_1\", \"col_2\"])\n            for tf_row, row in zip(tf_dset_wo_batch, 
dset):\n                self.assertEqual(tf_row[\"col_1\"].numpy().decode(\"utf-8\"), row[\"col_1\"])\n                self.assertEqual(tf_row[\"col_2\"].numpy().decode(\"utf-8\"), row[\"col_2\"])\n\n            tf_dset_w_batch = dset.to_tf_dataset(columns=[\"col_1\", \"col_2\"], batch_size=2)\n            for tf_row, row in zip(tf_dset_w_batch.unbatch(), dset):\n                self.assertEqual(tf_row[\"col_1\"].numpy().decode(\"utf-8\"), row[\"col_1\"])\n                self.assertEqual(tf_row[\"col_2\"].numpy().decode(\"utf-8\"), row[\"col_2\"])\n\n            self.assertEqual(tf_dset_w_batch.unbatch().element_spec, tf_dset_wo_batch.element_spec)\n            self.assertEqual(tf_dset_w_batch.element_spec, tf_dset_wo_batch.batch(2).element_spec)\n\n\ndef test_cast_with_sliced_list():\n    old_features = Features({\"foo\": List(Value(\"int64\"))})\n    new_features = Features({\"foo\": List(Value(\"int32\"))})\n    dataset = Dataset.from_dict({\"foo\": [[i] * (i % 3) for i in range(20)]}, features=old_features)\n    casted_dataset = dataset.cast(new_features, batch_size=2)  # small batch size to slice the ListArray\n    assert dataset[\"foo\"] == casted_dataset[\"foo\"]\n    assert casted_dataset.features == new_features\n\n\n@pytest.mark.parametrize(\"include_nulls\", [False, True])\ndef test_class_encode_column_with_none(include_nulls):\n    dataset = Dataset.from_dict({\"col_1\": [\"a\", \"b\", \"c\", None, \"d\", None]})\n    dataset = dataset.class_encode_column(\"col_1\", include_nulls=include_nulls)\n    class_names = [\"a\", \"b\", \"c\", \"d\"]\n    if include_nulls:\n        class_names += [\"None\"]\n    assert isinstance(dataset.features[\"col_1\"], ClassLabel)\n    assert set(dataset.features[\"col_1\"].names) == set(class_names)\n    assert (None in dataset.unique(\"col_1\")) == (not include_nulls)\n\n\n@pytest.mark.parametrize(\"null_placement\", [\"first\", \"last\"])\ndef test_sort_with_none(null_placement):\n    dataset = 
Dataset.from_dict({\"col_1\": [\"item_2\", \"item_3\", \"item_1\", None, \"item_4\", None]})\n    dataset = dataset.sort(\"col_1\", null_placement=null_placement)\n    if null_placement == \"first\":\n        assert dataset[\"col_1\"] == [None, None, \"item_1\", \"item_2\", \"item_3\", \"item_4\"]\n    else:\n        assert dataset[\"col_1\"] == [\"item_1\", \"item_2\", \"item_3\", \"item_4\", None, None]\n\n\ndef test_update_metadata_with_features(dataset_dict):\n    table1 = pa.Table.from_pydict(dataset_dict)\n    features1 = Features.from_arrow_schema(table1.schema)\n    features2 = features1.copy()\n    features2[\"col_2\"] = ClassLabel(num_classes=len(table1))\n    assert features1 != features2\n\n    table2 = update_metadata_with_features(table1, features2)\n    metadata = json.loads(table2.schema.metadata[b\"huggingface\"].decode())\n    assert features2 == Features.from_dict(metadata[\"info\"][\"features\"])\n\n    with Dataset(table1) as dset1, Dataset(table2) as dset2:\n        assert dset1.features == features1\n        assert dset2.features == features2\n\n\n@pytest.mark.parametrize(\"dataset_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\n@pytest.mark.parametrize(\"axis, expected_shape\", [(0, (4, 3)), (1, (2, 6))])\ndef test_concatenate_datasets(dataset_type, axis, expected_shape, dataset_dict, arrow_path):\n    table = {\n        \"in_memory\": InMemoryTable.from_pydict(dataset_dict),\n        \"memory_mapped\": MemoryMappedTable.from_file(arrow_path),\n    }\n    tables = [\n        table[dataset_type if dataset_type != \"mixed\" else \"memory_mapped\"].slice(0, 2),  # shape = (2, 3)\n        table[dataset_type if dataset_type != \"mixed\" else \"in_memory\"].slice(2, 4),  # shape = (2, 3)\n    ]\n    if axis == 1:  # don't duplicate columns\n        tables[1] = tables[1].rename_columns([col + \"_bis\" for col in tables[1].column_names])\n    datasets = [Dataset(table) for table in tables]\n    dataset = concatenate_datasets(datasets, 
axis=axis)\n    assert dataset.shape == expected_shape\n    assert_arrow_metadata_are_synced_with_dataset_features(dataset)\n\n\ndef test_concatenate_datasets_new_columns():\n    dataset1 = Dataset.from_dict({\"col_1\": [\"a\", \"b\", \"c\"]})\n    dataset2 = Dataset.from_dict({\"col_1\": [\"d\", \"e\", \"f\"], \"col_2\": [True, False, True]})\n    dataset = concatenate_datasets([dataset1, dataset2])\n    assert dataset.data.shape == (6, 2)\n    assert dataset.features == Features({\"col_1\": Value(\"string\"), \"col_2\": Value(\"bool\")})\n    assert dataset[:] == {\"col_1\": [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\"], \"col_2\": [None, None, None, True, False, True]}\n    dataset3 = Dataset.from_dict({\"col_3\": [\"a_1\"]})\n    dataset = concatenate_datasets([dataset, dataset3])\n    assert dataset.data.shape == (7, 3)\n    assert dataset.features == Features({\"col_1\": Value(\"string\"), \"col_2\": Value(\"bool\"), \"col_3\": Value(\"string\")})\n    assert dataset[:] == {\n        \"col_1\": [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", None],\n        \"col_2\": [None, None, None, True, False, True, None],\n        \"col_3\": [None, None, None, None, None, None, \"a_1\"],\n    }\n\n\n@pytest.mark.parametrize(\"axis\", [0, 1])\ndef test_concatenate_datasets_complex_features(axis):\n    n = 5\n    dataset1 = Dataset.from_dict(\n        {\"col_1\": [0] * n, \"col_2\": list(range(n))},\n        features=Features({\"col_1\": Value(\"int32\"), \"col_2\": ClassLabel(num_classes=n)}),\n    )\n    if axis == 1:\n        dataset2 = dataset1.rename_columns({col: col + \"_\" for col in dataset1.column_names})\n        expected_features = Features({**dataset1.features, **dataset2.features})\n    else:\n        dataset2 = dataset1\n        expected_features = dataset1.features\n    assert concatenate_datasets([dataset1, dataset2], axis=axis).features == expected_features\n\n\n@pytest.mark.parametrize(\"other_dataset_type\", [\"in_memory\", \"memory_mapped\", 
\"concatenation\"])\n@pytest.mark.parametrize(\"axis, expected_shape\", [(0, (8, 3)), (1, (4, 6))])\ndef test_concatenate_datasets_with_concatenation_tables(\n    axis, expected_shape, other_dataset_type, dataset_dict, arrow_path\n):\n    def _create_concatenation_table(axis):\n        if axis == 0:  # shape: (4, 3) = (4, 1) + (4, 2)\n            concatenation_table = ConcatenationTable.from_blocks(\n                [\n                    [\n                        InMemoryTable.from_pydict({\"col_1\": dataset_dict[\"col_1\"]}),\n                        MemoryMappedTable.from_file(arrow_path).remove_column(0),\n                    ]\n                ]\n            )\n        elif axis == 1:  # shape: (4, 3) = (1, 3) + (3, 3)\n            concatenation_table = ConcatenationTable.from_blocks(\n                [\n                    [InMemoryTable.from_pydict(dataset_dict).slice(0, 1)],\n                    [MemoryMappedTable.from_file(arrow_path).slice(1, 4)],\n                ]\n            )\n        return concatenation_table\n\n    concatenation_table = _create_concatenation_table(axis)\n    assert concatenation_table.shape == (4, 3)\n\n    if other_dataset_type == \"in_memory\":\n        other_table = InMemoryTable.from_pydict(dataset_dict)\n    elif other_dataset_type == \"memory_mapped\":\n        other_table = MemoryMappedTable.from_file(arrow_path)\n    elif other_dataset_type == \"concatenation\":\n        other_table = _create_concatenation_table(axis)\n    assert other_table.shape == (4, 3)\n\n    tables = [concatenation_table, other_table]\n\n    if axis == 1:  # don't duplicate columns\n        tables[1] = tables[1].rename_columns([col + \"_bis\" for col in tables[1].column_names])\n\n    for tables in [tables, reversed(tables)]:\n        datasets = [Dataset(table) for table in tables]\n        dataset = concatenate_datasets(datasets, axis=axis)\n        assert dataset.shape == expected_shape\n\n\ndef 
test_concatenate_datasets_duplicate_columns(dataset):\n    with pytest.raises(ValueError) as excinfo:\n        concatenate_datasets([dataset, dataset], axis=1)\n    assert \"duplicated\" in str(excinfo.value)\n\n\ndef test_interleave_datasets():\n    d1 = Dataset.from_dict({\"a\": [0, 1, 2]})\n    d2 = Dataset.from_dict({\"a\": [10, 11, 12, 13]})\n    d3 = Dataset.from_dict({\"a\": [22, 21, 20]}).select([2, 1, 0])\n    dataset = interleave_datasets([d1, d2, d3])\n    expected_length = 3 * min(len(d1), len(d2), len(d3))\n    expected_values = [x[\"a\"] for x in itertools.chain(*zip(d1, d2, d3))]\n    assert isinstance(dataset, Dataset)\n    assert len(dataset) == expected_length\n    assert dataset[\"a\"] == expected_values\n    assert dataset._fingerprint == interleave_datasets([d1, d2, d3])._fingerprint\n\n\ndef test_interleave_datasets_probabilities():\n    seed = 42\n    probabilities = [0.3, 0.5, 0.2]\n    d1 = Dataset.from_dict({\"a\": [0, 1, 2]})\n    d2 = Dataset.from_dict({\"a\": [10, 11, 12, 13]})\n    d3 = Dataset.from_dict({\"a\": [22, 21, 20]}).select([2, 1, 0])\n    dataset = interleave_datasets([d1, d2, d3], probabilities=probabilities, seed=seed)\n    expected_length = 7  # hardcoded\n    expected_values = [10, 11, 20, 12, 0, 21, 13]  # hardcoded\n    assert isinstance(dataset, Dataset)\n    assert len(dataset) == expected_length\n    assert dataset[\"a\"] == expected_values\n    assert (\n        dataset._fingerprint == interleave_datasets([d1, d2, d3], probabilities=probabilities, seed=seed)._fingerprint\n    )\n\n\ndef test_interleave_datasets_oversampling_strategy():\n    d1 = Dataset.from_dict({\"a\": [0, 1, 2]})\n    d2 = Dataset.from_dict({\"a\": [10, 11, 12, 13]})\n    d3 = Dataset.from_dict({\"a\": [22, 21, 20]}).select([2, 1, 0])\n    dataset = interleave_datasets([d1, d2, d3], stopping_strategy=\"all_exhausted\")\n    expected_length = 3 * max(len(d1), len(d2), len(d3))\n    expected_values = [0, 10, 20, 1, 11, 21, 2, 12, 22, 0, 13, 20]  # 
hardcoded\n    assert isinstance(dataset, Dataset)\n    assert len(dataset) == expected_length\n    assert dataset[\"a\"] == expected_values\n    assert dataset._fingerprint == interleave_datasets([d1, d2, d3], stopping_strategy=\"all_exhausted\")._fingerprint\n\n\ndef test_interleave_datasets_probabilities_oversampling_strategy():\n    seed = 42\n    probabilities = [0.3, 0.5, 0.2]\n    d1 = Dataset.from_dict({\"a\": [0, 1, 2]})\n    d2 = Dataset.from_dict({\"a\": [10, 11, 12, 13]})\n    d3 = Dataset.from_dict({\"a\": [22, 21, 20]}).select([2, 1, 0])\n    dataset = interleave_datasets(\n        [d1, d2, d3], stopping_strategy=\"all_exhausted\", probabilities=probabilities, seed=seed\n    )\n    expected_length = 16  # hardcoded\n    expected_values = [10, 11, 20, 12, 0, 21, 13, 10, 1, 11, 12, 22, 13, 20, 10, 2]  # hardcoded\n    assert isinstance(dataset, Dataset)\n    assert len(dataset) == expected_length\n    assert dataset[\"a\"] == expected_values\n    assert (\n        dataset._fingerprint\n        == interleave_datasets(\n            [d1, d2, d3], stopping_strategy=\"all_exhausted\", probabilities=probabilities, seed=seed\n        )._fingerprint\n    )\n\n\n@pytest.mark.parametrize(\"batch_size\", [4, 5])\n@pytest.mark.parametrize(\"drop_last_batch\", [False, True])\ndef test_dataset_iter_batch(batch_size, drop_last_batch):\n    n = 25\n    dset = Dataset.from_dict({\"i\": list(range(n))})\n    all_col_values = list(range(n))\n    batches = []\n    for i, batch in enumerate(dset.iter(batch_size, drop_last_batch=drop_last_batch)):\n        assert batch == {\"i\": all_col_values[i * batch_size : (i + 1) * batch_size]}\n        batches.append(batch)\n    if drop_last_batch:\n        assert all(len(batch[\"i\"]) == batch_size for batch in batches)\n    else:\n        assert all(len(batch[\"i\"]) == batch_size for batch in batches[:-1])\n        assert len(batches[-1][\"i\"]) <= batch_size\n\n\n@pytest.mark.parametrize(\n    \"column, expected_dtype\",\n    
[([\"a\", \"b\", \"c\", \"d\"], \"string\"), ([1, 2, 3, 4], \"int64\"), ([1.0, 2.0, 3.0, 4.0], \"float64\")],\n)\n@pytest.mark.parametrize(\"in_memory\", [False, True])\n@pytest.mark.parametrize(\n    \"transform\",\n    [\n        None,\n        (\"shuffle\", (42,), {}),\n        (\"with_format\", (\"pandas\",), {}),\n        (\"class_encode_column\", (\"col_2\",), {}),\n        (\"select\", (range(3),), {}),\n    ],\n)\ndef test_dataset_add_column(column, expected_dtype, in_memory, transform, dataset_dict, arrow_path):\n    column_name = \"col_4\"\n    original_dataset = (\n        Dataset(InMemoryTable.from_pydict(dataset_dict))\n        if in_memory\n        else Dataset(MemoryMappedTable.from_file(arrow_path))\n    )\n    if transform is not None:\n        transform_name, args, kwargs = transform\n        original_dataset: Dataset = getattr(original_dataset, transform_name)(*args, **kwargs)\n    column = column[:3] if transform is not None and transform_name == \"select\" else column\n    dataset = original_dataset.add_column(column_name, column)\n    assert dataset.data.shape == (3, 4) if transform is not None and transform_name == \"select\" else (4, 4)\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    # Sort expected features as in the original dataset\n    expected_features = {feature: expected_features[feature] for feature in original_dataset.features}\n    # Add new column feature\n    expected_features[column_name] = expected_dtype\n    assert dataset.data.column_names == list(expected_features.keys())\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n    assert len(dataset.data.blocks) == 1 if in_memory else 2  # multiple InMemoryTables are consolidated as one\n    assert dataset.format[\"type\"] == original_dataset.format[\"type\"]\n    assert dataset._fingerprint != original_dataset._fingerprint\n    
dataset.reset_format()\n    original_dataset.reset_format()\n    assert all(dataset[col] == original_dataset[col] for col in original_dataset.column_names)\n    assert set(dataset[\"col_4\"]) == set(column)\n    if dataset._indices is not None:\n        dataset_indices = dataset._indices[\"indices\"].to_pylist()\n        expected_dataset_indices = original_dataset._indices[\"indices\"].to_pylist()\n        assert dataset_indices == expected_dataset_indices\n    assert_arrow_metadata_are_synced_with_dataset_features(dataset)\n\n\n@pytest.mark.parametrize(\n    \"transform\",\n    [None, (\"shuffle\", (42,), {}), (\"with_format\", (\"pandas\",), {}), (\"class_encode_column\", (\"col_2\",), {})],\n)\n@pytest.mark.parametrize(\"in_memory\", [False, True])\n@pytest.mark.parametrize(\n    \"item\",\n    [\n        {\"col_1\": \"2\", \"col_2\": 2, \"col_3\": 2.0},\n        {\"col_1\": \"2\", \"col_2\": \"2\", \"col_3\": \"2\"},\n        {\"col_1\": 2, \"col_2\": 2, \"col_3\": 2},\n        {\"col_1\": 2.0, \"col_2\": 2.0, \"col_3\": 2.0},\n    ],\n)\ndef test_dataset_add_item(item, in_memory, dataset_dict, arrow_path, transform):\n    dataset_to_test = (\n        Dataset(InMemoryTable.from_pydict(dataset_dict))\n        if in_memory\n        else Dataset(MemoryMappedTable.from_file(arrow_path))\n    )\n    if transform is not None:\n        transform_name, args, kwargs = transform\n        dataset_to_test: Dataset = getattr(dataset_to_test, transform_name)(*args, **kwargs)\n    dataset = dataset_to_test.add_item(item)\n    assert dataset.data.shape == (5, 3)\n    expected_features = dataset_to_test.features\n    assert sorted(dataset.data.column_names) == sorted(expected_features.keys())\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature] == expected_dtype\n    assert len(dataset.data.blocks) == 1 if in_memory else 2  # multiple InMemoryTables are consolidated as one\n    assert dataset.format[\"type\"] == 
dataset_to_test.format[\"type\"]\n    assert dataset._fingerprint != dataset_to_test._fingerprint\n    dataset.reset_format()\n    dataset_to_test.reset_format()\n    assert dataset[:-1] == dataset_to_test[:]\n    assert {k: int(v) for k, v in dataset[-1].items()} == {k: int(v) for k, v in item.items()}\n    if dataset._indices is not None:\n        dataset_indices = dataset._indices[\"indices\"].to_pylist()\n        dataset_to_test_indices = dataset_to_test._indices[\"indices\"].to_pylist()\n        assert dataset_indices == dataset_to_test_indices + [len(dataset_to_test._data)]\n\n\ndef test_dataset_add_item_new_columns():\n    dataset = Dataset.from_dict({\"col_1\": [0, 1, 2]}, features=Features({\"col_1\": Value(\"uint8\")}))\n    dataset = dataset.add_item({\"col_1\": 3, \"col_2\": \"a\"})\n    assert dataset.data.shape == (4, 2)\n    assert dataset.features == Features({\"col_1\": Value(\"uint8\"), \"col_2\": Value(\"string\")})\n    assert dataset[:] == {\"col_1\": [0, 1, 2, 3], \"col_2\": [None, None, None, \"a\"]}\n    dataset = dataset.add_item({\"col_3\": True})\n    assert dataset.data.shape == (5, 3)\n    assert dataset.features == Features({\"col_1\": Value(\"uint8\"), \"col_2\": Value(\"string\"), \"col_3\": Value(\"bool\")})\n    assert dataset[:] == {\n        \"col_1\": [0, 1, 2, 3, None],\n        \"col_2\": [None, None, None, \"a\", None],\n        \"col_3\": [None, None, None, None, True],\n    }\n\n\ndef test_dataset_add_item_introduce_feature_type():\n    dataset = Dataset.from_dict({\"col_1\": [None, None, None]})\n    dataset = dataset.add_item({\"col_1\": \"a\"})\n    assert dataset.data.shape == (4, 1)\n    assert dataset.features == Features({\"col_1\": Value(\"string\")})\n    assert dataset[:] == {\"col_1\": [None, None, None, \"a\"]}\n\n\ndef test_dataset_filter_batched_indices():\n    ds = Dataset.from_dict({\"num\": [0, 1, 2, 3]})\n    ds = ds.filter(lambda num: num % 2 == 0, input_columns=\"num\", batch_size=2)\n    assert 
all(item[\"num\"] % 2 == 0 for item in ds)\n\n\n@pytest.mark.parametrize(\"in_memory\", [False, True])\ndef test_dataset_from_file(in_memory, dataset, arrow_file):\n    filename = arrow_file\n    with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase():\n        dataset_from_file = Dataset.from_file(filename, in_memory=in_memory)\n    assert dataset_from_file.features.type == dataset.features.type\n    assert dataset_from_file.features == dataset.features\n    assert dataset_from_file.cache_files == ([{\"filename\": filename}] if not in_memory else [])\n\n\ndef _check_csv_dataset(dataset, expected_features):\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 4\n    assert dataset.num_columns == 3\n    assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_dataset_from_csv_keep_in_memory(keep_in_memory, csv_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = Dataset.from_csv(csv_path, cache_dir=cache_dir, keep_in_memory=keep_in_memory)\n    _check_csv_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_dataset_from_csv_features(features, csv_path, tmp_path):\n    cache_dir = tmp_path / 
\"cache\"\n    # CSV file loses col_1 string dtype information: default now is \"int64\" instead of \"string\"\n    default_expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = Dataset.from_csv(csv_path, features=features, cache_dir=cache_dir)\n    _check_csv_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_dataset_from_csv_split(split, csv_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = Dataset.from_csv(csv_path, cache_dir=cache_dir, split=split)\n    _check_csv_dataset(dataset, expected_features)\n    assert dataset.split == split if split else \"train\"\n\n\n@pytest.mark.parametrize(\"path_type\", [str, list])\ndef test_dataset_from_csv_path_type(path_type, csv_path, tmp_path):\n    if issubclass(path_type, str):\n        path = csv_path\n    elif issubclass(path_type, list):\n        path = [csv_path]\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = Dataset.from_csv(path, cache_dir=cache_dir)\n    _check_csv_dataset(dataset, expected_features)\n\n\ndef _check_json_dataset(dataset, expected_features):\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 4\n    assert dataset.num_columns == 3\n    assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef 
test_dataset_from_json_keep_in_memory(keep_in_memory, jsonl_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = Dataset.from_json(jsonl_path, cache_dir=cache_dir, keep_in_memory=keep_in_memory)\n    _check_json_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_dataset_from_json_features(features, jsonl_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = Dataset.from_json(jsonl_path, features=features, cache_dir=cache_dir)\n    _check_json_dataset(dataset, expected_features)\n\n\ndef test_dataset_from_json_with_class_label_feature(jsonl_str_path, tmp_path):\n    features = Features(\n        {\n            \"col_1\": ClassLabel(names=[\"s0\", \"s1\", \"s2\", \"s3\"]),\n            \"col_2\": Value(\"int64\"),\n            \"col_3\": Value(\"float64\"),\n        }\n    )\n    cache_dir = tmp_path / \"cache\"\n    dataset = Dataset.from_json(jsonl_str_path, features=features, cache_dir=cache_dir)\n    assert dataset.features[\"col_1\"].dtype == \"int64\"\n\n\n@pytest.mark.parametrize(\"split\", [None, 
NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_dataset_from_json_split(split, jsonl_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = Dataset.from_json(jsonl_path, cache_dir=cache_dir, split=split)\n    _check_json_dataset(dataset, expected_features)\n    assert dataset.split == split if split else \"train\"\n\n\n@pytest.mark.parametrize(\"path_type\", [str, list])\ndef test_dataset_from_json_path_type(path_type, jsonl_path, tmp_path):\n    if issubclass(path_type, str):\n        path = jsonl_path\n    elif issubclass(path_type, list):\n        path = [jsonl_path]\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = Dataset.from_json(path, cache_dir=cache_dir)\n    _check_json_dataset(dataset, expected_features)\n\n\ndef _check_parquet_dataset(dataset, expected_features):\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 4\n    assert dataset.num_columns == 3\n    assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_dataset_from_parquet_keep_in_memory(keep_in_memory, parquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = Dataset.from_parquet(parquet_path, cache_dir=cache_dir, keep_in_memory=keep_in_memory)\n    _check_parquet_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", 
\"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_dataset_from_parquet_features(features, parquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = Dataset.from_parquet(parquet_path, features=features, cache_dir=cache_dir)\n    _check_parquet_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_dataset_from_parquet_split(split, parquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = Dataset.from_parquet(parquet_path, cache_dir=cache_dir, split=split)\n    _check_parquet_dataset(dataset, expected_features)\n    assert dataset.split == split if split else \"train\"\n\n\n@pytest.mark.parametrize(\"path_type\", [str, list])\ndef test_dataset_from_parquet_path_type(path_type, parquet_path, tmp_path):\n    if issubclass(path_type, str):\n        path = parquet_path\n    elif issubclass(path_type, list):\n        path = [parquet_path]\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = Dataset.from_parquet(path, cache_dir=cache_dir)\n    _check_parquet_dataset(dataset, expected_features)\n\n\ndef _check_text_dataset(dataset, expected_features):\n    assert isinstance(dataset, Dataset)\n    assert 
dataset.num_rows == 4\n    assert dataset.num_columns == 1\n    assert dataset.column_names == [\"text\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_dataset_from_text_keep_in_memory(keep_in_memory, text_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"text\": \"string\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = Dataset.from_text(text_path, cache_dir=cache_dir, keep_in_memory=keep_in_memory)\n    _check_text_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"text\": \"string\"},\n        {\"text\": \"int32\"},\n        {\"text\": \"float32\"},\n    ],\n)\ndef test_dataset_from_text_features(features, text_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"text\": \"string\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = Dataset.from_text(text_path, features=features, cache_dir=cache_dir)\n    _check_text_dataset(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_dataset_from_text_split(split, text_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"text\": \"string\"}\n    dataset = Dataset.from_text(text_path, cache_dir=cache_dir, split=split)\n    _check_text_dataset(dataset, expected_features)\n    assert dataset.split == split if split else \"train\"\n\n\n@pytest.mark.parametrize(\"path_type\", [str, list])\ndef test_dataset_from_text_path_type(path_type, text_path, tmp_path):\n    if 
issubclass(path_type, str):\n        path = text_path\n    elif issubclass(path_type, list):\n        path = [text_path]\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"text\": \"string\"}\n    dataset = Dataset.from_text(path, cache_dir=cache_dir)\n    _check_text_dataset(dataset, expected_features)\n\n\n@pytest.fixture\ndef data_generator():\n    def _gen():\n        data = [\n            {\"col_1\": \"0\", \"col_2\": 0, \"col_3\": 0.0},\n            {\"col_1\": \"1\", \"col_2\": 1, \"col_3\": 1.0},\n            {\"col_1\": \"2\", \"col_2\": 2, \"col_3\": 2.0},\n            {\"col_1\": \"3\", \"col_2\": 3, \"col_3\": 3.0},\n        ]\n        for item in data:\n            yield item\n\n    return _gen\n\n\ndef _check_generator_dataset(dataset, expected_features, split):\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 4\n    assert dataset.num_columns == 3\n    assert dataset.split == split\n    assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_dataset_from_generator_keep_in_memory(keep_in_memory, data_generator, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = Dataset.from_generator(data_generator, cache_dir=cache_dir, keep_in_memory=keep_in_memory)\n    _check_generator_dataset(dataset, expected_features, NamedSplit(\"train\"))\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", 
\"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_dataset_from_generator_features(features, data_generator, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = Dataset.from_generator(data_generator, features=features, cache_dir=cache_dir)\n    _check_generator_dataset(dataset, expected_features, NamedSplit(\"train\"))\n\n\n@pytest.mark.parametrize(\n    \"split\",\n    [None, NamedSplit(\"train\"), \"train\", NamedSplit(\"foo\"), \"foo\"],\n)\ndef test_dataset_from_generator_split(split, data_generator, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_split = \"train\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_split = split if split else default_expected_split\n    if split:\n        dataset = Dataset.from_generator(data_generator, cache_dir=cache_dir, split=split)\n    else:\n        dataset = Dataset.from_generator(data_generator, cache_dir=cache_dir)\n    _check_generator_dataset(dataset, expected_features, expected_split)\n\n\n@pytest.mark.parametrize(\"fingerprint\", [None, \"test-dataset\"])\ndef test_dataset_from_generator_fingerprint(fingerprint, data_generator, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = Dataset.from_generator(data_generator, cache_dir=cache_dir, fingerprint=fingerprint)\n    _check_generator_dataset(dataset, expected_features, NamedSplit(\"train\"))\n    if fingerprint:\n        assert dataset._fingerprint 
== fingerprint\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\n@require_pyspark\ndef test_from_spark():\n    import pyspark\n\n    spark = pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    data = [\n        (\"0\", 0, 0.0),\n        (\"1\", 1, 1.0),\n        (\"2\", 2, 2.0),\n        (\"3\", 3, 3.0),\n    ]\n    df = spark.createDataFrame(data, \"col_1: string, col_2: int, col_3: float\")\n    dataset = Dataset.from_spark(df)\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 4\n    assert dataset.num_columns == 3\n    assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\n@require_pyspark\ndef test_from_spark_features():\n    import PIL.Image\n    import pyspark\n\n    spark = pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    data = [(0, np.arange(4 * 4 * 3).reshape(4, 4, 3).tolist())]\n    df = spark.createDataFrame(data, \"idx: int, image: array<array<array<int>>>\")\n    features = Features({\"idx\": Value(\"int64\"), \"image\": Image()})\n    dataset = Dataset.from_spark(\n        df,\n        features=features,\n    )\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 1\n    assert dataset.num_columns == 2\n    assert dataset.column_names == [\"idx\", \"image\"]\n    assert isinstance(dataset[0][\"image\"], PIL.Image.Image)\n    assert dataset.features == features\n    assert_arrow_metadata_are_synced_with_dataset_features(dataset)\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\n@require_pyspark\ndef test_from_spark_different_cache():\n    import pyspark\n\n    spark = pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    df = spark.createDataFrame([(\"0\", 0)], \"col_1: string, col_2: int\")\n    dataset = Dataset.from_spark(df)\n    assert isinstance(dataset, Dataset)\n    different_df = spark.createDataFrame([(\"1\", 1)], 
\"col_1: string, col_2: int\")\n    different_dataset = Dataset.from_spark(different_df)\n    assert isinstance(different_dataset, Dataset)\n    assert dataset[0][\"col_1\"] == \"0\"\n    # Check to make sure that the second dataset wasn't read from the cache.\n    assert different_dataset[0][\"col_1\"] == \"1\"\n\n\ndef _check_sql_dataset(dataset, expected_features):\n    assert isinstance(dataset, Dataset)\n    assert dataset.num_rows == 4\n    assert dataset.num_columns == 3\n    assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n    for feature, expected_dtype in expected_features.items():\n        assert dataset.features[feature].dtype == expected_dtype\n\n\n@require_sqlalchemy\n@pytest.mark.parametrize(\"con_type\", [\"string\", \"engine\"])\ndef test_dataset_from_sql_con_type(con_type, sqlite_path, tmp_path, set_sqlalchemy_silence_uber_warning, caplog):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": STRING_FROM_PANDAS, \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    if con_type == \"string\":\n        con = \"sqlite:///\" + sqlite_path\n    elif con_type == \"engine\":\n        import sqlalchemy\n\n        con = sqlalchemy.create_engine(\"sqlite:///\" + sqlite_path)\n    with caplog.at_level(INFO, logger=get_logger().name):\n        dataset = Dataset.from_sql(\n            \"dataset\",\n            con,\n            cache_dir=cache_dir,\n        )\n    if con_type == \"string\":\n        assert \"couldn't be hashed properly\" not in caplog.text\n    elif con_type == \"engine\":\n        assert \"couldn't be hashed properly\" in caplog.text\n    dataset = Dataset.from_sql(\n        \"dataset\",\n        con,\n        cache_dir=cache_dir,\n    )\n    _check_sql_dataset(dataset, expected_features)\n\n\n@require_sqlalchemy\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": 
\"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_dataset_from_sql_features(features, sqlite_path, tmp_path, set_sqlalchemy_silence_uber_warning):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"col_1\": STRING_FROM_PANDAS, \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = Dataset.from_sql(\"dataset\", \"sqlite:///\" + sqlite_path, features=features, cache_dir=cache_dir)\n    _check_sql_dataset(dataset, expected_features)\n\n\n@require_sqlalchemy\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_dataset_from_sql_keep_in_memory(keep_in_memory, sqlite_path, tmp_path, set_sqlalchemy_silence_uber_warning):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": STRING_FROM_PANDAS, \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = Dataset.from_sql(\n            \"dataset\", \"sqlite:///\" + sqlite_path, cache_dir=cache_dir, keep_in_memory=keep_in_memory\n        )\n    _check_sql_dataset(dataset, expected_features)\n\n\ndef test_dataset_to_json(dataset, tmp_path):\n    file_path = tmp_path / \"test_path.jsonl\"\n    bytes_written = dataset.to_json(path_or_buf=file_path)\n    assert file_path.is_file()\n    assert bytes_written == file_path.stat().st_size\n    df = pd.read_json(file_path, orient=\"records\", lines=True)\n    assert df.shape == dataset.shape\n    assert list(df.columns) == list(dataset.column_names)\n\n\n@pytest.mark.parametrize(\"in_memory\", [False, 
True])\n@pytest.mark.parametrize(\n    \"method_and_params\",\n    [\n        (\"rename_column\", (), {\"original_column_name\": \"labels\", \"new_column_name\": \"label\"}),\n        (\"remove_columns\", (), {\"column_names\": \"labels\"}),\n        (\n            \"cast\",\n            (),\n            {\n                \"features\": Features(\n                    {\n                        \"tokens\": List(Value(\"string\")),\n                        \"labels\": List(Value(\"int16\")),\n                        \"answers\": {\n                            \"text\": List(Value(\"string\")),\n                            \"answer_start\": List(Value(\"int32\")),\n                        },\n                        \"id\": Value(\"int32\"),\n                    }\n                )\n            },\n        ),\n        (\"flatten\", (), {}),\n    ],\n)\ndef test_pickle_dataset_after_transforming_the_table(in_memory, method_and_params, arrow_file):\n    method, args, kwargs = method_and_params\n    with (\n        Dataset.from_file(arrow_file, in_memory=in_memory) as dataset,\n        Dataset.from_file(arrow_file, in_memory=in_memory) as reference_dataset,\n    ):\n        out = getattr(dataset, method)(*args, **kwargs)\n        dataset = out if out is not None else dataset\n        pickled_dataset = pickle.dumps(dataset)\n        reloaded_dataset = pickle.loads(pickled_dataset)\n\n        assert dataset._data != reference_dataset._data\n        assert dataset._data.table == reloaded_dataset._data.table\n\n\ndef test_dummy_dataset_serialize_fs(dataset, mockfs):\n    dataset_path = \"mock://my_dataset\"\n    dataset.save_to_disk(dataset_path, storage_options=mockfs.storage_options)\n    assert mockfs.isdir(dataset_path)\n    assert mockfs.glob(dataset_path + \"/*\")\n    reloaded = Dataset.load_from_disk(dataset_path, storage_options=mockfs.storage_options)\n    assert len(reloaded) == len(dataset)\n    assert reloaded.features == dataset.features\n    assert 
reloaded.to_dict() == dataset.to_dict()\n\n\n@pytest.mark.parametrize(\n    \"uri_or_path\",\n    [\n        \"relative/path\",\n        \"/absolute/path\",\n        \"hf://bucket/relative/path\",\n        \"hdfs://relative/path\",\n        \"hdfs:///absolute/path\",\n    ],\n)\ndef test_build_local_temp_path(uri_or_path):\n    extracted_path = strip_protocol(uri_or_path)\n    local_temp_path = Dataset._build_local_temp_path(extracted_path).as_posix()\n    extracted_path_without_anchor = Path(extracted_path).relative_to(Path(extracted_path).anchor).as_posix()\n    # Check that the local temp path is relative to the system temp dir\n    path_relative_to_tmp_dir = Path(local_temp_path).relative_to(Path(tempfile.gettempdir())).as_posix()\n\n    assert (\n        \"hdfs://\" not in path_relative_to_tmp_dir\n        and \"hf://\" not in path_relative_to_tmp_dir\n        and not local_temp_path.startswith(extracted_path_without_anchor)\n        and local_temp_path.endswith(extracted_path_without_anchor)\n    ), f\"Local temp path: {local_temp_path}\"\n\n\nclass StratifiedTest(TestCase):\n    def test_errors_train_test_split_stratify(self):\n        ys = [\n            np.array([0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2]),\n            np.array([0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),\n            np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),\n            np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5]),\n            np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5]),\n        ]\n        for i in range(len(ys)):\n            features = Features({\"text\": Value(\"int64\"), \"label\": ClassLabel(len(np.unique(ys[i])))})\n            data = {\"text\": np.ones(len(ys[i])), \"label\": ys[i]}\n            d1 = Dataset.from_dict(data, features=features)\n\n            # For checking stratify_by_column exist as key in self.features.keys()\n            if i == 0:\n                self.assertRaises(ValueError, d1.train_test_split, 0.33, stratify_by_column=\"labl\")\n\n            # For 
checking minimum class count error\n            elif i == 1:\n                self.assertRaises(ValueError, d1.train_test_split, 0.33, stratify_by_column=\"label\")\n\n            # For check typeof label as ClassLabel type\n            elif i == 2:\n                d1 = Dataset.from_dict(data)\n                self.assertRaises(ValueError, d1.train_test_split, 0.33, stratify_by_column=\"label\")\n\n            # For checking test_size should be greater than or equal to number of classes\n            elif i == 3:\n                self.assertRaises(ValueError, d1.train_test_split, 0.30, stratify_by_column=\"label\")\n\n            # For checking train_size should be greater than or equal to number of classes\n            elif i == 4:\n                self.assertRaises(ValueError, d1.train_test_split, 0.60, stratify_by_column=\"label\")\n\n    def test_train_test_split_startify(self):\n        ys = [\n            np.array([0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2]),\n            np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),\n            np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),\n            np.array([0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3]),\n            np.array([0] * 800 + [1] * 50),\n        ]\n        for y in ys:\n            features = Features({\"text\": Value(\"int64\"), \"label\": ClassLabel(len(np.unique(y)))})\n            data = {\"text\": np.ones(len(y)), \"label\": y}\n            d1 = Dataset.from_dict(data, features=features)\n            d1 = d1.train_test_split(test_size=0.33, stratify_by_column=\"label\")\n            y = np.asanyarray(y)  # To make it indexable for y[train]\n            test_size = np.ceil(0.33 * len(y))\n            train_size = len(y) - test_size\n            npt.assert_array_equal(np.unique(d1[\"train\"][\"label\"]), np.unique(d1[\"test\"][\"label\"]))\n\n            # checking classes proportion\n            p_train = np.bincount(np.unique(d1[\"train\"][\"label\"], return_inverse=True)[1]) / float(\n     
           len(d1[\"train\"][\"label\"])\n            )\n            p_test = np.bincount(np.unique(d1[\"test\"][\"label\"], return_inverse=True)[1]) / float(\n                len(d1[\"test\"][\"label\"])\n            )\n            npt.assert_array_almost_equal(p_train, p_test, 1)\n            assert len(d1[\"train\"][\"text\"]) + len(d1[\"test\"][\"text\"]) == y.size\n            assert len(d1[\"train\"][\"text\"]) == train_size\n            assert len(d1[\"test\"][\"text\"]) == test_size\n\n\ndef test_dataset_estimate_nbytes():\n    ds = Dataset.from_dict({\"a\": [\"0\" * 100] * 100})\n    assert 0.9 * ds._estimate_nbytes() < 100 * 100, \"must be smaller than full dataset size\"\n\n    ds = Dataset.from_dict({\"a\": [\"0\" * 100] * 100}).select([0])\n    assert 0.9 * ds._estimate_nbytes() < 100 * 100, \"must be smaller than one chunk\"\n\n    ds = Dataset.from_dict({\"a\": [\"0\" * 100] * 100})\n    ds = concatenate_datasets([ds] * 100)\n    assert 0.9 * ds._estimate_nbytes() < 100 * 100 * 100, \"must be smaller than full dataset size\"\n    assert 1.1 * ds._estimate_nbytes() > 100 * 100 * 100, \"must be bigger than full dataset size\"\n\n    ds = Dataset.from_dict({\"a\": [\"0\" * 100] * 100})\n    ds = concatenate_datasets([ds] * 100).select([0])\n    assert 0.9 * ds._estimate_nbytes() < 100 * 100, \"must be smaller than one chunk\"\n\n\ndef test_dataset_to_iterable_dataset(dataset: Dataset):\n    iterable_dataset = dataset.to_iterable_dataset()\n    assert isinstance(iterable_dataset, IterableDataset)\n    assert list(iterable_dataset) == list(dataset)\n    assert iterable_dataset.features == dataset.features\n    iterable_dataset = dataset.to_iterable_dataset(num_shards=3)\n    assert isinstance(iterable_dataset, IterableDataset)\n    assert list(iterable_dataset) == list(dataset)\n    assert iterable_dataset.features == dataset.features\n    assert iterable_dataset.num_shards == 3\n    with pytest.raises(ValueError):\n        
dataset.to_iterable_dataset(num_shards=len(dataset) + 1)\n    assert dataset.with_format(\"torch\").to_iterable_dataset()._formatting.format_type == \"torch\"\n    with pytest.raises(NotImplementedError):\n        dataset.with_format(\"torch\", columns=[dataset.column_names[0]]).to_iterable_dataset()\n\n\n@require_pil\ndef test_dataset_format_with_unformatted_image():\n    import PIL\n\n    ds = Dataset.from_dict(\n        {\"a\": [np.arange(4 * 4 * 3).reshape(4, 4, 3)] * 10, \"b\": [[0, 1]] * 10},\n        Features({\"a\": Image(), \"b\": List(Value(\"int64\"))}),\n    )\n    ds.set_format(\"np\", columns=[\"b\"], output_all_columns=True)\n    assert isinstance(ds[0][\"a\"], PIL.Image.Image)\n    assert isinstance(ds[0][\"b\"], np.ndarray)\n\n\n@pytest.mark.parametrize(\"batch_size\", [1, 4])\n@require_torch\ndef test_dataset_with_torch_dataloader(dataset, batch_size):\n    from torch.utils.data import DataLoader\n\n    from datasets import config\n\n    dataloader = DataLoader(dataset, batch_size=batch_size)\n    with patch.object(dataset, \"_getitem\", wraps=dataset._getitem) as mock_getitem:\n        out = list(dataloader)\n        getitem_call_count = mock_getitem.call_count\n    assert len(out) == len(dataset) // batch_size + int(len(dataset) % batch_size > 0)\n    # calling dataset[list_of_indices] is much more efficient than [dataset[idx] for idx in list of indices]\n    if config.TORCH_VERSION >= version.parse(\"1.13.0\"):\n        assert getitem_call_count == len(dataset) // batch_size + int(len(dataset) % batch_size > 0)\n\n\n@pytest.mark.parametrize(\"return_lazy_dict\", [True, False, \"mix\"])\ndef test_map_cases(return_lazy_dict):\n    def f(x):\n        \"\"\"May return a mix of LazyDict and regular Dict\"\"\"\n        if x[\"a\"] < 2:\n            x[\"a\"] = -1\n            return dict(x) if return_lazy_dict is False else x\n        else:\n            return x if return_lazy_dict is True else {}\n\n    ds = Dataset.from_dict({\"a\": [0, 1, 2, 3]})\n 
   ds = ds.map(f)\n    outputs = ds[:]\n    assert outputs == {\"a\": [-1, -1, 2, 3]}\n\n    def f(x):\n        \"\"\"May return a mix of LazyDict and regular Dict, but sometimes with None values\"\"\"\n        if x[\"a\"] < 2:\n            x[\"a\"] = None\n            return dict(x) if return_lazy_dict is False else x\n        else:\n            return x if return_lazy_dict is True else {}\n\n    ds = Dataset.from_dict({\"a\": [0, 1, 2, 3]})\n    ds = ds.map(f)\n    outputs = ds[:]\n    assert outputs == {\"a\": [None, None, 2, 3]}\n\n    def f(x):\n        \"\"\"Return a LazyDict, but we remove a lazy column and add a new one\"\"\"\n        if x[\"a\"] < 2:\n            x[\"b\"] = -1\n            return x\n        else:\n            x[\"b\"] = x[\"a\"]\n            return x\n\n    ds = Dataset.from_dict({\"a\": [0, 1, 2, 3]})\n    ds = ds.map(f, remove_columns=[\"a\"])\n    outputs = ds[:]\n    assert outputs == {\"b\": [-1, -1, 2, 3]}\n\n    # The formatted dataset version removes the lazy column from a different dictionary, hence it should be preserved in the output\n    ds = Dataset.from_dict({\"a\": [0, 1, 2, 3]})\n    ds = ds.with_format(\"numpy\")\n    ds = ds.map(f, remove_columns=[\"a\"])\n    ds = ds.with_format(None)\n    outputs = ds[:]\n    assert outputs == {\"a\": [0, 1, 2, 3], \"b\": [-1, -1, 2, 3]}\n\n    def f(x):\n        \"\"\"May return a mix of LazyDict and regular Dict, but we replace a lazy column\"\"\"\n        if x[\"a\"] < 2:\n            x[\"a\"] = -1\n            return dict(x) if return_lazy_dict is False else x\n        else:\n            x[\"a\"] = x[\"a\"]\n            return x if return_lazy_dict is True else {\"a\": x[\"a\"]}\n\n    ds = Dataset.from_dict({\"a\": [0, 1, 2, 3]})\n    ds = ds.map(f, remove_columns=[\"a\"])\n    outputs = ds[:]\n    assert outputs == ({\"a\": [-1, -1, 2, 3]} if return_lazy_dict is False else {})\n\n    def f(x):\n        \"\"\"May return a mix of LazyDict and regular Dict, but we modify a nested 
lazy column in-place\"\"\"\n        if x[\"a\"][\"b\"] < 2:\n            x[\"a\"][\"c\"] = -1\n            return dict(x) if return_lazy_dict is False else x\n        else:\n            x[\"a\"][\"c\"] = x[\"a\"][\"b\"]\n            return x if return_lazy_dict is True else {}\n\n    ds = Dataset.from_dict({\"a\": [{\"b\": 0}, {\"b\": 1}, {\"b\": 2}, {\"b\": 3}]})\n    ds = ds.map(f)\n    outputs = ds[:]\n    assert outputs == {\"a\": [{\"b\": 0, \"c\": -1}, {\"b\": 1, \"c\": -1}, {\"b\": 2, \"c\": 2}, {\"b\": 3, \"c\": 3}]}\n\n    def f(x):\n        \"\"\"May return a mix of LazyDict and regular Dict, but using an extension type\"\"\"\n        if x[\"a\"][0][0] < 2:\n            x[\"a\"] = [[-1]]\n            return dict(x) if return_lazy_dict is False else x\n        else:\n            return x if return_lazy_dict is True else {}\n\n    features = Features({\"a\": Array2D(shape=(1, 1), dtype=\"int32\")})\n    ds = Dataset.from_dict({\"a\": [[[i]] for i in [0, 1, 2, 3]]}, features=features)\n    ds = ds.map(f)\n    outputs = ds[:]\n    assert outputs == {\"a\": [[[i]] for i in [-1, -1, 2, 3]]}\n\n    def f(x):\n        \"\"\"May return a mix of LazyDict and regular Dict, but using a nested extension type\"\"\"\n        if x[\"a\"][\"nested\"][0][0] < 2:\n            x[\"a\"] = {\"nested\": [[-1]]}\n            return dict(x) if return_lazy_dict is False else x\n        else:\n            return x if return_lazy_dict is True else {}\n\n    features = Features({\"a\": {\"nested\": Array2D(shape=(1, 1), dtype=\"int64\")}})\n    ds = Dataset.from_dict({\"a\": [{\"nested\": [[i]]} for i in [0, 1, 2, 3]]}, features=features)\n    ds = ds.map(f)\n    outputs = ds[:]\n    assert outputs == {\"a\": [{\"nested\": [[i]]} for i in [-1, -1, 2, 3]]}\n\n\ndef test_map_async():\n    dset = Dataset.from_dict({\"x\": range(100)})\n\n    async def f(example):\n        await asyncio.sleep(0.1)\n        return {\"y\": 1}\n\n    _start = time.time()\n    out = dset.map(f)\n    assert 
time.time() - _start < 2.0\n    assert out[0][\"y\"] == 1\n\n    async def f(batch):\n        await asyncio.sleep(0.1)\n        return {\"y\": [1] * len(batch[\"x\"])}\n\n    _start = time.time()\n    out = dset.map(f, batched=True)\n    assert time.time() - _start < 2.0\n    assert out[0][\"y\"] == 1\n\n\ndef test_filter_async():\n    dset = Dataset.from_dict({\"x\": range(100)})\n\n    async def f(example):\n        await asyncio.sleep(0.1)\n        return example[\"x\"] == 42\n\n    _start = time.time()\n    out = dset.filter(f)\n    assert time.time() - _start < 2.0\n    assert len(out) == 1\n\n    async def f(batch):\n        await asyncio.sleep(0.1)\n        return [x == 42 for x in batch[\"x\"]]\n\n    _start = time.time()\n    out = dset.filter(f, batched=True)\n    assert time.time() - _start < 2.0\n    assert len(out) == 1\n\n\ndef test_dataset_getitem_int_np_equivalence():\n    ds = Dataset.from_dict({\"a\": [0, 1, 2, 3]})\n\n    assert ds[1] == ds[np.int64(1)]\n\n\ndef test_dataset_getitem_raises():\n    ds = Dataset.from_dict({\"a\": [0, 1, 2, 3]})\n    with pytest.raises(TypeError):\n        ds[False]\n    with pytest.raises(TypeError):\n        ds._getitem(True)\n    with pytest.raises(TypeError):\n        ds[np.bool_(True)]\n    with pytest.raises(TypeError):\n        ds[1.0]\n\n\ndef test_categorical_dataset(tmpdir):\n    n_legs = pa.array([2, 4, 5, 100])\n    animals = pa.array([\"Flamingo\", \"Horse\", \"Brittle stars\", \"Centipede\"]).cast(\n        pa.dictionary(pa.int32(), pa.string())\n    )\n    names = [\"n_legs\", \"animals\"]\n\n    table = pa.Table.from_arrays([n_legs, animals], names=names)\n    table_path = str(tmpdir / \"data.parquet\")\n    pa.parquet.write_table(table, table_path)\n\n    dataset = Dataset.from_parquet(table_path)\n    entry = dataset[0]\n\n    # Categorical types get transparently converted to string\n    assert entry[\"animals\"] == \"Flamingo\"\n\n\ndef test_dataset_batch():\n    # Create a simple Dataset\n    
data = {\"id\": list(range(10)), \"text\": [f\"Text {i}\" for i in range(10)]}\n    ds = Dataset.from_dict(data)\n\n    # Test with batch_size=3, drop_last_batch=False\n    batched_ds = ds.batch(batch_size=3, drop_last_batch=False)\n    batches = list(batched_ds)\n\n    assert len(batches) == 4  # 3 full batches and 1 partial batch\n    for i, batch in enumerate(batches[:3]):  # Check full batches\n        assert len(batch[\"id\"]) == 3\n        assert len(batch[\"text\"]) == 3\n        assert batch[\"id\"] == [3 * i, 3 * i + 1, 3 * i + 2]\n        assert batch[\"text\"] == [f\"Text {3 * i}\", f\"Text {3 * i + 1}\", f\"Text {3 * i + 2}\"]\n\n    # Check last partial batch\n    assert len(batches[3][\"id\"]) == 1\n    assert len(batches[3][\"text\"]) == 1\n    assert batches[3][\"id\"] == [9]\n    assert batches[3][\"text\"] == [\"Text 9\"]\n\n    # Test with batch_size=3, drop_last_batch=True\n    batched_ds = ds.batch(batch_size=3, drop_last_batch=True)\n    batches = list(batched_ds)\n\n    assert len(batches) == 3  # Only full batches\n    for i, batch in enumerate(batches):\n        assert len(batch[\"id\"]) == 3\n        assert len(batch[\"text\"]) == 3\n        assert batch[\"id\"] == [3 * i, 3 * i + 1, 3 * i + 2]\n        assert batch[\"text\"] == [f\"Text {3 * i}\", f\"Text {3 * i + 1}\", f\"Text {3 * i + 2}\"]\n\n    # Test with batch_size=4 (doesn't evenly divide dataset size)\n    batched_ds = ds.batch(batch_size=4, drop_last_batch=False)\n    batches = list(batched_ds)\n\n    assert len(batches) == 3  # 2 full batches and 1 partial batch\n    for i, batch in enumerate(batches[:2]):  # Check full batches\n        assert len(batch[\"id\"]) == 4\n        assert len(batch[\"text\"]) == 4\n        assert batch[\"id\"] == [4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3]\n        assert batch[\"text\"] == [f\"Text {4 * i}\", f\"Text {4 * i + 1}\", f\"Text {4 * i + 2}\", f\"Text {4 * i + 3}\"]\n\n    # Check last partial batch\n    assert len(batches[2][\"id\"]) == 2\n 
   assert len(batches[2][\"text\"]) == 2\n    assert batches[2][\"id\"] == [8, 9]\n    assert batches[2][\"text\"] == [\"Text 8\", \"Text 9\"]\n\n\ndef test_dataset_from_dict_with_large_list():\n    data = {\"col_1\": [[1, 2], [3, 4]]}\n    features = Features({\"col_1\": LargeList(Value(\"int64\"))})\n    ds = Dataset.from_dict(data, features=features)\n    assert isinstance(ds, Dataset)\n    assert pa.types.is_large_list(ds.data.schema.field(\"col_1\").type)\n\n\ndef test_dataset_save_to_disk_with_large_list(tmp_path):\n    data = {\"col_1\": [[1, 2], [3, 4]]}\n    features = Features({\"col_1\": LargeList(Value(\"int64\"))})\n    ds = Dataset.from_dict(data, features=features)\n    dataset_path = tmp_path / \"dataset_dir\"\n    ds.save_to_disk(dataset_path)\n    assert (dataset_path / \"data-00000-of-00001.arrow\").exists()\n\n\ndef test_dataset_save_to_disk_and_load_from_disk_round_trip_with_large_list(tmp_path):\n    data = {\"col_1\": [[1, 2], [3, 4]]}\n    features = Features({\"col_1\": LargeList(Value(\"int64\"))})\n    ds = Dataset.from_dict(data, features=features)\n    dataset_path = tmp_path / \"dataset_dir\"\n    ds.save_to_disk(dataset_path)\n    assert (dataset_path / \"data-00000-of-00001.arrow\").exists()\n    loaded_ds = load_from_disk(dataset_path)\n    assert len(loaded_ds) == len(ds)\n    assert loaded_ds.features == ds.features\n    assert loaded_ds.to_dict() == ds.to_dict()\n\n\n@require_polars\ndef test_from_polars_with_large_list():\n    import polars as pl\n\n    df = pl.from_dict({\"col_1\": [[1, 2], [3, 4]]})\n    ds = Dataset.from_polars(df)\n    assert isinstance(ds, Dataset)\n\n\n@require_polars\ndef test_from_polars_save_to_disk_with_large_list(tmp_path):\n    import polars as pl\n\n    df = pl.from_dict({\"col_1\": [[1, 2], [3, 4]]})\n    ds = Dataset.from_polars(df)\n    dataset_path = tmp_path / \"dataset_dir\"\n    ds.save_to_disk(dataset_path)\n    assert (dataset_path / 
\"data-00000-of-00001.arrow\").exists()\n\n\n@require_polars\ndef test_from_polars_save_to_disk_and_load_from_disk_round_trip_with_large_list(tmp_path):\n    import polars as pl\n\n    df = pl.from_dict({\"col_1\": [[1, 2], [3, 4]]})\n    ds = Dataset.from_polars(df)\n    dataset_path = tmp_path / \"dataset_dir\"\n    ds.save_to_disk(dataset_path)\n    assert (dataset_path / \"data-00000-of-00001.arrow\").exists()\n    loaded_ds = load_from_disk(dataset_path)\n    assert len(loaded_ds) == len(ds)\n    assert loaded_ds.features == ds.features\n    assert loaded_ds.to_dict() == ds.to_dict()\n\n\n@require_polars\ndef test_polars_round_trip():\n    ds = Dataset.from_dict({\"x\": [[1, 2], [3, 4, 5]], \"y\": [\"a\", \"b\"]})\n    assert isinstance(Dataset.from_polars(ds.to_polars()), Dataset)\n\n\ndef test_add_column():\n    from datasets import Dataset\n\n    ds = Dataset.from_dict({\"a\": [1, 2]})\n    ds = ds.add_column(\"b\", [3, 4])\n    assert \"b\" in ds.features\n    assert ds[0] == {\"a\": 1, \"b\": 3}\n    assert ds[1] == {\"a\": 2, \"b\": 4}\n\n\ndef test_process_large_few_examples(tmp_path):\n    # GH 7911\n    from datasets import Dataset\n\n    target_size = 2 * 1024\n\n    base_text = \"This is a sample sentence that will be repeated many times to create a large dataset. \" * 100\n    large_text = \"\"\n\n    while len(large_text.encode(\"utf-8\")) < target_size:\n        large_text += base_text\n\n    data = {\"text\": [large_text], \"label\": [0], \"id\": [1]}\n\n    ds = Dataset.from_dict(data)\n\n    dataset_path = tmp_path / \"sample_dataset\"\n    # make sure this is split into 2 shards\n    ds.save_to_disk(dataset_path, max_shard_size=\"1KB\")\n    assert (dataset_path / \"data-00000-of-00001.arrow\").exists()\n"
  },
  {
    "path": "tests/test_arrow_reader.py",
    "content": "import os\nimport tempfile\nfrom pathlib import Path\nfrom unittest import TestCase\n\nimport pyarrow as pa\nimport pytest\n\nfrom datasets.arrow_dataset import Dataset\nfrom datasets.arrow_reader import ArrowReader, BaseReader, FileInstructions, ReadInstruction, make_file_instructions\nfrom datasets.info import DatasetInfo\nfrom datasets.splits import NamedSplit, Split, SplitDict, SplitInfo\n\nfrom .utils import assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases\n\n\nclass ReaderTest(BaseReader):\n    \"\"\"\n    Build a Dataset object out of Instruction instance(s).\n    This reader is made for testing. It mocks file reads.\n    \"\"\"\n\n    def _get_table_from_filename(self, filename_skip_take, in_memory=False):\n        \"\"\"Returns a Dataset instance from given (filename, skip, take).\"\"\"\n        filename, skip, take = (\n            filename_skip_take[\"filename\"],\n            filename_skip_take[\"skip\"] if \"skip\" in filename_skip_take else None,\n            filename_skip_take[\"take\"] if \"take\" in filename_skip_take else None,\n        )\n        open(os.path.join(filename), \"wb\").close()\n        pa_table = pa.Table.from_pydict({\"filename\": [Path(filename).name] * 100})\n        if take == -1:\n            take = len(pa_table) - skip\n        if skip is not None and take is not None:\n            pa_table = pa_table.slice(skip, take)\n        return pa_table\n\n\nclass BaseReaderTest(TestCase):\n    def test_read(self):\n        name = \"my_name\"\n        train_info = SplitInfo(name=\"train\", num_examples=100)\n        test_info = SplitInfo(name=\"test\", num_examples=100)\n        split_infos = [train_info, test_info]\n        split_dict = SplitDict()\n        split_dict.add(train_info)\n        split_dict.add(test_info)\n        info = DatasetInfo(splits=split_dict)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            reader = ReaderTest(tmp_dir, info)\n\n            instructions = 
\"test[:33%]\"\n            dset = Dataset(**reader.read(name, instructions, split_infos))\n            self.assertEqual(dset[\"filename\"][0], f\"{name}-test\")\n            self.assertEqual(dset.num_rows, 33)\n            self.assertEqual(dset.num_columns, 1)\n\n            instructions1 = [\"train\", \"test[:33%]\"]\n            instructions2 = [Split.TRAIN, ReadInstruction.from_spec(\"test[:33%]\")]\n            for instructions in [instructions1, instructions2]:\n                datasets_kwargs = [reader.read(name, instr, split_infos) for instr in instructions]\n                train_dset, test_dset = (Dataset(**dataset_kwargs) for dataset_kwargs in datasets_kwargs)\n                self.assertEqual(train_dset[\"filename\"][0], f\"{name}-train\")\n                self.assertEqual(train_dset.num_rows, 100)\n                self.assertEqual(train_dset.num_columns, 1)\n                self.assertIsInstance(train_dset.split, NamedSplit)\n                self.assertEqual(str(train_dset.split), \"train\")\n                self.assertEqual(test_dset[\"filename\"][0], f\"{name}-test\")\n                self.assertEqual(test_dset.num_rows, 33)\n                self.assertEqual(test_dset.num_columns, 1)\n                self.assertIsInstance(test_dset.split, NamedSplit)\n                self.assertEqual(str(test_dset.split), \"test[:33%]\")\n                del train_dset, test_dset\n\n    def test_read_sharded(self):\n        name = \"my_name\"\n        train_info = SplitInfo(name=\"train\", num_examples=1000, shard_lengths=[100] * 10)\n        split_infos = [train_info]\n        split_dict = SplitDict()\n        split_dict.add(train_info)\n        info = DatasetInfo(splits=split_dict)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            reader = ReaderTest(tmp_dir, info)\n\n            instructions = \"train[:33%]\"\n            dset = Dataset(**reader.read(name, instructions, split_infos))\n            self.assertEqual(dset[\"filename\"][0], 
f\"{name}-train-00000-of-00010\")\n            self.assertEqual(dset[\"filename\"][-1], f\"{name}-train-00003-of-00010\")\n            self.assertEqual(dset.num_rows, 330)\n            self.assertEqual(dset.num_columns, 1)\n\n    def test_read_files(self):\n        train_info = SplitInfo(name=\"train\", num_examples=100)\n        test_info = SplitInfo(name=\"test\", num_examples=100)\n        split_dict = SplitDict()\n        split_dict.add(train_info)\n        split_dict.add(test_info)\n        info = DatasetInfo(splits=split_dict)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            reader = ReaderTest(tmp_dir, info)\n\n            files = [\n                {\"filename\": os.path.join(tmp_dir, \"train\")},\n                {\"filename\": os.path.join(tmp_dir, \"test\"), \"skip\": 10, \"take\": 10},\n            ]\n            dset = Dataset(**reader.read_files(files, original_instructions=\"train+test[10:20]\"))\n            self.assertEqual(dset.num_rows, 110)\n            self.assertEqual(dset.num_columns, 1)\n            del dset\n\n\n@pytest.mark.parametrize(\"in_memory\", [False, True])\ndef test_read_table(in_memory, dataset, arrow_file):\n    filename = arrow_file\n    with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase():\n        table = ArrowReader.read_table(filename, in_memory=in_memory)\n    assert table.shape == dataset.data.shape\n    assert set(table.column_names) == set(dataset.data.column_names)\n    assert dict(table.to_pydict()) == dict(dataset.data.to_pydict())  # to_pydict returns OrderedDict\n\n\n@pytest.mark.parametrize(\"in_memory\", [False, True])\ndef test_read_files(in_memory, dataset, arrow_file):\n    filename = arrow_file\n    reader = ArrowReader(\"\", None)\n    with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase():\n        dataset_kwargs = reader.read_files([{\"filename\": filename}], in_memory=in_memory)\n    assert 
dataset_kwargs.keys() == {\"arrow_table\", \"info\", \"split\"}\n    table = dataset_kwargs[\"arrow_table\"]\n    assert table.shape == dataset.data.shape\n    assert set(table.column_names) == set(dataset.data.column_names)\n    assert dict(table.to_pydict()) == dict(dataset.data.to_pydict())  # to_pydict returns OrderedDict\n\n\ndef test_read_instruction_spec():\n    assert ReadInstruction(\"train\", to=10, unit=\"abs\").to_spec() == \"train[:10]\"\n    assert ReadInstruction(\"train\", from_=-80, to=10, unit=\"%\").to_spec() == \"train[-80%:10%]\"\n\n    spec_train_test = \"train+test\"\n    assert ReadInstruction.from_spec(spec_train_test).to_spec() == spec_train_test\n\n    spec_train_abs = \"train[2:10]\"\n    assert ReadInstruction.from_spec(spec_train_abs).to_spec() == spec_train_abs\n\n    spec_train_pct = \"train[15%:-20%]\"\n    assert ReadInstruction.from_spec(spec_train_pct).to_spec() == spec_train_pct\n\n    spec_train_pct_rounding = \"train[:10%](closest)\"\n    assert ReadInstruction.from_spec(spec_train_pct_rounding).to_spec() == \"train[:10%]\"\n\n    spec_train_pct_rounding = \"train[:10%](pct1_dropremainder)\"\n    assert ReadInstruction.from_spec(spec_train_pct_rounding).to_spec() == spec_train_pct_rounding\n\n    spec_train_test_pct_rounding = \"train[:10%](pct1_dropremainder)+test[-10%:](pct1_dropremainder)\"\n    assert ReadInstruction.from_spec(spec_train_test_pct_rounding).to_spec() == spec_train_test_pct_rounding\n\n\ndef test_make_file_instructions_basic():\n    name = \"dummy\"\n    split_infos = [SplitInfo(name=\"train\", num_examples=100)]\n    instruction = \"train[:33%]\"\n    filetype_suffix = \"arrow\"\n    prefix_path = \"prefix\"\n\n    file_instructions = make_file_instructions(name, split_infos, instruction, filetype_suffix, prefix_path)\n    assert isinstance(file_instructions, FileInstructions)\n    assert file_instructions.num_examples == 33\n    assert file_instructions.file_instructions == [\n        {\"filename\": 
os.path.join(prefix_path, f\"{name}-train.arrow\"), \"skip\": 0, \"take\": 33}\n    ]\n\n    split_infos = [SplitInfo(name=\"train\", num_examples=100, shard_lengths=[10] * 10)]\n    file_instructions = make_file_instructions(name, split_infos, instruction, filetype_suffix, prefix_path)\n    assert isinstance(file_instructions, FileInstructions)\n    assert file_instructions.num_examples == 33\n    assert file_instructions.file_instructions == [\n        {\"filename\": os.path.join(prefix_path, f\"{name}-train-00000-of-00010.arrow\"), \"skip\": 0, \"take\": -1},\n        {\"filename\": os.path.join(prefix_path, f\"{name}-train-00001-of-00010.arrow\"), \"skip\": 0, \"take\": -1},\n        {\"filename\": os.path.join(prefix_path, f\"{name}-train-00002-of-00010.arrow\"), \"skip\": 0, \"take\": -1},\n        {\"filename\": os.path.join(prefix_path, f\"{name}-train-00003-of-00010.arrow\"), \"skip\": 0, \"take\": 3},\n    ]\n\n\n@pytest.mark.parametrize(\n    \"split_name, instruction, shard_lengths, read_range\",\n    [\n        (\"train\", \"train[-20%:]\", 100, (80, 100)),\n        (\"train\", \"train[:200]\", 100, (0, 100)),\n        (\"train\", \"train[:-200]\", 100, None),\n        (\"train\", \"train[-200:]\", 100, (0, 100)),\n        (\"train\", \"train[-20%:]\", [10] * 10, (80, 100)),\n        (\"train\", \"train[:200]\", [10] * 10, (0, 100)),\n        (\"train\", \"train[:-200]\", [10] * 10, None),\n        (\"train\", \"train[-200:]\", [10] * 10, (0, 100)),\n    ],\n)\ndef test_make_file_instructions(split_name, instruction, shard_lengths, read_range):\n    name = \"dummy\"\n    split_infos = split_infos = [\n        SplitInfo(\n            name=\"train\",\n            num_examples=shard_lengths if not isinstance(shard_lengths, list) else sum(shard_lengths),\n            shard_lengths=shard_lengths if isinstance(shard_lengths, list) else None,\n        )\n    ]\n    filetype_suffix = \"arrow\"\n    prefix_path = \"prefix\"\n    file_instructions = 
make_file_instructions(name, split_infos, instruction, filetype_suffix, prefix_path)\n    assert isinstance(file_instructions, FileInstructions)\n    assert file_instructions.num_examples == (read_range[1] - read_range[0] if read_range is not None else 0)\n    if read_range is None:\n        assert file_instructions.file_instructions == []\n    else:\n        if not isinstance(shard_lengths, list):\n            assert file_instructions.file_instructions == [\n                {\n                    \"filename\": os.path.join(prefix_path, f\"{name}-{split_name}.arrow\"),\n                    \"skip\": read_range[0],\n                    \"take\": read_range[1] - read_range[0],\n                }\n            ]\n        else:\n            file_instructions_list = []\n            shard_offset = 0\n            for i, shard_length in enumerate(shard_lengths):\n                filename = os.path.join(prefix_path, f\"{name}-{split_name}-{i:05d}-of-{len(shard_lengths):05d}.arrow\")\n                if shard_offset <= read_range[0] < shard_offset + shard_length:\n                    file_instructions_list.append(\n                        {\n                            \"filename\": filename,\n                            \"skip\": read_range[0] - shard_offset,\n                            \"take\": read_range[1] - read_range[0]\n                            if read_range[1] < shard_offset + shard_length\n                            else -1,\n                        }\n                    )\n                elif shard_offset < read_range[1] <= shard_offset + shard_length:\n                    file_instructions_list.append(\n                        {\n                            \"filename\": filename,\n                            \"skip\": 0,\n                            \"take\": read_range[1] - shard_offset\n                            if read_range[1] < shard_offset + shard_length\n                            else -1,\n                        }\n                    )\n       
         elif read_range[0] < shard_offset and read_range[1] > shard_offset + shard_length:\n                    file_instructions_list.append(\n                        {\n                            \"filename\": filename,\n                            \"skip\": 0,\n                            \"take\": -1,\n                        }\n                    )\n                shard_offset += shard_length\n            assert file_instructions.file_instructions == file_instructions_list\n\n\n@pytest.mark.parametrize(\"name, expected_exception\", [(None, TypeError), (\"\", ValueError)])\ndef test_make_file_instructions_raises(name, expected_exception):\n    split_infos = [SplitInfo(name=\"train\", num_examples=100)]\n    instruction = \"train\"\n    filetype_suffix = \"arrow\"\n    prefix_path = \"prefix_path\"\n    with pytest.raises(expected_exception):\n        _ = make_file_instructions(name, split_infos, instruction, filetype_suffix, prefix_path)\n"
  },
  {
    "path": "tests/test_arrow_writer.py",
    "content": "import copy\nimport json\nimport os\nimport tempfile\nfrom unittest import TestCase\nfrom unittest.mock import patch\n\nimport numpy as np\nimport pyarrow as pa\nimport pyarrow.parquet as pq\nimport pytest\n\nfrom datasets import config\nfrom datasets.arrow_writer import ArrowWriter, OptimizedTypedSequence, ParquetWriter, TypedSequence\nfrom datasets.features import Array2D, ClassLabel, Features, Image, Value\nfrom datasets.features.features import Array2DExtensionType, cast_to_python_objects\n\nfrom .utils import require_pil\n\n\nclass TypedSequenceTest(TestCase):\n    def test_no_type(self):\n        arr = pa.array(TypedSequence([1, 2, 3]))\n        self.assertEqual(arr.type, pa.int64())\n\n    def test_array_type_forbidden(self):\n        with self.assertRaises(ValueError):\n            _ = pa.array(TypedSequence([1, 2, 3]), type=pa.int64())\n\n    def test_try_type_and_type_forbidden(self):\n        with self.assertRaises(ValueError):\n            _ = pa.array(TypedSequence([1, 2, 3], try_type=Value(\"bool\"), type=Value(\"int64\")))\n\n    def test_compatible_type(self):\n        arr = pa.array(TypedSequence([1, 2, 3], type=Value(\"int32\")))\n        self.assertEqual(arr.type, pa.int32())\n\n    def test_incompatible_type(self):\n        with self.assertRaises((TypeError, pa.lib.ArrowInvalid)):\n            _ = pa.array(TypedSequence([\"foo\", \"bar\"], type=Value(\"int64\")))\n\n    def test_try_compatible_type(self):\n        arr = pa.array(TypedSequence([1, 2, 3], try_type=Value(\"int32\")))\n        self.assertEqual(arr.type, pa.int32())\n\n    def test_try_incompatible_type(self):\n        arr = pa.array(TypedSequence([\"foo\", \"bar\"], try_type=Value(\"int64\")))\n        self.assertEqual(arr.type, pa.string())\n\n    def test_compatible_extension_type(self):\n        arr = pa.array(TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), \"int64\")))\n        self.assertEqual(arr.type, Array2DExtensionType((1, 3), \"int64\"))\n\n    def 
test_incompatible_extension_type(self):\n        with self.assertRaises((TypeError, pa.lib.ArrowInvalid)):\n            _ = pa.array(TypedSequence([\"foo\", \"bar\"], type=Array2D((1, 3), \"int64\")))\n\n    def test_try_compatible_extension_type(self):\n        arr = pa.array(TypedSequence([[[1, 2, 3]]], try_type=Array2D((1, 3), \"int64\")))\n        self.assertEqual(arr.type, Array2DExtensionType((1, 3), \"int64\"))\n\n    def test_try_incompatible_extension_type(self):\n        arr = pa.array(TypedSequence([\"foo\", \"bar\"], try_type=Array2D((1, 3), \"int64\")))\n        self.assertEqual(arr.type, pa.string())\n\n    @require_pil\n    def test_exhaustive_cast(self):\n        import PIL.Image\n\n        pil_image = PIL.Image.fromarray(np.arange(10, dtype=np.uint8).reshape(2, 5))\n        with patch(\n            \"datasets.arrow_writer.cast_to_python_objects\", side_effect=cast_to_python_objects\n        ) as mock_cast_to_python_objects:\n            _ = pa.array(TypedSequence([{\"path\": None, \"bytes\": b\"image_bytes\"}, pil_image], type=Image()))\n            args, kwargs = mock_cast_to_python_objects.call_args_list[-1]\n            self.assertIn(\"optimize_list_casting\", kwargs)\n            self.assertFalse(kwargs[\"optimize_list_casting\"])\n\n\ndef _check_output(output, expected_num_chunks: int):\n    stream = pa.BufferReader(output) if isinstance(output, pa.Buffer) else pa.memory_map(output)\n    f = pa.ipc.open_stream(stream)\n    pa_table: pa.Table = f.read_all()\n    assert len(pa_table.to_batches()) == expected_num_chunks\n    assert pa_table.to_pydict() == {\"col_1\": [\"foo\", \"bar\"], \"col_2\": [1, 2]}\n    del pa_table\n\n\n@pytest.mark.parametrize(\"writer_batch_size\", [None, 1, 10])\n@pytest.mark.parametrize(\n    \"fields\",\n    [\n        None,\n        {\"col_1\": pa.string(), \"col_2\": pa.int64()},\n        {\"col_1\": pa.string(), \"col_2\": pa.int32()},\n        {\"col_2\": pa.int64(), \"col_1\": pa.string()},\n    ],\n)\ndef 
test_write(fields, writer_batch_size):\n    output = pa.BufferOutputStream()\n    schema = pa.schema(fields) if fields else None\n    with ArrowWriter(stream=output, schema=schema, writer_batch_size=writer_batch_size) as writer:\n        writer.write({\"col_1\": \"foo\", \"col_2\": 1})\n        writer.write({\"col_1\": \"bar\", \"col_2\": 2})\n        num_examples, num_bytes = writer.finalize()\n    assert num_examples == 2\n    assert num_bytes > 0\n    if not fields:\n        fields = {\"col_1\": pa.string(), \"col_2\": pa.int64()}\n    assert writer._schema == pa.schema(fields, metadata=writer._schema.metadata)\n    _check_output(output.getvalue(), expected_num_chunks=num_examples if writer_batch_size == 1 else 1)\n\n\ndef test_write_with_features():\n    output = pa.BufferOutputStream()\n    features = Features({\"labels\": ClassLabel(names=[\"neg\", \"pos\"])})\n    with ArrowWriter(stream=output, features=features) as writer:\n        writer.write({\"labels\": 0})\n        writer.write({\"labels\": 1})\n        num_examples, num_bytes = writer.finalize()\n    assert num_examples == 2\n    assert num_bytes > 0\n    assert writer._schema == features.arrow_schema\n    assert writer._schema.metadata == features.arrow_schema.metadata\n    stream = pa.BufferReader(output.getvalue())\n    f = pa.ipc.open_stream(stream)\n    pa_table: pa.Table = f.read_all()\n    schema = pa_table.schema\n    assert pa_table.num_rows == 2\n    assert schema == features.arrow_schema\n    assert schema.metadata == features.arrow_schema.metadata\n    assert features == Features.from_arrow_schema(schema)\n\n\n@pytest.mark.parametrize(\"writer_batch_size\", [None, 2, 10])\ndef test_write_with_keys(writer_batch_size):\n    output = pa.BufferOutputStream()\n    with ArrowWriter(\n        stream=output,\n        writer_batch_size=writer_batch_size,\n    ) as writer:\n        writer.write({\"col_1\": \"foo\", \"col_2\": 1})\n        writer.write({\"col_1\": \"bar\", \"col_2\": 2})\n        
num_examples, num_bytes = writer.finalize()\n    assert num_examples == 2\n    assert num_bytes > 0\n    _check_output(output.getvalue(), expected_num_chunks=num_examples if writer_batch_size == 1 else 1)\n\n\n@pytest.mark.parametrize(\"writer_batch_size\", [None, 1, 10])\n@pytest.mark.parametrize(\n    \"fields\", [None, {\"col_1\": pa.string(), \"col_2\": pa.int64()}, {\"col_1\": pa.string(), \"col_2\": pa.int32()}]\n)\ndef test_write_batch(fields, writer_batch_size):\n    output = pa.BufferOutputStream()\n    schema = pa.schema(fields) if fields else None\n    with ArrowWriter(stream=output, schema=schema, writer_batch_size=writer_batch_size) as writer:\n        writer.write_batch({\"col_1\": [\"foo\", \"bar\"], \"col_2\": [1, 2]})\n        writer.write_batch({\"col_1\": [], \"col_2\": []})\n        num_examples, num_bytes = writer.finalize()\n    assert num_examples == 2\n    assert num_bytes > 0\n    if not fields:\n        fields = {\"col_1\": pa.string(), \"col_2\": pa.int64()}\n    assert writer._schema == pa.schema(fields, metadata=writer._schema.metadata)\n    _check_output(output.getvalue(), expected_num_chunks=num_examples if writer_batch_size == 1 else 1)\n\n\n@pytest.mark.parametrize(\"writer_batch_size\", [None, 1, 10])\n@pytest.mark.parametrize(\n    \"fields\", [None, {\"col_1\": pa.string(), \"col_2\": pa.int64()}, {\"col_1\": pa.string(), \"col_2\": pa.int32()}]\n)\ndef test_write_table(fields, writer_batch_size):\n    output = pa.BufferOutputStream()\n    schema = pa.schema(fields) if fields else None\n    with ArrowWriter(stream=output, schema=schema, writer_batch_size=writer_batch_size) as writer:\n        writer.write_table(pa.Table.from_pydict({\"col_1\": [\"foo\", \"bar\"], \"col_2\": [1, 2]}))\n        num_examples, num_bytes = writer.finalize()\n    assert num_examples == 2\n    assert num_bytes > 0\n    if not fields:\n        fields = {\"col_1\": pa.string(), \"col_2\": pa.int64()}\n    assert writer._schema == pa.schema(fields, 
metadata=writer._schema.metadata)\n    _check_output(output.getvalue(), expected_num_chunks=num_examples if writer_batch_size == 1 else 1)\n\n\n@pytest.mark.parametrize(\"writer_batch_size\", [None, 1, 10])\n@pytest.mark.parametrize(\n    \"fields\", [None, {\"col_1\": pa.string(), \"col_2\": pa.int64()}, {\"col_1\": pa.string(), \"col_2\": pa.int32()}]\n)\ndef test_write_row(fields, writer_batch_size):\n    output = pa.BufferOutputStream()\n    schema = pa.schema(fields) if fields else None\n    with ArrowWriter(stream=output, schema=schema, writer_batch_size=writer_batch_size) as writer:\n        writer.write_row(pa.Table.from_pydict({\"col_1\": [\"foo\"], \"col_2\": [1]}))\n        writer.write_row(pa.Table.from_pydict({\"col_1\": [\"bar\"], \"col_2\": [2]}))\n        num_examples, num_bytes = writer.finalize()\n    assert num_examples == 2\n    assert num_bytes > 0\n    if not fields:\n        fields = {\"col_1\": pa.string(), \"col_2\": pa.int64()}\n    assert writer._schema == pa.schema(fields, metadata=writer._schema.metadata)\n    _check_output(output.getvalue(), expected_num_chunks=num_examples if writer_batch_size == 1 else 1)\n\n\ndef test_write_file():\n    with tempfile.TemporaryDirectory() as tmp_dir:\n        fields = {\"col_1\": pa.string(), \"col_2\": pa.int64()}\n        output = os.path.join(tmp_dir, \"test.arrow\")\n        with ArrowWriter(path=output, schema=pa.schema(fields)) as writer:\n            writer.write_batch({\"col_1\": [\"foo\", \"bar\"], \"col_2\": [1, 2]})\n            num_examples, num_bytes = writer.finalize()\n        assert num_examples == 2\n        assert num_bytes > 0\n        assert writer._schema == pa.schema(fields, metadata=writer._schema.metadata)\n        _check_output(output, 1)\n\n\ndef get_base_dtype(arr_type):\n    if pa.types.is_list(arr_type):\n        return get_base_dtype(arr_type.value_type)\n    else:\n        return arr_type\n\n\ndef change_first_primitive_element_in_list(lst, value):\n    if 
isinstance(lst[0], list):\n        change_first_primitive_element_in_list(lst[0], value)\n    else:\n        lst[0] = value\n\n\n@pytest.mark.parametrize(\"optimized_int_type, expected_dtype\", [(None, pa.int64()), (Value(\"int32\"), pa.int32())])\n@pytest.mark.parametrize(\"sequence\", [[1, 2, 3], [[1, 2, 3]], [[[1, 2, 3]]]])\ndef test_optimized_int_type_for_typed_sequence(sequence, optimized_int_type, expected_dtype):\n    arr = pa.array(TypedSequence(sequence, optimized_int_type=optimized_int_type))\n    assert get_base_dtype(arr.type) == expected_dtype\n\n\n@pytest.mark.parametrize(\n    \"col, expected_dtype\",\n    [\n        (\"attention_mask\", pa.int8()),\n        (\"special_tokens_mask\", pa.int8()),\n        (\"token_type_ids\", pa.int8()),\n        (\"input_ids\", pa.int32()),\n        (\"other\", pa.int64()),\n    ],\n)\n@pytest.mark.parametrize(\"sequence\", [[1, 2, 3], [[1, 2, 3]], [[[1, 2, 3]]]])\ndef test_optimized_typed_sequence(sequence, col, expected_dtype):\n    # in range\n    arr = pa.array(OptimizedTypedSequence(sequence, col=col))\n    assert get_base_dtype(arr.type) == expected_dtype\n\n    # not in range\n    if col != \"other\":\n        # avoids errors due to in-place modifications\n        sequence = copy.deepcopy(sequence)\n        value = np.iinfo(expected_dtype.to_pandas_dtype()).max + 1\n        change_first_primitive_element_in_list(sequence, value)\n        arr = pa.array(OptimizedTypedSequence(sequence, col=col))\n        assert get_base_dtype(arr.type) == pa.int64()\n\n\n@pytest.mark.parametrize(\"raise_exception\", [False, True])\ndef test_arrow_writer_closes_stream(raise_exception, tmp_path):\n    path = str(tmp_path / \"dataset-train.arrow\")\n    try:\n        with ArrowWriter(path=path) as writer:\n            if raise_exception:\n                raise pa.lib.ArrowInvalid()\n            else:\n                writer.stream.close()\n    except pa.lib.ArrowInvalid:\n        pass\n    finally:\n        assert 
writer.stream.closed\n\n\ndef test_arrow_writer_with_filesystem(mockfs):\n    path = \"mock://dataset-train.arrow\"\n    with ArrowWriter(path=path, storage_options=mockfs.storage_options) as writer:\n        assert isinstance(writer._fs, type(mockfs))\n        assert writer._fs.storage_options == mockfs.storage_options\n        writer.write({\"col_1\": \"foo\", \"col_2\": 1})\n        writer.write({\"col_1\": \"bar\", \"col_2\": 2})\n        num_examples, num_bytes = writer.finalize()\n    assert num_examples == 2\n    assert num_bytes > 0\n    assert mockfs.exists(path)\n\n\ndef test_parquet_writer_write():\n    output = pa.BufferOutputStream()\n    with ParquetWriter(stream=output) as writer:\n        writer.write({\"col_1\": \"foo\", \"col_2\": 1})\n        writer.write({\"col_1\": \"bar\", \"col_2\": 2})\n        num_examples, num_bytes = writer.finalize()\n    assert num_examples == 2\n    assert num_bytes > 0\n    stream = pa.BufferReader(output.getvalue())\n    pa_table: pa.Table = pq.read_table(stream)\n    assert pa_table.to_pydict() == {\"col_1\": [\"foo\", \"bar\"], \"col_2\": [1, 2]}\n\n\ndef test_parquet_writer_uses_content_defined_chunking():\n    def write_and_get_argument_and_metadata(**kwargs):\n        output = pa.BufferOutputStream()\n        with patch(\"pyarrow.parquet.ParquetWriter\", wraps=pq.ParquetWriter) as MockWriter:\n            with ParquetWriter(stream=output, **kwargs) as writer:\n                writer.write({\"col_1\": \"foo\", \"col_2\": 1})\n                writer.write({\"col_1\": \"bar\", \"col_2\": 2})\n                writer.finalize()\n            assert MockWriter.call_count == 1\n            _, kwargs = MockWriter.call_args\n            assert \"use_content_defined_chunking\" in kwargs\n\n        # read metadata from the output stream\n        with pa.input_stream(output.getvalue()) as stream:\n            metadata = pq.read_metadata(stream)\n            key_value_metadata = metadata.metadata\n\n        return 
kwargs[\"use_content_defined_chunking\"], key_value_metadata\n\n    # not passing the use_content_defined_chunking argument, using the default\n    passed_arg, key_value_metadata = write_and_get_argument_and_metadata()\n    assert passed_arg == config.DEFAULT_CDC_OPTIONS\n    assert b\"content_defined_chunking\" in key_value_metadata\n    json_encoded_options = key_value_metadata[b\"content_defined_chunking\"].decode(\"utf-8\")\n    assert json.loads(json_encoded_options) == config.DEFAULT_CDC_OPTIONS\n\n    # passing True, using the default options\n    passed_arg, key_value_metadata = write_and_get_argument_and_metadata(use_content_defined_chunking=True)\n    assert passed_arg == config.DEFAULT_CDC_OPTIONS\n    assert b\"content_defined_chunking\" in key_value_metadata\n    json_encoded_options = key_value_metadata[b\"content_defined_chunking\"].decode(\"utf-8\")\n    assert json.loads(json_encoded_options) == config.DEFAULT_CDC_OPTIONS\n\n    # passing False, not using content defined chunking\n    passed_arg, key_value_metadata = write_and_get_argument_and_metadata(use_content_defined_chunking=False)\n    assert passed_arg is False\n    assert b\"content_defined_chunking\" not in key_value_metadata\n\n    # passing custom options, using the custom options\n    custom_cdc_options = {\n        \"min_chunk_size\": 128 * 1024,  # 128 KiB\n        \"max_chunk_size\": 512 * 1024,  # 512 KiB\n        \"norm_level\": 1,\n    }\n    passed_arg, key_value_metadata = write_and_get_argument_and_metadata(\n        use_content_defined_chunking=custom_cdc_options\n    )\n    assert passed_arg == custom_cdc_options\n    assert b\"content_defined_chunking\" in key_value_metadata\n    json_encoded_options = key_value_metadata[b\"content_defined_chunking\"].decode(\"utf-8\")\n    assert json.loads(json_encoded_options) == custom_cdc_options\n\n    # passing None or wrong options raise by pyarrow\n    with pytest.raises(TypeError):\n        
write_and_get_argument_and_metadata(use_content_defined_chunking=None)\n    with pytest.raises(TypeError):\n        write_and_get_argument_and_metadata(use_content_defined_chunking=\"invalid_options\")\n    with pytest.raises(ValueError):\n        write_and_get_argument_and_metadata(use_content_defined_chunking={\"invalid_option\": 1})\n\n\ndef test_parquet_writer_writes_page_index():\n    output = pa.BufferOutputStream()\n    with patch(\"pyarrow.parquet.ParquetWriter\", wraps=pq.ParquetWriter) as MockWriter:\n        with ParquetWriter(stream=output) as writer:\n            writer.write({\"col_1\": \"foo\", \"col_2\": 1})\n            writer.write({\"col_1\": \"bar\", \"col_2\": 2})\n            writer.finalize()\n        assert MockWriter.call_count == 1\n        _, kwargs = MockWriter.call_args\n        assert \"write_page_index\" in kwargs\n        assert kwargs[\"write_page_index\"]\n\n\n@require_pil\n@pytest.mark.parametrize(\"embed_local_files\", [False, True])\ndef test_writer_embed_local_files(tmp_path, embed_local_files):\n    import PIL.Image\n\n    image_path = str(tmp_path / \"test_image_rgb.jpg\")\n    PIL.Image.fromarray(np.zeros((5, 5), dtype=np.uint8)).save(image_path, format=\"png\")\n    output = pa.BufferOutputStream()\n    with ParquetWriter(\n        stream=output, features=Features({\"image\": Image()}), embed_local_files=embed_local_files\n    ) as writer:\n        writer.write({\"image\": image_path})\n        writer.finalize()\n    stream = pa.BufferReader(output.getvalue())\n    pa_table: pa.Table = pq.read_table(stream)\n    out = pa_table.to_pydict()\n    if embed_local_files:\n        assert isinstance(out[\"image\"][0][\"path\"], str)\n        with open(image_path, \"rb\") as f:\n            assert out[\"image\"][0][\"bytes\"] == f.read()\n    else:\n        assert out[\"image\"][0][\"path\"] == image_path\n        assert out[\"image\"][0][\"bytes\"] is None\n\n\ndef test_always_nullable():\n    non_nullable_schema = 
pa.schema([pa.field(\"col_1\", pa.string(), nullable=False)])\n    output = pa.BufferOutputStream()\n    with ArrowWriter(stream=output) as writer:\n        writer._build_writer(inferred_schema=non_nullable_schema)\n        assert writer._schema == pa.schema([pa.field(\"col_1\", pa.string())])\n"
  },
  {
    "path": "tests/test_builder.py",
    "content": "import importlib\nimport os\nimport tempfile\nimport types\nfrom contextlib import nullcontext as does_not_raise\nfrom multiprocessing import Process\nfrom pathlib import Path\nfrom unittest import TestCase\nfrom unittest.mock import patch\n\nimport numpy as np\nimport pyarrow as pa\nimport pyarrow.parquet as pq\nimport pytest\nfrom multiprocess.pool import Pool\n\nfrom datasets.arrow_dataset import Dataset\nfrom datasets.arrow_writer import ArrowWriter\nfrom datasets.builder import (\n    ArrowBasedBuilder,\n    BuilderConfig,\n    DatasetBuilder,\n    GeneratorBasedBuilder,\n    InvalidConfigName,\n    Key,\n)\nfrom datasets.data_files import DataFilesList\nfrom datasets.dataset_dict import DatasetDict, IterableDatasetDict\nfrom datasets.download.download_manager import DownloadMode\nfrom datasets.features import Features, List, Value\nfrom datasets.info import DatasetInfo, PostProcessedInfo\nfrom datasets.iterable_dataset import IterableDataset\nfrom datasets.load import configure_builder_class\nfrom datasets.splits import Split, SplitDict, SplitGenerator, SplitInfo\nfrom datasets.streaming import xjoin\nfrom datasets.utils.file_utils import is_local_path\nfrom datasets.utils.info_utils import VerificationMode\nfrom datasets.utils.logging import INFO, get_logger\n\nfrom .utils import (\n    assert_arrow_memory_doesnt_increase,\n    assert_arrow_memory_increases,\n    require_faiss,\n    set_current_working_directory_to_temp_dir,\n)\n\n\nclass DummyBuilder(DatasetBuilder):\n    def _info(self):\n        return DatasetInfo(features=Features({\"text\": Value(\"string\")}))\n\n    def _split_generators(self, dl_manager):\n        return [SplitGenerator(name=Split.TRAIN)]\n\n    def _prepare_split(self, split_generator, **kwargs):\n        fname = f\"{self.dataset_name}-{split_generator.name}.arrow\"\n        with ArrowWriter(features=self.info.features, path=os.path.join(self._output_dir, fname)) as writer:\n            writer.write_batch({\"text\": 
[\"foo\"] * 100})\n            num_examples, num_bytes = writer.finalize()\n        split_generator.split_info.num_examples = num_examples\n        split_generator.split_info.num_bytes = num_bytes\n\n\nclass DummyGeneratorBasedBuilder(GeneratorBasedBuilder):\n    def _info(self):\n        return DatasetInfo(features=Features({\"text\": Value(\"string\")}))\n\n    def _split_generators(self, dl_manager):\n        return [SplitGenerator(name=Split.TRAIN)]\n\n    def _generate_examples(self):\n        for i in range(100):\n            yield (0, i), {\"text\": \"foo\"}\n\n\nclass DummyArrowBasedBuilder(ArrowBasedBuilder):\n    def _info(self):\n        return DatasetInfo(features=Features({\"text\": Value(\"string\")}))\n\n    def _split_generators(self, dl_manager):\n        return [SplitGenerator(name=Split.TRAIN)]\n\n    def _generate_tables(self):\n        for i in range(10):\n            yield (0, i), pa.table({\"text\": [\"foo\"] * 10})\n\n\nclass DummyGeneratorBasedBuilderWithIntegers(GeneratorBasedBuilder):\n    def _info(self):\n        return DatasetInfo(features=Features({\"id\": Value(\"int8\")}))\n\n    def _split_generators(self, dl_manager):\n        return [SplitGenerator(name=Split.TRAIN)]\n\n    def _generate_examples(self):\n        for i in range(100):\n            yield (0, i), {\"id\": i}\n\n\nclass DummyGeneratorBasedBuilderConfig(BuilderConfig):\n    def __init__(self, content=\"foo\", times=2, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        self.content = content\n        self.times = times\n\n\nclass DummyGeneratorBasedBuilderWithConfig(GeneratorBasedBuilder):\n    BUILDER_CONFIG_CLASS = DummyGeneratorBasedBuilderConfig\n\n    def _info(self):\n        return DatasetInfo(features=Features({\"text\": Value(\"string\")}))\n\n    def _split_generators(self, dl_manager):\n        return [SplitGenerator(name=Split.TRAIN)]\n\n    def _generate_examples(self):\n        for i in range(100):\n            yield (0, i), {\"text\": 
self.config.content * self.config.times}\n\n\nclass DummyBuilderWithMultipleConfigs(DummyBuilder):\n    BUILDER_CONFIGS = [\n        DummyGeneratorBasedBuilderConfig(name=\"a\"),\n        DummyGeneratorBasedBuilderConfig(name=\"b\"),\n    ]\n\n\nclass DummyBuilderWithDefaultConfig(DummyBuilderWithMultipleConfigs):\n    DEFAULT_CONFIG_NAME = \"a\"\n\n\nclass DummyBuilderWithDownload(DummyBuilder):\n    def __init__(self, *args, rel_path=None, abs_path=None, **kwargs):\n        super().__init__(*args, **kwargs)\n        self._rel_path = rel_path\n        self._abs_path = abs_path\n\n    def _split_generators(self, dl_manager):\n        if self._rel_path is not None:\n            assert os.path.exists(dl_manager.download(self._rel_path)), \"dl_manager must support relative paths\"\n        if self._abs_path is not None:\n            assert os.path.exists(dl_manager.download(self._abs_path)), \"dl_manager must support absolute paths\"\n        return [SplitGenerator(name=Split.TRAIN)]\n\n\nclass DummyArrowBasedBuilderWithShards(ArrowBasedBuilder):\n    def _info(self):\n        return DatasetInfo(features=Features({\"id\": Value(\"int8\"), \"filepath\": Value(\"string\")}))\n\n    def _split_generators(self, dl_manager):\n        return [SplitGenerator(name=Split.TRAIN, gen_kwargs={\"filepaths\": [f\"data{i}.txt\" for i in range(4)]})]\n\n    def _generate_tables(self, filepaths):\n        for shard_idx, filepath in enumerate(filepaths):\n            for i in range(10):\n                yield Key(shard_idx, i), pa.table({\"id\": range(10 * i, 10 * (i + 1)), \"filepath\": [filepath] * 10})\n\n\nclass DummyGeneratorBasedBuilderWithShards(GeneratorBasedBuilder):\n    def _info(self):\n        return DatasetInfo(features=Features({\"id\": Value(\"int8\"), \"filepath\": Value(\"string\")}))\n\n    def _split_generators(self, dl_manager):\n        return [SplitGenerator(name=Split.TRAIN, gen_kwargs={\"filepaths\": [f\"data{i}.txt\" for i in range(4)]})]\n\n    def 
_generate_examples(self, filepaths):\n        for shard_idx, filepath in enumerate(filepaths):\n            for i in range(100):\n                yield Key(shard_idx, i), {\"id\": i, \"filepath\": filepath}\n\n\nclass DummyArrowBasedBuilderWithAmbiguousShards(ArrowBasedBuilder):\n    def _info(self):\n        return DatasetInfo(features=Features({\"id\": Value(\"int8\"), \"filepath\": Value(\"string\")}))\n\n    def _split_generators(self, dl_manager):\n        return [\n            SplitGenerator(\n                name=Split.TRAIN,\n                gen_kwargs={\n                    \"filepaths\": [f\"data{i}.txt\" for i in range(4)],\n                    \"dummy_kwarg_with_different_length\": [f\"dummy_data{i}.txt\" for i in range(3)],\n                },\n            )\n        ]\n\n    def _generate_tables(self, filepaths, dummy_kwarg_with_different_length):\n        for shard_idx, filepath in enumerate(filepaths):\n            for i in range(10):\n                yield Key(shard_idx, i), pa.table({\"id\": range(10 * i, 10 * (i + 1)), \"filepath\": [filepath] * 10})\n\n\nclass DummyGeneratorBasedBuilderWithAmbiguousShards(GeneratorBasedBuilder):\n    def _info(self):\n        return DatasetInfo(features=Features({\"id\": Value(\"int8\"), \"filepath\": Value(\"string\")}))\n\n    def _split_generators(self, dl_manager):\n        return [\n            SplitGenerator(\n                name=Split.TRAIN,\n                gen_kwargs={\n                    \"filepaths\": [f\"data{i}.txt\" for i in range(4)],\n                    \"dummy_kwarg_with_different_length\": [f\"dummy_data{i}.txt\" for i in range(3)],\n                },\n            )\n        ]\n\n    def _generate_examples(self, filepaths, dummy_kwarg_with_different_length):\n        for shard_idx, filepath in enumerate(filepaths):\n            for i in range(100):\n                yield Key(shard_idx, i), {\"id\": i, \"filepath\": filepath}\n\n\ndef _run_concurrent_download_and_prepare(tmp_dir):\n    
builder = DummyBuilder(cache_dir=tmp_dir)\n    builder.download_and_prepare(download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)\n    return builder\n\n\ndef check_streaming(builder):\n    builders_module = importlib.import_module(builder.__module__)\n    assert builders_module._patched_for_streaming\n    assert builders_module.os.path.join is xjoin\n\n\nclass BuilderTest(TestCase):\n    def test_download_and_prepare(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            builder = DummyBuilder(cache_dir=tmp_dir)\n            builder.download_and_prepare(download_mode=DownloadMode.FORCE_REDOWNLOAD)\n            self.assertTrue(\n                os.path.exists(\n                    os.path.join(\n                        tmp_dir, builder.dataset_name, \"default\", \"0.0.0\", f\"{builder.dataset_name}-train.arrow\"\n                    )\n                )\n            )\n            self.assertDictEqual(builder.info.features, Features({\"text\": Value(\"string\")}))\n            self.assertEqual(builder.info.splits[\"train\"].num_examples, 100)\n            self.assertTrue(\n                os.path.exists(os.path.join(tmp_dir, builder.dataset_name, \"default\", \"0.0.0\", \"dataset_info.json\"))\n            )\n\n    def test_download_and_prepare_checksum_computation(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            builder_no_verification = DummyBuilder(cache_dir=tmp_dir)\n            builder_no_verification.download_and_prepare(download_mode=DownloadMode.FORCE_REDOWNLOAD)\n            self.assertTrue(\n                all(v[\"checksum\"] is not None for _, v in builder_no_verification.info.download_checksums.items())\n            )\n            builder_with_verification = DummyBuilder(cache_dir=tmp_dir)\n            builder_with_verification.download_and_prepare(\n                download_mode=DownloadMode.FORCE_REDOWNLOAD,\n                verification_mode=VerificationMode.ALL_CHECKS,\n            )\n            
self.assertTrue(\n                all(v[\"checksum\"] is None for _, v in builder_with_verification.info.download_checksums.items())\n            )\n\n    def test_concurrent_download_and_prepare(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            processes = 2\n            with Pool(processes=processes) as pool:\n                jobs = [\n                    pool.apply_async(_run_concurrent_download_and_prepare, kwds={\"tmp_dir\": tmp_dir})\n                    for _ in range(processes)\n                ]\n                builders = [job.get() for job in jobs]\n                for builder in builders:\n                    self.assertTrue(\n                        os.path.exists(\n                            os.path.join(\n                                tmp_dir,\n                                builder.dataset_name,\n                                \"default\",\n                                \"0.0.0\",\n                                f\"{builder.dataset_name}-train.arrow\",\n                            )\n                        )\n                    )\n                    self.assertDictEqual(builder.info.features, Features({\"text\": Value(\"string\")}))\n                    self.assertEqual(builder.info.splits[\"train\"].num_examples, 100)\n                    self.assertTrue(\n                        os.path.exists(\n                            os.path.join(tmp_dir, builder.dataset_name, \"default\", \"0.0.0\", \"dataset_info.json\")\n                        )\n                    )\n\n    def test_download_and_prepare_with_base_path(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            rel_path = \"dummy1.data\"\n            abs_path = os.path.join(tmp_dir, \"dummy2.data\")\n            # test relative path is missing\n            builder = DummyBuilderWithDownload(cache_dir=tmp_dir, rel_path=rel_path)\n            with self.assertRaises(FileNotFoundError):\n                
builder.download_and_prepare(download_mode=DownloadMode.FORCE_REDOWNLOAD, base_path=tmp_dir)\n            # test absolute path is missing\n            builder = DummyBuilderWithDownload(cache_dir=tmp_dir, abs_path=abs_path)\n            with self.assertRaises(FileNotFoundError):\n                builder.download_and_prepare(download_mode=DownloadMode.FORCE_REDOWNLOAD, base_path=tmp_dir)\n            # test that they are both properly loaded when they exist\n            open(os.path.join(tmp_dir, rel_path), \"w\")\n            open(abs_path, \"w\")\n            builder = DummyBuilderWithDownload(cache_dir=tmp_dir, rel_path=rel_path, abs_path=abs_path)\n            builder.download_and_prepare(download_mode=DownloadMode.FORCE_REDOWNLOAD, base_path=tmp_dir)\n            self.assertTrue(\n                os.path.exists(\n                    os.path.join(\n                        tmp_dir,\n                        builder.dataset_name,\n                        \"default\",\n                        \"0.0.0\",\n                        f\"{builder.dataset_name}-train.arrow\",\n                    )\n                )\n            )\n\n    def test_as_dataset_with_post_process(self):\n        def _post_process(self, dataset, resources_paths):\n            def char_tokenize(example):\n                return {\"tokens\": list(example[\"text\"])}\n\n            return dataset.map(char_tokenize, cache_file_name=resources_paths[\"tokenized_dataset\"])\n\n        def _post_processing_resources(self, split):\n            return {\"tokenized_dataset\": f\"tokenized_dataset-{split}.arrow\"}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            builder = DummyBuilder(cache_dir=tmp_dir)\n            builder.info.post_processed = PostProcessedInfo(\n                features=Features({\"text\": Value(\"string\"), \"tokens\": List(Value(\"string\"))})\n            )\n            builder._post_process = types.MethodType(_post_process, builder)\n            
builder._post_processing_resources = types.MethodType(_post_processing_resources, builder)\n            os.makedirs(builder.cache_dir)\n\n            builder.info.splits = SplitDict()\n            builder.info.splits.add(SplitInfo(\"train\", num_examples=10))\n            builder.info.splits.add(SplitInfo(\"test\", num_examples=10))\n\n            for split in builder.info.splits:\n                with ArrowWriter(\n                    path=os.path.join(builder.cache_dir, f\"{builder.dataset_name}-{split}.arrow\"),\n                    features=Features({\"text\": Value(\"string\")}),\n                ) as writer:\n                    writer.write_batch({\"text\": [\"foo\"] * 10})\n                    writer.finalize()\n\n                with ArrowWriter(\n                    path=os.path.join(builder.cache_dir, f\"tokenized_dataset-{split}.arrow\"),\n                    features=Features({\"text\": Value(\"string\"), \"tokens\": List(Value(\"string\"))}),\n                ) as writer:\n                    writer.write_batch({\"text\": [\"foo\"] * 10, \"tokens\": [list(\"foo\")] * 10})\n                    writer.finalize()\n\n            dsets = builder.as_dataset()\n            self.assertIsInstance(dsets, DatasetDict)\n            self.assertListEqual(list(dsets.keys()), [\"train\", \"test\"])\n            self.assertEqual(len(dsets[\"train\"]), 10)\n            self.assertEqual(len(dsets[\"test\"]), 10)\n            self.assertDictEqual(\n                dsets[\"train\"].features, Features({\"text\": Value(\"string\"), \"tokens\": List(Value(\"string\"))})\n            )\n            self.assertDictEqual(\n                dsets[\"test\"].features, Features({\"text\": Value(\"string\"), \"tokens\": List(Value(\"string\"))})\n            )\n            self.assertListEqual(dsets[\"train\"].column_names, [\"text\", \"tokens\"])\n            self.assertListEqual(dsets[\"test\"].column_names, [\"text\", \"tokens\"])\n            del dsets\n\n            dset = 
builder.as_dataset(\"train\")\n            self.assertIsInstance(dset, Dataset)\n            self.assertEqual(dset.split, \"train\")\n            self.assertEqual(len(dset), 10)\n            self.assertDictEqual(dset.features, Features({\"text\": Value(\"string\"), \"tokens\": List(Value(\"string\"))}))\n            self.assertListEqual(dset.column_names, [\"text\", \"tokens\"])\n            self.assertGreater(builder.info.post_processing_size, 0)\n            self.assertGreater(\n                builder.info.post_processed.resources_checksums[\"train\"][\"tokenized_dataset\"][\"num_bytes\"], 0\n            )\n            del dset\n\n            dset = builder.as_dataset(\"train+test[:30%]\")\n            self.assertIsInstance(dset, Dataset)\n            self.assertEqual(dset.split, \"train+test[:30%]\")\n            self.assertEqual(len(dset), 13)\n            self.assertDictEqual(dset.features, Features({\"text\": Value(\"string\"), \"tokens\": List(Value(\"string\"))}))\n            self.assertListEqual(dset.column_names, [\"text\", \"tokens\"])\n            del dset\n\n            dset = builder.as_dataset(\"all\")\n            self.assertIsInstance(dset, Dataset)\n            self.assertEqual(dset.split, \"train+test\")\n            self.assertEqual(len(dset), 20)\n            self.assertDictEqual(dset.features, Features({\"text\": Value(\"string\"), \"tokens\": List(Value(\"string\"))}))\n            self.assertListEqual(dset.column_names, [\"text\", \"tokens\"])\n            del dset\n\n        def _post_process(self, dataset, resources_paths):\n            return dataset.select([0, 1], keep_in_memory=True)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            builder = DummyBuilder(cache_dir=tmp_dir)\n            builder._post_process = types.MethodType(_post_process, builder)\n            os.makedirs(builder.cache_dir)\n\n            builder.info.splits = SplitDict()\n            builder.info.splits.add(SplitInfo(\"train\", 
num_examples=10))\n            builder.info.splits.add(SplitInfo(\"test\", num_examples=10))\n\n            for split in builder.info.splits:\n                with ArrowWriter(\n                    path=os.path.join(builder.cache_dir, f\"{builder.dataset_name}-{split}.arrow\"),\n                    features=Features({\"text\": Value(\"string\")}),\n                ) as writer:\n                    writer.write_batch({\"text\": [\"foo\"] * 10})\n                    writer.finalize()\n\n                with ArrowWriter(\n                    path=os.path.join(builder.cache_dir, f\"small_dataset-{split}.arrow\"),\n                    features=Features({\"text\": Value(\"string\")}),\n                ) as writer:\n                    writer.write_batch({\"text\": [\"foo\"] * 2})\n                    writer.finalize()\n\n            dsets = builder.as_dataset()\n            self.assertIsInstance(dsets, DatasetDict)\n            self.assertListEqual(list(dsets.keys()), [\"train\", \"test\"])\n            self.assertEqual(len(dsets[\"train\"]), 2)\n            self.assertEqual(len(dsets[\"test\"]), 2)\n            self.assertDictEqual(dsets[\"train\"].features, Features({\"text\": Value(\"string\")}))\n            self.assertDictEqual(dsets[\"test\"].features, Features({\"text\": Value(\"string\")}))\n            self.assertListEqual(dsets[\"train\"].column_names, [\"text\"])\n            self.assertListEqual(dsets[\"test\"].column_names, [\"text\"])\n            del dsets\n\n            dset = builder.as_dataset(\"train\")\n            self.assertIsInstance(dset, Dataset)\n            self.assertEqual(dset.split, \"train\")\n            self.assertEqual(len(dset), 2)\n            self.assertDictEqual(dset.features, Features({\"text\": Value(\"string\")}))\n            self.assertListEqual(dset.column_names, [\"text\"])\n            del dset\n\n            dset = builder.as_dataset(\"train+test[:30%]\")\n            self.assertIsInstance(dset, Dataset)\n            
self.assertEqual(dset.split, \"train+test[:30%]\")\n            self.assertEqual(len(dset), 2)\n            self.assertDictEqual(dset.features, Features({\"text\": Value(\"string\")}))\n            self.assertListEqual(dset.column_names, [\"text\"])\n            del dset\n\n    @require_faiss\n    def test_as_dataset_with_post_process_with_index(self):\n        def _post_process(self, dataset, resources_paths):\n            if os.path.exists(resources_paths[\"index\"]):\n                dataset.load_faiss_index(\"my_index\", resources_paths[\"index\"])\n                return dataset\n            else:\n                dataset.add_faiss_index_from_external_arrays(\n                    external_arrays=np.ones((len(dataset), 8)), string_factory=\"Flat\", index_name=\"my_index\"\n                )\n                dataset.save_faiss_index(\"my_index\", resources_paths[\"index\"])\n                return dataset\n\n        def _post_processing_resources(self, split):\n            return {\"index\": f\"Flat-{split}.faiss\"}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            builder = DummyBuilder(cache_dir=tmp_dir)\n            builder._post_process = types.MethodType(_post_process, builder)\n            builder._post_processing_resources = types.MethodType(_post_processing_resources, builder)\n            os.makedirs(builder.cache_dir)\n\n            builder.info.splits = SplitDict()\n            builder.info.splits.add(SplitInfo(\"train\", num_examples=10))\n            builder.info.splits.add(SplitInfo(\"test\", num_examples=10))\n\n            for split in builder.info.splits:\n                with ArrowWriter(\n                    path=os.path.join(builder.cache_dir, f\"{builder.dataset_name}-{split}.arrow\"),\n                    features=Features({\"text\": Value(\"string\")}),\n                ) as writer:\n                    writer.write_batch({\"text\": [\"foo\"] * 10})\n                    writer.finalize()\n\n                with 
ArrowWriter(\n                    path=os.path.join(builder.cache_dir, f\"small_dataset-{split}.arrow\"),\n                    features=Features({\"text\": Value(\"string\")}),\n                ) as writer:\n                    writer.write_batch({\"text\": [\"foo\"] * 2})\n                    writer.finalize()\n\n            dsets = builder.as_dataset()\n            self.assertIsInstance(dsets, DatasetDict)\n            self.assertListEqual(list(dsets.keys()), [\"train\", \"test\"])\n            self.assertEqual(len(dsets[\"train\"]), 10)\n            self.assertEqual(len(dsets[\"test\"]), 10)\n            self.assertDictEqual(dsets[\"train\"].features, Features({\"text\": Value(\"string\")}))\n            self.assertDictEqual(dsets[\"test\"].features, Features({\"text\": Value(\"string\")}))\n            self.assertListEqual(dsets[\"train\"].column_names, [\"text\"])\n            self.assertListEqual(dsets[\"test\"].column_names, [\"text\"])\n            self.assertListEqual(dsets[\"train\"].list_indexes(), [\"my_index\"])\n            self.assertListEqual(dsets[\"test\"].list_indexes(), [\"my_index\"])\n            self.assertGreater(builder.info.post_processing_size, 0)\n            self.assertGreater(builder.info.post_processed.resources_checksums[\"train\"][\"index\"][\"num_bytes\"], 0)\n            del dsets\n\n            dset = builder.as_dataset(\"train\")\n            self.assertIsInstance(dset, Dataset)\n            self.assertEqual(dset.split, \"train\")\n            self.assertEqual(len(dset), 10)\n            self.assertDictEqual(dset.features, Features({\"text\": Value(\"string\")}))\n            self.assertListEqual(dset.column_names, [\"text\"])\n            self.assertListEqual(dset.list_indexes(), [\"my_index\"])\n            del dset\n\n            dset = builder.as_dataset(\"train+test[:30%]\")\n            self.assertIsInstance(dset, Dataset)\n            self.assertEqual(dset.split, \"train+test[:30%]\")\n            
self.assertEqual(len(dset), 13)\n            self.assertDictEqual(dset.features, Features({\"text\": Value(\"string\")}))\n            self.assertListEqual(dset.column_names, [\"text\"])\n            self.assertListEqual(dset.list_indexes(), [\"my_index\"])\n            del dset\n\n    def test_download_and_prepare_with_post_process(self):\n        def _post_process(self, dataset, resources_paths):\n            def char_tokenize(example):\n                return {\"tokens\": list(example[\"text\"])}\n\n            return dataset.map(char_tokenize, cache_file_name=resources_paths[\"tokenized_dataset\"])\n\n        def _post_processing_resources(self, split):\n            return {\"tokenized_dataset\": f\"tokenized_dataset-{split}.arrow\"}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            builder = DummyBuilder(cache_dir=tmp_dir)\n            builder.info.post_processed = PostProcessedInfo(\n                features=Features({\"text\": Value(\"string\"), \"tokens\": List(Value(\"string\"))})\n            )\n            builder._post_process = types.MethodType(_post_process, builder)\n            builder._post_processing_resources = types.MethodType(_post_processing_resources, builder)\n            builder.download_and_prepare(download_mode=DownloadMode.FORCE_REDOWNLOAD)\n            self.assertTrue(\n                os.path.exists(\n                    os.path.join(\n                        tmp_dir, builder.dataset_name, \"default\", \"0.0.0\", f\"{builder.dataset_name}-train.arrow\"\n                    )\n                )\n            )\n            self.assertDictEqual(builder.info.features, Features({\"text\": Value(\"string\")}))\n            self.assertDictEqual(\n                builder.info.post_processed.features,\n                Features({\"text\": Value(\"string\"), \"tokens\": List(Value(\"string\"))}),\n            )\n            self.assertEqual(builder.info.splits[\"train\"].num_examples, 100)\n            self.assertTrue(\n      
          os.path.exists(os.path.join(tmp_dir, builder.dataset_name, \"default\", \"0.0.0\", \"dataset_info.json\"))\n            )\n\n        def _post_process(self, dataset, resources_paths):\n            return dataset.select([0, 1], keep_in_memory=True)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            builder = DummyBuilder(cache_dir=tmp_dir)\n            builder._post_process = types.MethodType(_post_process, builder)\n            builder.download_and_prepare(download_mode=DownloadMode.FORCE_REDOWNLOAD)\n            self.assertTrue(\n                os.path.exists(\n                    os.path.join(\n                        tmp_dir, builder.dataset_name, \"default\", \"0.0.0\", f\"{builder.dataset_name}-train.arrow\"\n                    )\n                )\n            )\n            self.assertDictEqual(builder.info.features, Features({\"text\": Value(\"string\")}))\n            self.assertIsNone(builder.info.post_processed)\n            self.assertEqual(builder.info.splits[\"train\"].num_examples, 100)\n            self.assertTrue(\n                os.path.exists(os.path.join(tmp_dir, builder.dataset_name, \"default\", \"0.0.0\", \"dataset_info.json\"))\n            )\n\n        def _post_process(self, dataset, resources_paths):\n            if os.path.exists(resources_paths[\"index\"]):\n                dataset.load_faiss_index(\"my_index\", resources_paths[\"index\"])\n                return dataset\n            else:\n                dataset = dataset.add_faiss_index_from_external_arrays(\n                    external_arrays=np.ones((len(dataset), 8)), string_factory=\"Flat\", index_name=\"my_index\"\n                )\n                dataset.save_faiss_index(\"my_index\", resources_paths[\"index\"])\n                return dataset\n\n        def _post_processing_resources(self, split):\n            return {\"index\": f\"Flat-{split}.faiss\"}\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            builder = 
DummyBuilder(cache_dir=tmp_dir)\n            builder._post_process = types.MethodType(_post_process, builder)\n            builder._post_processing_resources = types.MethodType(_post_processing_resources, builder)\n            builder.download_and_prepare(download_mode=DownloadMode.FORCE_REDOWNLOAD)\n            self.assertTrue(\n                os.path.exists(\n                    os.path.join(\n                        tmp_dir, builder.dataset_name, \"default\", \"0.0.0\", f\"{builder.dataset_name}-train.arrow\"\n                    )\n                )\n            )\n            self.assertDictEqual(builder.info.features, Features({\"text\": Value(\"string\")}))\n            self.assertIsNone(builder.info.post_processed)\n            self.assertEqual(builder.info.splits[\"train\"].num_examples, 100)\n            self.assertTrue(\n                os.path.exists(os.path.join(tmp_dir, builder.dataset_name, \"default\", \"0.0.0\", \"dataset_info.json\"))\n            )\n\n    def test_error_download_and_prepare(self):\n        def _prepare_split(self, split_generator, **kwargs):\n            raise ValueError()\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            builder = DummyBuilder(cache_dir=tmp_dir)\n            builder._prepare_split = types.MethodType(_prepare_split, builder)\n            self.assertRaises(\n                ValueError,\n                builder.download_and_prepare,\n                download_mode=DownloadMode.FORCE_REDOWNLOAD,\n            )\n            self.assertRaises(FileNotFoundError, builder.as_dataset)\n\n    def test_generator_based_download_and_prepare(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir)\n            builder.download_and_prepare(download_mode=DownloadMode.FORCE_REDOWNLOAD)\n            self.assertTrue(\n                os.path.exists(\n                    os.path.join(\n                        tmp_dir,\n                    
    builder.dataset_name,\n                        \"default\",\n                        \"0.0.0\",\n                        f\"{builder.dataset_name}-train.arrow\",\n                    )\n                )\n            )\n            self.assertDictEqual(builder.info.features, Features({\"text\": Value(\"string\")}))\n            self.assertEqual(builder.info.splits[\"train\"].num_examples, 100)\n            self.assertTrue(\n                os.path.exists(os.path.join(tmp_dir, builder.dataset_name, \"default\", \"0.0.0\", \"dataset_info.json\"))\n            )\n\n    def test_cache_dir_no_args(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_dir=None, data_files=None)\n            relative_cache_dir_parts = Path(builder._relative_data_dir()).parts\n            self.assertTupleEqual(relative_cache_dir_parts, (builder.dataset_name, \"default\", \"0.0.0\"))\n\n    def test_cache_dir_for_data_files(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dummy_data1 = os.path.join(tmp_dir, \"dummy_data1.txt\")\n            with open(dummy_data1, \"w\", encoding=\"utf-8\") as f:\n                f.writelines(\"foo bar\")\n            dummy_data2 = os.path.join(tmp_dir, \"dummy_data2.txt\")\n            with open(dummy_data2, \"w\", encoding=\"utf-8\") as f:\n                f.writelines(\"foo bar\\n\")\n\n            builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files=dummy_data1)\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files=dummy_data1)\n            self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files=[dummy_data1])\n            self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files={\"train\": dummy_data1})\n            
self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files={Split.TRAIN: dummy_data1})\n            self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files={\"train\": [dummy_data1]})\n            self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files={\"test\": dummy_data1})\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files=dummy_data2)\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files=[dummy_data2])\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files=[dummy_data1, dummy_data2])\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n\n            builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files=[dummy_data1, dummy_data2])\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files=[dummy_data1, dummy_data2])\n            self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, data_files=[dummy_data2, dummy_data1])\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n\n            builder = DummyGeneratorBasedBuilder(\n                cache_dir=tmp_dir, data_files={\"train\": dummy_data1, \"test\": dummy_data2}\n            )\n            other_builder = DummyGeneratorBasedBuilder(\n                cache_dir=tmp_dir, data_files={\"train\": dummy_data1, \"test\": dummy_data2}\n            )\n          
  self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(\n                cache_dir=tmp_dir, data_files={\"train\": [dummy_data1], \"test\": dummy_data2}\n            )\n            self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(\n                cache_dir=tmp_dir, data_files={\"train\": dummy_data1, \"validation\": dummy_data2}\n            )\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(\n                cache_dir=tmp_dir,\n                data_files={\"train\": [dummy_data1, dummy_data2], \"test\": dummy_data2},\n            )\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n\n    def test_cache_dir_for_features(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            f1 = Features({\"id\": Value(\"int8\")})\n            f2 = Features({\"id\": Value(\"int32\")})\n            builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, features=f1)\n            other_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, features=f1)\n            self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilderWithIntegers(cache_dir=tmp_dir, features=f2)\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n\n    def test_cache_dir_for_config_kwargs(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            # create config on the fly\n            builder = DummyGeneratorBasedBuilderWithConfig(cache_dir=tmp_dir, content=\"foo\", times=2)\n            other_builder = DummyGeneratorBasedBuilderWithConfig(cache_dir=tmp_dir, times=2, content=\"foo\")\n            self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            self.assertIn(\"content=foo\", builder.cache_dir)\n          
  self.assertIn(\"times=2\", builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilderWithConfig(cache_dir=tmp_dir, content=\"bar\", times=2)\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilderWithConfig(cache_dir=tmp_dir, content=\"foo\")\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            # overwrite an existing config\n            builder = DummyBuilderWithMultipleConfigs(cache_dir=tmp_dir, config_name=\"a\", content=\"foo\", times=2)\n            other_builder = DummyBuilderWithMultipleConfigs(cache_dir=tmp_dir, config_name=\"a\", times=2, content=\"foo\")\n            self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            self.assertIn(\"content=foo\", builder.cache_dir)\n            self.assertIn(\"times=2\", builder.cache_dir)\n            other_builder = DummyBuilderWithMultipleConfigs(cache_dir=tmp_dir, config_name=\"a\", content=\"bar\", times=2)\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyBuilderWithMultipleConfigs(cache_dir=tmp_dir, config_name=\"a\", content=\"foo\")\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n\n    def test_config_names(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with self.assertRaises(ValueError) as error_context:\n                DummyBuilderWithMultipleConfigs(cache_dir=tmp_dir, data_files=None, data_dir=None)\n            self.assertIn(\"Please pick one among the available configs\", str(error_context.exception))\n\n            builder = DummyBuilderWithMultipleConfigs(cache_dir=tmp_dir, config_name=\"a\")\n            self.assertEqual(builder.config.name, \"a\")\n\n            builder = DummyBuilderWithMultipleConfigs(cache_dir=tmp_dir, config_name=\"b\")\n            
self.assertEqual(builder.config.name, \"b\")\n\n            with self.assertRaises(ValueError):\n                DummyBuilderWithMultipleConfigs(cache_dir=tmp_dir)\n\n            builder = DummyBuilderWithDefaultConfig(cache_dir=tmp_dir)\n            self.assertEqual(builder.config.name, \"a\")\n\n    def test_cache_dir_for_data_dir(self):\n        with tempfile.TemporaryDirectory() as tmp_dir, tempfile.TemporaryDirectory() as data_dir:\n            builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, config_name=\"a\", data_dir=data_dir)\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, config_name=\"a\", data_dir=data_dir)\n            self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = DummyGeneratorBasedBuilder(cache_dir=tmp_dir, config_name=\"a\", data_dir=tmp_dir)\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n\n    def test_cache_dir_for_configured_builder(self):\n        with tempfile.TemporaryDirectory() as tmp_dir, tempfile.TemporaryDirectory() as data_dir:\n            builder_cls = configure_builder_class(\n                DummyGeneratorBasedBuilder,\n                builder_configs=[BuilderConfig(data_dir=data_dir)],\n                default_config_name=None,\n                dataset_name=\"dummy\",\n            )\n            builder = builder_cls(cache_dir=tmp_dir, hash=\"abc\")\n            other_builder = builder_cls(cache_dir=tmp_dir, hash=\"abc\")\n            self.assertEqual(builder.cache_dir, other_builder.cache_dir)\n            other_builder = builder_cls(cache_dir=tmp_dir, hash=\"def\")\n            self.assertNotEqual(builder.cache_dir, other_builder.cache_dir)\n\n\ndef test_config_raises_when_invalid_name() -> None:\n    with pytest.raises(InvalidConfigName, match=\"Bad characters\"):\n        _ = BuilderConfig(name=\"name-with-*-invalid-character\")\n\n\n@pytest.mark.parametrize(\"data_files\", [\"str_path\", [\"str_path\"], 
DataFilesList([\"str_path\"], [()])])\ndef test_config_raises_when_invalid_data_files(data_files) -> None:\n    with pytest.raises(ValueError, match=\"Expected a DataFilesDict\"):\n        _ = BuilderConfig(name=\"name\", data_files=data_files)\n\n\ndef test_arrow_based_download_and_prepare(tmp_path):\n    builder = DummyArrowBasedBuilder(cache_dir=tmp_path)\n    builder.download_and_prepare()\n    assert os.path.exists(\n        os.path.join(\n            tmp_path,\n            builder.dataset_name,\n            \"default\",\n            \"0.0.0\",\n            f\"{builder.dataset_name}-train.arrow\",\n        )\n    )\n    assert builder.info.features == Features({\"text\": Value(\"string\")})\n    assert builder.info.splits[\"train\"].num_examples == 100\n    assert os.path.exists(os.path.join(tmp_path, builder.dataset_name, \"default\", \"0.0.0\", \"dataset_info.json\"))\n\n\n@pytest.mark.parametrize(\n    \"split, expected_dataset_class, expected_dataset_length\",\n    [\n        (None, DatasetDict, 10),\n        (\"train\", Dataset, 10),\n        (\"train+test[:30%]\", Dataset, 13),\n    ],\n)\n@pytest.mark.parametrize(\"in_memory\", [False, True])\ndef test_builder_as_dataset(split, expected_dataset_class, expected_dataset_length, in_memory, tmp_path):\n    cache_dir = str(tmp_path)\n    builder = DummyBuilder(cache_dir=cache_dir)\n    os.makedirs(builder.cache_dir)\n\n    builder.info.splits = SplitDict()\n    builder.info.splits.add(SplitInfo(\"train\", num_examples=10))\n    builder.info.splits.add(SplitInfo(\"test\", num_examples=10))\n\n    for info_split in builder.info.splits:\n        with ArrowWriter(\n            path=os.path.join(builder.cache_dir, f\"{builder.dataset_name}-{info_split}.arrow\"),\n            features=Features({\"text\": Value(\"string\")}),\n        ) as writer:\n            writer.write_batch({\"text\": [\"foo\"] * 10})\n            writer.finalize()\n\n    with assert_arrow_memory_increases() if in_memory else 
assert_arrow_memory_doesnt_increase():\n        dataset = builder.as_dataset(split=split, in_memory=in_memory)\n    assert isinstance(dataset, expected_dataset_class)\n    if isinstance(dataset, DatasetDict):\n        assert list(dataset.keys()) == [\"train\", \"test\"]\n        datasets = dataset.values()\n        expected_splits = [\"train\", \"test\"]\n    elif isinstance(dataset, Dataset):\n        datasets = [dataset]\n        expected_splits = [split]\n    for dataset, expected_split in zip(datasets, expected_splits):\n        assert dataset.split == expected_split\n        assert len(dataset) == expected_dataset_length\n        assert dataset.features == Features({\"text\": Value(\"string\")})\n        assert dataset.column_names == [\"text\"]\n\n\n@pytest.mark.parametrize(\"in_memory\", [False, True])\ndef test_generator_based_builder_as_dataset(in_memory, tmp_path):\n    cache_dir = tmp_path / \"data\"\n    cache_dir.mkdir()\n    cache_dir = str(cache_dir)\n    builder = DummyGeneratorBasedBuilder(cache_dir=cache_dir)\n    builder.download_and_prepare(download_mode=DownloadMode.FORCE_REDOWNLOAD)\n    with assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = builder.as_dataset(\"train\", in_memory=in_memory)\n    assert dataset.data.to_pydict() == {\"text\": [\"foo\"] * 100}\n\n\n@pytest.mark.parametrize(\n    \"writer_batch_size, default_writer_batch_size, expected_chunks\", [(None, None, 1), (None, 5, 20), (10, None, 10)]\n)\ndef test_custom_writer_batch_size(tmp_path, writer_batch_size, default_writer_batch_size, expected_chunks):\n    cache_dir = str(tmp_path)\n    if default_writer_batch_size:\n        DummyGeneratorBasedBuilder.DEFAULT_WRITER_BATCH_SIZE = default_writer_batch_size\n    builder = DummyGeneratorBasedBuilder(cache_dir=cache_dir, writer_batch_size=writer_batch_size)\n    assert builder._writer_batch_size == (writer_batch_size or default_writer_batch_size)\n    
builder.download_and_prepare(download_mode=DownloadMode.FORCE_REDOWNLOAD)\n    dataset = builder.as_dataset(\"train\")\n    assert len(dataset.data[0].chunks) == expected_chunks\n\n\ndef test_builder_as_streaming_dataset(tmp_path):\n    dummy_builder = DummyGeneratorBasedBuilder(cache_dir=str(tmp_path))\n    check_streaming(dummy_builder)\n    dsets = dummy_builder.as_streaming_dataset()\n    assert isinstance(dsets, IterableDatasetDict)\n    assert isinstance(dsets[\"train\"], IterableDataset)\n    assert len(list(dsets[\"train\"])) == 100\n    dset = dummy_builder.as_streaming_dataset(split=\"train\")\n    assert isinstance(dset, IterableDataset)\n    assert len(list(dset)) == 100\n\n\ndef _run_test_builder_streaming_works_in_subprocesses(builder):\n    check_streaming(builder)\n    dset = builder.as_streaming_dataset(split=\"train\")\n    assert isinstance(dset, IterableDataset)\n    assert len(list(dset)) == 100\n\n\ndef test_builder_streaming_works_in_subprocess(tmp_path):\n    dummy_builder = DummyGeneratorBasedBuilder(cache_dir=str(tmp_path))\n    p = Process(target=_run_test_builder_streaming_works_in_subprocesses, args=(dummy_builder,))\n    p.start()\n    p.join()\n\n\nclass DummyBuilderWithVersion(GeneratorBasedBuilder):\n    VERSION = \"2.0.0\"\n\n    def _info(self):\n        return DatasetInfo(features=Features({\"text\": Value(\"string\")}))\n\n    def _split_generators(self, dl_manager):\n        pass\n\n    def _generate_examples(self):\n        pass\n\n\nclass DummyBuilderWithBuilderConfigs(GeneratorBasedBuilder):\n    BUILDER_CONFIGS = [BuilderConfig(name=\"custom\", version=\"2.0.0\")]\n\n    def _info(self):\n        return DatasetInfo(features=Features({\"text\": Value(\"string\")}))\n\n    def _split_generators(self, dl_manager):\n        pass\n\n    def _generate_examples(self):\n        pass\n\n\nclass CustomBuilderConfig(BuilderConfig):\n    def __init__(self, date=None, language=None, version=\"2.0.0\", **kwargs):\n        name = 
f\"{date}.{language}\"\n        super().__init__(name=name, version=version, **kwargs)\n        self.date = date\n        self.language = language\n\n\nclass DummyBuilderWithCustomBuilderConfigs(GeneratorBasedBuilder):\n    BUILDER_CONFIGS = [CustomBuilderConfig(date=\"20220501\", language=\"en\")]\n    BUILDER_CONFIG_CLASS = CustomBuilderConfig\n\n    def _info(self):\n        return DatasetInfo(features=Features({\"text\": Value(\"string\")}))\n\n    def _split_generators(self, dl_manager):\n        pass\n\n    def _generate_examples(self):\n        pass\n\n\n@pytest.mark.parametrize(\n    \"builder_class, kwargs\",\n    [\n        (DummyBuilderWithVersion, {}),\n        (DummyBuilderWithBuilderConfigs, {\"config_name\": \"custom\"}),\n        (DummyBuilderWithCustomBuilderConfigs, {\"config_name\": \"20220501.en\"}),\n        (DummyBuilderWithCustomBuilderConfigs, {\"date\": \"20220501\", \"language\": \"ca\"}),\n    ],\n)\ndef test_builder_config_version(builder_class, kwargs, tmp_path):\n    cache_dir = str(tmp_path)\n    builder = builder_class(cache_dir=cache_dir, **kwargs)\n    assert builder.config.version == \"2.0.0\"\n\n\ndef test_builder_download_and_prepare_with_absolute_output_dir(tmp_path):\n    builder = DummyGeneratorBasedBuilder()\n    output_dir = str(tmp_path)\n    builder.download_and_prepare(output_dir)\n    assert builder._output_dir.startswith(tmp_path.resolve().as_posix())\n    assert os.path.exists(os.path.join(output_dir, \"dataset_info.json\"))\n    assert os.path.exists(os.path.join(output_dir, f\"{builder.dataset_name}-train.arrow\"))\n    assert not os.path.exists(os.path.join(output_dir + \".incomplete\"))\n\n\ndef test_builder_download_and_prepare_with_relative_output_dir():\n    with set_current_working_directory_to_temp_dir():\n        builder = DummyGeneratorBasedBuilder()\n        output_dir = \"test-out\"\n        builder.download_and_prepare(output_dir)\n        assert 
Path(builder._output_dir).resolve().as_posix().startswith(Path(output_dir).resolve().as_posix())\n        assert os.path.exists(os.path.join(output_dir, \"dataset_info.json\"))\n        assert os.path.exists(os.path.join(output_dir, f\"{builder.dataset_name}-train.arrow\"))\n        assert not os.path.exists(os.path.join(output_dir + \".incomplete\"))\n\n\ndef test_builder_with_filesystem_download_and_prepare(tmp_path, mockfs):\n    builder = DummyGeneratorBasedBuilder(cache_dir=tmp_path)\n    builder.download_and_prepare(\"mock://my_dataset\", storage_options=mockfs.storage_options)\n    assert builder._output_dir.startswith(\"mock://my_dataset\")\n    assert is_local_path(builder._cache_downloaded_dir)\n    assert isinstance(builder._fs, type(mockfs))\n    assert builder._fs.storage_options == mockfs.storage_options\n    assert mockfs.exists(\"my_dataset/dataset_info.json\")\n    assert mockfs.exists(f\"my_dataset/{builder.dataset_name}-train.arrow\")\n    assert not mockfs.exists(\"my_dataset.incomplete\")\n\n\ndef test_builder_with_filesystem_download_and_prepare_reload(tmp_path, mockfs, caplog):\n    builder = DummyGeneratorBasedBuilder(cache_dir=tmp_path)\n    mockfs.makedirs(\"my_dataset\")\n    DatasetInfo().write_to_directory(\"mock://my_dataset\", storage_options=mockfs.storage_options)\n    mockfs.touch(f\"my_dataset/{builder.dataset_name}-train.arrow\")\n    caplog.clear()\n    with caplog.at_level(INFO, logger=get_logger().name):\n        builder.download_and_prepare(\"mock://my_dataset\", storage_options=mockfs.storage_options)\n    assert \"Found cached dataset\" in caplog.text\n\n\ndef test_generator_based_builder_download_and_prepare_as_parquet(tmp_path):\n    builder = DummyGeneratorBasedBuilder(cache_dir=tmp_path)\n    builder.download_and_prepare(file_format=\"parquet\")\n    assert builder.info.splits[\"train\"].num_examples == 100\n    parquet_path = os.path.join(\n        tmp_path, builder.dataset_name, \"default\", \"0.0.0\", 
f\"{builder.dataset_name}-train.parquet\"\n    )\n    assert os.path.exists(parquet_path)\n    assert pq.ParquetFile(parquet_path) is not None\n\n\ndef test_generator_based_builder_download_and_prepare_sharded(tmp_path):\n    writer_batch_size = 25\n    builder = DummyGeneratorBasedBuilder(cache_dir=tmp_path, writer_batch_size=writer_batch_size)\n    with patch(\"datasets.config.MAX_SHARD_SIZE\", 1):  # one batch per shard\n        builder.download_and_prepare(file_format=\"parquet\")\n    expected_num_shards = 100 // writer_batch_size\n    assert builder.info.splits[\"train\"].num_examples == 100\n    assert builder.info.splits[\"train\"].shard_lengths == [25] * 4\n    assert builder.info.splits[\"train\"].original_shard_lengths is None\n    parquet_path = os.path.join(\n        tmp_path,\n        builder.dataset_name,\n        \"default\",\n        \"0.0.0\",\n        f\"{builder.dataset_name}-train-00000-of-{expected_num_shards:05d}.parquet\",\n    )\n    assert os.path.exists(parquet_path)\n    parquet_files = [\n        pq.ParquetFile(parquet_path)\n        for parquet_path in Path(tmp_path).rglob(\n            f\"{builder.dataset_name}-train-*-of-{expected_num_shards:05d}.parquet\"\n        )\n    ]\n    assert len(parquet_files) == expected_num_shards\n    assert sum(parquet_file.metadata.num_rows for parquet_file in parquet_files) == 100\n\n\ndef test_generator_based_builder_download_and_prepare_with_max_shard_size(tmp_path):\n    writer_batch_size = 25\n    builder = DummyGeneratorBasedBuilder(cache_dir=tmp_path, writer_batch_size=writer_batch_size)\n    builder.download_and_prepare(file_format=\"parquet\", max_shard_size=1)  # one batch per shard\n    expected_num_shards = 100 // writer_batch_size\n    assert builder.info.splits[\"train\"].num_examples == 100\n    parquet_path = os.path.join(\n        tmp_path,\n        builder.dataset_name,\n        \"default\",\n        \"0.0.0\",\n        
f\"{builder.dataset_name}-train-00000-of-{expected_num_shards:05d}.parquet\",\n    )\n    assert os.path.exists(parquet_path)\n    parquet_files = [\n        pq.ParquetFile(parquet_path)\n        for parquet_path in Path(tmp_path).rglob(\n            f\"{builder.dataset_name}-train-*-of-{expected_num_shards:05d}.parquet\"\n        )\n    ]\n    assert len(parquet_files) == expected_num_shards\n    assert sum(parquet_file.metadata.num_rows for parquet_file in parquet_files) == 100\n\n\ndef test_generator_based_builder_download_and_prepare_with_num_proc(tmp_path):\n    builder = DummyGeneratorBasedBuilderWithShards(cache_dir=tmp_path)\n    builder.download_and_prepare(num_proc=2)\n    expected_num_shards = 2\n    assert builder.info.splits[\"train\"].num_examples == 400\n    assert builder.info.splits[\"train\"].shard_lengths == [200, 200]\n    assert builder.info.splits[\"train\"].original_shard_lengths == [100] * 4\n    arrow_path = os.path.join(\n        tmp_path,\n        builder.dataset_name,\n        \"default\",\n        \"0.0.0\",\n        f\"{builder.dataset_name}-train-00000-of-{expected_num_shards:05d}.arrow\",\n    )\n    assert os.path.exists(arrow_path)\n    ds = builder.as_dataset(\"train\")\n    assert len(ds) == 400\n    assert ds.to_dict() == {\n        \"id\": [i for _ in range(4) for i in range(100)],\n        \"filepath\": [f\"data{i}.txt\" for i in range(4) for _ in range(100)],\n    }\n\n\n@pytest.mark.parametrize(\n    \"num_proc, expectation\", [(None, does_not_raise()), (1, does_not_raise()), (2, pytest.raises(RuntimeError))]\n)\ndef test_generator_based_builder_download_and_prepare_with_ambiguous_shards(num_proc, expectation, tmp_path):\n    builder = DummyGeneratorBasedBuilderWithAmbiguousShards(cache_dir=tmp_path)\n    with expectation:\n        builder.download_and_prepare(num_proc=num_proc)\n\n\ndef test_arrow_based_builder_download_and_prepare_as_parquet(tmp_path):\n    builder = DummyArrowBasedBuilder(cache_dir=tmp_path)\n    
builder.download_and_prepare(file_format=\"parquet\")\n    assert builder.info.splits[\"train\"].num_examples == 100\n    parquet_path = os.path.join(\n        tmp_path, builder.dataset_name, \"default\", \"0.0.0\", f\"{builder.dataset_name}-train.parquet\"\n    )\n    assert os.path.exists(parquet_path)\n    assert pq.ParquetFile(parquet_path) is not None\n\n\ndef test_arrow_based_builder_download_and_prepare_sharded(tmp_path):\n    builder = DummyArrowBasedBuilder(cache_dir=tmp_path)\n    with patch(\"datasets.config.MAX_SHARD_SIZE\", 1):  # one batch per shard\n        builder.download_and_prepare(file_format=\"parquet\")\n    expected_num_shards = 10\n    assert builder.info.splits[\"train\"].num_examples == 100\n    assert builder.info.splits[\"train\"].shard_lengths == [10] * 10\n    assert builder.info.splits[\"train\"].original_shard_lengths is None\n    parquet_path = os.path.join(\n        tmp_path,\n        builder.dataset_name,\n        \"default\",\n        \"0.0.0\",\n        f\"{builder.dataset_name}-train-00000-of-{expected_num_shards:05d}.parquet\",\n    )\n    assert os.path.exists(parquet_path)\n    parquet_files = [\n        pq.ParquetFile(parquet_path)\n        for parquet_path in Path(tmp_path).rglob(\n            f\"{builder.dataset_name}-train-*-of-{expected_num_shards:05d}.parquet\"\n        )\n    ]\n    assert len(parquet_files) == expected_num_shards\n    assert sum(parquet_file.metadata.num_rows for parquet_file in parquet_files) == 100\n\n\ndef test_arrow_based_builder_download_and_prepare_with_max_shard_size(tmp_path):\n    builder = DummyArrowBasedBuilder(cache_dir=tmp_path)\n    builder.download_and_prepare(file_format=\"parquet\", max_shard_size=1)  # one table per shard\n    expected_num_shards = 10\n    assert builder.info.splits[\"train\"].num_examples == 100\n    parquet_path = os.path.join(\n        tmp_path,\n        builder.dataset_name,\n        \"default\",\n        \"0.0.0\",\n        
f\"{builder.dataset_name}-train-00000-of-{expected_num_shards:05d}.parquet\",\n    )\n    assert os.path.exists(parquet_path)\n    parquet_files = [\n        pq.ParquetFile(parquet_path)\n        for parquet_path in Path(tmp_path).rglob(\n            f\"{builder.dataset_name}-train-*-of-{expected_num_shards:05d}.parquet\"\n        )\n    ]\n    assert len(parquet_files) == expected_num_shards\n    assert sum(parquet_file.metadata.num_rows for parquet_file in parquet_files) == 100\n\n\ndef test_arrow_based_builder_download_and_prepare_with_num_proc(tmp_path):\n    builder = DummyArrowBasedBuilderWithShards(cache_dir=tmp_path)\n    builder.download_and_prepare(num_proc=2)\n    expected_num_shards = 2\n    assert builder.info.splits[\"train\"].num_examples == 400\n    assert builder.info.splits[\"train\"].shard_lengths == [200, 200]\n    assert builder.info.splits[\"train\"].original_shard_lengths == [100] * 4\n    arrow_path = os.path.join(\n        tmp_path,\n        builder.dataset_name,\n        \"default\",\n        \"0.0.0\",\n        f\"{builder.dataset_name}-train-00000-of-{expected_num_shards:05d}.arrow\",\n    )\n    assert os.path.exists(arrow_path)\n    ds = builder.as_dataset(\"train\")\n    assert len(ds) == 400\n    assert ds.to_dict() == {\n        \"id\": [i for _ in range(4) for i in range(100)],\n        \"filepath\": [f\"data{i}.txt\" for i in range(4) for _ in range(100)],\n    }\n\n\n@pytest.mark.parametrize(\n    \"num_proc, expectation\", [(None, does_not_raise()), (1, does_not_raise()), (2, pytest.raises(RuntimeError))]\n)\ndef test_arrow_based_builder_download_and_prepare_with_ambiguous_shards(num_proc, expectation, tmp_path):\n    builder = DummyArrowBasedBuilderWithAmbiguousShards(cache_dir=tmp_path)\n    with expectation:\n        builder.download_and_prepare(num_proc=num_proc)\n"
  },
  {
    "path": "tests/test_data_files.py",
    "content": "import copy\nimport os\nfrom pathlib import Path\nfrom typing import List\nfrom unittest.mock import patch\n\nimport fsspec\nimport pytest\nfrom fsspec.registry import _registry as _fsspec_registry\nfrom fsspec.spec import AbstractFileSystem\n\nfrom datasets.data_files import (\n    DataFilesDict,\n    DataFilesList,\n    DataFilesPatternsDict,\n    DataFilesPatternsList,\n    _get_data_files_patterns,\n    _is_inside_unrequested_special_dir,\n    _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir,\n    get_data_patterns,\n    resolve_pattern,\n)\nfrom datasets.fingerprint import Hasher\n\n\n_TEST_PATTERNS = [\"*\", \"**\", \"**/*\", \"*.txt\", \"data/*\", \"**/*.txt\", \"**/train.txt\"]\n_FILES_TO_IGNORE = {\".dummy\", \"README.md\", \"dummy_data.zip\", \"dataset_infos.json\"}\n_DIRS_TO_IGNORE = {\"data/.dummy_subdir\", \"__pycache__\"}\n_TEST_PATTERNS_SIZES = {\n    \"*\": 0,\n    \"**\": 4,\n    \"**/*\": 4,\n    \"*.txt\": 0,\n    \"data/*\": 2,\n    \"data/**\": 4,\n    \"**/*.txt\": 4,\n    \"**/train.txt\": 2,\n}\n\n_TEST_URL = \"https://raw.githubusercontent.com/huggingface/datasets/9675a5a1e7b99a86f9c250f6ea5fa5d1e6d5cc7d/setup.py\"\n\n\n@pytest.fixture\ndef complex_data_dir(tmp_path):\n    data_dir = tmp_path / \"complex_data_dir\"\n    data_dir.mkdir()\n\n    (data_dir / \"data\").mkdir()\n    with open(data_dir / \"data\" / \"train.txt\", \"w\") as f:\n        f.write(\"foo\\n\" * 10)\n    with open(data_dir / \"data\" / \"test.txt\", \"w\") as f:\n        f.write(\"bar\\n\" * 10)\n\n    with open(data_dir / \"README.md\", \"w\") as f:\n        f.write(\"This is a readme\")\n    with open(data_dir / \".dummy\", \"w\") as f:\n        f.write(\"this is a dummy file that is not a data file\")\n\n    (data_dir / \"data\" / \"subdir\").mkdir()\n    with open(data_dir / \"data\" / \"subdir\" / \"train.txt\", \"w\") as f:\n        f.write(\"foo\\n\" * 10)\n    with open(data_dir / \"data\" / \"subdir\" / \"test.txt\", \"w\") as 
f:\n        f.write(\"bar\\n\" * 10)\n\n    (data_dir / \"data\" / \".dummy_subdir\").mkdir()\n    with open(data_dir / \"data\" / \".dummy_subdir\" / \"train.txt\", \"w\") as f:\n        f.write(\"foo\\n\" * 10)\n    with open(data_dir / \"data\" / \".dummy_subdir\" / \"test.txt\", \"w\") as f:\n        f.write(\"bar\\n\" * 10)\n\n    (data_dir / \"__pycache__\").mkdir()\n    with open(data_dir / \"__pycache__\" / \"script.py\", \"w\") as f:\n        f.write(\"foo\\n\" * 10)\n\n    return str(data_dir)\n\n\ndef is_relative_to(path, *other):\n    # A built-in method in Python 3.9+\n    try:\n        path.relative_to(*other)\n        return True\n    except ValueError:\n        return False\n\n\n@pytest.fixture\ndef pattern_results(complex_data_dir):\n    # We use fsspec glob as a reference for data files resolution from patterns.\n    # This is the same as dask for example.\n    #\n    # /!\\ Here are some behaviors specific to fsspec glob that are different from glob.glob, Path.glob, Path.match or fnmatch:\n    # - '*' matches only first level items\n    # - '**' matches all items\n    # - '**/*' matches all at least second level items\n    #\n    # More generally:\n    # - '*' matches any character except a forward-slash (to match just the file or directory name)\n    # - '**' matches any character including a forward-slash /\n\n    return {\n        pattern: sorted(\n            Path(os.path.abspath(path)).as_posix()\n            for path in fsspec.filesystem(\"file\").glob(os.path.join(complex_data_dir, pattern))\n            if Path(path).name not in _FILES_TO_IGNORE\n            and not any(\n                is_relative_to(Path(path), os.path.join(complex_data_dir, dir_path)) for dir_path in _DIRS_TO_IGNORE\n            )\n            and Path(path).is_file()\n        )\n        for pattern in _TEST_PATTERNS\n    }\n\n\n@pytest.fixture\ndef hub_dataset_repo_path(tmpfs, complex_data_dir):\n    for path in Path(complex_data_dir).rglob(\"*\"):\n        if 
path.is_file():\n            with tmpfs.open(path.relative_to(complex_data_dir).as_posix(), \"wb\") as f:\n                f.write(path.read_bytes())\n    yield \"tmp://\"\n\n\n@pytest.fixture\ndef hub_dataset_repo_patterns_results(hub_dataset_repo_path, complex_data_dir, pattern_results):\n    return {\n        pattern: [\n            hub_dataset_repo_path + Path(path).relative_to(complex_data_dir).as_posix()\n            for path in pattern_results[pattern]\n        ]\n        for pattern in pattern_results\n    }\n\n\ndef test_is_inside_unrequested_special_dir(complex_data_dir, pattern_results):\n    # usual patterns outside special dir work fine\n    for pattern, result in pattern_results.items():\n        if result:\n            matched_rel_path = str(Path(result[0]).relative_to(complex_data_dir))\n            assert _is_inside_unrequested_special_dir(matched_rel_path, pattern) is False\n    # check behavior for special dir\n    f = _is_inside_unrequested_special_dir\n    assert f(\"__pycache__/b.txt\", \"**\") is True\n    assert f(\"__pycache__/b.txt\", \"*/b.txt\") is True\n    assert f(\"__pycache__/b.txt\", \"__pycache__/*\") is False\n    assert f(\"__pycache__/__b.txt\", \"__pycache__/*\") is False\n    assert f(\"__pycache__/__b.txt\", \"__*/*\") is False\n    assert f(\"__b.txt\", \"*\") is False\n\n\ndef test_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(complex_data_dir, pattern_results):\n    # usual patterns outside hidden dir work fine\n    for pattern, result in pattern_results.items():\n        if result:\n            matched_rel_path = str(Path(result[0]).relative_to(complex_data_dir))\n            assert _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path, pattern) is False\n    # check behavior for hidden dir and file\n    f = _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir\n    assert f(\".hidden_file.txt\", \"**\") is True\n    assert f(\".hidden_file.txt\", \".*\") is False\n    assert 
f(\".hidden_dir/a.txt\", \"**\") is True\n    assert f(\".hidden_dir/a.txt\", \".*/*\") is False\n    assert f(\".hidden_dir/a.txt\", \".hidden_dir/*\") is False\n    assert f(\".hidden_dir/.hidden_file.txt\", \"**\") is True\n    assert f(\".hidden_dir/.hidden_file.txt\", \".*/*\") is True\n    assert f(\".hidden_dir/.hidden_file.txt\", \".*/.*\") is False\n    assert f(\".hidden_dir/.hidden_file.txt\", \".hidden_dir/*\") is True\n    assert f(\".hidden_dir/.hidden_file.txt\", \".hidden_dir/.*\") is False\n\n\n@pytest.mark.parametrize(\"pattern\", _TEST_PATTERNS)\ndef test_pattern_results_fixture(pattern_results, pattern):\n    assert len(pattern_results[pattern]) == _TEST_PATTERNS_SIZES[pattern]\n    assert all(Path(path).is_file() for path in pattern_results[pattern])\n\n\n@pytest.mark.parametrize(\"pattern\", _TEST_PATTERNS)\ndef test_resolve_pattern_locally(complex_data_dir, pattern, pattern_results):\n    try:\n        resolved_data_files = resolve_pattern(pattern, complex_data_dir)\n        assert sorted(str(f) for f in resolved_data_files) == pattern_results[pattern]\n    except FileNotFoundError:\n        assert len(pattern_results[pattern]) == 0\n\n\ndef test_resolve_pattern_locally_with_dot_in_base_path(complex_data_dir):\n    base_path_with_dot = os.path.join(complex_data_dir, \"data\", \".dummy_subdir\")\n    resolved_data_files = resolve_pattern(os.path.join(base_path_with_dot, \"train.txt\"), base_path_with_dot)\n    assert len(resolved_data_files) == 1\n\n\n@pytest.mark.parametrize(\"archive_jsonl\", [\"tar_jsonl_path\", \"zip_jsonl_path\"])\ndef test_resolve_pattern_locally_prefixed_archive_glob(archive_jsonl, request):\n    archive_path = str(request.getfixturevalue(archive_jsonl))\n    protocol = \"tar\" if archive_path.endswith(\".tar\") else \"zip\"\n    resolved_data_files = resolve_pattern(f\"{protocol}://*::{archive_path}\", base_path=\"\")\n    assert sorted(os.path.basename(path.split(\"::\")[0]) for path in resolved_data_files) == [\n     
   \"dataset.jsonl\",\n        \"dataset2.jsonl\",\n    ]\n    assert all(path.endswith(f\"::{archive_path}\") for path in resolved_data_files)\n\n\ndef test_resolve_pattern_locally_with_absolute_path(tmp_path, complex_data_dir):\n    abs_path = os.path.join(complex_data_dir, \"data\", \"train.txt\")\n    resolved_data_files = resolve_pattern(abs_path, str(tmp_path / \"blabla\"))\n    assert len(resolved_data_files) == 1\n\n\ndef test_resolve_pattern_locally_with_double_dots(tmp_path, complex_data_dir):\n    path_with_double_dots = os.path.join(complex_data_dir, \"data\", \"subdir\", \"..\", \"train.txt\")\n    resolved_data_files = resolve_pattern(path_with_double_dots, str(tmp_path / \"blabla\"))\n    assert len(resolved_data_files) == 1\n\n\ndef test_resolve_pattern_locally_returns_hidden_file_only_if_requested(complex_data_dir):\n    with pytest.raises(FileNotFoundError):\n        resolve_pattern(\"*dummy\", complex_data_dir)\n    resolved_data_files = resolve_pattern(\".dummy\", complex_data_dir)\n    assert len(resolved_data_files) == 1\n\n\ndef test_resolve_pattern_locally_hidden_base_path(tmp_path):\n    hidden = tmp_path / \".test_hidden_base_path\"\n    hidden.mkdir()\n    (tmp_path / \".test_hidden_base_path\" / \"a.txt\").touch()\n    resolved_data_files = resolve_pattern(\"*\", str(hidden))\n    assert len(resolved_data_files) == 1\n\n\ndef test_resolve_pattern_locallyreturns_hidden_dir_only_if_requested(complex_data_dir):\n    with pytest.raises(FileNotFoundError):\n        resolve_pattern(\"data/*dummy_subdir/train.txt\", complex_data_dir)\n    resolved_data_files = resolve_pattern(\"data/.dummy_subdir/train.txt\", complex_data_dir)\n    assert len(resolved_data_files) == 1\n    resolved_data_files = resolve_pattern(\"*/.dummy_subdir/train.txt\", complex_data_dir)\n    assert len(resolved_data_files) == 1\n\n\ndef test_resolve_pattern_locally_returns_special_dir_only_if_requested(complex_data_dir):\n    with pytest.raises(FileNotFoundError):\n        
resolve_pattern(\"data/*dummy_subdir/train.txt\", complex_data_dir)\n    resolved_data_files = resolve_pattern(\"data/.dummy_subdir/train.txt\", complex_data_dir)\n    assert len(resolved_data_files) == 1\n    resolved_data_files = resolve_pattern(\"*/.dummy_subdir/train.txt\", complex_data_dir)\n    assert len(resolved_data_files) == 1\n\n\ndef test_resolve_pattern_locally_special_base_path(tmp_path):\n    special = tmp_path / \"__test_special_base_path__\"\n    special.mkdir()\n    (tmp_path / \"__test_special_base_path__\" / \"a.txt\").touch()\n    resolved_data_files = resolve_pattern(\"*\", str(special))\n    assert len(resolved_data_files) == 1\n\n\n@pytest.mark.parametrize(\"pattern,size,extensions\", [(\"**\", 4, [\".txt\"]), (\"**\", 4, None), (\"**\", 0, [\".blablabla\"])])\ndef test_resolve_pattern_locally_with_extensions(complex_data_dir, pattern, size, extensions):\n    if size > 0:\n        resolved_data_files = resolve_pattern(pattern, complex_data_dir, allowed_extensions=extensions)\n        assert len(resolved_data_files) == size\n    else:\n        with pytest.raises(FileNotFoundError):\n            resolve_pattern(pattern, complex_data_dir, allowed_extensions=extensions)\n\n\ndef test_fail_resolve_pattern_locally(complex_data_dir):\n    # a pattern that matches nothing under the base path must raise\n    with pytest.raises(FileNotFoundError):\n        resolve_pattern(\"blablabla\", complex_data_dir)\n\n\n@pytest.mark.skipif(os.name == \"nt\", reason=\"Windows does not support symlinks in the default mode\")\ndef test_resolve_pattern_locally_does_not_resolve_symbolic_links(tmp_path, complex_data_dir):\n    (tmp_path / \"train_data_symlink.txt\").symlink_to(os.path.join(complex_data_dir, \"data\", \"train.txt\"))\n    resolved_data_files = resolve_pattern(\"train_data_symlink.txt\", str(tmp_path))\n    assert len(resolved_data_files) == 1\n    assert Path(resolved_data_files[0]) == tmp_path / \"train_data_symlink.txt\"\n\n\ndef test_resolve_pattern_locally_sorted_files(tmp_path_factory):\n    path = 
str(tmp_path_factory.mktemp(\"unsorted_text_files\"))\n    unsorted_names = [\"0.txt\", \"2.txt\", \"3.txt\"]\n    for name in unsorted_names:\n        with open(os.path.join(path, name), \"w\"):\n            pass\n    resolved_data_files = resolve_pattern(\"*\", path)\n    resolved_names = [os.path.basename(data_file) for data_file in resolved_data_files]\n    assert resolved_names == sorted(unsorted_names)\n\n\n@pytest.mark.parametrize(\"pattern\", _TEST_PATTERNS)\ndef test_resolve_pattern_in_dataset_repository(hub_dataset_repo_path, pattern, hub_dataset_repo_patterns_results):\n    try:\n        resolved_data_files = resolve_pattern(pattern, hub_dataset_repo_path)\n        assert sorted(str(f) for f in resolved_data_files) == hub_dataset_repo_patterns_results[pattern]\n    except FileNotFoundError:\n        assert len(hub_dataset_repo_patterns_results[pattern]) == 0\n\n\n@pytest.mark.parametrize(\n    \"pattern,size,base_path\", [(\"**\", 4, None), (\"**\", 4, \"data\"), (\"**\", 2, \"data/subdir\"), (\"**\", 0, \"data/subdir2\")]\n)\ndef test_resolve_pattern_in_dataset_repository_with_base_path(hub_dataset_repo_path, pattern, size, base_path):\n    base_path = hub_dataset_repo_path + (base_path or \"\")\n    if size > 0:\n        resolved_data_files = resolve_pattern(pattern, base_path)\n        assert len(resolved_data_files) == size\n    else:\n        with pytest.raises(FileNotFoundError):\n            resolve_pattern(pattern, base_path)\n\n\n@pytest.mark.parametrize(\"pattern,size,extensions\", [(\"**\", 4, [\".txt\"]), (\"**\", 4, None), (\"**\", 0, [\".blablabla\"])])\ndef test_resolve_pattern_in_dataset_repository_with_extensions(hub_dataset_repo_path, pattern, size, extensions):\n    if size > 0:\n        resolved_data_files = resolve_pattern(pattern, hub_dataset_repo_path, allowed_extensions=extensions)\n        assert len(resolved_data_files) == size\n    else:\n        with pytest.raises(FileNotFoundError):\n            resolved_data_files = 
resolve_pattern(pattern, hub_dataset_repo_path, allowed_extensions=extensions)\n\n\ndef test_fail_resolve_pattern_in_dataset_repository(hub_dataset_repo_path):\n    with pytest.raises(FileNotFoundError):\n        resolve_pattern(\"blablabla\", hub_dataset_repo_path)\n\n\ndef test_resolve_pattern_in_dataset_repository_returns_hidden_file_only_if_requested(hub_dataset_repo_path):\n    with pytest.raises(FileNotFoundError):\n        resolve_pattern(\"*dummy\", hub_dataset_repo_path)\n    resolved_data_files = resolve_pattern(\".dummy\", hub_dataset_repo_path)\n    assert len(resolved_data_files) == 1\n\n\ndef test_resolve_pattern_in_dataset_repository_hidden_base_path(tmpfs):\n    tmpfs.touch(\".hidden/a.txt\")\n    resolved_data_files = resolve_pattern(\"*\", base_path=\"tmp://.hidden\")\n    assert len(resolved_data_files) == 1\n\n\ndef test_resolve_pattern_in_dataset_repository_returns_hidden_dir_only_if_requested(hub_dataset_repo_path):\n    with pytest.raises(FileNotFoundError):\n        resolve_pattern(\"data/*dummy_subdir/train.txt\", hub_dataset_repo_path)\n    resolved_data_files = resolve_pattern(\"data/.dummy_subdir/train.txt\", hub_dataset_repo_path)\n    assert len(resolved_data_files) == 1\n    resolved_data_files = resolve_pattern(\"*/.dummy_subdir/train.txt\", hub_dataset_repo_path)\n    assert len(resolved_data_files) == 1\n\n\ndef test_resolve_pattern_in_dataset_repository_returns_special_dir_only_if_requested(hub_dataset_repo_path):\n    with pytest.raises(FileNotFoundError):\n        resolve_pattern(\"data/*dummy_subdir/train.txt\", hub_dataset_repo_path)\n    resolved_data_files = resolve_pattern(\"data/.dummy_subdir/train.txt\", hub_dataset_repo_path)\n    assert len(resolved_data_files) == 1\n    resolved_data_files = resolve_pattern(\"*/.dummy_subdir/train.txt\", hub_dataset_repo_path)\n    assert len(resolved_data_files) == 1\n\n\ndef test_resolve_pattern_in_dataset_repository_special_base_path(tmpfs):\n    tmpfs.touch(\"__special__/a.txt\")\n 
   resolved_data_files = resolve_pattern(\"*\", base_path=\"tmp://__special__\")\n    assert len(resolved_data_files) == 1\n\n\n@pytest.fixture\ndef dummy_fs():\n    DummyTestFS = mock_fs([\"train.txt\", \"test.txt\"])\n    _fsspec_registry[\"mock\"] = DummyTestFS\n    _fsspec_registry[\"dummy\"] = DummyTestFS\n    yield\n    del _fsspec_registry[\"mock\"]\n    del _fsspec_registry[\"dummy\"]\n\n\ndef test_resolve_pattern_fs(dummy_fs):\n    resolved_data_files = resolve_pattern(\"mock://train.txt\", base_path=\"\")\n    assert resolved_data_files == [\"mock://train.txt\"]\n\n\n@pytest.mark.parametrize(\"pattern\", _TEST_PATTERNS)\ndef test_DataFilesList_from_patterns_in_dataset_repository_(\n    hub_dataset_repo_path, hub_dataset_repo_patterns_results, pattern\n):\n    try:\n        data_files_list = DataFilesList.from_patterns([pattern], hub_dataset_repo_path)\n        assert sorted(data_files_list) == hub_dataset_repo_patterns_results[pattern]\n        assert len(data_files_list.origin_metadata) == len(data_files_list)\n    except FileNotFoundError:\n        assert len(hub_dataset_repo_patterns_results[pattern]) == 0\n\n\ndef test_DataFilesList_from_patterns_locally_with_extra_files(complex_data_dir, text_file):\n    data_files_list = DataFilesList.from_patterns([_TEST_URL, text_file.as_posix()], complex_data_dir)\n    assert list(data_files_list) == [_TEST_URL, text_file.as_posix()]\n    assert len(data_files_list.origin_metadata) == 2\n\n\ndef test_DataFilesList_from_patterns_raises_FileNotFoundError(complex_data_dir):\n    with pytest.raises(FileNotFoundError):\n        DataFilesList.from_patterns([\"file_that_doesnt_exist.txt\"], complex_data_dir)\n\n\nclass TestDataFilesDict:\n    def test_key_order_after_copy(self):\n        data_files = DataFilesDict({\"train\": \"train.csv\", \"test\": \"test.csv\"})\n        copied_data_files = copy.deepcopy(data_files)\n        assert list(copied_data_files.keys()) == list(data_files.keys())  # test split order with 
list()\n\n\n@pytest.mark.parametrize(\"pattern\", _TEST_PATTERNS)\ndef test_DataFilesDict_from_patterns_in_dataset_repository(\n    hub_dataset_repo_path, hub_dataset_repo_patterns_results, pattern\n):\n    split_name = \"train\"\n    try:\n        data_files = DataFilesDict.from_patterns({split_name: [pattern]}, hub_dataset_repo_path)\n        assert all(isinstance(data_files_list, DataFilesList) for data_files_list in data_files.values())\n        assert sorted(data_files[split_name]) == hub_dataset_repo_patterns_results[pattern]\n    except FileNotFoundError:\n        assert len(hub_dataset_repo_patterns_results[pattern]) == 0\n\n\n@pytest.mark.parametrize(\n    \"pattern,size,base_path,split_name\",\n    [\n        (\"**\", 4, None, \"train\"),\n        (\"**\", 4, \"data\", \"train\"),\n        (\"**\", 2, \"data/subdir\", \"train\"),\n        (\"**\", 0, \"data/subdir2\", \"train\"),\n    ],\n)\ndef test_DataFilesDict_from_patterns_in_dataset_repository_with_base_path(\n    hub_dataset_repo_path, pattern, size, base_path, split_name\n):\n    base_path = hub_dataset_repo_path + (base_path or \"\")\n    if size > 0:\n        data_files = DataFilesDict.from_patterns({split_name: [pattern]}, base_path=base_path)\n        assert len(data_files[split_name]) == size\n    else:\n        with pytest.raises(FileNotFoundError):\n            resolve_pattern(pattern, base_path)\n\n\n@pytest.mark.parametrize(\"pattern\", _TEST_PATTERNS)\ndef test_DataFilesDict_from_patterns_locally(complex_data_dir, pattern_results, pattern):\n    split_name = \"train\"\n    try:\n        data_files = DataFilesDict.from_patterns({split_name: [pattern]}, complex_data_dir)\n        assert all(isinstance(data_files_list, DataFilesList) for data_files_list in data_files.values())\n        assert sorted(data_files[split_name]) == pattern_results[pattern]\n    except FileNotFoundError:\n        assert len(pattern_results[pattern]) == 0\n\n\ndef 
test_DataFilesDict_from_patterns_in_dataset_repository_hashing(hub_dataset_repo_path):\n    patterns = {\"train\": [\"**/train.txt\"], \"test\": [\"**/test.txt\"]}\n    data_files1 = DataFilesDict.from_patterns(patterns, hub_dataset_repo_path)\n    data_files2 = DataFilesDict.from_patterns(patterns, hub_dataset_repo_path)\n    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)\n\n    data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True))\n    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)\n\n    # the tmpfs used to mock the hub repo is based on a local directory\n    # therefore os.stat is used to get the mtime of the data files\n    with patch(\"os.stat\", return_value=os.stat(__file__)):\n        data_files2 = DataFilesDict.from_patterns(patterns, hub_dataset_repo_path)\n        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)\n\n\ndef test_DataFilesDict_from_patterns_locally_or_remote_hashing(text_file):\n    patterns = {\"train\": [_TEST_URL], \"test\": [str(text_file)]}\n    data_files1 = DataFilesDict.from_patterns(patterns)\n    data_files2 = DataFilesDict.from_patterns(patterns)\n    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)\n\n    data_files2 = DataFilesDict(sorted(data_files1.items(), reverse=True))\n    assert Hasher.hash(data_files1) == Hasher.hash(data_files2)\n\n    patterns2 = {\"train\": [_TEST_URL], \"test\": [_TEST_URL]}\n    data_files2 = DataFilesDict.from_patterns(patterns2)\n    assert Hasher.hash(data_files1) != Hasher.hash(data_files2)\n\n    with patch(\"fsspec.implementations.http._file_info\", return_value={}):\n        data_files2 = DataFilesDict.from_patterns(patterns)\n        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)\n\n    with patch(\"os.stat\", return_value=os.stat(__file__)):\n        data_files2 = DataFilesDict.from_patterns(patterns)\n        assert Hasher.hash(data_files1) != Hasher.hash(data_files2)\n\n\ndef 
test_DataFilesPatternsList(text_file):\n    data_files_patterns = DataFilesPatternsList([str(text_file)], allowed_extensions=[None])\n    data_files = data_files_patterns.resolve(base_path=\"\")\n    assert data_files == [text_file.as_posix()]\n    assert isinstance(data_files, DataFilesList)\n    data_files_patterns = DataFilesPatternsList([str(text_file)], allowed_extensions=[[\".txt\"]])\n    data_files = data_files_patterns.resolve(base_path=\"\")\n    assert data_files == [text_file.as_posix()]\n    assert isinstance(data_files, DataFilesList)\n    data_files_patterns = DataFilesPatternsList([str(text_file).replace(\".txt\", \".tx*\")], allowed_extensions=[None])\n    data_files = data_files_patterns.resolve(base_path=\"\")\n    assert data_files == [text_file.as_posix()]\n    assert isinstance(data_files, DataFilesList)\n    data_files_patterns = DataFilesPatternsList([Path(text_file).name], allowed_extensions=[None])\n    data_files = data_files_patterns.resolve(base_path=str(Path(text_file).parent))\n    assert data_files == [text_file.as_posix()]\n    data_files_patterns = DataFilesPatternsList([str(text_file)], allowed_extensions=[[\".zip\"]])\n    with pytest.raises(FileNotFoundError):\n        data_files_patterns.resolve(base_path=\"\")\n\n\ndef test_DataFilesPatternsDict(text_file):\n    data_files_patterns_dict = DataFilesPatternsDict(\n        {\"train\": DataFilesPatternsList([str(text_file)], allowed_extensions=[None])}\n    )\n    data_files_dict = data_files_patterns_dict.resolve(base_path=\"\")\n    assert data_files_dict == {\"train\": [text_file.as_posix()]}\n    assert isinstance(data_files_dict, DataFilesDict)\n    assert isinstance(data_files_dict[\"train\"], DataFilesList)\n\n\ndef mock_fs(file_paths: List[str]):\n    \"\"\"\n    Set up a mock filesystem for fsspec containing the provided files\n\n    Example:\n\n    ```py\n    >>> DummyTestFS = mock_fs([\"data/train.txt\", \"data.test.txt\"])\n    >>> fs = DummyTestFS()\n    >>> assert 
fsspec.get_filesystem_class(\"mock\").__name__ == \"DummyTestFS\"\n    >>> assert type(fs).__name__ == \"DummyTestFS\"\n    >>> print(fs.glob(\"**\"))\n    [\"data\", \"data/train.txt\", \"data.test.txt\"]\n    ```\n    \"\"\"\n    file_paths = [file_path.split(\"://\")[-1] for file_path in file_paths]\n    dir_paths = {\n        \"/\".join(file_path.split(\"/\")[: i + 1]) for file_path in file_paths for i in range(file_path.count(\"/\"))\n    }\n    fs_contents = [{\"name\": dir_path, \"type\": \"directory\"} for dir_path in dir_paths] + [\n        {\"name\": file_path, \"type\": \"file\", \"size\": 10} for file_path in file_paths\n    ]\n\n    class DummyTestFS(AbstractFileSystem):\n        protocol = (\"mock\", \"dummy\")\n        _fs_contents = fs_contents\n\n        def ls(self, path, detail=True, refresh=True, **kwargs):\n            if kwargs.pop(\"strip_proto\", True):\n                path = self._strip_protocol(path)\n\n            files = not refresh and self._ls_from_cache(path)\n            if not files:\n                files = [file for file in self._fs_contents if path == self._parent(file[\"name\"])]\n                files.sort(key=lambda file: file[\"name\"])\n                self.dircache[path.rstrip(\"/\")] = files\n\n            if detail:\n                return files\n            return [file[\"name\"] for file in files]\n\n    return DummyTestFS\n\n\n@pytest.mark.parametrize(\"base_path\", [\"\", \"mock://\", \"my_dir\"])\n@pytest.mark.parametrize(\n    \"data_file_per_split\",\n    [\n        # === Main cases ===\n        # file named after split at the root\n        {\"train\": \"train.txt\", \"validation\": \"valid.txt\", \"test\": \"test.txt\"},\n        # file named after split in a directory\n        {\n            \"train\": \"data/train.txt\",\n            \"validation\": \"data/valid.txt\",\n            \"test\": \"data/test.txt\",\n        },\n        # directory named after split\n        {\n            \"train\": 
\"train/split.txt\",\n            \"validation\": \"valid/split.txt\",\n            \"test\": \"test/split.txt\",\n        },\n        # sharded splits\n        {\n            \"train\": [f\"data/train_{i}.txt\" for i in range(3)],\n            \"validation\": [f\"data/validation_{i}.txt\" for i in range(3)],\n            \"test\": [f\"data/test_{i}.txt\" for i in range(3)],\n        },\n        # sharded splits with standard format (+ custom split name)\n        {\n            \"train\": [f\"data/train-0000{i}-of-00003.txt\" for i in range(3)],\n            \"validation\": [f\"data/validation-0000{i}-of-00003.txt\" for i in range(3)],\n            \"test\": [f\"data/test-0000{i}-of-00003.txt\" for i in range(3)],\n            \"random\": [f\"data/random-0000{i}-of-00003.txt\" for i in range(3)],\n        },\n        # === Secondary cases ===\n        # Default to train split\n        {\"train\": \"dataset.txt\"},\n        {\"train\": \"data/dataset.txt\"},\n        {\"train\": [\"data/image.jpg\", \"metadata.jsonl\"]},\n        {\"train\": [\"data/image.jpg\", \"metadata.csv\"]},\n        # With prefix or suffix in directory or file names\n        {\"train\": \"my_train_dir/dataset.txt\"},\n        {\"train\": \"data/my_train_file.txt\"},\n        {\"test\": \"my_test_dir/dataset.txt\"},\n        {\"test\": \"data/my_test_file.txt\"},\n        {\"validation\": \"my_validation_dir/dataset.txt\"},\n        {\"validation\": \"data/my_validation_file.txt\"},\n        {\"train\": \"train_dir/dataset.txt\"},\n        {\"train\": \"data/train_file.txt\"},\n        {\"test\": \"test_dir/dataset.txt\"},\n        {\"test\": \"data/test_file.txt\"},\n        {\"validation\": \"validation_dir/dataset.txt\"},\n        {\"validation\": \"data/validation_file.txt\"},\n        {\"train\": \"my_train/dataset.txt\"},\n        {\"train\": \"data/my_train.txt\"},\n        {\"test\": \"my_test/dataset.txt\"},\n        {\"test\": \"data/my_test.txt\"},\n        {\"validation\": 
\"my_validation/dataset.txt\"},\n        {\"validation\": \"data/my_validation.txt\"},\n        # With test<>eval aliases\n        {\"test\": \"eval.txt\"},\n        {\"test\": \"data/eval.txt\"},\n        {\"test\": \"eval/dataset.txt\"},\n        # With valid<>dev aliases\n        {\"validation\": \"dev.txt\"},\n        {\"validation\": \"data/dev.txt\"},\n        {\"validation\": \"dev/dataset.txt\"},\n        # With valid<>val aliases\n        {\"validation\": \"val.txt\"},\n        {\"validation\": \"data/val.txt\"},\n        # With other extensions\n        {\"train\": \"train.parquet\", \"validation\": \"valid.parquet\", \"test\": \"test.parquet\"},\n        # With \"dev\" or \"eval\" without separators\n        {\"train\": \"developers_list.txt\"},\n        {\"train\": \"data/seqeval_results.txt\"},\n        {\"train\": \"contest.txt\"},\n        # With supported separators\n        {\"test\": \"my.test.file.txt\"},\n        {\"test\": \"my-test-file.txt\"},\n        {\"test\": \"my_test_file.txt\"},\n        {\"test\": \"my test file.txt\"},\n        {\"test\": \"my-test_file.txt\"},\n        {\"test\": \"test00001.txt\"},\n        # <split>.<split> case\n        {\"test\": \"test/train.txt\"},\n    ],\n)\ndef test_get_data_files_patterns(base_path, data_file_per_split):\n    data_file_per_split = {k: v if isinstance(v, list) else [v] for k, v in data_file_per_split.items()}\n    data_file_per_split = {\n        split: [\n            base_path + (\"/\" if base_path and base_path[-1] != \"/\" else \"\") + file_path\n            for file_path in data_file_per_split[split]\n        ]\n        for split in data_file_per_split\n    }\n    file_paths = sum(data_file_per_split.values(), [])\n    DummyTestFS = mock_fs(file_paths)\n    fs = DummyTestFS()\n\n    def resolver(pattern):\n        pattern = base_path + (\"/\" if base_path and base_path[-1] != \"/\" else \"\") + pattern\n        return [\n            file_path[len(fs._strip_protocol(base_path)) 
:].lstrip(\"/\")\n            for file_path in fs.glob(pattern)\n            if fs.isfile(file_path)\n        ]\n\n    patterns_per_split = _get_data_files_patterns(resolver)\n    assert list(patterns_per_split.keys()) == list(data_file_per_split.keys())  # Test split order with list()\n    for split, patterns in patterns_per_split.items():\n        matched = [file_path for pattern in patterns for file_path in resolver(pattern)]\n        expected = [\n            fs._strip_protocol(file_path)[len(fs._strip_protocol(base_path)) :].lstrip(\"/\")\n            for file_path in data_file_per_split[split]\n        ]\n        assert matched == expected\n\n\ndef test_get_data_patterns_from_directory_with_the_word_data_twice(tmp_path):\n    repo_dir = tmp_path / \"directory-name-ending-with-the-word-data\"  # parent directory contains the word \"data/\"\n    data_dir = repo_dir / \"data\"\n    data_dir.mkdir(parents=True)\n    data_file = data_dir / \"train-00001-of-00009.parquet\"\n    data_file.touch()\n    data_file_patterns = get_data_patterns(repo_dir.as_posix())\n    assert data_file_patterns == {\"train\": [\"data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*\"]}\n"
  },
  {
    "path": "tests/test_dataset_dict.py",
    "content": "import os\nimport tempfile\nfrom types import SimpleNamespace\nfrom unittest import TestCase\nfrom unittest.mock import patch\n\nimport numpy as np\nimport pandas as pd\nimport pytest\nfrom fsspec.implementations.memory import MemoryFileSystem\n\nfrom datasets import load_from_disk\nfrom datasets.arrow_dataset import Dataset\nfrom datasets.dataset_dict import DatasetDict, IterableDatasetDict\nfrom datasets.features import ClassLabel, Features, List, Value\nfrom datasets.iterable_dataset import IterableDataset\nfrom datasets.splits import NamedSplit, SplitInfo\n\nfrom .utils import (\n    assert_arrow_memory_doesnt_increase,\n    assert_arrow_memory_increases,\n    require_numpy1_on_windows,\n    require_polars,\n    require_tf,\n    require_torch,\n)\n\n\nclass DatasetDictTest(TestCase):\n    def _create_dummy_dataset(self, multiple_columns=False, int_to_float=False):\n        if multiple_columns:\n            data = {\"col_1\": [3, 2, 1, 0], \"col_2\": [\"a\", \"b\", \"c\", \"d\"]}\n            dset = Dataset.from_dict(data)\n        elif int_to_float:\n            data = {\n                \"text\": [\"text1\", \"text2\", \"text3\", \"text4\"],\n                \"labels\": [[1, 1, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 1, 1], [0, 0, 0, 1, 0]],\n            }\n            dset = Dataset.from_dict(data)\n        else:\n            dset = Dataset.from_dict(\n                {\"filename\": [\"my_name-train\" + \"_\" + f\"{x:03d}\" for x in np.arange(30).tolist()]}\n            )\n        return dset\n\n    def _create_dummy_dataset_dict(self, multiple_columns=False, int_to_float=False) -> DatasetDict:\n        return DatasetDict(\n            {\n                \"train\": self._create_dummy_dataset(multiple_columns=multiple_columns, int_to_float=int_to_float),\n                \"test\": self._create_dummy_dataset(multiple_columns=multiple_columns, int_to_float=int_to_float),\n            }\n        )\n\n    def _create_dummy_iterable_dataset(self, 
multiple_columns=False) -> IterableDataset:\n        def gen():\n            if multiple_columns:\n                data = {\"col_1\": [3, 2, 1, 0], \"col_2\": [\"a\", \"b\", \"c\", \"d\"]}\n                for v1, v2 in zip(data[\"col_1\"], data[\"col_2\"]):\n                    yield {\"col_1\": v1, \"col_2\": v2}\n            else:\n                for x in range(30):\n                    yield {\"filename\": \"my_name-train\" + \"_\" + f\"{x:03d}\"}\n\n        return IterableDataset.from_generator(gen)\n\n    def _create_dummy_iterable_dataset_dict(self, multiple_columns=False) -> IterableDatasetDict:\n        return IterableDatasetDict(\n            {\n                \"train\": self._create_dummy_iterable_dataset(multiple_columns=multiple_columns),\n                \"test\": self._create_dummy_iterable_dataset(multiple_columns=multiple_columns),\n            }\n        )\n\n    def test_flatten(self):\n        dset_split = Dataset.from_dict(\n            {\"a\": [{\"b\": {\"c\": [\"text\"]}}] * 10, \"foo\": [1] * 10},\n            features=Features({\"a\": {\"b\": {\"c\": List(Value(\"string\"))}}, \"foo\": Value(\"int64\")}),\n        )\n        dset = DatasetDict({\"train\": dset_split, \"test\": dset_split})\n        dset = dset.flatten()\n        self.assertDictEqual(dset.column_names, {\"train\": [\"a.b.c\", \"foo\"], \"test\": [\"a.b.c\", \"foo\"]})\n        self.assertListEqual(sorted(dset[\"train\"].features.keys()), [\"a.b.c\", \"foo\"])\n        self.assertDictEqual(dset[\"train\"].features, Features({\"a.b.c\": List(Value(\"string\")), \"foo\": Value(\"int64\")}))\n        del dset\n\n    def test_set_format_numpy(self):\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset.set_format(type=\"numpy\", columns=[\"col_1\"])\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0]), 1)\n            self.assertIsInstance(dset_split[0][\"col_1\"], np.int64)\n            
self.assertEqual(dset_split[0][\"col_1\"].item(), 3)\n\n        dset.reset_format()\n        with dset.formatted_as(type=\"numpy\", columns=[\"col_1\"]):\n            for dset_split in dset.values():\n                self.assertEqual(len(dset_split[0]), 1)\n                self.assertIsInstance(dset_split[0][\"col_1\"], np.int64)\n                self.assertEqual(dset_split[0][\"col_1\"].item(), 3)\n\n        for dset_split in dset.values():\n            self.assertEqual(dset_split.format[\"type\"], None)\n            self.assertEqual(dset_split.format[\"format_kwargs\"], {})\n            self.assertEqual(dset_split.format[\"columns\"], dset_split.column_names)\n            self.assertEqual(dset_split.format[\"output_all_columns\"], False)\n\n        dset.set_format(type=\"numpy\", columns=[\"col_1\"], output_all_columns=True)\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0]), 2)\n            self.assertIsInstance(dset_split[0][\"col_2\"], str)\n            self.assertEqual(dset_split[0][\"col_2\"], \"a\")\n\n        dset.set_format(type=\"numpy\", columns=[\"col_1\", \"col_2\"])\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0]), 2)\n            self.assertIsInstance(dset_split[0][\"col_2\"], np.str_)\n            self.assertEqual(dset_split[0][\"col_2\"].item(), \"a\")\n        del dset\n\n    @require_numpy1_on_windows\n    @require_torch\n    def test_set_format_torch(self):\n        import torch\n\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset.set_format(type=\"torch\", columns=[\"col_1\"])\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0]), 1)\n            self.assertIsInstance(dset_split[0][\"col_1\"], torch.Tensor)\n            self.assertListEqual(list(dset_split[0][\"col_1\"].shape), [])\n            self.assertEqual(dset_split[0][\"col_1\"].item(), 3)\n\n        dset.set_format(type=\"torch\", 
columns=[\"col_1\"], output_all_columns=True)\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0]), 2)\n            self.assertIsInstance(dset_split[0][\"col_2\"], str)\n            self.assertEqual(dset_split[0][\"col_2\"], \"a\")\n\n        dset.set_format(type=\"torch\")\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0]), 2)\n            self.assertIsInstance(dset_split[0][\"col_1\"], torch.Tensor)\n            self.assertListEqual(list(dset_split[0][\"col_1\"].shape), [])\n            self.assertEqual(dset_split[0][\"col_1\"].item(), 3)\n            self.assertIsInstance(dset_split[0][\"col_2\"], str)\n            self.assertEqual(dset_split[0][\"col_2\"], \"a\")\n        del dset\n\n    @require_tf\n    def test_set_format_tf(self):\n        import tensorflow as tf\n\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset.set_format(type=\"tensorflow\", columns=[\"col_1\"])\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0]), 1)\n            self.assertIsInstance(dset_split[0][\"col_1\"], tf.Tensor)\n            self.assertListEqual(list(dset_split[0][\"col_1\"].shape), [])\n            self.assertEqual(dset_split[0][\"col_1\"].numpy().item(), 3)\n\n        dset.set_format(type=\"tensorflow\", columns=[\"col_1\"], output_all_columns=True)\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0]), 2)\n            self.assertIsInstance(dset_split[0][\"col_2\"], str)\n            self.assertEqual(dset_split[0][\"col_2\"], \"a\")\n\n        dset.set_format(type=\"tensorflow\", columns=[\"col_1\", \"col_2\"])\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0]), 2)\n            self.assertEqual(dset_split[0][\"col_2\"].numpy().decode(\"utf-8\"), \"a\")\n        del dset\n\n    def test_set_format_pandas(self):\n        dset = 
self._create_dummy_dataset_dict(multiple_columns=True)\n        dset.set_format(type=\"pandas\", columns=[\"col_1\"])\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0].columns), 1)\n            self.assertIsInstance(dset_split[0], pd.DataFrame)\n            self.assertListEqual(list(dset_split[0].shape), [1, 1])\n            self.assertEqual(dset_split[0][\"col_1\"].item(), 3)\n\n        dset.set_format(type=\"pandas\", columns=[\"col_1\", \"col_2\"])\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0].columns), 2)\n            self.assertEqual(dset_split[0][\"col_2\"].item(), \"a\")\n        del dset\n\n    @require_polars\n    def test_set_format_polars(self):\n        import polars as pl\n\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset.set_format(type=\"polars\", columns=[\"col_1\"])\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0].columns), 1)\n            self.assertIsInstance(dset_split[0], pl.DataFrame)\n            self.assertEqual(dset_split[0].shape, (1, 1))\n            self.assertEqual(dset_split[0][\"col_1\"].item(), 3)\n\n        dset.set_format(type=\"polars\", columns=[\"col_1\", \"col_2\"])\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0].columns), 2)\n            self.assertEqual(dset_split[0][\"col_2\"].item(), \"a\")\n        del dset\n\n    def test_set_transform(self):\n        def transform(batch):\n            return {k: [str(i).upper() for i in v] for k, v in batch.items()}\n\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset.set_transform(transform=transform, columns=[\"col_1\"])\n        for dset_split in dset.values():\n            self.assertEqual(dset_split.format[\"type\"], \"custom\")\n            self.assertEqual(len(dset_split[0].keys()), 1)\n            self.assertEqual(dset_split[0][\"col_1\"], 
\"3\")\n            self.assertEqual(dset_split[:2][\"col_1\"], [\"3\", \"2\"])\n            self.assertEqual(dset_split[\"col_1\"][:2], [\"3\", \"2\"])\n\n        prev_format = dset[list(dset.keys())[0]].format\n        for dset_split in dset.values():\n            dset_split.set_format(**dset_split.format)\n            self.assertEqual(prev_format, dset_split.format)\n\n        dset.set_transform(transform=transform, columns=[\"col_1\", \"col_2\"])\n        for dset_split in dset.values():\n            self.assertEqual(len(dset_split[0].keys()), 2)\n            self.assertEqual(dset_split[0][\"col_2\"], \"A\")\n        del dset\n\n    def test_with_format(self):\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset2 = dset.with_format(\"numpy\", columns=[\"col_1\"])\n        dset.set_format(\"numpy\", columns=[\"col_1\"])\n        for dset_split, dset_split2 in zip(dset.values(), dset2.values()):\n            self.assertDictEqual(dset_split.format, dset_split2.format)\n        del dset, dset2\n\n    def test_with_transform(self):\n        def transform(batch):\n            return {k: [str(i).upper() for i in v] for k, v in batch.items()}\n\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset2 = dset.with_transform(transform, columns=[\"col_1\"])\n        dset.set_transform(transform, columns=[\"col_1\"])\n        for dset_split, dset_split2 in zip(dset.values(), dset2.values()):\n            self.assertDictEqual(dset_split.format, dset_split2.format)\n        del dset, dset2\n\n    def test_cast(self):\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        features = dset[\"train\"].features\n        features[\"col_1\"] = Value(\"float64\")\n        dset = dset.cast(features)\n        for dset_split in dset.values():\n            self.assertEqual(dset_split.num_columns, 2)\n            self.assertEqual(dset_split.features[\"col_1\"], Value(\"float64\"))\n            
self.assertIsInstance(dset_split[0][\"col_1\"], float)\n        del dset\n\n    def test_remove_columns(self):\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset = dset.remove_columns(column_names=\"col_1\")\n        for dset_split in dset.values():\n            self.assertEqual(dset_split.num_columns, 1)\n            self.assertListEqual(list(dset_split.column_names), [\"col_2\"])\n\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset = dset.remove_columns(column_names=[\"col_1\", \"col_2\"])\n        for dset_split in dset.values():\n            self.assertEqual(dset_split.num_columns, 0)\n\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        for dset_split in dset.values():\n            dset_split._format_columns = [\"col_1\", \"col_2\"]\n        dset = dset.remove_columns(column_names=[\"col_1\"])\n        for dset_split in dset.values():\n            self.assertListEqual(dset_split._format_columns, [\"col_2\"])\n            self.assertEqual(dset_split.num_columns, 1)\n            self.assertListEqual(list(dset_split.column_names), [\"col_2\"])\n        del dset\n\n    def test_rename_column(self):\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset = dset.rename_column(original_column_name=\"col_1\", new_column_name=\"new_name\")\n        for dset_split in dset.values():\n            self.assertEqual(dset_split.num_columns, 2)\n            self.assertListEqual(list(dset_split.column_names), [\"new_name\", \"col_2\"])\n        del dset\n\n    def test_select_columns(self):\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset = dset.select_columns(column_names=[])\n        for dset_split in dset.values():\n            self.assertEqual(dset_split.num_columns, 0)\n\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset = dset.select_columns(column_names=\"col_1\")\n        for 
dset_split in dset.values():\n            self.assertEqual(dset_split.num_columns, 1)\n            self.assertListEqual(list(dset_split.column_names), [\"col_1\"])\n\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        dset = dset.select_columns(column_names=[\"col_1\", \"col_2\"])\n        for dset_split in dset.values():\n            self.assertEqual(dset_split.num_columns, 2)\n\n        dset = self._create_dummy_dataset_dict(multiple_columns=True)\n        for dset_split in dset.values():\n            dset_split._format_columns = [\"col_1\", \"col_2\"]\n        dset = dset.select_columns(column_names=[\"col_1\"])\n        for dset_split in dset.values():\n            self.assertEqual(dset_split.num_columns, 1)\n            self.assertListEqual(list(dset_split.column_names), [\"col_1\"])\n            self.assertListEqual(dset_split._format_columns, [\"col_1\"])\n\n    def test_map(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dsets = self._create_dummy_dataset_dict()\n\n            mapped_dsets_1: DatasetDict = dsets.map(lambda ex: {\"foo\": [\"bar\"] * len(ex[\"filename\"])}, batched=True)\n            self.assertListEqual(list(dsets.keys()), list(mapped_dsets_1.keys()))\n            self.assertListEqual(mapped_dsets_1[\"train\"].column_names, [\"filename\", \"foo\"])\n\n            cache_file_names = {\n                \"train\": os.path.join(tmp_dir, \"train.arrow\"),\n                \"test\": os.path.join(tmp_dir, \"test.arrow\"),\n            }\n            mapped_dsets_2: DatasetDict = mapped_dsets_1.map(\n                lambda ex: {\"bar\": [\"foo\"] * len(ex[\"filename\"])}, batched=True, cache_file_names=cache_file_names\n            )\n            self.assertListEqual(list(dsets.keys()), list(mapped_dsets_2.keys()))\n            self.assertListEqual(sorted(mapped_dsets_2[\"train\"].column_names), sorted([\"filename\", \"foo\", \"bar\"]))\n            del dsets, mapped_dsets_1, mapped_dsets_2\n\n 
       # casting int labels to float labels\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dset_dict = self._create_dummy_dataset_dict(int_to_float=True)\n\n            def _preprocess(examples):\n                result = {\"labels\": [list(map(float, labels)) for labels in examples[\"labels\"]]}\n                return result\n\n            with dset_dict.map(\n                _preprocess, remove_columns=[\"labels\", \"text\"], batched=True, try_original_type=True\n            ) as dset_test:\n                for labels in dset_test[\"test\"][\"labels\"]:\n                    for label in labels:\n                        self.assertIsInstance(label, int)\n\n            with dset_dict.map(\n                _preprocess, remove_columns=[\"labels\", \"text\"], batched=True, try_original_type=False\n            ) as dset_test:\n                for labels in dset_test[\"test\"][\"labels\"]:\n                    for label in labels:\n                        self.assertIsInstance(label, float)\n\n    def test_iterable_map(self):\n        dsets = self._create_dummy_iterable_dataset_dict()\n        fn_kwargs = {\"n\": 3}\n        mapped_dsets: IterableDatasetDict = dsets.map(\n            lambda x, n: {\"foo\": [n] * len(x[\"filename\"])},\n            batched=True,\n            fn_kwargs=fn_kwargs,\n        )\n        mapped_example = next(iter(mapped_dsets[\"train\"]))\n        self.assertListEqual(sorted(mapped_example.keys()), sorted([\"filename\", \"foo\"]))\n        self.assertLessEqual(mapped_example[\"foo\"], 3)\n        del dsets, mapped_dsets\n\n    def test_filter(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dsets = self._create_dummy_dataset_dict()\n\n            filtered_dsets_1: DatasetDict = dsets.filter(lambda ex: int(ex[\"filename\"].split(\"_\")[-1]) < 10)\n            self.assertListEqual(list(dsets.keys()), list(filtered_dsets_1.keys()))\n            self.assertEqual(len(filtered_dsets_1[\"train\"]), 
10)\n\n            cache_file_names = {\n                \"train\": os.path.join(tmp_dir, \"train.arrow\"),\n                \"test\": os.path.join(tmp_dir, \"test.arrow\"),\n            }\n            filtered_dsets_2: DatasetDict = filtered_dsets_1.filter(\n                lambda ex: int(ex[\"filename\"].split(\"_\")[-1]) < 5, cache_file_names=cache_file_names\n            )\n            self.assertListEqual(list(dsets.keys()), list(filtered_dsets_2.keys()))\n            self.assertEqual(len(filtered_dsets_2[\"train\"]), 5)\n\n            filtered_dsets_3: DatasetDict = dsets.filter(\n                lambda examples: [int(ex.split(\"_\")[-1]) < 10 for ex in examples[\"filename\"]], batched=True\n            )\n            self.assertListEqual(list(dsets.keys()), list(filtered_dsets_3.keys()))\n            self.assertEqual(len(filtered_dsets_3[\"train\"]), 10)\n            del dsets, filtered_dsets_1, filtered_dsets_2, filtered_dsets_3\n\n    def test_iterable_filter(self):\n        dsets = self._create_dummy_iterable_dataset_dict()\n        example = next(iter(dsets[\"train\"]))\n        fn_kwargs = {\"n\": 3}\n        filtered_dsets: IterableDatasetDict = dsets.filter(\n            lambda ex, n: n < int(ex[\"filename\"].split(\"_\")[-1]), fn_kwargs=fn_kwargs\n        )\n        filtered_example = next(iter(filtered_dsets[\"train\"]))\n        self.assertListEqual(list(example.keys()), list(filtered_example.keys()))\n        self.assertEqual(int(filtered_example[\"filename\"].split(\"_\")[-1]), 4)  # id starts from 3\n        del dsets, filtered_dsets\n\n    def test_iterable_dataset_dict_push_to_hub_max_shard_size_and_num_shards_are_mutually_exclusive(self):\n        dsets = self._create_dummy_iterable_dataset_dict()\n        with pytest.raises(ValueError, match=\"either max_shard_size or num_shards\"):\n            dsets.push_to_hub(\"user/dataset\", max_shard_size=\"1MB\", num_shards={\"train\": 1, \"test\": 1})\n\n    def 
test_iterable_dataset_dict_push_to_hub_forwards_max_shard_size_to_each_split(self):\n        class DummyApi:\n            def __init__(self, *args, **kwargs):\n                pass\n\n            def repo_info(self, repo_id, repo_type=\"dataset\", revision=None):\n                return SimpleNamespace(id=repo_id, sha=\"dummy-sha\")\n\n            def create_branch(self, *args, **kwargs):\n                pass\n\n            def list_repo_tree(self, *args, **kwargs):\n                return []\n\n            def create_commit(self, *args, **kwargs):\n                return SimpleNamespace(commit_url=\"https://hf.co/commit/dummy\")\n\n        dummy_fs = MemoryFileSystem(skip_instance_cache=True)\n        dummy_fs.touch(\"datasets/user/dataset@dummy-sha/README.md\")\n\n        forwarded_calls = []\n\n        def mock_push_parquet_shards_to_hub(\n            resolved_output_path,\n            data_dir,\n            split,\n            token,\n            create_pr,\n            max_shard_size,\n            num_shards,\n            embed_external_files,\n            num_proc,\n        ):\n            forwarded_calls.append(\n                {\n                    \"split\": split,\n                    \"max_shard_size\": max_shard_size,\n                    \"num_shards\": num_shards,\n                }\n            )\n            return [], [], Features(), SplitInfo(name=split), 0\n\n        dsets = self._create_dummy_iterable_dataset_dict()\n        max_shard_size = sum(split_dataset.num_shards for split_dataset in dsets.values())\n        with (\n            patch(\"datasets.dataset_dict.HfApi\", DummyApi),\n            patch(\"datasets.dataset_dict.HfFileSystem\", return_value=dummy_fs),\n            patch.object(IterableDataset, \"_push_parquet_shards_to_hub\", side_effect=mock_push_parquet_shards_to_hub),\n        ):\n            dsets.push_to_hub(\"user/dataset\", max_shard_size=max_shard_size)\n\n        assert {call[\"split\"] for call in forwarded_calls} == 
set(dsets.keys())\n        assert all(call[\"max_shard_size\"] == max_shard_size for call in forwarded_calls)\n        assert all(call[\"num_shards\"] is None for call in forwarded_calls)\n\n    def test_sort(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dsets = self._create_dummy_dataset_dict()\n\n            sorted_dsets_1: DatasetDict = dsets.sort(\"filename\")\n            self.assertListEqual(list(dsets.keys()), list(sorted_dsets_1.keys()))\n            self.assertListEqual(\n                [f.split(\"_\")[-1] for f in sorted_dsets_1[\"train\"][\"filename\"]],\n                sorted(f\"{x:03d}\" for x in range(30)),\n            )\n\n            indices_cache_file_names = {\n                \"train\": os.path.join(tmp_dir, \"train.arrow\"),\n                \"test\": os.path.join(tmp_dir, \"test.arrow\"),\n            }\n            sorted_dsets_2: DatasetDict = sorted_dsets_1.sort(\n                \"filename\", indices_cache_file_names=indices_cache_file_names, reverse=True\n            )\n            self.assertListEqual(list(dsets.keys()), list(sorted_dsets_2.keys()))\n            self.assertListEqual(\n                [f.split(\"_\")[-1] for f in sorted_dsets_2[\"train\"][\"filename\"]],\n                sorted((f\"{x:03d}\" for x in range(30)), reverse=True),\n            )\n            del dsets, sorted_dsets_1, sorted_dsets_2\n\n    def test_shuffle(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dsets = self._create_dummy_dataset_dict()\n\n            indices_cache_file_names = {\n                \"train\": os.path.join(tmp_dir, \"train.arrow\"),\n                \"test\": os.path.join(tmp_dir, \"test.arrow\"),\n            }\n            seeds = {\n                \"train\": 1234,\n                \"test\": 1234,\n            }\n            dsets_shuffled = dsets.shuffle(\n                seeds=seeds, indices_cache_file_names=indices_cache_file_names, load_from_cache_file=False\n            
)\n            self.assertSequenceEqual(dsets_shuffled[\"train\"][\"filename\"], dsets_shuffled[\"test\"][\"filename\"])\n\n            self.assertEqual(len(dsets_shuffled[\"train\"]), 30)\n            self.assertEqual(dsets_shuffled[\"train\"][0][\"filename\"], \"my_name-train_028\")\n            self.assertEqual(dsets_shuffled[\"train\"][2][\"filename\"], \"my_name-train_010\")\n            self.assertDictEqual(dsets[\"train\"].features, Features({\"filename\": Value(\"string\")}))\n            self.assertDictEqual(dsets_shuffled[\"train\"].features, Features({\"filename\": Value(\"string\")}))\n\n            # Reproducibility\n            indices_cache_file_names_2 = {\n                \"train\": os.path.join(tmp_dir, \"train_2.arrow\"),\n                \"test\": os.path.join(tmp_dir, \"test_2.arrow\"),\n            }\n            dsets_shuffled_2 = dsets.shuffle(\n                seeds=seeds, indices_cache_file_names=indices_cache_file_names_2, load_from_cache_file=False\n            )\n            self.assertSequenceEqual(dsets_shuffled[\"train\"][\"filename\"], dsets_shuffled_2[\"train\"][\"filename\"])\n\n            seeds = {\n                \"train\": 1234,\n                \"test\": 1,\n            }\n            indices_cache_file_names_3 = {\n                \"train\": os.path.join(tmp_dir, \"train_3.arrow\"),\n                \"test\": os.path.join(tmp_dir, \"test_3.arrow\"),\n            }\n            dsets_shuffled_3 = dsets.shuffle(\n                seeds=seeds, indices_cache_file_names=indices_cache_file_names_3, load_from_cache_file=False\n            )\n            self.assertNotEqual(dsets_shuffled_3[\"train\"][\"filename\"], dsets_shuffled_3[\"test\"][\"filename\"])\n\n            # other input types\n            dsets_shuffled_int = dsets.shuffle(42)\n            dsets_shuffled_alias = dsets.shuffle(seed=42)\n            dsets_shuffled_none = dsets.shuffle()\n            self.assertEqual(len(dsets_shuffled_int[\"train\"]), 30)\n            
self.assertEqual(len(dsets_shuffled_alias[\"train\"]), 30)\n            self.assertEqual(len(dsets_shuffled_none[\"train\"]), 30)\n\n            del dsets, dsets_shuffled, dsets_shuffled_2, dsets_shuffled_3\n            del dsets_shuffled_int, dsets_shuffled_alias, dsets_shuffled_none\n\n    def test_flatten_indices(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dsets = self._create_dummy_dataset_dict()\n\n            indices_cache_file_names = {\n                \"train\": os.path.join(tmp_dir, \"train.arrow\"),\n                \"test\": os.path.join(tmp_dir, \"test.arrow\"),\n            }\n            dsets_shuffled = dsets.shuffle(\n                seed=42, indices_cache_file_names=indices_cache_file_names, load_from_cache_file=False\n            )\n\n            self.assertIsNotNone(dsets_shuffled[\"train\"]._indices)\n            self.assertIsNotNone(dsets_shuffled[\"test\"]._indices)\n\n            dsets_flat = dsets_shuffled.flatten_indices()\n\n            self.assertIsNone(dsets_flat[\"train\"]._indices)\n            self.assertIsNone(dsets_flat[\"test\"]._indices)\n\n            del dsets, dsets_shuffled, dsets_flat\n\n    def test_check_values_type(self):\n        dsets = self._create_dummy_dataset_dict()\n        dsets[\"bad_split\"] = None\n        self.assertRaises(TypeError, dsets.map, lambda x: x)\n        self.assertRaises(TypeError, dsets.filter, lambda x: True)\n        self.assertRaises(TypeError, dsets.shuffle)\n        self.assertRaises(TypeError, dsets.sort, \"filename\")\n        del dsets\n\n    def test_serialization(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dsets = self._create_dummy_dataset_dict()\n            dsets.save_to_disk(tmp_dir)\n            reloaded_dsets = DatasetDict.load_from_disk(tmp_dir)\n            self.assertListEqual(sorted(reloaded_dsets), [\"test\", \"train\"])\n            self.assertEqual(len(reloaded_dsets[\"train\"]), 30)\n            
self.assertListEqual(reloaded_dsets[\"train\"].column_names, [\"filename\"])\n            self.assertEqual(len(reloaded_dsets[\"test\"]), 30)\n            self.assertListEqual(reloaded_dsets[\"test\"].column_names, [\"filename\"])\n            del reloaded_dsets\n\n            del dsets[\"test\"]\n            dsets.save_to_disk(tmp_dir)\n            reloaded_dsets = DatasetDict.load_from_disk(tmp_dir)\n            self.assertListEqual(sorted(reloaded_dsets), [\"train\"])\n            self.assertEqual(len(reloaded_dsets[\"train\"]), 30)\n            self.assertListEqual(reloaded_dsets[\"train\"].column_names, [\"filename\"])\n            del dsets, reloaded_dsets\n\n            dsets = self._create_dummy_dataset_dict()\n            dsets.save_to_disk(tmp_dir, num_shards={\"train\": 3, \"test\": 2})\n            reloaded_dsets = DatasetDict.load_from_disk(tmp_dir)\n            self.assertListEqual(sorted(reloaded_dsets), [\"test\", \"train\"])\n            self.assertEqual(len(reloaded_dsets[\"train\"]), 30)\n            self.assertListEqual(reloaded_dsets[\"train\"].column_names, [\"filename\"])\n            self.assertEqual(len(reloaded_dsets[\"train\"].cache_files), 3)\n            self.assertEqual(len(reloaded_dsets[\"test\"]), 30)\n            self.assertListEqual(reloaded_dsets[\"test\"].column_names, [\"filename\"])\n            self.assertEqual(len(reloaded_dsets[\"test\"].cache_files), 2)\n            del reloaded_dsets\n\n            dsets = self._create_dummy_dataset_dict()\n            dsets.save_to_disk(tmp_dir, num_proc=2)\n            reloaded_dsets = DatasetDict.load_from_disk(tmp_dir)\n            self.assertListEqual(sorted(reloaded_dsets), [\"test\", \"train\"])\n            self.assertEqual(len(reloaded_dsets[\"train\"]), 30)\n            self.assertListEqual(reloaded_dsets[\"train\"].column_names, [\"filename\"])\n            self.assertEqual(len(reloaded_dsets[\"train\"].cache_files), 2)\n            
self.assertEqual(len(reloaded_dsets[\"test\"]), 30)\n            self.assertListEqual(reloaded_dsets[\"test\"].column_names, [\"filename\"])\n            self.assertEqual(len(reloaded_dsets[\"test\"].cache_files), 2)\n            del reloaded_dsets\n\n    def test_load_from_disk(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            dsets = self._create_dummy_dataset_dict()\n            dsets.save_to_disk(tmp_dir)\n            del dsets\n            dsets = load_from_disk(tmp_dir)\n            self.assertListEqual(sorted(dsets), [\"test\", \"train\"])\n            self.assertEqual(len(dsets[\"train\"]), 30)\n            self.assertListEqual(dsets[\"train\"].column_names, [\"filename\"])\n            self.assertEqual(len(dsets[\"test\"]), 30)\n            self.assertListEqual(dsets[\"test\"].column_names, [\"filename\"])\n            del dsets\n\n    def test_align_labels_with_mapping(self):\n        train_features = Features(\n            {\n                \"input_text\": Value(\"string\"),\n                \"input_labels\": ClassLabel(num_classes=3, names=[\"entailment\", \"neutral\", \"contradiction\"]),\n            }\n        )\n        test_features = Features(\n            {\n                \"input_text\": Value(\"string\"),\n                \"input_labels\": ClassLabel(num_classes=3, names=[\"entailment\", \"contradiction\", \"neutral\"]),\n            }\n        )\n        train_data = {\"input_text\": [\"a\", \"a\", \"b\", \"b\", \"c\", \"c\"], \"input_labels\": [0, 0, 1, 1, 2, 2]}\n        test_data = {\"input_text\": [\"a\", \"a\", \"c\", \"c\", \"b\", \"b\"], \"input_labels\": [0, 0, 1, 1, 2, 2]}\n        label2id = {\"CONTRADICTION\": 0, \"ENTAILMENT\": 2, \"NEUTRAL\": 1}\n        id2label = {v: k for k, v in label2id.items()}\n        train_expected_labels = [2, 2, 1, 1, 0, 0]\n        test_expected_labels = [2, 2, 0, 0, 1, 1]\n        train_expected_label_names = [id2label[idx] for idx in train_expected_labels]\n        
test_expected_label_names = [id2label[idx] for idx in test_expected_labels]\n        dsets = DatasetDict(\n            {\n                \"train\": Dataset.from_dict(train_data, features=train_features),\n                \"test\": Dataset.from_dict(test_data, features=test_features),\n            }\n        )\n        dsets = dsets.align_labels_with_mapping(label2id, \"input_labels\")\n        self.assertListEqual(train_expected_labels, dsets[\"train\"][\"input_labels\"][:])\n        self.assertListEqual(test_expected_labels, dsets[\"test\"][\"input_labels\"][:])\n        train_aligned_label_names = [\n            dsets[\"train\"].features[\"input_labels\"].int2str(idx) for idx in dsets[\"train\"][\"input_labels\"]\n        ]\n        test_aligned_label_names = [\n            dsets[\"test\"].features[\"input_labels\"].int2str(idx) for idx in dsets[\"test\"][\"input_labels\"]\n        ]\n        self.assertListEqual(train_expected_label_names, train_aligned_label_names)\n        self.assertListEqual(test_expected_label_names, test_aligned_label_names)\n\n\ndef test_dummy_datasetdict_serialize_fs(mockfs):\n    dataset_dict = DatasetDict(\n        {\n            \"train\": Dataset.from_dict({\"a\": range(30)}),\n            \"test\": Dataset.from_dict({\"a\": range(10)}),\n        }\n    )\n    dataset_path = \"mock://my_dataset\"\n    dataset_dict.save_to_disk(dataset_path, storage_options=mockfs.storage_options)\n    assert mockfs.isdir(dataset_path)\n    assert mockfs.glob(dataset_path + \"/*\")\n    reloaded = DatasetDict.load_from_disk(dataset_path, storage_options=mockfs.storage_options)\n    assert list(reloaded) == list(dataset_dict)\n    for k in dataset_dict:\n        assert reloaded[k].features == dataset_dict[k].features\n        assert reloaded[k].to_dict() == dataset_dict[k].to_dict()\n\n\ndef _check_csv_datasetdict(dataset_dict, expected_features, splits=(\"train\",)):\n    assert isinstance(dataset_dict, DatasetDict)\n    for split in splits:\n        
dataset = dataset_dict[split]\n        assert dataset.num_rows == 4\n        assert dataset.num_columns == 3\n        assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n        for feature, expected_dtype in expected_features.items():\n            assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_datasetdict_from_csv_keep_in_memory(keep_in_memory, csv_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = DatasetDict.from_csv({\"train\": csv_path}, cache_dir=cache_dir, keep_in_memory=keep_in_memory)\n    _check_csv_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_datasetdict_from_csv_features(features, csv_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    # CSV file loses col_1 string dtype information: default now is \"int64\" instead of \"string\"\n    default_expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = DatasetDict.from_csv({\"train\": csv_path}, features=features, cache_dir=cache_dir)\n    _check_csv_datasetdict(dataset, 
expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_datasetdict_from_csv_split(split, csv_path, tmp_path):\n    if split:\n        path = {split: csv_path}\n    else:\n        split = \"train\"\n        path = {\"train\": csv_path, \"test\": csv_path}\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"int64\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = DatasetDict.from_csv(path, cache_dir=cache_dir)\n    _check_csv_datasetdict(dataset, expected_features, splits=list(path.keys()))\n    assert all(dataset[split].split == split for split in path.keys())\n\n\ndef _check_json_datasetdict(dataset_dict, expected_features, splits=(\"train\",)):\n    assert isinstance(dataset_dict, DatasetDict)\n    for split in splits:\n        dataset = dataset_dict[split]\n        assert dataset.num_rows == 4\n        assert dataset.num_columns == 3\n        assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n        for feature, expected_dtype in expected_features.items():\n            assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_datasetdict_from_json_keep_in_memory(keep_in_memory, jsonl_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = DatasetDict.from_json({\"train\": jsonl_path}, cache_dir=cache_dir, keep_in_memory=keep_in_memory)\n    _check_json_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", 
\"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_datasetdict_from_json_features(features, jsonl_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = DatasetDict.from_json({\"train\": jsonl_path}, features=features, cache_dir=cache_dir)\n    _check_json_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_datasetdict_from_json_splits(split, jsonl_path, tmp_path):\n    if split:\n        path = {split: jsonl_path}\n    else:\n        split = \"train\"\n        path = {\"train\": jsonl_path, \"test\": jsonl_path}\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = DatasetDict.from_json(path, cache_dir=cache_dir)\n    _check_json_datasetdict(dataset, expected_features, splits=list(path.keys()))\n    assert all(dataset[split].split == split for split in path.keys())\n\n\ndef _check_parquet_datasetdict(dataset_dict, expected_features, splits=(\"train\",)):\n    assert isinstance(dataset_dict, DatasetDict)\n    for split in splits:\n        dataset = dataset_dict[split]\n        assert dataset.num_rows == 4\n        assert dataset.num_columns == 3\n        assert dataset.column_names == [\"col_1\", \"col_2\", \"col_3\"]\n        for feature, expected_dtype in expected_features.items():\n            assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef 
test_datasetdict_from_parquet_keep_in_memory(keep_in_memory, parquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = DatasetDict.from_parquet({\"train\": parquet_path}, cache_dir=cache_dir, keep_in_memory=keep_in_memory)\n    _check_parquet_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"},\n        {\"col_1\": \"string\", \"col_2\": \"string\", \"col_3\": \"string\"},\n        {\"col_1\": \"int32\", \"col_2\": \"int32\", \"col_3\": \"int32\"},\n        {\"col_1\": \"float32\", \"col_2\": \"float32\", \"col_3\": \"float32\"},\n    ],\n)\ndef test_datasetdict_from_parquet_features(features, parquet_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = DatasetDict.from_parquet({\"train\": parquet_path}, features=features, cache_dir=cache_dir)\n    _check_parquet_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), \"train\", \"test\"])\ndef test_datasetdict_from_parquet_split(split, parquet_path, tmp_path):\n    if split:\n        path = {split: parquet_path}\n    else:\n        split = \"train\"\n        path = {\"train\": parquet_path, \"test\": parquet_path}\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"col_1\": \"string\", \"col_2\": \"int64\", \"col_3\": \"float64\"}\n    dataset = 
DatasetDict.from_parquet(path, cache_dir=cache_dir)\n    _check_parquet_datasetdict(dataset, expected_features, splits=list(path.keys()))\n    assert all(dataset[split].split == split for split in path.keys())\n\n\ndef _check_text_datasetdict(dataset_dict, expected_features, splits=(\"train\",)):\n    assert isinstance(dataset_dict, DatasetDict)\n    for split in splits:\n        dataset = dataset_dict[split]\n        assert dataset.num_rows == 4\n        assert dataset.num_columns == 1\n        assert dataset.column_names == [\"text\"]\n        for feature, expected_dtype in expected_features.items():\n            assert dataset.features[feature].dtype == expected_dtype\n\n\n@pytest.mark.parametrize(\"keep_in_memory\", [False, True])\ndef test_datasetdict_from_text_keep_in_memory(keep_in_memory, text_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"text\": \"string\"}\n    with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = DatasetDict.from_text({\"train\": text_path}, cache_dir=cache_dir, keep_in_memory=keep_in_memory)\n    _check_text_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        {\"text\": \"string\"},\n        {\"text\": \"int32\"},\n        {\"text\": \"float32\"},\n    ],\n)\ndef test_datasetdict_from_text_features(features, text_path, tmp_path):\n    cache_dir = tmp_path / \"cache\"\n    default_expected_features = {\"text\": \"string\"}\n    expected_features = features.copy() if features else default_expected_features\n    features = (\n        Features({feature: Value(dtype) for feature, dtype in features.items()}) if features is not None else None\n    )\n    dataset = DatasetDict.from_text({\"train\": text_path}, features=features, cache_dir=cache_dir)\n    _check_text_datasetdict(dataset, expected_features)\n\n\n@pytest.mark.parametrize(\"split\", [None, NamedSplit(\"train\"), 
\"train\", \"test\"])\ndef test_datasetdict_from_text_split(split, text_path, tmp_path):\n    if split:\n        path = {split: text_path}\n    else:\n        split = \"train\"\n        path = {\"train\": text_path, \"test\": text_path}\n    cache_dir = tmp_path / \"cache\"\n    expected_features = {\"text\": \"string\"}\n    dataset = DatasetDict.from_text(path, cache_dir=cache_dir)\n    _check_text_datasetdict(dataset, expected_features, splits=list(path.keys()))\n    assert all(dataset[split].split == split for split in path.keys())\n"
  },
  {
    "path": "tests/test_dataset_list.py",
    "content": "from unittest import TestCase\n\nfrom datasets import List, Value\nfrom datasets.arrow_dataset import Dataset\n\n\nclass DatasetListTest(TestCase):\n    def _create_example_records(self):\n        return [\n            {\"col_1\": 3, \"col_2\": \"a\"},\n            {\"col_1\": 2, \"col_2\": \"b\"},\n            {\"col_1\": 1, \"col_2\": \"c\"},\n            {\"col_1\": 0, \"col_2\": \"d\"},\n        ]\n\n    def _create_example_dict(self):\n        data = {\"col_1\": [3, 2, 1, 0], \"col_2\": [\"a\", \"b\", \"c\", \"d\"]}\n        return Dataset.from_dict(data)\n\n    def test_create(self):\n        example_records = self._create_example_records()\n        dset = Dataset.from_list(example_records)\n        self.assertListEqual(dset.column_names, [\"col_1\", \"col_2\"])\n        for i, r in enumerate(dset):\n            self.assertDictEqual(r, example_records[i])\n\n    def test_list_dict_equivalent(self):\n        example_records = self._create_example_records()\n        dset = Dataset.from_list(example_records)\n        dset_from_dict = Dataset.from_dict({k: [r[k] for r in example_records] for k in example_records[0]})\n        self.assertEqual(dset.info, dset_from_dict.info)\n\n    def test_uneven_records(self):  # checks what happens with missing columns\n        uneven_records = [{\"col_1\": 1}, {\"col_2\": \"x\"}]\n        dset = Dataset.from_list(uneven_records)\n        self.assertDictEqual(dset[0], {\"col_1\": 1})\n        self.assertDictEqual(dset[1], {\"col_1\": None})  # NB: first record is used for columns\n\n    def test_variable_list_records(self):  # checks if the type can be inferred from the second record\n        list_records = [{\"col_1\": []}, {\"col_1\": [1, 2]}]\n        dset = Dataset.from_list(list_records)\n        self.assertEqual(dset.info.features[\"col_1\"], List(Value(\"int64\")))\n\n    def test_create_empty(self):\n        dset = Dataset.from_list([])\n        self.assertEqual(len(dset), 0)\n        
self.assertListEqual(dset.column_names, [])\n"
  },
  {
    "path": "tests/test_distributed.py",
    "content": "import os\nimport sys\nfrom pathlib import Path\n\nimport pytest\n\nfrom datasets import Dataset, IterableDataset\nfrom datasets.distributed import split_dataset_by_node\n\nfrom .utils import execute_subprocess_async, get_torch_dist_unique_port, require_torch\n\n\ndef test_split_dataset_by_node_map_style():\n    full_ds = Dataset.from_dict({\"i\": range(17)})\n    full_size = len(full_ds)\n    world_size = 3\n    datasets_per_rank = [\n        split_dataset_by_node(full_ds, rank=rank, world_size=world_size) for rank in range(world_size)\n    ]\n    assert sum(len(ds) for ds in datasets_per_rank) == full_size\n    assert len({tuple(x.values()) for ds in datasets_per_rank for x in ds}) == full_size\n\n\ndef test_split_dataset_by_node_iterable():\n    def gen():\n        return ({\"i\": i} for i in range(17))\n\n    world_size = 3\n    full_ds = IterableDataset.from_generator(gen)\n    full_size = len(list(full_ds))\n    datasets_per_rank = [\n        split_dataset_by_node(full_ds, rank=rank, world_size=world_size) for rank in range(world_size)\n    ]\n    assert sum(len(list(ds)) for ds in datasets_per_rank) == full_size\n    assert len({tuple(x.values()) for ds in datasets_per_rank for x in ds}) == full_size\n\n\n@pytest.mark.parametrize(\"shards_per_node\", [1, 2, 3])\ndef test_split_dataset_by_node_iterable_sharded(shards_per_node):\n    def gen(shards):\n        for shard in shards:\n            yield from ({\"i\": i, \"shard\": shard} for i in range(17))\n\n    world_size = 3\n    num_shards = shards_per_node * world_size\n    gen_kwargs = {\"shards\": [f\"shard_{shard_idx}.txt\" for shard_idx in range(num_shards)]}\n    full_ds = IterableDataset.from_generator(gen, gen_kwargs=gen_kwargs)\n    full_size = len(list(full_ds))\n    assert full_ds.num_shards == world_size * shards_per_node\n    datasets_per_rank = [\n        split_dataset_by_node(full_ds, rank=rank, world_size=world_size) for rank in range(world_size)\n    ]\n    assert 
[ds.num_shards for ds in datasets_per_rank] == [shards_per_node] * world_size\n    assert sum(len(list(ds)) for ds in datasets_per_rank) == full_size\n    assert len({tuple(x.values()) for ds in datasets_per_rank for x in ds}) == full_size\n\n\ndef test_split_dataset_by_node_iterable_distributed():\n    def gen():\n        return ({\"i\": i} for i in range(100))\n\n    world_size = 3\n    num_workers = 3\n    full_ds = IterableDataset.from_generator(gen)\n    full_size = len(list(full_ds))\n    datasets_per_rank = [\n        split_dataset_by_node(full_ds, rank=rank, world_size=world_size) for rank in range(world_size)\n    ]\n    datasets_per_rank_per_worker = [\n        split_dataset_by_node(ds, rank=worker, world_size=num_workers)\n        for ds in datasets_per_rank\n        for worker in range(num_workers)\n    ]\n    assert sum(len(list(ds)) for ds in datasets_per_rank_per_worker) == full_size\n    assert len({tuple(x.values()) for ds in datasets_per_rank_per_worker for x in ds}) == full_size\n\n\ndef test_distributed_shuffle_iterable():\n    def gen():\n        return ({\"i\": i} for i in range(17))\n\n    world_size = 2\n    full_ds = IterableDataset.from_generator(gen)\n    full_size = len(list(full_ds))\n\n    ds_rank0 = split_dataset_by_node(full_ds, rank=0, world_size=world_size).shuffle(seed=42)\n    assert len(list(ds_rank0)) == 1 + full_size // world_size\n\n    ds_rank0 = split_dataset_by_node(full_ds.shuffle(seed=42), rank=0, world_size=world_size)\n    assert len(list(ds_rank0)) == 1 + full_size // world_size\n\n\n@pytest.mark.parametrize(\"streaming\", [False, True])\n@require_torch\n@pytest.mark.skipif(os.name == \"nt\", reason=\"execute_subprocess_async doesn't support windows\")\n@pytest.mark.integration\ndef test_torch_distributed_run(streaming):\n    nproc_per_node = 2\n    master_port = get_torch_dist_unique_port()\n    test_script = Path(__file__).resolve().parent / \"distributed_scripts\" / \"run_torch_distributed.py\"\n    
distributed_args = f\"\"\"\n        -m torch.distributed.run\n        --nproc_per_node={nproc_per_node}\n        --master_port={master_port}\n        {test_script}\n    \"\"\".split()\n    args = f\"\"\"\n        --streaming={streaming}\n    \"\"\".split()\n    cmd = [sys.executable] + distributed_args + args\n    execute_subprocess_async(cmd, env=os.environ.copy())\n\n\n@pytest.mark.parametrize(\n    \"nproc_per_node, num_workers\",\n    [\n        (2, 2),  # each node has 2 shards and each worker has 1 shards\n        (3, 2),  # each node uses all the shards but skips examples, and each worker has 2 shards\n    ],\n)\n@require_torch\n@pytest.mark.skipif(os.name == \"nt\", reason=\"execute_subprocess_async doesn't support windows\")\n@pytest.mark.integration\ndef test_torch_distributed_run_streaming_with_num_workers(nproc_per_node, num_workers):\n    streaming = True\n    master_port = get_torch_dist_unique_port()\n    test_script = Path(__file__).resolve().parent / \"distributed_scripts\" / \"run_torch_distributed.py\"\n    distributed_args = f\"\"\"\n        -m torch.distributed.run\n        --nproc_per_node={nproc_per_node}\n        --master_port={master_port}\n        {test_script}\n    \"\"\".split()\n    args = f\"\"\"\n        --streaming={streaming}\n        --num_workers={num_workers}\n    \"\"\".split()\n    cmd = [sys.executable] + distributed_args + args\n    execute_subprocess_async(cmd, env=os.environ.copy())\n"
  },
  {
    "path": "tests/test_download_manager.py",
    "content": "import json\nimport os\nfrom pathlib import Path\n\nimport pytest\n\nfrom datasets.download.download_config import DownloadConfig\nfrom datasets.download.download_manager import DownloadManager\nfrom datasets.download.streaming_download_manager import StreamingDownloadManager\nfrom datasets.utils.file_utils import hash_url_to_filename, xopen\nfrom datasets.utils.py_utils import NestedDataStructure\n\n\nURL = \"tmp://file1.txt\"\nCONTENT = '\"text\": [\"foo\", \"foo\"]'\nHASH = \"ce0516943c3a4f9af269cf40fa658d615fa0f00d2dd9ef3f0ac5a3b35be0b719\"\n\n\nclass MockResponse:\n    status_code = 200\n    headers = {\"Content-Length\": \"100\"}\n    cookies = {}\n\n    def iter_content(self, **kwargs):\n        return [bytes(CONTENT, \"utf-8\")]\n\n\ndef mock_request(*args, **kwargs):\n    return MockResponse()\n\n\n@pytest.mark.parametrize(\"urls_type\", [\"str\", \"list\", \"dict\", \"dict_of_dict\"])\ndef test_download_manager_download(urls_type, tmp_path, tmpfs):\n    url = URL\n    with tmpfs.open(url, \"w\") as f:\n        f.write(CONTENT)\n    urls_types = {\"str\": url, \"list\": [url], \"dict\": {\"train\": url}, \"dict_of_dict\": {\"train\": {\"en\": url}}}\n    urls = urls_types[urls_type]\n    dataset_name = \"dummy\"\n    cache_subdir = \"downloads\"\n    cache_dir_root = tmp_path\n    download_config = DownloadConfig(\n        cache_dir=os.path.join(cache_dir_root, cache_subdir),\n        use_etag=False,\n    )\n    dl_manager = DownloadManager(dataset_name=dataset_name, download_config=download_config)\n    downloaded_paths = dl_manager.download(urls)\n    assert isinstance(downloaded_paths, type(urls))\n    if \"urls_type\".startswith(\"list\"):\n        assert len(downloaded_paths) == len(urls)\n    elif \"urls_type\".startswith(\"dict\"):\n        assert downloaded_paths.keys() == urls.keys()\n        if \"urls_type\" == \"dict_of_dict\":\n            key = list(urls.keys())[0]\n            assert isinstance(downloaded_paths[key], dict)\n   
         assert downloaded_paths[key].keys() == urls[key].keys()\n    for downloaded_path, url in zip(\n        NestedDataStructure(downloaded_paths).flatten(), NestedDataStructure(urls).flatten()\n    ):\n        downloaded_path = Path(downloaded_path)\n        parts = downloaded_path.parts\n        assert parts[-1] == HASH\n        assert parts[-2] == cache_subdir\n        assert downloaded_path.exists()\n        content = downloaded_path.read_text()\n        assert content == CONTENT\n        metadata_downloaded_path = downloaded_path.with_suffix(\".json\")\n        assert metadata_downloaded_path.exists()\n        metadata_content = json.loads(metadata_downloaded_path.read_text())\n        assert metadata_content == {\"url\": URL, \"etag\": None}\n\n\n@pytest.mark.parametrize(\"paths_type\", [str, list, dict])\n@pytest.mark.parametrize(\"extract_on_the_fly\", [False, True])\ndef test_download_manager_extract(paths_type, xz_file, text_file, extract_on_the_fly):\n    filename = str(xz_file)\n    if issubclass(paths_type, str):\n        paths = filename\n    elif issubclass(paths_type, list):\n        paths = [filename]\n    elif issubclass(paths_type, dict):\n        paths = {\"train\": filename}\n    dataset_name = \"dummy\"\n    cache_dir = xz_file.parent\n    extracted_subdir = \"extracted\"\n    download_config = DownloadConfig(\n        cache_dir=cache_dir,\n        use_etag=False,\n        extract_on_the_fly=extract_on_the_fly,\n    )\n    dl_manager = DownloadManager(dataset_name=dataset_name, download_config=download_config)\n    extracted_paths = dl_manager.extract(paths)\n    input_paths = paths\n    for extracted_paths in [extracted_paths]:\n        if isinstance(paths, str):\n            extracted_paths = [extracted_paths]\n            input_paths = [paths]\n        elif isinstance(paths, dict):\n            assert \"train\" in extracted_paths.keys()\n            extracted_paths = extracted_paths.values()\n            input_paths = paths.values()\n    
    assert extracted_paths\n        for extracted_path, input_path in zip(extracted_paths, input_paths):\n            assert extracted_path == dl_manager.extracted_paths[input_path]\n            if not extract_on_the_fly:\n                extracted_path = Path(extracted_path)\n                parts = extracted_path.parts\n                assert parts[-1] == hash_url_to_filename(input_path, etag=None)\n                assert parts[-2] == extracted_subdir\n                assert extracted_path.exists()\n                extracted_file_content = extracted_path.read_text()\n                expected_file_content = text_file.read_text()\n                assert extracted_file_content == expected_file_content\n            else:\n                assert extracted_path == StreamingDownloadManager(\n                    dataset_name=dataset_name, download_config=download_config\n                ).extract(xz_file)\n                assert xopen(extracted_path).read() == text_file.read_text()\n\n\ndef test_download_manager_delete_extracted_files(xz_file):\n    dataset_name = \"dummy\"\n    cache_dir = xz_file.parent\n    extracted_subdir = \"extracted\"\n    download_config = DownloadConfig(\n        cache_dir=cache_dir,\n        use_etag=False,\n    )\n    dl_manager = DownloadManager(dataset_name=dataset_name, download_config=download_config)\n    extracted_path = dl_manager.extract(xz_file)\n    assert extracted_path == dl_manager.extracted_paths[xz_file]\n    extracted_path = Path(extracted_path)\n    parts = extracted_path.parts\n\n    assert parts[-1] == hash_url_to_filename(str(xz_file), etag=None)\n    assert parts[-2] == extracted_subdir\n    assert extracted_path.exists()\n    dl_manager.delete_extracted_files()\n    assert not extracted_path.exists()\n\n\ndef _test_jsonl(path, file):\n    assert path.endswith(\".jsonl\")\n    for num_items, line in enumerate(file, start=1):\n        item = json.loads(line.decode(\"utf-8\"))\n        assert item.keys() == {\"col_1\", 
\"col_2\", \"col_3\"}\n    assert num_items == 4\n\n\n@pytest.mark.parametrize(\"archive_jsonl\", [\"tar_jsonl_path\", \"zip_jsonl_path\"])\ndef test_iter_archive_path(archive_jsonl, request):\n    archive_jsonl_path = request.getfixturevalue(archive_jsonl)\n    dl_manager = DownloadManager()\n    for num_jsonl, (path, file) in enumerate(dl_manager.iter_archive(archive_jsonl_path), start=1):\n        _test_jsonl(path, file)\n    assert num_jsonl == 2\n\n\n@pytest.mark.parametrize(\"archive_nested_jsonl\", [\"tar_nested_jsonl_path\", \"zip_nested_jsonl_path\"])\ndef test_iter_archive_file(archive_nested_jsonl, request):\n    archive_nested_jsonl_path = request.getfixturevalue(archive_nested_jsonl)\n    dl_manager = DownloadManager()\n    for num_tar, (path, file) in enumerate(dl_manager.iter_archive(archive_nested_jsonl_path), start=1):\n        for num_jsonl, (subpath, subfile) in enumerate(dl_manager.iter_archive(file), start=1):\n            _test_jsonl(subpath, subfile)\n    assert num_tar == 1\n    assert num_jsonl == 2\n\n\ndef test_iter_files(data_dir_with_hidden_files):\n    dl_manager = DownloadManager()\n    for num_file, file in enumerate(dl_manager.iter_files(data_dir_with_hidden_files), start=1):\n        assert os.path.basename(file) == (\"test.txt\" if num_file == 1 else \"train.txt\")\n    assert num_file == 2\n"
  },
  {
    "path": "tests/test_exceptions.py",
    "content": "import warnings\n\nimport pytest\n\nimport datasets.utils.deprecation_utils\nfrom datasets.exceptions import (\n    ChecksumVerificationError,\n    ExpectedMoreDownloadedFilesError,\n    ExpectedMoreSplitsError,\n    NonMatchingChecksumError,\n    NonMatchingSplitsSizesError,\n    SplitsVerificationError,\n    UnexpectedDownloadedFileError,\n    UnexpectedSplitsError,\n)\n\n\n@pytest.mark.parametrize(\n    \"error\",\n    [\n        ChecksumVerificationError,\n        UnexpectedDownloadedFileError,\n        ExpectedMoreDownloadedFilesError,\n        NonMatchingChecksumError,\n        SplitsVerificationError,\n        UnexpectedSplitsError,\n        ExpectedMoreSplitsError,\n        NonMatchingSplitsSizesError,\n    ],\n)\ndef test_error_not_deprecated(error, monkeypatch):\n    monkeypatch.setattr(datasets.utils.deprecation_utils, \"_emitted_deprecation_warnings\", set())\n    with warnings.catch_warnings():\n        warnings.simplefilter(\"error\")\n        error()\n"
  },
  {
    "path": "tests/test_experimental.py",
    "content": "import unittest\nimport warnings\n\nfrom datasets.utils import experimental\n\n\n@experimental\ndef dummy_function():\n    return \"success\"\n\n\nclass TestExperimentalFlag(unittest.TestCase):\n    def test_experimental_warning(self):\n        with warnings.catch_warnings(record=True) as w:\n            warnings.simplefilter(\"always\")\n            self.assertEqual(dummy_function(), \"success\")\n        self.assertEqual(len(w), 1)\n"
  },
  {
    "path": "tests/test_extract.py",
    "content": "import os\n\nimport pytest\n\nfrom datasets.utils.extract import (\n    Bzip2Extractor,\n    Extractor,\n    GzipExtractor,\n    Lz4Extractor,\n    SevenZipExtractor,\n    TarExtractor,\n    XzExtractor,\n    ZipExtractor,\n    ZstdExtractor,\n)\n\nfrom .utils import require_lz4, require_py7zr, require_zstandard\n\n\n@pytest.mark.parametrize(\n    \"compression_format, is_archive\",\n    [\n        (\"7z\", True),\n        (\"bz2\", False),\n        (\"gzip\", False),\n        (\"lz4\", False),\n        (\"tar\", True),\n        (\"xz\", False),\n        (\"zip\", True),\n        (\"zstd\", False),\n    ],\n)\ndef test_base_extractors(\n    compression_format,\n    is_archive,\n    bz2_file,\n    gz_file,\n    lz4_file,\n    seven_zip_file,\n    tar_file,\n    xz_file,\n    zip_file,\n    zstd_file,\n    tmp_path,\n    text_file,\n):\n    input_paths_and_base_extractors = {\n        \"7z\": (seven_zip_file, SevenZipExtractor),\n        \"bz2\": (bz2_file, Bzip2Extractor),\n        \"gzip\": (gz_file, GzipExtractor),\n        \"lz4\": (lz4_file, Lz4Extractor),\n        \"tar\": (tar_file, TarExtractor),\n        \"xz\": (xz_file, XzExtractor),\n        \"zip\": (zip_file, ZipExtractor),\n        \"zstd\": (zstd_file, ZstdExtractor),\n    }\n    input_path, base_extractor = input_paths_and_base_extractors[compression_format]\n    if input_path is None:\n        reason = f\"for '{compression_format}' compression_format, \"\n        if compression_format == \"7z\":\n            reason += require_py7zr.kwargs[\"reason\"]\n        elif compression_format == \"lz4\":\n            reason += require_lz4.kwargs[\"reason\"]\n        elif compression_format == \"zstd\":\n            reason += require_zstandard.kwargs[\"reason\"]\n        pytest.skip(reason)\n    assert base_extractor.is_extractable(input_path)\n    output_path = tmp_path / (\"extracted\" if is_archive else \"extracted.txt\")\n    base_extractor.extract(input_path, output_path)\n    if 
is_archive:\n        assert output_path.is_dir()\n        for file_path in output_path.iterdir():\n            assert file_path.name == text_file.name\n            extracted_file_content = file_path.read_text(encoding=\"utf-8\")\n    else:\n        extracted_file_content = output_path.read_text(encoding=\"utf-8\")\n    expected_file_content = text_file.read_text(encoding=\"utf-8\")\n    assert extracted_file_content == expected_file_content\n\n\n@pytest.mark.parametrize(\n    \"compression_format, is_archive\",\n    [\n        (\"7z\", True),\n        (\"bz2\", False),\n        (\"gzip\", False),\n        (\"lz4\", False),\n        (\"tar\", True),\n        (\"xz\", False),\n        (\"zip\", True),\n        (\"zstd\", False),\n    ],\n)\ndef test_extractor(\n    compression_format,\n    is_archive,\n    bz2_file,\n    gz_file,\n    lz4_file,\n    seven_zip_file,\n    tar_file,\n    xz_file,\n    zip_file,\n    zstd_file,\n    tmp_path,\n    text_file,\n):\n    input_paths = {\n        \"7z\": seven_zip_file,\n        \"bz2\": bz2_file,\n        \"gzip\": gz_file,\n        \"lz4\": lz4_file,\n        \"tar\": tar_file,\n        \"xz\": xz_file,\n        \"zip\": zip_file,\n        \"zstd\": zstd_file,\n    }\n    input_path = input_paths[compression_format]\n    if input_path is None:\n        reason = f\"for '{compression_format}' compression_format, \"\n        if compression_format == \"7z\":\n            reason += require_py7zr.kwargs[\"reason\"]\n        elif compression_format == \"lz4\":\n            reason += require_lz4.kwargs[\"reason\"]\n        elif compression_format == \"zstd\":\n            reason += require_zstandard.kwargs[\"reason\"]\n        pytest.skip(reason)\n    extractor_format = Extractor.infer_extractor_format(input_path)\n    assert extractor_format is not None\n    output_path = tmp_path / (\"extracted\" if is_archive else \"extracted.txt\")\n    Extractor.extract(input_path, output_path, extractor_format)\n    if is_archive:\n        
assert output_path.is_dir()\n        for file_path in output_path.iterdir():\n            assert file_path.name == text_file.name\n            extracted_file_content = file_path.read_text(encoding=\"utf-8\")\n    else:\n        extracted_file_content = output_path.read_text(encoding=\"utf-8\")\n    expected_file_content = text_file.read_text(encoding=\"utf-8\")\n    assert extracted_file_content == expected_file_content\n\n\n@pytest.fixture\ndef tar_file_with_dot_dot(tmp_path, text_file):\n    import tarfile\n\n    directory = tmp_path / \"data_dot_dot\"\n    directory.mkdir()\n    path = directory / \"tar_file_with_dot_dot.tar\"\n    with tarfile.TarFile(path, \"w\") as f:\n        f.add(text_file, arcname=os.path.join(\"..\", text_file.name))\n    return path\n\n\n@pytest.fixture\ndef tar_file_with_sym_link(tmp_path):\n    import tarfile\n\n    directory = tmp_path / \"data_sym_link\"\n    directory.mkdir()\n    path = directory / \"tar_file_with_sym_link.tar\"\n    os.symlink(\"..\", directory / \"subdir\", target_is_directory=True)\n    with tarfile.TarFile(path, \"w\") as f:\n        f.add(str(directory / \"subdir\"), arcname=\"subdir\")  # str required by os.readlink on Windows and Python < 3.8\n    return path\n\n\n@pytest.mark.parametrize(\n    \"insecure_tar_file, error_log\",\n    [(\"tar_file_with_dot_dot\", \"illegal path\"), (\"tar_file_with_sym_link\", \"Symlink\")],\n)\ndef test_tar_extract_insecure_files(\n    insecure_tar_file, error_log, tar_file_with_dot_dot, tar_file_with_sym_link, tmp_path, caplog\n):\n    insecure_tar_files = {\n        \"tar_file_with_dot_dot\": tar_file_with_dot_dot,\n        \"tar_file_with_sym_link\": tar_file_with_sym_link,\n    }\n    input_path = insecure_tar_files[insecure_tar_file]\n    output_path = tmp_path / \"extracted\"\n    TarExtractor.extract(input_path, output_path)\n    assert caplog.text\n    for record in caplog.records:\n        assert record.levelname == \"ERROR\"\n        assert error_log in 
record.msg\n\n\ndef test_is_zipfile_false_positive(tmpdir):\n    # We should have less false positives than zipfile.is_zipfile\n    # We do that by checking only the magic number\n    not_a_zip_file = tmpdir / \"not_a_zip_file\"\n    # From: https://github.com/python/cpython/pull/5053\n    data = (\n        b\"\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIHDR\\x00\\x00\\x00\\x01\\x00\\x00\"\n        b\"\\x00\\x02\\x08\\x06\\x00\\x00\\x00\\x99\\x81\\xb6'\\x00\\x00\\x00\\x15I\"\n        b\"DATx\\x01\\x01\\n\\x00\\xf5\\xff\\x00PK\\x05\\x06\\x00PK\\x06\\x06\\x07\"\n        b\"\\xac\\x01N\\xc6|a\\r\\x00\\x00\\x00\\x00IEND\\xaeB`\\x82\"\n    )\n    with not_a_zip_file.open(\"wb\") as f:\n        f.write(data)\n    # zipfile.is_zipfile(str(not_a_zip_file)) could be a false positive for `zipfile`\n    assert not ZipExtractor.is_extractable(not_a_zip_file)  # but we're right\n"
  },
  {
    "path": "tests/test_file_utils.py",
    "content": "import os\nimport re\nfrom pathlib import Path\nfrom unittest.mock import patch\n\nimport pytest\nimport zstandard as zstd\nfrom fsspec.registry import _registry as _fsspec_registry\nfrom fsspec.spec import AbstractBufferedFile, AbstractFileSystem\nfrom huggingface_hub.errors import OfflineModeIsEnabled\n\nfrom datasets.download.download_config import DownloadConfig\nfrom datasets.utils.file_utils import (\n    _get_extraction_protocol,\n    _prepare_single_hop_path_and_storage_options,\n    cached_path,\n    fsspec_get,\n    fsspec_head,\n    get_from_cache,\n    xdirname,\n    xexists,\n    xgetsize,\n    xglob,\n    xisdir,\n    xisfile,\n    xjoin,\n    xlistdir,\n    xnumpy_load,\n    xopen,\n    xPath,\n    xrelpath,\n    xsplit,\n    xsplitext,\n    xwalk,\n)\nfrom datasets.utils.hub import hf_dataset_url\n\nfrom .utils import slow\n\n\nFILE_CONTENT = \"\"\"\\\n    Text data.\n    Second line of data.\"\"\"\n\nFILE_PATH = \"file\"\n\nTEST_URL = \"https://huggingface.co/datasets/hf-internal-testing/dataset_with_data_files/resolve/main/data/train.txt\"\nTEST_URL_CONTENT = \"foo\\n\" * 10\n\nTEST_GG_DRIVE_FILENAME = \"train.tsv\"\nTEST_GG_DRIVE_URL = \"https://drive.google.com/uc?export=download&id=17bOgBDc3hRCoPZ89EYtKDzK-yXAWat94\"\nTEST_GG_DRIVE_GZIPPED_URL = \"https://drive.google.com/uc?export=download&id=1Bt4Garpf0QLiwkJhHJzXaVa0I0H5Qhwz\"\nTEST_GG_DRIVE_ZIPPED_URL = \"https://drive.google.com/uc?export=download&id=1k92sUfpHxKq8PXWRr7Y5aNHXwOCNUmqh\"\nTEST_GG_DRIVE_CONTENT = \"\"\"\\\npokemon_name, type\nCharmander, fire\nSquirtle, water\nBulbasaur, grass\"\"\"\n\n\n@pytest.fixture(scope=\"session\")\ndef zstd_path(tmp_path_factory):\n    path = tmp_path_factory.mktemp(\"data\") / (FILE_PATH + \".zstd\")\n    data = bytes(FILE_CONTENT, \"utf-8\")\n    with zstd.open(path, \"wb\") as f:\n        f.write(data)\n    return path\n\n\n@pytest.fixture\ndef tmpfs_file(tmpfs):\n    with open(os.path.join(tmpfs.local_root_dir, FILE_PATH), \"w\") as 
f:\n        f.write(FILE_CONTENT)\n    return FILE_PATH\n\n\n@pytest.mark.parametrize(\"compression_format\", [\"gzip\", \"xz\", \"zstd\"])\ndef test_cached_path_extract(compression_format, gz_file, xz_file, zstd_path, tmp_path, text_file):\n    input_paths = {\"gzip\": gz_file, \"xz\": xz_file, \"zstd\": zstd_path}\n    input_path = input_paths[compression_format]\n    cache_dir = tmp_path / \"cache\"\n    download_config = DownloadConfig(cache_dir=cache_dir, extract_compressed_file=True)\n    extracted_path = cached_path(input_path, download_config=download_config)\n    with open(extracted_path) as f:\n        extracted_file_content = f.read()\n    with open(text_file) as f:\n        expected_file_content = f.read()\n    assert extracted_file_content == expected_file_content\n\n\n@pytest.mark.parametrize(\"default_extracted\", [True, False])\n@pytest.mark.parametrize(\"default_cache_dir\", [True, False])\ndef test_extracted_datasets_path(default_extracted, default_cache_dir, xz_file, tmp_path, monkeypatch):\n    custom_cache_dir = \"custom_cache\"\n    custom_extracted_dir = \"custom_extracted_dir\"\n    custom_extracted_path = tmp_path / \"custom_extracted_path\"\n    if default_extracted:\n        expected = (\"downloads\" if default_cache_dir else custom_cache_dir, \"extracted\")\n    else:\n        monkeypatch.setattr(\"datasets.config.EXTRACTED_DATASETS_DIR\", custom_extracted_dir)\n        monkeypatch.setattr(\"datasets.config.EXTRACTED_DATASETS_PATH\", str(custom_extracted_path))\n        expected = custom_extracted_path.parts[-2:] if default_cache_dir else (custom_cache_dir, custom_extracted_dir)\n\n    filename = xz_file\n    download_config = (\n        DownloadConfig(extract_compressed_file=True)\n        if default_cache_dir\n        else DownloadConfig(cache_dir=tmp_path / custom_cache_dir, extract_compressed_file=True)\n    )\n    extracted_file_path = cached_path(filename, download_config=download_config)\n    assert 
Path(extracted_file_path).parent.parts[-2:] == expected\n\n\ndef test_cached_path_local(text_file):\n    # input absolute path -> output absolute path\n    text_file_abs = str(Path(text_file).resolve())\n    assert os.path.samefile(cached_path(text_file_abs), text_file_abs)\n    # input relative path -> output absolute path\n    text_file = __file__\n    text_file_abs = str(Path(text_file).resolve())\n    text_file_rel = str(Path(text_file).resolve().relative_to(Path(os.getcwd())))\n    assert os.path.samefile(cached_path(text_file_rel), text_file_abs)\n\n\ndef test_cached_path_missing_local(tmp_path):\n    # absolute path\n    missing_file = str(tmp_path.resolve() / \"__missing_file__.txt\")\n    with pytest.raises(FileNotFoundError):\n        cached_path(missing_file)\n    # relative path\n    missing_file = \"./__missing_file__.txt\"\n    with pytest.raises(FileNotFoundError):\n        cached_path(missing_file)\n\n\ndef test_get_from_cache_fsspec(tmpfs_file):\n    output_path = get_from_cache(f\"tmp://{tmpfs_file}\")\n    with open(output_path) as f:\n        output_file_content = f.read()\n    assert output_file_content == FILE_CONTENT\n\n\n@patch(\"datasets.config.HF_HUB_OFFLINE\", True)\ndef test_cached_path_offline():\n    with pytest.raises(OfflineModeIsEnabled):\n        cached_path(\"https://huggingface.co\")\n\n\n@patch(\"datasets.config.HF_HUB_OFFLINE\", True)\ndef test_fsspec_offline(tmp_path_factory):\n    filename = tmp_path_factory.mktemp(\"data\") / \"file.html\"\n    with pytest.raises(OfflineModeIsEnabled):\n        fsspec_get(\"s3://huggingface.co\", temp_file=filename)\n    with pytest.raises(OfflineModeIsEnabled):\n        fsspec_head(\"s3://huggingface.co\")\n\n\n@pytest.mark.parametrize(\n    \"urlpath, download_config, expected_urlpath, expected_storage_options\",\n    [\n        (\n            \"https://huggingface.co/datasets/hf-internal-testing/dataset_with_data_files/resolve/main/data/train.txt\",\n            DownloadConfig(),\n        
    \"hf://datasets/hf-internal-testing/dataset_with_data_files@main/data/train.txt\",\n            {\"hf\": {\"endpoint\": \"https://huggingface.co\", \"token\": None}},\n        ),\n        (\n            \"https://huggingface.co/datasets/hf-internal-testing/dataset_with_data_files/resolve/main/data/train.txt\",\n            DownloadConfig(token=\"MY-TOKEN\"),\n            \"hf://datasets/hf-internal-testing/dataset_with_data_files@main/data/train.txt\",\n            {\"hf\": {\"endpoint\": \"https://huggingface.co\", \"token\": \"MY-TOKEN\"}},\n        ),\n        (\n            \"https://huggingface.co/datasets/hf-internal-testing/dataset_with_data_files/resolve/main/data/train.txt\",\n            DownloadConfig(token=\"MY-TOKEN\", storage_options={\"hf\": {\"on_error\": \"omit\"}}),\n            \"hf://datasets/hf-internal-testing/dataset_with_data_files@main/data/train.txt\",\n            {\"hf\": {\"endpoint\": \"https://huggingface.co\", \"token\": \"MY-TOKEN\", \"on_error\": \"omit\"}},\n        ),\n        (\n            \"https://domain.org/data.txt\",\n            DownloadConfig(),\n            \"https://domain.org/data.txt\",\n            {\"https\": {\"client_kwargs\": {\"trust_env\": True}}},\n        ),\n        (\n            \"https://domain.org/data.txt\",\n            DownloadConfig(storage_options={\"https\": {\"block_size\": \"omit\"}}),\n            \"https://domain.org/data.txt\",\n            {\"https\": {\"client_kwargs\": {\"trust_env\": True}, \"block_size\": \"omit\"}},\n        ),\n        (\n            \"https://domain.org/data.txt\",\n            DownloadConfig(storage_options={\"https\": {\"client_kwargs\": {\"raise_for_status\": True}}}),\n            \"https://domain.org/data.txt\",\n            {\"https\": {\"client_kwargs\": {\"trust_env\": True, \"raise_for_status\": True}}},\n        ),\n        (\n            \"https://domain.org/data.txt\",\n            DownloadConfig(storage_options={\"https\": {\"client_kwargs\": 
{\"trust_env\": False}}}),\n            \"https://domain.org/data.txt\",\n            {\"https\": {\"client_kwargs\": {\"trust_env\": False}}},\n        ),\n        (\n            \"https://raw.githubusercontent.com/data.txt\",\n            DownloadConfig(storage_options={\"https\": {\"headers\": {\"x-test\": \"true\"}}}),\n            \"https://raw.githubusercontent.com/data.txt\",\n            {\n                \"https\": {\n                    \"client_kwargs\": {\"trust_env\": True},\n                    \"headers\": {\"x-test\": \"true\", \"Accept-Encoding\": \"identity\"},\n                }\n            },\n        ),\n    ],\n)\ndef test_prepare_single_hop_path_and_storage_options(\n    urlpath, download_config, expected_urlpath, expected_storage_options\n):\n    original_download_config_storage_options = str(download_config.storage_options)\n    prepared_urlpath, storage_options = _prepare_single_hop_path_and_storage_options(urlpath, download_config)\n    assert prepared_urlpath == expected_urlpath\n    assert storage_options == expected_storage_options\n    # Check that DownloadConfig.storage_options are not modified:\n    assert str(download_config.storage_options) == original_download_config_storage_options\n\n\nclass DummyTestFS(AbstractFileSystem):\n    protocol = \"mock\"\n    _file_class = AbstractBufferedFile\n    _fs_contents = (\n        {\"name\": \"top_level\", \"type\": \"directory\"},\n        {\"name\": \"top_level/second_level\", \"type\": \"directory\"},\n        {\"name\": \"top_level/second_level/date=2019-10-01\", \"type\": \"directory\"},\n        {\n            \"name\": \"top_level/second_level/date=2019-10-01/a.parquet\",\n            \"type\": \"file\",\n            \"size\": 100,\n        },\n        {\n            \"name\": \"top_level/second_level/date=2019-10-01/b.parquet\",\n            \"type\": \"file\",\n            \"size\": 100,\n        },\n        {\"name\": \"top_level/second_level/date=2019-10-02\", \"type\": 
\"directory\"},\n        {\n            \"name\": \"top_level/second_level/date=2019-10-02/a.parquet\",\n            \"type\": \"file\",\n            \"size\": 100,\n        },\n        {\"name\": \"top_level/second_level/date=2019-10-04\", \"type\": \"directory\"},\n        {\n            \"name\": \"top_level/second_level/date=2019-10-04/a.parquet\",\n            \"type\": \"file\",\n            \"size\": 100,\n        },\n        {\"name\": \"misc\", \"type\": \"directory\"},\n        {\"name\": \"misc/foo.txt\", \"type\": \"file\", \"size\": 100},\n        {\"name\": \"glob_test\", \"type\": \"directory\", \"size\": 0},\n        {\"name\": \"glob_test/hat\", \"type\": \"directory\", \"size\": 0},\n        {\"name\": \"glob_test/hat/^foo.txt\", \"type\": \"file\", \"size\": 100},\n        {\"name\": \"glob_test/dollar\", \"type\": \"directory\", \"size\": 0},\n        {\"name\": \"glob_test/dollar/$foo.txt\", \"type\": \"file\", \"size\": 100},\n        {\"name\": \"glob_test/lbrace\", \"type\": \"directory\", \"size\": 0},\n        {\"name\": \"glob_test/lbrace/{foo.txt\", \"type\": \"file\", \"size\": 100},\n        {\"name\": \"glob_test/rbrace\", \"type\": \"directory\", \"size\": 0},\n        {\"name\": \"glob_test/rbrace/}foo.txt\", \"type\": \"file\", \"size\": 100},\n    )\n\n    def __getitem__(self, name):\n        for item in self._fs_contents:\n            if item[\"name\"] == name:\n                return item\n        raise IndexError(f\"{name} not found!\")\n\n    def ls(self, path, detail=True, refresh=True, **kwargs):\n        if kwargs.pop(\"strip_proto\", True):\n            path = self._strip_protocol(path)\n\n        files = not refresh and self._ls_from_cache(path)\n        if not files:\n            files = [file for file in self._fs_contents if path == self._parent(file[\"name\"])]\n            files.sort(key=lambda file: file[\"name\"])\n            self.dircache[path.rstrip(\"/\")] = files\n\n        if detail:\n            return 
files\n        return [file[\"name\"] for file in files]\n\n    def _open(\n        self,\n        path,\n        mode=\"rb\",\n        block_size=None,\n        autocommit=True,\n        cache_options=None,\n        **kwargs,\n    ):\n        return self._file_class(\n            self,\n            path,\n            mode,\n            block_size,\n            autocommit,\n            cache_options=cache_options,\n            **kwargs,\n        )\n\n\n@pytest.fixture\ndef mock_fsspec2():  # to avoid the name collision with `mock_fsspec` from fixtures/fsspec.py\n    _fsspec_registry[\"mock\"] = DummyTestFS\n    yield\n    del _fsspec_registry[\"mock\"]\n\n\ndef _readd_double_slash_removed_by_path(path_as_posix: str) -> str:\n    \"\"\"Path(...) on an url path like zip://file.txt::http://host.com/data.zip\n    converts the :// to :/\n    This function readds the ://\n\n    It handles cases like:\n\n    - https://host.com/data.zip\n    - C://data.zip\n    - zip://file.txt::https://host.com/data.zip\n    - zip://file.txt::/Users/username/data.zip\n    - zip://file.txt::C://data.zip\n\n    Args:\n        path_as_posix (str): output of Path(...).as_posix()\n\n    Returns:\n        str: the url path with :// instead of :/\n    \"\"\"\n    return re.sub(\"([A-z]:/)([A-z:])\", r\"\\g<1>/\\g<2>\", path_as_posix)\n\n\n@pytest.mark.parametrize(\n    \"input_path, paths_to_join, expected_path\",\n    [\n        (\n            \"https://host.com/archive.zip\",\n            (\"file.txt\",),\n            \"https://host.com/archive.zip/file.txt\",\n        ),\n        (\n            \"zip://::https://host.com/archive.zip\",\n            (\"file.txt\",),\n            \"zip://file.txt::https://host.com/archive.zip\",\n        ),\n        (\n            \"zip://folder::https://host.com/archive.zip\",\n            (\"file.txt\",),\n            \"zip://folder/file.txt::https://host.com/archive.zip\",\n        ),\n        (\n            \".\",\n            (\"file.txt\",),\n            
os.path.join(\".\", \"file.txt\"),\n        ),\n        (\n            str(Path().resolve()),\n            (\"file.txt\",),\n            str((Path().resolve() / \"file.txt\")),\n        ),\n    ],\n)\ndef test_xjoin(input_path, paths_to_join, expected_path):\n    output_path = xjoin(input_path, *paths_to_join)\n    assert output_path == expected_path\n    output_path = xPath(input_path).joinpath(*paths_to_join)\n    assert output_path == xPath(expected_path)\n\n\n@pytest.mark.parametrize(\n    \"input_path, expected_path\",\n    [\n        (str(Path(__file__).resolve()), str(Path(__file__).resolve().parent)),\n        (\"https://host.com/archive.zip\", \"https://host.com\"),\n        (\n            \"zip://file.txt::https://host.com/archive.zip\",\n            \"zip://::https://host.com/archive.zip\",\n        ),\n        (\n            \"zip://folder/file.txt::https://host.com/archive.zip\",\n            \"zip://folder::https://host.com/archive.zip\",\n        ),\n    ],\n)\ndef test_xdirname(input_path, expected_path):\n    output_path = xdirname(input_path)\n    output_path = _readd_double_slash_removed_by_path(Path(output_path).as_posix())\n    assert output_path == _readd_double_slash_removed_by_path(Path(expected_path).as_posix())\n\n\n@pytest.mark.parametrize(\n    \"input_path, exists\",\n    [\n        (\"tmp_path/file.txt\", True),\n        (\"tmp_path/file_that_doesnt_exist.txt\", False),\n        (\"mock://top_level/second_level/date=2019-10-01/a.parquet\", True),\n        (\"mock://top_level/second_level/date=2019-10-01/file_that_doesnt_exist.parquet\", False),\n    ],\n)\ndef test_xexists(input_path, exists, tmp_path, mock_fsspec2):\n    if input_path.startswith(\"tmp_path\"):\n        input_path = input_path.replace(\"/\", os.sep).replace(\"tmp_path\", str(tmp_path))\n        (tmp_path / \"file.txt\").touch()\n    assert xexists(input_path) is exists\n\n\n@pytest.mark.integration\ndef test_xexists_private(hf_private_dataset_repo_txt_data, 
hf_token):\n    root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, \"\")\n    download_config = DownloadConfig(token=hf_token)\n    assert xexists(root_url + \"data/text_data.txt\", download_config=download_config)\n    assert not xexists(root_url + \"file_that_doesnt_exist.txt\", download_config=download_config)\n\n\n@pytest.mark.parametrize(\n    \"input_path, expected_head_and_tail\",\n    [\n        (\n            str(Path(__file__).resolve()),\n            (str(Path(__file__).resolve().parent), str(Path(__file__).resolve().name)),\n        ),\n        (\"https://host.com/archive.zip\", (\"https://host.com\", \"archive.zip\")),\n        (\"zip://file.txt::https://host.com/archive.zip\", (\"zip://::https://host.com/archive.zip\", \"file.txt\")),\n        (\"zip://folder::https://host.com/archive.zip\", (\"zip://::https://host.com/archive.zip\", \"folder\")),\n        (\"zip://::https://host.com/archive.zip\", (\"zip://::https://host.com/archive.zip\", \"\")),\n    ],\n)\ndef test_xsplit(input_path, expected_head_and_tail):\n    output_path, tail = xsplit(input_path)\n    expected_path, expected_tail = expected_head_and_tail\n    output_path = _readd_double_slash_removed_by_path(Path(output_path).as_posix())\n    expected_path = _readd_double_slash_removed_by_path(Path(expected_path).as_posix())\n    assert output_path == expected_path\n    assert tail == expected_tail\n\n\n@pytest.mark.parametrize(\n    \"input_path, expected_path_and_ext\",\n    [\n        (\n            str(Path(__file__).resolve()),\n            (str(Path(__file__).resolve().with_suffix(\"\")), str(Path(__file__).resolve().suffix)),\n        ),\n        (\"https://host.com/archive.zip\", (\"https://host.com/archive\", \".zip\")),\n        (\"zip://file.txt::https://host.com/archive.zip\", (\"zip://file::https://host.com/archive.zip\", \".txt\")),\n        (\"zip://folder::https://host.com/archive.zip\", (\"zip://folder::https://host.com/archive.zip\", \"\")),\n        
(\"zip://::https://host.com/archive.zip\", (\"zip://::https://host.com/archive.zip\", \"\")),\n    ],\n)\ndef test_xsplitext(input_path, expected_path_and_ext):\n    output_path, ext = xsplitext(input_path)\n    expected_path, expected_ext = expected_path_and_ext\n    output_path = _readd_double_slash_removed_by_path(Path(output_path).as_posix())\n    expected_path = _readd_double_slash_removed_by_path(Path(expected_path).as_posix())\n    assert output_path == expected_path\n    assert ext == expected_ext\n\n\ndef test_xopen_local(text_path):\n    with xopen(text_path, \"r\", encoding=\"utf-8\") as f, open(text_path, encoding=\"utf-8\") as expected_file:\n        assert list(f) == list(expected_file)\n    with xPath(text_path).open(\"r\", encoding=\"utf-8\") as f, open(text_path, encoding=\"utf-8\") as expected_file:\n        assert list(f) == list(expected_file)\n\n\n@pytest.mark.integration\ndef test_xopen_remote():\n    with xopen(TEST_URL, \"r\", encoding=\"utf-8\") as f:\n        assert list(f) == TEST_URL_CONTENT.splitlines(keepends=True)\n    with xPath(TEST_URL).open(\"r\", encoding=\"utf-8\") as f:\n        assert list(f) == TEST_URL_CONTENT.splitlines(keepends=True)\n\n\n@pytest.mark.parametrize(\n    \"input_path, expected_paths\",\n    [\n        (\"tmp_path\", [\"file1.txt\", \"file2.txt\"]),\n        (\"mock://\", [\"glob_test\", \"misc\", \"top_level\"]),\n        (\"mock://top_level\", [\"second_level\"]),\n        (\"mock://top_level/second_level/date=2019-10-01\", [\"a.parquet\", \"b.parquet\"]),\n    ],\n)\ndef test_xlistdir(input_path, expected_paths, tmp_path, mock_fsspec2):\n    if input_path.startswith(\"tmp_path\"):\n        input_path = input_path.replace(\"/\", os.sep).replace(\"tmp_path\", str(tmp_path))\n        for file in [\"file1.txt\", \"file2.txt\"]:\n            (tmp_path / file).touch()\n    output_paths = sorted(xlistdir(input_path))\n    assert output_paths == expected_paths\n\n\n@pytest.mark.integration\ndef 
test_xlistdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token):\n    root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, \"data.zip\")\n    download_config = DownloadConfig(token=hf_token)\n    assert len(xlistdir(\"zip://::\" + root_url, download_config=download_config)) == 1\n    assert len(xlistdir(\"zip://main_dir::\" + root_url, download_config=download_config)) == 2\n    with pytest.raises(FileNotFoundError):\n        xlistdir(\"zip://qwertyuiop::\" + root_url, download_config=download_config)\n    with pytest.raises(FileNotFoundError):\n        xlistdir(root_url, download_config=download_config)\n\n\n@pytest.mark.parametrize(\n    \"input_path, isdir\",\n    [\n        (\"tmp_path\", True),\n        (\"tmp_path/file.txt\", False),\n        (\"mock://\", True),\n        (\"mock://top_level\", True),\n        (\"mock://dir_that_doesnt_exist\", False),\n    ],\n)\ndef test_xisdir(input_path, isdir, tmp_path, mock_fsspec2):\n    if input_path.startswith(\"tmp_path\"):\n        input_path = input_path.replace(\"/\", os.sep).replace(\"tmp_path\", str(tmp_path))\n        (tmp_path / \"file.txt\").touch()\n    assert xisdir(input_path) == isdir\n\n\n@pytest.mark.integration\ndef test_xisdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token):\n    root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, \"data.zip\")\n    download_config = DownloadConfig(token=hf_token)\n    assert xisdir(\"zip://::\" + root_url, download_config=download_config) is True\n    assert xisdir(\"zip://main_dir::\" + root_url, download_config=download_config) is True\n    assert xisdir(\"zip://qwertyuiop::\" + root_url, download_config=download_config) is False\n    assert xisdir(root_url, download_config=download_config) is False\n\n\n@pytest.mark.parametrize(\n    \"input_path, isfile\",\n    [\n        (\"tmp_path/file.txt\", True),\n        (\"tmp_path/file_that_doesnt_exist.txt\", False),\n        (\"mock://\", False),\n        
(\"mock://top_level/second_level/date=2019-10-01/a.parquet\", True),\n    ],\n)\ndef test_xisfile(input_path, isfile, tmp_path, mock_fsspec2):\n    if input_path.startswith(\"tmp_path\"):\n        input_path = input_path.replace(\"/\", os.sep).replace(\"tmp_path\", str(tmp_path))\n        (tmp_path / \"file.txt\").touch()\n    assert xisfile(input_path) == isfile\n\n\n@pytest.mark.integration\ndef test_xisfile_private(hf_private_dataset_repo_txt_data, hf_token):\n    root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, \"\")\n    download_config = DownloadConfig(token=hf_token)\n    assert xisfile(root_url + \"data/text_data.txt\", download_config=download_config) is True\n    assert xisfile(root_url + \"qwertyuiop\", download_config=download_config) is False\n\n\n@pytest.mark.parametrize(\n    \"input_path, size\",\n    [\n        (\"tmp_path/file.txt\", 100),\n        (\"mock://\", 0),\n        (\"mock://top_level/second_level/date=2019-10-01/a.parquet\", 100),\n    ],\n)\ndef test_xgetsize(input_path, size, tmp_path, mock_fsspec2):\n    if input_path.startswith(\"tmp_path\"):\n        input_path = input_path.replace(\"/\", os.sep).replace(\"tmp_path\", str(tmp_path))\n        (tmp_path / \"file.txt\").touch()\n        (tmp_path / \"file.txt\").write_bytes(b\"x\" * 100)\n    assert xgetsize(input_path) == size\n\n\n@pytest.mark.integration\ndef test_xgetsize_private(hf_private_dataset_repo_txt_data, hf_token):\n    root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, \"\")\n    download_config = DownloadConfig(token=hf_token)\n    assert xgetsize(root_url + \"data/text_data.txt\", download_config=download_config) == 39\n    with pytest.raises(FileNotFoundError):\n        xgetsize(root_url + \"qwertyuiop\", download_config=download_config)\n\n\n@pytest.mark.parametrize(\n    \"input_path, expected_paths\",\n    [\n        (\"tmp_path/*.txt\", [\"file1.txt\", \"file2.txt\"]),\n        (\"mock://*\", [\"mock://glob_test\", \"mock://misc\", 
\"mock://top_level\"]),\n        (\"mock://top_*\", [\"mock://top_level\"]),\n        (\n            \"mock://top_level/second_level/date=2019-10-0[1-4]\",\n            [\n                \"mock://top_level/second_level/date=2019-10-01\",\n                \"mock://top_level/second_level/date=2019-10-02\",\n                \"mock://top_level/second_level/date=2019-10-04\",\n            ],\n        ),\n        (\n            \"mock://top_level/second_level/date=2019-10-0[1-4]/*\",\n            [\n                \"mock://top_level/second_level/date=2019-10-01/a.parquet\",\n                \"mock://top_level/second_level/date=2019-10-01/b.parquet\",\n                \"mock://top_level/second_level/date=2019-10-02/a.parquet\",\n                \"mock://top_level/second_level/date=2019-10-04/a.parquet\",\n            ],\n        ),\n    ],\n)\ndef test_xglob(input_path, expected_paths, tmp_path, mock_fsspec2):\n    if input_path.startswith(\"tmp_path\"):\n        input_path = input_path.replace(\"/\", os.sep).replace(\"tmp_path\", str(tmp_path))\n        expected_paths = [str(tmp_path / file) for file in expected_paths]\n        for file in [\"file1.txt\", \"file2.txt\", \"README.md\"]:\n            (tmp_path / file).touch()\n    output_paths = sorted(xglob(input_path))\n    assert output_paths == expected_paths\n\n\n@pytest.mark.integration\ndef test_xglob_private(hf_private_dataset_repo_zipped_txt_data, hf_token):\n    root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, \"data.zip\")\n    download_config = DownloadConfig(token=hf_token)\n    assert len(xglob(\"zip://**::\" + root_url, download_config=download_config)) == 3\n    assert len(xglob(\"zip://qwertyuiop/*::\" + root_url, download_config=download_config)) == 0\n\n\n@pytest.mark.parametrize(\n    \"input_path, expected_outputs\",\n    [\n        (\"tmp_path\", [(\"\", [], [\"file1.txt\", \"file2.txt\", \"README.md\"])]),\n        (\n            \"mock://top_level/second_level\",\n            [\n 
               (\"mock://top_level/second_level\", [\"date=2019-10-01\", \"date=2019-10-02\", \"date=2019-10-04\"], []),\n                (\"mock://top_level/second_level/date=2019-10-01\", [], [\"a.parquet\", \"b.parquet\"]),\n                (\"mock://top_level/second_level/date=2019-10-02\", [], [\"a.parquet\"]),\n                (\"mock://top_level/second_level/date=2019-10-04\", [], [\"a.parquet\"]),\n            ],\n        ),\n    ],\n)\ndef test_xwalk(input_path, expected_outputs, tmp_path, mock_fsspec2):\n    if input_path.startswith(\"tmp_path\"):\n        input_path = input_path.replace(\"/\", os.sep).replace(\"tmp_path\", str(tmp_path))\n        expected_outputs = sorted(\n            [\n                (str(tmp_path / dirpath).rstrip(\"/\"), sorted(dirnames), sorted(filenames))\n                for dirpath, dirnames, filenames in expected_outputs\n            ]\n        )\n        for file in [\"file1.txt\", \"file2.txt\", \"README.md\"]:\n            (tmp_path / file).touch()\n    outputs = sorted(xwalk(input_path))\n    outputs = [(dirpath, sorted(dirnames), sorted(filenames)) for dirpath, dirnames, filenames in outputs]\n    assert outputs == expected_outputs\n\n\n@pytest.mark.integration\ndef test_xwalk_private(hf_private_dataset_repo_zipped_txt_data, hf_token):\n    root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, \"data.zip\")\n    download_config = DownloadConfig(token=hf_token)\n    assert len(list(xwalk(\"zip://::\" + root_url, download_config=download_config))) == 2\n    assert len(list(xwalk(\"zip://main_dir::\" + root_url, download_config=download_config))) == 1\n    assert len(list(xwalk(\"zip://qwertyuiop::\" + root_url, download_config=download_config))) == 0\n\n\n@pytest.mark.parametrize(\n    \"input_path, start_path, expected_path\",\n    [\n        (\"dir1/dir2/file.txt\".replace(\"/\", os.path.sep), \"dir1\", \"dir2/file.txt\".replace(\"/\", os.path.sep)),\n        (\"dir1/dir2/file.txt\".replace(\"/\", 
os.path.sep), \"dir1/dir2\".replace(\"/\", os.path.sep), \"file.txt\"),\n        (\"zip://file.txt::https://host.com/archive.zip\", \"zip://::https://host.com/archive.zip\", \"file.txt\"),\n        (\n            \"zip://folder/file.txt::https://host.com/archive.zip\",\n            \"zip://::https://host.com/archive.zip\",\n            \"folder/file.txt\",\n        ),\n        (\n            \"zip://folder/file.txt::https://host.com/archive.zip\",\n            \"zip://folder::https://host.com/archive.zip\",\n            \"file.txt\",\n        ),\n    ],\n)\ndef test_xrelpath(input_path, start_path, expected_path):\n    output_path = xrelpath(input_path, start=start_path)\n    assert output_path == expected_path\n\n\nclass TestxPath:\n    @pytest.mark.parametrize(\n        \"input_path\",\n        [\n            \"https://host.com/archive.zip\",\n            \"zip://file.txt::https://host.com/archive.zip\",\n            \"zip://dir/file.txt::https://host.com/archive.zip\",\n            \"file.txt\",\n            str(Path().resolve() / \"file.txt\"),\n        ],\n    )\n    def test_xpath_str(self, input_path):\n        assert str(xPath(input_path)) == input_path\n\n    @pytest.mark.parametrize(\n        \"input_path, expected_path\",\n        [\n            (\"https://host.com/archive.zip\", \"https://host.com/archive.zip\"),\n            (\"zip://file.txt::https://host.com/archive.zip\", \"zip://file.txt::https://host.com/archive.zip\"),\n            (\"zip://dir/file.txt::https://host.com/archive.zip\", \"zip://dir/file.txt::https://host.com/archive.zip\"),\n            (\"file.txt\", \"file.txt\"),\n            (str(Path().resolve() / \"file.txt\"), (Path().resolve() / \"file.txt\").as_posix()),\n        ],\n    )\n    def test_xpath_as_posix(self, input_path, expected_path):\n        assert xPath(input_path).as_posix() == expected_path\n\n    @pytest.mark.parametrize(\n        \"input_path, exists\",\n        [\n            (\"tmp_path/file.txt\", True),\n       
     (\"tmp_path/file_that_doesnt_exist.txt\", False),\n            (\"mock://top_level/second_level/date=2019-10-01/a.parquet\", True),\n            (\"mock://top_level/second_level/date=2019-10-01/file_that_doesnt_exist.parquet\", False),\n        ],\n    )\n    def test_xpath_exists(self, input_path, exists, tmp_path, mock_fsspec2):\n        if input_path.startswith(\"tmp_path\"):\n            input_path = input_path.replace(\"/\", os.sep).replace(\"tmp_path\", str(tmp_path))\n            (tmp_path / \"file.txt\").touch()\n        assert xexists(input_path) is exists\n\n    @pytest.mark.parametrize(\n        \"input_path, pattern, expected_paths\",\n        [\n            (\"tmp_path\", \"*.txt\", [\"file1.txt\", \"file2.txt\"]),\n            (\"mock://\", \"*\", [\"mock://glob_test\", \"mock://misc\", \"mock://top_level\"]),\n            (\"mock://\", \"top_*\", [\"mock://top_level\"]),\n            (\n                \"mock://top_level/second_level\",\n                \"date=2019-10-0[1-4]\",\n                [\n                    \"mock://top_level/second_level/date=2019-10-01\",\n                    \"mock://top_level/second_level/date=2019-10-02\",\n                    \"mock://top_level/second_level/date=2019-10-04\",\n                ],\n            ),\n            (\n                \"mock://top_level/second_level\",\n                \"date=2019-10-0[1-4]/*\",\n                [\n                    \"mock://top_level/second_level/date=2019-10-01/a.parquet\",\n                    \"mock://top_level/second_level/date=2019-10-01/b.parquet\",\n                    \"mock://top_level/second_level/date=2019-10-02/a.parquet\",\n                    \"mock://top_level/second_level/date=2019-10-04/a.parquet\",\n                ],\n            ),\n        ],\n    )\n    def test_xpath_glob(self, input_path, pattern, expected_paths, tmp_path, mock_fsspec2):\n        if input_path == \"tmp_path\":\n            input_path = tmp_path\n            expected_paths = 
[str(tmp_path / file) for file in expected_paths]\n            for file in [\"file1.txt\", \"file2.txt\", \"README.md\"]:\n                (tmp_path / file).touch()\n        output_paths = sorted([str(path) for path in xPath(input_path).glob(pattern)])\n        assert output_paths == expected_paths\n\n    @pytest.mark.parametrize(\n        \"input_path, pattern, expected_paths\",\n        [\n            (\"tmp_path\", \"*.txt\", [\"file1.txt\", \"file2.txt\"]),\n            (\n                \"mock://\",\n                \"date=2019-10-0[1-4]\",\n                [\n                    \"mock://top_level/second_level/date=2019-10-01\",\n                    \"mock://top_level/second_level/date=2019-10-02\",\n                    \"mock://top_level/second_level/date=2019-10-04\",\n                ],\n            ),\n            (\n                \"mock://top_level\",\n                \"date=2019-10-0[1-4]\",\n                [\n                    \"mock://top_level/second_level/date=2019-10-01\",\n                    \"mock://top_level/second_level/date=2019-10-02\",\n                    \"mock://top_level/second_level/date=2019-10-04\",\n                ],\n            ),\n            (\n                \"mock://\",\n                \"date=2019-10-0[1-4]/*\",\n                [\n                    \"mock://top_level/second_level/date=2019-10-01/a.parquet\",\n                    \"mock://top_level/second_level/date=2019-10-01/b.parquet\",\n                    \"mock://top_level/second_level/date=2019-10-02/a.parquet\",\n                    \"mock://top_level/second_level/date=2019-10-04/a.parquet\",\n                ],\n            ),\n            (\n                \"mock://top_level\",\n                \"date=2019-10-0[1-4]/*\",\n                [\n                    \"mock://top_level/second_level/date=2019-10-01/a.parquet\",\n                    \"mock://top_level/second_level/date=2019-10-01/b.parquet\",\n                    
\"mock://top_level/second_level/date=2019-10-02/a.parquet\",\n                    \"mock://top_level/second_level/date=2019-10-04/a.parquet\",\n                ],\n            ),\n        ],\n    )\n    def test_xpath_rglob(self, input_path, pattern, expected_paths, tmp_path, mock_fsspec2):\n        if input_path == \"tmp_path\":\n            input_path = tmp_path\n            dir_path = tmp_path / \"dir\"\n            dir_path.mkdir()\n            expected_paths = [str(dir_path / file) for file in expected_paths]\n            for file in [\"file1.txt\", \"file2.txt\", \"README.md\"]:\n                (dir_path / file).touch()\n        output_paths = sorted([str(path) for path in xPath(input_path).rglob(pattern)])\n        assert output_paths == expected_paths\n\n    @pytest.mark.parametrize(\n        \"input_path, expected_path\",\n        [\n            (\"https://host.com/archive.zip\", \"https://host.com\"),\n            (\"zip://file.txt::https://host.com/archive.zip\", \"zip://::https://host.com/archive.zip\"),\n            (\"zip://dir/file.txt::https://host.com/archive.zip\", \"zip://dir::https://host.com/archive.zip\"),\n            (\"file.txt\", \"\"),\n            (str(Path().resolve() / \"file.txt\"), str(Path().resolve())),\n        ],\n    )\n    def test_xpath_parent(self, input_path, expected_path):\n        assert xPath(input_path).parent == xPath(expected_path)\n\n    @pytest.mark.parametrize(\n        \"input_path, expected\",\n        [\n            (\"https://host.com/archive.zip\", \"archive.zip\"),\n            (\"zip://file.txt::https://host.com/archive.zip\", \"file.txt\"),\n            (\"zip://dir/file.txt::https://host.com/archive.zip\", \"file.txt\"),\n            (\"file.txt\", \"file.txt\"),\n            (str(Path().resolve() / \"file.txt\"), \"file.txt\"),\n        ],\n    )\n    def test_xpath_name(self, input_path, expected):\n        assert xPath(input_path).name == expected\n\n    @pytest.mark.parametrize(\n        \"input_path, 
expected\",\n        [\n            (\"https://host.com/archive.zip\", \"archive\"),\n            (\"zip://file.txt::https://host.com/archive.zip\", \"file\"),\n            (\"zip://dir/file.txt::https://host.com/archive.zip\", \"file\"),\n            (\"file.txt\", \"file\"),\n            (str(Path().resolve() / \"file.txt\"), \"file\"),\n        ],\n    )\n    def test_xpath_stem(self, input_path, expected):\n        assert xPath(input_path).stem == expected\n\n    @pytest.mark.parametrize(\n        \"input_path, expected\",\n        [\n            (\"https://host.com/archive.zip\", \".zip\"),\n            (\"zip://file.txt::https://host.com/archive.zip\", \".txt\"),\n            (\"zip://dir/file.txt::https://host.com/archive.zip\", \".txt\"),\n            (\"file.txt\", \".txt\"),\n            (str(Path().resolve() / \"file.txt\"), \".txt\"),\n        ],\n    )\n    def test_xpath_suffix(self, input_path, expected):\n        assert xPath(input_path).suffix == expected\n\n    @pytest.mark.parametrize(\n        \"input_path, suffix, expected\",\n        [\n            (\"https://host.com/archive.zip\", \".ann\", \"https://host.com/archive.ann\"),\n            (\"zip://file.txt::https://host.com/archive.zip\", \".ann\", \"zip://file.ann::https://host.com/archive.zip\"),\n            (\n                \"zip://dir/file.txt::https://host.com/archive.zip\",\n                \".ann\",\n                \"zip://dir/file.ann::https://host.com/archive.zip\",\n            ),\n            (\"file.txt\", \".ann\", \"file.ann\"),\n            (str(Path().resolve() / \"file.txt\"), \".ann\", str(Path().resolve() / \"file.ann\")),\n        ],\n    )\n    def test_xpath_with_suffix(self, input_path, suffix, expected):\n        assert xPath(input_path).with_suffix(suffix) == xPath(expected)\n\n\n@pytest.mark.parametrize(\n    \"urlpath, expected_protocol\",\n    [\n        (\"zip://train-00000.json.gz::https://foo.bar/data.zip\", \"gzip\"),\n        
(\"https://foo.bar/train.json.gz?dl=1\", \"gzip\"),\n        (\"http://opus.nlpl.eu/download.php?f=Bianet/v1/moses/en-ku.txt.zip\", \"zip\"),\n        (\"https://github.com/user/what-time-is-it/blob/master/gutenberg_time_phrases.zip?raw=true\", \"zip\"),\n        (\"https://github.com/user/repo/blob/master/data/morph_train.tsv?raw=true\", None),\n        (\"https://repo.org/bitstream/handle/20.500.12185/346/annotated_corpus.zip?sequence=3&isAllowed=y\", \"zip\"),\n        (\"https://zenodo.org/record/2787612/files/SICK.zip?download=1\", \"zip\"),\n    ],\n)\ndef test_get_extraction_protocol(urlpath, expected_protocol):\n    assert _get_extraction_protocol(urlpath) == expected_protocol\n\n\n@pytest.mark.parametrize(\n    \"urlpath, expected_protocol\",\n    [\n        (TEST_GG_DRIVE_GZIPPED_URL, \"gzip\"),\n        (TEST_GG_DRIVE_ZIPPED_URL, \"zip\"),\n    ],\n)\n@slow  # otherwise it spams Google Drive and the CI gets banned\ndef test_get_extraction_protocol_gg_drive(urlpath, expected_protocol):\n    assert _get_extraction_protocol(urlpath) == expected_protocol\n\n\n@slow  # otherwise it spams Google Drive and the CI gets banned\n@pytest.mark.integration\ndef test_streaming_gg_drive():\n    with xopen(TEST_GG_DRIVE_URL) as f:\n        assert f.read() == TEST_GG_DRIVE_CONTENT\n\n\ndef test_xnumpy_load(tmp_path):\n    import numpy as np\n\n    expected_x = np.arange(10)\n    npy_path = tmp_path / \"data-x.npy\"\n    np.save(npy_path, expected_x)\n    x = xnumpy_load(npy_path)\n    assert np.array_equal(x, expected_x)\n\n    npz_path = tmp_path / \"data.npz\"\n    np.savez(npz_path, x=expected_x)\n    with xnumpy_load(npz_path) as f:\n        x = f[\"x\"]\n    assert np.array_equal(x, expected_x)\n"
  },
  {
    "path": "tests/test_filelock.py",
    "content": "import os\n\nfrom datasets.utils._filelock import FileLock\n\n\ndef test_long_path(tmpdir):\n    filename = \"a\" * 1000 + \".lock\"\n    lock1 = FileLock(str(tmpdir / filename))\n    assert lock1.lock_file.endswith(\".lock\")\n    assert not lock1.lock_file.endswith(filename)\n    assert len(os.path.basename(lock1.lock_file)) <= 255\n"
  },
  {
    "path": "tests/test_filesystem.py",
    "content": "import os\n\nimport fsspec\nimport pytest\nfrom fsspec.core import url_to_fs\nfrom fsspec.registry import _registry as _fsspec_registry\n\nfrom datasets.filesystems import COMPRESSION_FILESYSTEMS, is_remote_filesystem\n\nfrom .utils import require_lz4, require_zstandard\n\n\ndef test_mockfs(mockfs):\n    assert \"mock\" in _fsspec_registry\n    assert \"bz2\" in _fsspec_registry\n\n\ndef test_non_mockfs():\n    assert \"mock\" not in _fsspec_registry\n    assert \"bz2\" in _fsspec_registry\n\n\ndef test_is_remote_filesystem(mockfs):\n    is_remote = is_remote_filesystem(mockfs)\n    assert is_remote is True\n\n    fs = fsspec.filesystem(\"file\")\n\n    is_remote = is_remote_filesystem(fs)\n    assert is_remote is False\n\n\n@pytest.mark.parametrize(\"compression_fs_class\", COMPRESSION_FILESYSTEMS)\ndef test_compression_filesystems(compression_fs_class, gz_file, bz2_file, lz4_file, zstd_file, xz_file, text_file):\n    input_paths = {\"gzip\": gz_file, \"xz\": xz_file, \"zstd\": zstd_file, \"bz2\": bz2_file, \"lz4\": lz4_file}\n    input_path = input_paths[compression_fs_class.protocol]\n    if input_path is None:\n        reason = f\"for '{compression_fs_class.protocol}' compression protocol, \"\n        if compression_fs_class.protocol == \"lz4\":\n            reason += require_lz4.kwargs[\"reason\"]\n        elif compression_fs_class.protocol == \"zstd\":\n            reason += require_zstandard.kwargs[\"reason\"]\n        pytest.skip(reason)\n    fs = fsspec.filesystem(compression_fs_class.protocol, fo=input_path)\n    expected_filename = os.path.basename(input_path)\n    expected_filename = expected_filename[: expected_filename.rindex(\".\")]\n    assert fs.glob(\"*\") == [expected_filename]\n    with fs.open(expected_filename, \"r\", encoding=\"utf-8\") as f, open(text_file, encoding=\"utf-8\") as expected_file:\n        assert f.read() == expected_file.read()\n\n\n@pytest.mark.parametrize(\"protocol\", [\"zip\", \"gzip\"])\ndef 
test_fs_isfile(protocol, zip_jsonl_path, jsonl_gz_path):\n    compressed_file_paths = {\"zip\": zip_jsonl_path, \"gzip\": jsonl_gz_path}\n    compressed_file_path = compressed_file_paths[protocol]\n    member_file_path = \"dataset.jsonl\"\n    path = f\"{protocol}://{member_file_path}::{compressed_file_path}\"\n    fs, *_ = url_to_fs(path)\n    assert fs.isfile(member_file_path)\n    assert not fs.isfile(\"non_existing_\" + member_file_path)\n"
  },
  {
    "path": "tests/test_fingerprint.py",
    "content": "import json\nimport os\nimport pickle\nimport subprocess\nfrom functools import partial\nfrom pathlib import Path\nfrom tempfile import gettempdir\nfrom textwrap import dedent\nfrom types import FunctionType\nfrom unittest import TestCase\nfrom unittest.mock import patch\n\nimport numpy as np\nimport pytest\nfrom multiprocess import Pool\n\nimport datasets\nfrom datasets import config\nfrom datasets.fingerprint import Hasher, fingerprint_transform\nfrom datasets.table import InMemoryTable\n\nfrom .utils import (\n    require_not_windows,\n    require_numpy1_on_windows,\n    require_regex,\n    require_spacy,\n    require_tiktoken,\n    require_torch,\n    require_torch_compile,\n    require_transformers,\n)\n\n\nclass Foo:\n    def __init__(self, foo):\n        self.foo = foo\n\n    def __call__(self):\n        return self.foo\n\n\nclass DatasetChild(datasets.Dataset):\n    @fingerprint_transform(inplace=False)\n    def func1(self, new_fingerprint, *args, **kwargs):\n        return DatasetChild(self.data, fingerprint=new_fingerprint)\n\n    @fingerprint_transform(inplace=False)\n    def func2(self, new_fingerprint, *args, **kwargs):\n        return DatasetChild(self.data, fingerprint=new_fingerprint)\n\n\nclass UnpicklableCallable:\n    def __init__(self, callable):\n        self.callable = callable\n\n    def __call__(self, *args, **kwargs):\n        if self.callable is not None:\n            return self.callable(*args, **kwargs)\n\n    def __getstate__(self):\n        raise pickle.PicklingError()\n\n\nif config.TORCH_AVAILABLE:\n    import torch\n    import torch.nn as nn\n    import torch.nn.functional as F\n\n    class TorchModule(nn.Module):\n        def __init__(self):\n            super().__init__()\n            self.conv1 = nn.Conv2d(1, 20, 5)\n            self.conv2 = nn.Conv2d(20, 20, 5)\n\n        def forward(self, x):\n            x = F.relu(self.conv1(x))\n            return F.relu(self.conv2(x))\nelse:\n    TorchModule = 
None\n\n\nclass TokenizersHashTest(TestCase):\n    @require_transformers\n    @pytest.mark.integration\n    def test_hash_tokenizer(self):\n        from transformers import AutoTokenizer\n\n        def encode(x):\n            return tokenizer(x)\n\n        # TODO: add hash consistency tests across sessions\n        tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n        hash1 = Hasher.hash(tokenizer)\n        hash1_lambda = Hasher.hash(lambda x: tokenizer(x))\n        hash1_encode = Hasher.hash(encode)\n        tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n        hash2 = Hasher.hash(tokenizer)\n        hash2_lambda = Hasher.hash(lambda x: tokenizer(x))\n        hash2_encode = Hasher.hash(encode)\n        tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n        hash3 = Hasher.hash(tokenizer)\n        hash3_lambda = Hasher.hash(lambda x: tokenizer(x))\n        hash3_encode = Hasher.hash(encode)\n        self.assertEqual(hash1, hash3)\n        self.assertNotEqual(hash1, hash2)\n        self.assertEqual(hash1_lambda, hash3_lambda)\n        self.assertNotEqual(hash1_lambda, hash2_lambda)\n        self.assertEqual(hash1_encode, hash3_encode)\n        self.assertNotEqual(hash1_encode, hash2_encode)\n\n    @require_transformers\n    @pytest.mark.integration\n    def test_hash_tokenizer_with_cache(self):\n        from transformers import AutoTokenizer\n\n        tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n        hash1 = Hasher.hash(tokenizer)\n        tokenizer(\"Hello world !\")  # call once to change the tokenizer's cache\n        hash2 = Hasher.hash(tokenizer)\n        self.assertEqual(hash1, hash2)\n\n    @require_regex\n    def test_hash_regex(self):\n        import regex\n\n        pat = regex.Regex(\"foo\")\n        hash1 = Hasher.hash(pat)\n        pat = regex.Regex(\"bar\")\n        hash2 = Hasher.hash(pat)\n        pat = regex.Regex(\"foo\")\n        hash3 = Hasher.hash(pat)\n        
self.assertEqual(hash1, hash3)\n        self.assertNotEqual(hash1, hash2)\n\n\nclass RecurseHashTest(TestCase):\n    def test_recurse_hash_for_function(self):\n        def func():\n            return foo\n\n        foo = [0]\n        hash1 = Hasher.hash(func)\n        foo = [1]\n        hash2 = Hasher.hash(func)\n        foo = [0]\n        hash3 = Hasher.hash(func)\n        self.assertEqual(hash1, hash3)\n        self.assertNotEqual(hash1, hash2)\n\n    def test_hash_ignores_line_definition_of_function(self):\n        def func():\n            pass\n\n        hash1 = Hasher.hash(func)\n\n        def func():\n            pass\n\n        hash2 = Hasher.hash(func)\n        self.assertEqual(hash1, hash2)\n\n    def test_recurse_hash_for_class(self):\n        hash1 = Hasher.hash(Foo([0]))\n        hash2 = Hasher.hash(Foo([1]))\n        hash3 = Hasher.hash(Foo([0]))\n        self.assertEqual(hash1, hash3)\n        self.assertNotEqual(hash1, hash2)\n\n    def test_recurse_hash_for_method(self):\n        hash1 = Hasher.hash(Foo([0]).__call__)\n        hash2 = Hasher.hash(Foo([1]).__call__)\n        hash3 = Hasher.hash(Foo([0]).__call__)\n        self.assertEqual(hash1, hash3)\n        self.assertNotEqual(hash1, hash2)\n\n    def test_hash_ipython_function(self):\n        def create_ipython_func(co_filename, returned_obj):\n            def func():\n                return returned_obj\n\n            code = func.__code__\n            # Use _create_code from dill in order to make it work for different python versions\n            code = code.replace(co_filename=co_filename)\n            return FunctionType(code, func.__globals__, func.__name__, func.__defaults__, func.__closure__)\n\n        co_filename, returned_obj = \"<ipython-input-2-e0383a102aae>\", [0]\n        hash1 = Hasher.hash(create_ipython_func(co_filename, returned_obj))\n        co_filename, returned_obj = \"<ipython-input-2-e0383a102aae>\", [1]\n        hash2 = Hasher.hash(create_ipython_func(co_filename, 
returned_obj))\n        co_filename, returned_obj = \"<ipython-input-5-713f6613acf3>\", [0]\n        hash3 = Hasher.hash(create_ipython_func(co_filename, returned_obj))\n        self.assertEqual(hash1, hash3)\n        self.assertNotEqual(hash1, hash2)\n\n        co_filename, returned_obj = os.path.join(gettempdir(), \"ipykernel_12345\", \"321456789.py\"), [0]\n        hash4 = Hasher.hash(create_ipython_func(co_filename, returned_obj))\n        co_filename, returned_obj = os.path.join(gettempdir(), \"ipykernel_12345\", \"321456789.py\"), [1]\n        hash5 = Hasher.hash(create_ipython_func(co_filename, returned_obj))\n        co_filename, returned_obj = os.path.join(gettempdir(), \"ipykernel_12345\", \"654123987.py\"), [0]\n        hash6 = Hasher.hash(create_ipython_func(co_filename, returned_obj))\n        self.assertEqual(hash4, hash6)\n        self.assertNotEqual(hash4, hash5)\n\n    def test_recurse_hash_for_function_with_shuffled_globals(self):\n        foo, bar = [0], [1]\n\n        def func():\n            return foo, bar\n\n        func.__module__ = \"__main__\"\n\n        def globalvars_mock1_side_effect(func, *args, **kwargs):\n            return {\"foo\": foo, \"bar\": bar}\n\n        def globalvars_mock2_side_effect(func, *args, **kwargs):\n            return {\"bar\": bar, \"foo\": foo}\n\n        with patch(\"dill.detect.globalvars\", side_effect=globalvars_mock1_side_effect) as globalvars_mock1:\n            hash1 = Hasher.hash(func)\n            self.assertGreater(globalvars_mock1.call_count, 0)\n        with patch(\"dill.detect.globalvars\", side_effect=globalvars_mock2_side_effect) as globalvars_mock2:\n            hash2 = Hasher.hash(func)\n            self.assertGreater(globalvars_mock2.call_count, 0)\n        self.assertEqual(hash1, hash2)\n\n\nclass HashingTest(TestCase):\n    def test_hash_simple(self):\n        hash1 = Hasher.hash(\"hello\")\n        hash2 = Hasher.hash(\"hello\")\n        hash3 = Hasher.hash(\"there\")\n        
self.assertEqual(hash1, hash2)\n        self.assertNotEqual(hash1, hash3)\n\n    def test_hash_class_instance(self):\n        hash1 = Hasher.hash(Foo(\"hello\"))\n        hash2 = Hasher.hash(Foo(\"hello\"))\n        hash3 = Hasher.hash(Foo(\"there\"))\n        self.assertEqual(hash1, hash2)\n        self.assertNotEqual(hash1, hash3)\n\n    def test_hash_update(self):\n        hasher = Hasher()\n        for x in [\"hello\", Foo(\"hello\")]:\n            hasher.update(x)\n        hash1 = hasher.hexdigest()\n        hasher = Hasher()\n        for x in [\"hello\", Foo(\"hello\")]:\n            hasher.update(x)\n        hash2 = hasher.hexdigest()\n        hasher = Hasher()\n        for x in [\"there\", Foo(\"there\")]:\n            hasher.update(x)\n        hash3 = hasher.hexdigest()\n        self.assertEqual(hash1, hash2)\n        self.assertNotEqual(hash1, hash3)\n\n    def test_hash_unpicklable(self):\n        with self.assertRaises(pickle.PicklingError):\n            Hasher.hash(UnpicklableCallable(Foo(\"hello\")))\n\n    def test_hash_same_strings(self):\n        string = \"abc\"\n        obj1 = [string, string]  # two strings have the same ids\n        obj2 = [string, string]\n        obj3 = json.loads(f'[\"{string}\", \"{string}\"]')  # two strings have different ids\n        self.assertIs(obj1[0], string)\n        self.assertIs(obj1[0], obj1[1])\n        self.assertIs(obj2[0], string)\n        self.assertIs(obj2[0], obj2[1])\n        self.assertIsNot(obj3[0], string)\n        self.assertIsNot(obj3[0], obj3[1])\n        hash1 = Hasher.hash(obj1)\n        hash2 = Hasher.hash(obj2)\n        hash3 = Hasher.hash(obj3)\n        self.assertEqual(hash1, hash2)\n        self.assertEqual(hash1, hash3)\n\n    def test_set_stable(self):\n        rng = np.random.default_rng(42)\n        set_ = {rng.random() for _ in range(10_000)}\n        expected_hash = Hasher.hash(set_)\n        assert expected_hash == Pool(1).apply_async(partial(Hasher.hash, set(set_))).get()\n\n    def 
test_set_doesnt_depend_on_order(self):\n        set_ = set(\"abc\")\n        hash1 = Hasher.hash(set_)\n        set_ = set(\"def\")\n        hash2 = Hasher.hash(set_)\n        set_ = set(\"cba\")\n        hash3 = Hasher.hash(set_)\n        self.assertEqual(hash1, hash3)\n        self.assertNotEqual(hash1, hash2)\n\n    @require_tiktoken\n    def test_hash_tiktoken_encoding(self):\n        import tiktoken\n\n        enc = tiktoken.get_encoding(\"gpt2\")\n        hash1 = Hasher.hash(enc)\n        enc = tiktoken.get_encoding(\"r50k_base\")\n        hash2 = Hasher.hash(enc)\n        enc = tiktoken.get_encoding(\"gpt2\")\n        hash3 = Hasher.hash(enc)\n        self.assertEqual(hash1, hash3)\n        self.assertNotEqual(hash1, hash2)\n\n    @require_numpy1_on_windows\n    @require_torch\n    def test_hash_torch_tensor(self):\n        import torch\n\n        t = torch.tensor([1.0])\n        hash1 = Hasher.hash(t)\n        t = torch.tensor([2.0])\n        hash2 = Hasher.hash(t)\n        t = torch.tensor([1.0])\n        hash3 = Hasher.hash(t)\n        self.assertEqual(hash1, hash3)\n        self.assertNotEqual(hash1, hash2)\n\n    @require_numpy1_on_windows\n    @require_torch\n    def test_hash_torch_generator(self):\n        import torch\n\n        t = torch.Generator(device=\"cpu\").manual_seed(42)\n        hash1 = Hasher.hash(t)\n        t = t = torch.Generator(device=\"cpu\").manual_seed(50)\n        hash2 = Hasher.hash(t)\n        t = t = torch.Generator(device=\"cpu\").manual_seed(42)\n        hash3 = Hasher.hash(t)\n        self.assertEqual(hash1, hash3)\n        self.assertNotEqual(hash1, hash2)\n\n    @require_spacy\n    @pytest.mark.integration\n    def test_hash_spacy_model(self):\n        import spacy\n\n        nlp = spacy.blank(\"en\")\n        hash1 = Hasher.hash(nlp)\n        nlp = spacy.blank(\"fr\")\n        hash2 = Hasher.hash(nlp)\n        nlp = spacy.blank(\"en\")\n        hash3 = Hasher.hash(nlp)\n        self.assertEqual(hash1, hash3)\n        
self.assertNotEqual(hash1, hash2)\n\n    @require_not_windows\n    @require_torch_compile\n    def test_hash_torch_compiled_function(self):\n        import torch\n\n        def f(x):\n            return torch.sin(x) + torch.cos(x)\n\n        hash1 = Hasher.hash(f)\n        f = torch.compile(f)\n        hash2 = Hasher.hash(f)\n        self.assertEqual(hash1, hash2)\n\n    @require_not_windows\n    @require_torch_compile\n    def test_hash_torch_compiled_module(self):\n        m = TorchModule()\n        next(iter(m.parameters())).data.fill_(1.0)\n        hash1 = Hasher.hash(m)\n        m = torch.compile(m)\n        hash2 = Hasher.hash(m)\n        m = TorchModule()\n        next(iter(m.parameters())).data.fill_(2.0)\n        m = torch.compile(m)\n        hash3 = Hasher.hash(m)\n        self.assertEqual(hash1, hash2)\n        self.assertNotEqual(hash1, hash3)\n        self.assertNotEqual(hash2, hash3)\n\n\n@pytest.mark.integration\ndef test_move_script_doesnt_change_hash(tmp_path: Path):\n    dir1 = tmp_path / \"dir1\"\n    dir2 = tmp_path / \"dir2\"\n    dir1.mkdir()\n    dir2.mkdir()\n    script_filename = \"script.py\"\n    code = dedent(\n        \"\"\"\n    from datasets.fingerprint import Hasher\n    def foo():\n        pass\n    print(Hasher.hash(foo))\n    \"\"\"\n    )\n    script_path1 = dir1 / script_filename\n    script_path2 = dir2 / script_filename\n    with script_path1.open(\"w\") as f:\n        f.write(code)\n    with script_path2.open(\"w\") as f:\n        f.write(code)\n    fingerprint1 = subprocess.check_output([\"python\", str(script_path1)])\n    fingerprint2 = subprocess.check_output([\"python\", str(script_path2)])\n    assert fingerprint1 == fingerprint2\n\n\ndef test_fingerprint_in_multiprocessing():\n    data = {\"a\": [0, 1, 2]}\n    dataset = DatasetChild(InMemoryTable.from_pydict(data))\n    expected_fingerprint = dataset.func1()._fingerprint\n    with Pool(2) as pool:\n        fingerprints = pool.map(\n            lambda _: 
DatasetChild(InMemoryTable.from_pydict(data)).func1()._fingerprint, range(10)\n        )\n    assert all(f == expected_fingerprint for f in fingerprints)\n\n\ndef test_temp_cache_dir_with_tmpdir_nonexistent(tmp_path, caplog):\n    \"\"\"Test that _TempCacheDir creates TMPDIR if it doesn't exist.\"\"\"\n    import os\n\n    # Set TMPDIR to a non-existent directory\n    tmpdir_path = tmp_path / \"custom_tmpdir\"\n    assert not tmpdir_path.exists(), \"TMPDIR should not exist initially\"\n\n    # Save original TMPDIR and set new one\n    original_tmpdir = os.environ.get(\"TMPDIR\")\n    try:\n        os.environ[\"TMPDIR\"] = str(tmpdir_path)\n\n        # Clear any existing temp cache dir to force recreation\n        import datasets.fingerprint\n\n        datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None\n\n        # Import and test _TempCacheDir directly\n        from datasets.fingerprint import _TempCacheDir\n\n        with caplog.at_level(\"INFO\", logger=\"datasets.fingerprint\"):\n            temp_cache = _TempCacheDir()\n            cache_dir = temp_cache.name\n\n        # The key test: verify the cache directory is within the TMPDIR we set\n        # This proves that TMPDIR was respected and the directory was created\n        tmpdir_path_str = str(tmpdir_path)\n        assert cache_dir.startswith(tmpdir_path_str), (\n            f\"Cache dir {cache_dir} should be in TMPDIR {tmpdir_path_str}. TMPDIR env var: {os.environ.get('TMPDIR')}\"\n        )\n        # Verify the directory was created\n        assert tmpdir_path.exists(), (\n            f\"TMPDIR directory {tmpdir_path} should have been created. 
Cache dir is: {cache_dir}\"\n        )\n        # Verify logging\n        assert f\"Created TMPDIR directory: {tmpdir_path_str}\" in caplog.text\n\n        # Cleanup\n        temp_cache.cleanup()\n    finally:\n        # Restore original TMPDIR\n        if original_tmpdir is not None:\n            os.environ[\"TMPDIR\"] = original_tmpdir\n        elif \"TMPDIR\" in os.environ:\n            del os.environ[\"TMPDIR\"]\n\n\ndef test_temp_cache_dir_with_tmpdir_existing(tmp_path, monkeypatch):\n    \"\"\"Test that _TempCacheDir works correctly when TMPDIR exists.\"\"\"\n    from datasets.fingerprint import get_temporary_cache_files_directory\n\n    # Set TMPDIR to an existing directory\n    tmpdir_path = tmp_path / \"existing_tmpdir\"\n    tmpdir_path.mkdir()\n    monkeypatch.setenv(\"TMPDIR\", str(tmpdir_path))\n\n    # Clear any existing temp cache dir\n    import datasets.fingerprint\n\n    datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None\n\n    cache_dir = get_temporary_cache_files_directory()\n\n    # Verify the cache directory is within the TMPDIR\n    assert cache_dir.startswith(str(tmpdir_path)), f\"Cache dir {cache_dir} should be in TMPDIR {tmpdir_path}\"\n\n\ndef test_temp_cache_dir_without_tmpdir(monkeypatch):\n    \"\"\"Test that _TempCacheDir works correctly when TMPDIR is not set.\"\"\"\n    from datasets.fingerprint import get_temporary_cache_files_directory\n\n    # Remove TMPDIR if it exists\n    monkeypatch.delenv(\"TMPDIR\", raising=False)\n\n    # Clear any existing temp cache dir\n    import datasets.fingerprint\n\n    datasets.fingerprint._TEMP_DIR_FOR_TEMP_CACHE_FILES = None\n\n    cache_dir = get_temporary_cache_files_directory()\n\n    # Verify it uses the default temp directory\n    from tempfile import gettempdir\n\n    default_temp = gettempdir()\n    assert cache_dir.startswith(default_temp), f\"Cache dir {cache_dir} should be in default temp {default_temp}\"\n\n\ndef test_temp_cache_dir_tmpdir_creation_failure(tmp_path, 
monkeypatch, caplog):\n    \"\"\"Test that _TempCacheDir raises if TMPDIR cannot be created.\"\"\"\n    from unittest.mock import patch\n\n    from datasets.fingerprint import _TempCacheDir\n\n    # Set TMPDIR to a path that will fail to create (e.g., invalid permissions)\n    # Use a path that's likely to fail on creation\n    tmpdir_path = tmp_path / \"nonexistent\" / \"nested\" / \"path\"\n    monkeypatch.setenv(\"TMPDIR\", str(tmpdir_path))\n\n    # Mock os.makedirs to raise an error\n    with patch(\"datasets.fingerprint.os.makedirs\", side_effect=OSError(\"Permission denied\")):\n        with pytest.raises(OSError) as excinfo:\n            _TempCacheDir()\n\n    # Verify the error message gives clear context about TMPDIR\n    msg = str(excinfo.value)\n    assert \"TMPDIR is set to\" in msg\n    assert \"could not be created\" in msg\n\n\ndef test_temp_cache_dir_tmpdir_not_directory(tmp_path, monkeypatch):\n    \"\"\"Test that _TempCacheDir raises if TMPDIR points to a non-directory.\"\"\"\n    from datasets.fingerprint import _TempCacheDir\n\n    # Create a regular file and point TMPDIR to it\n    file_path = tmp_path / \"not_a_dir\"\n    file_path.write_text(\"not a directory\")\n    monkeypatch.setenv(\"TMPDIR\", str(file_path))\n\n    with pytest.raises(OSError) as excinfo:\n        _TempCacheDir()\n\n    msg = str(excinfo.value)\n    assert \"TMPDIR is set to\" in msg\n    assert \"is not a directory\" in msg\n\n\ndef test_fingerprint_when_transform_version_changes():\n    data = {\"a\": [0, 1, 2]}\n\n    class DummyDatasetChild(datasets.Dataset):\n        @fingerprint_transform(inplace=False)\n        def func(self, new_fingerprint):\n            return DummyDatasetChild(self.data, fingerprint=new_fingerprint)\n\n    fingeprint_no_version = DummyDatasetChild(InMemoryTable.from_pydict(data)).func()\n\n    class DummyDatasetChild(datasets.Dataset):\n        @fingerprint_transform(inplace=False, version=\"1.0.0\")\n        def func(self, new_fingerprint):\n 
           return DummyDatasetChild(self.data, fingerprint=new_fingerprint)\n\n    fingeprint_1 = DummyDatasetChild(InMemoryTable.from_pydict(data)).func()\n\n    class DummyDatasetChild(datasets.Dataset):\n        @fingerprint_transform(inplace=False, version=\"2.0.0\")\n        def func(self, new_fingerprint):\n            return DummyDatasetChild(self.data, fingerprint=new_fingerprint)\n\n    fingeprint_2 = DummyDatasetChild(InMemoryTable.from_pydict(data)).func()\n\n    assert len({fingeprint_no_version, fingeprint_1, fingeprint_2}) == 3\n\n\ndef test_dependency_on_dill():\n    # AttributeError: module 'dill._dill' has no attribute 'stack'\n    hasher = Hasher()\n    hasher.update(lambda x: x)\n"
  },
  {
    "path": "tests/test_fingerprint_tokenizer_stability.py",
    "content": "from tokenizers import Tokenizer\nfrom tokenizers.models import WordLevel\nfrom tokenizers.pre_tokenizers import Whitespace\nfrom transformers import PreTrainedTokenizerFast\n\nfrom datasets import Dataset\nfrom datasets.fingerprint import Hasher\n\n\ndef _make_mutable_backend_tokenizer() -> PreTrainedTokenizerFast:\n    # Build a tiny tokenizer entirely locally (no network), backed by `tokenizers.Tokenizer`.\n    vocab = {\"[UNK]\": 0, \"[PAD]\": 1, \"hello\": 2, \"world\": 3}\n    backend = Tokenizer(WordLevel(vocab=vocab, unk_token=\"[UNK]\"))\n    backend.pre_tokenizer = Whitespace()\n    return PreTrainedTokenizerFast(tokenizer_object=backend, unk_token=\"[UNK]\", pad_token=\"[PAD]\")\n\n\ndef test_hasher_hash_tokenizer_stable_after_call():\n    tok = _make_mutable_backend_tokenizer()\n    h0 = Hasher.hash(tok)\n    _ = tok([\"hello world\"], truncation=True, padding=\"max_length\", max_length=8)\n    h1 = Hasher.hash(tok)\n    assert h0 == h1\n\n\ndef test_map_cache_reused_with_tokenizer_after_call(tmp_path):\n    # Regression test for https://github.com/huggingface/datasets/issues/3847\n    #\n    # Tokenizers can mutate backend truncation/padding state when called, which used to make the\n    # dataset transform fingerprint unstable and prevented cache reuse.\n    tok = _make_mutable_backend_tokenizer()\n\n    raw = Dataset.from_dict({\"text\": [\"hello world\"] * 1000})\n    stored = tmp_path / \"stored\"\n    raw.save_to_disk(stored)\n    raw = Dataset.load_from_disk(stored)\n\n    def tokenize(examples):\n        return tok(examples[\"text\"], truncation=True, padding=\"max_length\", max_length=8)\n\n    res1 = raw.map(tokenize, batched=True, load_from_cache_file=True, remove_columns=[\"text\"])\n    res2 = raw.map(tokenize, batched=True, load_from_cache_file=True, remove_columns=[\"text\"])\n\n    assert res1.cache_files and res2.cache_files\n    assert res1.cache_files[0][\"filename\"] == res2.cache_files[0][\"filename\"]\n"
  },
  {
    "path": "tests/test_formatting.py",
    "content": "import datetime\nfrom pathlib import Path\nfrom unittest import TestCase\n\nimport numpy as np\nimport pandas as pd\nimport pyarrow as pa\nimport pytest\n\nfrom datasets import Audio, Features, Image, IterableDataset\nfrom datasets.formatting import NumpyFormatter, PandasFormatter, PythonFormatter, query_table\nfrom datasets.formatting.formatting import (\n    LazyBatch,\n    LazyRow,\n    NumpyArrowExtractor,\n    PandasArrowExtractor,\n    PythonArrowExtractor,\n)\nfrom datasets.table import InMemoryTable\n\nfrom .utils import (\n    require_jax,\n    require_numpy1_on_windows,\n    require_pil,\n    require_polars,\n    require_tf,\n    require_torch,\n    require_torchcodec,\n)\n\n\nclass AnyArray:\n    def __init__(self, data) -> None:\n        self.data = data\n\n    def __array__(self) -> np.ndarray:\n        return np.asarray(self.data)\n\n\ndef _gen_any_arrays():\n    for _ in range(10):\n        yield {\"array\": AnyArray(list(range(10)))}\n\n\n@pytest.fixture\ndef any_arrays_dataset():\n    return IterableDataset.from_generator(_gen_any_arrays)\n\n\n_COL_A = [0, 1, 2]\n_COL_B = [\"foo\", \"bar\", \"foobar\"]\n_COL_C = [[[1.0, 0.0, 0.0]] * 2, [[0.0, 1.0, 0.0]] * 2, [[0.0, 0.0, 1.0]] * 2]\n_COL_D = [datetime.datetime(2023, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)] * 3\n\n_INDICES = [1, 0]\n\nIMAGE_PATH_1 = Path(__file__).parent / \"features\" / \"data\" / \"test_image_rgb.jpg\"\nIMAGE_PATH_2 = Path(__file__).parent / \"features\" / \"data\" / \"test_image_rgba.png\"\nAUDIO_PATH_1 = Path(__file__).parent / \"features\" / \"data\" / \"test_audio_44100.wav\"\n\n\nclass ArrowExtractorTest(TestCase):\n    def _create_dummy_table(self):\n        return pa.Table.from_pydict({\"a\": _COL_A, \"b\": _COL_B, \"c\": _COL_C, \"d\": _COL_D})\n\n    def test_python_extractor(self):\n        pa_table = self._create_dummy_table()\n        extractor = PythonArrowExtractor()\n        row = extractor.extract_row(pa_table)\n        self.assertEqual(row, 
{\"a\": _COL_A[0], \"b\": _COL_B[0], \"c\": _COL_C[0], \"d\": _COL_D[0]})\n        col = extractor.extract_column(pa_table)\n        self.assertEqual(col, _COL_A)\n        batch = extractor.extract_batch(pa_table)\n        self.assertEqual(batch, {\"a\": _COL_A, \"b\": _COL_B, \"c\": _COL_C, \"d\": _COL_D})\n\n    def test_numpy_extractor(self):\n        pa_table = self._create_dummy_table().drop([\"c\", \"d\"])\n        extractor = NumpyArrowExtractor()\n        row = extractor.extract_row(pa_table)\n        np.testing.assert_equal(row, {\"a\": _COL_A[0], \"b\": _COL_B[0]})\n        col = extractor.extract_column(pa_table)\n        np.testing.assert_equal(col, np.array(_COL_A))\n        batch = extractor.extract_batch(pa_table)\n        np.testing.assert_equal(batch, {\"a\": np.array(_COL_A), \"b\": np.array(_COL_B)})\n\n    def test_numpy_extractor_nested(self):\n        pa_table = self._create_dummy_table().drop([\"a\", \"b\", \"d\"])\n        extractor = NumpyArrowExtractor()\n        row = extractor.extract_row(pa_table)\n        self.assertEqual(row[\"c\"][0].dtype, np.float64)\n        self.assertEqual(row[\"c\"].dtype, object)\n        col = extractor.extract_column(pa_table)\n        self.assertEqual(col[0][0].dtype, np.float64)\n        self.assertEqual(col[0].dtype, object)\n        self.assertEqual(col.dtype, object)\n        batch = extractor.extract_batch(pa_table)\n        self.assertEqual(batch[\"c\"][0][0].dtype, np.float64)\n        self.assertEqual(batch[\"c\"][0].dtype, object)\n        self.assertEqual(batch[\"c\"].dtype, object)\n\n    def test_numpy_extractor_temporal(self):\n        pa_table = self._create_dummy_table().drop([\"a\", \"b\", \"c\"])\n        extractor = NumpyArrowExtractor()\n        row = extractor.extract_row(pa_table)\n        self.assertTrue(np.issubdtype(row[\"d\"].dtype, np.datetime64))\n        col = extractor.extract_column(pa_table)\n        self.assertTrue(np.issubdtype(col[0].dtype, np.datetime64))\n        
self.assertTrue(np.issubdtype(col.dtype, np.datetime64))\n        batch = extractor.extract_batch(pa_table)\n        self.assertTrue(np.issubdtype(batch[\"d\"][0].dtype, np.datetime64))\n        self.assertTrue(np.issubdtype(batch[\"d\"].dtype, np.datetime64))\n\n    def test_pandas_extractor(self):\n        pa_table = self._create_dummy_table()\n        extractor = PandasArrowExtractor()\n        row = extractor.extract_row(pa_table)\n        self.assertIsInstance(row, pd.DataFrame)\n        pd.testing.assert_series_equal(row[\"a\"], pd.Series(_COL_A, name=\"a\")[:1])\n        pd.testing.assert_series_equal(row[\"b\"], pd.Series(_COL_B, name=\"b\")[:1])\n        col = extractor.extract_column(pa_table)\n        pd.testing.assert_series_equal(col, pd.Series(_COL_A, name=\"a\"))\n        batch = extractor.extract_batch(pa_table)\n        self.assertIsInstance(batch, pd.DataFrame)\n        pd.testing.assert_series_equal(batch[\"a\"], pd.Series(_COL_A, name=\"a\"))\n        pd.testing.assert_series_equal(batch[\"b\"], pd.Series(_COL_B, name=\"b\"))\n\n    def test_pandas_extractor_nested(self):\n        pa_table = self._create_dummy_table().drop([\"a\", \"b\", \"d\"])\n        extractor = PandasArrowExtractor()\n        row = extractor.extract_row(pa_table)\n        self.assertEqual(row[\"c\"][0][0].dtype, np.float64)\n        self.assertEqual(row[\"c\"].dtype, object)\n        col = extractor.extract_column(pa_table)\n        self.assertEqual(col[0][0].dtype, np.float64)\n        self.assertEqual(col[0].dtype, object)\n        self.assertEqual(col.dtype, object)\n        batch = extractor.extract_batch(pa_table)\n        self.assertEqual(batch[\"c\"][0][0].dtype, np.float64)\n        self.assertEqual(batch[\"c\"][0].dtype, object)\n        self.assertEqual(batch[\"c\"].dtype, object)\n\n    def test_pandas_extractor_temporal(self):\n        pa_table = self._create_dummy_table().drop([\"a\", \"b\", \"c\"])\n        extractor = PandasArrowExtractor()\n        row = 
extractor.extract_row(pa_table)\n        self.assertTrue(pd.api.types.is_datetime64_any_dtype(row[\"d\"].dtype))\n        col = extractor.extract_column(pa_table)\n        self.assertTrue(isinstance(col[0], datetime.datetime))\n        self.assertTrue(pd.api.types.is_datetime64_any_dtype(col.dtype))\n        batch = extractor.extract_batch(pa_table)\n        self.assertTrue(isinstance(batch[\"d\"][0], datetime.datetime))\n        self.assertTrue(pd.api.types.is_datetime64_any_dtype(batch[\"d\"].dtype))\n\n    @require_polars\n    def test_polars_extractor(self):\n        import polars as pl\n\n        from datasets.formatting.polars_formatter import PolarsArrowExtractor\n\n        pa_table = self._create_dummy_table()\n        extractor = PolarsArrowExtractor()\n        row = extractor.extract_row(pa_table)\n        self.assertIsInstance(row, pl.DataFrame)\n        assert pl.Series.eq(row[\"a\"], pl.Series(\"a\", _COL_A)[:1]).all()\n        assert pl.Series.eq(row[\"b\"], pl.Series(\"b\", _COL_B)[:1]).all()\n        col = extractor.extract_column(pa_table)\n        assert pl.Series.eq(col, pl.Series(\"a\", _COL_A)).all()\n        batch = extractor.extract_batch(pa_table)\n        self.assertIsInstance(batch, pl.DataFrame)\n        assert pl.Series.eq(batch[\"a\"], pl.Series(\"a\", _COL_A)).all()\n        assert pl.Series.eq(batch[\"b\"], pl.Series(\"b\", _COL_B)).all()\n\n    @require_polars\n    def test_polars_nested(self):\n        import polars as pl\n\n        from datasets.formatting.polars_formatter import PolarsArrowExtractor\n\n        pa_table = self._create_dummy_table().drop([\"a\", \"b\", \"d\"])\n        extractor = PolarsArrowExtractor()\n        row = extractor.extract_row(pa_table)\n        self.assertEqual(row[\"c\"][0][0].dtype, pl.Float64)\n        self.assertEqual(row[\"c\"].dtype, pl.List(pl.List(pl.Float64)))\n        col = extractor.extract_column(pa_table)\n        self.assertEqual(col[0][0].dtype, pl.Float64)\n        
self.assertEqual(col[0].dtype, pl.List(pl.Float64))\n        self.assertEqual(col.dtype, pl.List(pl.List(pl.Float64)))\n        batch = extractor.extract_batch(pa_table)\n        self.assertEqual(batch[\"c\"][0][0].dtype, pl.Float64)\n        self.assertEqual(batch[\"c\"][0].dtype, pl.List(pl.Float64))\n        self.assertEqual(batch[\"c\"].dtype, pl.List(pl.List(pl.Float64)))\n\n    @require_polars\n    def test_polars_temporal(self):\n        from datasets.formatting.polars_formatter import PolarsArrowExtractor\n\n        pa_table = self._create_dummy_table().drop([\"a\", \"b\", \"c\"])\n        extractor = PolarsArrowExtractor()\n        row = extractor.extract_row(pa_table)\n        self.assertTrue(row[\"d\"].dtype.is_temporal())\n        col = extractor.extract_column(pa_table)\n        self.assertTrue(isinstance(col[0], datetime.datetime))\n        self.assertTrue(col.dtype.is_temporal())\n        batch = extractor.extract_batch(pa_table)\n        self.assertTrue(isinstance(batch[\"d\"][0], datetime.datetime))\n        self.assertTrue(batch[\"d\"].dtype.is_temporal())\n\n\nclass LazyDictTest(TestCase):\n    def _create_dummy_table(self):\n        return pa.Table.from_pydict({\"a\": _COL_A, \"b\": _COL_B, \"c\": _COL_C})\n\n    def _create_dummy_formatter(self):\n        return PythonFormatter(lazy=True)\n\n    def test_lazy_dict_copy(self):\n        pa_table = self._create_dummy_table()\n        formatter = self._create_dummy_formatter()\n        lazy_batch = formatter.format_batch(pa_table)\n        lazy_batch_copy = lazy_batch.copy()\n        self.assertEqual(type(lazy_batch), type(lazy_batch_copy))\n        self.assertEqual(lazy_batch.items(), lazy_batch_copy.items())\n        lazy_batch[\"d\"] = [1, 2, 3]\n        self.assertNotEqual(lazy_batch.items(), lazy_batch_copy.items())\n\n\nclass FormatterTest(TestCase):\n    def _create_dummy_table(self):\n        return pa.Table.from_pydict({\"a\": _COL_A, \"b\": _COL_B, \"c\": _COL_C})\n\n    def 
test_python_formatter(self):\n        pa_table = self._create_dummy_table()\n        formatter = PythonFormatter()\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row, {\"a\": _COL_A[0], \"b\": _COL_B[0], \"c\": _COL_C[0]})\n        col = formatter.format_column(pa_table)\n        self.assertEqual(col, _COL_A)\n        batch = formatter.format_batch(pa_table)\n        self.assertEqual(batch, {\"a\": _COL_A, \"b\": _COL_B, \"c\": _COL_C})\n\n    def test_python_formatter_lazy(self):\n        pa_table = self._create_dummy_table()\n        formatter = PythonFormatter(lazy=True)\n        row = formatter.format_row(pa_table)\n        self.assertIsInstance(row, LazyRow)\n        self.assertEqual(row[\"a\"], _COL_A[0])\n        self.assertEqual(row[\"b\"], _COL_B[0])\n        self.assertEqual(row[\"c\"], _COL_C[0])\n        batch = formatter.format_batch(pa_table)\n        self.assertIsInstance(batch, LazyBatch)\n        self.assertEqual(batch[\"a\"], _COL_A)\n        self.assertEqual(batch[\"b\"], _COL_B)\n        self.assertEqual(batch[\"c\"], _COL_C)\n\n    def test_numpy_formatter(self):\n        pa_table = self._create_dummy_table()\n        formatter = NumpyFormatter()\n        row = formatter.format_row(pa_table)\n        np.testing.assert_equal(row, {\"a\": _COL_A[0], \"b\": _COL_B[0], \"c\": np.array(_COL_C[0])})\n        col = formatter.format_column(pa_table)\n        np.testing.assert_equal(col, np.array(_COL_A))\n        batch = formatter.format_batch(pa_table)\n        np.testing.assert_equal(batch, {\"a\": np.array(_COL_A), \"b\": np.array(_COL_B), \"c\": np.array(_COL_C)})\n        assert batch[\"c\"].shape == np.array(_COL_C).shape\n\n    def test_numpy_formatter_np_array_kwargs(self):\n        pa_table = self._create_dummy_table().drop([\"b\"])\n        formatter = NumpyFormatter(dtype=np.float16)\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"c\"].dtype, np.dtype(np.float16))\n        col = 
formatter.format_column(pa_table)\n        self.assertEqual(col.dtype, np.float16)\n        batch = formatter.format_batch(pa_table)\n        self.assertEqual(batch[\"a\"].dtype, np.dtype(np.float16))\n        self.assertEqual(batch[\"c\"].dtype, np.dtype(np.float16))\n\n    @require_pil\n    def test_numpy_formatter_image(self):\n        # same dimensions\n        pa_table = pa.table({\"image\": [{\"bytes\": None, \"path\": str(IMAGE_PATH_1)}] * 2})\n        formatter = NumpyFormatter(features=Features({\"image\": Image()}))\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"image\"].dtype, np.uint8)\n        self.assertEqual(row[\"image\"].shape, (480, 640, 3))\n        col = formatter.format_column(pa_table)\n        self.assertEqual(col.dtype, np.uint8)\n        self.assertEqual(col.shape, (2, 480, 640, 3))\n        batch = formatter.format_batch(pa_table)\n        self.assertEqual(batch[\"image\"].dtype, np.uint8)\n        self.assertEqual(batch[\"image\"].shape, (2, 480, 640, 3))\n\n        # different dimensions\n        pa_table = pa.table(\n            {\"image\": [{\"bytes\": None, \"path\": str(IMAGE_PATH_1)}, {\"bytes\": None, \"path\": str(IMAGE_PATH_2)}]}\n        )\n        formatter = NumpyFormatter(features=Features({\"image\": Image()}))\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"image\"].dtype, np.uint8)\n        self.assertEqual(row[\"image\"].shape, (480, 640, 3))\n        col = formatter.format_column(pa_table)\n        self.assertIsInstance(col, np.ndarray)\n        self.assertEqual(col.dtype, object)\n        self.assertEqual(col[0].dtype, np.uint8)\n        self.assertEqual(col[0].shape, (480, 640, 3))\n        batch = formatter.format_batch(pa_table)\n        self.assertIsInstance(batch[\"image\"], np.ndarray)\n        self.assertEqual(batch[\"image\"].dtype, object)\n        self.assertEqual(batch[\"image\"][0].dtype, np.uint8)\n        
self.assertEqual(batch[\"image\"][0].shape, (480, 640, 3))\n\n    @require_torchcodec\n    def test_numpy_formatter_audio(self):\n        pa_table = pa.table({\"audio\": [{\"bytes\": None, \"path\": str(AUDIO_PATH_1)}]})\n        formatter = NumpyFormatter(features=Features({\"audio\": Audio()}))\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"audio\"].get_all_samples().data.cpu().numpy().dtype, np.dtype(np.float32))\n        col = formatter.format_column(pa_table)\n        self.assertEqual(col[0].get_all_samples().data.cpu().numpy().dtype, np.float32)\n        batch = formatter.format_batch(pa_table)\n        self.assertEqual(batch[\"audio\"][0].get_all_samples().data.cpu().numpy().dtype, np.dtype(np.float32))\n\n    def test_pandas_formatter(self):\n        pa_table = self._create_dummy_table()\n        formatter = PandasFormatter()\n        row = formatter.format_row(pa_table)\n        self.assertIsInstance(row, pd.DataFrame)\n        pd.testing.assert_series_equal(row[\"a\"], pd.Series(_COL_A, name=\"a\")[:1])\n        pd.testing.assert_series_equal(row[\"b\"], pd.Series(_COL_B, name=\"b\")[:1])\n        col = formatter.format_column(pa_table)\n        pd.testing.assert_series_equal(col, pd.Series(_COL_A, name=\"a\"))\n        batch = formatter.format_batch(pa_table)\n        self.assertIsInstance(batch, pd.DataFrame)\n        pd.testing.assert_series_equal(batch[\"a\"], pd.Series(_COL_A, name=\"a\"))\n        pd.testing.assert_series_equal(batch[\"b\"], pd.Series(_COL_B, name=\"b\"))\n\n    @require_polars\n    def test_polars_formatter(self):\n        import polars as pl\n\n        from datasets.formatting import PolarsFormatter\n\n        pa_table = self._create_dummy_table()\n        formatter = PolarsFormatter()\n        row = formatter.format_row(pa_table)\n        self.assertIsInstance(row, pl.DataFrame)\n        assert pl.Series.eq(row[\"a\"], pl.Series(\"a\", _COL_A)[:1]).all()\n        assert pl.Series.eq(row[\"b\"], 
pl.Series(\"b\", _COL_B)[:1]).all()\n        col = formatter.format_column(pa_table)\n        assert pl.Series.eq(col, pl.Series(\"a\", _COL_A)).all()\n        batch = formatter.format_batch(pa_table)\n        self.assertIsInstance(batch, pl.DataFrame)\n        assert pl.Series.eq(batch[\"a\"], pl.Series(\"a\", _COL_A)).all()\n        assert pl.Series.eq(batch[\"b\"], pl.Series(\"b\", _COL_B)).all()\n\n    @require_numpy1_on_windows\n    @require_torch\n    def test_torch_formatter(self):\n        import torch\n\n        from datasets.formatting import TorchFormatter\n\n        pa_table = self._create_dummy_table()\n        formatter = TorchFormatter()\n        row = formatter.format_row(pa_table)\n        torch.testing.assert_close(row[\"a\"], torch.tensor(_COL_A, dtype=torch.int64)[0])\n        assert row[\"b\"] == _COL_B[0]\n        torch.testing.assert_close(row[\"c\"], torch.tensor(_COL_C, dtype=torch.float32)[0])\n        col = formatter.format_column(pa_table)\n        torch.testing.assert_close(col, torch.tensor(_COL_A, dtype=torch.int64))\n        batch = formatter.format_batch(pa_table)\n        torch.testing.assert_close(batch[\"a\"], torch.tensor(_COL_A, dtype=torch.int64))\n        assert batch[\"b\"] == _COL_B\n        torch.testing.assert_close(batch[\"c\"], torch.tensor(_COL_C, dtype=torch.float32))\n        assert batch[\"c\"].shape == np.array(_COL_C).shape\n\n    @require_numpy1_on_windows\n    @require_torch\n    def test_torch_formatter_torch_tensor_kwargs(self):\n        import torch\n\n        from datasets.formatting import TorchFormatter\n\n        pa_table = self._create_dummy_table().drop([\"b\"])\n        formatter = TorchFormatter(dtype=torch.float16)\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"c\"].dtype, torch.float16)\n        col = formatter.format_column(pa_table)\n        self.assertEqual(col.dtype, torch.float16)\n        batch = formatter.format_batch(pa_table)\n        
self.assertEqual(batch[\"a\"].dtype, torch.float16)\n        self.assertEqual(batch[\"c\"].dtype, torch.float16)\n\n    @require_numpy1_on_windows\n    @require_torch\n    @require_pil\n    def test_torch_formatter_image(self):\n        import torch\n\n        from datasets.formatting import TorchFormatter\n\n        # same dimensions\n        pa_table = pa.table({\"image\": [{\"bytes\": None, \"path\": str(IMAGE_PATH_1)}] * 2})\n        formatter = TorchFormatter(features=Features({\"image\": Image()}))\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"image\"].dtype, torch.uint8)\n        # torch uses CHW format contrary to numpy which uses HWC\n        self.assertEqual(row[\"image\"].shape, (3, 480, 640))\n        col = formatter.format_column(pa_table)\n        self.assertEqual(col.dtype, torch.uint8)\n        self.assertEqual(col.shape, (2, 3, 480, 640))\n        batch = formatter.format_batch(pa_table)\n        self.assertEqual(batch[\"image\"].dtype, torch.uint8)\n        self.assertEqual(batch[\"image\"].shape, (2, 3, 480, 640))\n\n        # different dimensions\n        pa_table = pa.table(\n            {\"image\": [{\"bytes\": None, \"path\": str(IMAGE_PATH_1)}, {\"bytes\": None, \"path\": str(IMAGE_PATH_2)}]}\n        )\n        formatter = TorchFormatter(features=Features({\"image\": Image()}))\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"image\"].dtype, torch.uint8)\n        self.assertEqual(row[\"image\"].shape, (3, 480, 640))\n        col = formatter.format_column(pa_table)\n        self.assertIsInstance(col, list)\n        self.assertEqual(col[0].dtype, torch.uint8)\n        self.assertEqual(col[0].shape, (3, 480, 640))\n        batch = formatter.format_batch(pa_table)\n        self.assertIsInstance(batch[\"image\"], list)\n        self.assertEqual(batch[\"image\"][0].dtype, torch.uint8)\n        self.assertEqual(batch[\"image\"][0].shape, (3, 480, 640))\n\n    @require_torch\n    
@require_torchcodec\n    def test_torch_formatter_audio(self):\n        import torch\n\n        from datasets.formatting import TorchFormatter\n\n        pa_table = pa.table({\"audio\": [{\"bytes\": None, \"path\": str(AUDIO_PATH_1)}]})\n        formatter = TorchFormatter(features=Features({\"audio\": Audio()}))\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"audio\"].get_all_samples().data.dtype, torch.float32)\n        col = formatter.format_column(pa_table)\n        self.assertEqual(col[0].get_all_samples().data.dtype, torch.float32)\n        batch = formatter.format_batch(pa_table)\n        self.assertEqual(batch[\"audio\"][0].get_all_samples().data.dtype, torch.float32)\n\n    @require_tf\n    def test_tf_formatter(self):\n        import tensorflow as tf\n\n        from datasets.formatting import TFFormatter\n\n        pa_table = self._create_dummy_table()\n        formatter = TFFormatter()\n        row = formatter.format_row(pa_table)\n        tf.debugging.assert_equal(row[\"a\"], tf.convert_to_tensor(_COL_A, dtype=tf.int64)[0])\n        tf.debugging.assert_equal(row[\"b\"], tf.convert_to_tensor(_COL_B, dtype=tf.string)[0])\n        tf.debugging.assert_equal(row[\"c\"], tf.convert_to_tensor(_COL_C, dtype=tf.float32)[0])\n        col = formatter.format_column(pa_table)\n        tf.debugging.assert_equal(col, tf.ragged.constant(_COL_A, dtype=tf.int64))\n        batch = formatter.format_batch(pa_table)\n        tf.debugging.assert_equal(batch[\"a\"], tf.convert_to_tensor(_COL_A, dtype=tf.int64))\n        tf.debugging.assert_equal(batch[\"b\"], tf.convert_to_tensor(_COL_B, dtype=tf.string))\n        self.assertIsInstance(batch[\"c\"], tf.Tensor)\n        self.assertEqual(batch[\"c\"].dtype, tf.float32)\n        tf.debugging.assert_equal(\n            batch[\"c\"].shape.as_list(), tf.convert_to_tensor(_COL_C, dtype=tf.float32).shape.as_list()\n        )\n        tf.debugging.assert_equal(tf.convert_to_tensor(batch[\"c\"]), 
tf.convert_to_tensor(_COL_C, dtype=tf.float32))\n\n    @require_tf\n    def test_tf_formatter_tf_tensor_kwargs(self):\n        import tensorflow as tf\n\n        from datasets.formatting import TFFormatter\n\n        pa_table = self._create_dummy_table().drop([\"b\"])\n        formatter = TFFormatter(dtype=tf.float16)\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"c\"].dtype, tf.float16)\n        col = formatter.format_column(pa_table)\n        self.assertEqual(col.dtype, tf.float16)\n        batch = formatter.format_batch(pa_table)\n        self.assertEqual(batch[\"a\"].dtype, tf.float16)\n        self.assertEqual(batch[\"c\"].dtype, tf.float16)\n\n    @require_tf\n    @require_pil\n    def test_tf_formatter_image(self):\n        import tensorflow as tf\n\n        from datasets.formatting import TFFormatter\n\n        # same dimensions\n        pa_table = pa.table({\"image\": [{\"bytes\": None, \"path\": str(IMAGE_PATH_1)}] * 2})\n        formatter = TFFormatter(features=Features({\"image\": Image()}))\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"image\"].dtype, tf.uint8)\n        self.assertEqual(row[\"image\"].shape, (480, 640, 3))\n        col = formatter.format_column(pa_table)\n        self.assertEqual(col.dtype, tf.uint8)\n        self.assertEqual(col.shape, (2, 480, 640, 3))\n        batch = formatter.format_batch(pa_table)\n        self.assertEqual(batch[\"image\"][0].dtype, tf.uint8)\n        self.assertEqual(batch[\"image\"].shape, (2, 480, 640, 3))\n\n        # different dimensions\n        pa_table = pa.table(\n            {\"image\": [{\"bytes\": None, \"path\": str(IMAGE_PATH_1)}, {\"bytes\": None, \"path\": str(IMAGE_PATH_2)}]}\n        )\n        formatter = TFFormatter(features=Features({\"image\": Image()}))\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"image\"].dtype, tf.uint8)\n        self.assertEqual(row[\"image\"].shape, (480, 640, 3))\n        
col = formatter.format_column(pa_table)\n        self.assertIsInstance(col, list)\n        self.assertEqual(col[0].dtype, tf.uint8)\n        self.assertEqual(col[0].shape, (480, 640, 3))\n        batch = formatter.format_batch(pa_table)\n        self.assertIsInstance(batch[\"image\"], list)\n        self.assertEqual(batch[\"image\"][0].dtype, tf.uint8)\n        self.assertEqual(batch[\"image\"][0].shape, (480, 640, 3))\n\n    @require_tf\n    def test_tf_formatter_audio(self):\n        import tensorflow as tf\n\n        from datasets.formatting import TFFormatter\n\n        pa_table = pa.table({\"audio\": [{\"bytes\": None, \"path\": str(AUDIO_PATH_1)}]})\n        formatter = TFFormatter(features=Features({\"audio\": Audio()}))\n        row = formatter.format_row(pa_table)\n        tf_row = tf.convert_to_tensor(row[\"audio\"].get_all_samples().data.cpu().numpy())\n        self.assertEqual(tf_row.dtype, tf.float32)\n        col = formatter.format_column(pa_table)\n        tf_col_0 = tf.convert_to_tensor(col[0].get_all_samples().data.cpu().numpy())\n        self.assertEqual(tf_col_0.dtype, tf.float32)\n        batch = formatter.format_batch(pa_table)\n        tf_batch_0 = tf.convert_to_tensor(batch[\"audio\"][0].get_all_samples().data.cpu().numpy())\n        self.assertEqual(tf_batch_0.dtype, tf.float32)\n\n    @require_jax\n    def test_jax_formatter(self):\n        import jax\n        import jax.numpy as jnp\n\n        from datasets.formatting import JaxFormatter\n\n        pa_table = self._create_dummy_table()\n        formatter = JaxFormatter()\n        row = formatter.format_row(pa_table)\n        jnp.allclose(row[\"a\"], jnp.array(_COL_A, dtype=jnp.int64 if jax.config.jax_enable_x64 else jnp.int32)[0])\n        assert row[\"b\"] == _COL_B[0]\n        jnp.allclose(row[\"c\"], jnp.array(_COL_C, dtype=jnp.float32)[0])\n        col = formatter.format_column(pa_table)\n        jnp.allclose(col, jnp.array(_COL_A, dtype=jnp.int64 if jax.config.jax_enable_x64 else 
jnp.int32))\n        batch = formatter.format_batch(pa_table)\n        jnp.allclose(batch[\"a\"], jnp.array(_COL_A, dtype=jnp.int64 if jax.config.jax_enable_x64 else jnp.int32))\n        assert batch[\"b\"] == _COL_B\n        jnp.allclose(batch[\"c\"], jnp.array(_COL_C, dtype=jnp.float32))\n        assert batch[\"c\"].shape == np.array(_COL_C).shape\n\n    @require_jax\n    def test_jax_formatter_jnp_array_kwargs(self):\n        import jax.numpy as jnp\n\n        from datasets.formatting import JaxFormatter\n\n        pa_table = self._create_dummy_table().drop([\"b\"])\n        formatter = JaxFormatter(dtype=jnp.float16)\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"c\"].dtype, jnp.float16)\n        col = formatter.format_column(pa_table)\n        self.assertEqual(col.dtype, jnp.float16)\n        batch = formatter.format_batch(pa_table)\n        self.assertEqual(batch[\"a\"].dtype, jnp.float16)\n        self.assertEqual(batch[\"c\"].dtype, jnp.float16)\n\n    @require_jax\n    @require_pil\n    def test_jax_formatter_image(self):\n        import jax.numpy as jnp\n\n        from datasets.formatting import JaxFormatter\n\n        # same dimensions\n        pa_table = pa.table({\"image\": [{\"bytes\": None, \"path\": str(IMAGE_PATH_1)}] * 2})\n        formatter = JaxFormatter(features=Features({\"image\": Image()}))\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"image\"].dtype, jnp.uint8)\n        self.assertEqual(row[\"image\"].shape, (480, 640, 3))\n        col = formatter.format_column(pa_table)\n        self.assertEqual(col.dtype, jnp.uint8)\n        self.assertEqual(col.shape, (2, 480, 640, 3))\n        batch = formatter.format_batch(pa_table)\n        self.assertEqual(batch[\"image\"].dtype, jnp.uint8)\n        self.assertEqual(batch[\"image\"].shape, (2, 480, 640, 3))\n\n        # different dimensions\n        pa_table = pa.table(\n            {\"image\": [{\"bytes\": None, \"path\": 
str(IMAGE_PATH_1)}, {\"bytes\": None, \"path\": str(IMAGE_PATH_2)}]}\n        )\n        formatter = JaxFormatter(features=Features({\"image\": Image()}))\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"image\"].dtype, jnp.uint8)\n        self.assertEqual(row[\"image\"].shape, (480, 640, 3))\n        col = formatter.format_column(pa_table)\n        self.assertIsInstance(col, list)\n        self.assertEqual(col[0].dtype, jnp.uint8)\n        self.assertEqual(col[0].shape, (480, 640, 3))\n        batch = formatter.format_batch(pa_table)\n        self.assertIsInstance(batch[\"image\"], list)\n        self.assertEqual(batch[\"image\"][0].dtype, jnp.uint8)\n        self.assertEqual(batch[\"image\"][0].shape, (480, 640, 3))\n\n    @require_jax\n    @require_torchcodec\n    def test_jax_formatter_audio(self):\n        import jax.numpy as jnp\n\n        from datasets.formatting import JaxFormatter\n\n        pa_table = pa.table({\"audio\": [{\"bytes\": None, \"path\": str(AUDIO_PATH_1)}]})\n        formatter = JaxFormatter(features=Features({\"audio\": Audio()}))\n        row = formatter.format_row(pa_table)\n        self.assertEqual(row[\"audio\"][\"array\"].dtype, jnp.float32)\n        col = formatter.format_column(pa_table)\n        self.assertEqual(col[0][\"array\"].dtype, jnp.float32)\n        batch = formatter.format_batch(pa_table)\n        self.assertEqual(batch[\"audio\"][0][\"array\"].dtype, jnp.float32)\n\n    @require_jax\n    def test_jax_formatter_device(self):\n        import jax\n\n        from datasets.formatting import JaxFormatter\n\n        pa_table = self._create_dummy_table()\n        device = jax.devices()[0]\n        formatter = JaxFormatter(device=str(device))\n        row = formatter.format_row(pa_table)\n        assert row[\"a\"].devices().pop() == device\n        assert row[\"c\"].devices().pop() == device\n        col = formatter.format_column(pa_table)\n        assert col.devices().pop() == device\n        batch = 
formatter.format_batch(pa_table)\n        assert batch[\"a\"].devices().pop() == device\n        assert batch[\"c\"].devices().pop() == device\n\n\nclass QueryTest(TestCase):\n    def _create_dummy_table(self):\n        return pa.Table.from_pydict({\"a\": _COL_A, \"b\": _COL_B, \"c\": _COL_C})\n\n    def _create_dummy_arrow_indices(self):\n        return pa.Table.from_arrays([pa.array(_INDICES, type=pa.uint64())], names=[\"indices\"])\n\n    def assertTableEqual(self, first: pa.Table, second: pa.Table):\n        self.assertEqual(first.schema, second.schema)\n        for first_array, second_array in zip(first, second):\n            self.assertEqual(first_array, second_array)\n        self.assertEqual(first, second)\n\n    def test_query_table_int(self):\n        pa_table = self._create_dummy_table()\n        table = InMemoryTable(pa_table)\n        n = pa_table.num_rows\n        # classical usage\n        subtable = query_table(table, 0)\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A[:1], \"b\": _COL_B[:1], \"c\": _COL_C[:1]}))\n        subtable = query_table(table, 1)\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A[1:2], \"b\": _COL_B[1:2], \"c\": _COL_C[1:2]}))\n        subtable = query_table(table, -1)\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A[-1:], \"b\": _COL_B[-1:], \"c\": _COL_C[-1:]}))\n        # raise an IndexError\n        with self.assertRaises(IndexError):\n            query_table(table, n)\n        with self.assertRaises(IndexError):\n            query_table(table, -(n + 1))\n        # with indices\n        indices = InMemoryTable(self._create_dummy_arrow_indices())\n        subtable = query_table(table, 0, indices=indices)\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict({\"a\": [_COL_A[_INDICES[0]]], \"b\": [_COL_B[_INDICES[0]]], \"c\": [_COL_C[_INDICES[0]]]}),\n        )\n        with self.assertRaises(IndexError):\n   
         assert len(indices) < n\n            query_table(table, len(indices), indices=indices)\n\n    def test_query_table_slice(self):\n        pa_table = self._create_dummy_table()\n        table = InMemoryTable(pa_table)\n        n = pa_table.num_rows\n        # classical usage\n        subtable = query_table(table, slice(0, 1))\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A[:1], \"b\": _COL_B[:1], \"c\": _COL_C[:1]}))\n        subtable = query_table(table, slice(1, 2))\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A[1:2], \"b\": _COL_B[1:2], \"c\": _COL_C[1:2]}))\n        subtable = query_table(table, slice(-2, -1))\n        self.assertTableEqual(\n            subtable, pa.Table.from_pydict({\"a\": _COL_A[-2:-1], \"b\": _COL_B[-2:-1], \"c\": _COL_C[-2:-1]})\n        )\n        # usage with None\n        subtable = query_table(table, slice(-1, None))\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A[-1:], \"b\": _COL_B[-1:], \"c\": _COL_C[-1:]}))\n        subtable = query_table(table, slice(None, n + 1))\n        self.assertTableEqual(\n            subtable, pa.Table.from_pydict({\"a\": _COL_A[: n + 1], \"b\": _COL_B[: n + 1], \"c\": _COL_C[: n + 1]})\n        )\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A, \"b\": _COL_B, \"c\": _COL_C}))\n        subtable = query_table(table, slice(-(n + 1), None))\n        self.assertTableEqual(\n            subtable, pa.Table.from_pydict({\"a\": _COL_A[-(n + 1) :], \"b\": _COL_B[-(n + 1) :], \"c\": _COL_C[-(n + 1) :]})\n        )\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A, \"b\": _COL_B, \"c\": _COL_C}))\n        # usage with step\n        subtable = query_table(table, slice(None, None, 2))\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A[::2], \"b\": _COL_B[::2], \"c\": _COL_C[::2]}))\n        # empty output but no errors\n        subtable = 
query_table(table, slice(-1, 0))  # usage with both negative and positive idx\n        assert len(_COL_A[-1:0]) == 0\n        self.assertTableEqual(subtable, pa_table.slice(0, 0))\n        subtable = query_table(table, slice(2, 1))\n        assert len(_COL_A[2:1]) == 0\n        self.assertTableEqual(subtable, pa_table.slice(0, 0))\n        subtable = query_table(table, slice(n, n))\n        assert len(_COL_A[n:n]) == 0\n        self.assertTableEqual(subtable, pa_table.slice(0, 0))\n        subtable = query_table(table, slice(n, n + 1))\n        assert len(_COL_A[n : n + 1]) == 0\n        self.assertTableEqual(subtable, pa_table.slice(0, 0))\n        # it's not possible to get an error with a slice\n\n        # with indices\n        indices = InMemoryTable(self._create_dummy_arrow_indices())\n        subtable = query_table(table, slice(0, 1), indices=indices)\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict({\"a\": [_COL_A[_INDICES[0]]], \"b\": [_COL_B[_INDICES[0]]], \"c\": [_COL_C[_INDICES[0]]]}),\n        )\n        subtable = query_table(table, slice(n - 1, n), indices=indices)\n        assert len(indices.column(0).to_pylist()[n - 1 : n]) == 0\n        self.assertTableEqual(subtable, pa_table.slice(0, 0))\n\n    def test_query_table_range(self):\n        pa_table = self._create_dummy_table()\n        table = InMemoryTable(pa_table)\n        n = pa_table.num_rows\n        np_A, np_B, np_C = np.array(_COL_A, dtype=np.int64), np.array(_COL_B), np.array(_COL_C)\n        # classical usage\n        subtable = query_table(table, range(0, 1))\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict({\"a\": np_A[range(0, 1)], \"b\": np_B[range(0, 1)], \"c\": np_C[range(0, 1)].tolist()}),\n        )\n        subtable = query_table(table, range(1, 2))\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict({\"a\": np_A[range(1, 2)], \"b\": np_B[range(1, 2)], \"c\": 
np_C[range(1, 2)].tolist()}),\n        )\n        subtable = query_table(table, range(-2, -1))\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict(\n                {\"a\": np_A[range(-2, -1)], \"b\": np_B[range(-2, -1)], \"c\": np_C[range(-2, -1)].tolist()}\n            ),\n        )\n        # usage with both negative and positive idx\n        subtable = query_table(table, range(-1, 0))\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict({\"a\": np_A[range(-1, 0)], \"b\": np_B[range(-1, 0)], \"c\": np_C[range(-1, 0)].tolist()}),\n        )\n        subtable = query_table(table, range(-1, n))\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict({\"a\": np_A[range(-1, n)], \"b\": np_B[range(-1, n)], \"c\": np_C[range(-1, n)].tolist()}),\n        )\n        # usage with step\n        subtable = query_table(table, range(0, n, 2))\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict(\n                {\"a\": np_A[range(0, n, 2)], \"b\": np_B[range(0, n, 2)], \"c\": np_C[range(0, n, 2)].tolist()}\n            ),\n        )\n        subtable = query_table(table, range(0, n + 1, 2 * n))\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict(\n                {\n                    \"a\": np_A[range(0, n + 1, 2 * n)],\n                    \"b\": np_B[range(0, n + 1, 2 * n)],\n                    \"c\": np_C[range(0, n + 1, 2 * n)].tolist(),\n                }\n            ),\n        )\n        # empty output but no errors\n        subtable = query_table(table, range(2, 1))\n        assert len(np_A[range(2, 1)]) == 0\n        self.assertTableEqual(subtable, pa.Table.from_batches([], schema=pa_table.schema))\n        subtable = query_table(table, range(n, n))\n        assert len(np_A[range(n, n)]) == 0\n        self.assertTableEqual(subtable, pa.Table.from_batches([], schema=pa_table.schema))\n    
    # raise an IndexError\n        with self.assertRaises(IndexError):\n            with self.assertRaises(IndexError):\n                np_A[range(0, n + 1)]\n            query_table(table, range(0, n + 1))\n        with self.assertRaises(IndexError):\n            with self.assertRaises(IndexError):\n                np_A[range(-(n + 1), -1)]\n            query_table(table, range(-(n + 1), -1))\n        with self.assertRaises(IndexError):\n            with self.assertRaises(IndexError):\n                np_A[range(n, n + 1)]\n            query_table(table, range(n, n + 1))\n        # with indices\n        indices = InMemoryTable(self._create_dummy_arrow_indices())\n        subtable = query_table(table, range(0, 1), indices=indices)\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict({\"a\": [_COL_A[_INDICES[0]]], \"b\": [_COL_B[_INDICES[0]]], \"c\": [_COL_C[_INDICES[0]]]}),\n        )\n        with self.assertRaises(IndexError):\n            assert len(indices) < n\n            query_table(table, range(len(indices), len(indices) + 1), indices=indices)\n\n    def test_query_table_str(self):\n        pa_table = self._create_dummy_table()\n        table = InMemoryTable(pa_table)\n        subtable = query_table(table, \"a\")\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A}))\n        with self.assertRaises(KeyError):\n            query_table(table, \"z\")\n        indices = InMemoryTable(self._create_dummy_arrow_indices())\n        subtable = query_table(table, \"a\", indices=indices)\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": [_COL_A[i] for i in _INDICES]}))\n\n    def test_query_table_iterable(self):\n        pa_table = self._create_dummy_table()\n        table = InMemoryTable(pa_table)\n        n = pa_table.num_rows\n        np_A, np_B, np_C = np.array(_COL_A, dtype=np.int64), np.array(_COL_B), np.array(_COL_C)\n        # classical usage\n        subtable = 
query_table(table, [0])\n        self.assertTableEqual(\n            subtable, pa.Table.from_pydict({\"a\": np_A[[0]], \"b\": np_B[[0]], \"c\": np_C[[0]].tolist()})\n        )\n        subtable = query_table(table, [1])\n        self.assertTableEqual(\n            subtable, pa.Table.from_pydict({\"a\": np_A[[1]], \"b\": np_B[[1]], \"c\": np_C[[1]].tolist()})\n        )\n        subtable = query_table(table, [-1])\n        self.assertTableEqual(\n            subtable, pa.Table.from_pydict({\"a\": np_A[[-1]], \"b\": np_B[[-1]], \"c\": np_C[[-1]].tolist()})\n        )\n        subtable = query_table(table, [0, -1, 1])\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict({\"a\": np_A[[0, -1, 1]], \"b\": np_B[[0, -1, 1]], \"c\": np_C[[0, -1, 1]].tolist()}),\n        )\n        # numpy iterable\n        subtable = query_table(table, np.array([0, -1, 1]))\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict({\"a\": np_A[[0, -1, 1]], \"b\": np_B[[0, -1, 1]], \"c\": np_C[[0, -1, 1]].tolist()}),\n        )\n        # empty output but no errors\n        subtable = query_table(table, [])\n        assert len(np_A[[]]) == 0\n        self.assertTableEqual(subtable, pa.Table.from_batches([], schema=pa_table.schema))\n        # raise an IndexError\n        with self.assertRaises(IndexError):\n            with self.assertRaises(IndexError):\n                np_A[[n]]\n            query_table(table, [n])\n        with self.assertRaises(IndexError):\n            with self.assertRaises(IndexError):\n                np_A[[-(n + 1)]]\n            query_table(table, [-(n + 1)])\n        # with indices\n        indices = InMemoryTable(self._create_dummy_arrow_indices())\n        subtable = query_table(table, [0], indices=indices)\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict({\"a\": [_COL_A[_INDICES[0]]], \"b\": [_COL_B[_INDICES[0]]], \"c\": [_COL_C[_INDICES[0]]]}),\n        
)\n        with self.assertRaises(IndexError):\n            assert len(indices) < n\n            query_table(table, [len(indices)], indices=indices)\n\n    def test_query_table_indexable_type(self):\n        pa_table = self._create_dummy_table()\n        table = InMemoryTable(pa_table)\n        n = pa_table.num_rows\n        # classical usage\n        subtable = query_table(table, np.int64(0))\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A[:1], \"b\": _COL_B[:1], \"c\": _COL_C[:1]}))\n        subtable = query_table(table, np.int64(1))\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A[1:2], \"b\": _COL_B[1:2], \"c\": _COL_C[1:2]}))\n        subtable = query_table(table, np.int64(-1))\n        self.assertTableEqual(subtable, pa.Table.from_pydict({\"a\": _COL_A[-1:], \"b\": _COL_B[-1:], \"c\": _COL_C[-1:]}))\n        # raise an IndexError\n        with self.assertRaises(IndexError):\n            query_table(table, np.int64(n))\n        with self.assertRaises(IndexError):\n            query_table(table, np.int64(-(n + 1)))\n        # with indices\n        indices = InMemoryTable(self._create_dummy_arrow_indices())\n        subtable = query_table(table, np.int64(0), indices=indices)\n        self.assertTableEqual(\n            subtable,\n            pa.Table.from_pydict({\"a\": [_COL_A[_INDICES[0]]], \"b\": [_COL_B[_INDICES[0]]], \"c\": [_COL_C[_INDICES[0]]]}),\n        )\n        with self.assertRaises(IndexError):\n            assert len(indices) < n\n            query_table(table, np.int64(len(indices)), indices=indices)\n\n    def test_query_table_invalid_key_type(self):\n        pa_table = self._create_dummy_table()\n        table = InMemoryTable(pa_table)\n        with self.assertRaises(TypeError):\n            query_table(table, 0.0)\n        with self.assertRaises(TypeError):\n            query_table(table, [0, \"a\"])\n        with self.assertRaises(TypeError):\n            query_table(table, int)\n       
 with self.assertRaises(TypeError):\n\n            def iter_to_inf(start=0):\n                while True:\n                    yield start\n                    start += 1\n\n            query_table(table, iter_to_inf())\n\n\n@pytest.fixture(scope=\"session\")\ndef arrow_table():\n    return pa.Table.from_pydict({\"col_int\": [0, 1, 2], \"col_float\": [0.0, 1.0, 2.0]})\n\n\n@require_tf\n@pytest.mark.parametrize(\n    \"cast_schema\",\n    [\n        None,\n        [(\"col_int\", pa.int64()), (\"col_float\", pa.float64())],\n        [(\"col_int\", pa.int32()), (\"col_float\", pa.float64())],\n        [(\"col_int\", pa.int64()), (\"col_float\", pa.float32())],\n    ],\n)\ndef test_tf_formatter_sets_default_dtypes(cast_schema, arrow_table):\n    import tensorflow as tf\n\n    from datasets.formatting import TFFormatter\n\n    if cast_schema:\n        arrow_table = arrow_table.cast(pa.schema(cast_schema))\n    arrow_table_dict = arrow_table.to_pydict()\n    list_int = arrow_table_dict[\"col_int\"]\n    list_float = arrow_table_dict[\"col_float\"]\n    formatter = TFFormatter()\n\n    row = formatter.format_row(arrow_table)\n    tf.debugging.assert_equal(row[\"col_int\"], tf.ragged.constant(list_int, dtype=tf.int64)[0])\n    tf.debugging.assert_equal(row[\"col_float\"], tf.ragged.constant(list_float, dtype=tf.float32)[0])\n\n    col = formatter.format_column(arrow_table)\n    tf.debugging.assert_equal(col, tf.ragged.constant(list_int, dtype=tf.int64))\n\n    batch = formatter.format_batch(arrow_table)\n    tf.debugging.assert_equal(batch[\"col_int\"], tf.ragged.constant(list_int, dtype=tf.int64))\n    tf.debugging.assert_equal(batch[\"col_float\"], tf.ragged.constant(list_float, dtype=tf.float32))\n\n\n@require_numpy1_on_windows\n@require_torch\n@pytest.mark.parametrize(\n    \"cast_schema\",\n    [\n        None,\n        [(\"col_int\", pa.int64()), (\"col_float\", pa.float64())],\n        [(\"col_int\", pa.int32()), (\"col_float\", pa.float64())],\n        
[(\"col_int\", pa.int64()), (\"col_float\", pa.float32())],\n    ],\n)\ndef test_torch_formatter_sets_default_dtypes(cast_schema, arrow_table):\n    import torch\n\n    from datasets.formatting import TorchFormatter\n\n    if cast_schema:\n        arrow_table = arrow_table.cast(pa.schema(cast_schema))\n    arrow_table_dict = arrow_table.to_pydict()\n    list_int = arrow_table_dict[\"col_int\"]\n    list_float = arrow_table_dict[\"col_float\"]\n    formatter = TorchFormatter()\n\n    row = formatter.format_row(arrow_table)\n    torch.testing.assert_close(row[\"col_int\"], torch.tensor(list_int, dtype=torch.int64)[0])\n    torch.testing.assert_close(row[\"col_float\"], torch.tensor(list_float, dtype=torch.float32)[0])\n\n    col = formatter.format_column(arrow_table)\n    torch.testing.assert_close(col, torch.tensor(list_int, dtype=torch.int64))\n\n    batch = formatter.format_batch(arrow_table)\n    torch.testing.assert_close(batch[\"col_int\"], torch.tensor(list_int, dtype=torch.int64))\n    torch.testing.assert_close(batch[\"col_float\"], torch.tensor(list_float, dtype=torch.float32))\n\n\ndef test_iterable_dataset_of_arrays_format_to_arrow(any_arrays_dataset: IterableDataset):\n    formatted = any_arrays_dataset.with_format(\"arrow\")\n    assert all(isinstance(example, pa.Table) for example in formatted)\n\n\ndef test_iterable_dataset_of_arrays_format_to_numpy(any_arrays_dataset: IterableDataset):\n    formatted = any_arrays_dataset.with_format(\"np\")\n    assert all(isinstance(example[\"array\"], np.ndarray) for example in formatted)\n\n\n@require_torch\ndef test_iterable_dataset_of_arrays_format_to_torch(any_arrays_dataset: IterableDataset):\n    import torch\n\n    formatted = any_arrays_dataset.with_format(\"torch\")\n    assert all(isinstance(example[\"array\"], torch.Tensor) for example in formatted)\n\n\n@require_tf\ndef test_iterable_dataset_of_arrays_format_to_tf(any_arrays_dataset: IterableDataset):\n    import tensorflow as tf\n\n    formatted = 
any_arrays_dataset.with_format(\"tf\")\n    assert all(isinstance(example[\"array\"], tf.Tensor) for example in formatted)\n\n\n@require_jax\ndef test_iterable_dataset_of_arrays_format_to_jax(any_arrays_dataset: IterableDataset):\n    import jax.numpy as jnp\n\n    formatted = any_arrays_dataset.with_format(\"jax\")\n    assert all(isinstance(example[\"array\"], jnp.ndarray) for example in formatted)\n"
  },
  {
    "path": "tests/test_hub.py",
    "content": "from textwrap import dedent\nfrom types import SimpleNamespace\nfrom unittest.mock import patch\nfrom urllib.parse import quote\n\nimport pytest\nfrom huggingface_hub import CommitOperationAdd, CommitOperationDelete\n\nimport datasets\nfrom datasets.config import METADATA_CONFIGS_FIELD\nfrom datasets.hub import delete_from_hub\nfrom datasets.utils.hub import hf_dataset_url\n\n\n@pytest.mark.parametrize(\"repo_id\", [\"canonical_dataset_name\", \"org-name/dataset-name\"])\n@pytest.mark.parametrize(\"filename\", [\"filename.csv\", \"filename with blanks.csv\"])\n@pytest.mark.parametrize(\"revision\", [None, \"v2\"])\ndef test_dataset_url(repo_id, filename, revision):\n    url = hf_dataset_url(repo_id=repo_id, filename=filename, revision=revision)\n    assert url == f\"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{quote(filename)}\"\n\n\ndef test_delete_from_hub(temporary_repo, hf_api, hf_token, csv_path, ci_hub_config) -> None:\n    with temporary_repo() as repo_id:\n        hf_api.create_repo(repo_id, token=hf_token, repo_type=\"dataset\")\n        hf_api.upload_file(\n            path_or_fileobj=str(csv_path),\n            path_in_repo=\"cats/train/0000.csv\",\n            repo_id=repo_id,\n            repo_type=\"dataset\",\n            token=hf_token,\n        )\n        hf_api.upload_file(\n            path_or_fileobj=str(csv_path),\n            path_in_repo=\"dogs/train/0000.csv\",\n            repo_id=repo_id,\n            repo_type=\"dataset\",\n            token=hf_token,\n        )\n        hf_api.upload_file(\n            token=hf_token,\n            path_or_fileobj=dedent(\n                f\"\"\"\\\n            ---\n            {METADATA_CONFIGS_FIELD}:\n            - config_name: cats\n              data_files:\n              - split: train\n                path: cats/train/*\n            - config_name: dogs\n              data_files:\n              - split: train\n                path: dogs/train/*\n            
---\n            \"\"\"\n            ).encode(),\n            path_in_repo=\"README.md\",\n            repo_id=repo_id,\n            repo_type=\"dataset\",\n        )\n        commit_info = SimpleNamespace(\n            pr_url=\"https:///hub-ci.huggingface.co/datasets/__DUMMY_USER__/__DUMMY_DATASET__/refs%2Fpr%2F1\"\n        )\n        with patch.object(datasets.hub.HfApi, \"create_commit\", return_value=commit_info) as mock_method:\n            _ = delete_from_hub(repo_id, \"dogs\")\n    assert mock_method.called\n    assert mock_method.call_args.kwargs.get(\"commit_message\") == \"Delete 'dogs' config\"\n    assert mock_method.call_args.kwargs.get(\"create_pr\")\n    expected_operations = [\n        CommitOperationDelete(path_in_repo=\"dogs/train/0000.csv\", is_folder=False),\n        CommitOperationAdd(\n            path_in_repo=\"README.md\",\n            path_or_fileobj=dedent(\n                f\"\"\"\\\n            ---\n            {METADATA_CONFIGS_FIELD}:\n            - config_name: cats\n              data_files:\n              - split: train\n                path: cats/train/*\n            ---\n            \"\"\"\n            ).encode(),\n        ),\n    ]\n    assert mock_method.call_args.kwargs.get(\"operations\") == expected_operations\n"
  },
  {
    "path": "tests/test_info.py",
    "content": "import os\n\nimport pytest\nimport yaml\n\nfrom datasets.features.features import Features, Value\nfrom datasets.info import DatasetInfo, DatasetInfosDict\nfrom datasets.utils.py_utils import asdict\n\n\n@pytest.mark.parametrize(\n    \"files\",\n    [\n        [\"full:README.md\", \"dataset_infos.json\"],\n        [\"empty:README.md\", \"dataset_infos.json\"],\n        [\"dataset_infos.json\"],\n        [\"full:README.md\"],\n    ],\n)\ndef test_from_dir(files, tmp_path_factory):\n    dataset_infos_dir = tmp_path_factory.mktemp(\"dset_infos_dir\")\n    if \"full:README.md\" in files:\n        with open(dataset_infos_dir / \"README.md\", \"w\") as f:\n            f.write(\"---\\ndataset_info:\\n  dataset_size: 42\\n---\")\n    if \"empty:README.md\" in files:\n        with open(dataset_infos_dir / \"README.md\", \"w\") as f:\n            f.write(\"\")\n    # we want to support dataset_infos.json for backward compatibility\n    if \"dataset_infos.json\" in files:\n        with open(dataset_infos_dir / \"dataset_infos.json\", \"w\") as f:\n            f.write('{\"default\": {\"dataset_size\": 42}}')\n    dataset_infos = DatasetInfosDict.from_directory(dataset_infos_dir)\n    assert dataset_infos\n    assert dataset_infos[\"default\"].dataset_size == 42\n\n\n@pytest.mark.parametrize(\n    \"dataset_info\",\n    [\n        DatasetInfo(),\n        DatasetInfo(\n            description=\"foo\",\n            features=Features({\"a\": Value(\"int32\")}),\n            builder_name=\"builder\",\n            config_name=\"config\",\n            version=\"1.0.0\",\n            splits=[{\"name\": \"train\"}],\n            download_size=42,\n        ),\n    ],\n)\ndef test_dataset_info_dump_and_reload(tmp_path, dataset_info: DatasetInfo):\n    tmp_path = str(tmp_path)\n    dataset_info.write_to_directory(tmp_path)\n    reloaded = DatasetInfo.from_directory(tmp_path)\n    assert dataset_info == reloaded\n    assert os.path.exists(os.path.join(tmp_path, 
\"dataset_info.json\"))\n\n\ndef test_dataset_info_to_yaml_dict():\n    dataset_info = DatasetInfo(\n        description=\"foo\",\n        citation=\"bar\",\n        homepage=\"https://foo.bar\",\n        license=\"CC0\",\n        features=Features({\"a\": Value(\"int32\")}),\n        post_processed={},\n        supervised_keys=(),\n        builder_name=\"builder\",\n        config_name=\"config\",\n        version=\"1.0.0\",\n        splits=[{\"name\": \"train\", \"num_examples\": 42}],\n        download_checksums={},\n        download_size=1337,\n        post_processing_size=442,\n        dataset_size=1234,\n        size_in_bytes=1337 + 442 + 1234,\n    )\n    dataset_info_yaml_dict = dataset_info._to_yaml_dict()\n    assert sorted(dataset_info_yaml_dict) == sorted(DatasetInfo._INCLUDED_INFO_IN_YAML)\n    for key in DatasetInfo._INCLUDED_INFO_IN_YAML:\n        assert key in dataset_info_yaml_dict\n        assert isinstance(dataset_info_yaml_dict[key], (list, dict, int, str))\n    dataset_info_yaml = yaml.safe_dump(dataset_info_yaml_dict)\n    reloaded = yaml.safe_load(dataset_info_yaml)\n    assert dataset_info_yaml_dict == reloaded\n\n\ndef test_dataset_info_to_yaml_dict_empty():\n    dataset_info = DatasetInfo()\n    dataset_info_yaml_dict = dataset_info._to_yaml_dict()\n    assert dataset_info_yaml_dict == {}\n\n\n@pytest.mark.parametrize(\n    \"dataset_infos_dict\",\n    [\n        DatasetInfosDict(),\n        DatasetInfosDict({\"default\": DatasetInfo()}),\n        DatasetInfosDict({\"my_config_name\": DatasetInfo()}),\n        DatasetInfosDict(\n            {\n                \"default\": DatasetInfo(\n                    description=\"foo\",\n                    features=Features({\"a\": Value(\"int32\")}),\n                    builder_name=\"builder\",\n                    config_name=\"config\",\n                    version=\"1.0.0\",\n                    splits=[{\"name\": \"train\"}],\n                    download_size=42,\n                )\n         
   }\n        ),\n        DatasetInfosDict(\n            {\n                \"v1\": DatasetInfo(dataset_size=42),\n                \"v2\": DatasetInfo(dataset_size=1337),\n            }\n        ),\n    ],\n)\ndef test_dataset_infos_dict_dump_and_reload(tmp_path, dataset_infos_dict: DatasetInfosDict):\n    tmp_path = str(tmp_path)\n    dataset_infos_dict.write_to_directory(tmp_path)\n    reloaded = DatasetInfosDict.from_directory(tmp_path)\n\n    # the config_name of the dataset_infos_dict take over the attribute\n    for config_name, dataset_info in dataset_infos_dict.items():\n        dataset_info.config_name = config_name\n        # the yaml representation doesn't include fields like description or citation\n        # so we just test that we can recover what we can from the yaml\n        dataset_infos_dict[config_name] = DatasetInfo._from_yaml_dict(dataset_info._to_yaml_dict())\n    assert dataset_infos_dict == reloaded\n\n    if dataset_infos_dict:\n        assert os.path.exists(os.path.join(tmp_path, \"README.md\"))\n\n\n@pytest.mark.parametrize(\n    \"dataset_info\",\n    [\n        None,\n        DatasetInfo(),\n        DatasetInfo(\n            description=\"foo\",\n            features=Features({\"a\": Value(\"int32\")}),\n            builder_name=\"builder\",\n            config_name=\"config\",\n            version=\"1.0.0\",\n            splits=[{\"name\": \"train\"}],\n            download_size=42,\n            dataset_name=\"dataset_name\",\n        ),\n    ],\n)\ndef test_from_merge_same_dataset_infos(dataset_info):\n    num_elements = 3\n    if dataset_info is not None:\n        dataset_info_list = [dataset_info.copy() for _ in range(num_elements)]\n    else:\n        dataset_info_list = [None] * num_elements\n    dataset_info_merged = DatasetInfo.from_merge(dataset_info_list)\n    if dataset_info is not None:\n        assert dataset_info == dataset_info_merged\n    else:\n        assert DatasetInfo() == dataset_info_merged\n\n\ndef 
test_dataset_info_from_dict_with_large_list():\n    dataset_info_dict = {\n        \"citation\": \"\",\n        \"description\": \"\",\n        \"features\": {\"col_1\": {\"feature\": {\"dtype\": \"int64\", \"_type\": \"Value\"}, \"_type\": \"LargeList\"}},\n        \"homepage\": \"\",\n        \"license\": \"\",\n    }\n    dataset_info = DatasetInfo.from_dict(dataset_info_dict)\n    assert asdict(dataset_info) == dataset_info_dict\n"
  },
  {
    "path": "tests/test_info_utils.py",
    "content": "import pytest\n\nimport datasets.config\nfrom datasets.utils.info_utils import is_small_dataset\n\n\n@pytest.mark.parametrize(\"dataset_size\", [None, 400 * 2**20, 600 * 2**20])\n@pytest.mark.parametrize(\"input_in_memory_max_size\", [\"default\", 0, 100 * 2**20, 900 * 2**20])\ndef test_is_small_dataset(dataset_size, input_in_memory_max_size, monkeypatch):\n    if input_in_memory_max_size != \"default\":\n        monkeypatch.setattr(datasets.config, \"IN_MEMORY_MAX_SIZE\", input_in_memory_max_size)\n    in_memory_max_size = datasets.config.IN_MEMORY_MAX_SIZE\n    if input_in_memory_max_size == \"default\":\n        assert in_memory_max_size == 0\n    else:\n        assert in_memory_max_size == input_in_memory_max_size\n    if dataset_size and in_memory_max_size:\n        expected = dataset_size < in_memory_max_size\n    else:\n        expected = False\n    result = is_small_dataset(dataset_size)\n    assert result == expected\n"
  },
  {
    "path": "tests/test_inspect.py",
    "content": "import pytest\n\nfrom datasets.exceptions import DatasetNotFoundError\nfrom datasets.inspect import (\n    get_dataset_config_info,\n    get_dataset_config_names,\n    get_dataset_default_config_name,\n    get_dataset_infos,\n    get_dataset_split_names,\n)\n\n\npytestmark = pytest.mark.integration\n\n\n@pytest.mark.parametrize(\n    \"path, config_name, expected_splits\",\n    [\n        (\"rajpurkar/squad\", \"plain_text\", [\"train\", \"validation\"]),\n        (\"dalle-mini/wit\", \"default\", [\"train\"]),\n        (\"paws\", \"labeled_final\", [\"train\", \"test\", \"validation\"]),\n    ],\n)\ndef test_get_dataset_config_info(path, config_name, expected_splits):\n    info = get_dataset_config_info(path, config_name=config_name)\n    assert info.config_name == config_name\n    assert list(info.splits.keys()) == expected_splits\n\n\ndef test_get_dataset_config_info_private(hf_token, hf_private_dataset_repo_txt_data):\n    info = get_dataset_config_info(hf_private_dataset_repo_txt_data, config_name=\"default\", token=hf_token)\n    assert list(info.splits.keys()) == [\"train\"]\n\n\n@pytest.mark.parametrize(\n    \"path, config_name, expected_exception\",\n    [\n        (\"paws\", None, ValueError),\n        # non-existing, gated, private:\n        (\"hf-internal-testing/non-existing-dataset\", \"default\", DatasetNotFoundError),\n        (\"hf-internal-testing/gated_dataset_with_data_files\", \"default\", DatasetNotFoundError),\n        (\"hf-internal-testing/private_dataset_with_data_files\", \"default\", DatasetNotFoundError),\n        (\"hf-internal-testing/gated_dataset_with_data_files\", \"default\", DatasetNotFoundError),\n        (\"hf-internal-testing/private_dataset_with_data_files\", \"default\", DatasetNotFoundError),\n    ],\n)\ndef test_get_dataset_config_info_raises(path, config_name, expected_exception):\n    with pytest.raises(expected_exception):\n        get_dataset_config_info(path, 
config_name=config_name)\n\n\n@pytest.mark.parametrize(\n    \"path, expected\",\n    [\n        (\"amirveyseh/acronym_identification\", [\"default\"]),\n        (\"rajpurkar/squad\", [\"plain_text\"]),\n        (\"dalle-mini/wit\", [\"default\"]),\n        (\"hf-internal-testing/librispeech_asr_dummy\", [\"clean\"]),\n        (\"hf-internal-testing/audiofolder_no_configs_in_metadata\", [\"default\"]),\n        (\"hf-internal-testing/audiofolder_single_config_in_metadata\", [\"custom\"]),\n        (\"hf-internal-testing/audiofolder_two_configs_in_metadata\", [\"v1\", \"v2\"]),\n    ],\n)\ndef test_get_dataset_config_names(path, expected):\n    config_names = get_dataset_config_names(path)\n    assert config_names == expected\n\n\n@pytest.mark.parametrize(\n    \"path, expected\",\n    [\n        (\"amirveyseh/acronym_identification\", \"default\"),\n        (\"rajpurkar/squad\", \"plain_text\"),\n        (\"dalle-mini/wit\", \"default\"),\n        (\"hf-internal-testing/librispeech_asr_dummy\", \"clean\"),\n        (\"hf-internal-testing/audiofolder_no_configs_in_metadata\", \"default\"),\n        (\"hf-internal-testing/audiofolder_single_config_in_metadata\", \"custom\"),\n        (\"hf-internal-testing/audiofolder_two_configs_in_metadata\", None),\n    ],\n)\ndef test_get_dataset_default_config_name(path, expected):\n    default_config_name = get_dataset_default_config_name(path)\n    if expected:\n        assert default_config_name == expected\n    else:\n        assert default_config_name is None\n\n\n@pytest.mark.parametrize(\n    \"path, expected_configs, expected_splits_in_first_config\",\n    [\n        (\"rajpurkar/squad\", [\"plain_text\"], [\"train\", \"validation\"]),\n        (\"dalle-mini/wit\", [\"default\"], [\"train\"]),\n        (\"paws\", [\"labeled_final\", \"labeled_swap\", \"unlabeled_final\"], [\"train\", \"test\", \"validation\"]),\n    ],\n)\ndef test_get_dataset_info(path, expected_configs, expected_splits_in_first_config):\n    infos = 
get_dataset_infos(path)\n    assert list(infos.keys()) == expected_configs\n    expected_config = expected_configs[0]\n    assert expected_config in infos\n    info = infos[expected_config]\n    assert info.config_name == expected_config\n    assert list(info.splits.keys()) == expected_splits_in_first_config\n\n\n@pytest.mark.parametrize(\n    \"path, expected_config, expected_splits\",\n    [\n        (\"rajpurkar/squad\", \"plain_text\", [\"train\", \"validation\"]),\n        (\"dalle-mini/wit\", \"default\", [\"train\"]),\n        (\"paws\", \"labeled_final\", [\"train\", \"test\", \"validation\"]),\n    ],\n)\ndef test_get_dataset_split_names(path, expected_config, expected_splits):\n    infos = get_dataset_infos(path)\n    assert expected_config in infos\n    info = infos[expected_config]\n    assert info.config_name == expected_config\n    assert list(info.splits.keys()) == expected_splits\n\n\n@pytest.mark.parametrize(\n    \"path, config_name, expected_exception\",\n    [\n        (\"paws\", None, ValueError),\n    ],\n)\ndef test_get_dataset_split_names_error(path, config_name, expected_exception):\n    with pytest.raises(expected_exception):\n        get_dataset_split_names(path, config_name=config_name)\n"
  },
  {
    "path": "tests/test_iterable_dataset.py",
    "content": "import asyncio\nimport pickle\nimport time\nfrom copy import deepcopy\nfrom dataclasses import dataclass\nfrom itertools import chain, cycle, islice\nfrom unittest.mock import MagicMock, patch\n\nimport numpy as np\nimport pandas as pd\nimport pyarrow as pa\nimport pyarrow.compute as pc\nimport pytest\nfrom huggingface_hub import HfFileSystemResolvedPath\nfrom packaging import version\n\nfrom datasets import Dataset, config, load_dataset\nfrom datasets.combine import concatenate_datasets, interleave_datasets\nfrom datasets.distributed import split_dataset_by_node\nfrom datasets.features import (\n    ClassLabel,\n    Features,\n    Image,\n    List,\n    Value,\n)\nfrom datasets.formatting import Formatter, get_format_type_from_alias\nfrom datasets.info import DatasetInfo\nfrom datasets.iterable_dataset import (\n    ArrowExamplesIterable,\n    BufferShuffledExamplesIterable,\n    CyclingMultiSourcesExamplesIterable,\n    ExamplesIterable,\n    FilteredExamplesIterable,\n    FormattedExamplesIterable,\n    FormattingConfig,\n    HorizontallyConcatenatedMultiSourcesExamplesIterable,\n    IterableColumn,\n    IterableDataset,\n    MappedExamplesIterable,\n    RandomlyCyclingMultiSourcesExamplesIterable,\n    RebatchedArrowExamplesIterable,\n    RepeatExamplesIterable,\n    SelectColumnsIterable,\n    SkipExamplesIterable,\n    StepExamplesIterable,\n    TakeExamplesIterable,\n    VerticallyConcatenatedMultiSourcesExamplesIterable,\n    _BaseExamplesIterable,\n    _batch_to_examples,\n    _convert_to_arrow,\n    _examples_to_batch,\n)\n\nfrom .utils import (\n    assert_arrow_memory_doesnt_increase,\n    require_dill_gt_0_3_2,\n    require_jax,\n    require_not_windows,\n    require_numpy1_on_windows,\n    require_polars,\n    require_pyspark,\n    require_tf,\n    require_torch,\n    require_torchdata_stateful_dataloader,\n)\n\n\nif config.HF_HUB_VERSION >= version.parse(\"1.6.0\"):\n    from huggingface_hub.errors import BucketNotFoundError\n    from 
huggingface_hub.hf_file_system import HfFileSystemResolvedBucketPath, HfFileSystemResolvedRepositoryPath\n\nelse:\n    BucketNotFoundError = None\n    HfFileSystemResolvedBucketPath = None\n    HfFileSystemResolvedRepositoryPath = HfFileSystemResolvedPath\n\nSAMPLE_DATASET_IDENTIFIER = \"hf-internal-testing/dataset_with_data_files\"\n\nDEFAULT_N_EXAMPLES = 20\nDEFAULT_BATCH_SIZE = 4\nDEFAULT_FILEPATH = \"file.txt\"\n\n\ndef generate_examples_fn(**kwargs):\n    kwargs = kwargs.copy()\n    n = kwargs.pop(\"n\", DEFAULT_N_EXAMPLES)\n    filepaths = kwargs.pop(\"filepaths\", None)\n    for filepath in filepaths or [DEFAULT_FILEPATH]:\n        if filepaths is not None:\n            kwargs[\"filepath\"] = filepath\n        for i in range(n):\n            yield f\"{filepath}_{i}\", {\"id\": i, **kwargs}\n\n\ndef generate_tables_fn(**kwargs):\n    kwargs = kwargs.copy()\n    n = kwargs.pop(\"n\", DEFAULT_N_EXAMPLES)\n    batch_size = kwargs.pop(\"batch_size\", DEFAULT_BATCH_SIZE)\n    filepaths = kwargs.pop(\"filepaths\", None)\n    for filepath in filepaths or [DEFAULT_FILEPATH]:\n        buffer = []\n        batch_idx = 0\n        if filepaths is not None:\n            kwargs[\"filepath\"] = filepath\n        for i in range(n):\n            buffer.append({\"id\": i, **kwargs})\n            if len(buffer) == batch_size:\n                yield f\"{filepath}_{batch_idx}\", pa.Table.from_pylist(buffer)\n                buffer = []\n                batch_idx += 1\n        yield batch_idx, pa.Table.from_pylist(buffer)\n\n\n@pytest.fixture\ndef dataset():\n    ex_iterable = ExamplesIterable(generate_examples_fn, {})\n    return IterableDataset(ex_iterable, info=DatasetInfo(description=\"dummy\"), split=\"train\")\n\n\n@pytest.fixture\ndef dataset_with_several_columns():\n    ex_iterable = ExamplesIterable(\n        generate_examples_fn,\n        {\"filepath\": [\"data0.txt\", \"data1.txt\", \"data2.txt\"], \"metadata\": {\"sources\": [\"https://foo.bar\"]}},\n    )\n    return 
IterableDataset(ex_iterable, info=DatasetInfo(description=\"dummy\"), split=\"train\")\n\n\n@pytest.fixture\ndef arrow_file(tmp_path_factory, dataset: IterableDataset):\n    filename = str(tmp_path_factory.mktemp(\"data\") / \"file.arrow\")\n    Dataset.from_generator(dataset.__iter__).map(cache_file_name=filename)\n    return filename\n\n\ndef assert_load_state_dict_resumes_iteration(ex_iterable: _BaseExamplesIterable):\n    ex_iterable._init_state_dict()\n    state_dicts = [ex_iterable.state_dict()]\n    examples = []\n    for _, example in ex_iterable:\n        state_dicts.append(ex_iterable.state_dict())\n        examples.append(example)\n    for i, state_dict in enumerate(state_dicts):\n        ex_iterable.load_state_dict(state_dict)\n        examples_after_resuming = [example for _, example in ex_iterable]\n        assert examples_after_resuming == examples[i:], f\"resuming from idx {i} with {state_dict=}\"\n\n\ndef assert_load_state_dict_resumes_arrow_iteration(ex_iterable: _BaseExamplesIterable):\n    assert ex_iterable.iter_arrow is not None\n    ex_iterable._init_state_dict()\n    state_dicts = [ex_iterable.state_dict()]\n    examples = []\n    indices = [0]\n    for _, pa_table in ex_iterable.iter_arrow():\n        state_dicts.append(ex_iterable.state_dict())\n        examples.extend(pa_table.to_pylist())\n        indices.append(indices[-1] + len(pa_table))\n    for i, state_dict in zip(indices, state_dicts):\n        ex_iterable.load_state_dict(state_dict)\n        examples_after_resuming = [\n            example for _, pa_table in ex_iterable.iter_arrow() for example in pa_table.to_pylist()\n        ]\n        assert examples_after_resuming == examples[i:], f\"resuming from idx {i} with {state_dict=}\"\n\n\n################################\n#\n#   Utilities tests\n#\n################################\n\n\n@pytest.mark.parametrize(\"batch_size\", [1, 2, 3, 9, 10, 11, 20])\n@pytest.mark.parametrize(\"drop_last_batch\", [False, True])\ndef 
test_convert_to_arrow(batch_size, drop_last_batch):\n    examples = [{\"foo\": i} for i in range(10)]\n    full_table = pa.Table.from_pylist(examples)\n    num_rows = len(full_table) if not drop_last_batch else len(full_table) // batch_size * batch_size\n    num_batches = (num_rows // batch_size) + 1 if num_rows % batch_size else num_rows // batch_size\n    subtables = list(\n        _convert_to_arrow(\n            list(enumerate(examples)),\n            batch_size=batch_size,\n            drop_last_batch=drop_last_batch,\n        )\n    )\n    assert len(subtables) == num_batches\n    if drop_last_batch:\n        assert all(len(subtable) == batch_size for _, subtable in subtables)\n    else:\n        assert all(len(subtable) == batch_size for _, subtable in subtables[:-1])\n        assert len(subtables[-1][1]) <= batch_size\n    if num_rows > 0:\n        reloaded = pa.concat_tables([subtable for _, subtable in subtables])\n        assert full_table.slice(0, num_rows).to_pydict() == reloaded.to_pydict()\n\n\n################################\n#\n#   _BaseExampleIterable tests\n#\n################################\n\n\ndef test_examples_iterable():\n    ex_iterable = ExamplesIterable(generate_examples_fn, {})\n    expected = list(generate_examples_fn())\n    assert next(iter(ex_iterable)) == expected[0]\n    assert list(ex_iterable) == expected\n    assert ex_iterable.iter_arrow is None\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\ndef test_examples_iterable_with_kwargs():\n    ex_iterable = ExamplesIterable(generate_examples_fn, {\"filepaths\": [\"0.txt\", \"1.txt\"], \"split\": \"train\"})\n    expected = list(generate_examples_fn(filepaths=[\"0.txt\", \"1.txt\"], split=\"train\"))\n    assert list(ex_iterable) == expected\n    assert all(\"split\" in ex for _, ex in ex_iterable)\n    assert sorted({ex[\"filepath\"] for _, ex in ex_iterable}) == [\"0.txt\", \"1.txt\"]\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\ndef 
test_examples_iterable_shuffle_data_sources():\n    ex_iterable = ExamplesIterable(generate_examples_fn, {\"filepaths\": [\"0.txt\", \"1.txt\"]})\n    ex_iterable = ex_iterable.shuffle_data_sources(np.random.default_rng(40))\n    expected = list(generate_examples_fn(filepaths=[\"1.txt\", \"0.txt\"]))  # shuffle the filepaths\n    assert list(ex_iterable) == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\ndef test_examples_iterable_shuffle_shards_and_metadata():\n    def gen(filepaths, all_metadata):\n        for i, (filepath, metadata) in enumerate(zip(filepaths, all_metadata)):\n            yield i, {\"filepath\": filepath, \"metadata\": metadata}\n\n    ex_iterable = ExamplesIterable(\n        gen,\n        {\n            \"filepaths\": [f\"{i}.txt\" for i in range(100)],\n            \"all_metadata\": [{\"id\": str(i)} for i in range(100)],\n        },\n    )\n    ex_iterable = ex_iterable.shuffle_data_sources(np.random.default_rng(42))\n    out = list(ex_iterable)\n    filepaths_ids = [x[\"filepath\"].split(\".\")[0] for _, x in out]\n    metadata_ids = [x[\"metadata\"][\"id\"] for _, x in out]\n    assert filepaths_ids == metadata_ids, \"entangled lists of shards/metadata should be shuffled the same way\"\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\ndef test_arrow_examples_iterable():\n    ex_iterable = ArrowExamplesIterable(generate_tables_fn, {})\n    expected = sum([pa_table.to_pylist() for _, pa_table in generate_tables_fn()], [])\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [example for _, example in ex_iterable] == expected\n    expected = list(generate_tables_fn())\n    assert list(ex_iterable.iter_arrow()) == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\ndef test_arrow_examples_iterable_with_kwargs():\n    ex_iterable = ArrowExamplesIterable(generate_tables_fn, {\"filepaths\": [\"0.txt\", \"1.txt\"], \"split\": \"train\"})\n    expected = sum(\n        
[pa_table.to_pylist() for _, pa_table in generate_tables_fn(filepaths=[\"0.txt\", \"1.txt\"], split=\"train\")], []\n    )\n    assert [example for _, example in ex_iterable] == expected\n    assert all(\"split\" in ex for _, ex in ex_iterable)\n    assert sorted({ex[\"filepath\"] for _, ex in ex_iterable}) == [\"0.txt\", \"1.txt\"]\n    expected = list(generate_tables_fn(filepaths=[\"0.txt\", \"1.txt\"], split=\"train\"))\n    assert list(ex_iterable.iter_arrow()) == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\ndef test_arrow_examples_iterable_shuffle_data_sources():\n    ex_iterable = ArrowExamplesIterable(generate_tables_fn, {\"filepaths\": [\"0.txt\", \"1.txt\"]})\n    ex_iterable = ex_iterable.shuffle_data_sources(np.random.default_rng(40))\n    expected = sum(\n        [pa_table.to_pylist() for _, pa_table in generate_tables_fn(filepaths=[\"1.txt\", \"0.txt\"])], []\n    )  # shuffle the filepaths\n    assert [example for _, example in ex_iterable] == expected\n    expected = list(generate_tables_fn(filepaths=[\"1.txt\", \"0.txt\"]))\n    assert list(ex_iterable.iter_arrow()) == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"tables\",\n    [\n        [pa.table({\"foo\": range(10)})],\n        [pa.table({\"foo\": range(5 * i, 5 * (i + 1))}) for i in range(2)],\n        [pa.table({\"foo\": range(5 * i, 5 * (i + 1))}) for i in range(7)],\n        [pa.table({\"foo\": [i]}) for i in range(10)],\n    ],\n)\n@pytest.mark.parametrize(\"batch_size\", [1, 2, 3, 7, 9, 10, 11, 13, 20])\n@pytest.mark.parametrize(\"drop_last_batch\", [False, True])\ndef test_rebatched_arrow_examples_iterable(tables, batch_size, drop_last_batch):\n    full_table = pa.concat_tables(tables)\n    num_rows = len(full_table) if not drop_last_batch else len(full_table) // batch_size * batch_size\n    num_batches = (num_rows // batch_size) + 1 if num_rows % batch_size else num_rows // batch_size\n\n    def 
gen(tables):\n        for i, table in enumerate(tables):\n            yield str(i), table\n\n    ex_iterable = ArrowExamplesIterable(gen, {\"tables\": tables})\n    ex_iterable = RebatchedArrowExamplesIterable(ex_iterable, batch_size=batch_size, drop_last_batch=drop_last_batch)\n    subtables = list(ex_iterable.iter_arrow())\n    assert len(subtables) == num_batches\n    if drop_last_batch:\n        assert all(len(subtable) == batch_size for _, subtable in subtables)\n    else:\n        assert all(len(subtable) == batch_size for _, subtable in subtables[:-1])\n        assert len(subtables[-1][1]) <= batch_size\n    if num_rows > 0:\n        reloaded = pa.concat_tables([subtable for _, subtable in subtables])\n        assert full_table.slice(0, num_rows).to_pydict() == reloaded.to_pydict()\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n    assert_load_state_dict_resumes_arrow_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\"seed\", [42, 1337, 101010, 123456])\ndef test_buffer_shuffled_examples_iterable(seed):\n    n, buffer_size = 100, 30\n    generator = np.random.default_rng(seed)\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    ex_iterable = BufferShuffledExamplesIterable(base_ex_iterable, buffer_size=buffer_size, generator=generator)\n\n    rng = deepcopy(generator)\n    expected_indices_used_for_shuffling = list(\n        islice(BufferShuffledExamplesIterable._iter_random_indices(rng, buffer_size=buffer_size), n - buffer_size)\n    )\n    # indices to pick in the shuffle buffer should all be in the right range\n    assert all(0 <= index_to_pick < buffer_size for index_to_pick in expected_indices_used_for_shuffling)\n    # it should be random indices\n    assert expected_indices_used_for_shuffling != list(range(buffer_size))\n\n    # The final order of examples is the result of a shuffle buffer.\n    all_examples = list(generate_examples_fn(n=n))\n    # We create a buffer and we pick random examples from it.\n    
buffer, rest = all_examples[:buffer_size], all_examples[buffer_size:]\n    expected = []\n    for i, index_to_pick in enumerate(expected_indices_used_for_shuffling):\n        expected.append(buffer[index_to_pick])\n        # The picked examples are directly replaced by the next examples from the iterable.\n        buffer[index_to_pick] = rest.pop(0)\n    # Once we have reached the end of the iterable, we shuffle the buffer and return the remaining examples.\n    rng.shuffle(buffer)\n    expected += buffer\n\n    assert next(iter(ex_iterable)) == expected[0]\n    assert list(ex_iterable) == expected\n    assert sorted(ex_iterable) == sorted(all_examples)\n\n\ndef test_cycling_multi_sources_examples_iterable():\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"text\": \"foo\"})\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {\"text\": \"bar\"})\n    ex_iterable = CyclingMultiSourcesExamplesIterable([ex_iterable1, ex_iterable2])\n    expected = list(chain(*zip(generate_examples_fn(text=\"foo\"), generate_examples_fn(text=\"bar\"))))\n\n    # The cycling stops as soon as one iterable is out of examples (here ex_iterable1), so the last sample from ex_iterable2 is unnecessary\n    expected = expected[:-1]\n\n    assert next(iter(ex_iterable)) == expected[0]\n    assert list(ex_iterable) == expected\n    assert all((x[\"id\"], x[\"text\"]) == (i // 2, \"bar\" if i % 2 else \"foo\") for i, (_, x) in enumerate(ex_iterable))\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\"probabilities\", [None, (0.5, 0.5), (0.9, 0.1)])\ndef test_randomly_cycling_multi_sources_examples_iterable(probabilities):\n    seed = 42\n    generator = np.random.default_rng(seed)\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"text\": \"foo\"})\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {\"text\": \"bar\"})\n    ex_iterable = RandomlyCyclingMultiSourcesExamplesIterable(\n        [ex_iterable1, ex_iterable2], 
generator=generator, probabilities=probabilities\n    )\n\n    # The source used randomly changes at each example. It stops when one of the iterators is empty.\n    rng = deepcopy(generator)\n    iterators = (generate_examples_fn(text=\"foo\"), generate_examples_fn(text=\"bar\"))\n    indices_iterator = cycle(rng.choice(len(iterators), size=1000, p=probabilities))\n    expected = []\n    lengths = [len(list(ex_iterable1)), len(list(ex_iterable2))]\n    for i in indices_iterator:\n        if lengths[0] == 0 or lengths[1] == 0:\n            break\n        for key, example in iterators[i]:\n            expected.append((key, example))\n            lengths[i] -= 1\n            break\n        else:\n            break\n\n    assert next(iter(ex_iterable)) == expected[0]\n    assert list(ex_iterable) == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\"probabilities\", [None, (0.5, 0.5), (0.9, 0.1)])\n@pytest.mark.parametrize(\"stopping_strategy\", [\"first_exhausted\", \"all_exhausted\"])\n@pytest.mark.parametrize(\"step\", [-1, 0, 5, 20, 30, 300])\ndef test_randomly_cycling_multi_sources_examples_iterable_state(probabilities, stopping_strategy, step):\n    seed = 42\n    generator = np.random.default_rng(seed)\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"text\": \"foo\"})\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {\"text\": \"bar\"})\n    ex_iterable = RandomlyCyclingMultiSourcesExamplesIterable(\n        [ex_iterable1, ex_iterable2],\n        generator=generator,\n        probabilities=probabilities,\n        stopping_strategy=stopping_strategy,\n    )\n    step = min(step, len(list(ex_iterable)) - 1)\n    ex_iterable._init_state_dict()\n    state_dict = ex_iterable.state_dict()\n    examples = []\n    for i, x in enumerate(ex_iterable):\n        examples.append(x)\n        if i == step:\n            state_dict = ex_iterable.state_dict()\n    ex_iterable.load_state_dict(state_dict)\n    
assert examples[step + 1 :] == list(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size\",\n    [\n        (3, lambda x: {\"id+1\": x[\"id\"] + 1}, False, None),  # just add 1 to the id\n        (3, lambda x: {\"id+1\": [x[\"id\"][0] + 1]}, True, 1),  # same with bs=1\n        (5, lambda x: {\"id+1\": [i + 1 for i in x[\"id\"]]}, True, 10),  # same with bs=10\n        (25, lambda x: {\"id+1\": [i + 1 for i in x[\"id\"]]}, True, 10),  # same with bs=10\n        (5, lambda x: {\"id+1\": [i + 1 for i in x[\"id\"]]}, True, None),  # same with bs=None\n        (5, lambda x: {\"id+1\": [i + 1 for i in x[\"id\"]]}, True, -1),  # same with bs<=0\n        (3, lambda x: {k: v * 2 for k, v in x.items()}, True, 1),  # make a duplicate of each example\n    ],\n)\ndef test_mapped_examples_iterable(n, func, batched, batch_size):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    ex_iterable = MappedExamplesIterable(base_ex_iterable, func, batched=batched, batch_size=batch_size)\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    if batched is False:\n        expected = [{**x, **func(x)} for x in all_examples]\n    else:\n        # For batched map we have to format the examples as a batch (i.e. 
in one single dictionary) to pass the batch to the function\n        all_transformed_examples = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = _examples_to_batch(examples)\n            transformed_batch = func(batch)\n            all_transformed_examples.extend(_batch_to_examples(transformed_batch))\n        expected = _examples_to_batch(all_examples)\n        expected.update(_examples_to_batch(all_transformed_examples))\n        expected = list(_batch_to_examples(expected))\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size\",\n    [\n        (3, lambda x: {\"id+1\": x[\"id\"] + 1}, False, None),  # just add 1 to the id\n        (3, lambda x: {\"id+1\": [x[\"id\"][0] + 1]}, True, 1),  # same with bs=1\n        (5, lambda x: {\"id+1\": [i + 1 for i in x[\"id\"]]}, True, 10),  # same with bs=10\n        (25, lambda x: {\"id+1\": [i + 1 for i in x[\"id\"]]}, True, 10),  # same with bs=10\n        (5, lambda x: {\"id+1\": [i + 1 for i in x[\"id\"]]}, True, None),  # same with bs=None\n        (5, lambda x: {\"id+1\": [i + 1 for i in x[\"id\"]]}, True, -1),  # same with bs<=0\n        (3, lambda x: {k: v * 2 for k, v in x.items()}, True, 1),  # make a duplicate of each example\n    ],\n)\ndef test_mapped_examples_iterable_drop_last_batch(n, func, batched, batch_size):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    ex_iterable = MappedExamplesIterable(\n        base_ex_iterable, func, batched=batched, batch_size=batch_size, drop_last_batch=True\n    )\n    
all_examples = [x for _, x in generate_examples_fn(n=n)]\n    is_empty = False\n    if batched is False:\n        # `drop_last_batch` has no effect here\n        expected = [{**x, **func(x)} for x in all_examples]\n    else:\n        # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function\n        all_transformed_examples = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            if len(examples) < batch_size:  # ignore last batch\n                break\n            batch = _examples_to_batch(examples)\n            transformed_batch = func(batch)\n            all_transformed_examples.extend(_batch_to_examples(transformed_batch))\n        all_examples = all_examples if n % batch_size == 0 else all_examples[: n // batch_size * batch_size]\n        if all_examples:\n            expected = _examples_to_batch(all_examples)\n            expected.update(_examples_to_batch(all_transformed_examples))\n            expected = list(_batch_to_examples(expected))\n        else:\n            is_empty = True\n\n    if not is_empty:\n        assert next(iter(ex_iterable))[1] == expected[0]\n        assert [x for _, x in ex_iterable] == expected\n    else:\n        with pytest.raises(StopIteration):\n            next(iter(ex_iterable))\n\n\ndef _wrap_async(func, *args, **kwargs):\n    async def wrapped_func(*args, **kwargs):\n        return func(*args, **kwargs)\n\n    return wrapped_func\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size\",\n    [\n        (3, lambda x, index: {\"id+idx\": x[\"id\"] + index}, False, None),  # add the index to the id\n        (\n            25,\n            lambda x, indices: 
{\"id+idx\": [i + j for i, j in zip(x[\"id\"], indices)]},\n            True,\n            10,\n        ),  # add the index to the id\n        (5, lambda x, indices: {\"id+idx\": [i + j for i, j in zip(x[\"id\"], indices)]}, True, None),  # same with bs=None\n        (5, lambda x, indices: {\"id+idx\": [i + j for i, j in zip(x[\"id\"], indices)]}, True, -1),  # same with bs<=0\n    ],\n)\n@pytest.mark.parametrize(\"wrapper\", [lambda x: x, _wrap_async])\ndef test_mapped_examples_iterable_with_indices(n, func, batched, batch_size, wrapper):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    ex_iterable = MappedExamplesIterable(\n        base_ex_iterable, wrapper(func), batched=batched, batch_size=batch_size, with_indices=True\n    )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    if batched is False:\n        expected = [{**x, **func(x, idx)} for idx, x in enumerate(all_examples)]\n    else:\n        # For batched map we have to format the examples as a batch (i.e. 
in one single dictionary) to pass the batch to the function\n        all_transformed_examples = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = _examples_to_batch(examples)\n            indices = list(range(batch_offset, batch_offset + len(examples)))\n            transformed_batch = func(batch, indices)\n            all_transformed_examples.extend(_batch_to_examples(transformed_batch))\n        expected = _examples_to_batch(all_examples)\n        expected.update(_examples_to_batch(all_transformed_examples))\n        expected = list(_batch_to_examples(expected))\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size, remove_columns\",\n    [\n        (3, lambda x: {\"id+1\": x[\"id\"] + 1}, False, None, [\"extra_column\"]),  # just add 1 to the id\n        (25, lambda x: {\"id+1\": [i + 1 for i in x[\"id\"]]}, True, 10, [\"extra_column\"]),  # same with bs=10\n        (\n            50,\n            lambda x: {\"foo\": [\"bar\"] * np.random.default_rng(x[\"id\"][0]).integers(0, 10)},\n            True,\n            8,\n            [\"extra_column\", \"id\"],\n        ),  # return a random number (0 to 9) of examples per batch\n        (5, lambda x: {\"id+1\": [i + 1 for i in x[\"id\"]]}, True, None, [\"extra_column\"]),  # same with bs=None\n        (5, lambda x: {\"id+1\": [i + 1 for i in x[\"id\"]]}, True, -1, [\"extra_column\"]),  # same with bs<=0\n    ],\n)\ndef test_mapped_examples_iterable_remove_columns(n, func, batched, batch_size, remove_columns):\n    base_ex_iterable = 
ExamplesIterable(generate_examples_fn, {\"n\": n, \"extra_column\": \"foo\"})\n    ex_iterable = MappedExamplesIterable(\n        base_ex_iterable, func, batched=batched, batch_size=batch_size, remove_columns=remove_columns\n    )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    columns_to_remove = remove_columns if isinstance(remove_columns, list) else [remove_columns]\n    if batched is False:\n        expected = [{**{k: v for k, v in x.items() if k not in columns_to_remove}, **func(x)} for x in all_examples]\n    else:\n        # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function\n        all_transformed_examples = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = _examples_to_batch(examples)\n            transformed_batch = func(batch)\n            all_transformed_examples.extend(_batch_to_examples(transformed_batch))\n        expected = {k: v for k, v in _examples_to_batch(all_examples).items() if k not in columns_to_remove}\n        expected.update(_examples_to_batch(all_transformed_examples))\n        expected = list(_batch_to_examples(expected))\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\n# issue #7345 and PR #7353\n@pytest.mark.parametrize(\"batched\", [False, True])\n@pytest.mark.parametrize(\"batch_size\", [None, 2])\n@pytest.mark.parametrize(\"input_columns\", [None, [\"i\"]])\n@pytest.mark.parametrize(\"remove_columns\", [None, [\"i\"]])\n@pytest.mark.parametrize(\"new_output\", [False, True])\ndef test_iterable_dataset_vs_dataset_map(batched, 
batch_size, input_columns, remove_columns, new_output):\n    if input_columns is not None and not new_output:\n        return\n\n    ds1 = Dataset.from_list([{\"i\": i} for i in range(4)])\n\n    if batched:\n\n        def f1(i):\n            return {\"i\": [j + 1 for j in i]}\n    else:\n\n        def f1(i):\n            return {\"i\": i + 1}\n\n    if input_columns is None:\n\n        def f2(x):\n            return f1(x[\"i\"])\n    else:\n        f2 = f1\n\n    if new_output:\n        f = f2\n    else:\n\n        def f(x):\n            x[\"i\"] = f2(x)[\"i\"]\n            return x\n\n    r = [\n        list(\n            ds2.map(\n                f,\n                batch_size=batch_size,\n                batched=batched,\n                remove_columns=remove_columns,\n                input_columns=input_columns,\n            )\n        )\n        for ds2 in [ds1, ds1.to_iterable_dataset()]\n    ]\n    r[1] = [x for x in r[1] if len(x) > 0]\n    assert len(r[0]) == len(r[1])\n    assert all(x == y for x, y in zip(*r))\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size, fn_kwargs\",\n    [\n        (3, lambda x, y=0: {\"id+y\": x[\"id\"] + y}, False, None, None),\n        (3, lambda x, y=0: {\"id+y\": x[\"id\"] + y}, False, None, {\"y\": 3}),\n        (25, lambda x, y=0: {\"id+y\": [i + y for i in x[\"id\"]]}, True, 10, {\"y\": 3}),\n        (5, lambda x, y=0: {\"id+y\": [i + y for i in x[\"id\"]]}, True, None, {\"y\": 3}),  # same with bs=None\n        (5, lambda x, y=0: {\"id+y\": [i + y for i in x[\"id\"]]}, True, -1, {\"y\": 3}),  # same with bs<=0\n    ],\n)\ndef test_mapped_examples_iterable_fn_kwargs(n, func, batched, batch_size, fn_kwargs):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    ex_iterable = MappedExamplesIterable(\n        base_ex_iterable, func, batched=batched, batch_size=batch_size, fn_kwargs=fn_kwargs\n    )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    if fn_kwargs is 
None:\n        fn_kwargs = {}\n    if batched is False:\n        expected = [{**x, **func(x, **fn_kwargs)} for x in all_examples]\n    else:\n        # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function\n        all_transformed_examples = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = _examples_to_batch(examples)\n            transformed_batch = func(batch, **fn_kwargs)\n            all_transformed_examples.extend(_batch_to_examples(transformed_batch))\n        expected = _examples_to_batch(all_examples)\n        expected.update(_examples_to_batch(all_transformed_examples))\n        expected = list(_batch_to_examples(expected))\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size, input_columns\",\n    [\n        (3, lambda id_: {\"id+1\": id_ + 1}, False, None, [\"id\"]),  # just add 1 to the id\n        (25, lambda ids_: {\"id+1\": [i + 1 for i in ids_]}, True, 10, [\"id\"]),  # same with bs=10\n        (5, lambda ids_: {\"id+1\": [i + 1 for i in ids_]}, True, None, [\"id\"]),  # same with bs=None\n        (5, lambda ids_: {\"id+1\": [i + 1 for i in ids_]}, True, -1, [\"id\"]),  # same with bs<=0\n    ],\n)\ndef test_mapped_examples_iterable_input_columns(n, func, batched, batch_size, input_columns):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    ex_iterable = MappedExamplesIterable(\n        base_ex_iterable, func, batched=batched, batch_size=batch_size, input_columns=input_columns\n 
   )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    columns_to_input = input_columns if isinstance(input_columns, list) else [input_columns]\n    if batched is False:\n        expected = [{**x, **func(*[x[col] for col in columns_to_input])} for x in all_examples]\n    else:\n        # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function\n        all_transformed_examples = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = _examples_to_batch(examples)\n            transformed_batch = func(*[batch[col] for col in columns_to_input])\n            all_transformed_examples.extend(_batch_to_examples(transformed_batch))\n        expected = _examples_to_batch(all_examples)\n        expected.update(_examples_to_batch(all_transformed_examples))\n        expected = list(_batch_to_examples(expected))\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size\",\n    [\n        (3, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), False, None),  # just add 1 to the id\n        (3, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, 1),  # same with bs=1\n        (5, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, 10),  # same with bs=10\n        (25, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, 10),  # same with bs=10\n        (5, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, None),  # same with bs=None\n        (5, lambda t: 
t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, -1),  # same with bs<=0\n        (3, lambda t: pa.concat_tables([t] * 2), True, 1),  # make a duplicate of each example\n    ],\n)\ndef test_mapped_examples_iterable_arrow_format(n, func, batched, batch_size):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    base_ex_iterable = RebatchedArrowExamplesIterable(\n        base_ex_iterable, batch_size=batch_size if batched else 1, force_convert_to_arrow=True\n    )\n    ex_iterable = MappedExamplesIterable(\n        base_ex_iterable,\n        func,\n        batched=batched,\n        batch_size=batch_size,\n        formatting=FormattingConfig(format_type=\"arrow\"),\n    )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    if batched is False:\n        expected = [func(pa.Table.from_pylist([x])).to_pylist()[0] for x in all_examples]\n    else:\n        expected = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = pa.Table.from_pylist(examples)\n            expected.extend(func(batch).to_pylist())\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n    assert_load_state_dict_resumes_arrow_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size\",\n    [\n        (3, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), False, None),  # just add 1 to the id\n        (3, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, 1),  # same with bs=1\n        (5, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, 10),  # same with bs=10\n        (25, lambda t: 
t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, 10),  # same with bs=10\n        (5, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, None),  # same with bs=None\n        (5, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, -1),  # same with bs<=0\n        (3, lambda t: pa.concat_tables([t] * 2), True, 1),  # make a duplicate of each example\n    ],\n)\ndef test_mapped_examples_iterable_arrow_format_from_arrow_examples_iterable(n, func, batched, batch_size):\n    base_ex_iterable = ArrowExamplesIterable(generate_tables_fn, {\"n\": n})\n    base_ex_iterable = RebatchedArrowExamplesIterable(base_ex_iterable, batch_size=batch_size if batched else 1)\n    ex_iterable = MappedExamplesIterable(\n        base_ex_iterable,\n        func,\n        batched=batched,\n        batch_size=batch_size,\n        formatting=FormattingConfig(format_type=\"arrow\"),\n    )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    if batched is False:\n        expected = [func(pa.Table.from_pylist([x])).to_pylist()[0] for x in all_examples]\n    else:\n        expected = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = pa.Table.from_pylist(examples)\n            expected.extend(func(batch).to_pylist())\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n    assert_load_state_dict_resumes_arrow_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size\",\n    [\n        (3, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), False, None),  # just add 1 to the id\n        (3, lambda t: 
t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, 1),  # same with bs=1\n        (5, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, 10),  # same with bs=10\n        (25, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, 10),  # same with bs=10\n        (5, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, None),  # same with bs=None\n        (5, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, -1),  # same with bs<=0\n        (3, lambda t: pa.concat_tables([t] * 2), True, 1),  # make a duplicate of each example\n    ],\n)\ndef test_mapped_examples_iterable_drop_last_batch_and_arrow_format(n, func, batched, batch_size):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    base_ex_iterable = RebatchedArrowExamplesIterable(\n        base_ex_iterable, batch_size=batch_size if batched else 1, force_convert_to_arrow=True\n    )\n    ex_iterable = MappedExamplesIterable(\n        base_ex_iterable,\n        func,\n        batched=batched,\n        batch_size=batch_size,\n        drop_last_batch=True,\n        formatting=FormattingConfig(format_type=\"arrow\"),\n    )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    is_empty = False\n    if batched is False:\n        # `drop_last_batch` has no effect here\n        expected = [func(pa.Table.from_pylist([x])).to_pylist()[0] for x in all_examples]\n    else:\n        all_transformed_examples = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            if len(examples) < batch_size:  # ignore last batch\n                break\n            batch = pa.Table.from_pylist(examples)\n            out = func(batch)\n            
all_transformed_examples.extend(\n                out.to_pylist()\n            )  # we don't merge with input since they're arrow tables and not dictionaries\n        all_examples = all_examples if n % batch_size == 0 else all_examples[: n // batch_size * batch_size]\n        if all_examples:\n            expected = all_transformed_examples\n        else:\n            is_empty = True\n\n    if not is_empty:\n        assert next(iter(ex_iterable))[1] == expected[0]\n        assert [x for _, x in ex_iterable] == expected\n    else:\n        with pytest.raises(StopIteration):\n            next(iter(ex_iterable))\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size\",\n    [\n        (\n            3,\n            lambda t, index: t.append_column(\"id+idx\", pc.add(t[\"id\"], index)),\n            False,\n            None,\n        ),  # add the index to the id\n        (\n            25,\n            lambda t, indices: t.append_column(\"id+idx\", pc.add(t[\"id\"], indices)),\n            True,\n            10,\n        ),  # add the index to the id\n        (5, lambda t, indices: t.append_column(\"id+idx\", pc.add(t[\"id\"], indices)), True, None),  # same with bs=None\n        (5, lambda t, indices: t.append_column(\"id+idx\", pc.add(t[\"id\"], indices)), True, -1),  # same with bs<=0\n    ],\n)\ndef test_mapped_examples_iterable_with_indices_and_arrow_format(n, func, batched, batch_size):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    base_ex_iterable = RebatchedArrowExamplesIterable(\n        base_ex_iterable, batch_size=batch_size if batched else 1, force_convert_to_arrow=True\n    )\n    ex_iterable = MappedExamplesIterable(\n        base_ex_iterable,\n        func,\n        batched=batched,\n        batch_size=batch_size,\n        with_indices=True,\n        formatting=FormattingConfig(format_type=\"arrow\"),\n    )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    if batched is False:\n        
expected = [func(pa.Table.from_pylist([x]), i).to_pylist()[0] for i, x in enumerate(all_examples)]\n    else:\n        expected = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = pa.Table.from_pylist(examples)\n            expected.extend(func(batch, list(range(batch_offset, batch_offset + len(batch)))).to_pylist())\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n    assert_load_state_dict_resumes_arrow_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size, remove_columns\",\n    [\n        (\n            3,\n            lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)),\n            False,\n            None,\n            [\"extra_column\"],\n        ),  # just add 1 to the id\n        (25, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, 10, [\"extra_column\"]),  # same with bs=10\n        (\n            50,\n            lambda t: pa.table({\"foo\": [\"bar\"] * np.random.default_rng(t[\"id\"][0].as_py()).integers(0, 10)}),\n            True,\n            8,\n            [\"extra_column\", \"id\"],\n        ),  # return a random number (0 to 9) of examples per batch\n        (5, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, None, [\"extra_column\"]),  # same with bs=None\n        (5, lambda t: t.append_column(\"id+1\", pc.add(t[\"id\"], 1)), True, -1, [\"extra_column\"]),  # same with bs<=0\n    ],\n)\ndef test_mapped_examples_iterable_remove_columns_arrow_format(n, func, batched, batch_size, remove_columns):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n, 
\"extra_column\": \"foo\"})\n    base_ex_iterable = RebatchedArrowExamplesIterable(\n        base_ex_iterable, batch_size=batch_size if batched else 1, force_convert_to_arrow=True\n    )\n    ex_iterable = MappedExamplesIterable(\n        base_ex_iterable,\n        func,\n        batched=batched,\n        batch_size=batch_size,\n        remove_columns=remove_columns,\n        formatting=FormattingConfig(format_type=\"arrow\"),\n    )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    columns_to_remove = remove_columns if isinstance(remove_columns, list) else [remove_columns]\n    if batched is False:\n        expected = [\n            {**{k: v for k, v in func(pa.Table.from_pylist([x])).to_pylist()[0].items() if k not in columns_to_remove}}\n            for x in all_examples\n        ]\n    else:\n        expected = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = pa.Table.from_pylist(examples)\n            expected.extend(\n                [{k: v for k, v in x.items() if k not in columns_to_remove} for x in func(batch).to_pylist()]\n            )\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n    assert_load_state_dict_resumes_arrow_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size, fn_kwargs\",\n    [\n        (3, lambda t, y=0: t.append_column(\"id+idx\", pc.add(t[\"id\"], y)), False, None, None),\n        (3, lambda t, y=0: t.append_column(\"id+idx\", pc.add(t[\"id\"], y)), False, None, {\"y\": 3}),\n        (25, lambda t, y=0: t.append_column(\"id+idx\", pc.add(t[\"id\"], y)), True, 10, 
{\"y\": 3}),\n        (5, lambda t, y=0: t.append_column(\"id+idx\", pc.add(t[\"id\"], y)), True, None, {\"y\": 3}),  # same with bs=None\n        (5, lambda t, y=0: t.append_column(\"id+idx\", pc.add(t[\"id\"], y)), True, -1, {\"y\": 3}),  # same with bs<=0\n    ],\n)\ndef test_mapped_examples_iterable_fn_kwargs_and_arrow_format(n, func, batched, batch_size, fn_kwargs):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    base_ex_iterable = RebatchedArrowExamplesIterable(\n        base_ex_iterable, batch_size=batch_size if batched else 1, force_convert_to_arrow=True\n    )\n    ex_iterable = MappedExamplesIterable(\n        base_ex_iterable,\n        func,\n        batched=batched,\n        batch_size=batch_size,\n        fn_kwargs=fn_kwargs,\n        formatting=FormattingConfig(format_type=\"arrow\"),\n    )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    if fn_kwargs is None:\n        fn_kwargs = {}\n    if batched is False:\n        expected = [func(pa.Table.from_pylist([x]), **fn_kwargs).to_pylist()[0] for x in all_examples]\n    else:\n        expected = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = pa.Table.from_pylist(examples)\n            expected.extend(func(batch, **fn_kwargs).to_pylist())\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n    assert_load_state_dict_resumes_arrow_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size, input_columns\",\n    [\n        (3, lambda id_: pa.table({\"id+1\": pc.add(id_, 1)}), False, None, [\"id\"]),  # just add 1 to 
the id\n        (25, lambda ids_: pa.table({\"id+1\": pc.add(ids_, 1)}), True, 10, [\"id\"]),  # same with bs=10\n        (5, lambda ids_: pa.table({\"id+1\": pc.add(ids_, 1)}), True, None, [\"id\"]),  # same with bs=None\n        (5, lambda ids_: pa.table({\"id+1\": pc.add(ids_, 1)}), True, -1, [\"id\"]),  # same with bs<=0\n    ],\n)\ndef test_mapped_examples_iterable_input_columns_and_arrow_format(n, func, batched, batch_size, input_columns):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    base_ex_iterable = RebatchedArrowExamplesIterable(\n        base_ex_iterable, batch_size=batch_size if batched else 1, force_convert_to_arrow=True\n    )\n    ex_iterable = MappedExamplesIterable(\n        base_ex_iterable,\n        func,\n        batched=batched,\n        batch_size=batch_size,\n        input_columns=input_columns,\n        formatting=FormattingConfig(format_type=\"arrow\"),\n    )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    columns_to_input = input_columns if isinstance(input_columns, list) else [input_columns]\n    if batched is False:\n        expected = [\n            func(*[pa.Table.from_pylist([x])[col] for col in columns_to_input]).to_pylist()[0] for x in all_examples\n        ]\n    else:\n        expected = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = pa.Table.from_pylist(examples)\n            expected.extend(func(*[batch[col] for col in columns_to_input]).to_pylist())\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n    
assert_load_state_dict_resumes_arrow_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size\",\n    [\n        (3, lambda x: x[\"id\"] % 2 == 0, False, None),  # keep even number\n        (3, lambda x: [x[\"id\"][0] % 2 == 0], True, 1),  # same with bs=1\n        (25, lambda x: [i % 2 == 0 for i in x[\"id\"]], True, 10),  # same with bs=10\n        (5, lambda x: [i % 2 == 0 for i in x[\"id\"]], True, None),  # same with bs=None\n        (5, lambda x: [i % 2 == 0 for i in x[\"id\"]], True, -1),  # same with bs<=0\n        (3, lambda x: False, False, None),  # return 0 examples\n        (3, lambda x: [False] * len(x[\"id\"]), True, 10),  # same with bs=10\n    ],\n)\ndef test_filtered_examples_iterable(n, func, batched, batch_size):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    ex_iterable = FilteredExamplesIterable(base_ex_iterable, func, batched=batched, batch_size=batch_size)\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    if batched is False:\n        expected = [x for x in all_examples if func(x)]\n    else:\n        # For batched filter we have to format the examples as a batch (i.e. 
in one single dictionary) to pass the batch to the function\n        expected = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = _examples_to_batch(examples)\n            mask = func(batch)\n            expected.extend([x for x, to_keep in zip(examples, mask) if to_keep])\n    if expected:\n        assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size\",\n    [\n        (3, lambda x, index: index % 2 == 0, False, None),  # keep even number\n        (25, lambda x, indices: [idx % 2 == 0 for idx in indices], True, 10),  # same with bs=10\n        (5, lambda x, indices: [idx % 2 == 0 for idx in indices], True, None),  # same with bs=None\n        (5, lambda x, indices: [idx % 2 == 0 for idx in indices], True, -1),  # same with bs<=0\n    ],\n)\ndef test_filtered_examples_iterable_with_indices(n, func, batched, batch_size):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    ex_iterable = FilteredExamplesIterable(\n        base_ex_iterable, func, batched=batched, batch_size=batch_size, with_indices=True\n    )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    if batched is False:\n        expected = [x for idx, x in enumerate(all_examples) if func(x, idx)]\n    else:\n        # For batched filter we have to format the examples as a batch (i.e. 
in one single dictionary) to pass the batch to the function\n        expected = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = _examples_to_batch(examples)\n            indices = list(range(batch_offset, batch_offset + len(examples)))\n            mask = func(batch, indices)\n            expected.extend([x for x, to_keep in zip(examples, mask) if to_keep])\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, func, batched, batch_size, input_columns\",\n    [\n        (3, lambda id_: id_ % 2 == 0, False, None, [\"id\"]),  # keep even number\n        (25, lambda ids_: [i % 2 == 0 for i in ids_], True, 10, [\"id\"]),  # same with bs=10\n        (3, lambda ids_: [i % 2 == 0 for i in ids_], True, None, [\"id\"]),  # same with bs=None\n        (3, lambda ids_: [i % 2 == 0 for i in ids_], True, -1, [\"id\"]),  # same with bs<=0\n    ],\n)\ndef test_filtered_examples_iterable_input_columns(n, func, batched, batch_size, input_columns):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    ex_iterable = FilteredExamplesIterable(\n        base_ex_iterable, func, batched=batched, batch_size=batch_size, input_columns=input_columns\n    )\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    columns_to_input = input_columns if isinstance(input_columns, list) else [input_columns]\n    if batched is False:\n        expected = [x for x in all_examples if func(*[x[col] for col in columns_to_input])]\n    else:\n        # For batched filter we have to format the examples as a batch (i.e. 
in one single dictionary) to pass the batch to the function\n        expected = []\n        # If batch_size is None or <=0, we use the whole dataset as a single batch\n        if batch_size is None or batch_size <= 0:\n            batch_size = len(all_examples)\n        for batch_offset in range(0, len(all_examples), batch_size):\n            examples = all_examples[batch_offset : batch_offset + batch_size]\n            batch = _examples_to_batch(examples)\n            mask = func(*[batch[col] for col in columns_to_input])\n            expected.extend([x for x, to_keep in zip(examples, mask) if to_keep])\n    assert next(iter(ex_iterable))[1] == expected[0]\n    assert [x for _, x in ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\ndef test_map_async():\n    dset = Dataset.from_dict({\"x\": range(100)}).to_iterable_dataset()\n\n    async def f(example):\n        await asyncio.sleep(0.1)\n        return {\"y\": 1}\n\n    _start = time.time()\n    out = dset.map(f)\n    assert time.time() - _start < 2.0\n    assert next(iter(out))[\"y\"] == 1\n\n    async def f(batch):\n        await asyncio.sleep(0.1)\n        return {\"y\": [1] * len(batch[\"x\"])}\n\n    _start = time.time()\n    out = dset.map(f, batched=True)\n    assert time.time() - _start < 2.0\n    assert next(iter(out))[\"y\"] == 1\n\n\ndef test_filter_async():\n    dset = Dataset.from_dict({\"x\": range(100)}).to_iterable_dataset()\n\n    async def f(example):\n        await asyncio.sleep(0.1)\n        return example[\"x\"] == 42\n\n    _start = time.time()\n    out = dset.filter(f)\n    assert time.time() - _start < 2.0\n    assert len(list(out)) == 1\n\n    async def f(batch):\n        await asyncio.sleep(0.1)\n        return [x == 42 for x in batch[\"x\"]]\n\n    _start = time.time()\n    out = dset.filter(f, batched=True)\n    assert time.time() - _start < 2.0\n    assert len(list(out)) == 1\n\n\ndef test_skip_examples_iterable():\n    total, count = 10, 2\n    
base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": total})\n    skip_ex_iterable = SkipExamplesIterable(base_ex_iterable, n=count)\n    expected = list(generate_examples_fn(n=total))[count:]\n    assert list(skip_ex_iterable) == expected\n    assert skip_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is skip_ex_iterable, (\n        \"skip examples makes the shards order fixed\"\n    )\n    assert_load_state_dict_resumes_iteration(skip_ex_iterable)\n\n\ndef test_take_examples_iterable():\n    total, count = 10, 2\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": total})\n    take_ex_iterable = TakeExamplesIterable(base_ex_iterable, n=count)\n    expected = list(generate_examples_fn(n=total))[:count]\n    assert list(take_ex_iterable) == expected\n    assert take_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is take_ex_iterable, (\n        \"take examples makes the shards order fixed\"\n    )\n    assert_load_state_dict_resumes_iteration(take_ex_iterable)\n\n\ndef test_step_examples_iterable():\n    total, step, offset = 10, 2, 1\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": total})\n    step_ex_iterable = StepExamplesIterable(base_ex_iterable, step=step, offset=offset)\n    expected = list(generate_examples_fn(n=total))[offset::step]\n    assert list(step_ex_iterable) == expected\n    assert_load_state_dict_resumes_iteration(step_ex_iterable)\n\n\ndef test_skip_arrow_examples_iterable():\n    total, count = 10, 2\n    base_ex_iterable = ArrowExamplesIterable(generate_tables_fn, {\"n\": total})\n    skip_ex_iterable = SkipExamplesIterable(base_ex_iterable, n=count)\n    expected = [x for _, pa_table in generate_tables_fn(n=total) for x in pa_table.to_pylist()][count:]\n    assert [example for _, example in skip_ex_iterable] == expected\n    assert skip_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is skip_ex_iterable, (\n        \"skip examples makes the shards order 
fixed\"\n    )\n    assert_load_state_dict_resumes_iteration(skip_ex_iterable)\n\n\ndef test_take_arrow_examples_iterable():\n    total, count = 10, 2\n    base_ex_iterable = ArrowExamplesIterable(generate_tables_fn, {\"n\": total})\n    take_ex_iterable = TakeExamplesIterable(base_ex_iterable, n=count)\n    expected = [x for _, pa_table in generate_tables_fn(n=total) for x in pa_table.to_pylist()][:count]\n    assert [example for _, example in take_ex_iterable] == expected\n    assert take_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is take_ex_iterable, (\n        \"take examples makes the shards order fixed\"\n    )\n    assert_load_state_dict_resumes_iteration(take_ex_iterable)\n\n\ndef test_step_arrow_examples_iterable():\n    total, step, offset = 10, 2, 1\n    base_ex_iterable = ArrowExamplesIterable(generate_tables_fn, {\"n\": total})\n    step_ex_iterable = StepExamplesIterable(base_ex_iterable, step=step, offset=offset)\n    expected = [x for _, pa_table in generate_tables_fn(n=total) for x in pa_table.to_pylist()][offset::step]\n    assert [example for _, example in step_ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(step_ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"n, num_times\",\n    [\n        (3, None),\n        (3, 3),\n        (3, 0),\n    ],\n)\ndef test_repeat_examples_iterable(n, num_times):\n    base_ex_iterable = ExamplesIterable(generate_examples_fn, {\"n\": n})\n    ex_iterable = RepeatExamplesIterable(base_ex_iterable, num_times=num_times)\n    all_examples = [x for _, x in generate_examples_fn(n=n)]\n    if num_times is not None:\n        expected = all_examples * max(num_times, 0)\n        assert [x for _, x in ex_iterable] == expected\n    else:\n        max_iters = 135\n        iterator = iter(ex_iterable)\n        for i in range(max_iters):\n            assert next(iterator)[1] == all_examples[i % len(all_examples)], f\"iteration {i} failed,\"\n\n\ndef 
test_vertically_concatenated_examples_iterable():\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"label\": 10})\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {\"label\": 5})\n    concatenated_ex_iterable = VerticallyConcatenatedMultiSourcesExamplesIterable([ex_iterable1, ex_iterable2])\n    expected = [x for _, x in ex_iterable1] + [x for _, x in ex_iterable2]\n    assert [x for _, x in concatenated_ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(concatenated_ex_iterable)\n\n\ndef test_vertically_concatenated_examples_iterable_with_different_columns():\n    # having different columns is supported\n    # Though iterable datasets fill the missing data with nulls\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"label\": 10})\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {})\n    concatenated_ex_iterable = VerticallyConcatenatedMultiSourcesExamplesIterable([ex_iterable1, ex_iterable2])\n    expected = [x for _, x in ex_iterable1] + [x for _, x in ex_iterable2]\n    assert [x for _, x in concatenated_ex_iterable] == expected\n    assert_load_state_dict_resumes_iteration(concatenated_ex_iterable)\n\n\ndef test_vertically_concatenated_examples_iterable_shuffle_data_sources():\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"label\": 10})\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {\"label\": 5})\n    concatenated_ex_iterable = VerticallyConcatenatedMultiSourcesExamplesIterable([ex_iterable1, ex_iterable2])\n    rng = np.random.default_rng(42)\n    shuffled_ex_iterable = concatenated_ex_iterable.shuffle_data_sources(rng)\n    # make sure the list of examples iterables is shuffled, and each examples iterable is shuffled\n    expected = [x for _, x in ex_iterable2.shuffle_data_sources(rng)] + [\n        x for _, x in ex_iterable1.shuffle_data_sources(rng)\n    ]\n    assert [x for _, x in shuffled_ex_iterable] == expected\n    
assert_load_state_dict_resumes_iteration(shuffled_ex_iterable)\n\n\ndef test_horizontally_concatenated_examples_iterable():\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"label1\": 10})\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {\"label2\": 5})\n    concatenated_ex_iterable = HorizontallyConcatenatedMultiSourcesExamplesIterable([ex_iterable1, ex_iterable2])\n    with pytest.raises(ValueError):  # column \"id\" is duplicated -> raise an error\n        list(concatenated_ex_iterable)\n    ex_iterable2 = MappedExamplesIterable(ex_iterable2, lambda x: x, remove_columns=[\"id\"])\n    concatenated_ex_iterable = HorizontallyConcatenatedMultiSourcesExamplesIterable([ex_iterable1, ex_iterable2])\n    expected = [{**x, **y} for (_, x), (_, y) in zip(ex_iterable1, ex_iterable2)]\n    assert [x for _, x in concatenated_ex_iterable] == expected\n    assert concatenated_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is concatenated_ex_iterable, (\n        \"horizontally concatenated examples makes the shards order fixed\"\n    )\n    assert_load_state_dict_resumes_iteration(concatenated_ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"ex_iterable\",\n    [\n        ExamplesIterable(generate_examples_fn, {}),\n        SelectColumnsIterable(ExamplesIterable(generate_examples_fn, {}), [\"id\"]),\n        StepExamplesIterable(ExamplesIterable(generate_examples_fn, {}), 2, 0),\n        CyclingMultiSourcesExamplesIterable([ExamplesIterable(generate_examples_fn, {})]),\n        VerticallyConcatenatedMultiSourcesExamplesIterable([ExamplesIterable(generate_examples_fn, {})]),\n        HorizontallyConcatenatedMultiSourcesExamplesIterable([ExamplesIterable(generate_examples_fn, {})]),\n        RandomlyCyclingMultiSourcesExamplesIterable(\n            [ExamplesIterable(generate_examples_fn, {})], np.random.default_rng(42)\n        ),\n        MappedExamplesIterable(ExamplesIterable(generate_examples_fn, {}), lambda x: x),\n        
MappedExamplesIterable(ArrowExamplesIterable(generate_tables_fn, {}), lambda x: x),\n        FilteredExamplesIterable(ExamplesIterable(generate_examples_fn, {}), lambda x: True),\n        FilteredExamplesIterable(ArrowExamplesIterable(generate_tables_fn, {}), lambda x: True),\n        BufferShuffledExamplesIterable(ExamplesIterable(generate_examples_fn, {}), 10, np.random.default_rng(42)),\n        SkipExamplesIterable(ExamplesIterable(generate_examples_fn, {}), 10),\n        TakeExamplesIterable(ExamplesIterable(generate_examples_fn, {}), 10),\n        FormattedExamplesIterable(\n            ExamplesIterable(generate_examples_fn, {}), None, Features({\"id\": Value(\"int32\")}), token_per_repo_id={}\n        ),\n    ],\n)\ndef test_no_iter_arrow(ex_iterable: _BaseExamplesIterable):\n    assert ex_iterable.iter_arrow is None\n    if not isinstance(ex_iterable, BufferShuffledExamplesIterable):\n        assert_load_state_dict_resumes_iteration(ex_iterable)\n\n\n@pytest.mark.parametrize(\n    \"ex_iterable\",\n    [\n        ArrowExamplesIterable(generate_tables_fn, {}),\n        SelectColumnsIterable(ArrowExamplesIterable(generate_tables_fn, {}), [\"id\"]),\n        StepExamplesIterable(ArrowExamplesIterable(generate_tables_fn, {}), 2, 0),\n        # CyclingMultiSourcesExamplesIterable([ArrowExamplesIterable(generate_tables_fn, {})]),  # not implemented\n        VerticallyConcatenatedMultiSourcesExamplesIterable([ArrowExamplesIterable(generate_tables_fn, {})]),\n        HorizontallyConcatenatedMultiSourcesExamplesIterable([ArrowExamplesIterable(generate_tables_fn, {})]),\n        # RandomlyCyclingMultiSourcesExamplesIterable([ArrowExamplesIterable(generate_tables_fn, {})], np.random.default_rng(42)),  # not implemented\n        MappedExamplesIterable(\n            RebatchedArrowExamplesIterable(\n                ExamplesIterable(generate_examples_fn, {}), batch_size=1, force_convert_to_arrow=True\n            ),\n            lambda t: t,\n            
formatting=FormattingConfig(format_type=\"arrow\"),\n        ),\n        MappedExamplesIterable(\n            RebatchedArrowExamplesIterable(ArrowExamplesIterable(generate_tables_fn, {}), batch_size=1),\n            lambda t: t,\n            formatting=FormattingConfig(format_type=\"arrow\"),\n        ),\n        FilteredExamplesIterable(\n            RebatchedArrowExamplesIterable(\n                ExamplesIterable(generate_examples_fn, {}), batch_size=1, force_convert_to_arrow=True\n            ),\n            lambda t: True,\n            formatting=FormattingConfig(format_type=\"arrow\"),\n        ),\n        FilteredExamplesIterable(\n            RebatchedArrowExamplesIterable(\n                ArrowExamplesIterable(generate_tables_fn, {}), batch_size=1, force_convert_to_arrow=True\n            ),\n            lambda t: True,\n            formatting=FormattingConfig(format_type=\"arrow\"),\n        ),\n        # BufferShuffledExamplesIterable(ArrowExamplesIterable(generate_tables_fn, {}), 10, np.random.default_rng(42)),  # not implemented\n        SkipExamplesIterable(ArrowExamplesIterable(generate_tables_fn, {}), 10),\n        TakeExamplesIterable(ArrowExamplesIterable(generate_tables_fn, {}), 10),\n        FormattedExamplesIterable(\n            ArrowExamplesIterable(generate_tables_fn, {}), None, Features({\"id\": Value(\"int32\")}), token_per_repo_id={}\n        ),\n    ],\n)\ndef test_iter_arrow(ex_iterable: _BaseExamplesIterable):\n    assert ex_iterable.iter_arrow is not None\n    key, pa_table = next(ex_iterable.iter_arrow())\n    assert isinstance(pa_table, pa.Table)\n    assert_load_state_dict_resumes_arrow_iteration(ex_iterable)\n\n\n############################\n#\n#   IterableDataset tests\n#\n############################\n\n\ndef test_iterable_dataset():\n    dataset = IterableDataset(ExamplesIterable(generate_examples_fn, {}))\n    expected = [x for _, x in generate_examples_fn()]\n    assert next(iter(dataset)) == expected[0]\n    assert 
list(dataset) == expected\n\n\ndef test_iterable_dataset_push_to_hub_max_shard_size_and_num_shards_are_mutually_exclusive():\n    dataset = IterableDataset.from_generator(lambda: iter([{\"id\": 0}]))\n    with pytest.raises(ValueError, match=\"either max_shard_size or num_shards\"):\n        dataset.push_to_hub(\"user/dataset\", max_shard_size=\"1MB\", num_shards=2)\n\n\ndef test_iterable_dataset_push_to_hub_single_shard_disables_multiprocessing():\n    dataset = IterableDataset.from_generator(lambda: iter([{\"id\": 0}]))\n    mock_context = MagicMock()\n    mock_pool = MagicMock()\n    mock_pool_cls = MagicMock(return_value=mock_pool)\n    mock_context.Pool = mock_pool_cls\n    with (\n        patch(\"multiprocess.get_context\", return_value=mock_context),\n        patch.object(\n            IterableDataset,\n            \"_push_parquet_shards_to_hub_single\",\n            return_value=iter([(0, True, ([], [], Features(), 0, 1))]),\n        ),\n    ):\n        additions, new_parquet_paths, features, split_info, uploaded_size = dataset._push_parquet_shards_to_hub(\n            resolved_output_path=HfFileSystemResolvedRepositoryPath(\n                repo_type=\"dataset\", repo_id=\"user/dataset\", revision=\"main\", path_in_repo=\"\"\n            ),\n            data_dir=\"data\",\n            split=\"train\",\n            token=None,\n            create_pr=False,\n            max_shard_size=None,\n            num_shards=1,\n            embed_external_files=False,\n            num_proc=4,\n        )\n    mock_pool.assert_not_called()\n    assert additions == []\n    assert new_parquet_paths == []\n    assert features == Features()\n    assert split_info.name == \"train\"\n    assert split_info.num_bytes == 0\n    assert split_info.num_examples == 1\n    assert uploaded_size == 0\n\n\ndef test_iterable_dataset_push_to_hub_default_num_shards_uses_dataset_num_shards():\n    def gen(shard_names):\n        for shard_name in shard_names:\n            yield {\"shard_name\": 
shard_name}\n\n    dataset = IterableDataset.from_generator(gen, gen_kwargs={\"shard_names\": [\"train-0\", \"train-1\", \"train-2\"]})\n    captured_num_shards = {}\n\n    def mock_push_single(**kwargs):\n        captured_num_shards[\"value\"] = kwargs[\"num_shards\"]\n        return iter([(0, True, ([], [], Features(), 0, 0))])\n\n    with patch.object(IterableDataset, \"_push_parquet_shards_to_hub_single\", side_effect=mock_push_single):\n        dataset._push_parquet_shards_to_hub(\n            resolved_output_path=HfFileSystemResolvedRepositoryPath(\n                repo_type=\"dataset\", repo_id=\"user/dataset\", revision=\"main\", path_in_repo=\"\"\n            ),\n            data_dir=\"data\",\n            split=\"train\",\n            token=None,\n            create_pr=False,\n            max_shard_size=None,\n            num_shards=None,\n            embed_external_files=False,\n            num_proc=None,\n        )\n\n    assert captured_num_shards[\"value\"] == dataset.num_shards\n\n\ndef test_iterable_dataset_push_to_hub_max_shard_size_computes_num_shards_from_estimated_size():\n    dataset = Dataset.from_dict({\"id\": list(range(16)), \"text\": [\"value\"] * 16}).to_iterable_dataset()\n    estimated_nbytes = sum(\n        table.nbytes for table in dataset.with_format(\"arrow\").iter(batch_size=config.DEFAULT_MAX_BATCH_SIZE)\n    )\n    max_shard_size = max(1, estimated_nbytes // 2)\n    expected_num_shards = max(int(estimated_nbytes / max_shard_size) + 1, 1)\n    captured_num_shards = {}\n\n    def mock_push_single(**kwargs):\n        captured_num_shards[\"value\"] = kwargs[\"num_shards\"]\n        return iter([(0, True, ([], [], Features(), 0, 0))])\n\n    with patch.object(IterableDataset, \"_push_parquet_shards_to_hub_single\", side_effect=mock_push_single):\n        dataset._push_parquet_shards_to_hub(\n            resolved_output_path=HfFileSystemResolvedRepositoryPath(\n                repo_type=\"dataset\", repo_id=\"user/dataset\", 
revision=\"main\", path_in_repo=\"\"\n            ),\n            data_dir=\"data\",\n            split=\"train\",\n            token=None,\n            create_pr=False,\n            max_shard_size=max_shard_size,\n            num_shards=None,\n            embed_external_files=False,\n            num_proc=None,\n        )\n\n    assert captured_num_shards[\"value\"] == expected_num_shards\n\n\ndef test_iterable_dataset_push_to_hub_max_shard_size_respects_num_proc_floor():\n    dataset = IterableDataset.from_generator(\n        lambda shard_names: ({\"shard_name\": shard_name} for shard_name in shard_names),\n        gen_kwargs={\"shard_names\": [\"train-0\", \"train-1\", \"train-2\"]},\n    )\n    estimated_nbytes = sum(\n        table.nbytes for table in dataset.with_format(\"arrow\").iter(batch_size=config.DEFAULT_MAX_BATCH_SIZE)\n    )\n    requested_num_proc = dataset.num_shards\n    max_shard_size = max(estimated_nbytes * 2, 1)\n    expected_num_shards = max(int(estimated_nbytes / max_shard_size) + 1, requested_num_proc)\n\n    with (\n        patch(\n            \"datasets.iterable_dataset.iflatmap_unordered\",\n            return_value=iter([(0, True, ([], [], Features(), 0, 0))]),\n        ) as mock_iflatmap_unordered,\n    ):\n        dataset._push_parquet_shards_to_hub(\n            resolved_output_path=HfFileSystemResolvedRepositoryPath(\n                repo_id=\"user/dataset\", path_in_repo=\"\", revision=\"main\", repo_type=\"dataset\"\n            ),\n            data_dir=\"data\",\n            split=\"train\",\n            token=None,\n            create_pr=False,\n            max_shard_size=max_shard_size,\n            num_shards=None,\n            embed_external_files=False,\n            num_proc=requested_num_proc,\n        )\n\n    kwargs_iterable = mock_iflatmap_unordered.call_args.kwargs[\"kwargs_iterable\"]\n    assert len(kwargs_iterable) == requested_num_proc\n    assert {job_kwargs[\"num_shards\"] for job_kwargs in kwargs_iterable} == 
{expected_num_shards}\n\n\ndef test_iterable_dataset_from_generator():\n    data = [\n        {\"col_1\": \"0\", \"col_2\": 0, \"col_3\": 0.0},\n        {\"col_1\": \"1\", \"col_2\": 1, \"col_3\": 1.0},\n        {\"col_1\": \"2\", \"col_2\": 2, \"col_3\": 2.0},\n        {\"col_1\": \"3\", \"col_2\": 3, \"col_3\": 3.0},\n    ]\n\n    def gen():\n        yield from data\n\n    dataset = IterableDataset.from_generator(gen)\n    assert isinstance(dataset, IterableDataset)\n    assert list(dataset) == data\n\n\ndef test_iterable_dataset_from_generator_with_shards():\n    def gen(shard_names):\n        for shard_name in shard_names:\n            for i in range(10):\n                yield {\"shard_name\": shard_name, \"i\": i}\n\n    shard_names = [f\"data{shard_idx}.txt\" for shard_idx in range(4)]\n    dataset = IterableDataset.from_generator(gen, gen_kwargs={\"shard_names\": shard_names})\n    assert isinstance(dataset, IterableDataset)\n    assert dataset.num_shards == len(shard_names)\n\n\ndef test_iterable_dataset_to_pandas_preserves_declared_features():\n    features = Features({\"col\": Value(\"int32\")})\n    dataset = Dataset.from_dict({\"col\": [0, None]}, features=features).to_iterable_dataset()\n\n    df = dataset.to_pandas()\n    assert list(df.columns) == [\"col\"]\n    assert df[\"col\"].iloc[0] == 0\n    assert pd.isna(df[\"col\"].iloc[1])\n\n    batches = list(dataset.to_pandas(batch_size=1, batched=True))\n    assert len(batches) == 2\n    assert batches[0][\"col\"].iloc[0] == 0\n    assert pd.isna(batches[1][\"col\"].iloc[0])\n\n\ndef test_iterable_dataset_to_pandas_casts_when_schema_mismatch():\n    from datasets.table import cast_table_to_features as original_cast_table_to_features\n\n    features = Features({\"col\": Value(\"int32\")})\n    dataset = IterableDataset(\n        ExamplesIterable(lambda: iter([(\"0\", {\"col\": 0}), (\"1\", {\"col\": 1})]), {}),\n        info=DatasetInfo(features=features),\n    )\n\n    with patch(\n        
\"datasets.iterable_dataset.cast_table_to_features\",\n        wraps=original_cast_table_to_features,\n    ) as mock_cast:\n        df = dataset.to_pandas()\n        batches = list(dataset.to_pandas(batch_size=1, batched=True))\n\n    assert mock_cast.call_count >= 1\n    assert list(df.columns) == [\"col\"]\n    assert df[\"col\"].iloc[0] == 0\n    assert len(batches) == 2\n\n\n@require_numpy1_on_windows\ndef test_iterable_dataset_from_file(dataset: IterableDataset, arrow_file: str):\n    with assert_arrow_memory_doesnt_increase():\n        dataset_from_file = IterableDataset.from_file(arrow_file)\n    expected_features = dataset._resolve_features().features\n    assert dataset_from_file.features.type == expected_features.type\n    assert dataset_from_file.features == expected_features\n    assert isinstance(dataset_from_file, IterableDataset)\n    assert list(dataset_from_file) == list(dataset)\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\n@require_pyspark\ndef test_from_spark_streaming():\n    import pyspark\n\n    spark = pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    data = [\n        (\"0\", 0, 0.0),\n        (\"1\", 1, 1.0),\n        (\"2\", 2, 2.0),\n        (\"3\", 3, 3.0),\n    ]\n    df = spark.createDataFrame(data, \"col_1: string, col_2: int, col_3: float\")\n    dataset = IterableDataset.from_spark(df)\n    assert isinstance(dataset, IterableDataset)\n    results = []\n    for ex in dataset:\n        results.append(ex)\n    assert results == [\n        {\"col_1\": \"0\", \"col_2\": 0, \"col_3\": 0.0},\n        {\"col_1\": \"1\", \"col_2\": 1, \"col_3\": 1.0},\n        {\"col_1\": \"2\", \"col_2\": 2, \"col_3\": 2.0},\n        {\"col_1\": \"3\", \"col_2\": 3, \"col_3\": 3.0},\n    ]\n\n\n@require_not_windows\n@require_dill_gt_0_3_2\n@require_pyspark\ndef test_from_spark_streaming_features():\n    import PIL.Image\n    import pyspark\n\n    spark = 
pyspark.sql.SparkSession.builder.master(\"local[*]\").appName(\"pyspark\").getOrCreate()\n    data = [(0, np.arange(4 * 4 * 3).reshape(4, 4, 3).tolist())]\n    df = spark.createDataFrame(data, \"idx: int, image: array<array<array<int>>>\")\n    features = Features({\"idx\": Value(\"int64\"), \"image\": Image()})\n    dataset = IterableDataset.from_spark(\n        df,\n        features=features,\n    )\n    assert isinstance(dataset, IterableDataset)\n    results = []\n    for ex in dataset:\n        results.append(ex)\n    assert len(results) == 1\n    assert isinstance(results[0][\"image\"], PIL.Image.Image)\n\n\n@require_torch\ndef test_iterable_dataset_torch_integration():\n    ex_iterable = ExamplesIterable(generate_examples_fn, {})\n    dataset = IterableDataset(ex_iterable)\n    import torch.utils.data\n\n    assert isinstance(dataset, torch.utils.data.IterableDataset)\n    assert isinstance(dataset, IterableDataset)\n\n\n@require_torch\ndef test_iterable_dataset_torch_picklable():\n    import pickle\n\n    ex_iterable = ExamplesIterable(generate_examples_fn, {})\n    dataset = IterableDataset(ex_iterable, formatting=FormattingConfig(format_type=\"torch\"))\n    reloaded_dataset = pickle.loads(pickle.dumps(dataset))\n\n    import torch.utils.data\n\n    assert isinstance(reloaded_dataset, IterableDataset)\n    assert isinstance(reloaded_dataset, torch.utils.data.IterableDataset)\n    assert reloaded_dataset._formatting.format_type == \"torch\"\n    assert len(list(dataset)) == len(list(reloaded_dataset))\n\n\n@require_torch\ndef test_iterable_dataset_with_format_torch():\n    ex_iterable = ExamplesIterable(generate_examples_fn, {})\n    dataset = IterableDataset(ex_iterable)\n    from torch.utils.data import DataLoader\n\n    dataloader = DataLoader(dataset)\n    assert len(list(dataloader)) == len(list(ex_iterable))\n\n\n@require_torch\ndef test_iterable_dataset_torch_dataloader_parallel():\n    from torch.utils.data import DataLoader\n\n    ex_iterable = 
ExamplesIterable(generate_examples_fn, {})\n    dataset = IterableDataset(ex_iterable)\n    dataloader = DataLoader(dataset, num_workers=2, batch_size=None)\n    result = list(dataloader)\n    expected = [example for _, example in ex_iterable]\n    assert len(result) == len(expected)\n    assert {str(x) for x in result} == {str(x) for x in expected}\n\n\n@require_torch\n@pytest.mark.filterwarnings(\"ignore:This DataLoader will create:UserWarning\")\n@pytest.mark.parametrize(\"num_shards, num_workers\", [(2, 1), (2, 2), (3, 2), (2, 3)])\ndef test_sharded_iterable_dataset_torch_dataloader_parallel(num_shards, num_workers):\n    from torch.utils.data import DataLoader\n\n    ex_iterable = ExamplesIterable(generate_examples_fn, {\"filepaths\": [f\"{i}.txt\" for i in range(num_shards)]})\n    dataset = IterableDataset(ex_iterable)\n    dataloader = DataLoader(dataset, batch_size=None, num_workers=num_workers)\n    result = list(dataloader)\n    expected = [example for _, example in ex_iterable]\n    assert len(result) == len(expected)\n    assert {str(x) for x in result} == {str(x) for x in expected}\n\n\n@require_torch\n@pytest.mark.integration\n@pytest.mark.parametrize(\"num_workers\", [1, 2])\ndef test_iterable_dataset_from_hub_torch_dataloader_parallel(num_workers, tmp_path):\n    from torch.utils.data import DataLoader\n\n    dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER, cache_dir=str(tmp_path), streaming=True, split=\"train\")\n    dataloader = DataLoader(dataset, batch_size=None, num_workers=num_workers)\n    result = list(dataloader)\n    assert len(result) == 10\n\n\ndef gen_with_worker_info(shard):\n    from torch.utils.data import get_worker_info\n\n    worker_info = get_worker_info()\n    for i in range(100):\n        yield {\"value\": i, \"worker_id\": worker_info.id}\n\n\n@require_torch\ndef test_iterable_dataset_shuffle_with_multiple_workers_different_rng():\n    # GH 7567\n    from torch.utils.data import DataLoader\n\n    num_workers = 20\n    ds = 
IterableDataset.from_generator(gen_with_worker_info, gen_kwargs={\"shard\": list(range(num_workers))})\n    ds = ds.shuffle(buffer_size=100, seed=1234)\n    dataloader = DataLoader(ds, batch_size=None, num_workers=num_workers)\n\n    result = list(dataloader)\n    for single_chunk in [result[x : x + num_workers] for x in range(0, len(result), num_workers)]:\n        values = [item[\"value\"] for item in single_chunk]\n        # This will fail with the chance 1/100 ** 20!\n        assert len(set(values)) != 1, \"Make sure not all values are identical\"\n\n\ndef gen_with_value(shard, value):\n    for i in range(100):\n        yield {\"value\": value}\n\n\n@require_torch\ndef test_iterable_dataset_interleave_dataset_with_multiple_workers():\n    # GH 7567\n    from torch.utils.data import DataLoader\n\n    num_workers = 20\n    ds = [\n        IterableDataset.from_generator(gen_with_value, gen_kwargs={\"shard\": list(range(num_workers)), \"value\": i})\n        for i in range(10)\n    ]\n    ds = interleave_datasets(ds, probabilities=[1 / len(ds)] * len(ds), seed=1234)\n    dataloader = DataLoader(ds, batch_size=None, num_workers=num_workers)\n\n    result = list(dataloader)\n    for single_chunk in [result[x : x + num_workers] for x in range(0, len(result), num_workers)]:\n        values = [item[\"value\"] for item in single_chunk]\n        assert len(set(values)) != 1, \"Make sure not all values are identical\"\n\n\ndef gen_with_id(shard, value):\n    for i in range(50):\n        yield {\"value\": value, \"id\": i}\n\n\n@require_torch\ndef test_iterable_dataset_interleave_dataset_deterministic_across_iterations():\n    # GH 7567\n    from torch.utils.data import DataLoader\n\n    num_workers = 10\n    ds = [\n        IterableDataset.from_generator(gen_with_id, gen_kwargs={\"shard\": list(range(num_workers)), \"value\": i})\n        for i in range(5)\n    ]\n    ds = interleave_datasets(ds, probabilities=[1 / len(ds)] * len(ds), seed=1234)\n    dataloader = 
DataLoader(ds, batch_size=None, num_workers=num_workers)\n\n    # First iteration\n    first_result = list(dataloader)\n\n    # Second iteration\n    second_result = list(dataloader)\n\n    assert first_result == second_result, \"Results should be identical across iterations when using same seed\"\n\n\n@pytest.mark.parametrize(\"batch_size\", [4, 5])\n@pytest.mark.parametrize(\"drop_last_batch\", [False, True])\ndef test_iterable_dataset_iter_batch(batch_size, drop_last_batch):\n    n = 25\n    dataset = IterableDataset(ExamplesIterable(generate_examples_fn, {\"n\": n}))\n    all_examples = [ex for _, ex in generate_examples_fn(n=n)]\n    expected = []\n    for i in range(0, len(all_examples), batch_size):\n        if len(all_examples[i : i + batch_size]) < batch_size and drop_last_batch:\n            continue\n        expected.append(_examples_to_batch(all_examples[i : i + batch_size]))\n    assert next(iter(dataset.iter(batch_size, drop_last_batch=drop_last_batch))) == expected[0]\n    assert list(dataset.iter(batch_size, drop_last_batch=drop_last_batch)) == expected\n\n\ndef test_iterable_dataset_info():\n    info = DatasetInfo(description=\"desc\", citation=\"@article{}\", size_in_bytes=42)\n    ex_iterable = ExamplesIterable(generate_examples_fn, {})\n    dataset = IterableDataset(ex_iterable, info=info)\n    assert dataset.info == info\n    assert dataset.description == info.description\n    assert dataset.citation == info.citation\n    assert dataset.size_in_bytes == info.size_in_bytes\n\n\ndef test_iterable_dataset_set_epoch(dataset: IterableDataset):\n    assert dataset._epoch == 0\n    dataset.set_epoch(42)\n    assert dataset._epoch == 42\n\n\ndef test_iterable_dataset_set_epoch_resuming(dataset: IterableDataset):\n    dataset_length = len(list(dataset))\n    assert len(list(dataset)) == dataset_length > 0\n    dataset.load_state_dict(dataset.state_dict())\n    assert len(list(dataset)) == 0\n    dataset.set_epoch(1)\n    assert len(list(dataset)) == 
dataset_length > 0\n    dataset.load_state_dict(dataset.state_dict())\n    assert len(list(dataset)) == 0\n\n\ndef test_iterable_dataset_map(\n    dataset: IterableDataset,\n):\n    func = lambda x: {\"id+1\": x[\"id\"] + 1}  # noqa: E731\n    mapped_dataset = dataset.map(func)\n    assert isinstance(mapped_dataset._ex_iterable, MappedExamplesIterable)\n    assert mapped_dataset._ex_iterable.function is func\n    assert mapped_dataset._ex_iterable.batched is False\n    assert next(iter(mapped_dataset)) == {**next(iter(dataset)), **func(next(iter(generate_examples_fn()))[1])}\n\n\ndef test_iterable_dataset_map_batched(\n    dataset: IterableDataset,\n):\n    func = lambda x: {\"id+1\": [i + 1 for i in x[\"id\"]]}  # noqa: E731\n    batch_size = 3\n    dataset = dataset.map(func, batched=True, batch_size=batch_size)\n    assert isinstance(dataset._ex_iterable, MappedExamplesIterable)\n    assert dataset._ex_iterable.function is func\n    assert dataset._ex_iterable.batch_size == batch_size\n    assert next(iter(dataset)) == {\"id\": 0, \"id+1\": 1}\n\n\ndef test_iterable_dataset_map_complex_features(\n    dataset: IterableDataset,\n):\n    # https://github.com/huggingface/datasets/issues/3505\n    ex_iterable = ExamplesIterable(generate_examples_fn, {\"label\": \"positive\"})\n    features = Features(\n        {\n            \"id\": Value(\"int64\"),\n            \"label\": Value(\"string\"),\n        }\n    )\n    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))\n    dataset = dataset.cast_column(\"label\", ClassLabel(names=[\"negative\", \"positive\"]))\n    dataset = dataset.map(lambda x: {\"id+1\": x[\"id\"] + 1, **x})\n    assert isinstance(dataset._ex_iterable, MappedExamplesIterable)\n    features[\"label\"] = ClassLabel(names=[\"negative\", \"positive\"])\n    assert [{k: v for k, v in ex.items() if k != \"id+1\"} for ex in dataset] == [\n        features.encode_example(ex) for _, ex in ex_iterable\n    ]\n\n\ndef 
test_iterable_dataset_map_with_features(dataset: IterableDataset) -> None:\n    # https://github.com/huggingface/datasets/issues/3888\n    ex_iterable = ExamplesIterable(generate_examples_fn, {\"label\": \"positive\"})\n    features_before_map = Features(\n        {\n            \"id\": Value(\"int64\"),\n            \"label\": Value(\"string\"),\n        }\n    )\n    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features_before_map))\n    assert dataset.info.features is not None\n    assert dataset.info.features == features_before_map\n    features_after_map = Features(\n        {\n            \"id\": Value(\"int64\"),\n            \"label\": Value(\"string\"),\n            \"target\": Value(\"string\"),\n        }\n    )\n    dataset = dataset.map(lambda x: {\"target\": x[\"label\"]}, features=features_after_map)\n    assert dataset.info.features is not None\n    assert dataset.info.features == features_after_map\n\n\ndef test_iterable_dataset_map_with_fn_kwargs(dataset: IterableDataset) -> None:\n    fn_kwargs = {\"y\": 1}\n    mapped_dataset = dataset.map(lambda x, y: {\"id+y\": x[\"id\"] + y}, fn_kwargs=fn_kwargs)\n    assert mapped_dataset._ex_iterable.batched is False\n    assert next(iter(mapped_dataset)) == {\"id\": 0, \"id+y\": 1}\n    batch_size = 3\n    mapped_dataset = dataset.map(\n        lambda x, y: {\"id+y\": [i + y for i in x[\"id\"]]}, batched=True, batch_size=batch_size, fn_kwargs=fn_kwargs\n    )\n    assert isinstance(mapped_dataset._ex_iterable, MappedExamplesIterable)\n    assert mapped_dataset._ex_iterable.batch_size == batch_size\n    assert next(iter(mapped_dataset)) == {\"id\": 0, \"id+y\": 1}\n\n\ndef test_iterable_dataset_filter(dataset: IterableDataset) -> None:\n    fn_kwargs = {\"y\": 1}\n    filtered_dataset = dataset.filter(lambda x, y: x[\"id\"] == y, fn_kwargs=fn_kwargs)\n    assert filtered_dataset._ex_iterable.batched is False\n    assert next(iter(filtered_dataset)) == {\"id\": 1}\n\n\ndef 
test_iterable_dataset_filter_chaining_does_not_raise() -> None:\n    \"\"\"Chaining two .filter() calls must not raise TypeError.\n\n    After the first .filter() the internal ex_iterable becomes typed\n    (is_typed=True) because FilteredExamplesIterable adds a mask column.\n    The second .filter() then wraps it in FormattedExamplesIterable.\n    Previously, features=None was passed when is_typed=True, causing\n    FilteredExamplesIterable.__init__ to crash with:\n      TypeError: 'NoneType' object is not a mapping\n    (issue #8037)\n    \"\"\"\n    from datasets import IterableDataset\n    from datasets.features import Features, Value\n\n    features = Features({\"id\": Value(\"int32\"), \"text\": Value(\"string\")})\n\n    def gen():\n        for i in range(5):\n            yield {\"id\": i, \"text\": f\"item-{i}\"}\n\n    ds = IterableDataset.from_generator(gen, features=features)\n    ds = ds.filter(lambda x: x[\"id\"] >= 1)\n    # Second filter must not raise TypeError\n    ds = ds.filter(lambda x: x[\"id\"] <= 3)\n    result = list(ds)\n    assert [row[\"id\"] for row in result] == [1, 2, 3]\n\n\n@pytest.mark.parametrize(\"seed\", [42, 1337, 101010, 123456])\n@pytest.mark.parametrize(\"epoch\", [None, 0, 1])\ndef test_iterable_dataset_shuffle(dataset: IterableDataset, seed, epoch):\n    buffer_size = 3\n    dataset = deepcopy(dataset)\n    dataset._ex_iterable.kwargs[\"filepaths\"] = [\"0.txt\", \"1.txt\"]\n    dataset = dataset.shuffle(seed, buffer_size=buffer_size)\n    # Effective seed is mix of seed and epoch\n    if epoch is None or epoch == 0:\n        effective_seed = seed\n    else:\n        dataset.set_epoch(epoch)\n        effective_seed = np.random.default_rng(seed).integers(0, 1 << 63) - epoch\n    # Shuffling adds a shuffle buffer\n    expected_first_example_index = next(\n        iter(BufferShuffledExamplesIterable._iter_random_indices(np.random.default_rng(effective_seed), buffer_size))\n    )\n    assert isinstance(dataset._ex_iterable, 
BufferShuffledExamplesIterable)\n    # It also shuffles the underlying examples iterable\n    expected_ex_iterable = ExamplesIterable(\n        generate_examples_fn, {\"filepaths\": [\"0.txt\", \"1.txt\"]}\n    ).shuffle_data_sources(np.random.default_rng(seed))\n    if epoch:\n        expected_ex_iterable = expected_ex_iterable.shuffle_data_sources(np.random.default_rng(epoch))\n    assert isinstance(dataset._ex_iterable.ex_iterable, ExamplesIterable)\n    assert next(iter(dataset)) == list(islice(expected_ex_iterable, expected_first_example_index + 1))[-1][1]\n\n\n@pytest.mark.parametrize(\n    \"features\",\n    [\n        None,\n        Features(\n            {\n                \"id\": Value(\"int64\"),\n                \"label\": Value(\"int64\"),\n            }\n        ),\n        Features(\n            {\n                \"id\": Value(\"int64\"),\n                \"label\": ClassLabel(names=[\"negative\", \"positive\"]),\n            }\n        ),\n    ],\n)\ndef test_iterable_dataset_features(features):\n    ex_iterable = ExamplesIterable(generate_examples_fn, {\"label\": 0})\n    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))\n    if features:\n        expected = [features.encode_example(x) for _, x in ex_iterable]\n    else:\n        expected = [x for _, x in ex_iterable]\n    assert list(dataset) == expected\n\n\ndef test_iterable_dataset_features_cast_to_python():\n    ex_iterable = ExamplesIterable(\n        generate_examples_fn, {\"timestamp\": pd.Timestamp(2020, 1, 1), \"array\": np.ones(5), \"n\": 1}\n    )\n    features = Features(\n        {\n            \"id\": Value(\"int64\"),\n            \"timestamp\": Value(\"timestamp[us]\"),\n            \"array\": List(Value(\"int64\")),\n        }\n    )\n    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))\n    assert list(dataset) == [{\"timestamp\": pd.Timestamp(2020, 1, 1).to_pydatetime(), \"array\": [1] * 5, \"id\": 
0}]\n\n\n@require_torch\n@require_tf\n@require_jax\n@pytest.mark.parametrize(\n    \"format_type\", [None, \"torch\", \"python\", \"tf\", \"tensorflow\", \"np\", \"numpy\", \"jax\", \"arrow\", \"pd\", \"pandas\"]\n)\ndef test_iterable_dataset_with_format(dataset: IterableDataset, format_type):\n    formatted_dataset = dataset.with_format(format_type)\n    assert formatted_dataset._formatting.format_type == get_format_type_from_alias(format_type)\n\n\n@require_torch\ndef test_iterable_dataset_is_torch_iterable_dataset(dataset: IterableDataset):\n    from torch.utils.data import DataLoader, _DatasetKind\n\n    dataloader = DataLoader(dataset)\n    assert dataloader._dataset_kind == _DatasetKind.Iterable\n    out = list(dataloader)\n    assert len(out) == DEFAULT_N_EXAMPLES\n\n\n@require_torch\ndef test_iterable_dataset_persists_epoch_in_torch_workers(dataset: IterableDataset):\n    from torch.utils.data import DataLoader\n\n    dataset = dataset.shuffle(seed=42)\n    dataloader = DataLoader(dataset, num_workers=1, persistent_workers=True)\n    epoch0 = list(dataloader)\n    assert list(dataloader) == epoch0\n    dataset.set_epoch(1)\n    assert list(dataloader) != epoch0\n\n    # Make sure pickle works even with torch objects in shared memory\n    dataset_copy: IterableDataset = pickle.loads(pickle.dumps(dataset))\n    dataloader = DataLoader(dataset_copy, num_workers=1, persistent_workers=True)\n    epoch1 = list(dataloader)\n    assert list(dataloader) == epoch1\n    dataset.set_epoch(2)  # this should not affect the copy\n    assert list(dataloader) == epoch1\n    dataset_copy.set_epoch(2)\n    assert list(dataloader) != epoch1\n\n\n@pytest.mark.parametrize(\"n\", [0, 2, int(1e10)])\ndef test_iterable_dataset_skip(dataset: IterableDataset, n):\n    skip_dataset = dataset.skip(n)\n    assert isinstance(skip_dataset._ex_iterable, SkipExamplesIterable)\n    assert skip_dataset._ex_iterable.n == n\n    assert list(skip_dataset) == 
list(dataset)[n:]\n\n\n@pytest.mark.parametrize(\"n\", [0, 2, int(1e10)])\ndef test_iterable_dataset_take(dataset: IterableDataset, n):\n    take_dataset = dataset.take(n)\n    assert isinstance(take_dataset._ex_iterable, TakeExamplesIterable)\n    assert take_dataset._ex_iterable.n == n\n    assert list(take_dataset) == list(dataset)[:n]\n\n\n@pytest.mark.parametrize(\"n\", [0, 2])\ndef test_iterable_dataset_repeat(dataset: IterableDataset, n):\n    repeat_dataset = dataset.repeat(n)\n    assert isinstance(repeat_dataset._ex_iterable, RepeatExamplesIterable)\n    assert repeat_dataset._ex_iterable.num_times == n\n    assert list(repeat_dataset) == list(dataset) * n\n\n\ndef test_iterable_dataset_shard():\n    num_examples = 20\n    num_shards = 5\n    dataset = Dataset.from_dict({\"a\": range(num_examples)}).to_iterable_dataset(num_shards=num_shards)\n    assert sum(dataset.shard(num_shards, i).num_shards for i in range(num_shards)) == dataset.num_shards\n    assert list(concatenate_datasets([dataset.shard(num_shards, i) for i in range(num_shards)])) == list(dataset)\n    num_shards = 2\n    assert sum(dataset.shard(num_shards, i).num_shards for i in range(num_shards)) == dataset.num_shards\n    assert list(concatenate_datasets([dataset.shard(num_shards, i) for i in range(num_shards)])) == list(dataset)\n    assert (\n        sum(dataset.shard(num_shards, i, contiguous=False).num_shards for i in range(num_shards)) == dataset.num_shards\n    )\n    assert list(\n        concatenate_datasets([dataset.shard(num_shards, i, contiguous=False) for i in range(num_shards)])\n    ) != list(dataset)\n    assert sorted(\n        concatenate_datasets([dataset.shard(num_shards, i, contiguous=False) for i in range(num_shards)]),\n        key=lambda x: x[\"a\"],\n    ) == list(dataset)\n\n\n@pytest.mark.parametrize(\"method\", [\"skip\", \"take\"])\n@pytest.mark.parametrize(\"after_shuffle\", [False, True])\n@pytest.mark.parametrize(\"count\", [2, 5, 11])\ndef 
test_iterable_dataset_skip_or_take_after_shuffle(method, after_shuffle, count):\n    seed = 42\n    n, num_shards = 3, 10\n    ex_iterable = ExamplesIterable(\n        generate_examples_fn, {\"n\": n, \"filepaths\": [f\"{i}.txt\" for i in range(num_shards)]}\n    )\n    dataset = IterableDataset(ex_iterable)\n    shuffled_dataset = dataset\n    if after_shuffle:\n        shuffled_dataset = shuffled_dataset.shuffle(seed, buffer_size=DEFAULT_N_EXAMPLES)\n        shuffled_dataset = shuffled_dataset.skip(count) if method == \"skip\" else shuffled_dataset.take(count)\n        # skip/take on a shuffled dataset should not keep the same examples and should shuffle the shards\n        key = lambda x: f\"{x['filepath']}_{x['id']}\"  # noqa: E731\n        assert (len(list(dataset)) - count if method == \"skip\" else count) == len(list(shuffled_dataset))\n        assert sorted(list(dataset)[count:] if method == \"skip\" else list(dataset)[:count], key=key) != sorted(\n            shuffled_dataset, key=key\n        )\n    else:\n        shuffled_dataset = shuffled_dataset.skip(count) if method == \"skip\" else shuffled_dataset.take(count)\n        shuffled_dataset = shuffled_dataset.shuffle(seed, buffer_size=DEFAULT_N_EXAMPLES)\n        # shuffling a skip/take dataset should keep the same examples and not shuffle the shards\n        key = lambda x: f\"{x['filepath']}_{x['id']}\"  # noqa: E731\n        assert (len(list(dataset)) - count if method == \"skip\" else count) == len(list(shuffled_dataset))\n        assert sorted(list(dataset)[count:] if method == \"skip\" else list(dataset)[:count], key=key) == sorted(\n            shuffled_dataset, key=key\n        )\n\n\n@pytest.mark.parametrize(\"method\", [\"skip\", \"take\"])\n@pytest.mark.parametrize(\"after_split_by_node\", [False, True])\n@pytest.mark.parametrize(\"count\", [2, 5, 11])\ndef test_iterable_dataset_skip_or_take_after_split_by_node(method, after_split_by_node, count):\n    n, num_shards = 3, 10\n    rank, world_size = 1, 
2\n    ex_iterable = ExamplesIterable(\n        generate_examples_fn, {\"n\": n, \"filepaths\": [f\"{i}.txt\" for i in range(num_shards)]}\n    )\n    dataset = IterableDataset(ex_iterable)\n    distributed_dataset = dataset\n    true_distributed_dataset = split_dataset_by_node(dataset, rank=rank, world_size=world_size)\n    if after_split_by_node:\n        distributed_dataset = split_dataset_by_node(distributed_dataset, rank=rank, world_size=world_size)\n        distributed_dataset = distributed_dataset.skip(count) if method == \"skip\" else distributed_dataset.take(count)\n        assert (\n            list(true_distributed_dataset)[count:]\n            if method == \"skip\"\n            else list(true_distributed_dataset)[:count] == list(distributed_dataset)\n        )\n    else:\n        distributed_dataset = distributed_dataset.skip(count) if method == \"skip\" else distributed_dataset.take(count)\n        distributed_dataset = split_dataset_by_node(distributed_dataset, rank=rank, world_size=world_size)\n        assert len(\n            list(true_distributed_dataset)[count // world_size :]\n            if method == \"skip\"\n            else list(true_distributed_dataset)[: count // world_size]\n        ) == len(list(distributed_dataset))\n\n\ndef test_iterable_dataset_add_column(dataset_with_several_columns: IterableDataset):\n    new_column = list(range(3 * DEFAULT_N_EXAMPLES))\n    new_dataset = dataset_with_several_columns.add_column(\"new_column\", new_column)\n    assert list(new_dataset) == [\n        {**example, \"new_column\": idx} for idx, example in enumerate(dataset_with_several_columns)\n    ]\n    new_dataset = new_dataset._resolve_features()\n    assert \"new_column\" in new_dataset.column_names\n\n\ndef test_iterable_dataset_rename_column(dataset_with_several_columns: IterableDataset):\n    new_dataset = dataset_with_several_columns.rename_column(\"id\", \"new_id\")\n    assert list(new_dataset) == [\n        {(\"new_id\" if k == \"id\" else 
k): v for k, v in example.items()} for example in dataset_with_several_columns\n    ]\n    assert new_dataset.features is None\n    assert new_dataset.column_names is None\n    # rename the column if ds.features was not None\n    new_dataset = dataset_with_several_columns._resolve_features().rename_column(\"id\", \"new_id\")\n    assert new_dataset.features is not None\n    assert new_dataset.column_names is not None\n    assert \"id\" not in new_dataset.column_names\n    assert \"new_id\" in new_dataset.column_names\n\n\ndef test_iterable_dataset_rename_columns(dataset_with_several_columns: IterableDataset):\n    column_mapping = {\"id\": \"new_id\", \"filepath\": \"filename\"}\n    new_dataset = dataset_with_several_columns.rename_columns(column_mapping)\n    assert list(new_dataset) == [\n        {column_mapping.get(k, k): v for k, v in example.items()} for example in dataset_with_several_columns\n    ]\n    assert new_dataset.features is None\n    assert new_dataset.column_names is None\n    # rename the columns if ds.features was not None\n    new_dataset = dataset_with_several_columns._resolve_features().rename_columns(column_mapping)\n    assert new_dataset.features is not None\n    assert new_dataset.column_names is not None\n    assert all(c not in new_dataset.column_names for c in [\"id\", \"filepath\"])\n    assert all(c in new_dataset.column_names for c in [\"new_id\", \"filename\"])\n\n\ndef test_iterable_dataset_remove_columns(dataset_with_several_columns: IterableDataset):\n    new_dataset = dataset_with_several_columns.remove_columns(\"id\")\n    assert list(new_dataset) == [\n        {k: v for k, v in example.items() if k != \"id\"} for example in dataset_with_several_columns\n    ]\n    assert new_dataset.features is None\n    new_dataset = dataset_with_several_columns.remove_columns([\"id\", \"filepath\"])\n    assert list(new_dataset) == [\n        {k: v for k, v in example.items() if k != \"id\" and k != \"filepath\"} for example in 
dataset_with_several_columns\n    ]\n    assert new_dataset.features is None\n    assert new_dataset.column_names is None\n    # remove the columns if ds.features was not None\n    new_dataset = dataset_with_several_columns._resolve_features().remove_columns([\"id\", \"filepath\"])\n    assert new_dataset.features is not None\n    assert new_dataset.column_names is not None\n    assert all(c not in new_dataset.features for c in [\"id\", \"filepath\"])\n    assert all(c not in new_dataset.column_names for c in [\"id\", \"filepath\"])\n\n\ndef test_iterable_dataset_select_columns(dataset_with_several_columns: IterableDataset):\n    new_dataset = dataset_with_several_columns.select_columns(\"id\")\n    assert list(new_dataset) == [\n        {k: v for k, v in example.items() if k == \"id\"} for example in dataset_with_several_columns\n    ]\n    assert new_dataset.features is None\n    new_dataset = dataset_with_several_columns.select_columns([\"id\", \"filepath\"])\n    assert list(new_dataset) == [\n        {k: v for k, v in example.items() if k in (\"id\", \"filepath\")} for example in dataset_with_several_columns\n    ]\n    assert new_dataset.features is None\n    # select the columns if ds.features was not None\n    new_dataset = dataset_with_several_columns._resolve_features().select_columns([\"id\", \"filepath\"])\n    assert new_dataset.features is not None\n    assert new_dataset.column_names is not None\n    assert all(c in new_dataset.features for c in [\"id\", \"filepath\"])\n    assert all(c in new_dataset.column_names for c in [\"id\", \"filepath\"])\n\n\ndef test_iterable_dataset_cast_column():\n    ex_iterable = ExamplesIterable(generate_examples_fn, {\"label\": 10})\n    features = Features({\"id\": Value(\"int64\"), \"label\": Value(\"int64\")})\n    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))\n    casted_dataset = dataset.cast_column(\"label\", Value(\"bool\"))\n    casted_features = features.copy()\n    
casted_features[\"label\"] = Value(\"bool\")\n    assert list(casted_dataset) == [casted_features.encode_example(ex) for _, ex in ex_iterable]\n\n\ndef test_iterable_dataset_cast():\n    ex_iterable = ExamplesIterable(generate_examples_fn, {\"label\": 10})\n    features = Features({\"id\": Value(\"int64\"), \"label\": Value(\"int64\")})\n    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))\n    new_features = Features({\"id\": Value(\"int64\"), \"label\": Value(\"bool\")})\n    casted_dataset = dataset.cast(new_features)\n    assert list(casted_dataset) == [new_features.encode_example(ex) for _, ex in ex_iterable]\n\n\ndef test_iterable_dataset_resolve_features():\n    ex_iterable = ExamplesIterable(generate_examples_fn, {})\n    dataset = IterableDataset(ex_iterable)\n    assert dataset.features is None\n    assert dataset.column_names is None\n    dataset = dataset._resolve_features()\n    assert dataset.features == Features(\n        {\n            \"id\": Value(\"int64\"),\n        }\n    )\n    assert dataset.column_names == [\"id\"]\n\n\ndef test_iterable_dataset_resolve_features_keep_order():\n    def gen():\n        yield from zip(range(3), [{\"a\": 1}, {\"c\": 1}, {\"b\": 1}])\n\n    ex_iterable = ExamplesIterable(gen, {})\n    dataset = IterableDataset(ex_iterable)._resolve_features()\n    # columns appear in order of appearance in the dataset\n    assert list(dataset.features) == [\"a\", \"c\", \"b\"]\n    assert dataset.column_names == [\"a\", \"c\", \"b\"]\n\n\ndef test_iterable_dataset_with_features_fill_with_none():\n    def gen():\n        yield from zip(range(2), [{\"a\": 1}, {\"b\": 1}])\n\n    ex_iterable = ExamplesIterable(gen, {})\n    info = DatasetInfo(features=Features({\"a\": Value(\"int32\"), \"b\": Value(\"int32\")}))\n    dataset = IterableDataset(ex_iterable, info=info)\n    assert list(dataset) == [{\"a\": 1, \"b\": None}, {\"b\": 1, \"a\": None}]\n\n\ndef test_concatenate_datasets():\n    ex_iterable1 = 
ExamplesIterable(generate_examples_fn, {\"label\": 10})\n    dataset1 = IterableDataset(ex_iterable1)\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {\"label\": 5})\n    dataset2 = IterableDataset(ex_iterable2)\n    concatenated_dataset = concatenate_datasets([dataset1, dataset2])\n    assert list(concatenated_dataset) == list(dataset1) + list(dataset2)\n\n\ndef test_concatenate_datasets_resolves_features():\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"label\": 10})\n    dataset1 = IterableDataset(ex_iterable1)\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {\"label\": 5})\n    dataset2 = IterableDataset(ex_iterable2)\n    concatenated_dataset = concatenate_datasets([dataset1, dataset2])\n    assert concatenated_dataset.features is not None\n    assert sorted(concatenated_dataset.features) == [\"id\", \"label\"]\n\n\ndef test_concatenate_datasets_with_different_columns():\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"label\": 10})\n    dataset1 = IterableDataset(ex_iterable1)\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {})\n    dataset2 = IterableDataset(ex_iterable2)\n    # missing column \"label\" -> it should be replaced with nulls\n    extended_dataset2_list = [{\"label\": None, **x} for x in dataset2]\n\n    concatenated_dataset = concatenate_datasets([dataset1, dataset2])\n    assert list(concatenated_dataset) == list(dataset1) + extended_dataset2_list\n    # change order\n    concatenated_dataset = concatenate_datasets([dataset2, dataset1])\n    assert list(concatenated_dataset) == extended_dataset2_list + list(dataset1)\n\n\ndef test_concatenate_datasets_axis_1():\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"label1\": 10})\n    dataset1 = IterableDataset(ex_iterable1)\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {\"label2\": 5})\n    dataset2 = IterableDataset(ex_iterable2)\n    with pytest.raises(ValueError):  # column \"id\" is duplicated -> raise an 
error\n        concatenate_datasets([dataset1, dataset2], axis=1)\n    concatenated_dataset = concatenate_datasets([dataset1, dataset2.remove_columns(\"id\")], axis=1)\n    assert list(concatenated_dataset) == [{**x, **y} for x, y in zip(dataset1, dataset2)]\n\n\ndef test_concatenate_datasets_axis_1_resolves_features():\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"label1\": 10})\n    dataset1 = IterableDataset(ex_iterable1)\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {\"label2\": 5})\n    dataset2 = IterableDataset(ex_iterable2).remove_columns(\"id\")\n    concatenated_dataset = concatenate_datasets([dataset1, dataset2], axis=1)\n    assert concatenated_dataset.features is not None\n    assert sorted(concatenated_dataset.features) == [\"id\", \"label1\", \"label2\"]\n\n\ndef test_concatenate_datasets_axis_1_with_different_lengths():\n    n1 = 10\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"label1\": 10, \"n\": n1})\n    dataset1 = IterableDataset(ex_iterable1)\n    n2 = 5\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {\"label2\": 5, \"n\": n2})\n    dataset2 = IterableDataset(ex_iterable2).remove_columns(\"id\")\n    # missing rows -> they should be replaced with nulls\n    extended_dataset2_list = list(dataset2) + [{\"label2\": None}] * (n1 - n2)\n\n    concatenated_dataset = concatenate_datasets([dataset1, dataset2], axis=1)\n    assert list(concatenated_dataset) == [{**x, **y} for x, y in zip(dataset1, extended_dataset2_list)]\n    # change order\n    concatenated_dataset = concatenate_datasets([dataset2, dataset1], axis=1)\n    assert list(concatenated_dataset) == [{**x, **y} for x, y in zip(extended_dataset2_list, dataset1)]\n\n\n@require_torch\n@require_tf\n@require_jax\n@pytest.mark.parametrize(\n    \"format_type\", [None, \"torch\", \"python\", \"tf\", \"tensorflow\", \"np\", \"numpy\", \"jax\", \"arrow\", \"pd\", \"pandas\"]\n)\ndef test_concatenate_datasets_with_format(dataset: 
IterableDataset, format_type):\n    formatted_dataset = dataset.with_format(format_type)\n    concatenated_dataset = concatenate_datasets([formatted_dataset])\n    assert concatenated_dataset._formatting.format_type == get_format_type_from_alias(format_type)\n\n\n@pytest.mark.parametrize(\n    \"probas, seed, expected_length, stopping_strategy\",\n    [\n        (None, None, 3 * (DEFAULT_N_EXAMPLES - 1) + 1, \"first_exhausted\"),\n        ([1, 0, 0], None, DEFAULT_N_EXAMPLES, \"first_exhausted\"),\n        ([0, 1, 0], None, DEFAULT_N_EXAMPLES, \"first_exhausted\"),\n        ([0.2, 0.5, 0.3], 42, None, \"first_exhausted\"),\n        ([0.1, 0.1, 0.8], 1337, None, \"first_exhausted\"),\n        ([0.5, 0.2, 0.3], 101010, None, \"first_exhausted\"),\n        (None, None, 3 * DEFAULT_N_EXAMPLES, \"all_exhausted\"),\n        ([0.2, 0.5, 0.3], 42, None, \"all_exhausted\"),\n        ([0.1, 0.1, 0.8], 1337, None, \"all_exhausted\"),\n        ([0.5, 0.2, 0.3], 101010, None, \"all_exhausted\"),\n    ],\n)\ndef test_interleave_datasets(dataset: IterableDataset, probas, seed, expected_length, stopping_strategy):\n    d1 = dataset\n    d2 = dataset.map(lambda x: {\"id+1\": x[\"id\"] + 1, **x})\n    d3 = dataset.with_format(\"python\")\n    datasets = [d1, d2, d3]\n\n    merged_dataset = interleave_datasets(\n        datasets, probabilities=probas, seed=seed, stopping_strategy=stopping_strategy\n    )\n\n    def fill_default(example):\n        return {\"id\": None, \"id+1\": None, **example}\n\n    # Check the examples iterable\n    assert isinstance(\n        merged_dataset._ex_iterable, (CyclingMultiSourcesExamplesIterable, RandomlyCyclingMultiSourcesExamplesIterable)\n    )\n    # Check that it is deterministic\n    if seed is not None:\n        merged_dataset2 = interleave_datasets(\n            [d1, d2, d3], probabilities=probas, seed=seed, stopping_strategy=stopping_strategy\n        )\n        assert list(merged_dataset) == list(merged_dataset2)\n    # Check features\n    
assert merged_dataset.features == Features({\"id\": Value(\"int64\"), \"id+1\": Value(\"int64\")})\n    # Check first example\n    if seed is not None:\n        rng = np.random.default_rng(seed)\n        i = next(iter(cycle(rng.choice(len(datasets), size=1000, p=probas))))\n        assert next(iter(merged_dataset)) == fill_default(next(iter(datasets[i])))\n    else:\n        assert any(next(iter(merged_dataset)) == fill_default(next(iter(dataset))) for dataset in datasets)\n    # Compute length it case it's random\n    if expected_length is None:\n        expected_length = 0\n        counts = np.array([len(list(d)) for d in datasets])\n        bool_strategy_func = np.all if stopping_strategy == \"all_exhausted\" else np.any\n        rng = np.random.default_rng(seed)\n        for i in cycle(rng.choice(len(datasets), size=1000, p=probas)):\n            counts[i] -= 1\n            expected_length += 1\n            if bool_strategy_func(counts <= 0):\n                break\n    # Check length\n    assert len(list(merged_dataset)) == expected_length\n\n\ndef test_interleave_datasets_with_features(\n    dataset: IterableDataset,\n):\n    features = Features(\n        {\n            \"id\": Value(\"int64\"),\n            \"label\": ClassLabel(names=[\"negative\", \"positive\"]),\n        }\n    )\n    ex_iterable = ExamplesIterable(generate_examples_fn, {\"label\": 0})\n    dataset_with_features = IterableDataset(ex_iterable, info=DatasetInfo(features=features))\n\n    merged_dataset = interleave_datasets([dataset, dataset_with_features])\n    assert merged_dataset.features == features\n\n\ndef test_interleave_datasets_with_oversampling():\n    # Test hardcoded results\n    d1 = IterableDataset(ExamplesIterable((lambda: (yield from [(i, {\"a\": i}) for i in [0, 1, 2]])), {}))\n    d2 = IterableDataset(ExamplesIterable((lambda: (yield from [(i, {\"a\": i}) for i in [10, 11, 12, 13]])), {}))\n    d3 = IterableDataset(ExamplesIterable((lambda: (yield from [(i, {\"a\": i}) 
for i in [20, 21, 22, 23, 24]])), {}))\n\n    expected_values = [0, 10, 20, 1, 11, 21, 2, 12, 22, 0, 13, 23, 1, 10, 24]\n\n    # Check oversampling strategy without probabilities\n    assert [x[\"a\"] for x in interleave_datasets([d1, d2, d3], stopping_strategy=\"all_exhausted\")] == expected_values\n\n    # Check oversampling strategy with probabilities\n    expected_values = [20, 0, 21, 10, 1, 22, 23, 24, 2, 0, 1, 20, 11, 21, 2, 0, 12, 1, 22, 13]\n\n    values = [\n        x[\"a\"]\n        for x in interleave_datasets(\n            [d1, d2, d3], probabilities=[0.5, 0.2, 0.3], seed=42, stopping_strategy=\"all_exhausted\"\n        )\n    ]\n\n    assert values == expected_values\n\n\n@require_torch\ndef test_with_format_torch(dataset_with_several_columns: IterableDataset):\n    import torch\n\n    dset = dataset_with_several_columns.with_format(type=\"torch\")\n    example = next(iter(dset))\n    batch = next(iter(dset.iter(batch_size=3)))\n    assert len(example) == 3\n    assert isinstance(example[\"id\"], torch.Tensor)\n    assert list(example[\"id\"].shape) == []\n    assert example[\"id\"].item() == 0\n    assert isinstance(batch[\"id\"], torch.Tensor)\n    assert isinstance(example[\"filepath\"], list)\n    assert isinstance(example[\"filepath\"][0], str)\n    assert example[\"filepath\"][0] == \"data0.txt\"\n    assert isinstance(batch[\"filepath\"], list)\n    assert isinstance(example[\"metadata\"], dict)\n    assert isinstance(example[\"metadata\"][\"sources\"], list)\n    assert isinstance(example[\"metadata\"][\"sources\"][0], str)\n    assert isinstance(batch[\"metadata\"], list)\n\n\n@require_tf\ndef test_with_format_tf(dataset_with_several_columns: IterableDataset):\n    import tensorflow as tf\n\n    dset = dataset_with_several_columns.with_format(type=\"tensorflow\")\n    example = next(iter(dset))\n    batch = next(iter(dset.iter(batch_size=3)))\n    assert isinstance(example[\"id\"], tf.Tensor)\n    assert list(example[\"id\"].shape) == []\n    
assert example[\"id\"].numpy().item() == 0\n    assert isinstance(batch[\"id\"], tf.Tensor)\n    assert isinstance(example[\"filepath\"], tf.Tensor)\n    assert example[\"filepath\"][0] == b\"data0.txt\"\n    assert isinstance(batch[\"filepath\"], tf.Tensor)\n    assert isinstance(example[\"metadata\"], dict)\n    assert isinstance(example[\"metadata\"][\"sources\"], tf.Tensor)\n    assert isinstance(batch[\"metadata\"], list)\n\n\ndef test_map_array_are_not_converted_back_to_lists(dataset: IterableDataset):\n    def func(example):\n        return {\"array\": np.array([1, 2, 3])}\n\n    dset_test = dataset.map(func)\n    example = next(iter(dset_test))\n    # not aligned with Dataset.map because we don't convert back to lists after map()\n    assert isinstance(example[\"array\"], np.ndarray)\n\n\ndef test_formatted_map(dataset: IterableDataset):\n    dataset = dataset.with_format(\"np\")\n    assert isinstance(next(dataset.iter(batch_size=3))[\"id\"], np.ndarray)\n    dataset = dataset.with_format(None)\n    assert isinstance(next(dataset.iter(batch_size=3))[\"id\"], list)\n\n    def add_one_numpy(example):\n        assert isinstance(example[\"id\"], np.ndarray)\n        return {\"id\": example[\"id\"] + 1}\n\n    dataset = dataset.with_format(\"np\")\n    dataset = dataset.map(add_one_numpy, batched=True)\n    assert isinstance(next(dataset.iter(batch_size=3))[\"id\"], np.ndarray)\n    dataset = dataset.with_format(None)\n    assert isinstance(next(dataset.iter(batch_size=3))[\"id\"], list)\n\n\ndef test_format_from_arrow():\n    python_arrow_extractor = Formatter.python_arrow_extractor\n    numpy_arrow_extractor = Formatter.numpy_arrow_extractor\n\n    with (\n        patch.object(Formatter, \"python_arrow_extractor\") as mock_python_arrow_extractor,\n        patch.object(Formatter, \"numpy_arrow_extractor\") as mock_numpy_arrow_extractor,\n    ):\n        mock_python_arrow_extractor.side_effect = python_arrow_extractor\n        
mock_numpy_arrow_extractor.side_effect = numpy_arrow_extractor\n\n        def g():\n            yield 0, pa.table({\"a\": range(10)})\n\n        ds = IterableDataset(ArrowExamplesIterable(g, {}))\n        ds = ds.with_format(\"np\")\n        ds = ds.map(lambda x: x, batched=True)\n        next(iter(ds))\n\n        # we do arrow -> numpy -> python\n        mock_numpy_arrow_extractor.assert_called()\n        # we don't do any arrow -> python\n        mock_python_arrow_extractor.assert_not_called()\n\n\ndef test_format_arrow(dataset: IterableDataset):\n    ds = dataset.with_format(\"arrow\")\n    assert isinstance(next(iter(ds)), pa.Table)\n    assert isinstance(next(iter(ds.iter(batch_size=4))), pa.Table)\n    assert len(next(iter(ds))) == 1\n    assert len(next(iter(ds.iter(batch_size=4)))) == 4\n    ds = ds.map(lambda t: t.append_column(\"new_col\", pa.array([0] * len(t))))\n    ds = ds.map(lambda t: t.append_column(\"new_col_batched\", pa.array([1] * len(t))), batched=True)\n    ds = ds.with_format(None)\n    assert next(iter(ds)) == {**next(iter(dataset)), \"new_col\": 0, \"new_col_batched\": 1}\n\n\ndef test_format_pandas(dataset: IterableDataset):\n    ds = dataset.with_format(\"pandas\")\n    assert isinstance(next(iter(ds)), pd.DataFrame)\n    assert isinstance(next(iter(ds.iter(batch_size=4))), pd.DataFrame)\n    assert len(next(iter(ds))) == 1\n    assert len(next(iter(ds.iter(batch_size=4)))) == 4\n    ds = ds.map(lambda df: df.assign(new_col=[0] * len(df)))\n    ds = ds.map(lambda df: df.assign(new_col_batched=[1] * len(df)), batched=True)\n    ds = ds.with_format(None)\n    assert next(iter(ds)) == {**next(iter(dataset)), \"new_col\": 0, \"new_col_batched\": 1}\n\n\n@require_polars\ndef test_format_polars(dataset: IterableDataset):\n    import polars as pl\n\n    ds = dataset.with_format(\"polars\")\n    assert isinstance(next(iter(ds)), pl.DataFrame)\n    assert isinstance(next(iter(ds.iter(batch_size=4))), pl.DataFrame)\n    assert len(next(iter(ds))) 
== 1\n    assert len(next(iter(ds.iter(batch_size=4)))) == 4\n    ds = ds.map(lambda df: df.with_columns(pl.Series([0] * len(df)).alias(\"new_col\")))\n    ds = ds.map(lambda df: df.with_columns(pl.Series([1] * len(df)).alias(\"new_col_batched\")), batched=True)\n    ds = ds.with_format(None)\n    assert next(iter(ds)) == {**next(iter(dataset)), \"new_col\": 0, \"new_col_batched\": 1}\n\n\n@pytest.mark.parametrize(\"num_shards1, num_shards2, num_workers\", [(2, 1, 1), (2, 2, 2), (1, 3, 1), (4, 3, 3)])\ndef test_interleave_dataset_with_sharding(num_shards1, num_shards2, num_workers):\n    from torch.utils.data import DataLoader\n\n    ex_iterable1 = ExamplesIterable(generate_examples_fn, {\"filepaths\": [f\"{i}-1.txt\" for i in range(num_shards1)]})\n    dataset1 = IterableDataset(ex_iterable1).with_format(\"torch\")\n    ex_iterable2 = ExamplesIterable(generate_examples_fn, {\"filepaths\": [f\"{i}-2.txt\" for i in range(num_shards2)]})\n    dataset2 = IterableDataset(ex_iterable2).with_format(\"torch\")\n\n    dataset_merged = interleave_datasets([dataset1, dataset2], stopping_strategy=\"first_exhausted\")\n    assert dataset_merged.num_shards == min(num_shards1, num_shards2)\n    dataloader = DataLoader(dataset_merged, batch_size=None, num_workers=num_workers)\n    result = list(dataloader)\n    expected_length = 2 * min(\n        len([example for _, example in ex_iterable1]), len([example for _, example in ex_iterable2])\n    )\n    # some samples may be missing because the stopping strategy is applied per process\n    assert expected_length - num_workers <= len(result) <= expected_length\n    assert len(result) == len({str(x) for x in result})\n\n\ndef filter_func(batch):\n    return batch[\"id\"] == 4\n\n\ndef map_func(batch):\n    batch[\"id\"] *= 2\n    return batch\n\n\ndef test_pickle_after_many_transforms(dataset_with_several_columns):\n    dataset = dataset_with_several_columns\n    dataset = dataset.remove_columns([\"filepath\"])\n    dataset = 
dataset.take(5)\n    dataset = dataset.map(map_func)\n    dataset = dataset.shuffle()\n    dataset = dataset.skip(1)\n    dataset = dataset.filter(filter_func)\n    dataset = dataset.add_column(\"additional_col\", [\"something\"])\n    dataset = dataset.rename_column(\"metadata\", \"metadata1\")\n    dataset = dataset.rename_columns({\"id\": \"id1\", \"metadata1\": \"metadata2\"})\n    dataset = dataset.select_columns([\"id1\", \"additional_col\"])\n\n    unpickled_dataset = pickle.loads(pickle.dumps(dataset))\n\n    assert list(unpickled_dataset) == list(dataset)\n\n\n@require_torchdata_stateful_dataloader\ndef test_resume_dataloader(dataset: IterableDataset):\n    from torchdata.stateful_dataloader import StatefulDataLoader\n\n    dl = StatefulDataLoader(dataset)\n    remaining = []\n    for i, x in enumerate(dl):\n        if i == 2:\n            state_dict = dl.state_dict()\n        elif i > 2:\n            remaining.append(x)\n    dl = StatefulDataLoader(dataset)\n    dl.load_state_dict(state_dict)\n    assert remaining == list(dl)\n\n\ndef test_iterable_dataset_batch():\n    # Create a simple IterableDataset\n    data = [{\"id\": i, \"text\": f\"Text {i}\"} for i in range(10)]\n    ds = IterableDataset.from_generator(lambda: (x for x in data))\n\n    # Test with batch_size=3, drop_last_batch=False\n    batched_ds = ds.batch(batch_size=3, drop_last_batch=False)\n    batches = list(batched_ds)\n\n    assert len(batches) == 4  # 3 full batches and 1 partial batch\n    for i, batch in enumerate(batches[:3]):  # Check full batches\n        assert len(batch[\"id\"]) == 3\n        assert len(batch[\"text\"]) == 3\n        assert batch[\"id\"] == [3 * i, 3 * i + 1, 3 * i + 2]\n        assert batch[\"text\"] == [f\"Text {3 * i}\", f\"Text {3 * i + 1}\", f\"Text {3 * i + 2}\"]\n\n    # Check last partial batch\n    assert len(batches[3][\"id\"]) == 1\n    assert len(batches[3][\"text\"]) == 1\n    assert batches[3][\"id\"] == [9]\n    assert batches[3][\"text\"] == 
[\"Text 9\"]\n\n    # Test with batch_size=3, drop_last_batch=True\n    batched_ds = ds.batch(batch_size=3, drop_last_batch=True)\n    batches = list(batched_ds)\n\n    assert len(batches) == 3  # Only full batches\n    for i, batch in enumerate(batches):\n        assert len(batch[\"id\"]) == 3\n        assert len(batch[\"text\"]) == 3\n        assert batch[\"id\"] == [3 * i, 3 * i + 1, 3 * i + 2]\n        assert batch[\"text\"] == [f\"Text {3 * i}\", f\"Text {3 * i + 1}\", f\"Text {3 * i + 2}\"]\n\n    # Test with batch_size=4 (doesn't evenly divide dataset size)\n    batched_ds = ds.batch(batch_size=4, drop_last_batch=False)\n    batches = list(batched_ds)\n\n    assert len(batches) == 3  # 2 full batches and 1 partial batch\n    for i, batch in enumerate(batches[:2]):  # Check full batches\n        assert len(batch[\"id\"]) == 4\n        assert len(batch[\"text\"]) == 4\n        assert batch[\"id\"] == [4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3]\n        assert batch[\"text\"] == [f\"Text {4 * i}\", f\"Text {4 * i + 1}\", f\"Text {4 * i + 2}\", f\"Text {4 * i + 3}\"]\n\n    # Check last partial batch\n    assert len(batches[2][\"id\"]) == 2\n    assert len(batches[2][\"text\"]) == 2\n    assert batches[2][\"id\"] == [8, 9]\n    assert batches[2][\"text\"] == [\"Text 8\", \"Text 9\"]\n\n    # Test with features\n    batched_ds = ds._resolve_features().batch(batch_size=3)\n    batches = list(batched_ds)\n\n    assert batched_ds.features is not None\n    assert len(batches) == 4  # 3 full batches and 1 partial batch\n    for i, batch in enumerate(batches[:1]):\n        assert len(batch[\"id\"]) == 3\n        assert len(batch[\"text\"]) == 3\n        assert batch[\"id\"] == [3 * i, 3 * i + 1, 3 * i + 2]\n        assert batch[\"text\"] == [f\"Text {3 * i}\", f\"Text {3 * i + 1}\", f\"Text {3 * i + 2}\"]\n\n\n@dataclass\nclass DecodableFeature:\n    decode_example_num_calls = 0\n\n    def __init__(self):\n        self.decode = True\n\n    def decode_example(self, 
example, token_per_repo_id=None):\n        type(self).decode_example_num_calls += 1\n        return \"decoded\" if self.decode else example\n\n    def __call__(self):\n        return pa.string()\n\n\ndef test_decode():\n    data = [{\"i\": str(i)} for i in range(10)]\n    features = Features({\"i\": DecodableFeature()})\n    ds = IterableDataset.from_generator(lambda: (x for x in data), features=features)\n    assert next(iter(ds)) == {\"i\": \"decoded\"}\n    assert DecodableFeature.decode_example_num_calls == 1\n    ds = ds.decode(False)\n    assert next(iter(ds)) == {\"i\": \"0\"}\n    assert DecodableFeature.decode_example_num_calls == 1\n    ds = ds.decode(True)\n    assert next(iter(ds)) == {\"i\": \"decoded\"}\n    assert DecodableFeature.decode_example_num_calls == 2\n    ds = ds.decode(num_threads=1)\n    assert next(iter(ds)) == {\"i\": \"decoded\"}\n    assert DecodableFeature.decode_example_num_calls == 4\n\n\n############################\n#\n#   IterableColumn tests\n#\n############################\n\n\nclass TestIterableColumn:\n    def test_simple_getitem(self):\n        def gen():\n            yield {\"text\": \"Good\", \"label\": 0}\n            yield {\"text\": \"Bad\", \"label\": 1}\n\n        ds = IterableDataset.from_generator(gen)\n        texts = ds[\"text\"]\n        assert isinstance(texts, IterableColumn)\n\n        first_pass = list(texts)\n        assert first_pass == [\"Good\", \"Bad\"]\n        second_pass = list(texts)\n        assert second_pass == [\"Good\", \"Bad\"]\n\n    def test_chained_getitem(self):\n        def gen():\n            yield {\"sample\": {\"text\": \"Good\", \"label\": 0}}\n            yield {\"sample\": {\"text\": \"Bad\", \"label\": 1}}\n\n        ds = IterableDataset.from_generator(gen)\n        texts = ds[\"sample\"][\"text\"]\n        assert isinstance(texts, IterableColumn)\n\n        first_pass = list(texts)\n        assert first_pass == [\"Good\", \"Bad\"]\n        second_pass = list(texts)\n        assert 
second_pass == [\"Good\", \"Bad\"]\n\n    def test_getitem_for_batched_dataset(self):\n        data = [\n            {\"text\": \"Good\", \"label\": 0},\n            {\"text\": \"Bad\", \"label\": 1},\n            {\"text\": \"Good again\", \"label\": 0},\n            {\"text\": \"Bad again\", \"label\": 1},\n        ]\n\n        def gen():\n            yield from data\n\n        ds = IterableDataset.from_generator(gen).batch(batch_size=2)\n        texts = ds[\"text\"]\n        assert isinstance(texts, IterableColumn)\n        assert list(texts) == [[\"Good\", \"Bad\"], [\"Good again\", \"Bad again\"]]\n"
  },
  {
    "path": "tests/test_load.py",
    "content": "import importlib\nimport os\nimport pickle\nimport shutil\nimport tempfile\nfrom multiprocessing import Pool\nfrom pathlib import Path\nfrom types import SimpleNamespace\nfrom unittest import TestCase\nfrom unittest.mock import patch\n\nimport dill\nimport pyarrow as pa\nimport pytest\n\nimport datasets\nfrom datasets import config, load_dataset\nfrom datasets.arrow_dataset import Dataset\nfrom datasets.arrow_writer import ArrowWriter\nfrom datasets.builder import DatasetBuilder\nfrom datasets.config import METADATA_CONFIGS_FIELD\nfrom datasets.data_files import DataFilesDict, DataFilesPatternsDict\nfrom datasets.dataset_dict import DatasetDict\nfrom datasets.download.download_config import DownloadConfig\nfrom datasets.exceptions import DatasetNotFoundError\nfrom datasets.features import Features, Value\nfrom datasets.iterable_dataset import IterableDataset\nfrom datasets.load import (\n    CachedDatasetModuleFactory,\n    HubDatasetModuleFactory,\n    LocalDatasetModuleFactory,\n    PackagedDatasetModuleFactory,\n    infer_module_for_data_files_list,\n    infer_module_for_data_files_list_in_archives,\n    load_dataset_builder,\n)\nfrom datasets.packaged_modules.audiofolder.audiofolder import AudioFolder, AudioFolderConfig\nfrom datasets.packaged_modules.imagefolder.imagefolder import ImageFolder, ImageFolderConfig\nfrom datasets.utils.logging import INFO, get_logger\n\nfrom .utils import (\n    OfflineSimulationMode,\n    assert_arrow_memory_doesnt_increase,\n    assert_arrow_memory_increases,\n    offline,\n    require_pil,\n    require_torchcodec,\n    require_zstandard,\n    set_current_working_directory_to_temp_dir,\n)\n\n\nSAMPLE_DATASET_IDENTIFIER2 = \"hf-internal-testing/dataset_with_data_files\"  # only has data files\nSAMPLE_DATASET_IDENTIFIER3 = \"hf-internal-testing/multi_dir_dataset\"  # has multiple data directories\nSAMPLE_DATASET_IDENTIFIER4 = \"hf-internal-testing/imagefolder_with_metadata\"  # imagefolder with a metadata file 
inside the train/test directories\nSAMPLE_DATASET_IDENTIFIER5 = \"hf-internal-testing/imagefolder_with_metadata_no_splits\"  # imagefolder with a metadata file and no default split names in data files\n\nSAMPLE_DATASET_COMMIT_HASH = \"0e1cee81e718feadf49560b287c4eb669c2efb1a\"\nSAMPLE_DATASET_COMMIT_HASH2 = \"c19550d35263090b1ec2bfefdbd737431fafec40\"\nSAMPLE_DATASET_COMMIT_HASH3 = \"aaa2d4bdd1d877d1c6178562cfc584bdfa90f6dc\"\nSAMPLE_DATASET_COMMIT_HASH4 = \"507fa72044169a5a1802b7ac2d6bd38d5f310739\"\nSAMPLE_DATASET_COMMIT_HASH5 = \"4971fa562942cab8263f56a448c3f831b18f1c27\"\n\nSAMPLE_DATASET_NO_CONFIGS_IN_METADATA = \"hf-internal-testing/audiofolder_no_configs_in_metadata\"\nSAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA = \"hf-internal-testing/audiofolder_single_config_in_metadata\"\nSAMPLE_DATASET_TWO_CONFIG_IN_METADATA = \"hf-internal-testing/audiofolder_two_configs_in_metadata\"\nSAMPLE_DATASET_TWO_CONFIG_IN_METADATA_WITH_DEFAULT = (\n    \"hf-internal-testing/audiofolder_two_configs_in_metadata_with_default\"\n)\nSAMPLE_DATASET_CAPITAL_LETTERS_IN_NAME = \"hf-internal-testing/DatasetWithCapitalLetters\"\n\nSAMPLE_DATASET_NO_CONFIGS_IN_METADATA_COMMIT_HASH = \"26cd5079bb0d3cd1521c6894765a0b8edb159d7f\"\nSAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA_COMMIT_HASH = \"1668dfc91efae975e44457cdabef60fb9200820a\"\nSAMPLE_DATASET_TWO_CONFIG_IN_METADATA_COMMIT_HASH = \"e71bce498e6c2bd2c58b20b097fdd3389793263f\"\nSAMPLE_DATASET_TWO_CONFIG_IN_METADATA_WITH_DEFAULT_COMMIT_HASH = \"38937109bb4dc7067f575fe6e7b420158eb9cf32\"\nSAMPLE_DATASET_CAPITAL_LETTERS_IN_NAME_COMMIT_HASH = \"70aa36264a6954920a13dd0465156a60b9f8af4b\"\n\nSAMPLE_NOT_EXISTING_DATASET_IDENTIFIER = \"hf-internal-testing/_dummy\"\nSAMPLE_DATASET_NAME_THAT_DOESNT_EXIST = \"_dummy\"\n\n\n@pytest.fixture\ndef data_dir(tmp_path):\n    data_dir = tmp_path / \"data_dir\"\n    data_dir.mkdir()\n    with open(data_dir / \"train.txt\", \"w\") as f:\n        f.write(\"foo\\n\" * 10)\n    with open(data_dir / \"test.txt\", \"w\") 
as f:\n        f.write(\"bar\\n\" * 10)\n    return str(data_dir)\n\n\n@pytest.fixture\ndef data_dir_with_arrow(tmp_path):\n    data_dir = tmp_path / \"data_dir\"\n    data_dir.mkdir()\n    output_train = os.path.join(data_dir, \"train.arrow\")\n    with ArrowWriter(path=output_train) as writer:\n        writer.write_table(pa.Table.from_pydict({\"col_1\": [\"foo\"] * 10}))\n        num_examples, num_bytes = writer.finalize()\n    assert num_examples == 10\n    assert num_bytes > 0\n    output_test = os.path.join(data_dir, \"test.arrow\")\n    with ArrowWriter(path=output_test) as writer:\n        writer.write_table(pa.Table.from_pydict({\"col_1\": [\"bar\"] * 10}))\n        num_examples, num_bytes = writer.finalize()\n    assert num_examples == 10\n    assert num_bytes > 0\n    return str(data_dir)\n\n\n@pytest.fixture\ndef data_dir_with_metadata(tmp_path):\n    data_dir = tmp_path / \"data_dir_with_metadata\"\n    data_dir.mkdir()\n    (data_dir / \"train\").mkdir()\n    (data_dir / \"test\").mkdir()\n    with open(data_dir / \"train\" / \"cat.jpg\", \"wb\") as f:\n        f.write(b\"train_image_bytes\")\n    with open(data_dir / \"test\" / \"dog.jpg\", \"wb\") as f:\n        f.write(b\"test_image_bytes\")\n    with open(data_dir / \"train\" / \"metadata.jsonl\", \"w\") as f:\n        f.write(\n            \"\"\"\\\n        {\"file_name\": \"cat.jpg\", \"caption\": \"Cool train cat image\"}\n        \"\"\"\n        )\n    with open(data_dir / \"test\" / \"metadata.jsonl\", \"w\") as f:\n        f.write(\n            \"\"\"\\\n        {\"file_name\": \"dog.jpg\", \"caption\": \"Cool test dog image\"}\n        \"\"\"\n        )\n    return str(data_dir)\n\n\n@pytest.fixture\ndef data_dir_with_single_config_in_metadata(tmp_path):\n    data_dir = tmp_path / \"data_dir_with_one_default_config_in_metadata\"\n\n    cats_data_dir = data_dir / \"cats\"\n    cats_data_dir.mkdir(parents=True)\n    dogs_data_dir = data_dir / \"dogs\"\n    dogs_data_dir.mkdir(parents=True)\n\n 
   with open(cats_data_dir / \"cat.jpg\", \"wb\") as f:\n        f.write(b\"this_is_a_cat_image_bytes\")\n    with open(dogs_data_dir / \"dog.jpg\", \"wb\") as f:\n        f.write(b\"this_is_a_dog_image_bytes\")\n    with open(data_dir / \"README.md\", \"w\") as f:\n        f.write(\n            f\"\"\"\\\n---\n{METADATA_CONFIGS_FIELD}:\n  - config_name: custom\n    drop_labels: true\n---\n        \"\"\"\n        )\n    return str(data_dir)\n\n\n@pytest.fixture\ndef data_dir_with_config_and_data_files(tmp_path):\n    data_dir = tmp_path / \"data_dir_with_config_and_data_files\"\n\n    cats_data_dir = data_dir / \"data\" / \"cats\"\n    cats_data_dir.mkdir(parents=True)\n    dogs_data_dir = data_dir / \"data\" / \"dogs\"\n    dogs_data_dir.mkdir(parents=True)\n\n    with open(cats_data_dir / \"cat.jpg\", \"wb\") as f:\n        f.write(b\"this_is_a_cat_image_bytes\")\n    with open(dogs_data_dir / \"dog.jpg\", \"wb\") as f:\n        f.write(b\"this_is_a_dog_image_bytes\")\n    with open(data_dir / \"README.md\", \"w\") as f:\n        f.write(\n            f\"\"\"\\\n---\n{METADATA_CONFIGS_FIELD}:\n  - config_name: custom\n    data_files: \"data/**/*.jpg\"\n---\n        \"\"\"\n        )\n    return str(data_dir)\n\n\n@pytest.fixture\ndef data_dir_with_two_config_in_metadata(tmp_path):\n    data_dir = tmp_path / \"data_dir_with_two_configs_in_metadata\"\n    cats_data_dir = data_dir / \"cats\"\n    cats_data_dir.mkdir(parents=True)\n    dogs_data_dir = data_dir / \"dogs\"\n    dogs_data_dir.mkdir(parents=True)\n\n    with open(cats_data_dir / \"cat.jpg\", \"wb\") as f:\n        f.write(b\"this_is_a_cat_image_bytes\")\n    with open(dogs_data_dir / \"dog.jpg\", \"wb\") as f:\n        f.write(b\"this_is_a_dog_image_bytes\")\n\n    with open(data_dir / \"README.md\", \"w\") as f:\n        f.write(\n            f\"\"\"\\\n---\n{METADATA_CONFIGS_FIELD}:\n  - config_name: \"v1\"\n    drop_labels: true\n    default: true\n  - config_name: \"v2\"\n    drop_labels: 
false\n---\n        \"\"\"\n        )\n    return str(data_dir)\n\n\n@pytest.fixture\ndef data_dir_with_data_dir_configs_in_metadata(tmp_path):\n    data_dir = tmp_path / \"data_dir_with_two_configs_in_metadata\"\n    cats_data_dir = data_dir / \"cats\"\n    cats_data_dir.mkdir(parents=True)\n    dogs_data_dir = data_dir / \"dogs\"\n    dogs_data_dir.mkdir(parents=True)\n\n    with open(cats_data_dir / \"cat.jpg\", \"wb\") as f:\n        f.write(b\"this_is_a_cat_image_bytes\")\n    with open(dogs_data_dir / \"dog.jpg\", \"wb\") as f:\n        f.write(b\"this_is_a_dog_image_bytes\")\n\n\n@pytest.fixture\ndef sub_data_dirs(tmp_path):\n    data_dir2 = tmp_path / \"data_dir2\"\n    relative_subdir1 = \"subdir1\"\n    sub_data_dir1 = data_dir2 / relative_subdir1\n    sub_data_dir1.mkdir(parents=True)\n    with open(sub_data_dir1 / \"train.txt\", \"w\") as f:\n        f.write(\"foo\\n\" * 10)\n    with open(sub_data_dir1 / \"test.txt\", \"w\") as f:\n        f.write(\"bar\\n\" * 10)\n\n    relative_subdir2 = \"subdir2\"\n    sub_data_dir2 = tmp_path / data_dir2 / relative_subdir2\n    sub_data_dir2.mkdir(parents=True)\n    with open(sub_data_dir2 / \"train.txt\", \"w\") as f:\n        f.write(\"foo\\n\" * 10)\n    with open(sub_data_dir2 / \"test.txt\", \"w\") as f:\n        f.write(\"bar\\n\" * 10)\n\n    return str(data_dir2), relative_subdir1\n\n\n@pytest.fixture\ndef complex_data_dir(tmp_path):\n    data_dir = tmp_path / \"complex_data_dir\"\n    data_dir.mkdir()\n    (data_dir / \"data\").mkdir()\n    with open(data_dir / \"data\" / \"train.txt\", \"w\") as f:\n        f.write(\"foo\\n\" * 10)\n    with open(data_dir / \"data\" / \"test.txt\", \"w\") as f:\n        f.write(\"bar\\n\" * 10)\n    with open(data_dir / \"README.md\", \"w\") as f:\n        f.write(\"This is a readme\")\n    with open(data_dir / \".dummy\", \"w\") as f:\n        f.write(\"this is a dummy file that is not a data file\")\n    return str(data_dir)\n\n\n@pytest.mark.parametrize(\n    
\"data_files, expected_module, expected_builder_kwargs\",\n    [\n        ([\"train.csv\"], \"csv\", {}),\n        ([\"train.tsv\"], \"csv\", {\"sep\": \"\\t\"}),\n        ([\"train.json\"], \"json\", {}),\n        ([\"train.jsonl\"], \"json\", {}),\n        ([\"train.parquet\"], \"parquet\", {}),\n        ([\"train.geoparquet\"], \"parquet\", {}),\n        ([\"train.gpq\"], \"parquet\", {}),\n        ([\"train.arrow\"], \"arrow\", {}),\n        ([\"train.txt\"], \"text\", {}),\n        ([\"uppercase.TXT\"], \"text\", {}),\n        ([\"unsupported.ext\"], None, {}),\n        ([\"\"], None, {}),\n    ],\n)\ndef test_infer_module_for_data_files(data_files, expected_module, expected_builder_kwargs):\n    module, builder_kwargs = infer_module_for_data_files_list(data_files)\n    assert module == expected_module\n    assert builder_kwargs == expected_builder_kwargs\n\n\n@pytest.mark.parametrize(\n    \"data_file, expected_module\",\n    [\n        (\"zip_csv_path\", \"csv\"),\n        (\"zip_csv_with_dir_path\", \"csv\"),\n        (\"zip_uppercase_csv_path\", \"csv\"),\n        (\"zip_unsupported_ext_path\", None),\n    ],\n)\ndef test_infer_module_for_data_files_in_archives(\n    data_file, expected_module, zip_csv_path, zip_csv_with_dir_path, zip_uppercase_csv_path, zip_unsupported_ext_path\n):\n    data_file_paths = {\n        \"zip_csv_path\": zip_csv_path,\n        \"zip_csv_with_dir_path\": zip_csv_with_dir_path,\n        \"zip_uppercase_csv_path\": zip_uppercase_csv_path,\n        \"zip_unsupported_ext_path\": zip_unsupported_ext_path,\n    }\n    data_files = [str(data_file_paths[data_file])]\n    inferred_module, _ = infer_module_for_data_files_list_in_archives(data_files)\n    assert inferred_module == expected_module\n\n\nclass ModuleFactoryTest(TestCase):\n    @pytest.fixture(autouse=True)\n    def inject_fixtures(\n        self,\n        jsonl_path,\n        data_dir,\n        data_dir_with_metadata,\n        data_dir_with_single_config_in_metadata,\n       
 data_dir_with_config_and_data_files,\n        data_dir_with_two_config_in_metadata,\n        sub_data_dirs,\n    ):\n        self._jsonl_path = jsonl_path\n        self._data_dir = data_dir\n        self._data_dir_with_metadata = data_dir_with_metadata\n        self._data_dir_with_single_config_in_metadata = data_dir_with_single_config_in_metadata\n        self._data_dir_with_config_and_data_files = data_dir_with_config_and_data_files\n        self._data_dir_with_two_config_in_metadata = data_dir_with_two_config_in_metadata\n        self._data_dir2 = sub_data_dirs[0]\n        self._sub_data_dir = sub_data_dirs[1]\n\n    def setUp(self):\n        self.cache_dir = tempfile.mkdtemp()\n        self.download_config = DownloadConfig(cache_dir=self.cache_dir)\n\n    def test_LocalDatasetModuleFactory(self):\n        factory = LocalDatasetModuleFactory(self._data_dir)\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n        assert os.path.isdir(module_factory_result.builder_kwargs[\"base_path\"])\n\n    def test_LocalDatasetModuleFactory_with_data_dir(self):\n        factory = LocalDatasetModuleFactory(self._data_dir2, data_dir=self._sub_data_dir)\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n        builder_config = module_factory_result.builder_configs_parameters.builder_configs[0]\n        assert (\n            builder_config.data_files is not None\n            and len(builder_config.data_files[\"train\"]) == 1\n            and len(builder_config.data_files[\"test\"]) == 1\n        )\n        assert all(\n            self._sub_data_dir in Path(data_file).parts\n            for data_file in builder_config.data_files[\"train\"] + builder_config.data_files[\"test\"]\n        )\n\n    def test_LocalDatasetModuleFactory_with_metadata(self):\n        factory = 
LocalDatasetModuleFactory(self._data_dir_with_metadata)\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n        builder_config = module_factory_result.builder_configs_parameters.builder_configs[0]\n        assert (\n            builder_config.data_files is not None\n            and len(builder_config.data_files[\"train\"]) > 0\n            and len(builder_config.data_files[\"test\"]) > 0\n        )\n        assert any(Path(data_file).name == \"metadata.jsonl\" for data_file in builder_config.data_files[\"train\"])\n        assert any(Path(data_file).name == \"metadata.jsonl\" for data_file in builder_config.data_files[\"test\"])\n\n    def test_LocalDatasetModuleFactory_with_single_config_in_metadata(self):\n        factory = LocalDatasetModuleFactory(\n            self._data_dir_with_single_config_in_metadata,\n        )\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n\n        module_metadata_configs = module_factory_result.builder_configs_parameters.metadata_configs\n        assert module_metadata_configs is not None\n        assert len(module_metadata_configs) == 1\n        assert next(iter(module_metadata_configs)) == \"custom\"\n        assert \"drop_labels\" in next(iter(module_metadata_configs.values()))\n        assert next(iter(module_metadata_configs.values()))[\"drop_labels\"] is True\n\n        module_builder_configs = module_factory_result.builder_configs_parameters.builder_configs\n        assert module_builder_configs is not None\n        assert len(module_builder_configs) == 1\n        assert isinstance(module_builder_configs[0], ImageFolderConfig)\n        assert module_builder_configs[0].name == \"custom\"\n        assert module_builder_configs[0].data_files is not None\n        assert isinstance(module_builder_configs[0].data_files, DataFilesPatternsDict)\n     
   module_builder_configs[0]._resolve_data_files(self._data_dir_with_single_config_in_metadata, DownloadConfig())\n        assert isinstance(module_builder_configs[0].data_files, DataFilesDict)\n        assert len(module_builder_configs[0].data_files) == 1  # one train split\n        assert len(module_builder_configs[0].data_files[\"train\"]) == 2  # two files\n        assert module_builder_configs[0].drop_labels is True  # parameter is passed from metadata\n\n        # config named \"default\" is automatically considered to be a default config\n        assert module_factory_result.builder_configs_parameters.default_config_name == \"custom\"\n\n        # we don't pass config params to builder in builder_kwargs, they are stored in builder_configs directly\n        assert \"drop_labels\" not in module_factory_result.builder_kwargs\n\n    def test_LocalDatasetModuleFactory_with_config_and_data_files(self):\n        factory = LocalDatasetModuleFactory(\n            self._data_dir_with_config_and_data_files,\n        )\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n\n        module_metadata_configs = module_factory_result.builder_configs_parameters.metadata_configs\n        builder_kwargs = module_factory_result.builder_kwargs\n        assert module_metadata_configs is not None\n        assert len(module_metadata_configs) == 1\n        assert next(iter(module_metadata_configs)) == \"custom\"\n        assert \"data_files\" in next(iter(module_metadata_configs.values()))\n        assert next(iter(module_metadata_configs.values()))[\"data_files\"] == \"data/**/*.jpg\"\n        assert \"data_files\" not in builder_kwargs\n\n    def test_LocalDatasetModuleFactory_data_dir_with_config_and_data_files(self):\n        factory = LocalDatasetModuleFactory(self._data_dir_with_config_and_data_files, data_dir=\"data\")\n        module_factory_result = factory.get_module()\n        assert 
importlib.import_module(module_factory_result.module_path) is not None\n\n        module_metadata_configs = module_factory_result.builder_configs_parameters.metadata_configs\n        builder_kwargs = module_factory_result.builder_kwargs\n        assert module_metadata_configs is not None\n        assert len(module_metadata_configs) == 1\n        assert next(iter(module_metadata_configs)) == \"custom\"\n        assert \"data_files\" in next(iter(module_metadata_configs.values()))\n        assert next(iter(module_metadata_configs.values()))[\"data_files\"] == \"data/**/*.jpg\"\n        assert \"data_files\" in builder_kwargs\n        assert \"train\" in builder_kwargs[\"data_files\"]\n        assert len(builder_kwargs[\"data_files\"][\"train\"]) == 2\n\n    def test_LocalDatasetModuleFactory_with_two_configs_in_metadata(self):\n        factory = LocalDatasetModuleFactory(\n            self._data_dir_with_two_config_in_metadata,\n        )\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n\n        module_metadata_configs = module_factory_result.builder_configs_parameters.metadata_configs\n        assert module_metadata_configs is not None\n        assert len(module_metadata_configs) == 2\n        assert list(module_metadata_configs) == [\"v1\", \"v2\"]\n        assert \"drop_labels\" in module_metadata_configs[\"v1\"]\n        assert module_metadata_configs[\"v1\"][\"drop_labels\"] is True\n        assert \"drop_labels\" in module_metadata_configs[\"v2\"]\n        assert module_metadata_configs[\"v2\"][\"drop_labels\"] is False\n\n        module_builder_configs = module_factory_result.builder_configs_parameters.builder_configs\n        assert module_builder_configs is not None\n        assert len(module_builder_configs) == 2\n        module_builder_config_v1, module_builder_config_v2 = module_builder_configs\n        assert module_builder_config_v1.name == \"v1\"\n        
assert module_builder_config_v2.name == \"v2\"\n        assert isinstance(module_builder_config_v1, ImageFolderConfig)\n        assert isinstance(module_builder_config_v2, ImageFolderConfig)\n        assert isinstance(module_builder_config_v1.data_files, DataFilesPatternsDict)\n        assert isinstance(module_builder_config_v2.data_files, DataFilesPatternsDict)\n        module_builder_config_v1._resolve_data_files(self._data_dir_with_two_config_in_metadata, DownloadConfig())\n        module_builder_config_v2._resolve_data_files(self._data_dir_with_two_config_in_metadata, DownloadConfig())\n        assert isinstance(module_builder_config_v1.data_files, DataFilesDict)\n        assert isinstance(module_builder_config_v2.data_files, DataFilesDict)\n        assert sorted(module_builder_config_v1.data_files) == [\"train\"]\n        assert len(module_builder_config_v1.data_files[\"train\"]) == 2\n        assert sorted(module_builder_config_v2.data_files) == [\"train\"]\n        assert len(module_builder_config_v2.data_files[\"train\"]) == 2\n        assert module_builder_config_v1.drop_labels is True  # parameter is passed from metadata\n        assert module_builder_config_v2.drop_labels is False  # parameter is passed from metadata\n\n        assert (\n            module_factory_result.builder_configs_parameters.default_config_name == \"v1\"\n        )  # it's marked as a default one in yaml\n\n        # we don't pass config params to builder in builder_kwargs, they are stored in builder_configs directly\n        assert \"drop_labels\" not in module_factory_result.builder_kwargs\n\n    def test_PackagedDatasetModuleFactory(self):\n        factory = PackagedDatasetModuleFactory(\n            \"json\", data_files=self._jsonl_path, download_config=self.download_config\n        )\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n\n    def 
test_PackagedDatasetModuleFactory_with_data_dir(self):\n        factory = PackagedDatasetModuleFactory(\"json\", data_dir=self._data_dir, download_config=self.download_config)\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n        data_files = module_factory_result.builder_kwargs.get(\"data_files\")\n        assert data_files is not None and len(data_files[\"train\"]) > 0 and len(data_files[\"test\"]) > 0\n        assert Path(data_files[\"train\"][0]).parent.samefile(self._data_dir)\n        assert Path(data_files[\"test\"][0]).parent.samefile(self._data_dir)\n\n    def test_PackagedDatasetModuleFactory_with_data_dir_and_metadata(self):\n        factory = PackagedDatasetModuleFactory(\n            \"imagefolder\", data_dir=self._data_dir_with_metadata, download_config=self.download_config\n        )\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n        data_files = module_factory_result.builder_kwargs.get(\"data_files\")\n        assert data_files is not None and len(data_files[\"train\"]) > 0 and len(data_files[\"test\"]) > 0\n        assert Path(self._data_dir_with_metadata) in Path(data_files[\"train\"][0]).parents\n        assert Path(self._data_dir_with_metadata) in Path(data_files[\"test\"][0]).parents\n        assert any(Path(data_file).name == \"metadata.jsonl\" for data_file in data_files[\"train\"])\n        assert any(Path(data_file).name == \"metadata.jsonl\" for data_file in data_files[\"test\"])\n\n    @pytest.mark.integration\n    def test_HubDatasetModuleFactory(self):\n        factory = HubDatasetModuleFactory(\n            SAMPLE_DATASET_IDENTIFIER2, commit_hash=SAMPLE_DATASET_COMMIT_HASH2, download_config=self.download_config\n        )\n        module_factory_result = factory.get_module()\n        assert 
importlib.import_module(module_factory_result.module_path) is not None\n        assert module_factory_result.builder_kwargs[\"base_path\"].startswith(config.HF_ENDPOINT)\n\n    @pytest.mark.integration\n    def test_HubDatasetModuleFactory_with_data_dir(self):\n        data_dir = \"data2\"\n        factory = HubDatasetModuleFactory(\n            SAMPLE_DATASET_IDENTIFIER3,\n            commit_hash=SAMPLE_DATASET_COMMIT_HASH3,\n            data_dir=data_dir,\n            download_config=self.download_config,\n        )\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n        builder_config = module_factory_result.builder_configs_parameters.builder_configs[0]\n        assert module_factory_result.builder_kwargs[\"base_path\"].startswith(config.HF_ENDPOINT)\n        assert (\n            builder_config.data_files is not None\n            and len(builder_config.data_files[\"train\"]) == 1\n            and len(builder_config.data_files[\"test\"]) == 1\n        )\n        assert all(\n            data_dir in Path(data_file).parts\n            for data_file in builder_config.data_files[\"train\"] + builder_config.data_files[\"test\"]\n        )\n\n    @pytest.mark.integration\n    def test_HubDatasetModuleFactory_with_metadata(self):\n        factory = HubDatasetModuleFactory(\n            SAMPLE_DATASET_IDENTIFIER4, commit_hash=SAMPLE_DATASET_COMMIT_HASH4, download_config=self.download_config\n        )\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n        builder_config = module_factory_result.builder_configs_parameters.builder_configs[0]\n        assert module_factory_result.builder_kwargs[\"base_path\"].startswith(config.HF_ENDPOINT)\n        assert (\n            builder_config.data_files is not None\n            and len(builder_config.data_files[\"train\"]) > 0\n            and 
len(builder_config.data_files[\"test\"]) > 0\n        )\n        assert any(Path(data_file).name == \"metadata.jsonl\" for data_file in builder_config.data_files[\"train\"])\n        assert any(Path(data_file).name == \"metadata.jsonl\" for data_file in builder_config.data_files[\"test\"])\n\n        factory = HubDatasetModuleFactory(\n            SAMPLE_DATASET_IDENTIFIER5, commit_hash=SAMPLE_DATASET_COMMIT_HASH5, download_config=self.download_config\n        )\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n        builder_config = module_factory_result.builder_configs_parameters.builder_configs[0]\n        assert module_factory_result.builder_kwargs[\"base_path\"].startswith(config.HF_ENDPOINT)\n        assert (\n            builder_config.data_files is not None\n            and len(builder_config.data_files) == 1\n            and len(builder_config.data_files[\"train\"]) > 0\n        )\n        assert any(Path(data_file).name == \"metadata.jsonl\" for data_file in builder_config.data_files[\"train\"])\n\n    @pytest.mark.integration\n    def test_HubDatasetModuleFactory_with_one_default_config_in_metadata(self):\n        factory = HubDatasetModuleFactory(\n            SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA,\n            commit_hash=SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA_COMMIT_HASH,\n            download_config=self.download_config,\n        )\n        module_factory_result = factory.get_module()\n        assert importlib.import_module(module_factory_result.module_path) is not None\n        assert module_factory_result.builder_kwargs[\"base_path\"].startswith(config.HF_ENDPOINT)\n\n        module_metadata_configs = module_factory_result.builder_configs_parameters.metadata_configs\n        assert module_metadata_configs is not None\n        assert len(module_metadata_configs) == 1\n        assert next(iter(module_metadata_configs)) == \"custom\"\n        assert 
\"drop_labels\" in next(iter(module_metadata_configs.values()))\n        assert next(iter(module_metadata_configs.values()))[\"drop_labels\"] is True\n\n        module_builder_configs = module_factory_result.builder_configs_parameters.builder_configs\n        assert module_builder_configs is not None\n        assert len(module_builder_configs) == 1\n        assert isinstance(module_builder_configs[0], AudioFolderConfig)\n        assert module_builder_configs[0].name == \"custom\"\n        assert module_builder_configs[0].data_files is not None\n        assert isinstance(module_builder_configs[0].data_files, DataFilesPatternsDict)\n        module_builder_configs[0]._resolve_data_files(\n            module_factory_result.builder_kwargs[\"base_path\"], DownloadConfig()\n        )\n        assert isinstance(module_builder_configs[0].data_files, DataFilesDict)\n        assert sorted(module_builder_configs[0].data_files) == [\"test\", \"train\"]\n        assert len(module_builder_configs[0].data_files[\"train\"]) == 3\n        assert len(module_builder_configs[0].data_files[\"test\"]) == 3\n        assert module_builder_configs[0].drop_labels is True  # parameter is passed from metadata\n\n        # config named \"default\" is automatically considered to be a default config\n        assert module_factory_result.builder_configs_parameters.default_config_name == \"custom\"\n\n        # we don't pass config params to builder in builder_kwargs, they are stored in builder_configs directly\n        assert \"drop_labels\" not in module_factory_result.builder_kwargs\n\n    @pytest.mark.integration\n    def test_HubDatasetModuleFactory_with_two_configs_in_metadata(self):\n        datasets_names = [\n            (SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, SAMPLE_DATASET_TWO_CONFIG_IN_METADATA_COMMIT_HASH),\n            (\n                SAMPLE_DATASET_TWO_CONFIG_IN_METADATA_WITH_DEFAULT,\n                SAMPLE_DATASET_TWO_CONFIG_IN_METADATA_WITH_DEFAULT_COMMIT_HASH,\n            
),\n        ]\n        for dataset_name, commit_hash in datasets_names:\n            factory = HubDatasetModuleFactory(\n                dataset_name, commit_hash=commit_hash, download_config=self.download_config\n            )\n            module_factory_result = factory.get_module()\n            assert importlib.import_module(module_factory_result.module_path) is not None\n\n            module_metadata_configs = module_factory_result.builder_configs_parameters.metadata_configs\n            assert module_metadata_configs is not None\n            assert len(module_metadata_configs) == 2\n            assert list(module_metadata_configs) == [\"v1\", \"v2\"]\n            assert \"drop_labels\" in module_metadata_configs[\"v1\"]\n            assert module_metadata_configs[\"v1\"][\"drop_labels\"] is True\n            assert \"drop_labels\" in module_metadata_configs[\"v2\"]\n            assert module_metadata_configs[\"v2\"][\"drop_labels\"] is False\n\n            module_builder_configs = module_factory_result.builder_configs_parameters.builder_configs\n            assert module_builder_configs is not None\n            assert len(module_builder_configs) == 2\n            module_builder_config_v1, module_builder_config_v2 = module_builder_configs\n            assert module_builder_config_v1.name == \"v1\"\n            assert module_builder_config_v2.name == \"v2\"\n            assert isinstance(module_builder_config_v1, AudioFolderConfig)\n            assert isinstance(module_builder_config_v2, AudioFolderConfig)\n            assert isinstance(module_builder_config_v1.data_files, DataFilesPatternsDict)\n            assert isinstance(module_builder_config_v2.data_files, DataFilesPatternsDict)\n            module_builder_config_v1._resolve_data_files(\n                module_factory_result.builder_kwargs[\"base_path\"], DownloadConfig()\n            )\n            module_builder_config_v2._resolve_data_files(\n                
module_factory_result.builder_kwargs[\"base_path\"], DownloadConfig()\n            )\n            assert isinstance(module_builder_config_v1.data_files, DataFilesDict)\n            assert isinstance(module_builder_config_v2.data_files, DataFilesDict)\n            assert sorted(module_builder_config_v1.data_files) == [\"test\", \"train\"]\n            assert len(module_builder_config_v1.data_files[\"train\"]) == 3\n            assert len(module_builder_config_v1.data_files[\"test\"]) == 3\n            assert sorted(module_builder_config_v2.data_files) == [\"test\", \"train\"]\n            assert len(module_builder_config_v2.data_files[\"train\"]) == 2\n            assert len(module_builder_config_v2.data_files[\"test\"]) == 1\n            assert module_builder_config_v1.drop_labels is True  # parameter is passed from metadata\n            assert module_builder_config_v2.drop_labels is False  # parameter is passed from metadata\n            # we don't pass config params to builder in builder_kwargs, they are stored in builder_configs directly\n            assert \"drop_labels\" not in module_factory_result.builder_kwargs\n\n            if dataset_name == SAMPLE_DATASET_TWO_CONFIG_IN_METADATA_WITH_DEFAULT:\n                assert module_factory_result.builder_configs_parameters.default_config_name == \"v1\"\n            else:\n                assert module_factory_result.builder_configs_parameters.default_config_name is None\n\n    @pytest.mark.integration\n    def test_CachedDatasetModuleFactory(self):\n        name = SAMPLE_DATASET_IDENTIFIER2\n        load_dataset_builder(name, cache_dir=self.cache_dir).download_and_prepare()\n        for offline_mode in OfflineSimulationMode:\n            with offline(offline_mode):\n                factory = CachedDatasetModuleFactory(\n                    name,\n                    cache_dir=self.cache_dir,\n                )\n                module_factory_result = factory.get_module()\n                assert 
importlib.import_module(module_factory_result.module_path) is not None\n\n\n@pytest.mark.parametrize(\n    \"factory_class,requires_commit_hash\",\n    [\n        (CachedDatasetModuleFactory, False),\n        (HubDatasetModuleFactory, True),\n        (LocalDatasetModuleFactory, False),\n        (PackagedDatasetModuleFactory, False),\n    ],\n)\ndef test_module_factories(factory_class, requires_commit_hash):\n    name = \"dummy_name\"\n    if requires_commit_hash:\n        factory = factory_class(name, commit_hash=\"foo\")\n    else:\n        factory = factory_class(name)\n    assert factory.name == name\n\n\n@pytest.mark.integration\nclass LoadTest(TestCase):\n    @pytest.fixture(autouse=True)\n    def inject_fixtures(self, caplog):\n        self._caplog = caplog\n\n    def setUp(self):\n        self.cache_dir = tempfile.mkdtemp()\n\n    def tearDown(self):\n        shutil.rmtree(self.cache_dir)\n\n    @pytest.mark.integration\n    def test_offline_dataset_module_factory(self):\n        repo_id = SAMPLE_DATASET_IDENTIFIER2\n        builder = load_dataset_builder(repo_id, cache_dir=self.cache_dir)\n        builder.download_and_prepare()\n        for offline_simulation_mode in list(OfflineSimulationMode):\n            with offline(offline_simulation_mode):\n                self._caplog.clear()\n                # allow provide the repo id without an explicit path to remote or local actual file\n                dataset_module = datasets.load.dataset_module_factory(repo_id, cache_dir=self.cache_dir)\n                self.assertEqual(dataset_module.module_path, \"datasets.packaged_modules.cache.cache\")\n                self.assertIn(\"Using the latest cached version of the dataset\", self._caplog.text)\n\n    @pytest.mark.integration\n    def test_offline_dataset_module_factory_with_capital_letters_in_name(self):\n        repo_id = SAMPLE_DATASET_CAPITAL_LETTERS_IN_NAME\n        builder = load_dataset_builder(repo_id, cache_dir=self.cache_dir)\n        
builder.download_and_prepare()\n        for offline_simulation_mode in list(OfflineSimulationMode):\n            with offline(offline_simulation_mode):\n                self._caplog.clear()\n                # allow provide the repo id without an explicit path to remote or local actual file\n                dataset_module = datasets.load.dataset_module_factory(repo_id, cache_dir=self.cache_dir)\n                self.assertEqual(dataset_module.module_path, \"datasets.packaged_modules.cache.cache\")\n                self.assertIn(\"Using the latest cached version of the dataset\", self._caplog.text)\n\n    def test_load_dataset_from_hub(self):\n        with self.assertRaises(DatasetNotFoundError) as context:\n            datasets.load_dataset(\"_dummy\")\n        self.assertIn(\n            \"Dataset '_dummy' doesn't exist on the Hub\",\n            str(context.exception),\n        )\n        with self.assertRaises(DatasetNotFoundError) as context:\n            datasets.load_dataset(\"HuggingFaceFW/fineweb-edu\", revision=\"0.0.0\")\n        self.assertIn(\n            \"Revision '0.0.0' doesn't exist for dataset 'HuggingFaceFW/fineweb-edu' on the Hub.\",\n            str(context.exception),\n        )\n        for offline_simulation_mode in list(OfflineSimulationMode):\n            with offline(offline_simulation_mode):\n                with self.assertRaises(ConnectionError) as context:\n                    datasets.load_dataset(\"_dummy\")\n                if offline_simulation_mode != OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1:\n                    self.assertIn(\n                        \"Couldn't reach '_dummy' on the Hub\",\n                        str(context.exception),\n                    )\n\n    @pytest.mark.integration\n    def test_load_dataset_invalid_revision_with_cache(self):\n        repo_id = SAMPLE_DATASET_IDENTIFIER2\n        builder = load_dataset_builder(repo_id, cache_dir=self.cache_dir)\n        builder.download_and_prepare()\n        with 
self.assertRaises(DatasetNotFoundError) as context:\n            datasets.load_dataset(repo_id, revision=\"invalid_revision\", cache_dir=self.cache_dir)\n        self.assertIn(\n            \"Revision 'invalid_revision' doesn't exist for dataset\",\n            str(context.exception),\n        )\n\n    def test_load_dataset_namespace(self):\n        with self.assertRaises(DatasetNotFoundError) as context:\n            datasets.load_dataset(\"hf-internal-testing/_dummy\")\n        self.assertIn(\"hf-internal-testing/_dummy\", str(context.exception))\n        for offline_simulation_mode in list(OfflineSimulationMode):\n            with offline(offline_simulation_mode):\n                with self.assertRaises(ConnectionError) as context:\n                    datasets.load_dataset(\"hf-internal-testing/_dummy\")\n                self.assertIn(\"hf-internal-testing/_dummy\", str(context.exception), msg=offline_simulation_mode)\n\n\n@pytest.mark.integration\ndef test_load_dataset_builder_with_metadata():\n    builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER4)\n    assert isinstance(builder, ImageFolder)\n    assert builder.config.name == \"default\"\n    assert builder.config.data_files is not None\n    assert builder.config.drop_metadata is None\n    with pytest.raises(ValueError):\n        builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER4, \"non-existing-config\")\n\n\n@pytest.mark.integration\ndef test_load_dataset_builder_config_kwargs_passed_as_arguments():\n    builder_default = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER4)\n    builder_custom = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER4, drop_metadata=True)\n    assert builder_custom.config.drop_metadata != builder_default.config.drop_metadata\n    assert builder_custom.config.drop_metadata is True\n\n\ndef test_load_dataset_builder_config_kwargs_override_builder_kwargs():\n    class DummyBuilder:\n        def __init__(self, **kwargs):\n            
self.kwargs = kwargs\n\n        # make sure that the builder is not trying to use the legacy cache dir\n        def _use_legacy_cache_dir_if_possible(self, dataset_module):\n            pass\n\n    dataset_module = SimpleNamespace(\n        builder_kwargs={\"base_path\": \"from_builder\"},\n        builder_configs_parameters=SimpleNamespace(\n            default_config_name=\"default\",\n            builder_configs=[SimpleNamespace(data_files={\"train\": [\"dummy.txt\"]})],\n        ),\n        dataset_infos={},\n        hash=\"dummy_hash\",\n    )\n\n    with (\n        patch(\"datasets.load.dataset_module_factory\", return_value=dataset_module),\n        patch(\"datasets.load.get_dataset_builder_class\", return_value=DummyBuilder),\n    ):\n        builder = datasets.load_dataset_builder(\"dummy/path\", base_path=\"from_user\")\n\n    assert builder.kwargs[\"base_path\"] == \"from_user\"\n\n\n@pytest.mark.integration\ndef test_load_dataset_builder_with_two_configs_in_metadata():\n    builder = datasets.load_dataset_builder(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, \"v1\")\n    assert isinstance(builder, AudioFolder)\n    assert builder.config.name == \"v1\"\n    assert builder.config.data_files is not None\n    with pytest.raises(ValueError):\n        datasets.load_dataset_builder(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA)\n    with pytest.raises(ValueError):\n        datasets.load_dataset_builder(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, \"non-existing-config\")\n\n\n@pytest.mark.parametrize(\"serializer\", [pickle, dill])\ndef test_load_dataset_builder_with_metadata_configs_pickable(serializer):\n    builder = datasets.load_dataset_builder(SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA)\n    builder_unpickled = serializer.loads(serializer.dumps(builder))\n    assert builder.BUILDER_CONFIGS == builder_unpickled.BUILDER_CONFIGS\n    assert list(builder_unpickled.builder_configs) == [\"custom\"]\n    assert isinstance(builder_unpickled.builder_configs[\"custom\"], 
AudioFolderConfig)\n\n    builder2 = datasets.load_dataset_builder(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, \"v1\")\n    builder2_unpickled = serializer.loads(serializer.dumps(builder2))\n    assert builder2.BUILDER_CONFIGS == builder2_unpickled.BUILDER_CONFIGS != builder_unpickled.BUILDER_CONFIGS\n    assert list(builder2_unpickled.builder_configs) == [\"v1\", \"v2\"]\n    assert isinstance(builder2_unpickled.builder_configs[\"v1\"], AudioFolderConfig)\n    assert isinstance(builder2_unpickled.builder_configs[\"v2\"], AudioFolderConfig)\n\n\ndef test_load_dataset_builder_for_absolute_data_dir(complex_data_dir):\n    builder = datasets.load_dataset_builder(complex_data_dir)\n    assert isinstance(builder, DatasetBuilder)\n    assert builder.name == \"text\"\n    assert builder.dataset_name == Path(complex_data_dir).name\n    assert builder.config.name == \"default\"\n    assert isinstance(builder.config.data_files, DataFilesDict)\n    assert len(builder.config.data_files[\"train\"]) > 0\n    assert len(builder.config.data_files[\"test\"]) > 0\n\n\ndef test_load_dataset_builder_for_relative_data_dir(complex_data_dir):\n    with set_current_working_directory_to_temp_dir():\n        relative_data_dir = \"relative_data_dir\"\n        shutil.copytree(complex_data_dir, relative_data_dir)\n        builder = datasets.load_dataset_builder(relative_data_dir)\n        assert isinstance(builder, DatasetBuilder)\n        assert builder.name == \"text\"\n        assert builder.dataset_name == relative_data_dir\n        assert builder.config.name == \"default\"\n        assert isinstance(builder.config.data_files, DataFilesDict)\n        assert len(builder.config.data_files[\"train\"]) > 0\n        assert len(builder.config.data_files[\"test\"]) > 0\n\n\n@pytest.mark.integration\ndef test_load_dataset_builder_for_community_dataset():\n    builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER2)\n    assert isinstance(builder, DatasetBuilder)\n    assert builder.name == 
\"text\"\n    assert builder.dataset_name == SAMPLE_DATASET_IDENTIFIER2.split(\"/\")[-1]\n    assert builder.config.name == \"default\"\n    assert isinstance(builder.config.data_files, DataFilesDict)\n    assert len(builder.config.data_files[\"train\"]) > 0\n    assert len(builder.config.data_files[\"test\"]) > 0\n\n\ndef test_load_dataset_builder_fail():\n    with pytest.raises(DatasetNotFoundError):\n        datasets.load_dataset_builder(\"blabla\")\n\n\n@pytest.mark.integration\n@pytest.mark.parametrize(\n    \"kwargs, expected_train_num_rows, expected_test_num_rows\",\n    [\n        ({}, 2, 2),\n        ({\"data_dir\": \"data1\"}, 1, 1),  # GH-6918: NonMatchingSplitsSizesError\n        ({\"data_files\": \"data1/train.txt\"}, 1, None),  # GH-6939: ExpectedMoreSplits\n    ],\n)\ndef test_load_dataset_from_hub(kwargs, expected_train_num_rows, expected_test_num_rows):\n    dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER3, **kwargs)\n    assert dataset[\"train\"].num_rows == expected_train_num_rows\n    assert (dataset[\"test\"].num_rows == expected_test_num_rows) if expected_test_num_rows else (\"test\" not in dataset)\n\n\n@pytest.mark.integration\n@pytest.mark.parametrize(\"stream_from_cache, \", [False, True])\ndef test_load_dataset_cached_from_hub(stream_from_cache, caplog):\n    dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER3)\n    assert isinstance(dataset, DatasetDict)\n    assert all(isinstance(d, Dataset) for d in dataset.values())\n    assert len(dataset) == 2\n    assert isinstance(next(iter(dataset[\"train\"])), dict)\n    for offline_simulation_mode in list(OfflineSimulationMode):\n        with offline(offline_simulation_mode):\n            caplog.clear()\n            # Load dataset from cache\n            dataset = datasets.load_dataset(SAMPLE_DATASET_IDENTIFIER3, streaming=stream_from_cache)\n            assert len(dataset) == 2\n            assert \"Using the latest cached version of the dataset\" in caplog.text\n            assert 
isinstance(next(iter(dataset[\"train\"])), dict)\n    with pytest.raises(DatasetNotFoundError) as exc_info:\n        datasets.load_dataset(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST)\n    assert f\"Dataset '{SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST}' doesn't exist on the Hub\" in str(exc_info.value)\n\n\ndef test_load_dataset_streaming_gz_json(jsonl_gz_path):\n    data_files = jsonl_gz_path\n    ds = load_dataset(\"json\", split=\"train\", data_files=data_files, streaming=True)\n    assert isinstance(ds, IterableDataset)\n    ds_item = next(iter(ds))\n    assert ds_item == {\"col_1\": \"0\", \"col_2\": 0, \"col_3\": 0.0}\n\n\n@pytest.mark.integration\n@pytest.mark.parametrize(\n    \"path\",\n    [\n        \"sample.jsonl\",\n        \"sample.jsonl.gz\",\n        \"sample.tar\",\n        \"sample.jsonl.xz\",\n        \"sample.zip\",\n        pytest.param(\"sample.jsonl.zst\", marks=require_zstandard),\n    ],\n)\ndef test_load_dataset_streaming_compressed_files(path):\n    repo_id = \"hf-internal-testing/compressed_files\"\n    data_files = f\"https://huggingface.co/datasets/{repo_id}/resolve/main/{path}\"\n    if data_files[-3:] in (\"zip\", \"tar\"):  # we need to glob \"*\" inside archives\n        data_files = data_files[-3:] + \"://*::\" + data_files\n    ds = load_dataset(\"json\", split=\"train\", data_files=data_files, streaming=True)\n    assert isinstance(ds, IterableDataset)\n    ds_item = next(iter(ds))\n    assert ds_item == {\n        \"tokens\": [\"Ministeri\", \"de\", \"Justícia\", \"d'Espanya\"],\n        \"ner_tags\": [1, 2, 2, 2],\n        \"langs\": [\"ca\", \"ca\", \"ca\", \"ca\"],\n        \"spans\": [\"PER: Ministeri de Justícia d'Espanya\"],\n    }\n\n\n@pytest.mark.parametrize(\"path_extension\", [\"csv\", \"csv.bz2\"])\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef test_load_dataset_streaming_csv(path_extension, streaming, csv_path, bz2_csv_path):\n    paths = {\"csv\": csv_path, \"csv.bz2\": bz2_csv_path}\n    data_files = 
str(paths[path_extension])\n    features = Features({\"col_1\": Value(\"string\"), \"col_2\": Value(\"int32\"), \"col_3\": Value(\"float32\")})\n    ds = load_dataset(\"csv\", split=\"train\", data_files=data_files, features=features, streaming=streaming)\n    assert isinstance(ds, IterableDataset if streaming else Dataset)\n    ds_item = next(iter(ds))\n    assert ds_item == {\"col_1\": \"0\", \"col_2\": 0, \"col_3\": 0.0}\n\n\n@pytest.mark.parametrize(\"streaming\", [False, True])\n@pytest.mark.parametrize(\"data_file\", [\"zip_csv_path\", \"zip_csv_with_dir_path\", \"csv_path\"])\ndef test_load_dataset_zip_csv(data_file, streaming, zip_csv_path, zip_csv_with_dir_path, csv_path):\n    data_file_paths = {\n        \"zip_csv_path\": zip_csv_path,\n        \"zip_csv_with_dir_path\": zip_csv_with_dir_path,\n        \"csv_path\": csv_path,\n    }\n    data_files = str(data_file_paths[data_file])\n    expected_size = 8 if data_file.startswith(\"zip\") else 4\n    features = Features({\"col_1\": Value(\"string\"), \"col_2\": Value(\"int32\"), \"col_3\": Value(\"float32\")})\n    ds = load_dataset(\"csv\", split=\"train\", data_files=data_files, features=features, streaming=streaming)\n    if streaming:\n        ds_item_counter = 0\n        for ds_item in ds:\n            if ds_item_counter == 0:\n                assert ds_item == {\"col_1\": \"0\", \"col_2\": 0, \"col_3\": 0.0}\n            ds_item_counter += 1\n        assert ds_item_counter == expected_size\n    else:\n        assert ds.shape[0] == expected_size\n        ds_item = next(iter(ds))\n        assert ds_item == {\"col_1\": \"0\", \"col_2\": 0, \"col_3\": 0.0}\n\n\n@pytest.mark.parametrize(\"streaming\", [False, True])\n@pytest.mark.parametrize(\"data_file\", [\"zip_jsonl_path\", \"zip_jsonl_with_dir_path\", \"jsonl_path\"])\ndef test_load_dataset_zip_jsonl(data_file, streaming, zip_jsonl_path, zip_jsonl_with_dir_path, jsonl_path):\n    data_file_paths = {\n        \"zip_jsonl_path\": zip_jsonl_path,\n       
 \"zip_jsonl_with_dir_path\": zip_jsonl_with_dir_path,\n        \"jsonl_path\": jsonl_path,\n    }\n    data_files = str(data_file_paths[data_file])\n    expected_size = 8 if data_file.startswith(\"zip\") else 4\n    features = Features({\"col_1\": Value(\"string\"), \"col_2\": Value(\"int32\"), \"col_3\": Value(\"float32\")})\n    ds = load_dataset(\"json\", split=\"train\", data_files=data_files, features=features, streaming=streaming)\n    if streaming:\n        ds_item_counter = 0\n        for ds_item in ds:\n            if ds_item_counter == 0:\n                assert ds_item == {\"col_1\": \"0\", \"col_2\": 0, \"col_3\": 0.0}\n            ds_item_counter += 1\n        assert ds_item_counter == expected_size\n    else:\n        assert ds.shape[0] == expected_size\n        ds_item = next(iter(ds))\n        assert ds_item == {\"col_1\": \"0\", \"col_2\": 0, \"col_3\": 0.0}\n\n\n@pytest.mark.parametrize(\"streaming\", [False, True])\n@pytest.mark.parametrize(\"data_file\", [\"zip_text_path\", \"zip_text_with_dir_path\", \"text_path\"])\ndef test_load_dataset_zip_text(data_file, streaming, zip_text_path, zip_text_with_dir_path, text_path):\n    data_file_paths = {\n        \"zip_text_path\": zip_text_path,\n        \"zip_text_with_dir_path\": zip_text_with_dir_path,\n        \"text_path\": text_path,\n    }\n    data_files = str(data_file_paths[data_file])\n    expected_size = 8 if data_file.startswith(\"zip\") else 4\n    ds = load_dataset(\"text\", split=\"train\", data_files=data_files, streaming=streaming)\n    if streaming:\n        ds_item_counter = 0\n        for ds_item in ds:\n            if ds_item_counter == 0:\n                assert ds_item == {\"text\": \"0\"}\n            ds_item_counter += 1\n        assert ds_item_counter == expected_size\n    else:\n        assert ds.shape[0] == expected_size\n        ds_item = next(iter(ds))\n        assert ds_item == {\"text\": \"0\"}\n\n\n@pytest.mark.parametrize(\"streaming\", [False, True])\ndef 
test_load_dataset_arrow(streaming, data_dir_with_arrow):\n    ds = load_dataset(\"arrow\", split=\"train\", data_dir=data_dir_with_arrow, streaming=streaming)\n    expected_size = 10\n    if streaming:\n        ds_item_counter = 0\n        for ds_item in ds:\n            if ds_item_counter == 0:\n                assert ds_item == {\"col_1\": \"foo\"}\n            ds_item_counter += 1\n        assert ds_item_counter == 10\n    else:\n        assert ds.num_rows == 10\n        assert ds.shape[0] == expected_size\n        ds_item = next(iter(ds))\n        assert ds_item == {\"col_1\": \"foo\"}\n\n\ndef test_load_dataset_text_with_unicode_new_lines(text_path_with_unicode_new_lines):\n    data_files = str(text_path_with_unicode_new_lines)\n    ds = load_dataset(\"text\", split=\"train\", data_files=data_files)\n    assert ds.num_rows == 3\n\n\ndef test_load_dataset_with_unsupported_extensions(text_dir_with_unsupported_extension):\n    data_files = str(text_dir_with_unsupported_extension)\n    ds = load_dataset(\"text\", split=\"train\", data_files=data_files)\n    assert ds.num_rows == 4\n\n\n@pytest.mark.integration\ndef test_loading_from_the_datasets_hub_with_token():\n    class CustomException(Exception):\n        pass\n\n    with patch(\"huggingface_hub.file_download._get_metadata_or_catch_error\") as mock_request:\n        mock_request.side_effect = CustomException()\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            with pytest.raises(CustomException):\n                load_dataset(SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER, cache_dir=tmp_dir, token=\"foo\")\n        mock_request.assert_called_once()\n        assert mock_request.call_args_list[0][1][\"headers\"][\"authorization\"] == \"Bearer foo\"\n\n\n@pytest.mark.integration\ndef test_load_streaming_private_dataset(hf_token, hf_private_dataset_repo_txt_data):\n    ds = load_dataset(hf_private_dataset_repo_txt_data, streaming=True, token=hf_token)\n    assert next(iter(ds)) is not 
None\n\n\n@pytest.mark.integration\ndef test_load_dataset_builder_private_dataset(hf_token, hf_private_dataset_repo_txt_data):\n    builder = load_dataset_builder(hf_private_dataset_repo_txt_data, token=hf_token)\n    assert isinstance(builder, DatasetBuilder)\n\n\n@pytest.mark.integration\ndef test_load_streaming_private_dataset_with_zipped_data(hf_token, hf_private_dataset_repo_zipped_txt_data):\n    ds = load_dataset(hf_private_dataset_repo_zipped_txt_data, streaming=True, token=hf_token)\n    assert next(iter(ds)) is not None\n\n\n@require_pil\n@pytest.mark.integration\ndef test_load_dataset_config_kwargs_passed_as_arguments():\n    ds_default = load_dataset(SAMPLE_DATASET_IDENTIFIER4)\n    ds_custom = load_dataset(SAMPLE_DATASET_IDENTIFIER4, drop_metadata=True)\n    assert list(ds_default[\"train\"].features) == [\"image\", \"caption\"]\n    assert list(ds_custom[\"train\"].features) == [\"image\"]\n\n\n@require_torchcodec\n@pytest.mark.integration\ndef test_load_hub_dataset_with_single_config_in_metadata():\n    # load the same dataset but with no configurations (=with default parameters)\n    ds = load_dataset(SAMPLE_DATASET_NO_CONFIGS_IN_METADATA)\n    assert list(ds[\"train\"].features) == [\"audio\", \"label\"]  # assert label feature is here as expected by default\n    assert len(ds[\"train\"]) == 5 and len(ds[\"test\"]) == 4\n\n    ds2 = load_dataset(SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA)  # single config -> no need to specify it\n    assert list(ds2[\"train\"].features) == [\"audio\"]  # assert param `drop_labels=True` from metadata is passed\n    assert len(ds2[\"train\"]) == 3 and len(ds2[\"test\"]) == 3\n\n    ds3 = load_dataset(SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA, \"custom\")\n    assert list(ds3[\"train\"].features) == [\"audio\"]  # assert param `drop_labels=True` from metadata is passed\n    assert len(ds3[\"train\"]) == 3 and len(ds3[\"test\"]) == 3\n\n    with pytest.raises(ValueError):\n        # no config named \"default\"\n        _ 
= load_dataset(SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA, \"default\")\n\n\n@require_torchcodec\n@pytest.mark.integration\ndef test_load_hub_dataset_with_two_config_in_metadata():\n    ds = load_dataset(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, \"v1\")\n    assert list(ds[\"train\"].features) == [\"audio\"]  # assert param `drop_labels=True` from metadata is passed\n    assert len(ds[\"train\"]) == 3 and len(ds[\"test\"]) == 3\n\n    ds2 = load_dataset(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, \"v2\")\n    assert list(ds2[\"train\"].features) == [\n        \"audio\",\n        \"label\",\n    ]  # assert param `drop_labels=False` from metadata is passed\n    assert len(ds2[\"train\"]) == 2 and len(ds2[\"test\"]) == 1\n\n    with pytest.raises(ValueError):\n        # config is required but not specified\n        _ = load_dataset(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA)\n\n    with pytest.raises(ValueError):\n        # no config named \"default\"\n        _ = load_dataset(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, \"default\")\n\n    ds_with_default = load_dataset(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA_WITH_DEFAULT)\n    # it's a dataset with the same data but \"v1\" config is marked as a default one\n    assert list(ds_with_default[\"train\"].features) == list(ds[\"train\"].features)\n    assert len(ds_with_default[\"train\"]) == len(ds[\"train\"]) and len(ds_with_default[\"test\"]) == len(ds[\"test\"])\n\n\n@require_torchcodec\n@pytest.mark.integration\ndef test_load_hub_dataset_with_metadata_config_in_parallel():\n    # assert it doesn't fail (pickling of dynamically created class works)\n    ds = load_dataset(SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA, num_proc=2)\n    assert \"label\" not in ds[\"train\"].features  # assert param `drop_labels=True` from metadata is passed\n    assert len(ds[\"train\"]) == 3 and len(ds[\"test\"]) == 3\n\n    ds = load_dataset(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, \"v1\", num_proc=2)\n    assert \"label\" not in ds[\"train\"].features  # assert 
param `drop_labels=True` from metadata is passed\n    assert len(ds[\"train\"]) == 3 and len(ds[\"test\"]) == 3\n\n    ds = load_dataset(SAMPLE_DATASET_TWO_CONFIG_IN_METADATA, \"v2\", num_proc=2)\n    assert \"label\" in ds[\"train\"].features\n    assert len(ds[\"train\"]) == 2 and len(ds[\"test\"]) == 1\n\n\n@require_pil\n@pytest.mark.integration\n@pytest.mark.parametrize(\"streaming\", [True])\ndef test_load_dataset_private_zipped_images(hf_private_dataset_repo_zipped_img_data, hf_token, streaming):\n    ds = load_dataset(hf_private_dataset_repo_zipped_img_data, split=\"train\", streaming=streaming, token=hf_token)\n    assert isinstance(ds, IterableDataset if streaming else Dataset)\n    ds_items = list(ds)\n    assert len(ds_items) == 2\n\n\ndef test_load_dataset_then_move_then_reload(data_dir, tmp_path, caplog):\n    cache_dir1 = tmp_path / \"cache1\"\n    cache_dir2 = tmp_path / \"cache2\"\n    dataset = load_dataset(data_dir, split=\"train\", cache_dir=cache_dir1, trust_remote_code=True)\n    fingerprint1 = dataset._fingerprint\n    del dataset\n    os.rename(cache_dir1, cache_dir2)\n    caplog.clear()\n    with caplog.at_level(INFO, logger=get_logger().name):\n        dataset = load_dataset(data_dir, split=\"train\", cache_dir=cache_dir2)\n    assert \"Found cached dataset\" in caplog.text\n    assert dataset._fingerprint == fingerprint1, \"for the caching mechanism to work, fingerprint should stay the same\"\n    dataset = load_dataset(data_dir, split=\"test\", cache_dir=cache_dir2)\n    assert dataset._fingerprint != fingerprint1\n\n\ndef test_load_dataset_builder_then_edit_then_load_again(tmp_path: Path):\n    dataset_dir = tmp_path / \"test_load_dataset_then_edit_then_load_again\"\n    dataset_dir.mkdir()\n    with open(dataset_dir / \"train.txt\", \"w\") as f:\n        f.write(\"Hello there\")\n    dataset_builder = load_dataset_builder(str(dataset_dir))\n    with open(dataset_dir / \"train.txt\", \"w\") as f:\n        f.write(\"General Kenobi !\")\n  
  edited_dataset_builder = load_dataset_builder(str(dataset_dir))\n    assert dataset_builder.cache_dir != edited_dataset_builder.cache_dir\n\n\n@pytest.mark.parametrize(\"max_in_memory_dataset_size\", [\"default\", 0, 50, 500])\ndef test_load_dataset_local_with_default_in_memory(max_in_memory_dataset_size, data_dir, monkeypatch):\n    current_dataset_size = 148\n    if max_in_memory_dataset_size == \"default\":\n        max_in_memory_dataset_size = 0  # default\n    else:\n        monkeypatch.setattr(datasets.config, \"IN_MEMORY_MAX_SIZE\", max_in_memory_dataset_size)\n    if max_in_memory_dataset_size:\n        expected_in_memory = current_dataset_size < max_in_memory_dataset_size\n    else:\n        expected_in_memory = False\n\n    with assert_arrow_memory_increases() if expected_in_memory else assert_arrow_memory_doesnt_increase():\n        dataset = load_dataset(data_dir)\n    assert (dataset[\"train\"].dataset_size < max_in_memory_dataset_size) is expected_in_memory\n\n\n@pytest.mark.integration\ndef test_remote_data_files():\n    repo_id = \"hf-internal-testing/raw_jsonl\"\n    filename = \"wikiann-bn-validation.jsonl\"\n    data_files = f\"https://huggingface.co/datasets/{repo_id}/resolve/main/{filename}\"\n    ds = load_dataset(\"json\", split=\"train\", data_files=data_files, streaming=True)\n    assert isinstance(ds, IterableDataset)\n    ds_item = next(iter(ds))\n    assert ds_item.keys() == {\"langs\", \"ner_tags\", \"spans\", \"tokens\"}\n\n\ndef distributed_load_dataset(args):\n    data_name, tmp_dir, datafiles = args\n    dataset = load_dataset(data_name, cache_dir=tmp_dir, data_files=datafiles)\n    return dataset\n\n\ndef test_load_dataset_distributed(tmp_path, csv_path):\n    num_workers = 5\n    args = \"csv\", str(tmp_path), csv_path\n    with Pool(processes=num_workers) as pool:  # start num_workers processes\n        datasets = pool.map(distributed_load_dataset, [args] * num_workers)\n        assert len(datasets) == num_workers\n        
assert all(len(dataset) == len(datasets[0]) > 0 for dataset in datasets)\n        assert len(datasets[0].cache_files) > 0\n        assert all(dataset.cache_files == datasets[0].cache_files for dataset in datasets)\n\n\ndef test_load_dataset_with_storage_options(mockfs):\n    with mockfs.open(\"data.txt\", \"w\") as f:\n        f.write(\"Hello there\\n\")\n        f.write(\"General Kenobi !\")\n    data_files = {\"train\": [\"mock://data.txt\"]}\n    ds = load_dataset(\"text\", data_files=data_files, storage_options=mockfs.storage_options)\n    assert list(ds[\"train\"]) == [{\"text\": \"Hello there\"}, {\"text\": \"General Kenobi !\"}]\n\n\n@require_pil\ndef test_load_dataset_with_storage_options_with_decoding(mockfs, image_file):\n    import PIL.Image\n\n    filename = os.path.basename(image_file)\n    with mockfs.open(filename, \"wb\") as fout:\n        with open(image_file, \"rb\") as fin:\n            fout.write(fin.read())\n    data_files = {\"train\": [\"mock://\" + filename]}\n    ds = load_dataset(\"imagefolder\", data_files=data_files, storage_options=mockfs.storage_options)\n    assert len(ds[\"train\"]) == 1\n    assert isinstance(ds[\"train\"][0][\"image\"], PIL.Image.Image)\n\n\ndef test_load_dataset_with_zip(zip_csv_path):\n    path = str(zip_csv_path.parent)\n    ds = load_dataset(path)\n    assert list(ds.keys()) == [\"train\"]\n    assert ds[\"train\"].column_names == [\"col_1\", \"col_2\", \"col_3\"]\n    assert ds[\"train\"].num_rows == 8\n    assert ds[\"train\"][0] == {\"col_1\": 0, \"col_2\": 0, \"col_3\": 0.0}\n\n\n@pytest.mark.integration\ndef test_reload_old_cache_from_2_15(tmp_path: Path):\n    cache_dir = tmp_path / \"test_reload_old_cache_from_2_15\"\n    builder_cache_dir = (\n        cache_dir / \"polinaeterna___audiofolder_two_configs_in_metadata/v2-374bfde4f55442bc/0.0.0/7896925d64deea5d\"\n    )\n    builder_cache_dir.mkdir(parents=True)\n    arrow_path = builder_cache_dir / \"audiofolder_two_configs_in_metadata-train.arrow\"\n    
dataset_info_path = builder_cache_dir / \"dataset_info.json\"\n    with dataset_info_path.open(\"w\") as f:\n        f.write(\"{}\")\n    arrow_path.touch()\n    builder = load_dataset_builder(\n        \"polinaeterna/audiofolder_two_configs_in_metadata\",\n        \"v2\",\n        data_files=\"v2/train/*\",\n        cache_dir=cache_dir.as_posix(),\n    )\n    assert builder.cache_dir == builder_cache_dir.as_posix()  # old cache from 2.15\n\n    builder = load_dataset_builder(\n        \"polinaeterna/audiofolder_two_configs_in_metadata\", \"v2\", cache_dir=cache_dir.as_posix()\n    )\n    assert (\n        builder.cache_dir\n        == (\n            cache_dir / \"polinaeterna___audiofolder_two_configs_in_metadata\" / \"v2\" / \"0.0.0\" / str(builder.hash)\n        ).as_posix()\n    )  # new cache\n\n\n@pytest.mark.integration\ndef test_update_dataset_card_data_with_standalone_yaml():\n    # Labels defined in .huggingface.yml because they are too long to be in README.md\n    from datasets.utils.metadata import MetadataConfigs\n\n    with patch(\n        \"datasets.utils.metadata.MetadataConfigs.from_dataset_card_data\",\n        side_effect=MetadataConfigs.from_dataset_card_data,\n    ) as card_data_read_mock:\n        builder = load_dataset_builder(\"datasets-maintainers/dataset-with-standalone-yaml\")\n    assert card_data_read_mock.call_args.args[0][\"license\"] is not None  # from README.md\n    assert card_data_read_mock.call_args.args[0][\"dataset_info\"] is not None  # from standalone yaml\n    assert card_data_read_mock.call_args.args[0][\"tags\"] == [\"test\"]  # standalone yaml has precedence\n    assert isinstance(\n        builder.info.features[\"label\"], datasets.ClassLabel\n    )  # correctly loaded from long labels list in standalone yaml\n"
  },
  {
    "path": "tests/test_metadata_util.py",
    "content": "import re\nimport sys\nimport tempfile\nimport unittest\nfrom pathlib import Path\n\nimport pytest\nimport yaml\nfrom huggingface_hub import DatasetCard, DatasetCardData\n\nfrom datasets.config import METADATA_CONFIGS_FIELD\nfrom datasets.features import Features, Value\nfrom datasets.info import DatasetInfo\nfrom datasets.utils.metadata import MetadataConfigs\n\n\ndef _dedent(string: str) -> str:\n    indent_level = min(re.search(\"^ +\", t).end() if t.startswith(\" \") else 0 for t in string.splitlines())\n    return \"\\n\".join([line[indent_level:] for line in string.splitlines() if indent_level < len(line)])\n\n\nREADME_YAML = \"\"\"\\\n---\nlanguage:\n- zh\n- en\ntask_ids:\n- sentiment-classification\n---\n# Begin of markdown\n\nSome cool dataset card\n\"\"\"\n\nREADME_EMPTY_YAML = \"\"\"\\\n---\n---\n# Begin of markdown\n\nSome cool dataset card\n\"\"\"\n\n\nREADME_NO_YAML = \"\"\"\\\n# Begin of markdown\n\nSome cool dataset card\n\"\"\"\n\n\nREADME_METADATA_CONFIG_INCORRECT_FORMAT = f\"\"\"\\\n---\n{METADATA_CONFIGS_FIELD}:\n  data_dir: v1\n  drop_labels: true\n---\n\"\"\"\n\n\nREADME_METADATA_SINGLE_CONFIG = f\"\"\"\\\n---\n{METADATA_CONFIGS_FIELD}:\n  - config_name: custom\n    data_dir: v1\n    drop_labels: true\n---\n\"\"\"\n\n\nREADME_METADATA_TWO_CONFIGS_WITH_DEFAULT_FLAG = f\"\"\"\\\n---\n{METADATA_CONFIGS_FIELD}:\n  - config_name: v1\n    data_dir: v1\n    drop_labels: true\n  - config_name: v2\n    data_dir: v2\n    drop_labels: false\n    default: true\n---\n\"\"\"\n\n\nREADME_METADATA_TWO_CONFIGS_WITH_DEFAULT_NAME = f\"\"\"\\\n---\n{METADATA_CONFIGS_FIELD}:\n  - config_name: custom\n    data_dir: custom\n    drop_labels: true\n  - config_name: default\n    data_dir: data\n    drop_labels: false\n---\n\"\"\"\n\n\nREADME_METADATA_WITH_FEATURES = f\"\"\"\\\n---\n{METADATA_CONFIGS_FIELD}:\n  - config_name: default\n    features:\n      - name: id\n        dtype: int64\n      - name:  name\n        dtype: string\n      - name: score\n  
      dtype: float64\n---\n\"\"\"\n\n\nEXPECTED_METADATA_SINGLE_CONFIG = {\"custom\": {\"data_dir\": \"v1\", \"drop_labels\": True}}\nEXPECTED_METADATA_TWO_CONFIGS_DEFAULT_FLAG = {\n    \"v1\": {\"data_dir\": \"v1\", \"drop_labels\": True},\n    \"v2\": {\"data_dir\": \"v2\", \"drop_labels\": False, \"default\": True},\n}\nEXPECTED_METADATA_TWO_CONFIGS_DEFAULT_NAME = {\n    \"custom\": {\"data_dir\": \"custom\", \"drop_labels\": True},\n    \"default\": {\"data_dir\": \"data\", \"drop_labels\": False},\n}\nEXPECTED_METADATA_WITH_FEATURES = {\n    \"default\": {\n        \"features\": Features(\n            {\"id\": Value(dtype=\"int64\"), \"name\": Value(dtype=\"string\"), \"score\": Value(dtype=\"float64\")}\n        )\n    }\n}\n\n\n@pytest.fixture\ndef data_dir_with_two_subdirs(tmp_path):\n    data_dir = tmp_path / \"data_dir_with_two_configs_in_metadata\"\n    cats_data_dir = data_dir / \"cats\"\n    cats_data_dir.mkdir(parents=True)\n    dogs_data_dir = data_dir / \"dogs\"\n    dogs_data_dir.mkdir(parents=True)\n\n    with open(cats_data_dir / \"cat.jpg\", \"wb\") as f:\n        f.write(b\"this_is_a_cat_image_bytes\")\n    with open(dogs_data_dir / \"dog.jpg\", \"wb\") as f:\n        f.write(b\"this_is_a_dog_image_bytes\")\n\n    return str(data_dir)\n\n\nclass TestMetadataUtils(unittest.TestCase):\n    def test_metadata_dict_from_readme(self):\n        with tempfile.TemporaryDirectory() as tmp_dir:\n            path = Path(tmp_dir) / \"README.md\"\n            with open(path, \"w+\") as readme_file:\n                readme_file.write(README_YAML)\n            dataset_card_data = DatasetCard.load(path).data\n            self.assertDictEqual(\n                dataset_card_data.to_dict(), {\"language\": [\"zh\", \"en\"], \"task_ids\": [\"sentiment-classification\"]}\n            )\n\n            with open(path, \"w+\") as readme_file:\n                readme_file.write(README_EMPTY_YAML)\n            if (\n                sys.platform != \"win32\"\n            
):  # there is a bug on windows, see https://github.com/huggingface/huggingface_hub/issues/1546\n                dataset_card_data = DatasetCard.load(path).data\n                self.assertDictEqual(dataset_card_data.to_dict(), {})\n\n            with open(path, \"w+\") as readme_file:\n                readme_file.write(README_NO_YAML)\n            dataset_card_data = DatasetCard.load(path).data\n            self.assertEqual(dataset_card_data.to_dict(), {})\n\n    def test_from_yaml_string(self):\n        valid_yaml_string = _dedent(\n            \"\"\"\\\n            annotations_creators:\n            - found\n            language_creators:\n            - found\n            language:\n            - en\n            license:\n            - unknown\n            multilinguality:\n            - monolingual\n            pretty_name: Test Dataset\n            size_categories:\n            - 10K<n<100K\n            source_datasets:\n            - extended|other-yahoo-webscope-l6\n            task_categories:\n            - question-answering\n            task_ids:\n            - open-domain-qa\n            \"\"\"\n        )\n        assert DatasetCardData(**yaml.safe_load(valid_yaml_string)).to_dict()\n\n        valid_yaml_with_optional_keys = _dedent(\n            \"\"\"\\\n            annotations_creators:\n            - found\n            language_creators:\n            - found\n            language:\n            - en\n            license:\n            - unknown\n            multilinguality:\n            - monolingual\n            pretty_name: Test Dataset\n            size_categories:\n            - 10K<n<100K\n            source_datasets:\n            - extended|other-yahoo-webscope-l6\n            task_categories:\n            - text-classification\n            task_ids:\n            - multi-class-classification\n            paperswithcode_id:\n            - squad\n            configs:\n            - en\n            train-eval-index:\n            - config: en\n      
        task: text-classification\n              task_id: multi_class_classification\n              splits:\n                train_split: train\n                eval_split: test\n              col_mapping:\n                text: text\n                label: target\n              metrics:\n                - type: accuracy\n                  name: Accuracy\n            extra_gated_prompt: |\n              By clicking on “Access repository” below, you also agree to ImageNet Terms of Access:\n              [RESEARCHER_FULLNAME] (the \"Researcher\") has requested permission to use the ImageNet database (the \"Database\") at Princeton University and Stanford University. In exchange for such permission, Researcher hereby agrees to the following terms and conditions:\n              1. Researcher shall use the Database only for non-commercial research and educational purposes.\n            extra_gated_fields:\n              Company: text\n              Country: text\n              I agree to use this model for non-commerical use ONLY: checkbox\n            \"\"\"\n        )\n        assert DatasetCardData(**yaml.safe_load(valid_yaml_with_optional_keys)).to_dict()\n\n\n@pytest.mark.parametrize(\n    \"readme_content, expected_metadata_configs_dict, expected_default_config_name\",\n    [\n        (README_METADATA_SINGLE_CONFIG, EXPECTED_METADATA_SINGLE_CONFIG, \"custom\"),\n        (README_METADATA_TWO_CONFIGS_WITH_DEFAULT_FLAG, EXPECTED_METADATA_TWO_CONFIGS_DEFAULT_FLAG, \"v2\"),\n        (README_METADATA_TWO_CONFIGS_WITH_DEFAULT_NAME, EXPECTED_METADATA_TWO_CONFIGS_DEFAULT_NAME, \"default\"),\n        (README_METADATA_WITH_FEATURES, EXPECTED_METADATA_WITH_FEATURES, \"default\"),\n    ],\n)\ndef test_metadata_configs_dataset_card_data(\n    readme_content, expected_metadata_configs_dict, expected_default_config_name\n):\n    with tempfile.TemporaryDirectory() as tmp_dir:\n        path = Path(tmp_dir) / \"README.md\"\n        with open(path, \"w+\") as readme_file:\n           
 readme_file.write(readme_content)\n        dataset_card_data = DatasetCard.load(path).data\n        metadata_configs_dict = MetadataConfigs.from_dataset_card_data(dataset_card_data)\n        assert metadata_configs_dict == expected_metadata_configs_dict\n        assert metadata_configs_dict.get_default_config_name() == expected_default_config_name\n\n\ndef test_metadata_configs_incorrect_yaml():\n    with tempfile.TemporaryDirectory() as tmp_dir:\n        path = Path(tmp_dir) / \"README.md\"\n        with open(path, \"w+\") as readme_file:\n            readme_file.write(README_METADATA_CONFIG_INCORRECT_FORMAT)\n        dataset_card_data = DatasetCard.load(path).data\n        with pytest.raises(ValueError):\n            _ = MetadataConfigs.from_dataset_card_data(dataset_card_data)\n\n\ndef test_split_order_in_metadata_configs_from_exported_parquet_files_and_dataset_infos():\n    exported_parquet_files = [\n        {\n            \"dataset\": \"AI-Lab-Makerere/beans\",\n            \"config\": \"default\",\n            \"split\": \"test\",\n            \"url\": \"https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/refs%2Fconvert%2Fparquet/default/test/0000.parquet\",\n            \"filename\": \"0000.parquet\",\n            \"size\": 17707203,\n        },\n        {\n            \"dataset\": \"AI-Lab-Makerere/beans\",\n            \"config\": \"default\",\n            \"split\": \"train\",\n            \"url\": \"https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet\",\n            \"filename\": \"0000.parquet\",\n            \"size\": 143780164,\n        },\n        {\n            \"dataset\": \"AI-Lab-Makerere/beans\",\n            \"config\": \"default\",\n            \"split\": \"validation\",\n            \"url\": \"https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/refs%2Fconvert%2Fparquet/default/validation/0000.parquet\",\n            \"filename\": \"0000.parquet\",\n            
\"size\": 18500862,\n        },\n    ]\n    dataset_infos = {\n        \"default\": DatasetInfo(\n            dataset_name=\"AI-Lab-Makerere/beans\",\n            config_name=\"default\",\n            version=\"0.0.0\",\n            splits={\n                \"train\": {\n                    \"name\": \"train\",\n                    \"num_bytes\": 143996486,\n                    \"num_examples\": 1034,\n                    \"shard_lengths\": None,\n                    \"dataset_name\": \"AI-Lab-Makerere/beans\",\n                },\n                \"validation\": {\n                    \"name\": \"validation\",\n                    \"num_bytes\": 18525985,\n                    \"num_examples\": 133,\n                    \"shard_lengths\": None,\n                    \"dataset_name\": \"AI-Lab-Makerere/beans\",\n                },\n                \"test\": {\n                    \"name\": \"test\",\n                    \"num_bytes\": 17730506,\n                    \"num_examples\": 128,\n                    \"shard_lengths\": None,\n                    \"dataset_name\": \"AI-Lab-Makerere/beans\",\n                },\n            },\n            download_checksums={\n                \"https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/main/data/train.zip\": {\n                    \"num_bytes\": 143812152,\n                    \"checksum\": None,\n                },\n                \"https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/main/data/validation.zip\": {\n                    \"num_bytes\": 18504213,\n                    \"checksum\": None,\n                },\n                \"https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/main/data/test.zip\": {\n                    \"num_bytes\": 17708541,\n                    \"checksum\": None,\n                },\n            },\n            download_size=180024906,\n            post_processing_size=None,\n            dataset_size=180252977,\n            size_in_bytes=360277883,\n  
      )\n    }\n    metadata_configs = MetadataConfigs._from_exported_parquet_files_and_dataset_infos(\n        \"123\", exported_parquet_files, dataset_infos\n    )\n    split_names = [data_file[\"split\"] for data_file in metadata_configs[\"default\"][\"data_files\"]]\n    assert split_names == [\"train\", \"validation\", \"test\"]\n"
  },
  {
    "path": "tests/test_offline_util.py",
    "content": "from tempfile import NamedTemporaryFile\n\nimport httpx\nimport pytest\nimport requests\nfrom huggingface_hub import get_session\nfrom huggingface_hub.errors import OfflineModeIsEnabled\n\nfrom datasets.utils.file_utils import fsspec_get, fsspec_head\n\nfrom .utils import (\n    IS_HF_HUB_1_x,\n    OfflineSimulationMode,\n    RequestWouldHangIndefinitelyError,\n    offline,\n    require_not_windows,\n)\n\n\n@pytest.mark.integration\n@require_not_windows  # fsspec get keeps a file handle on windows that raises PermissionError\ndef test_offline_with_timeout():\n    expected_exception = httpx.ReadTimeout if IS_HF_HUB_1_x else requests.ConnectTimeout\n    with offline(OfflineSimulationMode.CONNECTION_TIMES_OUT):\n        with pytest.raises(RequestWouldHangIndefinitelyError):\n            get_session().request(\"GET\", \"https://huggingface.co\")\n\n        with pytest.raises(expected_exception):\n            get_session().request(\"GET\", \"https://huggingface.co\", timeout=1.0)\n\n        with pytest.raises(expected_exception), NamedTemporaryFile() as temp_file:\n            fsspec_get(\"hf://dummy\", temp_file=temp_file)\n\n\n@pytest.mark.integration\n@require_not_windows  # fsspec get keeps a file handle on windows that raises PermissionError\ndef test_offline_with_connection_error():\n    expected_exception = httpx.ConnectError if IS_HF_HUB_1_x else requests.ConnectionError\n    with offline(OfflineSimulationMode.CONNECTION_FAILS):\n        with pytest.raises(expected_exception):\n            get_session().request(\"GET\", \"https://huggingface.co\")\n\n        with pytest.raises(expected_exception), NamedTemporaryFile() as temp_file:\n            fsspec_get(\"hf://dummy\", temp_file=temp_file)\n\n\ndef test_offline_with_datasets_offline_mode_enabled():\n    with offline(OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1):\n        with pytest.raises(OfflineModeIsEnabled):\n            fsspec_head(\"hf://dummy\")\n        with 
pytest.raises(OfflineModeIsEnabled), NamedTemporaryFile() as temp_file:\n            fsspec_get(\"hf://dummy\", temp_file=temp_file)\n"
  },
  {
    "path": "tests/test_parallel.py",
    "content": "import pytest\n\nfrom datasets.parallel import ParallelBackendConfig, parallel_backend\nfrom datasets.utils.py_utils import map_nested\n\nfrom .utils import require_dill_gt_0_3_2, require_joblibspark, require_not_windows\n\n\ndef add_one(i):  # picklable for multiprocessing\n    return i + 1\n\n\n@require_dill_gt_0_3_2\n@require_joblibspark\n@require_not_windows\ndef test_parallel_backend_input():\n    with parallel_backend(\"spark\"):\n        assert ParallelBackendConfig.backend_name == \"spark\"\n\n    lst = [1, 2, 3]\n    with pytest.raises(ValueError):\n        with parallel_backend(\"unsupported backend\"):\n            map_nested(add_one, lst, num_proc=2)\n\n    with pytest.raises(ValueError):\n        with parallel_backend(\"unsupported backend\"):\n            map_nested(add_one, lst, num_proc=-1)\n\n\n@require_dill_gt_0_3_2\n@require_joblibspark\n@require_not_windows\n@pytest.mark.parametrize(\"num_proc\", [2, -1])\ndef test_parallel_backend_map_nested(num_proc):\n    s1 = [1, 2]\n    s2 = {\"a\": 1, \"b\": 2}\n    s3 = {\"a\": [1, 2], \"b\": [3, 4]}\n    s4 = {\"a\": {\"1\": 1}, \"b\": 2}\n    s5 = {\"a\": 1, \"b\": 2, \"c\": 3, \"d\": 4}\n    expected_map_nested_s1 = [2, 3]\n    expected_map_nested_s2 = {\"a\": 2, \"b\": 3}\n    expected_map_nested_s3 = {\"a\": [2, 3], \"b\": [4, 5]}\n    expected_map_nested_s4 = {\"a\": {\"1\": 2}, \"b\": 3}\n    expected_map_nested_s5 = {\"a\": 2, \"b\": 3, \"c\": 4, \"d\": 5}\n\n    with parallel_backend(\"spark\"):\n        assert map_nested(add_one, s1, num_proc=num_proc) == expected_map_nested_s1\n        assert map_nested(add_one, s2, num_proc=num_proc) == expected_map_nested_s2\n        assert map_nested(add_one, s3, num_proc=num_proc) == expected_map_nested_s3\n        assert map_nested(add_one, s4, num_proc=num_proc) == expected_map_nested_s4\n        assert map_nested(add_one, s5, num_proc=num_proc) == expected_map_nested_s5\n"
  },
  {
    "path": "tests/test_patching.py",
    "content": "from datasets.utils.patching import _PatchedModuleObj, patch_submodule\n\nfrom . import _test_patching\n\n\ndef test_patch_submodule():\n    import os as original_os\n    from os import path as original_path\n    from os import rename as original_rename\n    from os.path import dirname as original_dirname\n    from os.path import join as original_join\n\n    assert _test_patching.os is original_os\n    assert _test_patching.path is original_path\n    assert _test_patching.join is original_join\n\n    assert _test_patching.renamed_os is original_os\n    assert _test_patching.renamed_path is original_path\n    assert _test_patching.renamed_join is original_join\n\n    mock = \"__test_patch_submodule_mock__\"\n    with patch_submodule(_test_patching, \"os.path.join\", mock):\n        # Every way to access os.path.join must be patched, and the rest must stay untouched\n\n        # check os.path.join\n        assert isinstance(_test_patching.os, _PatchedModuleObj)\n        assert isinstance(_test_patching.os.path, _PatchedModuleObj)\n        assert _test_patching.os.path.join is mock\n\n        # check path.join\n        assert isinstance(_test_patching.path, _PatchedModuleObj)\n        assert _test_patching.path.join is mock\n\n        # check join\n        assert _test_patching.join is mock\n\n        # check that the other attributes are untouched\n        assert _test_patching.os.rename is original_rename\n        assert _test_patching.path.dirname is original_dirname\n        assert _test_patching.os.path.dirname is original_dirname\n\n        # Even renamed modules or objects must be patched\n\n        # check renamed_os.path.join\n        assert isinstance(_test_patching.renamed_os, _PatchedModuleObj)\n        assert isinstance(_test_patching.renamed_os.path, _PatchedModuleObj)\n        assert _test_patching.renamed_os.path.join is mock\n\n        # check renamed_path.join\n        assert isinstance(_test_patching.renamed_path, 
_PatchedModuleObj)\n        assert _test_patching.renamed_path.join is mock\n\n        # check renamed_join\n        assert _test_patching.renamed_join is mock\n\n        # check that the other attributes are untouched\n        assert _test_patching.renamed_os.rename is original_rename\n        assert _test_patching.renamed_path.dirname is original_dirname\n        assert _test_patching.renamed_os.path.dirname is original_dirname\n\n    # check that everything is back to normal when the patch is over\n\n    assert _test_patching.os is original_os\n    assert _test_patching.path is original_path\n    assert _test_patching.join is original_join\n\n    assert _test_patching.renamed_os is original_os\n    assert _test_patching.renamed_path is original_path\n    assert _test_patching.renamed_join is original_join\n\n\ndef test_patch_submodule_builtin():\n    assert _test_patching.open is open\n\n    mock = \"__test_patch_submodule_builtin_mock__\"\n    # _test_patching has \"open\" in its globals\n    assert _test_patching.open is open\n    with patch_submodule(_test_patching, \"open\", mock):\n        assert _test_patching.open is mock\n\n    # check that everything is back to normal when the patch is over\n\n    assert _test_patching.open is open\n\n\ndef test_patch_submodule_missing():\n    # pandas.read_csv is not present in _test_patching\n    mock = \"__test_patch_submodule_missing_mock__\"\n    with patch_submodule(_test_patching, \"pandas.read_csv\", mock):\n        pass\n\n\ndef test_patch_submodule_missing_builtin():\n    # builtin should always be mocked even if they're not in the globals\n    # in case they're loaded at one point\n    mock = \"__test_patch_submodule_missing_builtin_mock__\"\n    # _test_patching doesn't have \"len\" in its globals\n    assert getattr(_test_patching, \"len\", None) is None\n    with patch_submodule(_test_patching, \"len\", mock):\n        assert _test_patching.len is mock\n    assert _test_patching.len is len\n\n\ndef 
test_patch_submodule_start_and_stop():\n    mock = \"__test_patch_submodule_start_and_stop_mock__\"\n    patch = patch_submodule(_test_patching, \"open\", mock)\n    assert _test_patching.open is open\n    patch.start()\n    assert _test_patching.open is mock\n    patch.stop()\n    assert _test_patching.open is open\n\n\ndef test_patch_submodule_successive():\n    from os import rename as original_rename\n    from os.path import dirname as original_dirname\n    from os.path import join as original_join\n\n    mock_join = \"__test_patch_submodule_successive_join__\"\n    mock_dirname = \"__test_patch_submodule_successive_dirname__\"\n    mock_rename = \"__test_patch_submodule_successive_rename__\"\n    assert _test_patching.os.path.join is original_join\n    assert _test_patching.os.path.dirname is original_dirname\n    assert _test_patching.os.rename is original_rename\n\n    with patch_submodule(_test_patching, \"os.path.join\", mock_join):\n        with patch_submodule(_test_patching, \"os.rename\", mock_rename):\n            with patch_submodule(_test_patching, \"os.path.dirname\", mock_dirname):\n                assert _test_patching.os.path.join is mock_join\n                assert _test_patching.os.path.dirname is mock_dirname\n                assert _test_patching.os.rename is mock_rename\n\n    # try another order\n    with patch_submodule(_test_patching, \"os.rename\", mock_rename):\n        with patch_submodule(_test_patching, \"os.path.join\", mock_join):\n            with patch_submodule(_test_patching, \"os.path.dirname\", mock_dirname):\n                assert _test_patching.os.path.join is mock_join\n                assert _test_patching.os.path.dirname is mock_dirname\n                assert _test_patching.os.rename is mock_rename\n\n    assert _test_patching.os.path.join is original_join\n    assert _test_patching.os.path.dirname is original_dirname\n    assert _test_patching.os.rename is original_rename\n\n\ndef 
test_patch_submodule_doesnt_exist():\n    mock = \"__test_patch_submodule_doesnt_exist_mock__\"\n    with patch_submodule(_test_patching, \"__module_that_doesn_exist__.__attribute_that_doesn_exist__\", mock):\n        pass\n    with patch_submodule(_test_patching, \"os.__attribute_that_doesn_exist__\", mock):\n        pass\n"
  },
  {
    "path": "tests/test_py_utils.py",
    "content": "import os\nimport pickle\nimport time\nfrom dataclasses import dataclass\nfrom multiprocessing import Pool\nfrom unittest import TestCase\nfrom unittest.mock import patch\n\nimport multiprocess\nimport numpy as np\nimport pytest\n\nfrom datasets.utils.py_utils import (\n    NestedDataStructure,\n    asdict,\n    iflatmap_unordered,\n    map_nested,\n    string_to_dict,\n    temp_seed,\n    temporary_assignment,\n    zip_dict,\n)\n\nfrom .utils import require_numpy1_on_windows, require_tf, require_torch\n\n\ndef np_sum(x):  # picklable for multiprocessing\n    return x.sum()\n\n\ndef add_one(i):  # picklable for multiprocessing\n    return i + 1\n\n\ndef add_one_to_batch(batch):  # picklable for multiprocessing\n    return [i + 1 for i in batch]\n\n\n@dataclass\nclass A:\n    x: int\n    y: str\n\n\n@pytest.mark.parametrize(\"batched, function\", [(False, add_one), (True, add_one_to_batch)])\n@pytest.mark.parametrize(\"num_proc\", [None, 2])\n@pytest.mark.parametrize(\n    \"data_struct, expected_result\",\n    [\n        ({}, {}),\n        ([], []),\n        (1, 2),\n        ([1, 2], [2, 3]),\n        ({\"a\": 1, \"b\": 2}, {\"a\": 2, \"b\": 3}),\n        ({\"a\": [1, 2], \"b\": [3, 4]}, {\"a\": [2, 3], \"b\": [4, 5]}),\n        ({\"a\": {\"1\": 1}, \"b\": {\"2\": 2}}, {\"a\": {\"1\": 2}, \"b\": {\"2\": 3}}),\n        ({\"a\": 1, \"b\": [2, 3], \"c\": {\"1\": 4}}, {\"a\": 2, \"b\": [3, 4], \"c\": {\"1\": 5}}),\n        ({\"a\": 1, \"b\": 2, \"c\": 3, \"d\": 4}, {\"a\": 2, \"b\": 3, \"c\": 4, \"d\": 5}),\n    ],\n)\ndef test_map_nested(data_struct, expected_result, num_proc, batched, function):\n    assert map_nested(function, data_struct, num_proc=num_proc, batched=batched) == expected_result\n\n\nclass PyUtilsTest(TestCase):\n    def test_map_nested(self):\n        num_proc = 2\n        sn1 = {\"a\": np.eye(2), \"b\": np.zeros(3), \"c\": np.ones(2)}\n        expected_map_nested_sn1_sum = {\"a\": 2, \"b\": 0, \"c\": 2}\n        
expected_map_nested_sn1_int = {\n            \"a\": np.eye(2).astype(int),\n            \"b\": np.zeros(3).astype(int),\n            \"c\": np.ones(2).astype(int),\n        }\n        self.assertEqual(map_nested(np_sum, sn1, map_numpy=False), expected_map_nested_sn1_sum)\n        self.assertEqual(\n            {k: v.tolist() for k, v in map_nested(int, sn1, map_numpy=True).items()},\n            {k: v.tolist() for k, v in expected_map_nested_sn1_int.items()},\n        )\n        self.assertEqual(map_nested(np_sum, sn1, map_numpy=False, num_proc=num_proc), expected_map_nested_sn1_sum)\n        self.assertEqual(\n            {k: v.tolist() for k, v in map_nested(int, sn1, map_numpy=True, num_proc=num_proc).items()},\n            {k: v.tolist() for k, v in expected_map_nested_sn1_int.items()},\n        )\n        with self.assertRaises((AttributeError, pickle.PicklingError)):  # can't pickle a local lambda\n            map_nested(lambda x: x + 1, sn1, num_proc=num_proc)\n\n    def test_zip_dict(self):\n        d1 = {\"a\": 1, \"b\": 2}\n        d2 = {\"a\": 3, \"b\": 4}\n        d3 = {\"a\": 5, \"b\": 6}\n        expected_zip_dict_result = sorted([(\"a\", (1, 3, 5)), (\"b\", (2, 4, 6))])\n        self.assertEqual(sorted(zip_dict(d1, d2, d3)), expected_zip_dict_result)\n\n    def test_temporary_assignment(self):\n        class Foo:\n            my_attr = \"bar\"\n\n        foo = Foo()\n        self.assertEqual(foo.my_attr, \"bar\")\n        with temporary_assignment(foo, \"my_attr\", \"BAR\"):\n            self.assertEqual(foo.my_attr, \"BAR\")\n        self.assertEqual(foo.my_attr, \"bar\")\n\n\n@pytest.mark.parametrize(\n    \"iterable_length, num_proc, expected_num_proc\",\n    [\n        (1, None, 1),\n        (1, 1, 1),\n        (2, None, 1),\n        (2, 1, 1),\n        (2, 2, 1),\n        (2, 3, 1),\n        (3, 2, 1),\n        (16, 16, 16),\n        (16, 17, 16),\n        (17, 16, 16),\n    ],\n)\ndef test_map_nested_num_proc(iterable_length, num_proc, 
expected_num_proc):\n    with (\n        patch(\"datasets.utils.py_utils._single_map_nested\") as mock_single_map_nested,\n        patch(\"datasets.parallel.parallel.Pool\") as mock_multiprocessing_pool,\n    ):\n        data_struct = {f\"{i}\": i for i in range(iterable_length)}\n        _ = map_nested(lambda x: x + 10, data_struct, num_proc=num_proc, parallel_min_length=16)\n        if expected_num_proc == 1:\n            assert mock_single_map_nested.called\n            assert not mock_multiprocessing_pool.called\n        else:\n            assert not mock_single_map_nested.called\n            assert mock_multiprocessing_pool.called\n            assert mock_multiprocessing_pool.call_args[0][0] == expected_num_proc\n\n\nclass TempSeedTest(TestCase):\n    @require_tf\n    def test_tensorflow(self):\n        import tensorflow as tf\n        from tensorflow.keras import layers\n\n        model = layers.Dense(2)\n\n        def gen_random_output():\n            x = tf.random.uniform((1, 3))\n            return model(x).numpy()\n\n        with temp_seed(42, set_tensorflow=True):\n            out1 = gen_random_output()\n        with temp_seed(42, set_tensorflow=True):\n            out2 = gen_random_output()\n        out3 = gen_random_output()\n\n        np.testing.assert_equal(out1, out2)\n        self.assertGreater(np.abs(out1 - out3).sum(), 0)\n\n    @require_numpy1_on_windows\n    @require_torch\n    def test_torch(self):\n        import torch\n\n        def gen_random_output():\n            model = torch.nn.Linear(3, 2)\n            x = torch.rand(1, 3)\n            return model(x).detach().numpy()\n\n        with temp_seed(42, set_pytorch=True):\n            out1 = gen_random_output()\n        with temp_seed(42, set_pytorch=True):\n            out2 = gen_random_output()\n        out3 = gen_random_output()\n\n        np.testing.assert_equal(out1, out2)\n        self.assertGreater(np.abs(out1 - out3).sum(), 0)\n\n    def test_numpy(self):\n        def 
gen_random_output():\n            return np.random.rand(1, 3)\n\n        with temp_seed(42):\n            out1 = gen_random_output()\n        with temp_seed(42):\n            out2 = gen_random_output()\n        out3 = gen_random_output()\n\n        np.testing.assert_equal(out1, out2)\n        self.assertGreater(np.abs(out1 - out3).sum(), 0)\n\n\n@pytest.mark.parametrize(\"input_data\", [{}])\ndef test_nested_data_structure_data(input_data):\n    output_data = NestedDataStructure(input_data).data\n    assert output_data == input_data\n\n\n@pytest.mark.parametrize(\n    \"data, expected_output\",\n    [\n        ({}, []),\n        ([], []),\n        (\"foo\", [\"foo\"]),\n        ([\"foo\", \"bar\"], [\"foo\", \"bar\"]),\n        ([[\"foo\", \"bar\"]], [\"foo\", \"bar\"]),\n        ([[[\"foo\"], [\"bar\"]]], [\"foo\", \"bar\"]),\n        ([[[\"foo\"], \"bar\"]], [\"foo\", \"bar\"]),\n        ({\"a\": 1, \"b\": 2}, [1, 2]),\n        ({\"a\": [1, 2], \"b\": [3, 4]}, [1, 2, 3, 4]),\n        ({\"a\": [[1, 2]], \"b\": [[3, 4]]}, [1, 2, 3, 4]),\n        ({\"a\": [[1, 2]], \"b\": [3, 4]}, [1, 2, 3, 4]),\n        ({\"a\": [[[1], [2]]], \"b\": [[[3], [4]]]}, [1, 2, 3, 4]),\n        ({\"a\": [[[1], [2]]], \"b\": [[3, 4]]}, [1, 2, 3, 4]),\n        ({\"a\": [[[1], [2]]], \"b\": [3, 4]}, [1, 2, 3, 4]),\n        ({\"a\": [[[1], [2]]], \"b\": [3, [4]]}, [1, 2, 3, 4]),\n        ({\"a\": {\"1\": 1}, \"b\": 2}, [1, 2]),\n        ({\"a\": {\"1\": [1]}, \"b\": 2}, [1, 2]),\n        ({\"a\": {\"1\": [1]}, \"b\": [2]}, [1, 2]),\n    ],\n)\ndef test_flatten(data, expected_output):\n    output = NestedDataStructure(data).flatten()\n    assert output == expected_output\n\n\ndef test_asdict():\n    input = A(x=1, y=\"foobar\")\n    expected_output = {\"x\": 1, \"y\": \"foobar\"}\n    assert asdict(input) == expected_output\n\n    input = {\"a\": {\"b\": A(x=10, y=\"foo\")}, \"c\": [A(x=20, y=\"bar\")]}\n    expected_output = {\"a\": {\"b\": {\"x\": 10, \"y\": \"foo\"}}, \"c\": [{\"x\": 20, 
\"y\": \"bar\"}]}\n    assert asdict(input) == expected_output\n\n    with pytest.raises(TypeError):\n        asdict([1, A(x=10, y=\"foo\")])\n\n\ndef _split_text(text: str):\n    return text.split()\n\n\ndef _2seconds_generator_of_2items_with_timing(content):\n    yield (time.time(), content)\n    time.sleep(2)\n    yield (time.time(), content)\n\n\ndef test_iflatmap_unordered():\n    with Pool(2) as pool:\n        out = list(iflatmap_unordered(pool, _split_text, kwargs_iterable=[{\"text\": \"hello there\"}] * 10))\n        assert out.count(\"hello\") == 10\n        assert out.count(\"there\") == 10\n        assert len(out) == 20\n\n    # check multiprocess from pathos (uses dill for pickling)\n    with multiprocess.Pool(2) as pool:\n        out = list(iflatmap_unordered(pool, _split_text, kwargs_iterable=[{\"text\": \"hello there\"}] * 10))\n        assert out.count(\"hello\") == 10\n        assert out.count(\"there\") == 10\n        assert len(out) == 20\n\n    # check that we get items as fast as possible\n    with Pool(2) as pool:\n        out = []\n        for yield_time, content in iflatmap_unordered(\n            pool, _2seconds_generator_of_2items_with_timing, kwargs_iterable=[{\"content\": \"a\"}, {\"content\": \"b\"}]\n        ):\n            assert yield_time < time.time() + 0.1, \"we should each item directly after it was yielded\"\n            out.append(content)\n        assert out.count(\"a\") == 2\n        assert out.count(\"b\") == 2\n        assert len(out) == 4\n\n\ndef test_string_to_dict():\n    file_name = \"dataset/cache-3b163736cf4505085d8b5f9b4c266c26.arrow\"\n    file_name_prefix, file_name_ext = os.path.splitext(file_name)\n\n    suffix_template = \"_{rank:05d}_of_{num_proc:05d}\"\n    cache_file_name_pattern = file_name_prefix + suffix_template + file_name_ext\n\n    file_name_parts = string_to_dict(file_name, cache_file_name_pattern)\n    assert file_name_parts is None\n\n    rank = 1\n    num_proc = 2\n    file_name = file_name_prefix 
+ suffix_template.format(rank=rank, num_proc=num_proc) + file_name_ext\n    file_name_parts = string_to_dict(file_name, cache_file_name_pattern)\n    assert file_name_parts is not None\n    assert file_name_parts == {\"rank\": f\"{rank:05d}\", \"num_proc\": f\"{num_proc:05d}\"}\n"
  },
  {
    "path": "tests/test_search.py",
    "content": "import os\nimport tempfile\nfrom functools import partial\nfrom unittest import TestCase\nfrom unittest.mock import patch\n\nimport numpy as np\nimport pytest\n\nfrom datasets.arrow_dataset import Dataset\nfrom datasets.search import ElasticSearchIndex, FaissIndex, MissingIndex\n\nfrom .utils import require_elasticsearch, require_faiss\n\n\npytestmark = pytest.mark.integration\n\n\n@require_faiss\nclass IndexableDatasetTest(TestCase):\n    def _create_dummy_dataset(self):\n        dset = Dataset.from_dict({\"filename\": [\"my_name-train\" + \"_\" + str(x) for x in np.arange(30).tolist()]})\n        return dset\n\n    def test_add_faiss_index(self):\n        import faiss\n\n        dset: Dataset = self._create_dummy_dataset()\n        dset = dset.map(\n            lambda ex, i: {\"vecs\": i * np.ones(5, dtype=np.float32)}, with_indices=True, keep_in_memory=True\n        )\n        dset = dset.add_faiss_index(\"vecs\", batch_size=100, metric_type=faiss.METRIC_INNER_PRODUCT)\n        scores, examples = dset.get_nearest_examples(\"vecs\", np.ones(5, dtype=np.float32))\n        self.assertEqual(examples[\"filename\"][0], \"my_name-train_29\")\n        dset.drop_index(\"vecs\")\n\n    def test_add_faiss_index_errors(self):\n        import faiss\n\n        dset: Dataset = self._create_dummy_dataset()\n        with pytest.raises(ValueError, match=\"Wrong feature type for column 'filename'\"):\n            _ = dset.add_faiss_index(\"filename\", batch_size=100, metric_type=faiss.METRIC_INNER_PRODUCT)\n\n    def test_add_faiss_index_from_external_arrays(self):\n        import faiss\n\n        dset: Dataset = self._create_dummy_dataset()\n        dset.add_faiss_index_from_external_arrays(\n            external_arrays=np.ones((30, 5)) * np.arange(30).reshape(-1, 1),\n            index_name=\"vecs\",\n            batch_size=100,\n            metric_type=faiss.METRIC_INNER_PRODUCT,\n        )\n        scores, examples = dset.get_nearest_examples(\"vecs\", 
np.ones(5, dtype=np.float32))\n        self.assertEqual(examples[\"filename\"][0], \"my_name-train_29\")\n\n    def test_serialization(self):\n        import faiss\n\n        dset: Dataset = self._create_dummy_dataset()\n        dset.add_faiss_index_from_external_arrays(\n            external_arrays=np.ones((30, 5)) * np.arange(30).reshape(-1, 1),\n            index_name=\"vecs\",\n            metric_type=faiss.METRIC_INNER_PRODUCT,\n        )\n\n        # Setting delete=False and unlinking manually is not pretty... but it is required on Windows to\n        # ensure somewhat stable behaviour. If we don't, we get PermissionErrors. This is an age-old issue.\n        # see https://bugs.python.org/issue14243 and\n        # https://stackoverflow.com/questions/23212435/permission-denied-to-write-to-my-temporary-file/23212515\n        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:\n            dset.save_faiss_index(\"vecs\", tmp_file.name)\n            dset.load_faiss_index(\"vecs2\", tmp_file.name)\n        os.unlink(tmp_file.name)\n\n        scores, examples = dset.get_nearest_examples(\"vecs2\", np.ones(5, dtype=np.float32))\n        self.assertEqual(examples[\"filename\"][0], \"my_name-train_29\")\n\n    def test_drop_index(self):\n        dset: Dataset = self._create_dummy_dataset()\n        dset.add_faiss_index_from_external_arrays(\n            external_arrays=np.ones((30, 5)) * np.arange(30).reshape(-1, 1), index_name=\"vecs\"\n        )\n        dset.drop_index(\"vecs\")\n        self.assertRaises(MissingIndex, partial(dset.get_nearest_examples, \"vecs2\", np.ones(5, dtype=np.float32)))\n\n    def test_add_elasticsearch_index(self):\n        from elasticsearch import Elasticsearch\n\n        dset: Dataset = self._create_dummy_dataset()\n        with (\n            patch(\"elasticsearch.Elasticsearch.search\") as mocked_search,\n            patch(\"elasticsearch.client.IndicesClient.create\") as mocked_index_create,\n            
patch(\"elasticsearch.helpers.streaming_bulk\") as mocked_bulk,\n        ):\n            mocked_index_create.return_value = {\"acknowledged\": True}\n            mocked_bulk.return_value([(True, None)] * 30)\n            mocked_search.return_value = {\"hits\": {\"hits\": [{\"_score\": 1, \"_id\": 29}]}}\n            es_client = Elasticsearch()\n\n            dset.add_elasticsearch_index(\"filename\", es_client=es_client)\n            scores, examples = dset.get_nearest_examples(\"filename\", \"my_name-train_29\")\n            self.assertEqual(examples[\"filename\"][0], \"my_name-train_29\")\n\n\n@require_faiss\nclass FaissIndexTest(TestCase):\n    def test_flat_ip(self):\n        import faiss\n\n        index = FaissIndex(metric_type=faiss.METRIC_INNER_PRODUCT)\n\n        # add vectors\n        index.add_vectors(np.eye(5, dtype=np.float32))\n        self.assertIsNotNone(index.faiss_index)\n        self.assertEqual(index.faiss_index.ntotal, 5)\n        index.add_vectors(np.zeros((5, 5), dtype=np.float32))\n        self.assertEqual(index.faiss_index.ntotal, 10)\n\n        # single query\n        query = np.zeros(5, dtype=np.float32)\n        query[1] = 1\n        scores, indices = index.search(query)\n        self.assertRaises(ValueError, index.search, query.reshape(-1, 1))\n        self.assertGreater(scores[0], 0)\n        self.assertEqual(indices[0], 1)\n\n        # batched queries\n        queries = np.eye(5, dtype=np.float32)[::-1]\n        total_scores, total_indices = index.search_batch(queries)\n        self.assertRaises(ValueError, index.search_batch, queries[0])\n        best_scores = [scores[0] for scores in total_scores]\n        best_indices = [indices[0] for indices in total_indices]\n        self.assertGreater(np.min(best_scores), 0)\n        self.assertListEqual([4, 3, 2, 1, 0], best_indices)\n\n    def test_factory(self):\n        import faiss\n\n        index = FaissIndex(string_factory=\"Flat\")\n        index.add_vectors(np.eye(5, 
dtype=np.float32))\n        self.assertIsInstance(index.faiss_index, faiss.IndexFlat)\n        index = FaissIndex(string_factory=\"LSH\")\n        index.add_vectors(np.eye(5, dtype=np.float32))\n        self.assertIsInstance(index.faiss_index, faiss.IndexLSH)\n        with self.assertRaises(ValueError):\n            _ = FaissIndex(string_factory=\"Flat\", custom_index=faiss.IndexFlat(5))\n\n    def test_custom(self):\n        import faiss\n\n        custom_index = faiss.IndexFlat(5)\n        index = FaissIndex(custom_index=custom_index)\n        index.add_vectors(np.eye(5, dtype=np.float32))\n        self.assertIsInstance(index.faiss_index, faiss.IndexFlat)\n\n    def test_serialization(self):\n        import faiss\n\n        index = FaissIndex(metric_type=faiss.METRIC_INNER_PRODUCT)\n        index.add_vectors(np.eye(5, dtype=np.float32))\n\n        # Setting delete=False and unlinking manually is not pretty... but it is required on Windows to\n        # ensure somewhat stable behaviour. If we don't, we get PermissionErrors. 
This is an age-old issue.\n        # see https://bugs.python.org/issue14243 and\n        # https://stackoverflow.com/questions/23212435/permission-denied-to-write-to-my-temporary-file/23212515\n        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:\n            index.save(tmp_file.name)\n            index = FaissIndex.load(tmp_file.name)\n        os.unlink(tmp_file.name)\n\n        query = np.zeros(5, dtype=np.float32)\n        query[1] = 1\n        scores, indices = index.search(query)\n        self.assertGreater(scores[0], 0)\n        self.assertEqual(indices[0], 1)\n\n\n@require_faiss\ndef test_serialization_fs(mockfs):\n    import faiss\n\n    index = FaissIndex(metric_type=faiss.METRIC_INNER_PRODUCT)\n    index.add_vectors(np.eye(5, dtype=np.float32))\n\n    index_name = \"index.faiss\"\n    path = f\"mock://{index_name}\"\n    index.save(path, storage_options=mockfs.storage_options)\n    index = FaissIndex.load(path, storage_options=mockfs.storage_options)\n\n    query = np.zeros(5, dtype=np.float32)\n    query[1] = 1\n    scores, indices = index.search(query)\n    assert scores[0] > 0\n    assert indices[0] == 1\n\n\n@require_elasticsearch\nclass ElasticSearchIndexTest(TestCase):\n    def test_elasticsearch(self):\n        from elasticsearch import Elasticsearch\n\n        with (\n            patch(\"elasticsearch.Elasticsearch.search\") as mocked_search,\n            patch(\"elasticsearch.client.IndicesClient.create\") as mocked_index_create,\n            patch(\"elasticsearch.helpers.streaming_bulk\") as mocked_bulk,\n        ):\n            es_client = Elasticsearch()\n            mocked_index_create.return_value = {\"acknowledged\": True}\n            index = ElasticSearchIndex(es_client=es_client)\n            mocked_bulk.return_value([(True, None)] * 3)\n            index.add_documents([\"foo\", \"bar\", \"foobar\"])\n\n            # single query\n            query = \"foo\"\n            mocked_search.return_value = {\"hits\": {\"hits\": 
[{\"_score\": 1, \"_id\": 0}]}}\n            scores, indices = index.search(query)\n            self.assertEqual(scores[0], 1)\n            self.assertEqual(indices[0], 0)\n\n            # single query with timeout\n            query = \"foo\"\n            mocked_search.return_value = {\"hits\": {\"hits\": [{\"_score\": 1, \"_id\": 0}]}}\n            scores, indices = index.search(query, request_timeout=30)\n            self.assertEqual(scores[0], 1)\n            self.assertEqual(indices[0], 0)\n\n            # batched queries\n            queries = [\"foo\", \"bar\", \"foobar\"]\n            mocked_search.return_value = {\"hits\": {\"hits\": [{\"_score\": 1, \"_id\": 1}]}}\n            total_scores, total_indices = index.search_batch(queries)\n            best_scores = [scores[0] for scores in total_scores]\n            best_indices = [indices[0] for indices in total_indices]\n            self.assertGreater(np.min(best_scores), 0)\n            self.assertListEqual([1, 1, 1], best_indices)\n\n            # batched queries with timeout\n            queries = [\"foo\", \"bar\", \"foobar\"]\n            mocked_search.return_value = {\"hits\": {\"hits\": [{\"_score\": 1, \"_id\": 1}]}}\n            total_scores, total_indices = index.search_batch(queries, request_timeout=30)\n            best_scores = [scores[0] for scores in total_scores]\n            best_indices = [indices[0] for indices in total_indices]\n            self.assertGreater(np.min(best_scores), 0)\n            self.assertListEqual([1, 1, 1], best_indices)\n"
  },
  {
    "path": "tests/test_sharding_utils.py",
    "content": "import pytest\n\nfrom datasets.utils.sharding import _distribute_shards, _number_of_shards_in_gen_kwargs, _split_gen_kwargs\n\n\n@pytest.mark.parametrize(\n    \"kwargs, expected\",\n    [\n        ({\"num_shards\": 0, \"max_num_jobs\": 1}, []),\n        ({\"num_shards\": 10, \"max_num_jobs\": 1}, [range(10)]),\n        ({\"num_shards\": 10, \"max_num_jobs\": 10}, [range(i, i + 1) for i in range(10)]),\n        ({\"num_shards\": 1, \"max_num_jobs\": 10}, [range(1)]),\n        ({\"num_shards\": 10, \"max_num_jobs\": 3}, [range(0, 4), range(4, 7), range(7, 10)]),\n        ({\"num_shards\": 3, \"max_num_jobs\": 10}, [range(0, 1), range(1, 2), range(2, 3)]),\n    ],\n)\ndef test_distribute_shards(kwargs, expected):\n    out = _distribute_shards(**kwargs)\n    assert out == expected\n\n\n@pytest.mark.parametrize(\n    \"gen_kwargs, max_num_jobs, expected\",\n    [\n        ({\"foo\": 0}, 10, [{\"foo\": 0}]),\n        ({\"shards\": [0, 1, 2, 3]}, 1, [{\"shards\": [0, 1, 2, 3]}]),\n        ({\"shards\": [0, 1, 2, 3]}, 4, [{\"shards\": [0]}, {\"shards\": [1]}, {\"shards\": [2]}, {\"shards\": [3]}]),\n        ({\"shards\": [0, 1]}, 4, [{\"shards\": [0]}, {\"shards\": [1]}]),\n        ({\"shards\": [0, 1, 2, 3]}, 2, [{\"shards\": [0, 1]}, {\"shards\": [2, 3]}]),\n    ],\n)\ndef test_split_gen_kwargs(gen_kwargs, max_num_jobs, expected):\n    out = _split_gen_kwargs(gen_kwargs, max_num_jobs)\n    assert out == expected\n\n\n@pytest.mark.parametrize(\n    \"gen_kwargs, expected\",\n    [\n        ({\"foo\": 0}, 1),\n        ({\"shards\": [0]}, 1),\n        ({\"shards\": [0, 1, 2, 3]}, 4),\n        ({\"shards\": [0, 1, 2, 3], \"foo\": 0}, 4),\n        ({\"shards\": [0, 1, 2, 3], \"other\": (0, 1)}, 4),\n        ({\"shards\": [0, 1, 2, 3], \"shards2\": [0, 1]}, RuntimeError),\n    ],\n)\ndef test_number_of_shards_in_gen_kwargs(gen_kwargs, expected):\n    if expected is RuntimeError:\n        with pytest.raises(expected):\n            
_number_of_shards_in_gen_kwargs(gen_kwargs)\n    else:\n        out = _number_of_shards_in_gen_kwargs(gen_kwargs)\n        assert out == expected\n"
  },
  {
    "path": "tests/test_splits.py",
    "content": "import inspect\n\nimport pytest\n\nfrom datasets.splits import Split, SplitDict, SplitInfo\nfrom datasets.utils.py_utils import asdict\n\n\n@pytest.mark.parametrize(\n    \"split_dict\",\n    [\n        SplitDict(),\n        SplitDict({\"train\": SplitInfo(name=\"train\", num_bytes=1337, num_examples=42, dataset_name=\"my_dataset\")}),\n        SplitDict({\"train\": SplitInfo(name=\"train\", num_bytes=1337, num_examples=42)}),\n        SplitDict({\"train\": SplitInfo()}),\n    ],\n)\ndef test_split_dict_to_yaml_list(split_dict: SplitDict):\n    split_dict_yaml_list = split_dict._to_yaml_list()\n    assert len(split_dict_yaml_list) == len(split_dict)\n    reloaded = SplitDict._from_yaml_list(split_dict_yaml_list)\n    for split_name, split_info in split_dict.items():\n        # dataset_name field is deprecated, and is therefore not part of the YAML dump\n        split_info.dataset_name = None\n        # the split name of split_dict takes over the name of the split info object\n        split_info.name = split_name\n    assert split_dict == reloaded\n\n\n@pytest.mark.parametrize(\n    \"split_info\", [SplitInfo(), SplitInfo(dataset_name=None), SplitInfo(dataset_name=\"my_dataset\")]\n)\ndef test_split_dict_asdict_has_dataset_name(split_info):\n    # For backward compatibility, we need asdict(split_dict) to return split info dictionaries with the \"dataset_name\"\n    # field even if it's deprecated. This way old versions of `datasets` can still reload dataset_infos.json files\n    split_dict_asdict = asdict(SplitDict({\"train\": split_info}))\n    assert \"dataset_name\" in split_dict_asdict[\"train\"]\n    assert split_dict_asdict[\"train\"][\"dataset_name\"] == split_info.dataset_name\n\n\ndef test_named_split_inequality():\n    # Used while building the docs, when set as a default parameter value in a function signature\n    assert Split.TRAIN != inspect.Parameter.empty\n"
  },
  {
    "path": "tests/test_streaming_download_manager.py",
    "content": "import json\nimport os\nfrom pathlib import Path\n\nimport pytest\n\nfrom datasets.download.streaming_download_manager import (\n    StreamingDownloadManager,\n    xbasename,\n    xglob,\n    xjoin,\n    xopen,\n)\nfrom datasets.filesystems import COMPRESSION_FILESYSTEMS\n\nfrom .utils import require_lz4, require_zstandard, slow\n\n\nTEST_GG_DRIVE_FILENAME = \"train.tsv\"\nTEST_GG_DRIVE_URL = \"https://drive.google.com/uc?export=download&id=17bOgBDc3hRCoPZ89EYtKDzK-yXAWat94\"\nTEST_GG_DRIVE_GZIPPED_URL = \"https://drive.google.com/uc?export=download&id=1Bt4Garpf0QLiwkJhHJzXaVa0I0H5Qhwz\"\nTEST_GG_DRIVE_ZIPPED_URL = \"https://drive.google.com/uc?export=download&id=1k92sUfpHxKq8PXWRr7Y5aNHXwOCNUmqh\"\nTEST_GG_DRIVE_CONTENT = \"\"\"\\\npokemon_name, type\nCharmander, fire\nSquirtle, water\nBulbasaur, grass\"\"\"\n\n\ndef test_streaming_dl_manager_download_dummy_path():\n    path = str(Path(__file__).resolve())\n    dl_manager = StreamingDownloadManager()\n    assert dl_manager.download(path) == path\n\n\ndef test_streaming_dl_manager_download_dummy_url():\n    url = \"https://f.oo/bar.txt\"\n    dl_manager = StreamingDownloadManager()\n    assert dl_manager.download(url) == url\n\n\n@pytest.mark.parametrize(\n    \"urlpath\",\n    [\n        \"zip://train-00000.tar.gz::https://foo.bar/data.zip\",\n        \"https://foo.bar/train.tar.gz\",\n        \"https://foo.bar/train.tgz\",\n        \"https://foo.bar/train.tar\",\n    ],\n)\ndef test_streaming_dl_manager_extract_throws(urlpath):\n    with pytest.raises(NotImplementedError):\n        _ = StreamingDownloadManager().extract(urlpath)\n\n\ndef test_streaming_dl_manager_download(text_path):\n    dl_manager = StreamingDownloadManager()\n    out = dl_manager.download(text_path)\n    assert out == text_path\n    with xopen(out, encoding=\"utf-8\") as f, open(text_path, encoding=\"utf-8\") as expected_file:\n        assert f.read() == expected_file.read()\n\n\ndef 
test_streaming_dl_manager_download_and_extract_no_extraction_dummy_path():\n    path = str(Path(__file__).resolve())\n    dl_manager = StreamingDownloadManager()\n    assert dl_manager.download_and_extract(path) == path\n\n\ndef test_streaming_dl_manager_download_and_extract_no_extraction_dummy_url():\n    url = \"https://f.oo/bar.txt\"\n    dl_manager = StreamingDownloadManager()\n    assert dl_manager.download_and_extract(url) == url\n\n\ndef test_streaming_dl_manager_extract(text_gz_path, text_path):\n    dl_manager = StreamingDownloadManager()\n    output_path = dl_manager.extract(text_gz_path)\n    path = os.path.basename(text_gz_path)\n    path = path[: path.rindex(\".\")]\n    assert output_path == f\"gzip://{path}::{text_gz_path}\"\n    fsspec_open_file = xopen(output_path, encoding=\"utf-8\")\n    with fsspec_open_file as f, open(text_path, encoding=\"utf-8\") as expected_file:\n        assert f.read() == expected_file.read()\n\n\n@pytest.mark.parametrize(\"archive_jsonl\", [\"tar_jsonl_path\", \"zip_jsonl_path\"])\ndef test_iter_files_in_archive(archive_jsonl, request):\n    archive_path = str(request.getfixturevalue(archive_jsonl))\n    protocol = \"tar\" if archive_path.endswith(\".tar\") else \"zip\"\n    files = list(StreamingDownloadManager().iter_files(f\"{protocol}://::{archive_path}\"))\n    assert sorted(xbasename(file) for file in files) == [\"dataset.jsonl\", \"dataset2.jsonl\"]\n    assert all(file.endswith(f\"::{archive_path}\") for file in files)\n\n\ndef test_streaming_dl_manager_download_and_extract_with_extraction(text_gz_path, text_path):\n    dl_manager = StreamingDownloadManager()\n    output_path = dl_manager.download_and_extract(text_gz_path)\n    path = os.path.basename(text_gz_path)\n    path = path[: path.rindex(\".\")]\n    assert output_path == f\"gzip://{path}::{text_gz_path}\"\n    fsspec_open_file = xopen(output_path, encoding=\"utf-8\")\n    with fsspec_open_file as f, open(text_path, encoding=\"utf-8\") as expected_file:\n  
      assert f.read() == expected_file.read()\n\n\n@pytest.mark.parametrize(\n    \"input_path, filename, expected_path\",\n    [(\"https://domain.org/archive.zip\", \"filename.jsonl\", \"zip://filename.jsonl::https://domain.org/archive.zip\")],\n)\ndef test_streaming_dl_manager_download_and_extract_with_join(input_path, filename, expected_path):\n    dl_manager = StreamingDownloadManager()\n    extracted_path = dl_manager.download_and_extract(input_path)\n    output_path = xjoin(extracted_path, filename)\n    assert output_path == expected_path\n\n\n@pytest.mark.parametrize(\"compression_fs_class\", COMPRESSION_FILESYSTEMS)\ndef test_streaming_dl_manager_extract_all_supported_single_file_compression_types(\n    compression_fs_class, gz_file, xz_file, zstd_file, bz2_file, lz4_file, text_file\n):\n    input_paths = {\"gzip\": gz_file, \"xz\": xz_file, \"zstd\": zstd_file, \"bz2\": bz2_file, \"lz4\": lz4_file}\n    input_path = input_paths[compression_fs_class.protocol]\n    if input_path is None:\n        reason = f\"for '{compression_fs_class.protocol}' compression protocol, \"\n        if compression_fs_class.protocol == \"lz4\":\n            reason += require_lz4.kwargs[\"reason\"]\n        elif compression_fs_class.protocol == \"zstd\":\n            reason += require_zstandard.kwargs[\"reason\"]\n        pytest.skip(reason)\n    dl_manager = StreamingDownloadManager()\n    output_path = dl_manager.extract(input_path)\n    path = os.path.basename(input_path)\n    path = path[: path.rindex(\".\")]\n    assert output_path == f\"{compression_fs_class.protocol}://{path}::{input_path}\"\n    fsspec_open_file = xopen(output_path, encoding=\"utf-8\")\n    with fsspec_open_file as f, open(text_file, encoding=\"utf-8\") as expected_file:\n        assert f.read() == expected_file.read()\n\n\n@slow  # otherwise it spams Google Drive and the CI gets banned\n@pytest.mark.integration\ndef test_streaming_gg_drive_no_extract():\n    urlpath = 
StreamingDownloadManager().download_and_extract(TEST_GG_DRIVE_URL)\n    with xopen(urlpath) as f:\n        assert f.read() == TEST_GG_DRIVE_CONTENT\n\n\n@slow  # otherwise it spams Google Drive and the CI gets banned\n@pytest.mark.integration\ndef test_streaming_gg_drive_gzipped():\n    urlpath = StreamingDownloadManager().download_and_extract(TEST_GG_DRIVE_GZIPPED_URL)\n    with xopen(urlpath) as f:\n        assert f.read() == TEST_GG_DRIVE_CONTENT\n\n\n@slow  # otherwise it spams Google Drive and the CI gets banned\n@pytest.mark.integration\ndef test_streaming_gg_drive_zipped():\n    urlpath = StreamingDownloadManager().download_and_extract(TEST_GG_DRIVE_ZIPPED_URL)\n    all_files = list(xglob(xjoin(urlpath, \"*\")))\n    assert len(all_files) == 1\n    assert xbasename(all_files[0]) == TEST_GG_DRIVE_FILENAME\n    with xopen(all_files[0]) as f:\n        assert f.read() == TEST_GG_DRIVE_CONTENT\n\n\ndef _test_jsonl(path, file):\n    assert path.endswith(\".jsonl\")\n    for num_items, line in enumerate(file, start=1):\n        item = json.loads(line.decode(\"utf-8\"))\n        assert item.keys() == {\"col_1\", \"col_2\", \"col_3\"}\n    assert num_items == 4\n\n\n@pytest.mark.parametrize(\"archive_jsonl\", [\"tar_jsonl_path\", \"zip_jsonl_path\"])\ndef test_iter_archive_path(archive_jsonl, request):\n    archive_jsonl_path = request.getfixturevalue(archive_jsonl)\n    dl_manager = StreamingDownloadManager()\n    archive_iterable = dl_manager.iter_archive(archive_jsonl_path)\n    num_jsonl = 0\n    for num_jsonl, (path, file) in enumerate(archive_iterable, start=1):\n        _test_jsonl(path, file)\n    assert num_jsonl == 2\n    # do it twice to make sure it's reset correctly\n    num_jsonl = 0\n    for num_jsonl, (path, file) in enumerate(archive_iterable, start=1):\n        _test_jsonl(path, file)\n    assert num_jsonl == 2\n\n\n@pytest.mark.parametrize(\"archive_nested_jsonl\", [\"tar_nested_jsonl_path\", \"zip_nested_jsonl_path\"])\ndef 
test_iter_archive_file(archive_nested_jsonl, request):\n    archive_nested_jsonl_path = request.getfixturevalue(archive_nested_jsonl)\n    dl_manager = StreamingDownloadManager()\n    files_iterable = dl_manager.iter_archive(archive_nested_jsonl_path)\n    num_tar, num_jsonl = 0, 0\n    for num_tar, (path, file) in enumerate(files_iterable, start=1):\n        for num_jsonl, (subpath, subfile) in enumerate(dl_manager.iter_archive(file), start=1):\n            _test_jsonl(subpath, subfile)\n    assert num_tar == 1\n    assert num_jsonl == 2\n    # do it twice to make sure it's reset correctly\n    num_tar, num_jsonl = 0, 0\n    for num_tar, (path, file) in enumerate(files_iterable, start=1):\n        for num_jsonl, (subpath, subfile) in enumerate(dl_manager.iter_archive(file), start=1):\n            _test_jsonl(subpath, subfile)\n    assert num_tar == 1\n    assert num_jsonl == 2\n\n\ndef test_iter_files(data_dir_with_hidden_files):\n    dl_manager = StreamingDownloadManager()\n    for num_file, file in enumerate(dl_manager.iter_files(data_dir_with_hidden_files), start=1):\n        assert os.path.basename(file) == (\"test.txt\" if num_file == 1 else \"train.txt\")\n    assert num_file == 2\n"
  },
  {
    "path": "tests/test_table.py",
    "content": "import copy\nimport pickle\nfrom decimal import Decimal\nfrom functools import partial\nfrom typing import Union\nfrom unittest.mock import MagicMock\n\nimport numpy as np\nimport pyarrow as pa\nimport pytest\n\nfrom datasets.features import Array2D, ClassLabel, Features, Image, LargeList, List, Value\nfrom datasets.features.features import Array2DExtensionType, get_nested_type\nfrom datasets.table import (\n    ConcatenationTable,\n    InMemoryTable,\n    MemoryMappedTable,\n    Table,\n    TableBlock,\n    _in_memory_arrow_table_from_buffer,\n    _in_memory_arrow_table_from_file,\n    _interpolation_search,\n    _memory_mapped_arrow_table_from_file,\n    array_cast,\n    cast_array_to_feature,\n    cast_table_to_schema,\n    concat_tables,\n    embed_array_storage,\n    embed_table_storage,\n    inject_arrow_table_documentation,\n    table_cast,\n    table_iter,\n)\n\nfrom .utils import assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases, slow\n\n\n@pytest.fixture(scope=\"session\")\ndef in_memory_pa_table(arrow_file) -> pa.Table:\n    return pa.ipc.open_stream(arrow_file).read_all()\n\n\ndef _to_testing_blocks(table: TableBlock) -> list[list[TableBlock]]:\n    assert len(table) > 2\n    blocks = [\n        [table.slice(0, 2)],\n        [table.slice(2).drop([c for c in table.column_names if c != \"tokens\"]), table.slice(2).drop([\"tokens\"])],\n    ]\n    return blocks\n\n\n@pytest.fixture(scope=\"session\")\ndef in_memory_blocks(in_memory_pa_table):\n    table = InMemoryTable(in_memory_pa_table)\n    return _to_testing_blocks(table)\n\n\n@pytest.fixture(scope=\"session\")\ndef memory_mapped_blocks(arrow_file):\n    table = MemoryMappedTable.from_file(arrow_file)\n    return _to_testing_blocks(table)\n\n\n@pytest.fixture(scope=\"session\")\ndef mixed_in_memory_and_memory_mapped_blocks(in_memory_blocks, memory_mapped_blocks):\n    return in_memory_blocks[:1] + memory_mapped_blocks[1:]\n\n\ndef 
assert_deepcopy_without_bringing_data_in_memory(table: MemoryMappedTable):\n    with assert_arrow_memory_doesnt_increase():\n        copied_table = copy.deepcopy(table)\n    assert isinstance(copied_table, MemoryMappedTable)\n    assert copied_table.table == table.table\n\n\ndef assert_deepcopy_does_bring_data_in_memory(table: MemoryMappedTable):\n    with assert_arrow_memory_increases():\n        copied_table = copy.deepcopy(table)\n    assert isinstance(copied_table, MemoryMappedTable)\n    assert copied_table.table == table.table\n\n\ndef assert_pickle_without_bringing_data_in_memory(table: MemoryMappedTable):\n    with assert_arrow_memory_doesnt_increase():\n        pickled_table = pickle.dumps(table)\n        unpickled_table = pickle.loads(pickled_table)\n    assert isinstance(unpickled_table, MemoryMappedTable)\n    assert unpickled_table.table == table.table\n\n\ndef assert_pickle_does_bring_data_in_memory(table: MemoryMappedTable):\n    with assert_arrow_memory_increases():\n        pickled_table = pickle.dumps(table)\n        unpickled_table = pickle.loads(pickled_table)\n    assert isinstance(unpickled_table, MemoryMappedTable)\n    assert unpickled_table.table == table.table\n\n\ndef assert_index_attributes_equal(table: Table, other: Table):\n    assert table._batches == other._batches\n    np.testing.assert_array_equal(table._offsets, other._offsets)\n    assert table._schema == other._schema\n\n\ndef add_suffix_to_column_names(table, suffix):\n    return table.rename_columns([f\"{name}{suffix}\" for name in table.column_names])\n\n\ndef test_inject_arrow_table_documentation(in_memory_pa_table):\n    method = pa.Table.slice\n\n    def function_to_wrap(*args):\n        return method(*args)\n\n    args = (0, 1)\n    wrapped_method = inject_arrow_table_documentation(method)(function_to_wrap)\n    assert method(in_memory_pa_table, *args) == wrapped_method(in_memory_pa_table, *args)\n    assert \"pyarrow.Table\" not in wrapped_method.__doc__\n    assert 
\"Table\" in wrapped_method.__doc__\n\n\ndef test_in_memory_arrow_table_from_file(arrow_file, in_memory_pa_table):\n    with assert_arrow_memory_increases():\n        pa_table = _in_memory_arrow_table_from_file(arrow_file)\n        assert in_memory_pa_table == pa_table\n\n\ndef test_in_memory_arrow_table_from_buffer(in_memory_pa_table):\n    with assert_arrow_memory_increases():\n        buf_writer = pa.BufferOutputStream()\n        writer = pa.RecordBatchStreamWriter(buf_writer, schema=in_memory_pa_table.schema)\n        writer.write_table(in_memory_pa_table)\n        writer.close()\n        buf_writer.close()\n        pa_table = _in_memory_arrow_table_from_buffer(buf_writer.getvalue())\n        assert in_memory_pa_table == pa_table\n\n\ndef test_memory_mapped_arrow_table_from_file(arrow_file, in_memory_pa_table):\n    with assert_arrow_memory_doesnt_increase():\n        pa_table = _memory_mapped_arrow_table_from_file(arrow_file)\n        assert in_memory_pa_table == pa_table\n\n\ndef test_table_init(in_memory_pa_table):\n    table = Table(in_memory_pa_table)\n    assert table.table == in_memory_pa_table\n\n\ndef test_table_validate(in_memory_pa_table):\n    table = Table(in_memory_pa_table)\n    assert table.validate() == in_memory_pa_table.validate()\n\n\ndef test_table_equals(in_memory_pa_table):\n    table = Table(in_memory_pa_table)\n    assert table.equals(in_memory_pa_table)\n\n\ndef test_table_to_batches(in_memory_pa_table):\n    table = Table(in_memory_pa_table)\n    assert table.to_batches() == in_memory_pa_table.to_batches()\n\n\ndef test_table_to_pydict(in_memory_pa_table):\n    table = Table(in_memory_pa_table)\n    assert table.to_pydict() == in_memory_pa_table.to_pydict()\n\n\ndef test_table_to_string(in_memory_pa_table):\n    table = Table(in_memory_pa_table)\n    assert table.to_string() == in_memory_pa_table.to_string()\n\n\ndef test_table_field(in_memory_pa_table):\n    assert \"tokens\" in in_memory_pa_table.column_names\n    table = 
Table(in_memory_pa_table)\n    assert table.field(\"tokens\") == in_memory_pa_table.field(\"tokens\")\n\n\ndef test_table_column(in_memory_pa_table):\n    assert \"tokens\" in in_memory_pa_table.column_names\n    table = Table(in_memory_pa_table)\n    assert table.column(\"tokens\") == in_memory_pa_table.column(\"tokens\")\n\n\ndef test_table_itercolumns(in_memory_pa_table):\n    table = Table(in_memory_pa_table)\n    assert isinstance(table.itercolumns(), type(in_memory_pa_table.itercolumns()))\n    assert list(table.itercolumns()) == list(in_memory_pa_table.itercolumns())\n\n\ndef test_table_getitem(in_memory_pa_table):\n    table = Table(in_memory_pa_table)\n    assert table[0] == in_memory_pa_table[0]\n\n\ndef test_table_len(in_memory_pa_table):\n    table = Table(in_memory_pa_table)\n    assert len(table) == len(in_memory_pa_table)\n\n\ndef test_table_str(in_memory_pa_table):\n    table = Table(in_memory_pa_table)\n    assert str(table) == str(in_memory_pa_table).replace(\"pyarrow.Table\", \"Table\")\n    assert repr(table) == repr(in_memory_pa_table).replace(\"pyarrow.Table\", \"Table\")\n\n\n@pytest.mark.parametrize(\n    \"attribute\", [\"schema\", \"columns\", \"num_columns\", \"num_rows\", \"shape\", \"nbytes\", \"column_names\"]\n)\ndef test_table_attributes(in_memory_pa_table, attribute):\n    table = Table(in_memory_pa_table)\n    assert getattr(table, attribute) == getattr(in_memory_pa_table, attribute)\n\n\ndef test_in_memory_table_from_file(arrow_file, in_memory_pa_table):\n    with assert_arrow_memory_increases():\n        table = InMemoryTable.from_file(arrow_file)\n        assert table.table == in_memory_pa_table\n        assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_from_buffer(in_memory_pa_table):\n    with assert_arrow_memory_increases():\n        buf_writer = pa.BufferOutputStream()\n        writer = pa.RecordBatchStreamWriter(buf_writer, schema=in_memory_pa_table.schema)\n        
writer.write_table(in_memory_pa_table)\n        writer.close()\n        buf_writer.close()\n        table = InMemoryTable.from_buffer(buf_writer.getvalue())\n        assert table.table == in_memory_pa_table\n        assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_from_pandas(in_memory_pa_table):\n    df = in_memory_pa_table.to_pandas()\n    with assert_arrow_memory_increases():\n        # with no schema it might infer another order of the fields in the schema\n        table = InMemoryTable.from_pandas(df)\n        assert isinstance(table, InMemoryTable)\n    # by specifying schema we get the same order of features, and so the exact same table\n    table = InMemoryTable.from_pandas(df, schema=in_memory_pa_table.schema)\n    assert table.table == in_memory_pa_table\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_from_arrays(in_memory_pa_table):\n    arrays = list(in_memory_pa_table.columns)\n    names = list(in_memory_pa_table.column_names)\n    table = InMemoryTable.from_arrays(arrays, names=names)\n    assert table.table == in_memory_pa_table\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_from_pydict(in_memory_pa_table):\n    pydict = in_memory_pa_table.to_pydict()\n    with assert_arrow_memory_increases():\n        table = InMemoryTable.from_pydict(pydict)\n        assert isinstance(table, InMemoryTable)\n        assert table.table == pa.Table.from_pydict(pydict)\n\n\ndef test_in_memory_table_from_pylist(in_memory_pa_table):\n    pylist = InMemoryTable(in_memory_pa_table).to_pylist()\n    table = InMemoryTable.from_pylist(pylist)\n    assert isinstance(table, InMemoryTable)\n    assert pylist == table.to_pylist()\n\n\ndef test_in_memory_table_from_batches(in_memory_pa_table):\n    batches = list(in_memory_pa_table.to_batches())\n    table = InMemoryTable.from_batches(batches)\n    assert table.table == in_memory_pa_table\n    assert isinstance(table, InMemoryTable)\n\n\ndef 
test_in_memory_table_deepcopy(in_memory_pa_table):\n    table = InMemoryTable(in_memory_pa_table)\n    copied_table = copy.deepcopy(table)\n    assert table.table == copied_table.table\n    assert_index_attributes_equal(table, copied_table)\n    # deepcopy must return the exact same arrow objects since they are immutable\n    assert table.table is copied_table.table\n    assert all(batch1 is batch2 for batch1, batch2 in zip(table._batches, copied_table._batches))\n\n\ndef test_in_memory_table_pickle(in_memory_pa_table):\n    table = InMemoryTable(in_memory_pa_table)\n    pickled_table = pickle.dumps(table)\n    unpickled_table = pickle.loads(pickled_table)\n    assert unpickled_table.table == table.table\n    assert_index_attributes_equal(table, unpickled_table)\n\n\n@slow\ndef test_in_memory_table_pickle_big_table():\n    big_table_4GB = InMemoryTable.from_pydict({\"col\": [0] * ((4 * 8 << 30) // 64)})\n    length = len(big_table_4GB)\n    big_table_4GB = pickle.dumps(big_table_4GB)\n    big_table_4GB = pickle.loads(big_table_4GB)\n    assert len(big_table_4GB) == length\n\n\ndef test_in_memory_table_slice(in_memory_pa_table):\n    table = InMemoryTable(in_memory_pa_table).slice(1, 2)\n    assert table.table == in_memory_pa_table.slice(1, 2)\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_filter(in_memory_pa_table):\n    mask = pa.array([i % 2 == 0 for i in range(len(in_memory_pa_table))])\n    table = InMemoryTable(in_memory_pa_table).filter(mask)\n    assert table.table == in_memory_pa_table.filter(mask)\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_flatten(in_memory_pa_table):\n    table = InMemoryTable(in_memory_pa_table).flatten()\n    assert table.table == in_memory_pa_table.flatten()\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_combine_chunks(in_memory_pa_table):\n    table = InMemoryTable(in_memory_pa_table).combine_chunks()\n    assert table.table == 
in_memory_pa_table.combine_chunks()\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_cast(in_memory_pa_table):\n    assert pa.list_(pa.int64()) in in_memory_pa_table.schema.types\n    schema = pa.schema(\n        {\n            k: v if v != pa.list_(pa.int64()) else pa.list_(pa.int32())\n            for k, v in zip(in_memory_pa_table.schema.names, in_memory_pa_table.schema.types)\n        }\n    )\n    table = InMemoryTable(in_memory_pa_table).cast(schema)\n    assert table.table == in_memory_pa_table.cast(schema)\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_cast_reorder_struct():\n    table = InMemoryTable(\n        pa.Table.from_pydict(\n            {\n                \"top\": [\n                    {\n                        \"foo\": \"a\",\n                        \"bar\": \"b\",\n                    }\n                ]\n            }\n        )\n    )\n    schema = pa.schema({\"top\": pa.struct({\"bar\": pa.string(), \"foo\": pa.string()})})\n    assert table.cast(schema).schema == schema\n\n\ndef test_in_memory_table_cast_with_hf_features():\n    table = InMemoryTable(pa.Table.from_pydict({\"labels\": [0, 1]}))\n    features = Features({\"labels\": ClassLabel(names=[\"neg\", \"pos\"])})\n    schema = features.arrow_schema\n    assert table.cast(schema).schema == schema\n    assert Features.from_arrow_schema(table.cast(schema).schema) == features\n\n\ndef test_in_memory_table_replace_schema_metadata(in_memory_pa_table):\n    metadata = {\"huggingface\": \"{}\"}\n    table = InMemoryTable(in_memory_pa_table).replace_schema_metadata(metadata)\n    assert table.table.schema.metadata == in_memory_pa_table.replace_schema_metadata(metadata).schema.metadata\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_add_column(in_memory_pa_table):\n    i = len(in_memory_pa_table.column_names)\n    field_ = \"new_field\"\n    column = pa.array(list(range(len(in_memory_pa_table))))\n    table = 
InMemoryTable(in_memory_pa_table).add_column(i, field_, column)\n    assert table.table == in_memory_pa_table.add_column(i, field_, column)\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_append_column(in_memory_pa_table):\n    field_ = \"new_field\"\n    column = pa.array(list(range(len(in_memory_pa_table))))\n    table = InMemoryTable(in_memory_pa_table).append_column(field_, column)\n    assert table.table == in_memory_pa_table.append_column(field_, column)\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_remove_column(in_memory_pa_table):\n    table = InMemoryTable(in_memory_pa_table).remove_column(0)\n    assert table.table == in_memory_pa_table.remove_column(0)\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_set_column(in_memory_pa_table):\n    i = len(in_memory_pa_table.column_names)\n    field_ = \"new_field\"\n    column = pa.array(list(range(len(in_memory_pa_table))))\n    table = InMemoryTable(in_memory_pa_table).set_column(i, field_, column)\n    assert table.table == in_memory_pa_table.set_column(i, field_, column)\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_rename_columns(in_memory_pa_table):\n    assert \"tokens\" in in_memory_pa_table.column_names\n    names = [name if name != \"tokens\" else \"new_tokens\" for name in in_memory_pa_table.column_names]\n    table = InMemoryTable(in_memory_pa_table).rename_columns(names)\n    assert table.table == in_memory_pa_table.rename_columns(names)\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_in_memory_table_drop(in_memory_pa_table):\n    names = [in_memory_pa_table.column_names[0]]\n    table = InMemoryTable(in_memory_pa_table).drop(names)\n    assert table.table == in_memory_pa_table.drop(names)\n    assert isinstance(table, InMemoryTable)\n\n\ndef test_memory_mapped_table_init(arrow_file, in_memory_pa_table):\n    table = 
MemoryMappedTable(_memory_mapped_arrow_table_from_file(arrow_file), arrow_file)\n    assert table.table == in_memory_pa_table\n    assert isinstance(table, MemoryMappedTable)\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_from_file(arrow_file, in_memory_pa_table):\n    with assert_arrow_memory_doesnt_increase():\n        table = MemoryMappedTable.from_file(arrow_file)\n    assert table.table == in_memory_pa_table\n    assert isinstance(table, MemoryMappedTable)\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_from_file_with_replay(arrow_file, in_memory_pa_table):\n    replays = [(\"slice\", (0, 1), {}), (\"flatten\", (), {})]\n    with assert_arrow_memory_doesnt_increase():\n        table = MemoryMappedTable.from_file(arrow_file, replays=replays)\n    assert len(table) == 1\n    for method, args, kwargs in replays:\n        in_memory_pa_table = getattr(in_memory_pa_table, method)(*args, **kwargs)\n    assert table.table == in_memory_pa_table\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_deepcopy(arrow_file):\n    table = MemoryMappedTable.from_file(arrow_file)\n    copied_table = copy.deepcopy(table)\n    assert table.table == copied_table.table\n    assert table.path == copied_table.path\n    assert_index_attributes_equal(table, copied_table)\n    # deepcopy must return the exact same arrow objects since they are immutable\n    assert table.table is copied_table.table\n    assert all(batch1 is batch2 for batch1, batch2 in zip(table._batches, copied_table._batches))\n\n\ndef test_memory_mapped_table_pickle(arrow_file):\n    table = MemoryMappedTable.from_file(arrow_file)\n    pickled_table = pickle.dumps(table)\n    unpickled_table = 
pickle.loads(pickled_table)\n    assert unpickled_table.table == table.table\n    assert unpickled_table.path == table.path\n    assert_index_attributes_equal(table, unpickled_table)\n\n\ndef test_memory_mapped_table_pickle_doesnt_fill_memory(arrow_file):\n    with assert_arrow_memory_doesnt_increase():\n        table = MemoryMappedTable.from_file(arrow_file)\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_pickle_applies_replay(arrow_file):\n    replays = [(\"slice\", (0, 1), {}), (\"flatten\", (), {})]\n    with assert_arrow_memory_doesnt_increase():\n        table = MemoryMappedTable.from_file(arrow_file, replays=replays)\n    assert isinstance(table, MemoryMappedTable)\n    assert table.replays == replays\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_slice(arrow_file, in_memory_pa_table):\n    table = MemoryMappedTable.from_file(arrow_file).slice(1, 2)\n    assert table.table == in_memory_pa_table.slice(1, 2)\n    assert isinstance(table, MemoryMappedTable)\n    assert table.replays == [(\"slice\", (1, 2), {})]\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_filter(arrow_file, in_memory_pa_table):\n    mask = pa.array([i % 2 == 0 for i in range(len(in_memory_pa_table))])\n    table = MemoryMappedTable.from_file(arrow_file).filter(mask)\n    assert table.table == in_memory_pa_table.filter(mask)\n    assert isinstance(table, MemoryMappedTable)\n    assert table.replays == [(\"filter\", (mask,), {})]\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    # filter DOES increase memory\n    # assert_pickle_without_bringing_data_in_memory(table)\n    assert_pickle_does_bring_data_in_memory(table)\n\n\ndef test_memory_mapped_table_flatten(arrow_file, 
in_memory_pa_table):\n    table = MemoryMappedTable.from_file(arrow_file).flatten()\n    assert table.table == in_memory_pa_table.flatten()\n    assert isinstance(table, MemoryMappedTable)\n    assert table.replays == [(\"flatten\", (), {})]\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_combine_chunks(arrow_file, in_memory_pa_table):\n    table = MemoryMappedTable.from_file(arrow_file).combine_chunks()\n    assert table.table == in_memory_pa_table.combine_chunks()\n    assert isinstance(table, MemoryMappedTable)\n    assert table.replays == [(\"combine_chunks\", (), {})]\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_cast(arrow_file, in_memory_pa_table):\n    assert pa.list_(pa.int64()) in in_memory_pa_table.schema.types\n    schema = pa.schema(\n        {\n            k: v if v != pa.list_(pa.int64()) else pa.list_(pa.int32())\n            for k, v in zip(in_memory_pa_table.schema.names, in_memory_pa_table.schema.types)\n        }\n    )\n    table = MemoryMappedTable.from_file(arrow_file).cast(schema)\n    assert table.table == in_memory_pa_table.cast(schema)\n    assert isinstance(table, MemoryMappedTable)\n    assert table.replays == [(\"cast\", (schema,), {})]\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    # cast DOES increase memory when converting integers precision for example\n    # assert_pickle_without_bringing_data_in_memory(table)\n    assert_pickle_does_bring_data_in_memory(table)\n\n\ndef test_memory_mapped_table_replace_schema_metadata(arrow_file, in_memory_pa_table):\n    metadata = {\"huggingface\": \"{}\"}\n    table = MemoryMappedTable.from_file(arrow_file).replace_schema_metadata(metadata)\n    assert table.table.schema.metadata == in_memory_pa_table.replace_schema_metadata(metadata).schema.metadata\n    assert 
isinstance(table, MemoryMappedTable)\n    assert table.replays == [(\"replace_schema_metadata\", (metadata,), {})]\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_add_column(arrow_file, in_memory_pa_table):\n    i = len(in_memory_pa_table.column_names)\n    field_ = \"new_field\"\n    column = pa.array(list(range(len(in_memory_pa_table))))\n    table = MemoryMappedTable.from_file(arrow_file).add_column(i, field_, column)\n    assert table.table == in_memory_pa_table.add_column(i, field_, column)\n    assert isinstance(table, MemoryMappedTable)\n    assert table.replays == [(\"add_column\", (i, field_, column), {})]\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_append_column(arrow_file, in_memory_pa_table):\n    field_ = \"new_field\"\n    column = pa.array(list(range(len(in_memory_pa_table))))\n    table = MemoryMappedTable.from_file(arrow_file).append_column(field_, column)\n    assert table.table == in_memory_pa_table.append_column(field_, column)\n    assert isinstance(table, MemoryMappedTable)\n    assert table.replays == [(\"append_column\", (field_, column), {})]\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_remove_column(arrow_file, in_memory_pa_table):\n    table = MemoryMappedTable.from_file(arrow_file).remove_column(0)\n    assert table.table == in_memory_pa_table.remove_column(0)\n    assert isinstance(table, MemoryMappedTable)\n    assert table.replays == [(\"remove_column\", (0,), {})]\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_set_column(arrow_file, in_memory_pa_table):\n    i = len(in_memory_pa_table.column_names)\n    field_ = 
\"new_field\"\n    column = pa.array(list(range(len(in_memory_pa_table))))\n    table = MemoryMappedTable.from_file(arrow_file).set_column(i, field_, column)\n    assert table.table == in_memory_pa_table.set_column(i, field_, column)\n    assert isinstance(table, MemoryMappedTable)\n    assert table.replays == [(\"set_column\", (i, field_, column), {})]\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_rename_columns(arrow_file, in_memory_pa_table):\n    assert \"tokens\" in in_memory_pa_table.column_names\n    names = [name if name != \"tokens\" else \"new_tokens\" for name in in_memory_pa_table.column_names]\n    table = MemoryMappedTable.from_file(arrow_file).rename_columns(names)\n    assert table.table == in_memory_pa_table.rename_columns(names)\n    assert isinstance(table, MemoryMappedTable)\n    assert table.replays == [(\"rename_columns\", (names,), {})]\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\ndef test_memory_mapped_table_drop(arrow_file, in_memory_pa_table):\n    names = [in_memory_pa_table.column_names[0]]\n    table = MemoryMappedTable.from_file(arrow_file).drop(names)\n    assert table.table == in_memory_pa_table.drop(names)\n    assert isinstance(table, MemoryMappedTable)\n    assert table.replays == [(\"drop\", (names,), {})]\n    assert_deepcopy_without_bringing_data_in_memory(table)\n    assert_pickle_without_bringing_data_in_memory(table)\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_init(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = (\n        in_memory_blocks\n        if blocks_type == \"in_memory\"\n        else memory_mapped_blocks\n        if blocks_type == \"memory_mapped\"\n        else 
mixed_in_memory_and_memory_mapped_blocks\n    )\n    table = ConcatenationTable(in_memory_pa_table, blocks)\n    assert table.table == in_memory_pa_table\n    assert table.blocks == blocks\n\n\ndef test_concatenation_table_from_blocks(in_memory_pa_table, in_memory_blocks):\n    assert len(in_memory_pa_table) > 2\n    in_memory_table = InMemoryTable(in_memory_pa_table)\n    t1, t2 = in_memory_table.slice(0, 2), in_memory_table.slice(2)\n    table = ConcatenationTable.from_blocks(in_memory_table)\n    assert isinstance(table, ConcatenationTable)\n    assert table.table == in_memory_pa_table\n    assert table.blocks == [[in_memory_table]]\n    table = ConcatenationTable.from_blocks([t1, t2])\n    assert isinstance(table, ConcatenationTable)\n    assert table.table == in_memory_pa_table\n    assert table.blocks == [[in_memory_table]]\n    table = ConcatenationTable.from_blocks([[t1], [t2]])\n    assert isinstance(table, ConcatenationTable)\n    assert table.table == in_memory_pa_table\n    assert table.blocks == [[in_memory_table]]\n    table = ConcatenationTable.from_blocks(in_memory_blocks)\n    assert isinstance(table, ConcatenationTable)\n    assert table.table == in_memory_pa_table\n    assert table.blocks == [[in_memory_table]]\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_from_blocks_doesnt_increase_memory(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    with assert_arrow_memory_doesnt_increase():\n        table = ConcatenationTable.from_blocks(blocks)\n        assert isinstance(table, ConcatenationTable)\n        assert table.table == in_memory_pa_table\n        if blocks_type == \"in_memory\":\n            assert 
table.blocks == [[InMemoryTable(in_memory_pa_table)]]\n        else:\n            assert table.blocks == blocks\n\n\n@pytest.mark.parametrize(\"axis\", [0, 1])\ndef test_concatenation_table_from_tables(axis, in_memory_pa_table, arrow_file):\n    in_memory_table = InMemoryTable(in_memory_pa_table)\n    concatenation_table = ConcatenationTable.from_blocks(in_memory_table)\n    memory_mapped_table = MemoryMappedTable.from_file(arrow_file)\n    tables = [in_memory_pa_table, in_memory_table, concatenation_table, memory_mapped_table]\n    if axis == 0:\n        expected_table = pa.concat_tables([in_memory_pa_table] * len(tables))\n    else:\n        # avoids error due to duplicate column names\n        tables[1:] = [add_suffix_to_column_names(table, i) for i, table in enumerate(tables[1:], 1)]\n        expected_table = in_memory_pa_table\n        for table in tables[1:]:\n            for name, col in zip(table.column_names, table.columns):\n                expected_table = expected_table.append_column(name, col)\n\n    with assert_arrow_memory_doesnt_increase():\n        table = ConcatenationTable.from_tables(tables, axis=axis)\n    assert isinstance(table, ConcatenationTable)\n    assert table.table == expected_table\n    # because of consolidation, we end up with 1 InMemoryTable and 1 MemoryMappedTable\n    assert len(table.blocks) == 1 if axis == 1 else 2\n    assert len(table.blocks[0]) == 1 if axis == 0 else 2\n    assert axis == 1 or len(table.blocks[1]) == 1\n    assert isinstance(table.blocks[0][0], InMemoryTable)\n    assert isinstance(table.blocks[1][0] if axis == 0 else table.blocks[0][1], MemoryMappedTable)\n\n\ndef test_concatenation_table_from_tables_axis1_misaligned_blocks(arrow_file):\n    table = MemoryMappedTable.from_file(arrow_file)\n    t1 = table.slice(0, 2)\n    t2 = table.slice(0, 3).rename_columns([col + \"_1\" for col in table.column_names])\n    concatenated = ConcatenationTable.from_tables(\n        [\n            
ConcatenationTable.from_blocks([[t1], [t1], [t1]]),\n            ConcatenationTable.from_blocks([[t2], [t2]]),\n        ],\n        axis=1,\n    )\n    assert len(concatenated) == 6\n    assert [len(row_blocks[0]) for row_blocks in concatenated.blocks] == [2, 1, 1, 2]\n    concatenated = ConcatenationTable.from_tables(\n        [\n            ConcatenationTable.from_blocks([[t2], [t2]]),\n            ConcatenationTable.from_blocks([[t1], [t1], [t1]]),\n        ],\n        axis=1,\n    )\n    assert len(concatenated) == 6\n    assert [len(row_blocks[0]) for row_blocks in concatenated.blocks] == [2, 1, 1, 2]\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_deepcopy(\n    blocks_type, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    table = ConcatenationTable.from_blocks(blocks)\n    copied_table = copy.deepcopy(table)\n    assert table.table == copied_table.table\n    assert table.blocks == copied_table.blocks\n    assert_index_attributes_equal(table, copied_table)\n    # deepcopy must return the exact same arrow objects since they are immutable\n    assert table.table is copied_table.table\n    assert all(batch1 is batch2 for batch1, batch2 in zip(table._batches, copied_table._batches))\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_pickle(\n    blocks_type, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    table = ConcatenationTable.from_blocks(blocks)\n    
pickled_table = pickle.dumps(table)\n    unpickled_table = pickle.loads(pickled_table)\n    assert unpickled_table.table == table.table\n    assert unpickled_table.blocks == table.blocks\n    assert_index_attributes_equal(table, unpickled_table)\n\n\ndef test_concat_tables_with_features_metadata(arrow_file, in_memory_pa_table):\n    input_features = Features.from_arrow_schema(in_memory_pa_table.schema)\n    input_features[\"id\"] = Value(\"int64\", id=\"my_id\")\n    intput_schema = input_features.arrow_schema\n    t0 = in_memory_pa_table.replace_schema_metadata(intput_schema.metadata)\n    t1 = MemoryMappedTable.from_file(arrow_file)\n    tables = [t0, t1]\n    concatenated_table = concat_tables(tables, axis=0)\n    output_schema = concatenated_table.schema\n    output_features = Features.from_arrow_schema(output_schema)\n    assert output_schema == intput_schema\n    assert output_schema.metadata == intput_schema.metadata\n    assert output_features == input_features\n    assert output_features[\"id\"].id == \"my_id\"\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_slice(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    table = ConcatenationTable.from_blocks(blocks).slice(1, 2)\n    assert table.table == in_memory_pa_table.slice(1, 2)\n    assert isinstance(table, ConcatenationTable)\n\n\ndef test_concatenation_table_slice_mixed_schemas_vertically(arrow_file):\n    t1 = MemoryMappedTable.from_file(arrow_file)\n    t2 = InMemoryTable.from_pydict({\"additional_column\": [\"foo\"]})\n    expected = pa.table(\n        {\n            **{column: values + [None] for column, values in t1.to_pydict().items()},\n            
\"additional_column\": [None] * len(t1) + [\"foo\"],\n        }\n    )\n    blocks = [[t1], [t2]]\n    table = ConcatenationTable.from_blocks(blocks)\n    assert table.to_pydict() == expected.to_pydict()\n    assert isinstance(table, ConcatenationTable)\n    reloaded = pickle.loads(pickle.dumps(table))\n    assert reloaded.to_pydict() == expected.to_pydict()\n    assert isinstance(reloaded, ConcatenationTable)\n    reloaded = pickle.loads(pickle.dumps(table.slice(1, 2)))\n    assert reloaded.to_pydict() == expected.slice(1, 2).to_pydict()\n    assert isinstance(reloaded, ConcatenationTable)\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_filter(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    mask = pa.array([i % 2 == 0 for i in range(len(in_memory_pa_table))])\n    table = ConcatenationTable.from_blocks(blocks).filter(mask)\n    assert table.table == in_memory_pa_table.filter(mask)\n    assert isinstance(table, ConcatenationTable)\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_flatten(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    table = ConcatenationTable.from_blocks(blocks).flatten()\n    assert table.table == in_memory_pa_table.flatten()\n    assert isinstance(table, ConcatenationTable)\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", 
\"mixed\"])\ndef test_concatenation_table_combine_chunks(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    table = ConcatenationTable.from_blocks(blocks).combine_chunks()\n    assert table.table == in_memory_pa_table.combine_chunks()\n    assert isinstance(table, ConcatenationTable)\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_cast(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    assert pa.list_(pa.int64()) in in_memory_pa_table.schema.types\n    assert pa.int64() in in_memory_pa_table.schema.types\n    schema = pa.schema(\n        {\n            k: v if v != pa.list_(pa.int64()) else pa.list_(pa.int32())\n            for k, v in zip(in_memory_pa_table.schema.names, in_memory_pa_table.schema.types)\n        }\n    )\n    table = ConcatenationTable.from_blocks(blocks).cast(schema)\n    assert table.table == in_memory_pa_table.cast(schema)\n    assert isinstance(table, ConcatenationTable)\n    schema = pa.schema(\n        {\n            k: v if v != pa.int64() else pa.int32()\n            for k, v in zip(in_memory_pa_table.schema.names, in_memory_pa_table.schema.types)\n        }\n    )\n    table = ConcatenationTable.from_blocks(blocks).cast(schema)\n    assert table.table == in_memory_pa_table.cast(schema)\n    assert isinstance(table, ConcatenationTable)\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef 
test_concat_tables_cast_with_features_metadata(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    input_features = Features.from_arrow_schema(in_memory_pa_table.schema)\n    input_features[\"id\"] = Value(\"int64\", id=\"my_id\")\n    intput_schema = input_features.arrow_schema\n    concatenated_table = ConcatenationTable.from_blocks(blocks).cast(intput_schema)\n    output_schema = concatenated_table.schema\n    output_features = Features.from_arrow_schema(output_schema)\n    assert output_schema == intput_schema\n    assert output_schema.metadata == intput_schema.metadata\n    assert output_features == input_features\n    assert output_features[\"id\"].id == \"my_id\"\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_replace_schema_metadata(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    metadata = {\"huggingface\": \"{}\"}\n    table = ConcatenationTable.from_blocks(blocks).replace_schema_metadata(metadata)\n    assert table.table.schema.metadata == in_memory_pa_table.replace_schema_metadata(metadata).schema.metadata\n    assert isinstance(table, ConcatenationTable)\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_add_column(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": 
in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    i = len(in_memory_pa_table.column_names)\n    field_ = \"new_field\"\n    column = pa.array(list(range(len(in_memory_pa_table))))\n    with pytest.raises(NotImplementedError):\n        ConcatenationTable.from_blocks(blocks).add_column(i, field_, column)\n        # assert table.table == in_memory_pa_table.add_column(i, field_, column)\n        # unpickled_table = pickle.loads(pickle.dumps(table))\n        # assert unpickled_table.table == in_memory_pa_table.add_column(i, field_, column)\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_append_column(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    field_ = \"new_field\"\n    column = pa.array(list(range(len(in_memory_pa_table))))\n    with pytest.raises(NotImplementedError):\n        ConcatenationTable.from_blocks(blocks).append_column(field_, column)\n        # assert table.table == in_memory_pa_table.append_column(field_, column)\n        # unpickled_table = pickle.loads(pickle.dumps(table))\n        # assert unpickled_table.table == in_memory_pa_table.append_column(field_, column)\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_remove_column(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    
}[blocks_type]\n    table = ConcatenationTable.from_blocks(blocks).remove_column(0)\n    assert table.table == in_memory_pa_table.remove_column(0)\n    assert isinstance(table, ConcatenationTable)\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_set_column(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    i = len(in_memory_pa_table.column_names)\n    field_ = \"new_field\"\n    column = pa.array(list(range(len(in_memory_pa_table))))\n    with pytest.raises(NotImplementedError):\n        ConcatenationTable.from_blocks(blocks).set_column(i, field_, column)\n        # assert table.table == in_memory_pa_table.set_column(i, field_, column)\n        # unpickled_table = pickle.loads(pickle.dumps(table))\n        # assert unpickled_table.table == in_memory_pa_table.set_column(i, field_, column)\n\n\n@pytest.mark.parametrize(\"blocks_type\", [\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_rename_columns(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    assert \"tokens\" in in_memory_pa_table.column_names\n    names = [name if name != \"tokens\" else \"new_tokens\" for name in in_memory_pa_table.column_names]\n    table = ConcatenationTable.from_blocks(blocks).rename_columns(names)\n    assert isinstance(table, ConcatenationTable)\n    assert table.table == in_memory_pa_table.rename_columns(names)\n\n\n@pytest.mark.parametrize(\"blocks_type\", 
[\"in_memory\", \"memory_mapped\", \"mixed\"])\ndef test_concatenation_table_drop(\n    blocks_type, in_memory_pa_table, in_memory_blocks, memory_mapped_blocks, mixed_in_memory_and_memory_mapped_blocks\n):\n    blocks = {\n        \"in_memory\": in_memory_blocks,\n        \"memory_mapped\": memory_mapped_blocks,\n        \"mixed\": mixed_in_memory_and_memory_mapped_blocks,\n    }[blocks_type]\n    names = [in_memory_pa_table.column_names[0]]\n    table = ConcatenationTable.from_blocks(blocks).drop(names)\n    assert table.table == in_memory_pa_table.drop(names)\n    assert isinstance(table, ConcatenationTable)\n\n\ndef test_concat_tables(arrow_file, in_memory_pa_table):\n    t0 = in_memory_pa_table\n    t1 = InMemoryTable(t0)\n    t2 = MemoryMappedTable.from_file(arrow_file)\n    t3 = ConcatenationTable.from_blocks(t1)\n    tables = [t0, t1, t2, t3]\n    concatenated_table = concat_tables(tables, axis=0)\n    assert concatenated_table.table == pa.concat_tables([t0] * 4)\n    assert concatenated_table.table.shape == (40, 4)\n    assert isinstance(concatenated_table, ConcatenationTable)\n    assert len(concatenated_table.blocks) == 3  # t0 and t1 are consolidated as a single InMemoryTable\n    assert isinstance(concatenated_table.blocks[0][0], InMemoryTable)\n    assert isinstance(concatenated_table.blocks[1][0], MemoryMappedTable)\n    assert isinstance(concatenated_table.blocks[2][0], InMemoryTable)\n    # add suffix to avoid error due to duplicate column names\n    concatenated_table = concat_tables(\n        [add_suffix_to_column_names(table, i) for i, table in enumerate(tables)], axis=1\n    )\n    assert concatenated_table.table.shape == (10, 16)\n    assert len(concatenated_table.blocks[0]) == 3  # t0 and t1 are consolidated as a single InMemoryTable\n    assert isinstance(concatenated_table.blocks[0][0], InMemoryTable)\n    assert isinstance(concatenated_table.blocks[0][1], MemoryMappedTable)\n    assert isinstance(concatenated_table.blocks[0][2], 
InMemoryTable)\n\n\ndef _interpolation_search_ground_truth(arr: list[int], x: int) -> Union[int, IndexError]:\n    for i in range(len(arr) - 1):\n        if arr[i] <= x < arr[i + 1]:\n            return i\n    return IndexError\n\n\nclass _ListWithGetitemCounter(list):\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        self.unique_getitem_calls = set()\n\n    def __getitem__(self, i):\n        out = super().__getitem__(i)\n        self.unique_getitem_calls.add(i)\n        return out\n\n    @property\n    def getitem_unique_count(self):\n        return len(self.unique_getitem_calls)\n\n\n@pytest.mark.parametrize(\n    \"arr, x\",\n    [(np.arange(0, 14, 3), x) for x in range(-1, 22)]\n    + [(list(np.arange(-5, 5)), x) for x in range(-6, 6)]\n    + [([0, 1_000, 1_001, 1_003], x) for x in [-1, 0, 2, 100, 999, 1_000, 1_001, 1_002, 1_003, 1_004]]\n    + [(list(range(1_000)), x) for x in [-1, 0, 1, 10, 666, 999, 1_000, 1_0001]],\n)\ndef test_interpolation_search(arr, x):\n    ground_truth = _interpolation_search_ground_truth(arr, x)\n    if isinstance(ground_truth, int):\n        arr = _ListWithGetitemCounter(arr)\n        output = _interpolation_search(arr, x)\n        assert ground_truth == output\n        # 4 maximum unique getitem calls is expected for the cases of this test\n        # but it can be bigger for large and messy arrays.\n        assert arr.getitem_unique_count <= 4\n    else:\n        with pytest.raises(ground_truth):\n            _interpolation_search(arr, x)\n\n\ndef test_indexed_table_mixin():\n    n_rows_per_chunk = 10\n    n_chunks = 4\n    pa_table = pa.Table.from_pydict({\"col\": [0] * n_rows_per_chunk})\n    pa_table = pa.concat_tables([pa_table] * n_chunks)\n    table = Table(pa_table)\n    assert all(table._offsets.tolist() == np.cumsum([0] + [n_rows_per_chunk] * n_chunks))\n    assert table.fast_slice(5) == pa_table.slice(5)\n    assert table.fast_slice(2, 13) == pa_table.slice(2, 13)\n\n\ndef 
test_cast_integer_array_to_features():\n    arr = pa.array([[0, 1]])\n    assert cast_array_to_feature(arr, List(Value(\"string\"))).type == pa.list_(pa.string())\n    assert cast_array_to_feature(arr, List(Value(\"string\")), allow_decimal_to_str=False).type == pa.list_(pa.string())\n    with pytest.raises(TypeError):\n        cast_array_to_feature(arr, List(Value(\"string\")), allow_primitive_to_str=False)\n\n\ndef test_cast_float_array_to_features():\n    arr = pa.array([[0.0, 1.0]])\n    assert cast_array_to_feature(arr, List(Value(\"string\"))).type == pa.list_(pa.string())\n    assert cast_array_to_feature(arr, List(Value(\"string\")), allow_decimal_to_str=False).type == pa.list_(pa.string())\n    with pytest.raises(TypeError):\n        cast_array_to_feature(arr, List(Value(\"string\")), allow_primitive_to_str=False)\n\n\ndef test_cast_boolean_array_to_features():\n    arr = pa.array([[False, True]])\n    assert cast_array_to_feature(arr, List(Value(\"string\"))).type == pa.list_(pa.string())\n    assert cast_array_to_feature(arr, List(Value(\"string\")), allow_decimal_to_str=False).type == pa.list_(pa.string())\n    with pytest.raises(TypeError):\n        cast_array_to_feature(arr, List(Value(\"string\")), allow_primitive_to_str=False)\n\n\ndef test_cast_decimal_array_to_features():\n    arr = pa.array([[Decimal(0), Decimal(1)]])\n    assert cast_array_to_feature(arr, List(Value(\"string\"))).type == pa.list_(pa.string())\n    assert cast_array_to_feature(arr, List(Value(\"string\")), allow_primitive_to_str=False).type == pa.list_(\n        pa.string()\n    )\n    with pytest.raises(TypeError):\n        cast_array_to_feature(arr, List(Value(\"string\")), allow_decimal_to_str=False)\n\n\n@pytest.mark.parametrize(\n    \"array_list, expected_list\",\n    [\n        ([{\"age\": 25}, {\"age\": 63}], [{\"age\": 25, \"name\": None}, {\"age\": 63, \"name\": None}]),\n        ([{}, {}], [{\"age\": None, \"name\": None}, {\"age\": None, \"name\": None}]),  # 
completely empty struct\n    ],\n)\ndef test_cast_array_to_feature_with_struct_with_missing_fields(array_list, expected_list):\n    arr = pa.array(array_list)\n    feature = {\"age\": Value(\"int32\"), \"name\": Value(\"string\")}\n    cast_array = cast_array_to_feature(arr, feature)\n    assert cast_array.type == pa.struct({\"age\": pa.int32(), \"name\": pa.string()})\n    assert cast_array.to_pylist() == expected_list\n\n\ndef test_cast_array_to_features_nested():\n    arr = pa.array([[{\"foo\": [0]}]])\n    assert cast_array_to_feature(arr, List({\"foo\": List(Value(\"string\"))})).type == pa.list_(\n        pa.struct({\"foo\": pa.list_(pa.string())})\n    )\n\n\ndef test_cast_array_to_features_to_nested_with_no_fields():\n    arr = pa.array([{}])\n    assert cast_array_to_feature(arr, {}).type == pa.struct({})\n    assert cast_array_to_feature(arr, {}).to_pylist() == arr.to_pylist()\n\n\ndef test_cast_array_to_features_nested_with_nulls():\n    # same type\n    arr = pa.array([{\"foo\": [None, [0]]}], pa.struct({\"foo\": pa.list_(pa.list_(pa.int64()))}))\n    casted_array = cast_array_to_feature(arr, {\"foo\": List(List(Value(\"int64\")))})\n    assert casted_array.type == pa.struct({\"foo\": pa.list_(pa.list_(pa.int64()))})\n    assert casted_array.to_pylist() == arr.to_pylist()\n    # different type\n    arr = pa.array([{\"foo\": [None, [0]]}], pa.struct({\"foo\": pa.list_(pa.list_(pa.int64()))}))\n    casted_array = cast_array_to_feature(arr, {\"foo\": List(List(Value(\"int32\")))})\n    assert casted_array.type == pa.struct({\"foo\": pa.list_(pa.list_(pa.int32()))})\n    assert casted_array.to_pylist() == [{\"foo\": [None, [0]]}]\n\n\ndef test_cast_array_to_features_to_null_type():\n    # same type\n    arr = pa.array([[None, None]])\n    assert cast_array_to_feature(arr, List(Value(\"null\"))).type == pa.list_(pa.null())\n\n    # different type\n    arr = pa.array([[None, 1]])\n    with pytest.raises(TypeError):\n        cast_array_to_feature(arr, 
List(Value(\"null\")))\n\n\ndef test_cast_array_to_features_array_xd():\n    # same storage type\n    arr = pa.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]], pa.list_(pa.list_(pa.int32(), 2), 2))\n    casted_array = cast_array_to_feature(arr, Array2D(shape=(2, 2), dtype=\"int32\"))\n    assert casted_array.type == Array2DExtensionType(shape=(2, 2), dtype=\"int32\")\n    # different storage type\n    casted_array = cast_array_to_feature(arr, Array2D(shape=(2, 2), dtype=\"float32\"))\n    assert casted_array.type == Array2DExtensionType(shape=(2, 2), dtype=\"float32\")\n\n\ndef test_cast_array_to_features_sequence_classlabel():\n    arr = pa.array([[], [1], [0, 1]], pa.list_(pa.int64()))\n    assert cast_array_to_feature(arr, List(ClassLabel(names=[\"foo\", \"bar\"]))).type == pa.list_(pa.int64())\n\n    arr = pa.array([[], [\"bar\"], [\"foo\", \"bar\"]], pa.list_(pa.string()))\n    assert cast_array_to_feature(arr, List(ClassLabel(names=[\"foo\", \"bar\"]))).type == pa.list_(pa.int64())\n\n    # Test empty arrays\n    arr = pa.array([[], []], pa.list_(pa.int64()))\n    assert cast_array_to_feature(arr, List(ClassLabel(names=[\"foo\", \"bar\"]))).type == pa.list_(pa.int64())\n\n    arr = pa.array([[], []], pa.list_(pa.string()))\n    assert cast_array_to_feature(arr, List(ClassLabel(names=[\"foo\", \"bar\"]))).type == pa.list_(pa.int64())\n\n    # Test invalid class labels\n    arr = pa.array([[2]], pa.list_(pa.int64()))\n    with pytest.raises(ValueError):\n        assert cast_array_to_feature(arr, List(ClassLabel(names=[\"foo\", \"bar\"])))\n\n    arr = pa.array([[\"baz\"]], pa.list_(pa.string()))\n    with pytest.raises(ValueError):\n        assert cast_array_to_feature(arr, List(ClassLabel(names=[\"foo\", \"bar\"])))\n\n\n@pytest.mark.parametrize(\n    \"arr\",\n    [\n        pa.array([[0, 1, 2], [3, None, 5], None, [6, 7, 8], None], pa.list_(pa.int32(), 3)),\n    ],\n)\n@pytest.mark.parametrize(\"slice\", [None, slice(1, None), slice(-1), slice(1, 3), slice(2, 3), 
slice(1, 1)])\n@pytest.mark.parametrize(\"target_value_feature\", [Value(\"int64\")])\ndef test_cast_fixed_size_list_array_to_features_sequence(arr, slice, target_value_feature):\n    arr = arr if slice is None else arr[slice]\n    # Fixed size list\n    casted_array = cast_array_to_feature(arr, List(target_value_feature, length=arr.type.list_size))\n    assert casted_array.type == get_nested_type(List(target_value_feature, length=arr.type.list_size))\n    assert casted_array.to_pylist() == arr.to_pylist()\n    with pytest.raises(TypeError):\n        cast_array_to_feature(arr, List(target_value_feature, length=arr.type.list_size + 1))\n    # Variable size list\n    casted_array = cast_array_to_feature(arr, List(target_value_feature))\n    assert casted_array.type == get_nested_type(List(target_value_feature))\n    assert casted_array.to_pylist() == arr.to_pylist()\n    casted_array = cast_array_to_feature(arr, List(target_value_feature))\n    assert casted_array.type == get_nested_type(List(target_value_feature))\n    assert casted_array.to_pylist() == arr.to_pylist()\n\n\n@pytest.mark.parametrize(\n    \"arr\",\n    [\n        pa.array([[0, 1, 2], [3, None, 5], None, [6, 7, 8], None], pa.list_(pa.int32())),\n    ],\n)\n@pytest.mark.parametrize(\"slice\", [None, slice(1, None), slice(-1), slice(1, 3), slice(2, 3), slice(1, 1)])\n@pytest.mark.parametrize(\"target_value_feature\", [Value(\"int64\")])\ndef test_cast_list_array_to_features_sequence(arr, slice, target_value_feature):\n    arr = arr if slice is None else arr[slice]\n    # Variable size list\n    casted_array = cast_array_to_feature(arr, List(target_value_feature))\n    assert casted_array.type == get_nested_type(List(target_value_feature))\n    assert casted_array.to_pylist() == arr.to_pylist()\n    casted_array = cast_array_to_feature(arr, List(target_value_feature))\n    assert casted_array.type == get_nested_type(List(target_value_feature))\n    assert casted_array.to_pylist() == arr.to_pylist()\n    
# Fixed size list\n    list_size = arr.value_lengths().drop_null()[0].as_py() if arr.value_lengths().drop_null() else 2\n    casted_array = cast_array_to_feature(arr, List(target_value_feature, length=list_size))\n    assert casted_array.type == get_nested_type(List(target_value_feature, length=list_size))\n    assert casted_array.to_pylist() == arr.to_pylist()\n\n\n@pytest.mark.parametrize(\"sequence_feature_dtype\", [\"string\", \"int64\"])\n@pytest.mark.parametrize(\"from_list_type\", [\"list\", \"fixed_size_list\", \"large_list\"])\n@pytest.mark.parametrize(\"list_within_struct\", [False, True])\ndef test_cast_array_to_feature_with_list_array_and_sequence_feature(\n    list_within_struct, from_list_type, sequence_feature_dtype\n):\n    list_feature = {\n        \"list\": List,\n        \"fixed_size_list\": partial(List, length=2),\n        \"large_list\": LargeList,\n    }\n    list_type = {\n        \"list\": pa.list_,\n        \"fixed_size_list\": partial(pa.list_, list_size=2),\n        \"large_list\": pa.large_list,\n    }\n    primitive_type = {\n        \"string\": pa.string(),\n        \"int64\": pa.int64(),\n    }\n    to_type = \"list\"\n    array_data = [0, 1]\n    array_type = list_type[from_list_type](pa.int64())\n    sequence_feature = list_feature[from_list_type](Value(sequence_feature_dtype))\n    expected_array_type = list_type[from_list_type](primitive_type[sequence_feature_dtype])\n    if list_within_struct:\n        array_data = {\"col_1\": array_data}\n        array_type = pa.struct({\"col_1\": array_type})\n        sequence_feature = {\"col_1\": sequence_feature}\n        expected_array_type = pa.struct({\"col_1\": expected_array_type})\n    array_data = [array_data] * 2\n    array_type = list_type[from_list_type](array_type)\n    feature = list_feature[to_type](sequence_feature)\n    expected_array_type = list_type[to_type](expected_array_type)\n    array = pa.array([array_data], type=array_type)\n    cast_array = 
cast_array_to_feature(array, feature)\n    assert cast_array.type == expected_array_type\n\n\n@pytest.mark.parametrize(\"large_list_feature_value_type\", [\"string\", \"int64\"])\n@pytest.mark.parametrize(\"from_list_type\", [\"list\", \"fixed_size_list\", \"large_list\"])\ndef test_cast_array_to_feature_with_list_array_and_large_list_feature(from_list_type, large_list_feature_value_type):\n    list_type = {\n        \"list\": pa.list_,\n        \"fixed_size_list\": partial(pa.list_, list_size=2),\n        \"large_list\": pa.large_list,\n    }\n    primitive_type = {\n        \"string\": pa.string(),\n        \"int64\": pa.int64(),\n    }\n    to_type = \"large_list\"\n    array_data = [0, 1]\n    array_type = list_type[from_list_type](pa.int64())\n    large_list_feature_value = Value(large_list_feature_value_type)\n    expected_array_type = list_type[to_type](primitive_type[large_list_feature_value_type])\n    feature = LargeList(large_list_feature_value)\n    array = pa.array([array_data], type=array_type)\n    cast_array = cast_array_to_feature(array, feature)\n    assert cast_array.type == expected_array_type\n\n\ndef test_cast_array_xd_to_features_sequence():\n    arr = np.random.randint(0, 10, size=(8, 2, 3)).tolist()\n    arr = Array2DExtensionType(shape=(2, 3), dtype=\"int64\").wrap_array(pa.array(arr, pa.list_(pa.list_(pa.int64()))))\n    arr = pa.ListArray.from_arrays([0, None, 4, 8], arr)\n    # Variable size list\n    casted_array = cast_array_to_feature(arr, List(Array2D(shape=(2, 3), dtype=\"int32\")))\n    assert casted_array.type == get_nested_type(List(Array2D(shape=(2, 3), dtype=\"int32\")))\n    assert casted_array.to_pylist() == arr.to_pylist()\n    # Fixed size list\n    casted_array = cast_array_to_feature(arr, List(Array2D(shape=(2, 3), dtype=\"int32\"), length=4))\n    assert casted_array.type == get_nested_type(List(Array2D(shape=(2, 3), dtype=\"int32\"), length=4))\n    assert casted_array.to_pylist() == arr.to_pylist()\n\n\ndef 
test_embed_array_storage(image_file):\n    array = pa.array([{\"bytes\": None, \"path\": image_file}], type=Image.pa_type)\n    embedded_images_array = embed_array_storage(array, Image())\n    assert isinstance(embedded_images_array.to_pylist()[0][\"path\"], str)\n    assert embedded_images_array.to_pylist()[0][\"path\"] == \"test_image_rgb.jpg\"\n    assert isinstance(embedded_images_array.to_pylist()[0][\"bytes\"], bytes)\n\n\ndef test_embed_array_storage_nested(image_file):\n    array = pa.array([[{\"bytes\": None, \"path\": image_file}]], type=pa.list_(Image.pa_type))\n    embedded_images_array = embed_array_storage(array, List(Image()))\n    assert isinstance(embedded_images_array.to_pylist()[0][0][\"path\"], str)\n    assert isinstance(embedded_images_array.to_pylist()[0][0][\"bytes\"], bytes)\n    array = pa.array([{\"foo\": {\"bytes\": None, \"path\": image_file}}], type=pa.struct({\"foo\": Image.pa_type}))\n    embedded_images_array = embed_array_storage(array, {\"foo\": Image()})\n    assert isinstance(embedded_images_array.to_pylist()[0][\"foo\"][\"path\"], str)\n    assert isinstance(embedded_images_array.to_pylist()[0][\"foo\"][\"bytes\"], bytes)\n\n\n@pytest.mark.parametrize(\n    \"array, feature, expected_embedded_array_type\",\n    [\n        (\n            pa.array([[{\"path\": \"image_path\"}]], type=pa.list_(Image.pa_type)),\n            List(Image()),\n            pa.types.is_list,\n        ),\n        (\n            pa.array([[{\"path\": \"image_path\"}]], type=pa.large_list(Image.pa_type)),\n            LargeList(Image()),\n            pa.types.is_large_list,\n        ),\n    ],\n)\ndef test_embed_array_storage_with_list_types(array, feature, expected_embedded_array_type, monkeypatch):\n    mock_embed_storage = MagicMock(\n        return_value=pa.StructArray.from_arrays(\n            [pa.array([b\"image_bytes\"], type=pa.binary()), pa.array([\"image_path\"], type=pa.string())],\n            [\"bytes\", \"path\"],\n        )\n    )\n    
monkeypatch.setattr(Image, \"embed_storage\", mock_embed_storage)\n    embedded_images_array = embed_array_storage(array, feature)\n    assert expected_embedded_array_type(embedded_images_array.type)\n    assert embedded_images_array.to_pylist() == [[{\"bytes\": b\"image_bytes\", \"path\": \"image_path\"}]]\n\n\ndef test_embed_table_storage(image_file):\n    features = Features({\"image\": Image()})\n    table = table_cast(pa.table({\"image\": [image_file]}), features.arrow_schema)\n    embedded_images_table = embed_table_storage(table)\n    assert isinstance(embedded_images_table.to_pydict()[\"image\"][0][\"path\"], str)\n    assert isinstance(embedded_images_table.to_pydict()[\"image\"][0][\"bytes\"], bytes)\n\n\n@pytest.mark.parametrize(\n    \"table\",\n    [\n        InMemoryTable(pa.table({\"foo\": range(10)})),\n        InMemoryTable(pa.concat_tables([pa.table({\"foo\": range(0, 5)}), pa.table({\"foo\": range(5, 10)})])),\n        InMemoryTable(pa.concat_tables([pa.table({\"foo\": [i]}) for i in range(10)])),\n    ],\n)\n@pytest.mark.parametrize(\"batch_size\", [1, 2, 3, 9, 10, 11, 20])\n@pytest.mark.parametrize(\"drop_last_batch\", [False, True])\ndef test_table_iter(table, batch_size, drop_last_batch):\n    num_rows = len(table) if not drop_last_batch else len(table) // batch_size * batch_size\n    num_batches = (num_rows // batch_size) + 1 if num_rows % batch_size else num_rows // batch_size\n    subtables = list(table_iter(table, batch_size=batch_size, drop_last_batch=drop_last_batch))\n    assert len(subtables) == num_batches\n    if drop_last_batch:\n        assert all(len(subtable) == batch_size for subtable in subtables)\n    else:\n        assert all(len(subtable) == batch_size for subtable in subtables[:-1])\n        assert len(subtables[-1]) <= batch_size\n    if num_rows > 0:\n        reloaded = pa.concat_tables(subtables)\n        assert table.slice(0, num_rows).to_pydict() == reloaded.to_pydict()\n\n\n@pytest.mark.parametrize(\"to_type\", 
[\"list\", \"fixed_size_list\", \"large_list\"])\n@pytest.mark.parametrize(\"from_type\", [\"list\", \"fixed_size_list\", \"large_list\"])\ndef test_array_cast(from_type, to_type):\n    array_type = {\n        \"list\": pa.list_(pa.int64()),\n        \"fixed_size_list\": pa.list_(pa.int64(), 2),\n        \"large_list\": pa.large_list(pa.int64()),\n    }\n    arr = pa.array([[0, 1]], type=array_type[from_type])\n    cast_arr = array_cast(arr, array_type[to_type])\n    assert cast_arr.type == array_type[to_type]\n    assert cast_arr.values == arr.values\n\n\ndef test_cast_table_to_schema_with_missing_fields():\n    table = pa.table({\"age\": [25, 63]})\n    schema = pa.schema({\"age\": pa.int32(), \"name\": pa.string()})\n    cast_table = cast_table_to_schema(table, schema)\n    assert cast_table.schema == pa.schema({\"age\": pa.int32(), \"name\": pa.string()})\n    assert cast_table.to_pydict() == {\"age\": [25, 63], \"name\": [None, None]}\n"
  },
  {
    "path": "tests/test_tqdm.py",
    "content": "import unittest\nfrom unittest.mock import patch\n\nimport pytest\nfrom pytest import CaptureFixture\n\nfrom datasets.utils import (\n    are_progress_bars_disabled,\n    disable_progress_bars,\n    enable_progress_bars,\n    tqdm,\n)\n\n\nclass TestTqdmUtils(unittest.TestCase):\n    @pytest.fixture(autouse=True)\n    def capsys(self, capsys: CaptureFixture) -> None:\n        \"\"\"Workaround to make capsys work in unittest framework.\n\n        Capsys is a convenient pytest fixture to capture stdout.\n        See https://waylonwalker.com/pytest-capsys/.\n\n        Taken from https://github.com/pytest-dev/pytest/issues/2504#issuecomment-309475790.\n        \"\"\"\n        self.capsys = capsys\n\n    def setUp(self) -> None:\n        \"\"\"Get verbosity to set it back after the tests.\"\"\"\n        self._previous_are_progress_bars_disabled = are_progress_bars_disabled()\n        return super().setUp()\n\n    def tearDown(self) -> None:\n        \"\"\"Set back progress bars verbosity as before testing.\"\"\"\n        if self._previous_are_progress_bars_disabled:\n            disable_progress_bars()\n        else:\n            enable_progress_bars()\n\n    @patch(\"datasets.utils._tqdm.HF_DATASETS_DISABLE_PROGRESS_BARS\", None)\n    def test_tqdm_helpers(self) -> None:\n        \"\"\"Test helpers to enable/disable progress bars.\"\"\"\n        disable_progress_bars()\n        self.assertTrue(are_progress_bars_disabled())\n\n        enable_progress_bars()\n        self.assertFalse(are_progress_bars_disabled())\n\n    @patch(\"datasets.utils._tqdm.HF_DATASETS_DISABLE_PROGRESS_BARS\", True)\n    def test_cannot_enable_tqdm_when_env_variable_is_set(self) -> None:\n        \"\"\"\n        Test helpers cannot enable/disable progress bars when\n        `HF_DATASETS_DISABLE_PROGRESS_BARS` is set.\n        \"\"\"\n        disable_progress_bars()\n        self.assertTrue(are_progress_bars_disabled())\n\n        with self.assertWarns(UserWarning):\n            
enable_progress_bars()\n        self.assertTrue(are_progress_bars_disabled())  # Still disabled !\n\n    @patch(\"datasets.utils._tqdm.HF_DATASETS_DISABLE_PROGRESS_BARS\", False)\n    def test_cannot_disable_tqdm_when_env_variable_is_set(self) -> None:\n        \"\"\"\n        Test helpers cannot enable/disable progress bars when\n        `HF_DATASETS_DISABLE_PROGRESS_BARS` is set.\n        \"\"\"\n        enable_progress_bars()\n        self.assertFalse(are_progress_bars_disabled())\n\n        with self.assertWarns(UserWarning):\n            disable_progress_bars()\n        self.assertFalse(are_progress_bars_disabled())  # Still enabled !\n\n    @patch(\"datasets.utils._tqdm.HF_DATASETS_DISABLE_PROGRESS_BARS\", None)\n    def test_tqdm_disabled(self) -> None:\n        \"\"\"Test TQDM not outputting anything when globally disabled.\"\"\"\n        disable_progress_bars()\n        for _ in tqdm(range(10)):\n            pass\n\n        captured = self.capsys.readouterr()\n        self.assertEqual(captured.out, \"\")\n        self.assertEqual(captured.err, \"\")\n\n    @patch(\"datasets.utils._tqdm.HF_DATASETS_DISABLE_PROGRESS_BARS\", None)\n    def test_tqdm_disabled_cannot_be_forced(self) -> None:\n        \"\"\"Test TQDM cannot be forced when globally disabled.\"\"\"\n        disable_progress_bars()\n        for _ in tqdm(range(10), disable=False):\n            pass\n\n        captured = self.capsys.readouterr()\n        self.assertEqual(captured.out, \"\")\n        self.assertEqual(captured.err, \"\")\n\n    @patch(\"datasets.utils._tqdm.HF_DATASETS_DISABLE_PROGRESS_BARS\", None)\n    def test_tqdm_can_be_disabled_when_globally_enabled(self) -> None:\n        \"\"\"Test TQDM can still be locally disabled even when globally enabled.\"\"\"\n        enable_progress_bars()\n        for _ in tqdm(range(10), disable=True):\n            pass\n\n        captured = self.capsys.readouterr()\n        self.assertEqual(captured.out, \"\")\n        self.assertEqual(captured.err, 
\"\")\n\n    @patch(\"datasets.utils._tqdm.HF_DATASETS_DISABLE_PROGRESS_BARS\", None)\n    def test_tqdm_enabled(self) -> None:\n        \"\"\"Test TQDM work normally when globally enabled.\"\"\"\n        enable_progress_bars()\n        for _ in tqdm(range(10)):\n            pass\n\n        captured = self.capsys.readouterr()\n        self.assertEqual(captured.out, \"\")\n        self.assertIn(\"10/10\", captured.err)  # tqdm log\n"
  },
  {
    "path": "tests/test_upstream_hub.py",
    "content": "import fnmatch\nimport gc\nimport os\nimport shutil\nimport tempfile\nimport textwrap\nimport time\nfrom io import BytesIO\nfrom pathlib import Path\nfrom unittest.mock import patch\n\nimport numpy as np\nimport pytest\nfrom huggingface_hub import DatasetCard, HfApi\n\nfrom datasets import (\n    Audio,\n    ClassLabel,\n    Dataset,\n    DatasetDict,\n    DownloadManager,\n    Features,\n    Image,\n    IterableDatasetDict,\n    List,\n    Value,\n    load_dataset,\n    load_dataset_builder,\n)\nfrom datasets.config import METADATA_CONFIGS_FIELD\nfrom datasets.data_files import get_data_patterns\nfrom datasets.exceptions import DatasetNotFoundError\nfrom datasets.packaged_modules.folder_based_builder.folder_based_builder import (\n    FolderBasedBuilder,\n    FolderBasedBuilderConfig,\n)\nfrom datasets.utils.file_utils import cached_path\nfrom datasets.utils.hub import hf_dataset_url\n\nfrom .fixtures.hub import CI_HUB_ENDPOINT, CI_HUB_USER, CI_HUB_USER_TOKEN\nfrom .utils import (\n    for_all_test_methods,\n    require_buckets_support_in_huggingface_hub,\n    require_pil,\n    require_torchcodec,\n    xfail_if_500_502_http_error,\n)\n\n\npytestmark = pytest.mark.integration\n\n\n@for_all_test_methods(xfail_if_500_502_http_error)\n@pytest.mark.usefixtures(\"ci_hub_config\")\nclass TestPushToHub:\n    _api = HfApi(endpoint=CI_HUB_ENDPOINT)\n    _token = CI_HUB_USER_TOKEN\n\n    def test_push_dataset_dict_to_hub_no_token(self, temporary_repo, set_ci_hub_access_token):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert 
local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\"))\n            assert files == [\".gitattributes\", \"README.md\", \"data/train-00000-of-00001.parquet\"]\n\n    def test_push_dataset_dict_to_hub_name_without_namespace(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name.split(\"/\")[-1], token=self._token)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\"))\n            assert files == [\".gitattributes\", \"README.md\", \"data/train-00000-of-00001.parquet\"]\n\n    def test_push_dataset_dict_to_hub_datasets_with_different_features(self, cleanup_repo):\n        ds_train = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n        ds_test = Dataset.from_dict({\"x\": [True, False, True], \"y\": [\"a\", \"b\", \"c\"]})\n\n        local_ds = DatasetDict({\"train\": ds_train, \"test\": ds_test})\n\n        ds_name = f\"{CI_HUB_USER}/test-{int(time.time() * 10e6)}\"\n        try:\n            with pytest.raises(ValueError):\n                local_ds.push_to_hub(ds_name.split(\"/\")[-1], token=self._token)\n        except AssertionError:\n            cleanup_repo(ds_name)\n            raise\n\n    def test_push_dataset_dict_to_hub_private(self, 
temporary_repo):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, token=self._token, private=True)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\", token=self._token)\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\", token=self._token))\n            assert files == [\".gitattributes\", \"README.md\", \"data/train-00000-of-00001.parquet\"]\n\n    def test_push_dataset_dict_to_hub(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, token=self._token)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\", token=self._token))\n            assert files == [\".gitattributes\", \"README.md\", \"data/train-00000-of-00001.parquet\"]\n\n    def test_push_dataset_dict_to_hub_with_pull_request(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n 
       local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, token=self._token, create_pr=True)\n            hub_ds = load_dataset(ds_name, revision=\"refs/pr/1\", download_mode=\"force_redownload\")\n\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n            assert list(local_ds.keys()) == list(hub_ds.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(\n                self._api.list_repo_files(ds_name, revision=\"refs/pr/1\", repo_type=\"dataset\", token=self._token)\n            )\n            assert files == [\".gitattributes\", \"README.md\", \"data/train-00000-of-00001.parquet\"]\n\n    def test_push_dataset_dict_to_hub_with_revision(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, token=self._token, revision=\"dev\")\n            hub_ds = load_dataset(ds_name, revision=\"dev\", download_mode=\"force_redownload\")\n\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n            assert list(local_ds.keys()) == list(hub_ds.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(self._api.list_repo_files(ds_name, revision=\"dev\", repo_type=\"dataset\", token=self._token))\n            assert files == [\".gitattributes\", \"README.md\", \"data/train-00000-of-00001.parquet\"]\n\n    def test_push_dataset_dict_to_hub_multiple_files(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": list(range(1000)), \"y\": 
list(range(1000))})\n\n        local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            with patch(\"datasets.config.MAX_SHARD_SIZE\", \"16KB\"):\n                local_ds.push_to_hub(ds_name, token=self._token)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there are two files on the repository that have the correct name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\", token=self._token))\n            assert files == [\n                \".gitattributes\",\n                \"README.md\",\n                \"data/train-00000-of-00002.parquet\",\n                \"data/train-00001-of-00002.parquet\",\n            ]\n\n    def test_push_dataset_dict_to_hub_multiple_files_with_max_shard_size(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": list(range(1000)), \"y\": list(range(1000))})\n\n        local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=\"16KB\")\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there are two files on the repository that have the correct name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\", token=self._token))\n            assert files == [\n                \".gitattributes\",\n                
\"README.md\",\n                \"data/train-00000-of-00002.parquet\",\n                \"data/train-00001-of-00002.parquet\",\n            ]\n\n    def test_push_dataset_dict_to_hub_multiple_files_with_num_shards(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": list(range(1000)), \"y\": list(range(1000))})\n\n        local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, token=self._token, num_shards={\"train\": 2})\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there are two files on the repository that have the correct name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\", token=self._token))\n            assert files == [\n                \".gitattributes\",\n                \"README.md\",\n                \"data/train-00000-of-00002.parquet\",\n                \"data/train-00001-of-00002.parquet\",\n            ]\n\n    def test_push_dataset_dict_to_hub_with_multiple_commits(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": list(range(1000)), \"y\": list(range(1000))})\n\n        local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            self._api.create_repo(ds_name, token=self._token, repo_type=\"dataset\")\n            num_commits_before_push = len(self._api.list_repo_commits(ds_name, repo_type=\"dataset\", token=self._token))\n            with (\n                patch(\"datasets.config.MAX_SHARD_SIZE\", \"16KB\"),\n                patch(\"datasets.config.UPLOADS_MAX_NUMBER_PER_COMMIT\", 1),\n            ):\n                local_ds.push_to_hub(ds_name, token=self._token)\n      
      hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there are two files on the repository that have the correct name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\", token=self._token))\n            assert files == [\n                \".gitattributes\",\n                \"README.md\",\n                \"data/train-00000-of-00002.parquet\",\n                \"data/train-00001-of-00002.parquet\",\n            ]\n\n            num_commits_after_push = len(self._api.list_repo_commits(ds_name, repo_type=\"dataset\", token=self._token))\n            assert num_commits_after_push - num_commits_before_push > 1\n\n    def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": list(range(1000)), \"y\": list(range(1000))})\n        ds2 = Dataset.from_dict({\"x\": list(range(100)), \"y\": list(range(100))})\n\n        local_ds = DatasetDict({\"train\": ds, \"random\": ds2})\n\n        # Push to hub two times, but the second time with a larger amount of files.\n        # Verify that the new files contain the correct dataset.\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, token=self._token)\n\n            with tempfile.TemporaryDirectory() as tmp:\n                # Add a file starting with \"data\" to ensure it doesn't get deleted.\n                path = Path(tmp) / \"datafile.txt\"\n                with open(path, \"w\") as f:\n                    f.write(\"Bogus file\")\n\n                self._api.upload_file(\n                    path_or_fileobj=str(path),\n                    path_in_repo=\"datafile.txt\",\n                    repo_id=ds_name,\n     
               repo_type=\"dataset\",\n                    token=self._token,\n                )\n\n            local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)\n\n            # Ensure that there are two files on the repository that have the correct name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\", token=self._token))\n            assert files == [\n                \".gitattributes\",\n                \"README.md\",\n                \"data/random-00000-of-00001.parquet\",\n                \"data/train-00000-of-00002.parquet\",\n                \"data/train-00001-of-00002.parquet\",\n                \"datafile.txt\",\n            ]\n\n            self._api.delete_file(\"datafile.txt\", repo_id=ds_name, repo_type=\"dataset\", token=self._token)\n\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n        del hub_ds\n\n        # To ensure the reference to the memory-mapped Arrow file is dropped to avoid the PermissionError on Windows\n        gc.collect()\n\n        # Push to hub two times, but the second time with fewer files.\n        # Verify that the new files contain the correct dataset and that non-necessary files have been deleted.\n        with temporary_repo(ds_name):\n            local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)\n\n            with tempfile.TemporaryDirectory() as tmp:\n                # Add a file starting with \"data\" to ensure it doesn't get deleted.\n                path = Path(tmp) / \"datafile.txt\"\n                with open(path, \"w\") as f:\n                    f.write(\"Bogus file\")\n\n                self._api.upload_file(\n                    
path_or_fileobj=str(path),\n                    path_in_repo=\"datafile.txt\",\n                    repo_id=ds_name,\n                    repo_type=\"dataset\",\n                    token=self._token,\n                )\n\n            local_ds.push_to_hub(ds_name, token=self._token)\n\n            # Ensure that there are two files on the repository that have the correct name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\", token=self._token))\n            assert files == [\n                \".gitattributes\",\n                \"README.md\",\n                \"data/random-00000-of-00001.parquet\",\n                \"data/train-00000-of-00001.parquet\",\n                \"datafile.txt\",\n            ]\n\n            # Keeping the \"datafile.txt\" breaks the load_dataset to think it's a text-based dataset\n            self._api.delete_file(\"datafile.txt\", repo_id=ds_name, repo_type=\"dataset\", token=self._token)\n\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n    @require_buckets_support_in_huggingface_hub\n    def test_push_dataset_dict_to_hub_bucket(self, temporary_bucket):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_bucket() as bucket_id:\n            ds_location = \"buckets/\" + bucket_id\n            local_ds.push_to_hub(ds_location, token=self._token)\n            hub_ds = load_dataset(ds_location, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert 
local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(item.path for item in self._api.list_bucket_tree(bucket_id, token=self._token))\n            assert files == [\"README.md\", \"data/train-00000-of-00001.parquet\"]\n\n    @require_buckets_support_in_huggingface_hub\n    def test_push_dataset_dict_to_hub_bucket_inside_dir(self, temporary_bucket):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_bucket() as bucket_id:\n            ds_location = \"buckets/\" + bucket_id + \"/my-dir\"\n            local_ds.push_to_hub(ds_location, token=self._token)\n            hub_ds = load_dataset(ds_location, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(\n                item.path for item in self._api.list_bucket_tree(bucket_id, prefix=\"my-dir\", token=self._token)\n            )\n            assert files == [\"my-dir/README.md\", \"my-dir/data/train-00000-of-00001.parquet\"]\n\n    @require_buckets_support_in_huggingface_hub\n    def test_push_dataset_to_hub_bucket(self, temporary_bucket):\n        local_ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        with temporary_bucket() as bucket_id:\n            ds_location = \"buckets/\" + bucket_id\n            local_ds.push_to_hub(ds_location, token=self._token)\n            hub_ds = load_dataset(ds_location, download_mode=\"force_redownload\", split=\"train\")\n\n            assert local_ds.column_names == hub_ds.column_names\n 
           assert list(local_ds.features.keys()) == list(hub_ds.features.keys())\n            assert local_ds.features == hub_ds.features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(item.path for item in self._api.list_bucket_tree(bucket_id, token=self._token))\n            assert files == [\"README.md\", \"data/train-00000-of-00001.parquet\"]\n\n    @require_buckets_support_in_huggingface_hub\n    def test_push_dataset_to_hub_bucket_inside_dir(self, temporary_bucket):\n        local_ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        with temporary_bucket() as bucket_id:\n            ds_location = \"buckets/\" + bucket_id + \"/my-dir\"\n            local_ds.push_to_hub(ds_location, token=self._token)\n            hub_ds = load_dataset(ds_location, download_mode=\"force_redownload\", split=\"train\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds.features.keys()) == list(hub_ds.features.keys())\n            assert local_ds.features == hub_ds.features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(\n                item.path for item in self._api.list_bucket_tree(bucket_id, prefix=\"my-dir\", token=self._token)\n            )\n            assert files == [\"my-dir/README.md\", \"my-dir/data/train-00000-of-00001.parquet\"]\n\n    def test_push_dataset_to_hub(self, temporary_repo):\n        local_ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, split=\"train\", token=self._token)\n            local_ds_dict = {\"train\": local_ds}\n            hub_ds_dict = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert list(local_ds_dict.keys()) == list(hub_ds_dict.keys())\n\n            for ds_split_name in 
local_ds_dict.keys():\n                local_ds = local_ds_dict[ds_split_name]\n                hub_ds = hub_ds_dict[ds_split_name]\n                assert local_ds.column_names == hub_ds.column_names\n                assert list(local_ds.features.keys()) == list(hub_ds.features.keys())\n                assert local_ds.features == hub_ds.features\n\n    def test_push_dataset_to_hub_custom_features(self, temporary_repo):\n        features = Features({\"x\": Value(\"int64\"), \"y\": ClassLabel(names=[\"neg\", \"pos\"])})\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [0, 0, 1]}, features=features)\n\n        with temporary_repo() as ds_name:\n            ds.push_to_hub(ds_name, token=self._token)\n            hub_ds = load_dataset(ds_name, split=\"train\", download_mode=\"force_redownload\")\n\n            assert ds.column_names == hub_ds.column_names\n            assert list(ds.features.keys()) == list(hub_ds.features.keys())\n            assert ds.features == hub_ds.features\n            assert ds[:] == hub_ds[:]\n\n    @require_torchcodec\n    @require_torchcodec\n    def test_push_dataset_to_hub_custom_features_audio(self, temporary_repo):\n        audio_path = os.path.join(os.path.dirname(__file__), \"features\", \"data\", \"test_audio_44100.wav\")\n        data = {\"x\": [audio_path, None], \"y\": [0, -1]}\n        features = Features({\"x\": Audio(), \"y\": Value(\"int32\")})\n        ds = Dataset.from_dict(data, features=features)\n\n        for embed_external_files in [True, False]:\n            with temporary_repo() as ds_name:\n                ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)\n                hub_ds = load_dataset(ds_name, split=\"train\", download_mode=\"force_redownload\")\n\n                assert ds.column_names == hub_ds.column_names\n                assert list(ds.features.keys()) == list(hub_ds.features.keys())\n                assert ds.features == hub_ds.features\n                
np.testing.assert_equal(\n                    ds[0][\"x\"].get_all_samples().data.cpu().numpy(),\n                    hub_ds[0][\"x\"].get_all_samples().data.cpu().numpy(),\n                )\n                assert ds[1] == hub_ds[1]  # don't test hub_ds[0] since audio decoding might be slightly different\n                hub_ds = hub_ds.cast_column(\"x\", Audio(decode=False))\n                elem = hub_ds[0][\"x\"]\n                path, bytes_ = elem[\"path\"], elem[\"bytes\"]\n                assert isinstance(path, str)\n                assert os.path.basename(path) == \"test_audio_44100.wav\"\n                assert bool(bytes_) == embed_external_files\n\n    @require_pil\n    def test_push_dataset_to_hub_custom_features_image(self, temporary_repo):\n        image_path = os.path.join(os.path.dirname(__file__), \"features\", \"data\", \"test_image_rgb.jpg\")\n        data = {\"x\": [image_path, None], \"y\": [0, -1]}\n        features = Features({\"x\": Image(), \"y\": Value(\"int32\")})\n        ds = Dataset.from_dict(data, features=features)\n\n        for embed_external_files in [True, False]:\n            with temporary_repo() as ds_name:\n                ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)\n                hub_ds = load_dataset(ds_name, split=\"train\", download_mode=\"force_redownload\")\n\n                assert ds.column_names == hub_ds.column_names\n                assert list(ds.features.keys()) == list(hub_ds.features.keys())\n                assert ds.features == hub_ds.features\n                assert ds[:] == hub_ds[:]\n                hub_ds = hub_ds.cast_column(\"x\", Image(decode=False))\n                elem = hub_ds[0][\"x\"]\n                path, bytes_ = elem[\"path\"], elem[\"bytes\"]\n                assert isinstance(path, str)\n                assert bool(bytes_) == embed_external_files\n\n    @require_pil\n    def test_push_dataset_to_hub_custom_features_image_list(self, 
temporary_repo):\n        image_path = os.path.join(os.path.dirname(__file__), \"features\", \"data\", \"test_image_rgb.jpg\")\n        data = {\"x\": [[image_path], [image_path, image_path]], \"y\": [0, -1]}\n        features = Features({\"x\": List(Image()), \"y\": Value(\"int32\")})\n        ds = Dataset.from_dict(data, features=features)\n\n        for embed_external_files in [True, False]:\n            with temporary_repo() as ds_name:\n                ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)\n                hub_ds = load_dataset(ds_name, split=\"train\", download_mode=\"force_redownload\")\n\n                assert ds.column_names == hub_ds.column_names\n                assert list(ds.features.keys()) == list(hub_ds.features.keys())\n                assert ds.features == hub_ds.features\n                assert ds[:] == hub_ds[:]\n                hub_ds = hub_ds.cast_column(\"x\", List(Image(decode=False)))\n                elem = hub_ds[0][\"x\"][0]\n                path, bytes_ = elem[\"path\"], elem[\"bytes\"]\n                assert isinstance(path, str)\n                assert bool(bytes_) == embed_external_files\n\n    def test_push_dataset_dict_to_hub_custom_features(self, temporary_repo):\n        features = Features({\"x\": Value(\"int64\"), \"y\": ClassLabel(names=[\"neg\", \"pos\"])})\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [0, 0, 1]}, features=features)\n\n        local_ds = DatasetDict({\"test\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, token=self._token)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"test\"].features.keys()) == list(hub_ds[\"test\"].features.keys())\n            assert local_ds[\"test\"].features == hub_ds[\"test\"].features\n\n    def test_push_dataset_to_hub_custom_splits(self, 
temporary_repo):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        with temporary_repo() as ds_name:\n            ds.push_to_hub(ds_name, split=\"random\", token=self._token)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert ds.column_names == hub_ds[\"random\"].column_names\n            assert list(ds.features.keys()) == list(hub_ds[\"random\"].features.keys())\n            assert ds.features == hub_ds[\"random\"].features\n\n    def test_push_dataset_to_hub_multiple_splits_one_by_one(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n        with temporary_repo() as ds_name:\n            ds.push_to_hub(ds_name, split=\"train\", token=self._token)\n            ds.push_to_hub(ds_name, split=\"test\", token=self._token)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n            assert sorted(hub_ds) == [\"test\", \"train\"]\n            assert ds.column_names == hub_ds[\"train\"].column_names\n            assert list(ds.features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert ds.features == hub_ds[\"train\"].features\n\n    def test_push_dataset_dict_to_hub_custom_splits(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        local_ds = DatasetDict({\"random\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, token=self._token)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"random\"].features.keys()) == list(hub_ds[\"random\"].features.keys())\n            assert local_ds[\"random\"].features == hub_ds[\"random\"].features\n\n    def test_push_multiple_dataset_configs_to_hub_load_dataset_builder(self, temporary_repo):\n        ds_default = 
Dataset.from_dict({\"a\": [0], \"b\": [1]})\n        ds_config1 = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n        ds_config2 = Dataset.from_dict({\"foo\": [1, 2], \"bar\": [4, 5]})\n\n        with temporary_repo() as ds_name:\n            ds_default.push_to_hub(ds_name, token=self._token)\n            ds_config1.push_to_hub(ds_name, \"config1\", token=self._token)\n            ds_config2.push_to_hub(ds_name, \"config2\", token=self._token)\n            ds_builder_default = load_dataset_builder(ds_name, download_mode=\"force_redownload\")  # default config\n            assert len(ds_builder_default.BUILDER_CONFIGS) == 3\n            assert len(ds_builder_default.config.data_files[\"train\"]) == 1\n            assert fnmatch.fnmatch(\n                ds_builder_default.config.data_files[\"train\"][0],\n                \"*/data/train-*\",\n            )\n            ds_builder_config1 = load_dataset_builder(ds_name, \"config1\", download_mode=\"force_redownload\")\n            assert len(ds_builder_config1.BUILDER_CONFIGS) == 3\n            assert len(ds_builder_config1.config.data_files[\"train\"]) == 1\n            assert fnmatch.fnmatch(\n                ds_builder_config1.config.data_files[\"train\"][0],\n                \"*/config1/train-*\",\n            )\n            ds_builder_config2 = load_dataset_builder(ds_name, \"config2\", download_mode=\"force_redownload\")\n            assert len(ds_builder_config2.BUILDER_CONFIGS) == 3\n            assert len(ds_builder_config2.config.data_files[\"train\"]) == 1\n            assert fnmatch.fnmatch(\n                ds_builder_config2.config.data_files[\"train\"][0],\n                \"*/config2/train-*\",\n            )\n\n            with pytest.raises(ValueError):  # no config 'config3'\n                load_dataset_builder(ds_name, \"config3\", download_mode=\"force_redownload\")\n\n    def test_push_multiple_dataset_configs_to_hub_load_dataset(self, temporary_repo):\n        ds_default = 
Dataset.from_dict({\"a\": [0], \"b\": [1]})\n        ds_config1 = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n        ds_config2 = Dataset.from_dict({\"foo\": [1, 2], \"bar\": [4, 5]})\n\n        with temporary_repo() as ds_name:\n            ds_default.push_to_hub(ds_name, token=self._token)\n            ds_config1.push_to_hub(ds_name, \"config1\", token=self._token)\n            ds_config2.push_to_hub(ds_name, \"config2\", token=self._token)\n\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\"))\n            assert files == [\n                \".gitattributes\",\n                \"README.md\",\n                \"config1/train-00000-of-00001.parquet\",\n                \"config2/train-00000-of-00001.parquet\",\n                \"data/train-00000-of-00001.parquet\",\n            ]\n\n            hub_ds_default = load_dataset(ds_name, download_mode=\"force_redownload\")\n            hub_ds_config1 = load_dataset(ds_name, \"config1\", download_mode=\"force_redownload\")\n            hub_ds_config2 = load_dataset(ds_name, \"config2\", download_mode=\"force_redownload\")\n\n            # only \"train\" split\n            assert len(hub_ds_default) == len(hub_ds_config1) == len(hub_ds_config2) == 1\n\n            assert ds_default.column_names == hub_ds_default[\"train\"].column_names == [\"a\", \"b\"]\n            assert ds_config1.column_names == hub_ds_config1[\"train\"].column_names == [\"x\", \"y\"]\n            assert ds_config2.column_names == hub_ds_config2[\"train\"].column_names == [\"foo\", \"bar\"]\n\n            assert ds_default.features == hub_ds_default[\"train\"].features\n            assert ds_config1.features == hub_ds_config1[\"train\"].features\n            assert ds_config2.features == hub_ds_config2[\"train\"].features\n\n            assert ds_default.num_rows == hub_ds_default[\"train\"].num_rows == 1\n            assert ds_config1.num_rows == hub_ds_config1[\"train\"].num_rows == 3\n            
assert ds_config2.num_rows == hub_ds_config2[\"train\"].num_rows == 2\n\n            with pytest.raises(ValueError):  # no config 'config3'\n                load_dataset(ds_name, \"config3\", download_mode=\"force_redownload\")\n\n    @pytest.mark.parametrize(\"specific_default_config_name\", [False, True])\n    def test_push_multiple_dataset_configs_to_hub_readme_metadata_content(\n        self, specific_default_config_name, temporary_repo\n    ):\n        ds_default = Dataset.from_dict({\"a\": [0], \"b\": [2]})\n        ds_config1 = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n        ds_config2 = Dataset.from_dict({\"foo\": [1, 2], \"bar\": [4, 5]})\n\n        with temporary_repo() as ds_name:\n            if specific_default_config_name:\n                ds_default.push_to_hub(ds_name, config_name=\"config0\", set_default=True, token=self._token)\n            else:\n                ds_default.push_to_hub(ds_name, token=self._token)\n            ds_config1.push_to_hub(ds_name, \"config1\", token=self._token)\n            ds_config2.push_to_hub(ds_name, \"config2\", token=self._token)\n\n            # check that configs args was correctly pushed to README.md\n            ds_readme_path = cached_path(hf_dataset_url(ds_name, \"README.md\"))\n            dataset_card_data = DatasetCard.load(ds_readme_path).data\n            assert METADATA_CONFIGS_FIELD in dataset_card_data\n            assert isinstance(dataset_card_data[METADATA_CONFIGS_FIELD], list)\n            assert sorted(dataset_card_data[METADATA_CONFIGS_FIELD], key=lambda x: x[\"config_name\"]) == (\n                [\n                    {\n                        \"config_name\": \"config0\",\n                        \"data_files\": [\n                            {\"split\": \"train\", \"path\": \"config0/train-*\"},\n                        ],\n                        \"default\": True,\n                    },\n                ]\n                if specific_default_config_name\n              
  else []\n            ) + [\n                {\n                    \"config_name\": \"config1\",\n                    \"data_files\": [\n                        {\"split\": \"train\", \"path\": \"config1/train-*\"},\n                    ],\n                },\n                {\n                    \"config_name\": \"config2\",\n                    \"data_files\": [\n                        {\"split\": \"train\", \"path\": \"config2/train-*\"},\n                    ],\n                },\n            ] + (\n                []\n                if specific_default_config_name\n                else [\n                    {\n                        \"config_name\": \"default\",\n                        \"data_files\": [\n                            {\"split\": \"train\", \"path\": \"data/train-*\"},\n                        ],\n                    },\n                ]\n            )\n\n    def test_push_multiple_dataset_dict_configs_to_hub_load_dataset_builder(self, temporary_repo):\n        ds_default = Dataset.from_dict({\"a\": [0], \"b\": [1]})\n        ds_config1 = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n        ds_config2 = Dataset.from_dict({\"foo\": [1, 2], \"bar\": [4, 5]})\n        ds_default = DatasetDict({\"random\": ds_default})\n        ds_config1 = DatasetDict({\"random\": ds_config1})\n        ds_config2 = DatasetDict({\"random\": ds_config2})\n\n        with temporary_repo() as ds_name:\n            ds_default.push_to_hub(ds_name, token=self._token)\n            ds_config1.push_to_hub(ds_name, \"config1\", token=self._token)\n            ds_config2.push_to_hub(ds_name, \"config2\", token=self._token)\n\n            ds_builder_default = load_dataset_builder(ds_name, download_mode=\"force_redownload\")  # default config\n            assert len(ds_builder_default.BUILDER_CONFIGS) == 3\n            assert len(ds_builder_default.config.data_files[\"random\"]) == 1\n            assert fnmatch.fnmatch(\n                
ds_builder_default.config.data_files[\"random\"][0],\n                \"*/data/random-*\",\n            )\n            ds_builder_config1 = load_dataset_builder(ds_name, \"config1\", download_mode=\"force_redownload\")\n            assert len(ds_builder_config1.BUILDER_CONFIGS) == 3\n            assert len(ds_builder_config1.config.data_files[\"random\"]) == 1\n            assert fnmatch.fnmatch(\n                ds_builder_config1.config.data_files[\"random\"][0],\n                \"*/config1/random-*\",\n            )\n            ds_builder_config2 = load_dataset_builder(ds_name, \"config2\", download_mode=\"force_redownload\")\n            assert len(ds_builder_config2.BUILDER_CONFIGS) == 3\n            assert len(ds_builder_config2.config.data_files[\"random\"]) == 1\n            assert fnmatch.fnmatch(\n                ds_builder_config2.config.data_files[\"random\"][0],\n                \"*/config2/random-*\",\n            )\n            with pytest.raises(ValueError):  # no config named 'config3'\n                load_dataset_builder(ds_name, \"config3\", download_mode=\"force_redownload\")\n\n    def test_push_multiple_dataset_dict_configs_to_hub_load_dataset(self, temporary_repo):\n        ds_default = Dataset.from_dict({\"a\": [0], \"b\": [1]})\n        ds_config1 = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n        ds_config2 = Dataset.from_dict({\"foo\": [1, 2], \"bar\": [4, 5]})\n        ds_default = DatasetDict({\"train\": ds_default, \"random\": ds_default})\n        ds_config1 = DatasetDict({\"train\": ds_config1, \"random\": ds_config1})\n        ds_config2 = DatasetDict({\"train\": ds_config2, \"random\": ds_config2})\n\n        with temporary_repo() as ds_name:\n            ds_default.push_to_hub(ds_name, token=self._token)\n            ds_config1.push_to_hub(ds_name, \"config1\", token=self._token)\n            ds_config2.push_to_hub(ds_name, \"config2\", token=self._token)\n\n            files = 
sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\"))\n            assert files == [\n                \".gitattributes\",\n                \"README.md\",\n                \"config1/random-00000-of-00001.parquet\",\n                \"config1/train-00000-of-00001.parquet\",\n                \"config2/random-00000-of-00001.parquet\",\n                \"config2/train-00000-of-00001.parquet\",\n                \"data/random-00000-of-00001.parquet\",\n                \"data/train-00000-of-00001.parquet\",\n            ]\n\n            hub_ds_default = load_dataset(ds_name, download_mode=\"force_redownload\")\n            hub_ds_config1 = load_dataset(ds_name, \"config1\", download_mode=\"force_redownload\")\n            hub_ds_config2 = load_dataset(ds_name, \"config2\", download_mode=\"force_redownload\")\n\n            # two splits\n            expected_splits = [\"random\", \"train\"]\n            assert len(hub_ds_default) == len(hub_ds_config1) == len(hub_ds_config2) == 2\n            assert sorted(hub_ds_default) == sorted(hub_ds_config1) == sorted(hub_ds_config2) == expected_splits\n\n            for split in expected_splits:\n                assert ds_default[split].column_names == hub_ds_default[split].column_names == [\"a\", \"b\"]\n                assert ds_config1[split].column_names == hub_ds_config1[split].column_names == [\"x\", \"y\"]\n                assert ds_config2[split].column_names == hub_ds_config2[split].column_names == [\"foo\", \"bar\"]\n\n                assert ds_default[split].features == hub_ds_default[split].features\n                assert ds_config1[split].features == hub_ds_config1[split].features\n                assert ds_config2[split].features == hub_ds_config2[split].features\n\n                assert ds_default[split].num_rows == hub_ds_default[split].num_rows == 1\n                assert ds_config1[split].num_rows == hub_ds_config1[split].num_rows == 3\n                assert ds_config2[split].num_rows == 
hub_ds_config2[split].num_rows == 2\n\n            with pytest.raises(ValueError):  # no config 'config3'\n                load_dataset(ds_name, \"config3\", download_mode=\"force_redownload\")\n\n    @pytest.mark.parametrize(\"specific_default_config_name\", [False, True])\n    def test_push_multiple_dataset_dict_configs_to_hub_readme_metadata_content(\n        self, specific_default_config_name, temporary_repo\n    ):\n        ds_default = Dataset.from_dict({\"a\": [0], \"b\": [1]})\n        ds_config1 = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n        ds_config2 = Dataset.from_dict({\"foo\": [1, 2], \"bar\": [4, 5]})\n        ds_default = DatasetDict({\"train\": ds_default, \"random\": ds_default})\n        ds_config1 = DatasetDict({\"train\": ds_config1, \"random\": ds_config1})\n        ds_config2 = DatasetDict({\"train\": ds_config2, \"random\": ds_config2})\n\n        with temporary_repo() as ds_name:\n            if specific_default_config_name:\n                ds_default.push_to_hub(ds_name, config_name=\"config0\", set_default=True, token=self._token)\n            else:\n                ds_default.push_to_hub(ds_name, token=self._token)\n            ds_config1.push_to_hub(ds_name, \"config1\", token=self._token)\n            ds_config2.push_to_hub(ds_name, \"config2\", token=self._token)\n\n            # check that configs args was correctly pushed to README.md\n            ds_readme_path = cached_path(hf_dataset_url(ds_name, \"README.md\"))\n            dataset_card_data = DatasetCard.load(ds_readme_path).data\n            assert METADATA_CONFIGS_FIELD in dataset_card_data\n            assert isinstance(dataset_card_data[METADATA_CONFIGS_FIELD], list)\n            assert sorted(dataset_card_data[METADATA_CONFIGS_FIELD], key=lambda x: x[\"config_name\"]) == (\n                [\n                    {\n                        \"config_name\": \"config0\",\n                        \"data_files\": [\n                            {\"split\": 
\"train\", \"path\": \"config0/train-*\"},\n                            {\"split\": \"random\", \"path\": \"config0/random-*\"},\n                        ],\n                        \"default\": True,\n                    },\n                ]\n                if specific_default_config_name\n                else []\n            ) + [\n                {\n                    \"config_name\": \"config1\",\n                    \"data_files\": [\n                        {\"split\": \"train\", \"path\": \"config1/train-*\"},\n                        {\"split\": \"random\", \"path\": \"config1/random-*\"},\n                    ],\n                },\n                {\n                    \"config_name\": \"config2\",\n                    \"data_files\": [\n                        {\"split\": \"train\", \"path\": \"config2/train-*\"},\n                        {\"split\": \"random\", \"path\": \"config2/random-*\"},\n                    ],\n                },\n            ] + (\n                []\n                if specific_default_config_name\n                else [\n                    {\n                        \"config_name\": \"default\",\n                        \"data_files\": [\n                            {\"split\": \"train\", \"path\": \"data/train-*\"},\n                            {\"split\": \"random\", \"path\": \"data/random-*\"},\n                        ],\n                    },\n                ]\n            )\n\n    def test_push_dataset_to_hub_with_config_no_metadata_configs(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n        ds_another_config = Dataset.from_dict({\"foo\": [1, 2], \"bar\": [4, 5]})\n        parquet_buf = BytesIO()\n        ds.to_parquet(parquet_buf)\n        parquet_content = parquet_buf.getvalue()\n\n        with temporary_repo() as ds_name:\n            self._api.create_repo(ds_name, token=self._token, repo_type=\"dataset\")\n            # old push_to_hub was uploading the 
parquet files only - without metadata configs\n            self._api.upload_file(\n                path_or_fileobj=parquet_content,\n                path_in_repo=\"data/train-00000-of-00001.parquet\",\n                repo_id=ds_name,\n                repo_type=\"dataset\",\n                token=self._token,\n            )\n            ds_another_config.push_to_hub(ds_name, \"another_config\", token=self._token)\n            ds_builder = load_dataset_builder(ds_name, download_mode=\"force_redownload\")\n            assert len(ds_builder.config.data_files) == 1\n            assert len(ds_builder.config.data_files[\"train\"]) == 1\n            assert fnmatch.fnmatch(ds_builder.config.data_files[\"train\"][0], \"*/data/train-00000-of-00001.parquet\")\n            ds_another_config_builder = load_dataset_builder(\n                ds_name, \"another_config\", download_mode=\"force_redownload\"\n            )\n            assert len(ds_another_config_builder.config.data_files) == 1\n            assert len(ds_another_config_builder.config.data_files[\"train\"]) == 1\n            assert fnmatch.fnmatch(\n                ds_another_config_builder.config.data_files[\"train\"][0],\n                \"*/another_config/train-00000-of-00001.parquet\",\n            )\n\n    def test_push_dataset_dict_to_hub_with_config_no_metadata_configs(self, temporary_repo):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n        ds_another_config = Dataset.from_dict({\"foo\": [1, 2], \"bar\": [4, 5]})\n        parquet_buf = BytesIO()\n        ds.to_parquet(parquet_buf)\n        parquet_content = parquet_buf.getvalue()\n\n        local_ds_another_config = DatasetDict({\"random\": ds_another_config})\n\n        with temporary_repo() as ds_name:\n            self._api.create_repo(ds_name, token=self._token, repo_type=\"dataset\")\n            # old push_to_hub was uploading the parquet files only - without metadata configs\n            self._api.upload_file(\n              
  path_or_fileobj=parquet_content,\n                path_in_repo=\"data/random-00000-of-00001.parquet\",\n                repo_id=ds_name,\n                repo_type=\"dataset\",\n                token=self._token,\n            )\n            local_ds_another_config.push_to_hub(ds_name, \"another_config\", token=self._token)\n            ds_builder = load_dataset_builder(ds_name, download_mode=\"force_redownload\")\n            assert len(ds_builder.config.data_files) == 1\n            assert len(ds_builder.config.data_files[\"random\"]) == 1\n            assert fnmatch.fnmatch(ds_builder.config.data_files[\"random\"][0], \"*/data/random-00000-of-00001.parquet\")\n            ds_another_config_builder = load_dataset_builder(\n                ds_name, \"another_config\", download_mode=\"force_redownload\"\n            )\n            assert len(ds_another_config_builder.config.data_files) == 1\n            assert len(ds_another_config_builder.config.data_files[\"random\"]) == 1\n            assert fnmatch.fnmatch(\n                ds_another_config_builder.config.data_files[\"random\"][0],\n                \"*/another_config/random-00000-of-00001.parquet\",\n            )\n\n    def test_push_dataset_dict_to_hub_num_proc(self, temporary_repo, set_ci_hub_access_token):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]})\n\n        local_ds = DatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, num_proc=2)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that the repository contains exactly the expected sharded parquet files with the correct names\n            files = sorted(self._api.list_repo_files(ds_name, 
repo_type=\"dataset\"))\n            assert files == [\n                \".gitattributes\",\n                \"README.md\",\n                \"data/train-00000-of-00002.parquet\",\n                \"data/train-00001-of-00002.parquet\",\n            ]\n\n    def test_push_iterable_dataset_dict_to_hub(self, temporary_repo, set_ci_hub_access_token):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]}).to_iterable_dataset()\n\n        local_ds = IterableDatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\"))\n            assert files == [\".gitattributes\", \"README.md\", \"data/train-00000-of-00001.parquet\"]\n\n    def test_push_iterable_dataset_dict_to_hub_num_proc(self, temporary_repo, set_ci_hub_access_token):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]}).to_iterable_dataset(num_shards=3)\n\n        local_ds = IterableDatasetDict({\"train\": ds})\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, num_proc=2)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there is a single file on the repository that has the correct 
name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\"))\n            assert files == [\n                \".gitattributes\",\n                \"README.md\",\n                \"data/train-00000-of-00003.parquet\",\n                \"data/train-00001-of-00003.parquet\",\n                \"data/train-00002-of-00003.parquet\",\n            ]\n\n    def test_push_iterable_dataset_to_hub(self, temporary_repo):\n        local_ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]}).to_iterable_dataset()\n\n        with temporary_repo() as ds_name:\n            local_ds.push_to_hub(ds_name, token=self._token)\n            hub_ds = load_dataset(ds_name, download_mode=\"force_redownload\", split=\"train\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds.features.keys()) == list(hub_ds.features.keys())\n            assert local_ds.features == hub_ds.features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(self._api.list_repo_files(ds_name, repo_type=\"dataset\", token=self._token))\n            assert files == [\".gitattributes\", \"README.md\", \"data/train-00000-of-00001.parquet\"]\n\n    @require_buckets_support_in_huggingface_hub\n    def test_push_iterable_dataset_dict_to_hub_bucket(self, temporary_bucket):\n        ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]}).to_iterable_dataset()\n        local_ds = IterableDatasetDict({\"train\": ds})\n\n        with temporary_bucket() as bucket_id:\n            ds_location = \"buckets/\" + bucket_id\n            local_ds.push_to_hub(ds_location, token=self._token)\n            hub_ds = load_dataset(ds_location, download_mode=\"force_redownload\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds[\"train\"].features.keys()) == list(hub_ds[\"train\"].features.keys())\n            assert 
local_ds[\"train\"].features == hub_ds[\"train\"].features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(item.path for item in self._api.list_bucket_tree(bucket_id, token=self._token))\n            assert files == [\"README.md\", \"data/train-00000-of-00001.parquet\"]\n\n    @require_buckets_support_in_huggingface_hub\n    def test_push_iterable_dataset_to_hub_bucket(self, temporary_bucket):\n        local_ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]}).to_iterable_dataset()\n\n        with temporary_bucket() as bucket_id:\n            ds_location = \"buckets/\" + bucket_id\n            local_ds.push_to_hub(ds_location, token=self._token)\n            hub_ds = load_dataset(ds_location, download_mode=\"force_redownload\", split=\"train\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds.features.keys()) == list(hub_ds.features.keys())\n            assert local_ds.features == hub_ds.features\n\n            # Ensure that there is a single file on the repository that has the correct name\n            files = sorted(item.path for item in self._api.list_bucket_tree(bucket_id, token=self._token))\n            assert files == [\"README.md\", \"data/train-00000-of-00001.parquet\"]\n\n    @require_buckets_support_in_huggingface_hub\n    def test_push_sharded_iterable_dataset_to_hub_bucket(self, temporary_bucket):\n        local_ds = Dataset.from_dict({\"x\": [1, 2, 3], \"y\": [4, 5, 6]}).to_iterable_dataset(num_shards=3)\n\n        with temporary_bucket() as bucket_id:\n            ds_location = \"buckets/\" + bucket_id\n            local_ds.push_to_hub(ds_location, token=self._token)\n            hub_ds = load_dataset(ds_location, download_mode=\"force_redownload\", split=\"train\")\n\n            assert local_ds.column_names == hub_ds.column_names\n            assert list(local_ds.features.keys()) == list(hub_ds.features.keys())\n   
         assert local_ds.features == hub_ds.features\n\n            # Ensure that the bucket contains exactly the expected sharded parquet files with the correct names\n            files = sorted(item.path for item in self._api.list_bucket_tree(bucket_id, token=self._token))\n            assert files == [\n                \"README.md\",\n                \"data/train-00000-of-00003.parquet\",\n                \"data/train-00001-of-00003.parquet\",\n                \"data/train-00002-of-00003.parquet\",\n            ]\n\n\nclass DummyFolderBasedBuilder(FolderBasedBuilder):\n    BASE_FEATURE = dict\n    BASE_COLUMN_NAME = \"base\"\n    BUILDER_CONFIG_CLASS = FolderBasedBuilderConfig\n    EXTENSIONS = [\".txt\"]\n    # CLASSIFICATION_TASK = TextClassification(text_column=\"base\", label_column=\"label\")\n\n\n@pytest.fixture(params=[\".jsonl\", \".csv\"])\ndef text_file_with_metadata(request, tmp_path, text_file):\n    metadata_filename_extension = request.param\n    data_dir = tmp_path / \"data_dir\"\n    data_dir.mkdir()\n    text_file_path = data_dir / \"file.txt\"\n    shutil.copyfile(text_file, text_file_path)\n    metadata_file_path = data_dir / f\"metadata{metadata_filename_extension}\"\n    metadata = textwrap.dedent(\n        \"\"\"\\\n        {\"file_name\": \"file.txt\", \"additional_feature\": \"Dummy file\"}\n        \"\"\"\n        if metadata_filename_extension == \".jsonl\"\n        else \"\"\"\\\n        file_name,additional_feature\n        file.txt,Dummy file\n        \"\"\"\n    )\n    with open(metadata_file_path, \"w\", encoding=\"utf-8\") as f:\n        f.write(metadata)\n    return text_file_path, metadata_file_path\n\n\n@for_all_test_methods(xfail_if_500_502_http_error)\n@pytest.mark.usefixtures(\"ci_hub_config\")\nclass TestLoadFromHub:\n    _api = HfApi(endpoint=CI_HUB_ENDPOINT)\n    _token = CI_HUB_USER_TOKEN\n\n    def test_load_dataset_with_metadata_file(self, temporary_repo, text_file_with_metadata, tmp_path):\n        text_file_path, metadata_file_path = 
text_file_with_metadata\n        data_dir_path = text_file_path.parent\n        cache_dir_path = tmp_path / \".cache\"\n        cache_dir_path.mkdir()\n        with temporary_repo() as repo_id:\n            self._api.create_repo(repo_id, token=self._token, repo_type=\"dataset\")\n            self._api.upload_folder(\n                folder_path=str(data_dir_path),\n                repo_id=repo_id,\n                repo_type=\"dataset\",\n                token=self._token,\n            )\n            data_files = [\n                f\"hf://datasets/{repo_id}/{text_file_path.name}\",\n                f\"hf://datasets/{repo_id}/{metadata_file_path.name}\",\n            ]\n            builder = DummyFolderBasedBuilder(\n                dataset_name=repo_id.split(\"/\")[-1], data_files=data_files, cache_dir=str(cache_dir_path)\n            )\n            download_manager = DownloadManager()\n            gen_kwargs = builder._split_generators(download_manager)[0].gen_kwargs\n            generator = builder._generate_examples(**gen_kwargs)\n            result = [example for _, example in generator]\n            assert len(result) == 1\n\n    def test_get_data_patterns(self, temporary_repo, tmp_path):\n        repo_dir = tmp_path / \"test_get_data_patterns\"\n        data_dir = repo_dir / \"data\"\n        data_dir.mkdir(parents=True)\n        data_file = data_dir / \"train-00001-of-00009.parquet\"\n        data_file.touch()\n        with temporary_repo() as repo_id:\n            self._api.create_repo(repo_id, token=self._token, repo_type=\"dataset\")\n            self._api.upload_folder(\n                folder_path=str(repo_dir),\n                repo_id=repo_id,\n                repo_type=\"dataset\",\n                token=self._token,\n            )\n            data_file_patterns = get_data_patterns(f\"hf://datasets/{repo_id}\")\n            assert data_file_patterns == {\n                \"train\": 
[\"data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*\"]\n            }\n\n    @pytest.mark.parametrize(\"dataset\", [\"gated\", \"private\"])\n    def test_load_dataset_raises_for_unauthenticated_user(\n        self, dataset, hf_gated_dataset_repo_txt_data, hf_private_dataset_repo_txt_data\n    ):\n        dataset_ids = {\n            \"gated\": hf_gated_dataset_repo_txt_data,\n            \"private\": hf_private_dataset_repo_txt_data,\n        }\n        dataset_id = dataset_ids[dataset]\n        with pytest.raises(DatasetNotFoundError):\n            _ = load_dataset(dataset_id, token=False)\n"
  },
  {
    "path": "tests/test_version.py",
    "content": "import pytest\n\nfrom datasets.utils.version import Version\n\n\n@pytest.mark.parametrize(\n    \"other, expected_equality\",\n    [\n        (Version(\"1.0.0\"), True),\n        (\"1.0.0\", True),\n        (Version(\"2.0.0\"), False),\n        (\"2.0.0\", False),\n        (\"1\", False),\n        (\"a\", False),\n        (1, False),\n        (None, False),\n    ],\n)\ndef test_version_equality_and_hash(other, expected_equality):\n    version = Version(\"1.0.0\")\n    assert (version == other) is expected_equality\n    assert (version != other) is not expected_equality\n    assert (hash(version) == hash(other)) is expected_equality\n"
  },
  {
    "path": "tests/utils.py",
    "content": "import asyncio\nimport importlib.metadata\nimport os\nimport re\nimport sys\nimport tempfile\nimport unittest\nfrom contextlib import contextmanager\nfrom copy import deepcopy\nfrom distutils.util import strtobool\nfrom enum import Enum\nfrom importlib.util import find_spec\nfrom pathlib import Path\nfrom unittest.mock import Mock, patch\n\nimport httpx\nimport pyarrow as pa\nimport pytest\nimport requests\nfrom packaging import version\n\nfrom datasets import config\n\n\ndef parse_flag_from_env(key, default=False):\n    try:\n        value = os.environ[key]\n    except KeyError:\n        # KEY isn't set, default to `default`.\n        _value = default\n    else:\n        # KEY is set, convert it to True or False.\n        try:\n            _value = strtobool(value)\n        except ValueError:\n            # More values are supported, but let's keep the message simple.\n            raise ValueError(f\"If set, {key} must be yes or no.\")\n    return _value\n\n\n_run_slow_tests = parse_flag_from_env(\"RUN_SLOW\", default=False)\n_run_remote_tests = parse_flag_from_env(\"RUN_REMOTE\", default=False)\n_run_local_tests = parse_flag_from_env(\"RUN_LOCAL\", default=True)\n_run_packaged_tests = parse_flag_from_env(\"RUN_PACKAGED\", default=True)\n\n# Compression\nrequire_lz4 = pytest.mark.skipif(not config.LZ4_AVAILABLE, reason=\"test requires lz4\")\nrequire_py7zr = pytest.mark.skipif(not config.PY7ZR_AVAILABLE, reason=\"test requires py7zr\")\nrequire_zstandard = pytest.mark.skipif(not config.ZSTANDARD_AVAILABLE, reason=\"test requires zstandard\")\n\n# Dill-cloudpickle compatibility\nrequire_dill_gt_0_3_2 = pytest.mark.skipif(\n    config.DILL_VERSION <= version.parse(\"0.3.2\"),\n    reason=\"test requires dill>0.3.2 for cloudpickle compatibility\",\n)\n\n# Windows\nrequire_not_windows = pytest.mark.skipif(\n    sys.platform == \"win32\",\n    reason=\"test should not be run on Windows\",\n)\n\n\nrequire_faiss = pytest.mark.skipif(find_spec(\"faiss\") 
is None or sys.platform == \"win32\", reason=\"test requires faiss\")\nrequire_moto = pytest.mark.skipif(find_spec(\"moto\") is None, reason=\"test requires moto\")\nrequire_numpy1_on_windows = pytest.mark.skipif(\n    version.parse(importlib.metadata.version(\"numpy\")) >= version.parse(\"2.0.0\") and sys.platform == \"win32\",\n    reason=\"test requires numpy < 2.0 on windows\",\n)\n\nIS_HF_HUB_1_x = config.HF_HUB_VERSION >= version.parse(\"0.99\")  # clunky but works with pre-releases\n\n\ndef require_buckets_support_in_huggingface_hub(test_case):\n    \"\"\"\n    Decorator marking a test that requires buckets support in huggingface_hub.\n\n    These tests are skipped when huggingface_hub's version doesn't support buckets.\n\n    \"\"\"\n    try:\n        from huggingface_hub.utils import BucketNotFoundError  # noqa\n    except ImportError:\n        test_case = unittest.skip(\"test requires buckets support in huggingface_hub\")(test_case)\n    return test_case\n\n\ndef require_regex(test_case):\n    \"\"\"\n    Decorator marking a test that requires regex.\n\n    These tests are skipped when Regex isn't installed.\n\n    \"\"\"\n    try:\n        import regex  # noqa\n    except ImportError:\n        test_case = unittest.skip(\"test requires regex\")(test_case)\n    return test_case\n\n\ndef require_elasticsearch(test_case):\n    \"\"\"\n    Decorator marking a test that requires ElasticSearch.\n\n    These tests are skipped when ElasticSearch isn't installed.\n\n    \"\"\"\n    try:\n        import elasticsearch  # noqa\n    except ImportError:\n        test_case = unittest.skip(\"test requires elasticsearch\")(test_case)\n    return test_case\n\n\ndef require_sqlalchemy(test_case):\n    \"\"\"\n    Decorator marking a test that requires SQLAlchemy.\n\n    These tests are skipped when SQLAlchemy isn't installed.\n\n    \"\"\"\n    try:\n        import sqlalchemy  # noqa\n    except ImportError:\n        test_case = unittest.skip(\"test requires 
sqlalchemy\")(test_case)\n    return test_case\n\n\ndef require_torch(test_case):\n    \"\"\"\n    Decorator marking a test that requires PyTorch.\n\n    These tests are skipped when PyTorch isn't installed.\n\n    \"\"\"\n    if not config.TORCH_AVAILABLE:\n        test_case = unittest.skip(\"test requires PyTorch\")(test_case)\n    return test_case\n\n\ndef require_torch_compile(test_case):\n    \"\"\"\n    Decorator marking a test that requires PyTorch.\n\n    These tests are skipped when PyTorch isn't installed.\n\n    \"\"\"\n    if not config.TORCH_AVAILABLE:\n        test_case = unittest.skip(\"test requires PyTorch\")(test_case)\n    if config.PY_VERSION >= version.parse(\"3.14\"):\n        test_case = unittest.skip(\"test requires torch compile which isn't available in python 3.14\")(test_case)\n    return test_case\n\n\ndef require_polars(test_case):\n    \"\"\"\n    Decorator marking a test that requires Polars.\n\n    These tests are skipped when Polars isn't installed.\n\n    \"\"\"\n    if not config.POLARS_AVAILABLE:\n        test_case = unittest.skip(\"test requires Polars\")(test_case)\n    return test_case\n\n\ndef require_tf(test_case):\n    \"\"\"\n    Decorator marking a test that requires TensorFlow.\n\n    These tests are skipped when TensorFlow isn't installed.\n\n    \"\"\"\n    if not config.TF_AVAILABLE:\n        test_case = unittest.skip(\"test requires TensorFlow\")(test_case)\n    return test_case\n\n\ndef require_jax(test_case):\n    \"\"\"\n    Decorator marking a test that requires JAX.\n\n    These tests are skipped when JAX isn't installed.\n\n    \"\"\"\n    if not config.JAX_AVAILABLE:\n        test_case = unittest.skip(\"test requires JAX\")(test_case)\n    return test_case\n\n\ndef require_pil(test_case):\n    \"\"\"\n    Decorator marking a test that requires Pillow.\n\n    These tests are skipped when Pillow isn't installed.\n\n    \"\"\"\n    if not config.PIL_AVAILABLE:\n        test_case = unittest.skip(\"test requires 
Pillow\")(test_case)\n    return test_case\n\n\ndef require_torchvision(test_case):\n    \"\"\"\n    Decorator marking a test that requires torchvision.\n\n    These tests are skipped when torchvision isn't installed.\n\n    \"\"\"\n    if not config.TORCHVISION_AVAILABLE:\n        test_case = unittest.skip(\"test requires torchvision\")(test_case)\n    return test_case\n\n\ndef require_torchcodec(test_case):\n    \"\"\"\n    Decorator marking a test that requires torchcodec.\n\n    These tests are skipped when torchcodec isn't installed.\n\n    \"\"\"\n    if not config.TORCHCODEC_AVAILABLE:\n        test_case = unittest.skip(\"test requires torchcodec\")(test_case)\n    return test_case\n\n\ndef require_pdfplumber(test_case):\n    \"\"\"\n    Decorator marking a test that requires pdfplumber.\n\n    These tests are skipped when pdfplumber isn't installed.\n\n    \"\"\"\n    if not config.PDFPLUMBER_AVAILABLE:\n        test_case = unittest.skip(\"test requires pdfplumber\")(test_case)\n    return test_case\n\n\ndef require_nibabel(test_case):\n    \"\"\"\n    Decorator marking a test that requires nibabel.\n\n    These tests are skipped when nibabel isn't installed.\n\n    \"\"\"\n    if not config.NIBABEL_AVAILABLE:\n        test_case = unittest.skip(\"test requires nibabel\")(test_case)\n    return test_case\n\n\ndef require_transformers(test_case):\n    \"\"\"\n    Decorator marking a test that requires transformers.\n\n    These tests are skipped when transformers isn't installed.\n\n    \"\"\"\n    try:\n        import transformers  # noqa F401\n    except ImportError:\n        return unittest.skip(\"test requires transformers\")(test_case)\n    else:\n        return test_case\n\n\ndef require_tiktoken(test_case):\n    \"\"\"\n    Decorator marking a test that requires tiktoken.\n\n    These tests are skipped when tiktoken isn't installed.\n\n    \"\"\"\n    try:\n        import tiktoken  # noqa F401\n    except ImportError:\n        return 
unittest.skip(\"test requires tiktoken\")(test_case)\n    else:\n        return test_case\n\n\ndef require_spacy(test_case):\n    \"\"\"\n    Decorator marking a test that requires spacy.\n\n    These tests are skipped when spacy isn't installed.\n\n    \"\"\"\n    try:\n        import spacy  # noqa F401\n    except ImportError:\n        return unittest.skip(\"test requires spacy\")(test_case)\n    else:\n        return test_case\n\n\ndef require_pyspark(test_case):\n    \"\"\"\n    Decorator marking a test that requires pyspark.\n\n    These tests are skipped when pyspark isn't installed.\n\n    \"\"\"\n    try:\n        import pyspark  # noqa F401\n    except ImportError:\n        return unittest.skip(\"test requires pyspark\")(test_case)\n    else:\n        return test_case\n\n\ndef require_joblibspark(test_case):\n    \"\"\"\n    Decorator marking a test that requires joblibspark.\n\n    These tests are skipped when joblibspark isn't installed.\n\n    \"\"\"\n    try:\n        import joblibspark  # noqa F401\n    except ImportError:\n        return unittest.skip(\"test requires joblibspark\")(test_case)\n    else:\n        return test_case\n\n\ndef require_torchdata_stateful_dataloader(test_case):\n    \"\"\"\n    Decorator marking a test that requires torchdata.stateful_dataloader.\n\n    These tests are skipped when torchdata with stateful_dataloader module isn't installed.\n\n    \"\"\"\n    try:\n        import torchdata.stateful_dataloader  # noqa F401\n    except (ImportError, AssertionError):\n        return unittest.skip(\"test requires torchdata.stateful_dataloader\")(test_case)\n    else:\n        return test_case\n\n\ndef slow(test_case):\n    \"\"\"\n    Decorator marking a test as slow.\n\n    Slow tests are skipped by default. 
Set the RUN_SLOW environment variable\n    to a truthy value to run them.\n\n    \"\"\"\n    if not _run_slow_tests or _run_slow_tests == 0:\n        test_case = unittest.skip(\"test is slow\")(test_case)\n    return test_case\n\n\ndef local(test_case):\n    \"\"\"\n    Decorator marking a test as local\n\n    Local tests are run by default. Set the RUN_LOCAL environment variable\n    to a falsy value to not run them.\n    \"\"\"\n    if not _run_local_tests or _run_local_tests == 0:\n        test_case = unittest.skip(\"test is local\")(test_case)\n    return test_case\n\n\ndef packaged(test_case):\n    \"\"\"\n    Decorator marking a test as packaged\n\n    Packaged tests are run by default. Set the RUN_PACKAGED environment variable\n    to a falsy value to not run them.\n    \"\"\"\n    if not _run_packaged_tests or _run_packaged_tests == 0:\n        test_case = unittest.skip(\"test is packaged\")(test_case)\n    return test_case\n\n\ndef remote(test_case):\n    \"\"\"\n    Decorator marking a test as one that relies on GitHub or the Hugging Face Hub.\n\n    Remote tests are skipped by default. 
Set the RUN_REMOTE environment variable\n    to a truthy value to run them.\n    \"\"\"\n    if not _run_remote_tests or _run_remote_tests == 0:\n        test_case = unittest.skip(\"test requires remote\")(test_case)\n    return test_case\n\n\ndef for_all_test_methods(*decorators):\n    def decorate(cls):\n        for name, fn in cls.__dict__.items():\n            if callable(fn) and name.startswith(\"test\"):\n                for decorator in decorators:\n                    fn = decorator(fn)\n                setattr(cls, name, fn)\n        return cls\n\n    return decorate\n\n\nclass RequestWouldHangIndefinitelyError(Exception):\n    pass\n\n\nclass OfflineSimulationMode(Enum):\n    CONNECTION_FAILS = 0\n    CONNECTION_TIMES_OUT = 1\n    HF_HUB_OFFLINE_SET_TO_1 = 2\n\n\n@contextmanager\ndef offline(mode: OfflineSimulationMode):\n    \"\"\"\n    Simulate offline mode.\n\n    There are three offline simulation modes:\n\n    CONNECTION_FAILS (default mode): a ConnectionError is raised for each network call.\n    CONNECTION_TIMES_OUT: a ReadTimeout or ConnectTimeout is raised for each network call.\n    HF_HUB_OFFLINE_SET_TO_1: the HF_HUB_OFFLINE environment variable is set to 1.\n        This makes the http/ftp calls of the library instantly fail and raise an OfflineModeEnabled error.\n\n    The raised exceptions are either from the `requests` library (if `huggingface_hub<1.0.0`)\n    or from the `httpx` library (if `huggingface_hub>=1.0.0`).\n    \"\"\"\n    # Enable offline mode\n    if mode is OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1:\n        with patch(\"datasets.config.HF_HUB_OFFLINE\", True):\n            yield\n        return\n\n    # Determine which exception to raise based on mode\n\n    def error_response(*args, **kwargs):\n        if mode is OfflineSimulationMode.CONNECTION_FAILS:\n            exc = httpx.ConnectError if IS_HF_HUB_1_x else requests.ConnectionError\n        elif mode is OfflineSimulationMode.CONNECTION_TIMES_OUT:\n       
     if kwargs.get(\"timeout\") is None:\n                raise RequestWouldHangIndefinitelyError(\n                    \"Tried an HTTP call in offline mode with no timeout set. Please set a timeout.\"\n                )\n            exc = httpx.ReadTimeout if IS_HF_HUB_1_x else requests.ConnectTimeout\n        else:\n            raise ValueError(\"Please use a value from the OfflineSimulationMode enum.\")\n        raise exc(f\"Offline mode {mode}\")\n\n    # Patch all client methods to raise the appropriate error\n    client_mock = Mock()\n    for method in [\"head\", \"get\", \"post\", \"put\", \"delete\", \"request\", \"stream\"]:\n        setattr(client_mock, method, Mock(side_effect=error_response))\n\n    # Patching is slightly different depending on hfh internals\n    patch_target = (\n        {\"target\": \"huggingface_hub.utils._http._GLOBAL_CLIENT\", \"new\": client_mock}\n        if IS_HF_HUB_1_x\n        else {\n            \"target\": \"huggingface_hub.utils._http._get_session_from_cache\",\n            \"return_value\": client_mock,\n        }\n    )\n    with patch(**patch_target):\n        yield\n\n\n@contextmanager\ndef set_current_working_directory_to_temp_dir(*args, **kwargs):\n    original_working_dir = str(Path().resolve())\n    with tempfile.TemporaryDirectory(*args, **kwargs) as tmp_dir:\n        try:\n            os.chdir(tmp_dir)\n            yield\n        finally:\n            os.chdir(original_working_dir)\n\n\n@contextmanager\ndef assert_arrow_memory_increases():\n    import gc\n\n    gc.collect()\n    previous_allocated_memory = pa.total_allocated_bytes()\n    yield\n    assert pa.total_allocated_bytes() - previous_allocated_memory > 0, \"Arrow memory didn't increase.\"\n\n\n@contextmanager\ndef assert_arrow_memory_doesnt_increase():\n    import gc\n\n    gc.collect()\n    previous_allocated_memory = pa.total_allocated_bytes()\n    yield\n    assert pa.total_allocated_bytes() - previous_allocated_memory <= 0, \"Arrow memory wasn't 
expected to increase.\"\n\n\ndef is_rng_equal(rng1, rng2):\n    return deepcopy(rng1).integers(0, 100, 10).tolist() == deepcopy(rng2).integers(0, 100, 10).tolist()\n\n\ndef xfail_if_500_502_http_error(func):\n    import decorator\n\n    def _wrapper(func, *args, **kwargs):\n        try:\n            return func(*args, **kwargs)\n        except (requests.HTTPError, httpx.HTTPError) as err:\n            if str(err).startswith(\"500\") or str(err).startswith(\"502\"):\n                pytest.xfail(str(err))\n            raise err\n\n    return decorator.decorator(_wrapper, func)\n\n\n# --- distributed testing functions --- #\n\n# copied from transformers\n# originally adapted from https://stackoverflow.com/a/59041913/9201239\n\n\nclass _RunOutput:\n    def __init__(self, returncode, stdout, stderr):\n        self.returncode = returncode\n        self.stdout = stdout\n        self.stderr = stderr\n\n\nasync def _read_stream(stream, callback):\n    while True:\n        line = await stream.readline()\n        if line:\n            callback(line)\n        else:\n            break\n\n\nasync def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=False, echo=False) -> _RunOutput:\n    if echo:\n        print(\"\\nRunning: \", \" \".join(cmd))\n\n    p = await asyncio.create_subprocess_exec(\n        cmd[0],\n        *cmd[1:],\n        stdin=stdin,\n        stdout=asyncio.subprocess.PIPE,\n        stderr=asyncio.subprocess.PIPE,\n        env=env,\n    )\n\n    # note: there is a warning for a possible deadlock when using `wait` with huge amounts of data in the pipe\n    # https://docs.python.org/3/library/asyncio-subprocess.html#asyncio.asyncio.subprocess.Process.wait\n    #\n    # If it starts hanging, will need to switch to the following code. 
The problem is that no data\n    # will be seen until it's done and if it hangs for example there will be no debug info.\n    # out, err = await p.communicate()\n    # return _RunOutput(p.returncode, out, err)\n\n    out = []\n    err = []\n\n    def tee(line, sink, pipe, label=\"\"):\n        line = line.decode(\"utf-8\").rstrip()\n        sink.append(line)\n        if not quiet:\n            print(label, line, file=pipe)\n\n    # XXX: the timeout doesn't seem to make any difference here\n    await asyncio.wait(\n        [\n            _read_stream(p.stdout, lambda line: tee(line, out, sys.stdout, label=\"stdout:\")),\n            _read_stream(p.stderr, lambda line: tee(line, err, sys.stderr, label=\"stderr:\")),\n        ],\n        timeout=timeout,\n    )\n    return _RunOutput(await p.wait(), out, err)\n\n\ndef execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput:\n    loop = asyncio.get_event_loop()\n    result = loop.run_until_complete(\n        _stream_subprocess(cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo)\n    )\n\n    cmd_str = \" \".join(cmd)\n    if result.returncode > 0:\n        stderr = \"\\n\".join(result.stderr)\n        raise RuntimeError(\n            f\"'{cmd_str}' failed with returncode {result.returncode}\\n\\n\"\n            f\"The combined stderr from workers follows:\\n{stderr}\"\n        )\n\n    # check that the subprocess actually did run and produced some output, should the test rely on\n    # the remote side to do the testing\n    if not result.stdout and not result.stderr:\n        raise RuntimeError(f\"'{cmd_str}' produced no output.\")\n\n    return result\n\n\ndef pytest_xdist_worker_id():\n    \"\"\"\n    Returns an int value of worker's numerical id under `pytest-xdist`'s concurrent workers `pytest -n N` regime, or 0\n    if `-n 1` or `pytest-xdist` isn't being used.\n    \"\"\"\n    worker = os.environ.get(\"PYTEST_XDIST_WORKER\", \"gw0\")\n    worker = 
re.sub(r\"^gw\", \"\", worker, count=0, flags=re.M)\n    return int(worker)\n\n\ndef get_torch_dist_unique_port():\n    \"\"\"\n    Returns a port number that can be fed to `torchrun`'s `--master_port` argument.\n\n    Under `pytest-xdist` it adds a delta number based on a worker id so that concurrent tests don't try to use the same\n    port at once.\n    \"\"\"\n    port = 29500\n    uniq_delta = pytest_xdist_worker_id()\n    return port + uniq_delta\n"
  },
  {
    "path": "utils/release.py",
    "content": "# Copyright 2021 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\nimport re\n\nimport packaging.version\n\n\nREPLACE_PATTERNS = {\n    \"init\": (re.compile(r'^__version__\\s+=\\s+\"([^\"]+)\"\\s*$', re.MULTILINE), '__version__ = \"VERSION\"\\n'),\n    \"setup\": (re.compile(r'^(\\s*)version\\s*=\\s*\"[^\"]+\",', re.MULTILINE), r'\\1version=\"VERSION\",'),\n}\nREPLACE_FILES = {\n    \"init\": \"src/datasets/__init__.py\",\n    \"setup\": \"setup.py\",\n}\n\n\ndef update_version_in_file(fname, version, pattern):\n    \"\"\"Update the version in one file using a specific pattern.\"\"\"\n    with open(fname, \"r\", encoding=\"utf-8\", newline=\"\\n\") as f:\n        code = f.read()\n    re_pattern, replace = REPLACE_PATTERNS[pattern]\n    replace = replace.replace(\"VERSION\", version)\n    code = re_pattern.sub(replace, code)\n    with open(fname, \"w\", encoding=\"utf-8\", newline=\"\\n\") as f:\n        f.write(code)\n\n\ndef global_version_update(version):\n    \"\"\"Update the version in all needed files.\"\"\"\n    for pattern, fname in REPLACE_FILES.items():\n        update_version_in_file(fname, version, pattern)\n\n\ndef get_version():\n    \"\"\"Reads the current version in the __init__.\"\"\"\n    with open(REPLACE_FILES[\"init\"], \"r\") as f:\n        code = f.read()\n    default_version = REPLACE_PATTERNS[\"init\"][0].search(code).groups()[0]\n    return 
packaging.version.parse(default_version)\n\n\ndef pre_release_work(patch=False):\n    \"\"\"Do all the necessary pre-release steps.\"\"\"\n    # First let's get the default version: base version if we are in dev, bump minor otherwise.\n    default_version = get_version()\n    if patch and default_version.is_devrelease:\n        raise ValueError(\"Can't create a patch version from the dev branch, checkout a released version!\")\n    if default_version.is_devrelease:\n        default_version = default_version.base_version\n    elif patch:\n        default_version = f\"{default_version.major}.{default_version.minor}.{default_version.micro + 1}\"\n    else:\n        default_version = f\"{default_version.major}.{default_version.minor + 1}.0\"\n\n    # Now let's ask nicely if that's the right one.\n    version = input(f\"Which version are you releasing? [{default_version}]\")\n    if len(version) == 0:\n        version = default_version\n\n    print(f\"Updating version to {version}.\")\n    global_version_update(version)\n\n\ndef post_release_work():\n    \"\"\"Do all the necessary post-release steps.\"\"\"\n    # First let's get the current version\n    current_version = get_version()\n    dev_version = f\"{current_version.major}.{current_version.minor + 1}.0.dev0\"\n    current_version = current_version.base_version\n\n    # Check with the user we got that right.\n    version = input(f\"Which version are we developing now? 
[{dev_version}]\")\n    if len(version) == 0:\n        version = dev_version\n\n    print(f\"Updating version to {version}.\")\n    global_version_update(version)\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--post_release\", action=\"store_true\", help=\"Whether or not this is post release.\")\n    parser.add_argument(\"--patch\", action=\"store_true\", help=\"Whether or not this is a patch release.\")\n    args = parser.parse_args()\n    if not args.post_release:\n        pre_release_work(patch=args.patch)\n    elif args.patch:\n        print(\"Nothing to do after a patch :-)\")\n    else:\n        post_release_work()\n"
  }
]