[
  {
    "path": ".github/workflows/ci.yml",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nname: CI\n\non:\n  # Triggers the workflow on push or pull request events but only for the main branch\n  push:\n    branches: [ main ]\n  pull_request:\n    branches: [ main ]\n  # Allows you to run this workflow manually from the Actions tab\n  workflow_dispatch:\n\nenv:\n  CDSAPI_URL: https://cds.climate.copernicus.eu/api\n  CDSAPI_KEY: 1234567-ab12-34cd-9876-4o4fake90909  # A fake key for testing\njobs:\n  build:\n    runs-on: ubuntu-latest\n    strategy:\n      fail-fast: false\n      matrix:\n        python-version: [\"3.8\", \"3.9\"]\n    steps:\n    - name: Cancel previous\n      uses: styfle/cancel-workflow-action@0.7.0\n      with:\n        access_token: ${{ github.token }}\n      if: ${{github.ref != 'refs/head/main'}}\n    - uses: actions/checkout@v2\n    - name: conda cache\n      uses: actions/cache@v3\n      env:\n        # Increase this value to reset cache if etc/example-environment.yml has not changed\n        CACHE_NUMBER: 0\n      with:\n        path: ~/conda_pkgs_dir\n        key:\n          ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ matrix.python-version }}-${{ hashFiles('ci3.8.yml') }}\n    - name: Setup conda environment\n      uses: conda-incubator/setup-miniconda@v3\n      with:\n        python-version: ${{ matrix.python-version }}\n        channels: conda-forge\n        environment-file: ci${{ matrix.python-version}}.yml\n        
activate-environment: weather-tools\n        miniforge-variant: Miniforge3\n        miniforge-version: latest\n        use-mamba: true\n    - name: Check MetView's installation\n      shell: bash -l {0}\n      run: python -m metview selfcheck\n    - name: Run unit tests\n      shell: bash -l {0}\n      run: pytest --memray --ignore=weather_dl_v2 # Ignoring dl-v2 as it only supports py3.10\n  lint:\n    runs-on: ubuntu-latest\n    strategy:\n      fail-fast: false\n    steps:\n      - name: Cancel previous\n        uses: styfle/cancel-workflow-action@0.7.0\n        with:\n          access_token: ${{ github.token }}\n        if: ${{github.ref != 'refs/head/main'}}\n      - uses: actions/checkout@v2\n      - name: Set up Python ${{ matrix.python-version }}\n        uses: actions/setup-python@v2\n        with:\n          python-version: ${{ matrix.python-version }}\n      - name: Get pip cache dir\n        id: pip-cache\n        run: |\n          python -m pip install --upgrade pip wheel\n          echo \"dir=$(pip cache dir)\" >> \"$GITHUB_OUTPUT\"\n      - name: Install linter\n        run: |\n          pip install ruff==0.1.2\n      - name: Lint project\n        run: ruff check .\n  type-check:\n    runs-on: ubuntu-latest\n    strategy:\n      fail-fast: false\n      matrix:\n        python-version: [\"3.8\"]\n    steps:\n      - name: Cancel previous\n        uses: styfle/cancel-workflow-action@0.7.0\n        with:\n          access_token: ${{ github.token }}\n        if: ${{github.ref != 'refs/head/main'}}\n      - uses: actions/checkout@v2\n      - name: conda cache\n        uses: actions/cache@v3\n        env:\n          # Increase this value to reset cache if etc/example-environment.yml has not changed\n          CACHE_NUMBER: 0\n        with:\n          path: ~/conda_pkgs_dir\n          key:\n            ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ matrix.python-version }}-${{ hashFiles('ci3.8.yml') }}\n      - name: Setup conda environment\n        
uses: conda-incubator/setup-miniconda@v3\n        with:\n          python-version: ${{ matrix.python-version }}\n          channels: conda-forge\n          environment-file: ci${{ matrix.python-version}}.yml\n          activate-environment: weather-tools\n          miniforge-variant: Miniforge3\n          miniforge-version: latest\n          use-mamba: true\n      - name: Install weather-tools[test]\n        run: |\n          conda run -n weather-tools pip install -e .[test] --use-deprecated=legacy-resolver\n      - name: Run type checker\n        run: conda run -n weather-tools pytype"
  },
  {
    "path": ".github/workflows/publish.yml",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nname: Build and Upload weather-tools to PyPI\n\non:\n  release:\n    types: [published]\n\njobs:\n  build-artifacts:\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v2\n      - name: Set up Python\n        uses: actions/setup-python@v2.3.1\n        with:\n          python-version: 3.8\n\n      - name: Install dependencies\n        run: |\n          python -m pip install --upgrade pip\n          python -m pip install setuptools setuptools-scm wheel twine check-manifest\n\n      - name: Build tarball and wheels\n        run: |\n          git clean -xdf\n          git restore -SW .\n          python -m build --sdist --wheel .\n      - name: Check built artifacts\n        run: |\n          python -m twine check dist/*\n          pwd\n          if [ -f dist/weather_tools-0.0.0.tar.gz ]; then\n            echo \"❌ INVALID VERSION NUMBER\"\n            exit 1\n          else\n            echo \"✅ Looks good\"\n          fi\n      - uses: actions/upload-artifact@v2\n        with:\n          name: releases\n          path: dist\n\n  test-built-dist:\n    needs: build-artifacts\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/setup-python@v2.3.1\n        name: Install Python\n        with:\n          python-version: 3.8\n      - uses: actions/download-artifact@v4.1.7\n        with:\n          name: releases\n          path: dist\n      - 
name: List contents of built dist\n        run: |\n          ls -ltrh\n          ls -ltrh dist\n      - name: Publish package to TestPyPI\n        if: github.event_name == 'push'\n        uses: pypa/gh-action-pypi-publish@v1.4.2\n        with:\n          user: __token__\n          password: ${{ secrets.TESTPYPI_TOKEN }}\n          repository_url: https://test.pypi.org/legacy/\n          verbose: true\n\n      - name: Check uploaded package\n        if: github.event_name == 'push'\n        run: |\n          sleep 3\n          python -m pip install --upgrade pip\n          python -m pip install --extra-index-url https://test.pypi.org/simple --upgrade weather-tools\n          weather-dl --help && weather-mv --help && weather-sp --help\n  \n  upload-to-pypi:\n    needs: test-built-dist\n    if: github.event_name == 'release'\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/download-artifact@v4.1.7\n        with:\n          name: releases\n          path: dist\n      - name: Publish package to PyPI\n        uses: pypa/gh-action-pypi-publish@v1.4.2\n        with:\n          user: __token__\n          password: ${{ secrets.PYPI_TOKEN }}\n          verbose: true"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\ncover/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\n.pybuilder/\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n#   For a library or package, you might want to ignore these files since the code is\n#   intended to run in multiple environments; otherwise, check them in:\n# .python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n\n# JetBrains / PyCharm\n.idea/\n\n# VSCode\n.vscode/\nlaunch.json\n"
  },
  {
    "path": ".read-the-docs.yaml",
    "content": "# .readthedocs.yaml\n# Read the Docs configuration file\n# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details\n\n# Required\nversion: 2\n\nbuild:\n  image: latest\n\n# Build documentation in the docs/ directory with Sphinx\nsphinx:\n  configuration: docs/conf.py\n\n# Optionally set the version of Python and requirements required to build your docs\npython:\n  version: \"3.8\"\n  install:\n    - requirements: docs/requirements.txt\n    - method: pip\n      path: .\n  system_packages: false\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# How to Contribute\n\nWe'd love to accept your patches and contributions to this project. There are just a few small guidelines you need to\nfollow.\n\n## Contributor License Agreement\n\nContributions to this project must be accompanied by a Contributor License Agreement (CLA). You (or your employer)\nretain the copyright to your contribution; this simply gives us permission to use and redistribute your contributions as\npart of the project. Head over to\n<https://cla.developers.google.com/> to see your current agreements on file or to sign a new one.\n\nYou generally only need to submit a CLA once, so if you've already submitted one\n(even if it was for a different project), you probably don't need to do it again.\n\n## Code Reviews\n\nAll submissions, including submissions by project members, require review. We use GitHub pull requests for this purpose.\nConsult\n[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more information on using pull requests.\n\n## Community Guidelines\n\nThis project follows\n[Google's Open Source Community Guidelines](https://opensource.google/conduct/).\n\n## Pull Request Etiquette\n\nWe're thrilled to have your pull request! However, we ask that you file an issue (or check if one exists) before\nsubmitting a change-list (CL).\n\nIn general, we're happier reviewing many small PRs over fewer, larger changes. Please try to\nsubmit [small CLs](https://google.github.io/eng-practices/review/developer/small-cls.html).\n\n## Project Structure\n\n```\nbin/  # Tools for development\n\nconfigs/  # Save example and general purpose configs.\n\ndocs/  # Sphinx documentation, deployed to GitHub Pages (or readthedocs).\n\nweather_dl/  # Main python package, other pipelines can come later.\n    __init__.py  \n    download_pipeline/  # sources for pipeline\n        __init__.py \n        ...  
\n        <module>.py\n        <module>_test.py  # We'll follow this naming pattern for tests.\n    weather-dl  # script to run pipeline\n    setup.py  # packages sources for execution in beam worker\n              # pipeline-specific requirements managed here\n    \nsetup.py  # Project is pip-installable, project requirements managed here.\n```\n\n## Developer Installation\n\n1. Set up a Python development environment with *Anaconda* or\n   [*Miniconda*](https://docs.conda.io/en/latest/miniconda.html).\n\n    * On a Mac, you can run `brew install miniconda`.\n\n    * Use Python version 3.8.5+ for development. We have not yet upgraded the project to use Python 3.9\n      (see [#166](https://github.com/google/weather-tools/issues/166)).\n\n2. Clone the repo and install dependencies with anaconda.\n\n   ```shell\n   # clone with HTTPS\n   git clone https://github.com/google/weather-tools.git\n   # clone with SSH\n   git clone git@github.com:google/weather-tools.git\n   cd weather-tools\n   conda env create -f environment.yml\n   conda activate weather-tools\n   pip install -e \".[dev]\"\n   ```\n\n3. Install `gcloud`, the Google Cloud CLI, following these [instructions](https://cloud.google.com/sdk/docs/install).\n\n4. Acquire adequate permissions for the project.\n    * Run `gcloud auth application-default login`.\n\n    * Make sure your account has *write* permissions to the storage bucket as well as the permissions to create a\n      Dataflow job.\n\n    * Make sure that both Dataflow and Cloud Storage are enabled on your Google Cloud Platform project.\n\n### Windows Developer Instructions\n\nWindows support for each CLI is currently under development\n(see [#64](https://github.com/google/weather-tools/issues/64)). 
However, there are workarounds available for running the\nweather tools outside of installation with `pip`.\n\nFirst, the would-be pip-installed script can be run directly with python like so:\n\n```shell\npython weather_dl/weather-dl --help\n```\n\n## Testing\n\nFor testing at development time, we make use of three tools:\n\n* `pytype` for checking types:\n   ```shell\n   # check everything\n   pytype\n   # check a specific tool\n   pytype weather_dl\n   ```\n\n* `ruff` for linting:\n   ```shell \n   # lint everything\n   ruff check .\n   # lint a specific tool\n   ruff check weather_mv\n   ```\n\n* `pytest` for running tests:\n   ```shell\n   # test everything \n   pytest\n   # test a specific tool\n   pytest weather_sp\n   ```\n\nIf you'd like to automate running these checks, we provide a post-push git hook:\n\n```shell\ncp bin/post-push .git/hooks/\n```\n\nThis script can be run manually, too.\n\nIf you'd like to locally run these checks for all versions of python this project supports, you can use\n`tox`. Tox will make use of the python versions installed on your machine and create virtual test environments on your\nbehalf.\n\n```shell\ntox\n```\n\nIn addition, we provide a simple script to install _other_ branches locally. Run `bin/install-branch <branch-name>` to\npip install that branch's working copy of weather-tools. Hopefully, this script facilitates testing of work-in-progress\ncontributions.\n\nPlease review the [Beam testing docs](https://beam.apache.org/documentation/pipelines/test-your-pipeline/) for guidance\non how to write tests for the pipeline.\n\n## Documentation\n\nDocuments are generated with Sphinx and the myst-parser. 
To generate the documents locally, simply invoke `make`:\n\n```shell\ncd docs\nrm -r _build\nmake html\n```\n\n> Note: Due to the idiosyncrasies of how Sphinx detects updates and our use of symbolic links, we recommend deleting the\n> `_build` folder.\n\nOr, you can run the following subshell command to re-generate everything without having to leave the project root:\n\n```shell\n(cd docs && rm -r _build && make html)\n```\n\nAfter the docs are re-generated, you can view them by starting a local file server, for example:\n\n```shell\npython -m http.server -d docs/_build/html\n```\n\n## Versions & Releasing\n\nWe aim to represent the version of each tool using [semver.org](https://semver.org/) semantic versions. To that end, we\nwill abide by the following pattern:\n\n- When making a change to a particular tool, please remember to update its semantic version in the `setup.py` file.\n- The version for _all the tools_ should be incremented on update to _any_ tool. If one tool changes, the\n  whole `google-weather-tools` package should have its version incremented.\n- The representation for the version of all of `google-weather-tools` is\n  via [git tags](https://git-scm.com/book/en/v2/Git-Basics-Tagging). To update the version of the package, please make\n  an annotated tag with a short description of the change.\n- When it's time for release, choose the latest tag and fill out the release description. Check out previous release\n  notes to get an idea of how to structure the next one. These release notes are the primary changelog for the project. \n"
  },
  {
    "path": "Configuration.md",
    "content": "# Configuration Files\n\nConfig files describe both _what_ to download and _how_ it should be downloaded. To this end, configs have two sections:\n`selection` that describes the data desired from data-sources and `parameters` that define the details of the download.\nBy convention, the `parameters` section comes first.\n\nConfiguration files can be written in `*.cfg` or `*.json`, but typically, they're written in the former format (i.e.\nPython's native config language, which is similar to INI format).\n\nBefore jumping into the details of each section, let's look at a few example configs.\n\n## Examples\n\nThe following demonstrate how to download weather data from ECMWF's Copernicus (CDS) and Meteorological Archival and\nRetrieval System (MARS) catalogues.\n\n### Download Era5 Pressure Level Reanalysis from Copernicus\n\n```\n[parameters]\nclient=cds  ; choose a data source client\ndataset=reanalysis-era5-pressure-levels  ; specify a dataset to download from the data source (CDS-specific)\ntarget_path=gs://ecmwf-output-test/era5/{}/{}/{}-pressure-{}.nc  ; create a template for the output file path\npartition_keys=  ; define how we should partition the download by the \"keys\" in the `selection` section.\n    year         ; See docs below for more explanation\n    month\n    day\n    pressure_level\n[selection]\nproduct_type=ensemble_mean\nformat=netcdf\nvariable=\n    divergence\n    fraction_of_cloud_cover\n    geopotential\npressure_level=\n    500\nyear=    ; we can specify a list of values using multiple lines\n    2015\n    2016\n    2017\nmonth=\n    01\nday=\n    01\n    15\ntime=\n    00:00\n    06:00\n    12:00\n    18:00\n```\n\n### Download Yesterday's Surface Temperatures from MARS.\n\n```\n[parameters]\nclient=mars  ; download from MARS (this is the default data source).\ntarget_path=all.an  ; Download all the data from the selection section into one file.\n[selection]\nclass   = od\ntype    = analysis\nlevtype = surface\ndate    = -1 
 ; Yesterday -- see link to MARS request syntax in docs linked below.\ntime    = 00/06/12/18  ; We can specify multiple values using `/` delimiters -- see MARS request syntax docs\nparam   = z/sp\n```\n\n## `parameters` Section\n\n_Parameters for the pipeline_\n\nThese describe which data source to download, where the data should live, and how the download should be partitioned.\n\n* `client`: (required) Select the weather API client. Supported values are `cds` for Copernicus, and `mars` for MARS.\n* `dataset`: (optional) Name of the target dataset. Allowed options are dictated by the client.\n* `target_path`: (required) Download artifact filename template. Can use Python string format symbols. Must have the\n  same number of format symbols as the number of partition keys.\n* `partition_keys`: (optional) This determines how download jobs will be divided.\n    * Value can be a single item or a list.\n    * Each value must appear as a key in the `selection` section.\n    * Each downloader will receive a config file with every parameter listed in the `selection`, _except_ for the fields\n      specified by the `partition_keys`.\n    * The downloader config will contain one instance of the cross-product of every key in `partition_keys`.\n        * E.g. `['year', 'month']` will lead to a config set like `[(2015, 01), (2015, 02), (2015, 03), ...]`.\n    * The list of keys will be used to format the `target_path`.\n\n> **NOTE**: `target_path` template is totally compatible with Python's standard string formatting.\n> This includes being able to use named arguments (e.g. 'gs://bucket/{year}/{month}/{day}.nc') as well as specifying formats for strings \n> (e.g. 
'gs://bucket/{year:04d}/{month:02d}/{day:02d}.nc').\n\n### Creating a date-based directory hierarchy\n\nThe date-based directory hierarchy can be created using Python's standard string formatting.\nBelow are some examples of how to use `target_path` with Python's standard string formatting.\n\n<details>\n<summary><strong>Examples</strong></summary>\n\nNote that any parameters that are not relevant to the target path have been omitted.\n\n```\n[parameters]\ntarget_path=gs://ecmwf-output-test/era5/{date:%%Y/%%m/%%d}.nc\npartition_keys=\n     date\n[selection]\ndate=2017-01-01/to/2017-01-02\n```\n\nwill create  \n`gs://ecmwf-output-test/era5/2017/01/01.nc` and  \n`gs://ecmwf-output-test/era5/2017/01/02.nc`.\n\n```\n[parameters]\ntarget_path=gs://ecmwf-output-test/era5/{date:%%Y/%%m/%%d}-pressure-{pressure_level}.nc\npartition_keys=\n     date\n     pressure_level\n[selection]\npressure_level=\n    500\ndate=2017-01-01/to/2017-01-02\n```\n\nwill create  \n`gs://ecmwf-output-test/era5/2017/01/01-pressure-500.nc` and   \n`gs://ecmwf-output-test/era5/2017/01/02-pressure-500.nc`.\n\n```\n[parameters]\ntarget_path=gs://ecmwf-output-test/pressure-{pressure_level}/era5/{date:%%Y/%%m/%%d}.nc\npartition_keys=\n     date\n     pressure_level\n[selection]\npressure_level=\n    500\ndate=2017-01-01/to/2017-01-02\n```\n\nwill create  \n`gs://ecmwf-output-test/pressure-500/era5/2017/01/01.nc` and  \n`gs://ecmwf-output-test/pressure-500/era5/2017/01/02.nc`.\n\n```\n[parameters]\ntarget_path=gs://ecmwf-output-test/era5/{year:04d}/{month:02d}/{day:02d}-pressure-{pressure_level}.nc\npartition_keys=\n    year\n    month\n    day\n    pressure_level\n[selection]\npressure_level=\n    500\nyear=\n    2017\nmonth=\n    01\nday=\n    01\n    02\n```\n\nwill create  \n`gs://ecmwf-output-test/era5/2017/01/01-pressure-500.nc` and  \n`gs://ecmwf-output-test/era5/2017/01/02-pressure-500.nc`.\n\n> **Note**: Replacing the `target_path` of the above example with this 
`target_path=gs://ecmwf-output-test/era5/{year}/{month}/{day}-pressure-{pressure_level}.nc`\n>\n> will create\n>\n> `gs://ecmwf-output-test/era5/2017/1/1-pressure-500.nc` and  \n> `gs://ecmwf-output-test/era5/2017/1/2-pressure-500.nc`.\n\nIn addition to the above, the table below presents further partitioning examples based on recent enhancements to the weather-dl tool, particularly around date and related keywords.\n\n<table>\n  <colgroup>\n    <col style=\"width:15%\">\n    <col style=\"width:35%\">\n    <col style=\"width:25%\">\n    <col style=\"width:25%\">\n  </colgroup>\n  <thead>\n    <tr>\n      <td><h5>Keyword</h5></td>\n      <td><h5>Description</h5></td>\n      <td><h5>Sample Config</h5></td>\n      <td><h5>Output Partitions</h5></td>\n    </tr>\n    <tr>\n      <td>date</td>\n      <td>Specifies the calendar date(s) for which data is retrieved in an ECMWF MARS request.</td>\n      <td>[parameters]<br/>target_path={date}.nc<br/>partition_keys=date<br/>[selection]<br/>date=2017-01-01/to/2017-01-02</td>\n      <td>2017-01-01.nc<br/>2017-01-02.nc</td>\n    </tr>\n    <tr>\n      <td>date [step] [time] [var]</td>\n      <td>Along with specifying date in the ECMWF MARS request, users can also partition data using one or more of [forecast-step, initialization-time, or variable].</td>\n      <td>\n        <h5>Case-1</h5>[parameters]<br/>target_path={date}_{time}.nc<br/>partition_keys=<br/>date<br/>time<br/>[selection]<br/>date=2025-01-01/to/2025-01-08/by/5<br/>time=00/12<br/>\n        <h5>Case-2</h5>[parameters]<br/>target_path={date}_{step}.nc<br/>partition_keys=<br/>date<br/>step<br/>[selection]<br/>date=2024-12-31/to/2024-12-26/by/-3<br/>step=0<br/>\n      </td>\n      <td>\n        <h5>Case-1</h5>2025-01-01_00:00:00.nc<br/>2025-01-01_12:00:00.nc<br/>2025-01-06_00:00:00.nc<br/>2025-01-06_12:00:00.nc<br/>\n        <h5>Case-2</h5>2024-12-31_0.nc<br/>2024-12-28_0.nc<br/>\n      </td>\n    </tr>\n    <tr>\n      <td>year, month, day</td>\n      
<td>Specifies dates in a decomposed form (separate fields) for ECMWF MARS requests, enabling flexible selection across ranges and combinations. Supports multiple <b>year</b> and <b>month</b> inputs, allowing users to define broad time spans without enumerating each period.</td>\n      <td>\n        <h5>Case-1</h5>[parameters]<br/>target_path={year}/{year}-{month:02d}.grb2<br/>partition_keys=<br/>year<br/>month<br/>[selection]<br/>year=2021<br/>month=1/to/2<br/>day=all<br/>\n        <h5>Case-2: Full year</h5>[parameters]<br/>target_path=full_{year}.nc<br/>partition_keys=year<br/>[selection]<br/>year=2001/to/2002<br/>month=1/to/12<br/>day=all<br/>\n        <h5>Case-3: Odd months</h5>[parameters]<br/>target_path=odd_{year}.nc<br/>partition_keys=year<br/>[selection]<br/>year=2001/to/2002<br/>month=1/to/12/by/2<br/>day=all<br/>\n        <h5>Case-4: Specific days</h5>[parameters]<br/>target_path=misc_{year}.nc<br/>partition_keys=year<br/>[selection]<br/>year=2001/to/2002<br/>month=1/to/12<br/>day=1/5/10/15<br/>\n      </td>\n      <td>\n        <h5>Case-1</h5>2021/2021-01.grb2<br/>2021/2021-02.grb2<br/>\n        <h5>Case-2</h5>full_2001.nc<br/>full_2002.nc<br/>\n        <h5>Case-3</h5>odd_2001.nc<br/>odd_2002.nc<br/>\n        <h5>Case-4</h5>misc_2001.nc<br/>misc_2002.nc<br/>\n      </td>\n    </tr>\n    <tr>\n      <!-- https://github.com/google/weather-tools/pull/503 -->\n      <td>year-month</td>\n      <td>Added support for a year‑month key, allowing users to specify downloads using month granularity instead of ranged formats.</td>\n      <td>[parameters]<br/>target_path={year-month}.gb<br/>partition_keys=year-month<br/>[selection]<br/>year-month=2024-11/to/2025-02</td>\n      <td>2024-11.gb<br/>2024-12.gb<br/>2025-01.gb<br/>2025-02.gb</td>\n    </tr>\n    <tr>\n      <!-- https://github.com/google/weather-tools/pull/525 -->\n      <td>date_range</td>\n      <td>Added support for specifying one or more date-range values, enabling users to download data across multiple 
date intervals in a single run.<br/><br/>Note: date_range must be specified in partition_keys.</td>\n      <td>[parameters]<br/>target_path={date_range}.nc<br/>partition_keys=date_range<br/>[selection]<br/>date_range=<br/>2017-01-01/to/2017-01-10<br/>2017-01-21/to/2017-01-31</td>\n      <td>2017-01-01_to_2017-01-10.nc<br/>2017-01-21_to_2017-01-31.nc</td>\n    </tr>\n    <tr>\n      <!-- https://github.com/google/weather-tools/issues/262 -->\n      <!-- https://github.com/google/weather-tools/issues/334 -->\n      <!-- https://github.com/google/weather-tools/blob/main/configs/s2s_operational_forecast_example.cfg -->\n      <!-- https://critique.corp.google.com/cl/748174885/depot/google3/research/weather/anthromet/download_configs/enstrophy/ifs-ext-reforecast-pressure-levels-inst-all-levs.cfg -->\n      <td>hdate</td>\n      <td>This parameter allows weather‑dl to explicitly specify historical target dates for downloads, giving users precise control over which past dates are retrieved.</td>\n      <td>[parameters]<br/>target_path={date}.gb<br/>[selection]<br/>date=2020-01-02<br/>hdate=1/to/3</td>\n      <td>2019-01-02<br/>2018-01-02<br/>2017-01-02</td>\n    </tr>\n    <tr>\n      <!-- https://github.com/google/weather-tools/pull/528 -->\n      <td>(no partition keys specified)</td>\n      <td>Introduced support for creating a single output-file when no partition_keys are specified, ensuring that all data is written into one consolidated file.</td>\n      <td>[parameters]<br/>target_path=data.nc<br/>[selection]<br/>date=2017-01-01/to/2017-01-05<br/>time=00/06/12/18</td>\n      <td>data.nc</td>\n    </tr>\n  </thead>\n</table>\n\n</details>\n\n### Subsections\n\nSometimes, we'd like to alternate passing certain parameters to each client. For example, certain data sources have\nlimits on the number of API requests that can be made, enforcing a maximum per license. In these cases, the user can\nspecify a parameters subsection. 
The downloader will overwrite the base parameters with the key-value pairs in each\nsubsection, evenly alternating between each parameter set across the partitions.\n\nTo specify a subsection, create a new section with the following naming pattern: `[parameters.<subsection-name>]`.\nThe `<subsection-name>` can be any string, but it's recommended to choose a name that describes the grouping of values in\nthe section.\n\nHere's an example of this type of configuration:\n\n```\n[parameters]\ndataset=ecmwf-mars-output\ntarget_path=gs://ecmwf-downloads/hres-single-level/{}.nc\npartition_keys=\n    date\n[parameters.deepmind]\napi_key=KKKKK1\napi_url=UUUUU1\n[parameters.research]\napi_key=KKKKK2\napi_url=UUUUU2\n[parameters.cloud]\napi_key=KKKKK3\napi_url=UUUUU3\n```\n\n## `selection` Section\n\n_Parameters used to select desired data_\n\nThese will be passed as request parameters to the specified API client. Selections are dependent on how each data\nsource's catalog is structured.\n\n### Copernicus / CDS\n\n**License**: By using a Copernicus / CDS dataset, users agree to the terms and conditions specified for that dataset (e.g. this [License](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-single-levels?tab=download#manage-licences)).\n\n**Catalog**: [https://cds.climate.copernicus.eu/datasets](https://cds.climate.copernicus.eu/datasets)\n\nVisit the following to register / acquire API credentials:\n_[Install the CDS API key](https://cds.climate.copernicus.eu/how-to-api)_. After, please set\nthe `api_url` and `api_key` arguments in the `parameters` section of your configuration. 
Alternatively, one can set\nthese values as environment variables:\n\n```shell\nexport CDSAPI_URL=$api_url\nexport CDSAPI_KEY=$api_key\n```\n\nFor CDS parameter options, check out\nthe [Copernicus documentation](https://cds.climate.copernicus.eu/datasets).\nSee [this example](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-pressure-levels?tab=overview)\nfor what kind of requests one can make.\n\n### MARS\n\n**License**: By using a MARS dataset, users agree to the terms and conditions specified in the [License](https://www.ecmwf.int/en/forecasts/accessing-forecasts/licences-available) document.\n\n**Catalog**: [https://apps.ecmwf.int/archive-catalogue/](https://apps.ecmwf.int/archive-catalogue/)\n\nVisit the following to register / acquire API credentials:\n_[Install ECMWF Key](https://confluence.ecmwf.int/display/WEBAPI/Access+MARS#AccessMARS-key)_. After, please set\nthe `api_url`, `api_key`, and `api_email` arguments in the `parameters` section of your configuration. Alternatively,\none can set these values as environment variables:\n\n```shell\nexport MARSAPI_URL=$api_url\nexport MARSAPI_EMAIL=$api_email\nexport MARSAPI_KEY=$api_key\n```\n\nFor MARS parameter options, first read up on\n[MARS request syntax](https://confluence.ecmwf.int/display/WEBAPI/Brief+MARS+request+syntax). For a full range of what\ndata can be requested, please consult the [MARS catalog](https://apps.ecmwf.int/archive-catalogue/).\nSee [these examples](https://confluence.ecmwf.int/display/UDOC/MARS+example+requests)\nto discover the kinds of requests that can be made.\n\n> **NOTE**: MARS data is stored on tape drives. It takes longer for multiple workers to request data than a single\n> worker. Thus, it's recommended _not_ to set a partition key when writing MARS data configurations."
  },
  {
    "path": "Dockerfile",
    "content": "\n# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# ==============================================================================\nARG py_version=3.8\nFROM apache/beam_python${py_version}_sdk:2.40.0 as beam_sdk\nFROM continuumio/miniconda3:v25.11.1\n\n# Add the mamba solver for faster builds\nRUN conda install -n base conda-libmamba-solver\nRUN conda config --set solver libmamba\n\n# Create conda env using environment.yml\nARG weather_tools_git_rev=main\nRUN git clone https://github.com/google/weather-tools.git /weather\nWORKDIR /weather\nRUN git checkout \"${weather_tools_git_rev}\"\nRUN rm -r /weather/weather_*/test_data/\nRUN conda env create -f environment.yml --debug && \\\n    conda clean --all -f --yes\n\n# Activate the conda env and update the PATH\nARG CONDA_ENV_NAME=weather-tools\nRUN echo \"source activate ${CONDA_ENV_NAME}\" >> ~/.bashrc\nENV PATH /opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH\n\n# Install gcloud alpha\nRUN apt-get update -y\nRUN gcloud components install alpha --quiet\n\n# Copy files from official SDK image, including script/dependencies.\nCOPY --from=beam_sdk /opt/apache/beam /opt/apache/beam\n\n# Set the entrypoint to Apache Beam SDK launcher.\nENTRYPOINT [\"/opt/apache/beam/boot\"]\n"
  },
  {
    "path": "Efficient-Requests.md",
    "content": "# Writing Efficient Data Requests\n\nTODO([#26](https://github.com/googlestaging/weather-tools/issues/26)). In the meantime, please consult this ECMWF\ndocumentation:\n* [Web API Retrieval Efficiency](https://confluence.ecmwf.int/display/WEBAPI/Retrieval+efficiency)\n* [Era 5 daily data retrieval efficiency](https://confluence.ecmwf.int/display/WEBAPI/ERA5+daily+retrieval+efficiency)\n"
  },
  {
    "path": "LICENSE",
    "content": "\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "MANIFEST.in",
    "content": "global-exclude *.nc\nglobal-exclude *.gb *.grib\nglobal-exclude *.bz2\nglobal-exclude *.yaml *.yml\nglobal-exclude *_test.py\nprune .github\nprune bin\nprune docs\nprune weather_*/test_data\nexclude tox.ini\n"
  },
  {
    "path": "README.md",
    "content": "# weather-tools\n\nApache Beam pipelines to make weather data accessible and useful.\n\n[![CI](https://github.com/googlestaging/weather-tools/actions/workflows/ci.yml/badge.svg)](https://github.com/googlestaging/weather-tools/actions/workflows/ci.yml)\n[![Documentation Status](https://readthedocs.org/projects/weather-tools/badge/?version=latest)](https://weather-tools.readthedocs.io/en/latest/?badge=latest)\n\n## Introduction\n\nThis project contributes a series of command-line tools to make common data engineering tasks easier for researchers in\nclimate and weather. These solutions were born out of the need to improve repeated work performed by research teams\nacross Alphabet.\n\nThe first tool created was the weather downloader (`weather-dl`). This makes it easier to ingest data from the European\nCenter for Medium Range Forecasts (ECMWF). `weather-dl` enables users to describe very specifically what data they'd\nlike to ingest from ECMWF's catalogs. It also offers them control over how to parallelize requests, empowering users to\n[retrieve data efficiently](Efficient-Requests.md). Downloads are driven from a\n[configuration file](Configuration.md), which can be reviewed (and version-controlled) independently of pipeline or\nanalysis code.\n\nWe also provide two additional tools to aid climate and weather researchers: the weather mover (`weather-mv`) and the\nweather splitter (`weather-sp`). These CLIs are still in their alpha stages of development. Yet, they have been used for\nproduction workflows for several partner teams.\n\nWe created the weather mover (`weather-mv`) to load geospatial data from cloud buckets\ninto [Google BigQuery](https://cloud.google.com/bigquery). 
This enables rapid exploratory analysis and visualization of\nweather data: From BigQuery, scientists can load arbitrary climate data fields into a Pandas or XArray dataframe via a\nsimple SQL query.\n\nThe weather splitter (`weather-sp`) helps normalize how archival weather data is stored in cloud buckets:\nWhether you're trying to merge two datasets with overlapping variables — or, you simply need\nto [open Grib data from XArray](https://github.com/ecmwf/cfgrib/issues/2), it's really useful to split datasets into\ntheir component variables.\n\n## Installing\n\nIt is currently recommended that you create a local python environment (with\n[Anaconda](https://www.anaconda.com/products/individual)) and install the\nsources as follows:\n\n  ```shell\nconda env create --name weather-tools --file=environment.yml\nconda activate weather-tools\n  ```\n\n> Note: Due to its use of 3rd-party binary dependencies such as GDAL and MetView, `weather-tools`\n> is transitioning from PyPi to Conda for its main release channel. The instructions above\n> are a temporary workaround before our Conda-forge release.\n\nFrom here, you can use the `weather-*` tools from your python environment. Currently, the following tools are available:\n\n- [⛈ `weather-dl`](weather_dl/README.md) (_beta_) – Download weather data (namely, from ECMWF's API).\n- [⛅️ `weather-mv`](weather_mv/README.md) (_alpha_) – Load weather data into analytics engines, like BigQuery.\n- [🌪 `weather-sp`](weather_sp/README.md) (_alpha_) – Split weather data by arbitrary dimensions.\n\n## Quickstart\n\nIn this tutorial, we will\ndownload the [Era 5 pressure level dataset](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-pressure-levels?tab=overview)\nand ingest it into Google BigQuery using `weather-dl` and `weather-mv`, respectively.\n\n### Prerequisites\n\n1. 
[Register here](https://www.ecmwf.int/) and [here](https://cds.climate.copernicus.eu/) for a license from\n   ECMWF's [Copernicus (CDS) API](https://cds.climate.copernicus.eu/api-how-to).\n2. User must agree to the Terms of Use of a dataset before downloading any data out of dataset.(E.g.: accept [terms & condition](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-single-levels?tab=download#manage-licences) from here.)\n3. Install your license by copying your API url & key from [this page](https://cds.climate.copernicus.eu/how-to-api) to a new file `$HOME/.cdsapirc`.[^1] The file should look like this:\n   ```\n   url: https://cds.climate.copernicus.eu/api\n   key: <YOUR_USER_ID>:<YOUR_API_KEY>\n   ```\n4. If you do not already have a Google Cloud project, create one by following\n   [these steps](https://cloud.google.com/docs/get-started). If you are working on\n   an existing project, make sure your user has the [BigQuery Admin role](https://cloud.google.com/bigquery/docs/access-control#bigquery.admin).\n   To learn more about granting IAM roles to users in Google Cloud, visit the\n   [official docs](https://cloud.google.com/iam/docs/granting-changing-revoking-access#grant-single-role).\n5. Create an empty BigQuery Dataset. This can be done using\n   the [Google Cloud Console](https://cloud.google.com/bigquery/docs/quickstarts/quickstart-cloud-console#create_a_dataset)\n   or via the [`bq` CLI tool](https://cloud.google.com/bigquery/docs/quickstarts/quickstart-command-line). \n   For example:\n   ```shell\n   bq mk --project_id=$PROJECT_ID $DATASET_ID\n   ```\n6. Follow [these steps](https://cloud.google.com/storage/docs/creating-buckets) \n   to create a bucket for staging temporary files in [Google Cloud Storage](https://cloud.google.com/storage).\n\n### Steps\n\nFor the purpose of this tutorial, we will use your local machine to run the\ndata pipelines. 
Note that all `weather-tools` can also be run in [Cloud Dataflow](https://cloud.google.com/dataflow)\nwhich is easier to scale and fully managed.\n\n1. Use `weather-dl` to download the *Era 5 pressure level* dataset.\n   ```bash\n   weather-dl configs/era5_example_config_local_run.cfg \\\n      --local-run # Use the local machine\n   ```\n\n   > Recommendation: Pass the `-d, --dry-run` flag to any of these commands to preview the effects.\n\n   **NOTE:** By default, local downloads are saved to the `./local_run` directory unless another file system is specified.\n   The recommended output location for `weather-dl` is [Cloud Storage](https://cloud.google.com/storage).\n   The source and destination of the download are configured using the `.cfg` configuration file which is passed to the command.\n   To learn more about this configuration file's format and features,\n   see [this reference](Configuration.md). To learn more about the `weather-dl` command, visit [here](weather_dl/README.md).\n\n2. *(optional)* Split your downloaded dataset up with `weather-sp`:\n\n   ```shell\n    weather-sp --input-pattern \"./local_run/era5-*.nc\" \\\n       --output-dir \"split_data\" \n   ```\n\n   Visit the `weather-sp` [docs](weather_sp/README.md) for more information.\n\n3. Use `weather-mv` to ingest the downloaded data into BigQuery, in a structured format.\n\n   ```bash\n   weather-mv bigquery --uris \"./local_run/**.nc\" \\ # or \"./split_data/**.nc\" if weather-sp is used\n      --output_table \"$PROJECT.$DATASET_ID.$TABLE_ID\" \\ # The path to the destination BigQuery table\n      --temp_location \"gs://$BUCKET/tmp\" \\  # Needed for stage temporary files before writing to BigQuery\n      --direct_num_workers 2\n   ```\n\n   See [these docs](weather_mv/README.md) for more about the `weather-mv` command.\n\nThat's it! 
After the pipeline is completed, you should be able to query the ingested \ndataset in [BigQuery SQL workspace](https://cloud.google.com/bigquery/docs/bigquery-web-ui)\nand analyze it using [BigQuery ML](https://cloud.google.com/bigquery-ml/docs/introduction).\n\n## Contributing\n\nThe weather tools are under active development, and contributions are welcome! Please check out\nour [guide](CONTRIBUTING.md) to get started.\n\n## License\n\nThis is not an official Google product.\n\n```\nCopyright 2021 Google LLC\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n    https://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License.\n```\n\n[^1]: Note that you need to be logged in for the [CDS API page](https://cds.climate.copernicus.eu/how-to-api) to actually show your user ID and API key. Otherwise, it will display a placeholder, which is confusing to some users.\n"
  },
  {
    "path": "Runners.md",
    "content": "# Choosing a Beam Runner\n\nAll tools use Apache Beam pipelines. By default, pipelines run locally using the `DirectRunner`. You can optionally\nchoose to run the pipelines on [Google Cloud Dataflow](https://cloud.google.com/dataflow) by selecting\nthe `DataflowRunner`.\n\nWhen working with GCP, it's recommended you set the project ID up front with the command:\n\n```shell\ngcloud config set project <your-id>\n```\n\n## _Direct Runner options_:\n\n* `--direct_num_workers`: The number of workers to use. We recommend 2 for local development.\n\nExample run:\n\n```shell\nweather-mv -i gs://netcdf_file.nc \\\n  -o $PROJECT.$DATASET_ID.$TABLE_ID \\\n  -t gs://$BUCKET/tmp  \\\n  --direct_num_workers 2\n```\n\nFor a full list of how to configure the direct runner, please review\n[this page](https://beam.apache.org/documentation/runners/direct/).\n\n## _Dataflow options_:\n\n* `--runner`: The `PipelineRunner` to use. This field can be either `DirectRunner` or `DataflowRunner`.\n  Default: `DirectRunner` (local mode)\n* `--project`: The project ID for your Google Cloud Project. This is required if you want to run your pipeline using the\n  Dataflow managed service (i.e. `DataflowRunner`).\n* `--temp_location`: Cloud Storage path for temporary files. Must be a valid Cloud Storage URL, beginning with `gs://`.\n* `--region`: Specifies a regional endpoint for deploying your Dataflow jobs. 
Default: `us-central1`.\n* `--job_name`: The name of the Dataflow job being executed as it appears in Dataflow's jobs list and job details.\n\nExample run:\n\n```shell\nweather-dl configs/seasonal_forecast_example_config.cfg \\\n  --runner DataflowRunner \\\n  --project $PROJECT \\\n  --region $REGION \\\n  --temp_location gs://$BUCKET/tmp/\n```\n\nFor a full list of how to configure the Dataflow pipeline, please review\n[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).\n\n## Monitoring\n\nWhen running Dataflow, you\ncan [monitor jobs through UI](https://cloud.google.com/dataflow/docs/guides/using-monitoring-intf),\nor [via Dataflow's CLI commands](https://cloud.google.com/dataflow/docs/guides/using-command-line-intf):\n\nFor example, to see all outstanding Dataflow jobs, simply run:\n\n```shell\ngcloud dataflow jobs list\n```\n\nTo describe stats about a particular Dataflow job, run:\n\n```shell\ngcloud dataflow jobs describe $JOBID\n```\n\nIn addition, Dataflow provides a series\nof [Beta CLI commands](https://cloud.google.com/sdk/gcloud/reference/beta/dataflow).\n\nThese can be used to keep track of job metrics, like so:\n\n```shell\nJOBID=<enter job id here>\ngcloud beta dataflow metrics list $JOBID --source=user\n```\n\nYou can even [view logs via the beta commands](https://cloud.google.com/sdk/gcloud/reference/beta/dataflow/logs/list):\n\n```shell\ngcloud beta dataflow logs list $JOBID\n```\n\n"
  },
  {
    "path": "Runtime-Container.md",
    "content": "# Runtime Containers\n\n_How to build & publish custom Dataflow images in GCS_\n\n*Pre-requisites*: Install the gcloud CLI ([instructions are here](https://cloud.google.com/sdk/docs/install)).\n\nThen, log in to your cloud account:\n\n```shell\ngcloud auth login\n```\n\n> Please follow all the instructions from the CLI. This will involve running an\n> auth script on your local machine, which will open a browser window to log you\n> in.\n\nLast, make sure you have adequate permissions to use Google Cloud Build (see\n[IAM options here](https://cloud.google.com/build/docs/iam-roles-permissions)).\n[This documentation](https://cloud.google.com/build/docs/securing-builds/configure-access-to-resources)\nwill help you configure your project to use Cloud Build.\n\n*Updating the image*: Please modify the `Dockerfile` in the root directory. Then, build and upload the image with Google\nCloud Build (updating the tag, as is appropriate):\n\n```shell\nexport PROJECT=<your-project-here>\nexport REPO=weather-tools\nexport IMAGE_URI=gcr.io/$PROJECT/$REPO\nexport TAG=\"0.0.0\" # Please increment on every update.\n# from the project root...\n\n# dev release\ngcloud builds submit . --tag \"$IMAGE_URI:dev\"\n\n# release:\ngcloud builds submit . --tag \"$IMAGE_URI:$TAG\"  && gcloud builds submit weather_mv/ --tag \"$IMAGE_URI:latest\"\n```\n"
  },
  {
    "path": "bin/install-branch",
    "content": "#!/usr/bin/env sh\n# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Set the first argument as the name of the branch. If unset, default to \"main\".\npip install -e \"git+http://github.com/google/weather-tools.git@${1:-main}#egg=google-weather-tools\"\n"
  },
  {
    "path": "bin/post-push",
    "content": "#!/usr/bin/env sh\n# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nMAG=\"\\033[95m\"\nBLD=\"\\033[1m\"\nEND=\"\\033[0m\"\n\n# End script early on error\nset -e\n\n# In project root...\ncd \"$(dirname $0)/..\"\n\nflake8\npytype\npytest\n\necho \"$MAG$BLD--- Presubmit: Success ---$END\"\nexit 0\n"
  },
  {
    "path": "ci3.8.yml",
    "content": "name: weather-tools\nchannels:\n  - conda-forge\n  - defaults\ndependencies:\n  - python=3.8.13\n  - apache-beam=2.40.0\n  - pytest=7.2.0\n  - pytest-subtests=0.8.0\n  - cfgrib=0.9.10.2\n  - dask=2022.10.0\n  - dataclasses=0.8\n  - distributed=2022.10.0\n  - eccodes=2.27.0\n  - requests=2.28.1\n  - netcdf4=1.6.1\n  - rioxarray=0.13.4\n  - xarray-beam=0.6.2\n  - ecmwf-api-client=1.6.3\n  - fsspec=2022.11.0\n  - gcsfs=2022.11.0\n  - gdal=3.5.1\n  - pyproj=3.4.0\n  - geojson=2.5.0=py_0\n  - simplejson=3.17.6\n  - metview-batch=5.17.0\n  - numpy=1.22.4\n  - pandas=1.5.1\n  - pip=24.3.1\n  - pygrib=2.1.4\n  - xarray==2023.1.0\n  - ruff==0.1.2\n  - google-cloud-sdk=410.0.0\n  - aria2=1.36.0\n  - zarr=2.15.0\n  - google-cloud-monitoring=2.22.2\n  - pillow=10.4.0\n  - cdsapi=0.7.5\n  - pip:\n    - cython==0.29.34\n    - earthengine-api==0.1.329\n    - pyparsing==3.1.4\n    - .[test]\n"
  },
  {
    "path": "ci3.9.yml",
    "content": "name: weather-tools\nchannels:\n  - conda-forge\n  - defaults\ndependencies:\n  - python=3.9.13\n  - apache-beam=2.40.0\n  - pytest=7.2.0\n  - pytest-subtests=0.8.0\n  - cfgrib=0.9.10.2\n  - dask=2022.10.0\n  - dataclasses=0.8\n  - distributed=2022.10.0\n  - eccodes=2.27.0\n  - requests=2.28.1\n  - netcdf4=1.6.1\n  - rioxarray=0.13.4\n  - xarray-beam=0.6.2\n  - ecmwf-api-client=1.6.3\n  - fsspec=2022.11.0\n  - gcsfs=2022.11.0\n  - gdal=3.5.1\n  - pyproj=3.4.0\n  - geojson=2.5.0=py_0\n  - simplejson=3.17.6\n  - metview-batch=5.17.0\n  - numpy=1.22.4\n  - pandas=1.5.1\n  - pip=24.3.1\n  - pygrib=2.1.4\n  - google-cloud-sdk=410.0.0\n  - aria2=1.36.0\n  - xarray==2023.1.0\n  - ruff==0.1.2\n  - zarr=2.15.0\n  - google-cloud-monitoring=2.22.2\n  - pillow=10.4.0\n  - cdsapi=0.7.5\n  - pip:\n    - cython==0.29.34\n    - earthengine-api==0.1.329\n    - pyparsing==3.1.4\n    - .[test]\n"
  },
  {
    "path": "configs/era5_example_config.cfg",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n[parameters]\nclient=cds\ndataset=reanalysis-era5-pressure-levels\ntarget_path=gs://ecmwf-output-test/era5/{year:04d}/{month:02d}/{day:02d}-pressure-{pressure_level}.nc\npartition_keys=\n    year\n    month\n    day\n    pressure_level\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[selection]\nproduct_type=reanalysis\nformat=netcdf\nvariable=\n    divergence\n    fraction_of_cloud_cover\n    geopotential\npressure_level=\n    500\nyear=\n    2015\n    2016\n    2017\nmonth=\n    01\nday=\n    01\n    15\ntime=\n    00:00\n    06:00\n    12:00\n    18:00"
  },
  {
    "path": "configs/era5_example_config_local_run.cfg",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n## Run with\n# $ weather-dl configs/era5_example_config_local_run.cfg --local-run\n# This will create a folder '$CWD/local_run' with a manifest.json file and four data files\n# era5-20160101-pressure-500.nc\n# era5-20160115-pressure-500.nc\n# era5-20170101-pressure-500.nc\n# era5-20170115-pressure-500.nc\n[parameters]\nclient=cds\ndataset=reanalysis-era5-pressure-levels\ntarget_path=era5-{year:04d}{month:02d}{day:02d}-pressure-{pressure_level}.nc\npartition_keys=\n    year\n    month\n    day\n    pressure_level\n[selection]\nproduct_type=reanalysis\nformat=netcdf\nvariable=\n    divergence\n    fraction_of_cloud_cover\n    geopotential\npressure_level=\n    500\nyear=\n    2016\n    2017\nmonth=\n    01\nday=\n    01\n    15\ntime=\n    00:00\n    12:00\n"
  },
  {
    "path": "configs/era5_example_config_local_run.json",
    "content": "{\n  \"parameters\": {\n    \"client\": \"cds\",\n    \"dataset\": \"reanalysis-era5-pressure-levels\",\n    \"target_path\": \"era5-{}{}{}-pressure-{}.nc\",\n    \"partition_keys\": [\"year\",\"month\",\"day\",\"pressure_level\"]\n  },\n  \"selection\": {\n    \"product_type\": \"ensemble_mean\",\n    \"format\": \"netcdf\",\n    \"variable\": [\"divergence\",\"fraction_of_cloud_cover\",\"geopotential\"],\n    \"pressure_level\": [500],\n    \"year\": [2016, 2017],\n    \"month\": [1],\n    \"day\": [1, 15],\n    \"time\": [\"00:00\", \"12:00\"]\n  }\n}\n"
  },
  {
    "path": "configs/era5_example_config_preproc.cfg",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n[parameters]\nclient=cds\ndataset=reanalysis-era5-pressure-levels\ntarget_path=gs://ecmwf-downloads/test/o1280-{year:04d}-{month:02d}-{day:02d}.grib\npartition_keys=\n    year\n    month\n    day\n[selection]\n# These are the averages of an ensemble of analyses.\nproduct_type=ensemble_mean\nformat=grib\ngrid=o1280\nvariable=\n    relative_humidity\n    temperature\n    fraction_of_cloud_cover\n    geopotential\n    u_component_of_wind\n    v_component_of_wind\npressure_level=\n    500\nyear=\n    2020\nmonth=\n    01\nday=\n    01\n    02\ntime=\n    00:00\n    12:00\n"
  },
  {
    "path": "configs/era5_example_config_using_date.cfg",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n[parameters]\nclient=cds\ndataset=reanalysis-era5-pressure-levels\n# This config creates a date-based directory hierarchy.\n# In this case, the two files that will be created are\n# gs://ecmwf-output-test/era5/2017/01/01-pressure-500.nc\n# gs://ecmwf-output-test/era5/2017/01/02-pressure-500.nc\n# gs://ecmwf-output-test/era5/2017/01/01-pressure-1000.nc\n# gs://ecmwf-output-test/era5/2017/01/02-pressure-1000.nc\ntarget_path=gs://ecmwf-output-test/era5/{date:%%Y/%%m/%%d}-pressure-{pressure_level}.nc\npartition_keys=\n     date\n     pressure_level\n[selection]\nproduct_type=reanalysis\nformat=netcdf\nvariable=\n    divergence\n    fraction_of_cloud_cover\n    geopotential\npressure_level=\n    500\n    1000\ndate=2017-01-01/to/2017-01-02\ntime=\n    00:00\n    06:00\n    12:00\n    18:00\n"
  },
  {
    "path": "configs/era5_example_monthly_soil.cfg",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n[parameters]\nclient=cds\ndataset=reanalysis-era5-complete\ntarget_path=gs://ecmwf-output-test/era5/ERA5GRIB/HRES/Month/{year}/{year}{month:02d}_hres_soil.grb2\npartition_keys=\n    year\n    month\n\n[selection]\nclass=ea\nstream=oper\nexpver=1\nlevtype=sfc\ntype=an\nyear=1979/to/2021\nmonth=01/to/12\nday=all\ntime=00/to/23/by/1\nparam=139.128/170.128/183.128/236.128/238.128/39.128/40.128/41.128/42.128/35.128/36.128/37.128/38.128\n"
  },
  {
    "path": "configs/mars_example_config.cfg",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n[parameters]\nclient=mars\ndataset=ecmwf-mars-output\ntarget_path=gs://ecmwf-downloads/hres-single-level/{date:%%Y/%%m/%%d}.nc\npartition_keys=\n    date\n\n[selection]\n# example requests\n# https://confluence.ecmwf.int/display/UDOC/MARS+example+requests\n\n# hres fields\n# https://www.ecmwf.int/en/forecasts/datasets/set-i\n\nstream=oper\nlevtype=sfc\nparam=10fg6/10u/10v/100u/100vcrr/2t/2d/200u/200v/cp/dsrp/hcc/i10fg/lcc/lsp/lspf/lsrr/msl/ptype/sf/sp/ssr/tcrw/tclw/tcsw/tcw/tcwv/tp\npadding=0\nstep=0/1/2/3/4/5/6/7/8/9/10/11/12/24/48/72/96/120/144/168/192/216/240\ngrid=0.125/0.125\nexpver=1\ntime=0000/1800\ndate=2017-01-01/to/2017-01-07\n# these are weather forecasts\ntype=fc\nclass=od\nexpect=anymars \nformat=netcdf\n"
  },
  {
    "path": "configs/mars_example_config.json",
    "content": "{\n  \"parameters\": {\n    \"client\": \"mars\",\n    \"dataset\": \"ecmwf-mars-output\",\n    \"target_path\": \"gs://ecmwf-downloads/hres-single-level/{:%Y/%m/%d}.nc\",\n    \"partition_keys\": \"date\"\n  },\n\n  \"selection\": {\n    \"stream\": \"oper\",\n    \"levtype\": \"sfc\",\n    \"param\": [\"10fg6\",\"10u\",\"10v\",\"100u\",\"100vcrr\",\"2t\",\"2d\",\"200u\",\"200v\",\"cp\",\"dsrp\",\"hcc\",\"i10fg\",\"lcc\",\"lsp\",\"lspf\",\"lsrr\",\"msl\",\"ptype\",\"sf\",\"sp\",\"ssr\",\"tcrw\",\"tclw\",\"tcsw\",\"tcw\",\"tcwv\",\"tp\"],\n    \"padding\": 0,\n    \"step\": [0,1,2,3,4,5,6,7,8,9,10,11,12,24,48,72,96,120,144,168,192,216,240],\n    \"grid\": [0.125,0.125],\n    \"expver\": 1,\n    \"time\": [\"0000\",\"1800\"],\n    \"date\": [\"2017-01-01\",\"2017-01-07\"],\n    \"type\": \"fc\",\n    \"class\": \"od\",\n    \"expect\": \"anymars\",\n    \"format\": \"netcdf\"\n  }\n}\n"
  },
  {
    "path": "configs/multiple_multiple_licenses/era5_pressure500.cfg",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n[parameters]\nclient=cds\ndataset=reanalysis-era5-pressure-levels\ntarget_path=gs://ecmwf-output-test/era5/{year:04d}/{month:02d}/{day:02d}-pressure-{pressure_level}.nc\npartition_keys=\n    year\n    month\n    day\n    pressure_level\n[parameters.a]\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[parameters.b]\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[parameters.c]\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[selection]\nproduct_type=reanalysis\nformat=netcdf\nvariable=\n    divergence\n    fraction_of_cloud_cover\n    geopotential\npressure_level=\n    500\nyear=\n    2015\n    2016\n    2017\nmonth=\n    01\nday=\n    01\n    15\ntime=\n    00:00\n    06:00\n    12:00\n    18:00"
  },
  {
    "path": "configs/multiple_multiple_licenses/era5_pressure600.cfg",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n[parameters]\nclient=cds\ndataset=reanalysis-era5-pressure-levels\ntarget_path=gs://ecmwf-output-test/era5/{year:04d}/{month:02d}/{day:02d}-pressure-{pressure_level}.nc\npartition_keys=\n    year\n    month\n    day\n    pressure_level\n[parameters.a]\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[parameters.b]\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[parameters.c]\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[selection]\nproduct_type=reanalysis\nformat=netcdf\nvariable=\n    divergence\n    fraction_of_cloud_cover\n    geopotential\npressure_level=\n    600\nyear=\n    2015\n    2016\n    2017\nmonth=\n    01\nday=\n    01\n    15\ntime=\n    00:00\n    06:00\n    12:00\n    18:00"
  },
  {
    "path": "configs/multiple_multiple_licenses/era5_pressure700.cfg",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n[parameters]\nclient=cds\ndataset=reanalysis-era5-pressure-levels\ntarget_path=gs://ecmwf-output-test/era5/{year:04d}/{month:02d}/{day:02d}-pressure-{pressure_level}.nc\npartition_keys=\n    year\n    month\n    day\n    pressure_level\n[parameters.a]\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[parameters.b]\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[parameters.c]\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[selection]\nproduct_type=reanalysis\nformat=netcdf\nvariable=\n    divergence\n    fraction_of_cloud_cover\n    geopotential\npressure_level=\n    700\nyear=\n    2015\n    2016\n    2017\nmonth=\n    01\nday=\n    01\n    15\ntime=\n    00:00\n    06:00\n    12:00\n    18:00"
  },
  {
    "path": "configs/multiple_single_license/era5_pressure500.cfg",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n[parameters]\nclient=cds\ndataset=reanalysis-era5-pressure-levels\ntarget_path=gs://ecmwf-output-test/era5/{year:04d}/{month:02d}/{day:02d}-pressure-{pressure_level}.nc\npartition_keys=\n    year\n    month\n    day\n    pressure_level\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[selection]\nproduct_type=reanalysis\nformat=netcdf\nvariable=\n    divergence\n    fraction_of_cloud_cover\n    geopotential\npressure_level=\n    500\nyear=\n    2015\n    2016\n    2017\nmonth=\n    01\nday=\n    01\n    15\ntime=\n    00:00\n    06:00\n    12:00\n    18:00"
  },
  {
    "path": "configs/multiple_single_license/era5_pressure600.cfg",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n[parameters]\nclient=cds\ndataset=reanalysis-era5-pressure-levels\ntarget_path=gs://ecmwf-output-test/era5/{year:04d}/{month:02d}/{day:02d}-pressure-{pressure_level}.nc\npartition_keys=\n    year\n    month\n    day\n    pressure_level\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[selection]\nproduct_type=reanalysis\nformat=netcdf\nvariable=\n    divergence\n    fraction_of_cloud_cover\n    geopotential\npressure_level=\n    600\nyear=\n    2015\n    2016\n    2017\nmonth=\n    01\nday=\n    01\n    15\ntime=\n    00:00\n    06:00\n    12:00\n    18:00"
  },
  {
    "path": "configs/multiple_single_license/era5_pressure700.cfg",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n[parameters]\nclient=cds\ndataset=reanalysis-era5-pressure-levels\ntarget_path=gs://ecmwf-output-test/era5/{year:04d}/{month:02d}/{day:02d}-pressure-{pressure_level}.nc\npartition_keys=\n    year\n    month\n    day\n    pressure_level\napi_url=https://cds.climate.copernicus.eu/api\n# a fake key for tests\napi_key=12345:1234567-ab12-34cd-9876-4o4fake90909\n[selection]\nproduct_type=reanalysis\nformat=netcdf\nvariable=\n    divergence\n    fraction_of_cloud_cover\n    geopotential\npressure_level=\n    700\nyear=\n    2015\n    2016\n    2017\nmonth=\n    01\nday=\n    01\n    15\ntime=\n    00:00\n    06:00\n    12:00\n    18:00"
  },
  {
    "path": "configs/s2s_operational_forecast_example.cfg",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n[parameters]\nclient=ecpublic\ndataset=s2s\ntarget_path=gs://ecmwf-downloads/s2s/pressure-levels/{date}-all-levs.gb\n# target_path=gs://ecmwf-downloads/s2s/pressure-levels/{date}-hc-{hdate}-all-levs.gb\npartition_keys=\n    date\n#    hdate\n\n[selection]\nclass=s2\ndate=2016-01-04/2016-01-07/2016-01-11/2016-01-14/2016-01-18/2016-01-21/2016-01-25\n# hdate = <value>, The number of years to subtract.\n# Eg input:\n#           date = 2020-01-02\n#           hdate = 1/to/6 or 1/2/3/4/5/6\n# Code will do:\n#           hdate = 2019-01-02/2018-01-02/2017-01-02/2016-01-02/2015-01-02/2014-01-02\n# Note: If 'hdate' is specified in the 'selection' section, then 'date' is required as a partition keys.\nhdate=1/to/20\nexpver=prod\nlevelist=10/50/100/200/300/500/700/850/925/1000\nlevtype=pl\nmodel=glob\nnumber=1/to/50\norigin=ecmf\n# All except q\nparam=130/131/132/135/156\nstep=0/to/1104/by/24\nstream=enfo\ntime=00:00:00\ntype=pf"
  },
  {
    "path": "configs/seasonal_forecast_example_config.cfg",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n[parameters]\nclient=cds\ndataset=seasonal-original-single-levels\ntarget_path=gs://ecmwf-output-test/seasonal-forecast/seasonal-forecast-{year:04d}-{month:02d}.nc\npartition_keys=\n    year\n    month\n[selection]\nformat=netcdf\noriginating_centre=ecmwf\nvariable=\n    10m_u_component_of_wind\n    10m_v_component_of_wind\nyear=\n    2016\n    2017\nmonth=\n    01\nday=\n    01\nleadtime_hour=\n    6\narea=\n  10\n  -10\n  -10\n  10\n"
  },
  {
    "path": "configs/tigge_example_config.cfg",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n[parameters]\nclient=ecpublic\ndataset=tigge\ntarget_path=gs://ecmwf-output-test/tigge/{}.gb\npartition_keys=\n    date\n[selection]\nclass=ti\ndate=2020-01-01/to/2020-01-31\nexpver=prod\ngrid=0.25/0.25\nlevtype=sfc\nnumber=1/to/50\norigin=ecmf\nparam=167\nstep=0/to/360/by/6\ntime=00/12\ntype=pf"
  },
  {
    "path": "configs/yesterdays_surface_example.cfg",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n[parameters]\nclient=mars\npartition_keys=\n    date\ntarget_path=all-{}.an\n[selection]\nclass   = od\ntype    = analysis\nlevtype = surface\ndate    = -1\ntime    = 00/06/12/18\nparam   = z/sp"
  },
  {
    "path": "docs/Makefile",
    "content": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the environment for the first two.\nSPHINXOPTS    ?=\nSPHINXBUILD   ?= sphinx-build\nSOURCEDIR     = .\nBUILDDIR      = _build\n\n# Put it first so that \"make\" without argument is like \"make help\".\nhelp:\n\t@$(SPHINXBUILD) -M help \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n\n.PHONY: help Makefile\n\n# Catch-all target: route all unknown targets to Sphinx using the new\n# \"make mode\" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).\n%: Makefile\n\t@$(SPHINXBUILD) -M $@ \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n"
  },
  {
    "path": "docs/Private-IP-Configuration.md",
    "content": "# Private IP Configuration \n\n_A Guide for Dataflow Pipeline Execution_\n\n## Goals\nIn this document, we’ll describe how to use Private IP for the execution of a dataflow pipeline.\n\n## Background\nWhen we are running the dataflow pipeline, GCP decides to spawn one or more new VM-instances. By default, each VM-instance will have an External IP address. \n\n![VM Instance with External IP Address](_static/vm_instance_with_external_ip_address.png \"VM Instance with External IP Address\")\n\nConsidering a billing account has a limited number of External IP addresses, we can skip this overhead by providing VPC-parameters as CLI-input of dataflow.\n\nFollowing table<font color=\"red\">*</font> summarizes the required input parameters.\n\n| Field             | Description                                                                                                                                                              |\n|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| network           | The Compute Engine network for launching Compute Engine instances to run your pipeline. If not set, Google Cloud assumes that you intend to use a network named default. |\n| subnetwork        | The Compute Engine subnetwork for launching Compute Engine instances to run your pipeline.                                                                               |\n| no_use_public_ips | Command-line flag that sets use_public_ips to False. If the option is not explicitly enabled or disabled, the Dataflow workers use public IP addresses.                  |\n\n(<font color=\"red\">*</font> excerpt from [GCP pipeline-options](https://cloud.google.com/dataflow/docs/reference/pipeline-options))\n\n## Steps to Configure VPC\n1. Configure VPC-Network & Subnetwork\n2. Configure Firewall-Rule\n3. Configure NAT & Router\n4. 
Sample commands to trigger dataflow pipeline execution using above options\n\n## Configure VPC-Network & Subnetwork\n1. Open GCP’s Create VPC-Network page.\n2. Provide name, description.\n3. Select “Subnet creation mode” as Custom.\n4. Provide name, description, region, IP address range & other details for the new subnet.\n5. Select “Private Google Access” as On.\n6. Complete VPC-Network creation by providing other required parameters. Refer to GCP’s [Create-VPC-Network](https://cloud.google.com/vpc/docs/create-modify-vpc-networks) documentation for more details.\n\n![VPC Network Details](_static/vpc_network_details.png \"VPC Network Details\")\n\n## Configure Firewall-Rule\n1. Open GCP’s Create a firewall rule page.\n2. Provide name, description. You may set “Logs” as Off.\n3. For the “Network” drop-down, select the network that we created in the previous step.\n4. Select “Direction of traffic” as Ingress & “Action on match” as Allow.\n5. For the \"Targets\" drop-down, select \"All instances in the network\".\n6. For the \"Source IPv4 ranges\", pass in the following: \"0.0.0.0/0\".\n7. Select \"Protocols and ports\" as Allow all.\n8. Complete Firewall-Rule creation by providing other necessary information. Refer to GCP’s [Configuring-Firewall](https://cloud.google.com/filestore/docs/configuring-firewall) documentation for more details.\n\n![Firewall Rule Details](_static/firewall_rule_details.png \"Firewall Rule Details\")\n\n## Configure NAT & Router\n1. Open GCP’s Create a NAT gateway page.\n2. Provide name, region.\n3. For the “Network” drop-down, select the network that we created earlier.\n4. For the “Router” drop-down, EITHER select pre-created router OR click on “create new router”.\n<br> a. Complete router creation by providing name, description & region. Refer to GCP’s [Create-Router](https://cloud.google.com/network-connectivity/docs/router/how-to/create-router-vpc-on-premises-network) documentation for more details.\n5. 
Complete NAT gateway creation by providing required details. Refer to GCP’s [Create-NAT-Gateway](https://cloud.google.com/nat/docs/set-up-manage-network-address-translation) documentation for more details.\n\n![Router Details](_static/router_details.png \"Router Details\")\n\n![NAT Gateways](_static/nat_gateways.png \"NAT Gateways\")\n\n## Sample commands to trigger dataflow pipeline execution using above options\n\nThe following section shows how VPC-parameters can be given as CLI inputs to the weather-mv dataflow pipeline.\n\n```bash\nweather-mv --uris \"gs://$STORAGE_BUCKET/*.nc\" \\\n           --output_table \"$HOST_PROJECT_ID.$DATASET_ID.$TABLE_ID\" \\\n           --temp_location \"gs://$STORAGE_BUCKET/tmp\" \\\n           --runner DataflowRunner \\\n           --project $HOST_PROJECT_ID \\\n           --region $REGION_NAME \\\n           --no_use_public_ips \\\n           --network=$NETWORK_NAME \\\n           --subnetwork=regions/$REGION_NAME/subnetworks/$SUBNETWORK_NAME\n```\n\nReplace the following:\n\n- STORAGE_BUCKET: the storage bucket, e.g. bucket_58231\n- HOST_PROJECT_ID: the host project ID, e.g. weather_tools\n- DATASET_ID: the name of dataset, e.g. weather_mv_ds\n- TABLE_ID: the name of table, e.g. tbl_2017_01 \n- REGION_NAME: the regional endpoint of your Dataflow job, e.g. us-central1\n- NETWORK_NAME: the name of your Compute Engine network, e.g. dataflow\n  - Provide network_name same as what we created in Step-1.\n- SUBNETWORK_NAME: the name of your Compute Engine subnetwork, e.g. 
private\n  - Provide a subnetwork_name same as what we created in Step-1.\n\nAlternatively, you may also execute the following command:\n\n```bash\nweather-mv --uris \"gs://$STORAGE_BUCKET/*.nc\" \\\n           --output_table \"$HOST_PROJECT_ID.$DATASET_ID.$TABLE_ID\" \\\n           --temp_location \"gs://$STORAGE_BUCKET/tmp\" \\\n           --runner DataflowRunner \\\n           --project $HOST_PROJECT_ID \\\n           --region $REGION_NAME \\\n           --no_use_public_ips \\\n           --subnetwork=https://www.googleapis.com/compute/v1/projects/$HOST_PROJECT_ID/regions/$REGION_NAME/subnetworks/$SUBNETWORK_NAME\n```\n"
  },
  {
    "path": "docs/_static/custom.css",
    "content": "p {\n    text-align: justify;\n}\n\nbody {\n   min-width: 250px;\n}\n\ndiv.body {\n    min-width: 250px;\n}\n"
  },
  {
    "path": "docs/conf.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common options. For a full\n# list see the documentation:\n# https://www.sphinx-doc.org/en/master/usage/configuration.html\n\n# -- Path setup --------------------------------------------------------------\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n\nimport os\nimport sys\n\nsys.path.insert(0, os.path.abspath('../weather_dl'))\nsys.path.insert(1, os.path.abspath('../weather_mv'))\nsys.path.insert(2, os.path.abspath('../weather_sp'))\n\n\n# -- Project information -----------------------------------------------------\n\nproject = 'weather-tools'\ncopyright = '2021 Google'\nauthor = 'Anthromets'\n\n# The full version, including alpha/beta/rc tags\nrelease = '0.0'\n\n\n# -- General configuration ---------------------------------------------------\n\n# Add any Sphinx extension module names here, as strings. 
They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = [\n    'myst_parser',\n    'sphinx.ext.todo',\n    'sphinx.ext.viewcode',\n    'sphinx.ext.autodoc',\n]\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = ['_templates']\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\n# This pattern also affects html_static_path and html_extra_path.\nexclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']\n\n\n# -- Options for HTML output -------------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  See the documentation for\n# a list of builtin themes.\n#\nhtml_theme = 'alabaster'\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\nhtml_static_path = ['_static']\n\n# https://stackoverflow.com/a/66295922/809705\nautodoc_typehints = \"description\"\n"
  },
  {
    "path": "docs/download_pipeline.md",
    "content": "# download_pipeline package\n=========================\n\n## Submodules\n\n### download_pipeline.clients module\n\n```{eval-rst}\n.. automodule:: weather_dl.download_pipeline.clients\n   :members:\n   :undoc-members:\n   :show-inheritance:\n```\n\n### download_pipeline.manifest module\n\n```{eval-rst}\n.. automodule:: weather_dl.download_pipeline.manifest\n   :members:\n   :undoc-members:\n   :show-inheritance:\n```\n\n### download_pipeline.parsers module\n\n```{eval-rst}\n.. automodule:: weather_dl.download_pipeline.parsers\n   :members:\n   :undoc-members:\n   :show-inheritance:\n```\n\n### download_pipeline.pipeline module\n\n```{eval-rst}\n.. automodule:: weather_dl.download_pipeline.pipeline\n   :members:\n   :undoc-members:\n   :show-inheritance:\n```\n\n### download_pipeline.store module\n\n```{eval-rst}\n.. automodule:: weather_dl.download_pipeline.stores\n   :members:\n   :undoc-members:\n   :show-inheritance:\n```\n\n## Module contents\n\n```{eval-rst}\n.. automodule:: download_pipeline\n   :members:\n   :undoc-members:\n   :show-inheritance:\n```"
  },
  {
    "path": "docs/index.md",
    "content": "\n```{include} README.md\n```\n\n## Contents\n\n```{eval-rst}\n.. toctree::\n   :maxdepth: 2\n   :titlesonly:\n   :glob:\n   :includehidden:\n\n   weather_dl/README\n   weather_mv/README\n   weather_sp/README\n   Configuration\n   Efficient-Requests\n   Runners\n   Private-IP-Configuration\n   Runtime-Container\n   CONTRIBUTING\n   modules\n```\n\n\n\n\n# Indices and tables\n```{eval-rst}\n* :ref:`genindex`\n* :ref:`modindex`\n* :ref:`search`\n```"
  },
  {
    "path": "docs/loader_pipeline.md",
    "content": "# loader_pipeline package\n\n## Submodules\n\n### loader_pipeline.netcdf_loader module\n\n```{eval-rst}\n.. automodule:: weather_mv.loader_pipeline\n   :members:\n   :undoc-members:\n   :show-inheritance:\n```\n\n## Module contents\n\n```{eval-rst}\n.. automodule:: loader_pipeline\n   :members:\n   :undoc-members:\n   :show-inheritance:\n```"
  },
  {
    "path": "docs/make.bat",
    "content": "REM Copyright 2021 Google LLC\nREM\nREM Licensed under the Apache License, Version 2.0 (the \"License\");\nREM you may not use this file except in compliance with the License.\nREM You may obtain a copy of the License at\nREM\nREM     https://www.apache.org/licenses/LICENSE-2.0\nREM\nREM Unless required by applicable law or agreed to in writing, software\nREM distributed under the License is distributed on an \"AS IS\" BASIS,\nREM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nREM See the License for the specific language governing permissions and\nREM limitations under the License.\n@ECHO OFF\n\npushd %~dp0\n\nREM Command file for Sphinx documentation\n\nif \"%SPHINXBUILD%\" == \"\" (\n\tset SPHINXBUILD=sphinx-build\n)\nset SOURCEDIR=.\nset BUILDDIR=_build\n\nif \"%1\" == \"\" goto help\n\n%SPHINXBUILD% >NUL 2>NUL\nif errorlevel 9009 (\n\techo.\n\techo.The 'sphinx-build' command was not found. Make sure you have Sphinx\n\techo.installed, then set the SPHINXBUILD environment variable to point\n\techo.to the full path of the 'sphinx-build' executable. Alternatively you\n\techo.may add the Sphinx directory to PATH.\n\techo.\n\techo.If you don't have Sphinx installed, grab it from\n\techo.http://sphinx-doc.org/\n\texit /b 1\n)\n\n%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%\ngoto end\n\n:help\n%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%\n\n:end\npopd\n"
  },
  {
    "path": "docs/modules.md",
    "content": "# Modules\n\n```{toctree}\n:maxdepth: 4\ndownload_pipeline\nloader_pipeline\nsplitter_pipeline\n```\n\n"
  },
  {
    "path": "docs/requirements.txt",
    "content": "# doc requirements\nmyst-parser==0.13.7\nsphinx>=2.1\nJinja2<3.1"
  },
  {
    "path": "docs/splitter_pipeline.md",
    "content": "# splitter_pipeline package\n\n## Submodules\n\n### splitter_pipeline.file_splitters module\n\n\n```{eval-rst}\n.. automodule:: weather_sp.splitter_pipeline.file_splitters\n   :members:\n   :undoc-members:\n   :show-inheritance:\n```\n\n### splitter_pipeline.pipeline module\n\n```{eval-rst}\n.. automodule:: weather_sp.splitter_pipeline.pipeline\n   :members:\n   :undoc-members:\n   :show-inheritance:\n```\n\n## Module contents\n\n```{eval-rst}\n.. automodule:: splitter_pipeline\n   :members:\n   :undoc-members:\n   :show-inheritance:\n```"
  },
  {
    "path": "environment.yml",
    "content": "name: weather-tools\nchannels:\n  - conda-forge\ndependencies:\n  - python=3.8.13\n  - apache-beam=2.40.0\n  - xarray-beam=0.6.2\n  - xarray=2023.1.0\n  - fsspec=2022.11.0\n  - gcsfs=2022.11.0\n  - rioxarray=0.13.4\n  - gdal=3.5.1\n  - pyproj=3.4.0\n  - ecmwf-api-client=1.6.3\n  - eccodes=2.27.0\n  - cfgrib=0.9.10.2\n  - pygrib=2.1.4\n  - metview-batch=5.17.0\n  - netcdf4=1.6.1\n  - geojson=2.5.0=py_0\n  - simplejson=3.17.6\n  - numpy=1.22.4\n  - pandas=1.5.1\n  - google-cloud-sdk=410.0.0\n  - aria2=1.36.0\n  - pip=24.3.1\n  - zarr=2.15.0\n  - google-cloud-monitoring=2.22.2\n  - pillow=10.4.0\n  - cdsapi=0.7.5\n  - pip:\n    - cython==0.29.34\n    - earthengine-api==0.1.329\n    - firebase-admin==6.0.1\n    - contourpy==1.1.1\n    - google-crc32c==1.1.2\n    - MarkupSafe==2.1.5\n    - setuptools==70.3.0\n    - pyparsing==3.1.4\n    - .\n    - ./weather_dl\n    - ./weather_mv\n    - ./weather_sp\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[tool.ruff]\n# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.\nselect = [\"E\", \"F\", \"W\"]\nignore = []\n\n# Allow autofix for all enabled rules (when `--fix`) is provided.\nfixable = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"]\nunfixable = []\n\n# Exclude a variety of commonly ignored directories.\nexclude = [\n    \".bzr\",\n    \".direnv\",\n    \".eggs\",\n    \".git\",\n    \".hg\",\n    \".mypy_cache\",\n    \".nox\",\n    \".pants.d\",\n    \".pytype\",\n    \".ruff_cache\",\n    \".svn\",\n    \".tox\",\n    \".venv\",\n    \"__pypackages__\",\n    \"_build\",\n    \"buck-out\",\n    \"build\",\n    \"dist\",\n    \"node_modules\",\n    \"venv\",\n]\n\n# Same as Black.\nline-length = 120\n\n# Allow unused variables when underscore-prefixed.\ndummy-variable-rgx = \"^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$\"\n\n# Assume Python 3.10.\ntarget-version = \"py310\"\n\n[tool.ruff.mccabe]\n# Unlike Flake8, default to a complexity level of 10.\nmax-complexity = 10\n"
  },
  {
    "path": "setup.cfg",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n[pytype]\ninputs = weather_dl\n         weather_mv\n         weather_sp\n"
  },
  {
    "path": "setup.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom setuptools import find_packages, setup\n\nbeam_gcp_requirements = [\n    \"google-cloud-bigquery==2.34.4\",\n    \"google-cloud-bigquery-storage==2.14.1\",\n    \"google-cloud-bigtable==1.7.2\",\n    \"google-cloud-core==1.7.3\",\n    \"google-cloud-datastore==1.15.5\",\n    \"google-cloud-dlp==3.8.0\",\n    \"google-cloud-language==1.3.2\",\n    \"google-cloud-pubsub==2.13.4\",\n    \"google-cloud-pubsublite==1.4.2\",\n    \"google-cloud-recommendations-ai==0.2.0\",\n    \"google-cloud-spanner==1.19.3\",\n    \"google-cloud-videointelligence==1.16.3\",\n    \"google-cloud-vision==1.0.2\",\n    \"apache-beam[gcp]==2.40.0\",\n]\n\nweather_dl_requirements = [\n    \"cdsapi==0.7.5\",\n    \"ecmwf-api-client\",\n    \"numpy>=1.19.1\",\n    \"pandas\",\n    \"xarray\",\n    \"requests>=2.24.0\",\n    \"google-cloud-firestore\",\n    \"firebase-admin\",\n    \"urllib3==1.26.5\",\n]\n\nweather_mv_requirements = [\n    \"dataclasses\",\n    \"numpy\",\n    \"pandas\",\n    \"xarray\",\n    \"cfgrib\",\n    \"netcdf4\",\n    \"geojson\",\n    \"simplejson\",\n    \"rioxarray\",\n    \"rasterio\",\n    \"earthengine-api>=0.1.263\",\n    \"pyproj\",  # requires separate binary installation!\n    \"gdal\",  # requires separate binary installation!\n    \"xarray-beam==0.6.2\",\n    \"gcsfs==2022.11.0\",\n    \"zarr==2.15.0\",\n]\n\nweather_sp_requirements = [\n    
\"numpy>=1.20.3\",\n    \"pygrib\",\n    \"xarray\",\n    \"scipy\",\n]\n\ntest_requirements = [\n    \"pytype==2021.11.29\",\n    \"ruff==0.1.2\",\n    \"pytest\",\n    \"pytest-subtests\",\n    \"netcdf4\",\n    \"numpy\",\n    \"xarray\",\n    \"xarray-beam\",\n    \"absl-py\",\n    \"metview\",\n    \"memray\",\n    \"pytest-memray\",\n    \"h5py\",\n    \"pooch\",\n]\n\nall_test_requirements = beam_gcp_requirements + weather_dl_requirements + \\\n                        weather_mv_requirements + weather_sp_requirements + \\\n                        test_requirements\n\nsetup(\n    name='google-weather-tools',\n    packages=find_packages(),\n    author='Anthromets',\n    author_email='anthromets-ecmwf@google.com',\n    url='https://weather-tools.readthedocs.io/',\n    description='Apache Beam pipelines to make weather data accessible and useful.',\n    long_description=open('README.md', 'r', encoding='utf-8').read(),\n    long_description_content_type='text/markdown',\n    platforms=['darwin', 'linux'],\n    license='License :: OSI Approved :: Apache Software License',\n    classifiers=[\n        'Development Status :: 4 - Beta',\n        'Environment :: Console',\n        'Intended Audience :: Science/Research',\n        'Intended Audience :: Developers',\n        'Intended Audience :: Information Technology',\n        'License :: OSI Approved :: Apache Software License',\n        'Operating System :: MacOS :: MacOS X',\n        # 'Operating System :: Microsoft :: Windows',  # TODO(#64): Fully support Windows.\n        'Operating System :: POSIX',\n        'Programming Language :: Python :: 3.8',\n        'Programming Language :: Python :: 3.9',\n        'Topic :: Scientific/Engineering :: Atmospheric Science',\n\n    ],\n    python_requires='>=3.8, <3.10',\n    install_requires=['apache-beam[gcp]==2.40.0', 'gcsfs==2022.11.0'],\n    use_scm_version=True,\n    setup_requires=['setuptools_scm'],\n    scripts=['weather_dl/weather-dl', 'weather_mv/weather-mv', 
'weather_sp/weather-sp'],\n    tests_require=test_requirements,\n    extras_require={\n        'docs': ['tox', 'sphinx>=2.1', 'myst-parser', 'Jinja2<3.1'],\n        'test': all_test_requirements,\n        'dev': ['google-weather-tools[docs,test]'],\n        'regrid': ['metview']\n    },\n    project_urls={\n        'Issue Tracking': 'http://github.com/google/weather-tools/issues',\n    },\n)\n"
  },
  {
    "path": "tox.ini",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n[tox]\nenvlist = py37,py38,lint,type\n\n[testenv]\ndeps = .[test]\ncommands = pytest\n\n[testenv:type]\ndeps = .[test]\ncommands = pytype\n\n[testenv:lint]\ndeps = .[test]\ncommands =\n    flake8 weather_dl\n    flake8 weather_mv\n    flake8 weather_sp\n\n[flake8]\nmax-line-length = 120\n\n[gh-actions]\npython =\n    3.8: py38, lint, type\n    3.9: py39\n"
  },
  {
    "path": "weather_dl/MANIFEST.in",
    "content": "global-exclude *_test.py\ninclude README.md\nexclude download_status*.py"
  },
  {
    "path": "weather_dl/README.md",
    "content": "# ⛈ `weather-dl` – Weather Downloader\n\nWeather Downloader ingests weather data to cloud buckets, such\nas [Google Cloud Storage](https://cloud.google.com/storage) (_beta_).\n\n## Features\n\n* **Flexible Pipelines**: `weather-dl` offers a high degree of control over what is downloaded via configuration files.\n  Separate scripts need not be written to get new data or add parameters. For more, see the\n  [configuration docs](../Configuration.md).\n* **Efficient Parallelization**: The tool gives you full control over how downloads are sharded and parallelized (with\n  good defaults). This lets you focus on the data and not the plumbing.\n* **Hassle-Free Dev-Ops**. `weather-dl` and Dataflow make it easy to spin up VMs on your behalf with one command. No\n  need to keep your local machine online all night to acquire data.\n* **Robust Downloads**. If an error occurs when fetching a shard, Dataflow will automatically retry the download for\n  you. Previously downloaded shards will be skipped by default, so you can re-run the tool without having to worry about\n  duplication of work.\n\n> Note: Currently, only ECMWF's MARS and CDS clients are supported. If you'd like to use `weather-dl` to work with other\n> data sources, please [file an issue](https://github.com/googlestaging/weather-tools/issues) (or consider\n> [making a contribution](../CONTRIBUTING.md)).\n\n## Usage\n\n```\nusage: weather-dl [-h] [-f] [-d] [-l] [-m MANIFEST_LOCATION] [-n NUM_REQUESTS_PER_KEY] [-p PARTITION_CHUNKS]\n                  [-s {in-order,fair}]\n                  config [config ...]\n\n\nWeather Downloader ingests weather data to cloud storage.\n\npositional arguments:\n  config                path/to/configs.cfg, containing client and data information. 
Can take multiple configs.Accepts\n                        *.cfg and *.json files.\n```\n\n_Common options_:\n\n* `-f, --force-download`: Force redownload of partitions that were previously downloaded.\n* `-d, --dry-run`: Run pipeline steps without _actually_ downloading or writing to cloud storage.\n* `-l, --local-run`: Run locally and download to local hard drive. The data and manifest directory is set by default\n  to '<$CWD>/local_run'. The runner will be set to `DirectRunner`. The only other relevant option is the config\n  and `--direct_num_workers`\n* `-m, --manifest-location MANIFEST_LOCATION`: Location of the manifest. By default, it will use Cloud Logging\n  (stdout for direct runner). You can set the name of the manifest as the hostname of a URL with the 'cli' protocol.\n  For example, `cli://manifest` will prefix all the manifest logs as '[manifest]'. In addition, users can specify \n  either a Firestore collection URI (`fs://<my-collection>?projectId=<my-project-id>`), or \n  BigQuery table (`bq://<project-id>.<dataset-name>.<table-name>`), or `noop://<name>` \n  for an in-memory location.\n* `-n, --num-requests-per-key`: Number of concurrent requests to make per API key. Default: make an educated guess per\n  client & config. Please see the client documentation for more details.\n* `-p, --partition-chunks`: Group shards into chunks of this size when computing the partitions. Specifically, this \n  controls how we chunk elements in a cartesian product, which affects parallelization of that step. Default: chunks of \n  1000 elements for 'in-order' scheduling. Chunks of 1 element for 'fair' scheduling. \n* `-s, --schedule {in-order,fair}`: When using multiple configs, decide how partitions are scheduled: 'in-order' implies \n  that partitions will be processed in sequential order of each config; 'fair' means that partitions from each config \n  will be interspersed evenly. 
Note: When using 'fair' scheduling, we recommend you set the '--partition-chunks' to a \n  much smaller number. Default: 'in-order'.\n* `--log-level`: An integer to configure log level. Default: 2(INFO).\n* `--use-local-code`: Supply local code to the Runner. Default: False.\n\n> Note: \n>  * In case of BigQuery manifest tool will create the BQ table itself, if not already present. \n>    Or it will use the existing table but can report errors in case of schema mismatch.\n>  * To run complex queries on the Firestore manifest, users may find it helpful to replicate Firestore to BigQuery \n>    using the automated process described in \n>    [this article](https://medium.com/@ammppp/automated-firestore-replication-to-bigquery-15915d518e38). \n>    By following the step-by-step instructions, users can easily set up the automated replication and then use BigQuery \n>    to perform advanced analysis on the data.\n\nInvoke with `-h` or `--help` to see the full range of options.\n\nFor further information on how to write config files, please consult [this documentation](../Configuration.md).\n\n_Usage Examples_:\n\n```bash\nweather-dl configs/era5_example_config_local_run.cfg --local-run\n```\n\nPreview download with a dry run:\n\n```bash\nweather-dl configs/mars_example_config.cfg --dry-run\n```\n\nUsing DataflowRunner\n\n```bash\nweather-dl configs/mars_example_config.cfg \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --temp_location gs://$BUCKET/tmp  \\\n           --job_name $JOB_NAME\n```\n\nUsing DataflowRunner and using local code for pipeline\n\n```bash\nweather-dl configs/mars_example_config.cfg \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --temp_location gs://$BUCKET/tmp  \\\n           --job_name $JOB_NAME \\\n           --use-local-code\n```\n\nUsing the DataflowRunner and specifying 3 requests per license\n\n```bash\nweather-dl configs/mars_example_config.cfg \\\n           -n 
3 \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --temp_location gs://$BUCKET/tmp  \\\n           --job_name $JOB_NAME\n```\n\nFor a full list of how to configure the Dataflow pipeline, please review\n[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).\n\n## Monitoring\n\nYou can view how your ECMWF API jobs are progressing by visiting the client-specific job queue:\n\n* [MARS](https://apps.ecmwf.int/mars-activity/)\n* [Copernicus](https://cds.climate.copernicus.eu/requests?tab=all)\n\nIf you use Google Cloud Storage, we recommend using [`gsutil` (link)](https://cloud.google.com/storage/docs/gsutil) to\ninspect the progress of your downloads. For example:\n\n```shell\n# Check that the file-sizes of your downloads look alright\ngcloud storage du --readable-sizes gs://your-cloud-bucket/mars-data/*T00z.nc \n# See how many downloads have finished\ngcloud storage du --readable-sizes gs://your-cloud-bucket/mars-data/*T00z.nc | wc -l\n```\n"
  },
  {
    "path": "weather_dl/__init__.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "weather_dl/download_pipeline/__init__.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .pipeline import run, pipeline\n\n\ndef cli(extra=[]):\n    import sys\n    pipeline(run(sys.argv + extra))\n"
  },
  {
    "path": "weather_dl/download_pipeline/clients.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"ECMWF Downloader Clients.\"\"\"\n\nimport abc\nimport collections\nimport contextlib\nimport datetime\nimport io\nimport json\nimport logging\nimport os\nimport time\nimport typing as t\nimport warnings\nfrom urllib.parse import urljoin\n\nfrom cdsapi import api as cds_api\nimport urllib3\nfrom ecmwfapi import api\n\nfrom .config import Config, optimize_selection_partition\nfrom .manifest import Manifest, Stage\nfrom .util import download_with_aria2, retry_with_exponential_backoff\n\nwarnings.simplefilter(\n    \"ignore\", category=urllib3.connectionpool.InsecureRequestWarning)\n\n\nclass Client(abc.ABC):\n    \"\"\"Weather data provider client interface.\n\n    Defines methods and properties required to efficiently interact with weather\n    data providers.\n\n    Attributes:\n        config: A config that contains pipeline parameters, such as API keys.\n        level: Default log level for the client.\n    \"\"\"\n\n    def __init__(self, config: Config, level: int = logging.INFO) -> None:\n        \"\"\"Clients are initialized with the general CLI configuration.\"\"\"\n        self.config = config\n        self.logger = logging.getLogger(f'{__name__}.{type(self).__name__}')\n        self.logger.setLevel(level)\n\n    @abc.abstractmethod\n    def retrieve(self, dataset: str, selection: t.Dict, output: str, manifest: Manifest) -> None:\n        \"\"\"Download 
from data source.\"\"\"\n        pass\n\n    @classmethod\n    @abc.abstractmethod\n    def num_requests_per_key(cls, dataset: str) -> int:\n        \"\"\"Specifies the number of workers to be used per api key for the dataset.\"\"\"\n        pass\n\n    @property\n    @abc.abstractmethod\n    def license_url(self):\n        \"\"\"Specifies the License URL.\"\"\"\n        pass\n\n\nclass SplitCDSRequest():\n    \"\"\"Extended CDS class that separates fetch and download stage.\"\"\"\n    def __init__(self, *args, **kwargs):\n        self.__cds_client = cds_api.Client(*args, **kwargs)\n\n    @retry_with_exponential_backoff\n    def _download(self, url, path: str, size: int) -> None:\n        self.__cds_client.info(\"Downloading %s to %s (%s)\", url, path, cds_api.bytes_to_string(size))\n        start = time.time()\n\n        download_with_aria2(url, path)\n\n        elapsed = time.time() - start\n        if elapsed:\n            self.__cds_client.info(\"Download rate %s/s\", cds_api.bytes_to_string(size / elapsed))\n\n    def fetch(self, request: t.Dict, dataset: str) -> t.Dict:\n        result = self.__cds_client.retrieve(dataset, request)\n        return {'href': result.location, 'size': result.content_length}\n\n    def download(self, result: cds_api.Result, target: t.Optional[str] = None) -> None:\n        if target:\n            if os.path.exists(target):\n                # Empty the target file, if it already exists, otherwise the\n                # transfer below might be fooled into thinking we're resuming\n                # an interrupted download.\n                open(target, \"w\").close()\n\n            self._download(result[\"href\"], target, result[\"size\"])\n\n\nclass CdsClient(Client):\n    \"\"\"A client to access weather data from the Cloud Data Store (CDS).\n\n    Datasets on CDS can be found at:\n      https://cds.climate.copernicus.eu/cdsapp#!/search?type=dataset\n\n    The parameters section of the input `config` requires two values: `api_url` 
and\n    `api_key`. Or, these values can be set as the environment variables: `CDSAPI_URL`\n    and `CDSAPI_KEY`. These can be acquired from the following URL, which requires\n    creating a free account: https://cds.climate.copernicus.eu/api-how-to\n\n    The CDS global queues for data access has dynamic rate limits. These can be viewed\n    live here: https://cds.climate.copernicus.eu/live/limits.\n\n    Attributes:\n        config: A config that contains pipeline parameters, such as API keys.\n        level: Default log level for the client.\n    \"\"\"\n\n    \"\"\"Name patterns of datasets that are hosted internally on CDS servers.\"\"\"\n    cds_hosted_datasets = {'reanalysis-era'}\n\n    def retrieve(self, dataset: str, selection: t.Dict, output: str, manifest: Manifest) -> None:\n        c = CDSClientExtended(\n            url=self.config.kwargs.get('api_url', os.environ.get('CDSAPI_URL')),\n            key=self.config.kwargs.get('api_key', os.environ.get('CDSAPI_KEY')),\n            debug_callback=self.logger.debug,\n            info_callback=self.logger.info,\n            warning_callback=self.logger.warning,\n            error_callback=self.logger.error,\n        )\n        selection_ = optimize_selection_partition(selection)\n        with StdoutLogger(self.logger, level=logging.DEBUG):\n            manifest.set_stage(Stage.FETCH)\n            precise_fetch_start_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec='seconds')\n            )\n            manifest.prev_stage_precise_start_time = precise_fetch_start_time\n            result = c.fetch(selection_, dataset)\n            manifest.set_stage(Stage.DOWNLOAD)\n            precise_download_start_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec='seconds')\n            )\n            
manifest.prev_stage_precise_start_time = precise_download_start_time\n            c.download(result, target=output)\n\n    @property\n    def license_url(self):\n        return 'https://cds.climate.copernicus.eu/api/v2/terms/static/licence-to-use-copernicus-products.pdf'\n\n    @classmethod\n    def num_requests_per_key(cls, dataset: str) -> int:\n        \"\"\"Number of requests per key from the CDS API.\n\n        CDS has dynamic, data-specific limits, defined here:\n          https://cds.climate.copernicus.eu/live/limits\n\n        Typically, the reanalysis dataset allows for 3-5 simultaneous requets.\n        For all standard CDS data (backed on disk drives), it's common that 2\n        requests are allowed, though this is dynamically set, too.\n\n        If the Beam pipeline encounters a user request limit error, please cancel\n        all outstanding requests (per each user account) at the following link:\n        https://cds.climate.copernicus.eu/cdsapp#!/yourrequests\n        \"\"\"\n        # TODO(#15): Parse live CDS limits API to set data-specific limits.\n        for internal_set in cls.cds_hosted_datasets:\n            if dataset.startswith(internal_set):\n                return 5\n        return 2\n\n\nclass StdoutLogger(io.StringIO):\n    \"\"\"Special logger to redirect stdout to logs.\"\"\"\n\n    def __init__(self, logger_: logging.Logger, level: int = logging.INFO):\n        super().__init__()\n        self.logger = logger_\n        self.level = level\n        self._redirector = contextlib.redirect_stdout(self)\n\n    def log(self, msg) -> None:\n        self.logger.log(self.level, msg)\n\n    def write(self, msg):\n        if msg and not msg.isspace():\n            self.logger.log(self.level, msg)\n\n    def __enter__(self):\n        self._redirector.__enter__()\n        return self\n\n    def __exit__(self, exc_type, exc_value, traceback):\n        # let contextlib do any exception handling here\n        self._redirector.__exit__(exc_type, 
exc_value, traceback)\n\n\nclass SplitMARSRequest(api.APIRequest):\n    \"\"\"Extended MARS APIRequest class that separates fetch and download stage.\"\"\"\n    @retry_with_exponential_backoff\n    def _download(self, url, path: str, size: int) -> None:\n        self.log(\n            \"Transferring %s into %s\" % (self._bytename(size), path)\n        )\n        self.log(\"From %s\" % (url,))\n\n        download_with_aria2(url, path)\n\n    def fetch(self, request: t.Dict, dataset: str) -> t.Dict:\n        status = None\n\n        self.connection.submit(\"%s/%s/requests\" % (self.url, self.service), request)\n        self.log(\"Request submitted\")\n        self.log(\"Request id: \" + self.connection.last.get(\"name\"))\n        if self.connection.status != status:\n            status = self.connection.status\n            self.log(\"Request is %s\" % (status,))\n\n        while not self.connection.ready():\n            if self.connection.status != status:\n                status = self.connection.status\n                self.log(\"Request is %s\" % (status,))\n            self.connection.wait()\n\n        if self.connection.status != status:\n            status = self.connection.status\n            self.log(\"Request is %s\" % (status,))\n\n        result = self.connection.result()\n        return result\n\n    def download(self, result: t.Dict, target: t.Optional[str] = None) -> None:\n        if target:\n            if os.path.exists(target):\n                # Empty the target file, if it already exists, otherwise the\n                # transfer below might be fooled into thinking we're resuming\n                # an interrupted download.\n                open(target, \"w\").close()\n\n            self._download(urljoin(self.url, result[\"href\"]), target, result[\"size\"])\n        self.connection.cleanup()\n\n\nclass SplitRequestMixin:\n    c = None\n\n    def fetch(self, req: t.Dict, dataset: t.Optional[str] = None) -> t.Dict:\n        return 
self.c.fetch(req, dataset)\n\n    def download(self, res: t.Dict, target: str) -> None:\n        self.c.download(res, target)\n\n\nclass CDSClientExtended(SplitRequestMixin):\n    \"\"\"Extended CDS Client class that separates fetch and download stage.\"\"\"\n    def __init__(self, *args, **kwargs):\n        self.c = SplitCDSRequest(*args, **kwargs)\n\n\nclass MARSECMWFServiceExtended(api.ECMWFService, SplitRequestMixin):\n    \"\"\"Extended MARS ECMFService class that separates fetch and download stage.\"\"\"\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        self.c = SplitMARSRequest(\n            self.url,\n            \"services/%s\" % (self.service,),\n            email=self.email,\n            key=self.key,\n            log=self.log,\n            verbose=self.verbose,\n            quiet=self.quiet,\n        )\n\n\nclass PublicECMWFServerExtended(api.ECMWFDataServer, SplitRequestMixin):\n    def __init__(self, *args, dataset='', **kwargs):\n        super().__init__(*args, **kwargs)\n        self.c = SplitMARSRequest(\n            self.url,\n            \"datasets/%s\" % (dataset,),\n            email=self.email,\n            key=self.key,\n            log=self.log,\n            verbose=self.verbose,\n        )\n\n\nclass MarsClient(Client):\n    \"\"\"A client to access data from the Meteorological Archival and Retrieval System (MARS).\n\n    See https://www.ecmwf.int/en/forecasts/datasets for a summary of datasets available\n    on MARS. Most notable, MARS provides access to ECMWF's Operational Archive\n    https://www.ecmwf.int/en/forecasts/dataset/operational-archive.\n\n    The client config must contain three parameters to autheticate access to the MARS archive:\n    `api_key`, `api_url`, and `api_email`. 
These can also be configued by setting the\n    commensurate environment variables: `MARSAPI_KEY`, `MARSAPI_URL`, and `MARSAPI_EMAIL`.\n    These credentials can be looked up by after registering for an ECMWF account\n    (https://apps.ecmwf.int/registration/) and visitng: https://api.ecmwf.int/v1/key/.\n\n    MARS server activity can be observed at https://apps.ecmwf.int/mars-activity/.\n\n    Attributes:\n        config: A config that contains pipeline parameters, such as API keys.\n        level: Default log level for the client.\n    \"\"\"\n    def retrieve(self, dataset: str, selection: t.Dict, output: str, manifest: Manifest) -> None:\n        c = MARSECMWFServiceExtended(\n            \"mars\",\n            key=self.config.kwargs.get('api_key', os.environ.get(\"MARSAPI_KEY\")),\n            url=self.config.kwargs.get('api_url', os.environ.get(\"MARSAPI_URL\")),\n            email=self.config.kwargs.get('api_email', os.environ.get(\"MARSAPI_EMAIL\")),\n            log=self.logger.debug,\n            verbose=True,\n        )\n        selection_ = optimize_selection_partition(selection)\n        with StdoutLogger(self.logger, level=logging.DEBUG):\n            manifest.set_stage(Stage.FETCH)\n            precise_fetch_start_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec='seconds')\n            )\n            manifest.prev_stage_precise_start_time = precise_fetch_start_time\n            result = c.fetch(req=selection_)\n            manifest.set_stage(Stage.DOWNLOAD)\n            precise_download_start_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec='seconds')\n            )\n            manifest.prev_stage_precise_start_time = precise_download_start_time\n            c.download(result, target=output)\n\n    @property\n    def license_url(self):\n        return 
'https://apps.ecmwf.int/datasets/licences/general/'\n\n    @classmethod\n    def num_requests_per_key(cls, dataset: str) -> int:\n        \"\"\"Number of requests per key (or user) for the Mars API.\n\n        Mars allows 2 active requests per user and 20 queued requests per user, as of Sept 27, 2021.\n        To ensure we never hit a rate limit error during download, we only make use of the active\n        requests.\n        See: https://confluence.ecmwf.int/display/UDOC/Total+number+of+requests+a+user+can+submit+-+Web+API+FAQ\n\n        Queued requests can _only_ be canceled manually from a web dashboard. If the\n        `ERROR 101 (USER_QUEUED_LIMIT_EXCEEDED)` error occurs in the Beam pipeline, then go to\n        http://apps.ecmwf.int/webmars/joblist/ and cancel queued jobs.\n        \"\"\"\n        return 2\n\n\nclass ECMWFPublicClient(Client):\n    \"\"\"A client for ECMWF's public datasets, like TIGGE.\"\"\"\n    def retrieve(self, dataset: str, selection: t.Dict, output: str, manifest: Manifest) -> None:\n        c = PublicECMWFServerExtended(\n            url=self.config.kwargs.get('api_url', os.environ.get(\"MARSAPI_URL\")),\n            key=self.config.kwargs.get('api_key', os.environ.get(\"MARSAPI_KEY\")),\n            email=self.config.kwargs.get('api_email', os.environ.get(\"MARSAPI_EMAIL\")),\n            log=self.logger.debug,\n            verbose=True,\n            dataset=dataset,\n        )\n        selection_ = optimize_selection_partition(selection)\n        with StdoutLogger(self.logger, level=logging.DEBUG):\n            manifest.set_stage(Stage.FETCH)\n            precise_fetch_start_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec='seconds')\n            )\n            manifest.prev_stage_precise_start_time = precise_fetch_start_time\n            result = c.fetch(req=selection_)\n            manifest.set_stage(Stage.DOWNLOAD)\n            
precise_download_start_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec='seconds')\n            )\n            manifest.prev_stage_precise_start_time = precise_download_start_time\n            c.download(result, target=output)\n\n    @classmethod\n    def num_requests_per_key(cls, dataset: str) -> int:\n        # Experimentally validated request limit.\n        return 5\n\n    @property\n    def license_url(self):\n        if not self.config.dataset:\n            raise ValueError('must specify a dataset for this client!')\n        return f'https://apps.ecmwf.int/datasets/data/{self.config.dataset.lower()}/licence/'\n\n\nclass FakeClient(Client):\n    \"\"\"A client that writes the selection arguments to the output file.\"\"\"\n\n    def retrieve(self, dataset: str, selection: t.Dict, output: str, manifest: Manifest) -> None:\n        manifest.set_stage(Stage.RETRIEVE)\n        precise_retrieve_start_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec='seconds')\n        )\n        manifest.prev_stage_precise_start_time = precise_retrieve_start_time\n        self.logger.debug(f'Downloading {dataset} to {output}')\n        with open(output, 'w') as f:\n            json.dump({dataset: selection}, f)\n\n    @property\n    def license_url(self):\n        return 'lorem ipsum'\n\n    @classmethod\n    def num_requests_per_key(cls, dataset: str) -> int:\n        return 1\n\n\nCLIENTS = collections.OrderedDict(\n    cds=CdsClient,\n    mars=MarsClient,\n    ecpublic=ECMWFPublicClient,\n    fake=FakeClient,\n)\n"
  },
  {
    "path": "weather_dl/download_pipeline/clients_test.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport unittest\n\nfrom .clients import FakeClient, CdsClient, MarsClient\n\n\nclass MaxWorkersTest(unittest.TestCase):\n    def test_cdsclient_internal(self):\n        self.assertEqual(CdsClient.num_requests_per_key(\"reanalysis-era5-some-data\"), 5)\n\n    def test_cdsclient_mars_hosted(self):\n        self.assertEqual(CdsClient.num_requests_per_key(\"reanalysis-carra-height-levels\"), 2)\n\n    def test_marsclient(self):\n        self.assertEqual(MarsClient.num_requests_per_key(\"reanalysis-era5-some-data\"), 2)\n\n    def test_fakeclient(self):\n        self.assertEqual(FakeClient.num_requests_per_key(\"reanalysis-era5-some-data\"), 1)\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "weather_dl/download_pipeline/config.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport calendar\nimport copy\nimport dataclasses\nimport itertools\nimport typing as t\n\nValues = t.Union[t.List['Values'], t.Dict[str, 'Values'], bool, int, float, str]  # pytype: disable=not-supported-yet\n\n\n@dataclasses.dataclass\nclass Config:\n    \"\"\"Contains pipeline parameters.\n\n    Attributes:\n        config_name:\n            Name of the config file.\n        client:\n            Name of the Weather-API-client. Supported clients are mentioned in the 'CLIENTS' variable.\n        dataset (optional):\n            Name of the target dataset. Allowed options are dictated by the client.\n        partition_keys (optional):\n            Choose the keys from the selection section to partition the data request.\n            This will compute a cartesian cross product of the selected keys\n            and assign each as their own download.\n        target_path:\n            Download artifact filename template. Can make use of Python's standard string formatting.\n            It can contain format symbols to be replaced by partition keys;\n            if this is used, the total number of format symbols must match the number of partition keys.\n        subsection_name:\n            Name of the particular subsection. 
'default' if there is no subsection.\n        force_download:\n            Force redownload of partitions that were previously downloaded.\n        user_id:\n            Username from the environment variables.\n        kwargs (optional):\n            For representing subsections or any other parameters.\n        selection:\n            Contains parameters used to select desired data.\n    \"\"\"\n\n    config_name: str = \"\"\n    client: str = \"\"\n    dataset: t.Optional[str] = \"\"\n    target_path: str = \"\"\n    partition_keys: t.Optional[t.List[str]] = dataclasses.field(default_factory=list)\n    subsection_name: str = \"default\"\n    force_download: bool = False\n    user_id: str = \"unknown\"\n    kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict)\n    selection: t.Dict[str, Values] = dataclasses.field(default_factory=dict)\n\n    @classmethod\n    def from_dict(cls, config: t.Dict) -> 'Config':\n        config_instance = cls()\n        for section_key, section_value in config.items():\n            if section_key == \"parameters\":\n                for key, value in section_value.items():\n                    if hasattr(config_instance, key):\n                        setattr(config_instance, key, value)\n                    else:\n                        config_instance.kwargs[key] = value\n            if section_key == \"selection\":\n                config_instance.selection = section_value\n        return config_instance\n\n\ndef optimize_selection_partition(selection: t.Dict) -> t.Dict:\n    \"\"\"Compute right-hand-side values for the selection section of a single partition.\n\n    Used to support custom syntax and optimizations, such as 'all'.\n    \"\"\"\n    selection_ = copy.deepcopy(selection)\n\n    if 'date_range' in selection_.keys():\n        selection_['date'] = selection_['date_range'][0]\n        del selection_['date_range']\n\n    if 'day' in selection_.keys() and selection_['day'] == 'all':\n        
years, months = selection_['year'], selection_['month']\n\n        multiples_error = \"When using day='all' in selection, '/' is not allowed in {type}.\"\n\n        if isinstance(years, str):\n            years = [years]\n\n        if isinstance(months, str):\n            months = [months]\n\n        date_ranges = []\n\n        # Generating dates for every year-month.\n        for year, month in itertools.product(years, months):\n\n            if isinstance(year, str):\n                assert '/' not in year, multiples_error.format(type='year')\n\n            if isinstance(month, str):\n                assert '/' not in month, multiples_error.format(type='month')\n\n            year, month = int(year), int(month)\n\n            _, n_days_in_month = calendar.monthrange(year, month)\n\n            date_range = [f'{year:04d}-{month:02d}-{day:02d}' for day in range(1, n_days_in_month + 1)]\n\n            date_ranges.extend(date_range)\n\n        selection_['date'] = date_ranges\n        del selection_['day']\n        del selection_['month']\n        del selection_['year']\n\n    return selection_\n"
  },
  {
    "path": "weather_dl/download_pipeline/config_test.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport calendar\nimport itertools\nimport os\nimport unittest\n\nimport weather_dl\nfrom .config import optimize_selection_partition\nfrom .pipeline import run\n\n\nclass ConfigTest(unittest.TestCase):\n\n    def setUp(self):\n        self.data_dir = f'{next(iter(weather_dl.__path__))}/../configs'\n\n    def test_process_config_files(self):\n        for filename in os.listdir(self.data_dir):\n            if filename.startswith('.'):\n                continue\n            # only files, no directories\n            if os.path.isdir(os.path.join(self.data_dir, filename)):\n                continue\n            with self.subTest(filename=filename):\n                config = os.path.join(self.data_dir, filename)\n                try:\n                    run([\"weather-dl\", config, \"--dry-run\"])\n                except:  # noqa: E722\n                    self.fail(f'Config {filename!r} is incorrect.')\n\n    def test_process_multi_config_files(self):\n        for filename in os.listdir(self.data_dir):\n            if filename.startswith('.'):\n                continue\n            # only directories, no files\n            if not os.path.isdir(os.path.join(self.data_dir, filename)):\n                continue\n            with self.subTest(dirname=filename):\n                configs_dir = os.path.join(self.data_dir, filename)\n                configs = 
[os.path.join(configs_dir, c) for c in os.listdir(configs_dir)]\n                try:\n                    run([\"weather-dl\"] + configs + [\"--dry-run\"])\n                except:  # noqa: E722\n                    self.fail(f'Configs {filename!r} is incorrect.')\n\n\nif __name__ == '__main__':\n    unittest.main()\n\n\nclass SelectionSyntaxTest(unittest.TestCase):\n\n    def test_all_days__invalid_year(self):\n        selection_with_multiple_years = {'year': '2020/2021', 'month': '2', 'day': 'all'}\n        with self.assertRaisesRegex(AssertionError, \"When using day='all' in selection, '/' is not allowed in year.\"):\n            optimize_selection_partition(selection_with_multiple_years)\n\n        selection_with_multiple_years = {'year': ['2020/2021'], 'month': '2', 'day': 'all'}\n        with self.assertRaisesRegex(AssertionError, \"When using day='all' in selection, '/' is not allowed in year.\"):\n            optimize_selection_partition(selection_with_multiple_years)\n\n    def test_all_days__invalid_month(self):\n        selection_with_multiple_years = {'year': '2020', 'month': '1/2/3', 'day': 'all'}\n        with self.assertRaisesRegex(AssertionError, \"When using day='all' in selection, '/' is not allowed in month.\"):\n            optimize_selection_partition(selection_with_multiple_years)\n\n        selection_with_multiple_years = {'year': '2020', 'month': ['1/2/3'], 'day': 'all'}\n        with self.assertRaisesRegex(AssertionError, \"When using day='all' in selection, '/' is not allowed in month.\"):\n            optimize_selection_partition(selection_with_multiple_years)\n\n    def test_date_range(self):\n        selection_with_date_range = {'date_range': ['2017-01-01/to/2017-01-10']}\n        actual = optimize_selection_partition(selection_with_date_range)\n        self.assertEqual(actual['date'], selection_with_date_range['date_range'][0])\n\n    def test_with_year_month_as_string(self):\n        selection_with_multiple_months = {'year': '2017', 
'month': '12', 'day':'all'}\n        actual = optimize_selection_partition(selection_with_multiple_months)\n\n        expected = []\n        _, n_days_in_month = calendar.monthrange(2017, 12)\n        expected = [f'2017-12-{day:02d}' for day in range(1, n_days_in_month + 1)]\n\n        self.assertEqual(actual['date'], expected)\n        self.assertNotIn('day', actual)\n        self.assertNotIn('month', actual)\n        self.assertNotIn('year', actual)\n\n    def test_with_multiple_months(self):\n        selection_with_multiple_months = {'year': ['2017'], 'month': ['2', '4', '6', '8'], 'day':'all'}\n        actual = optimize_selection_partition(selection_with_multiple_months)\n\n        expected = []\n        for y, m in itertools.product(selection_with_multiple_months['year'], selection_with_multiple_months['month']):\n            y, m = int(y), int(m)\n            _, n_days_in_month = calendar.monthrange(y, m)\n            date_range = [f'{y:04d}-{m:02d}-{day:02d}' for day in range(1, n_days_in_month + 1)]\n            expected.extend(date_range)\n\n        self.assertEqual(actual['date'], expected)\n        self.assertNotIn('day', actual)\n        self.assertNotIn('month', actual)\n        self.assertNotIn('year', actual)\n\n    def test_with_multiple_years(self):\n        selection_with_multiple_years = {'year': ['2017', '2018'], 'month': ['2', '4', '6', '8'], 'day':'all'}\n        actual = optimize_selection_partition(selection_with_multiple_years)\n\n        expected = []\n        for y, m in itertools.product(selection_with_multiple_years['year'], selection_with_multiple_years['month']):\n            y, m = int(y), int(m)\n            _, n_days_in_month = calendar.monthrange(y, m)\n            date_range = [f'{y:04d}-{m:02d}-{day:02d}' for day in range(1, n_days_in_month + 1)]\n            expected.extend(date_range)\n\n        self.assertEqual(actual['date'], expected)\n        self.assertNotIn('day', actual)\n        self.assertNotIn('month', actual)\n      
  self.assertNotIn('year', actual)\n"
  },
  {
    "path": "weather_dl/download_pipeline/fetcher.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport dataclasses\nimport datetime\nimport logging\nimport tempfile\nimport typing as t\n\nimport apache_beam as beam\n\nfrom .clients import CLIENTS, Client\nfrom .config import Config\nfrom .manifest import Manifest, NoOpManifest, Location, Stage\nfrom .parsers import prepare_target_name\nfrom .partition import skip_partition\nfrom .stores import Store, FSStore\nfrom .util import copy, retry_with_exponential_backoff\n\nlogger = logging.getLogger(__name__)\n\n\n@dataclasses.dataclass\nclass Fetcher(beam.DoFn):\n    \"\"\"Executes client download requests.\n\n    Given a sequence of configs (keyed by subsection parameters and number of allowed\n    requests), this will execute retrievals via each client. 
The keyed strucutre, the\n    result of a `beam.GroupBy` operation, will ensure that all licenses and requests\n    are utilized without any conflict.\n\n    Attributes:\n        client_name: The name of the download client to construct per each request.\n        manifest: A manifest to keep track of the status of requests\n        store: To manage where downloads are persisted.\n    \"\"\"\n\n    client_name: str\n    manifest: Manifest = NoOpManifest(Location('noop://in-memory'))\n    store: t.Optional[Store] = None\n    log_level: t.Optional[int] = logging.INFO\n\n    def __post_init__(self):\n        if self.store is None:\n            self.store = FSStore()\n\n    @retry_with_exponential_backoff\n    def retrieve(self, client: Client, dataset: str, selection: t.Dict, dest: str) -> None:\n        \"\"\"Retrieve from download client, with retries.\"\"\"\n        client.retrieve(dataset, selection, dest, self.manifest)\n\n    def fetch_data(self, config: Config, *, worker_name: str = 'default') -> None:\n        \"\"\"Download data from a client to a temp file, then upload to Cloud Storage.\"\"\"\n        if not config:\n            return\n\n        if skip_partition(config, self.store, self.manifest):\n            return\n\n        client = CLIENTS[self.client_name](config, self.log_level)\n        target = prepare_target_name(config)\n\n        with tempfile.NamedTemporaryFile() as temp:\n            logger.info(f'[{worker_name}] Fetching data for {target!r}.')\n            with self.manifest.transact(config.config_name, config.dataset, config.selection, target, config.user_id):\n                self.retrieve(client, config.dataset, config.selection, temp.name)\n\n                self.manifest.set_stage(Stage.UPLOAD)\n                precise_upload_start_time = (\n                    datetime.datetime.utcnow()\n                    .replace(tzinfo=datetime.timezone.utc)\n                    .isoformat(timespec='seconds')\n                )\n                
self.manifest.prev_stage_precise_start_time = precise_upload_start_time\n                logger.info(f'[{worker_name}] Uploading to store for {target!r}.')\n\n                # In dry-run mode we actually aren't required to upload a file.\n                if not self.client_name == \"fake\":\n                    copy(temp.name, target)\n\n                logger.info(f'[{worker_name}] Upload to store complete for {target!r}.')\n\n    def process(self, element) -> None:\n        # element: Tuple[Tuple[str, int], Iterator[Config]]\n        \"\"\"Execute download requests one-by-one.\"\"\"\n        (subsection, request_idx), partitions = element\n        worker_name = f'{subsection}.{request_idx}'\n\n        logger.info(f'[{worker_name}] Starting requests...')\n        logger.debug(f'[{worker_name}] Partitions: {partitions!r}.')\n\n        for partition in partitions:\n            beam.metrics.Metrics.counter('Fetcher', subsection).inc()\n            self.fetch_data(partition, worker_name=worker_name)\n"
  },
  {
    "path": "weather_dl/download_pipeline/fetcher_test.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport io\nimport json\nimport os\nimport tempfile\nimport unittest\nfrom unittest.mock import patch\n\nfrom .config import Config\nfrom .fetcher import Fetcher\nfrom .manifest import MockManifest, Location\nfrom .stores import InMemoryStore\n\n\nclass FetchDataTest(unittest.TestCase):\n\n    def setUp(self) -> None:\n        self.dummy_manifest = MockManifest(Location('dummy-manifest'))\n\n    @patch('weather_dl.download_pipeline.clients.CDSClientExtended.download')\n    @patch('weather_dl.download_pipeline.clients.CDSClientExtended.fetch')\n    def test_fetch_data(self, mock_fetch, mock_download):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            config = Config.from_dict({\n                'parameters': {\n                    'dataset': 'reanalysis-era5-pressure-levels',\n                    'partition_keys': ['year', 'month'],\n                    'target_path': os.path.join(tmpdir, 'download-{:02d}-{:02d}.nc'),\n                    'api_url': 'https//api-url.com/v1/',\n                    'api_key': '12345',\n                },\n                'selection': {\n                    'features': ['pressure'],\n                    'month': ['12'],\n                    'year': ['01']\n                }\n            })\n\n            fetcher = Fetcher('cds', self.dummy_manifest, InMemoryStore())\n            fetcher.fetch_data(config)\n\n            
self.assertTrue(os.path.exists(os.path.join(tmpdir, 'download-01-12.nc')))\n\n            mock_fetch.assert_called_with(\n                config.selection,\n                'reanalysis-era5-pressure-levels',\n                )\n\n    @patch('weather_dl.download_pipeline.clients.CDSClientExtended.download')\n    @patch('weather_dl.download_pipeline.clients.CDSClientExtended.fetch')\n    def test_fetch_data__manifest__returns_success(self, mock_fetch, mock_download):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            config = Config.from_dict({\n                'parameters': {\n                    'dataset': 'reanalysis-era5-pressure-levels',\n                    'partition_keys': ['year', 'month'],\n                    'target_path': os.path.join(tmpdir, 'download-{:02d}-{:02d}.nc'),\n                    'api_url': 'https//api-url.com/v1/',\n                    'api_key': '12345',\n                },\n                'selection': {\n                    'features': ['pressure'],\n                    'month': ['12'],\n                    'year': ['01']\n                }\n            })\n\n            fetcher = Fetcher('cds', self.dummy_manifest, InMemoryStore())\n            fetcher.fetch_data(config)\n\n            self.assertDictContainsSubset(dict(\n                selection=json.dumps(config.selection),\n                location=os.path.join(tmpdir, 'download-01-12.nc'),\n                stage='upload',\n                status='success',\n                error=None,\n                username='unknown',\n            ), list(self.dummy_manifest.records.values())[0])\n\n    @patch('weather_dl.download_pipeline.clients.CDSClientExtended.fetch')\n    def test_fetch_data__manifest__records_retrieve_failure(self, mock_fetch):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            config = Config.from_dict({\n                'parameters': {\n                    'dataset': 'reanalysis-era5-pressure-levels',\n                    
'partition_keys': ['year', 'month'],\n                    'target_path': os.path.join(tmpdir, 'download-{:02d}-{:02d}.nc'),\n                    'api_url': 'https//api-url.com/v1/',\n                    'api_key': '12345',\n                },\n                'selection': {\n                    'features': ['pressure'],\n                    'month': ['12'],\n                    'year': ['01']\n                }\n            })\n\n            error = IOError(\"We don't have enough permissions to download this.\")\n            mock_fetch.side_effect = error\n\n            with self.assertRaises(IOError) as e:\n                fetcher = Fetcher('cds', self.dummy_manifest, InMemoryStore())\n                fetcher.fetch_data(config)\n\n            actual = list(self.dummy_manifest.records.values())[0]\n\n            self.assertDictContainsSubset(dict(\n                selection=json.dumps(config.selection),\n                location=os.path.join(tmpdir, 'download-01-12.nc'),\n                stage='fetch',\n                status='failure',\n                username='unknown',\n            ), actual)\n\n            self.assertIn(error.args[0], actual['error'])\n            self.assertIn(error.args[0], e.exception.args[0])\n\n    @patch('weather_dl.download_pipeline.clients.CDSClientExtended.fetch')\n    def test_fetch_data__manifest__records_gcs_failure(self, mock_fetch):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            config = Config.from_dict({\n                'parameters': {\n                    'dataset': 'reanalysis-era5-pressure-levels',\n                    'partition_keys': ['year', 'month'],\n                    'target_path': os.path.join(tmpdir, 'download-{:02d}-{:02d}.nc'),\n                    'api_url': 'https//api-url.com/v1/',\n                    'api_key': '12345',\n                },\n                'selection': {\n                    'features': ['pressure'],\n                    'month': ['12'],\n                    'year': 
['01']\n                }\n            })\n\n            error = IOError(\"Can't open gcs file.\")\n            mock_fetch.side_effect = error\n\n            with self.assertRaises(IOError) as e:\n                fetcher = Fetcher('cds', self.dummy_manifest, InMemoryStore())\n                fetcher.fetch_data(config)\n\n            actual = list(self.dummy_manifest.records.values())[0]\n\n            self.assertDictContainsSubset(dict(\n                selection=json.dumps(config.selection),\n                location=os.path.join(tmpdir, 'download-01-12.nc'),\n                stage='fetch',\n                status='failure',\n                username='unknown',\n            ), actual)\n\n            self.assertIn(error.args[0], actual['error'])\n            self.assertIn(error.args[0], e.exception.args[0])\n\n    @patch('weather_dl.download_pipeline.stores.InMemoryStore.open', return_value=io.StringIO())\n    @patch('weather_dl.download_pipeline.clients.CDSClientExtended.fetch')\n    def test_fetch_data__skips_existing_download(self, mock_fetch, mock_gcs_file):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            config = Config.from_dict({\n                'parameters': {\n                    'dataset': 'reanalysis-era5-pressure-levels',\n                    'partition_keys': ['year', 'month'],\n                    'target_path': os.path.join(tmpdir, 'download-{:02d}-{:02d}.nc'),\n                    'api_url': 'https//api-url.com/v1/',\n                    'api_key': '12345',\n                },\n                'selection': {\n                    'features': ['pressure'],\n                    'month': ['12'],\n                    'year': ['01']\n                }\n            })\n\n            # target file already exists in store...\n            store = InMemoryStore()\n            store.store[os.path.join(tmpdir, 'download-01-12.nc')] = ''\n\n            fetcher = Fetcher('cds', self.dummy_manifest, store)\n            
fetcher.fetch_data(config)\n\n            self.assertFalse(mock_gcs_file.called)\n            self.assertFalse(mock_fetch.called)\n"
  },
  {
    "path": "weather_dl/download_pipeline/manifest.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Client interface for connecting to a manifest.\"\"\"\n\nimport abc\nimport collections\nimport dataclasses\nimport datetime\nimport enum\nimport json\nimport logging\nimport os\nimport pandas as pd\nimport threading\nimport time\nimport traceback\nimport typing as t\nfrom urllib.parse import urlparse, parse_qsl\n\nfrom .util import (\n    to_json_serializable_type,\n    fetch_geo_polygon,\n    get_file_size,\n    get_wait_interval,\n    generate_md5_hash,\n    retry_with_exponential_backoff,\n    GLOBAL_COVERAGE_AREA\n)\n\nimport firebase_admin\nfrom firebase_admin import firestore\nfrom google.cloud import bigquery\nfrom google.cloud.firestore_v1 import DocumentReference\nfrom google.cloud.firestore_v1.types import WriteResult\n\n\"\"\"An implementation-dependent Manifest URI.\"\"\"\nLocation = t.NewType('Location', str)\n\nlogger = logging.getLogger(__name__)\n\n\nclass ManifestException(Exception):\n    \"\"\"Errors that occur in Manifest Clients.\"\"\"\n    pass\n\n\nclass Stage(enum.Enum):\n    \"\"\"A request can be either in one of the following stages at a time:\n\n    fetch : This represents request is currently in fetch stage i.e. request placed on the client's server\n        & waiting for some result before starting download (eg. MARS client).\n    download : This represents request is currently in download stage i.e. 
data is being downloaded from client's\n        server to the worker's local file system.\n    upload : This represents request is currently in upload stage i.e. data is getting uploaded from worker's local\n        file system to target location (GCS path).\n    retrieve : In case of clients where there is no proper separation of fetch & download stages (eg. CDS client),\n        request will be in the retrieve stage i.e. fetch + download.\n    \"\"\"\n    RETRIEVE = 'retrieve'\n    FETCH = 'fetch'\n    DOWNLOAD = 'download'\n    UPLOAD = 'upload'\n\n\nclass Status(enum.Enum):\n    \"\"\"Depicts the request's state status:\n\n    scheduled : A request partition is created & scheduled for processing.\n        Note: Its corresponding state can be None only.\n    in-progress : This represents the request state is currently in-progress (i.e. running).\n        The next status would be \"success\" or \"failure\".\n    success : This represents the request state execution completed successfully without any error.\n    failure : This represents the request state execution failed.\n    \"\"\"\n    SCHEDULED = 'scheduled'\n    IN_PROGRESS = 'in-progress'\n    SUCCESS = 'success'\n    FAILURE = 'failure'\n\n\n@dataclasses.dataclass\nclass DownloadStatus():\n    \"\"\"Data recorded in `Manifest`s reflecting the status of a download.\"\"\"\n\n    \"\"\"The name of the config file associated with the request.\"\"\"\n    config_name: str = \"\"\n\n    \"\"\"Represents the dataset field of the configuration.\"\"\"\n    dataset: t.Optional[str] = \"\"\n\n    \"\"\"Copy of selection section of the configuration.\"\"\"\n    selection: t.Dict = dataclasses.field(default_factory=dict)\n\n    \"\"\"Location of the downloaded data.\"\"\"\n    location: str = \"\"\n\n    \"\"\"Represents area covered by the shard.\"\"\"\n    area: str = \"\"\n\n    \"\"\"Current stage of request : 'fetch', 'download', 'retrieve', 'upload' or None.\"\"\"\n    stage: t.Optional[Stage] = None\n\n    
\"\"\"Download status: 'scheduled', 'in-progress', 'success', or 'failure'.\"\"\"\n    status: t.Optional[Status] = None\n\n    \"\"\"Cause of error, if any.\"\"\"\n    error: t.Optional[str] = \"\"\n\n    \"\"\"Identifier for the user running the download.\"\"\"\n    username: str = \"\"\n\n    \"\"\"Shard size in GB.\"\"\"\n    size: t.Optional[float] = 0\n\n    \"\"\"A UTC datetime when download was scheduled.\"\"\"\n    scheduled_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the retrieve stage starts.\"\"\"\n    retrieve_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the retrieve state ends.\"\"\"\n    retrieve_end_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the fetch state starts.\"\"\"\n    fetch_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the fetch state ends.\"\"\"\n    fetch_end_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the download state starts.\"\"\"\n    download_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the download state ends.\"\"\"\n    download_end_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the upload state starts.\"\"\"\n    upload_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the upload state ends.\"\"\"\n    upload_end_time: t.Optional[str] = \"\"\n\n    @classmethod\n    def from_dict(cls, download_status: t.Dict) -> 'DownloadStatus':\n        \"\"\"Instantiate DownloadStatus dataclass from dict.\"\"\"\n        download_status_instance = cls()\n        for key, value in download_status.items():\n            if key == 'status':\n                setattr(download_status_instance, key, Status(value))\n            elif key == 'stage' and value is not None:\n                setattr(download_status_instance, key, Stage(value))\n            else:\n                setattr(download_status_instance, key, value)\n        return download_status_instance\n\n    @classmethod\n    def to_dict(cls, 
instance) -> t.Dict:\n        \"\"\"Return the fields of a dataclass instance as a manifest ingestible\n        dictionary mapping of field names to field values.\"\"\"\n        download_status_dict = {}\n        for field in dataclasses.fields(instance):\n            key = field.name\n            value = getattr(instance, field.name)\n            if isinstance(value, Status) or isinstance(value, Stage):\n                download_status_dict[key] = value.value\n            elif isinstance(value, pd.Timestamp):\n                download_status_dict[key] = value.isoformat()\n            elif key == 'selection' and value is not None:\n                download_status_dict[key] = json.dumps(value)\n            else:\n                download_status_dict[key] = value\n        return download_status_dict\n\n\n@dataclasses.dataclass\nclass Manifest(abc.ABC):\n    \"\"\"Abstract manifest of download statuses.\n\n    Update download statuses to some storage medium.\n\n    This class lets one indicate that a download is `scheduled` or in a transaction process.\n    In the event of a transaction, a download will be updated with an `in-progress`, `success`\n    or `failure` status (with accompanying metadata).\n\n    Example:\n        ```\n        my_manifest = parse_manifest_location(Location('fs://some-firestore-collection'))\n\n        # Schedule data for download\n        my_manifest.schedule({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username')\n\n        # ...\n\n        # Initiate a transaction – it will record that the download is `in-progress`\n        with my_manifest.transact({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username') as tx:\n            # download logic here\n            pass\n\n            # ...\n\n            # on error, will record the download as a `failure` before propagating the error.  
By default, it will\n            # record download as a `success`.\n        ```\n\n    Attributes:\n        location: An implementation-specific manifest URI.\n        status: The current `DownloadStatus` of the Manifest.\n    \"\"\"\n\n    location: Location\n    # To reduce the impact of _read() and _update() calls\n    # on the start time of the stage.\n    prev_stage_precise_start_time: t.Optional[str] = None\n    status: t.Optional[DownloadStatus] = None\n\n    # This is overridden in subclass.\n    def __post_init__(self):\n        \"\"\"Initialize the manifest.\"\"\"\n        pass\n\n    def schedule(self, config_name: str, dataset: str, selection: t.Dict, location: str, user: str) -> None:\n        \"\"\"Indicate that a job has been scheduled for download.\n\n        'scheduled' jobs occur before 'in-progress', 'success' or 'failure'.\n        \"\"\"\n        scheduled_time = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat(timespec='seconds')\n        self.status = DownloadStatus(\n                config_name=config_name,\n                dataset=dataset if dataset else None,\n                selection=selection,\n                location=location,\n                area=fetch_geo_polygon(selection.get('area', GLOBAL_COVERAGE_AREA)),\n                username=user,\n                stage=None,\n                status=Status.SCHEDULED,\n                error=None,\n                size=None,\n                scheduled_time=scheduled_time,\n                retrieve_start_time=None,\n                retrieve_end_time=None,\n                fetch_start_time=None,\n                fetch_end_time=None,\n                download_start_time=None,\n                download_end_time=None,\n                upload_start_time=None,\n                upload_end_time=None,\n            )\n        self._update(self.status)\n\n    def skip(self, config_name: str, dataset: str, selection: t.Dict, location: str, user: str) -> None:\n        
\"\"\"Updates the manifest to mark the shards that were skipped in the current job\n        as 'upload' stage and 'success' status, indicating that they have already been downloaded.\n        \"\"\"\n        old_status = self._read(location)\n        # The manifest needs to be updated for a skipped shard if its entry is not present, or\n        # if the stage is not 'upload', or if the stage is 'upload' but the status is not 'success'.\n        if old_status.location != location or old_status.stage != Stage.UPLOAD or old_status.status != Status.SUCCESS:\n            current_utc_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec='seconds')\n            )\n\n            size = get_file_size(location)\n\n            status = DownloadStatus(\n                    config_name=config_name,\n                    dataset=dataset if dataset else None,\n                    selection=selection,\n                    location=location,\n                    area=fetch_geo_polygon(selection.get('area', GLOBAL_COVERAGE_AREA)),\n                    username=user,\n                    stage=Stage.UPLOAD,\n                    status=Status.SUCCESS,\n                    error=None,\n                    size=size,\n                    scheduled_time=None,\n                    retrieve_start_time=None,\n                    retrieve_end_time=None,\n                    fetch_start_time=None,\n                    fetch_end_time=None,\n                    download_start_time=None,\n                    download_end_time=None,\n                    upload_start_time=current_utc_time,\n                    upload_end_time=current_utc_time,\n                )\n            self._update(status)\n            logger.info(f'Manifest updated for skipped shard: {location!r} -- {DownloadStatus.to_dict(status)!r}.')\n\n    def _set_for_transaction(self, config_name: str, dataset: str, selection: t.Dict, location: 
str, user: str) -> None:\n        \"\"\"Reset Manifest state in preparation for a new transaction.\"\"\"\n        self.status = dataclasses.replace(self._read(location))\n        self.status.config_name = config_name\n        self.status.dataset = dataset if dataset else None\n        self.status.selection = selection\n        self.status.location = location\n        self.status.username = user\n\n    def __enter__(self) -> None:\n        pass\n\n    def __exit__(self, exc_type, exc_inst, exc_tb) -> None:\n        \"\"\"Record end status of a transaction as either 'success' or 'failure'.\"\"\"\n        if exc_type is None:\n            status = Status.SUCCESS\n            error = None\n        else:\n            status = Status.FAILURE\n            # For explanation, see https://docs.python.org/3/library/traceback.html#traceback.format_exception\n            error = '\\n'.join(traceback.format_exception(exc_type, exc_inst, exc_tb))\n\n        new_status = dataclasses.replace(self.status)\n        new_status.error = error\n        new_status.status = status\n        current_utc_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec='seconds')\n        )\n\n        # This is necessary for setting the precise start time of the previous stage\n        # and end time of the final stage, as well as handling the case of Status.FAILURE.\n        if new_status.stage == Stage.FETCH:\n            new_status.fetch_start_time = self.prev_stage_precise_start_time\n            new_status.fetch_end_time = current_utc_time\n        elif new_status.stage == Stage.RETRIEVE:\n            new_status.retrieve_start_time = self.prev_stage_precise_start_time\n            new_status.retrieve_end_time = current_utc_time\n        elif new_status.stage == Stage.DOWNLOAD:\n            new_status.download_start_time = self.prev_stage_precise_start_time\n            new_status.download_end_time = current_utc_time\n    
    else:\n            new_status.upload_start_time = self.prev_stage_precise_start_time\n            new_status.upload_end_time = current_utc_time\n\n        new_status.size = get_file_size(new_status.location)\n\n        self.status = new_status\n\n        self._update(self.status)\n\n    def transact(self, config_name: str, dataset: str, selection: t.Dict, location: str, user: str) -> 'Manifest':\n        \"\"\"Create a download transaction.\"\"\"\n        self._set_for_transaction(config_name, dataset, selection, location, user)\n        return self\n\n    def set_stage(self, stage: Stage) -> None:\n        \"\"\"Sets the current stage in manifest.\"\"\"\n        prev_stage = self.status.stage\n        new_status = dataclasses.replace(self.status)\n        new_status.stage = stage\n        new_status.status = Status.IN_PROGRESS\n        current_utc_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec='seconds')\n        )\n\n        if stage == Stage.FETCH:\n            new_status.fetch_start_time = current_utc_time\n            new_status.fetch_end_time = None\n            new_status.download_start_time = None\n            new_status.download_end_time = None\n        elif stage == Stage.RETRIEVE:\n            new_status.retrieve_start_time = current_utc_time\n        elif stage == Stage.DOWNLOAD:\n            new_status.fetch_start_time = self.prev_stage_precise_start_time\n            new_status.fetch_end_time = current_utc_time\n            new_status.download_start_time = current_utc_time\n        else:\n            if prev_stage == Stage.DOWNLOAD:\n                new_status.download_start_time = self.prev_stage_precise_start_time\n                new_status.download_end_time = current_utc_time\n            else:\n                new_status.retrieve_start_time = self.prev_stage_precise_start_time\n                new_status.retrieve_end_time = current_utc_time\n            
new_status.upload_start_time = current_utc_time\n\n        self.status = new_status\n        self._update(self.status)\n\n    @abc.abstractmethod\n    def _read(self, location: str) -> DownloadStatus:\n        pass\n\n    @abc.abstractmethod\n    def _update(self, download_status: DownloadStatus) -> None:\n        pass\n\n\nclass ConsoleManifest(Manifest):\n\n    def __post_init__(self):\n        self.name = urlparse(self.location).hostname\n\n    def _read(self, location: str) -> DownloadStatus:\n        return DownloadStatus()\n\n    def _update(self, download_status: DownloadStatus) -> None:\n        logger.info(f'[{self.name}] {DownloadStatus.to_dict(download_status)!r}')\n\n\nclass LocalManifest(Manifest):\n    \"\"\"Writes a JSON representation of the manifest to local file.\"\"\"\n\n    _lock = threading.Lock()\n\n    def __init__(self, location: Location) -> None:\n        super().__init__(Location(os.path.join(location, 'manifest.json')))\n        if location and not os.path.exists(location):\n            os.makedirs(location)\n\n        # If the file is empty, it should start out as an empty JSON object.\n        if not os.path.exists(self.location) or os.path.getsize(self.location) == 0:\n            with open(self.location, 'w') as file:\n                json.dump({}, file)\n\n    def _read(self, location: str) -> DownloadStatus:\n        \"\"\"Reads the JSON data from a manifest.\"\"\"\n        assert os.path.exists(self.location), f'{self.location} must exist!'\n        with LocalManifest._lock:\n            with open(self.location, 'r') as file:\n                manifest = json.load(file)\n                return DownloadStatus.from_dict(manifest.get(location, {}))\n\n    def _update(self, download_status: DownloadStatus) -> None:\n        \"\"\"Writes the JSON data to a manifest.\"\"\"\n        assert os.path.exists(self.location), f'{self.location} must exist!'\n        with LocalManifest._lock:\n            with open(self.location, 'r') as file:\n  
              manifest = json.load(file)\n\n            status = DownloadStatus.to_dict(download_status)\n            manifest[status['location']] = status\n\n            with open(self.location, 'w') as file:\n                json.dump(manifest, file)\n                logger.debug('Manifest written to.')\n                logger.debug(download_status)\n\n\nclass BQManifest(Manifest):\n    \"\"\"Writes a JSON representation of the manifest to BQ file.\n\n    This is an append-only implementation, the latest value in the manifest\n    represents the current state of a download.\n    \"\"\"\n    def __init__(self, location: Location) -> None:\n        super().__init__(Location(location[5:]))\n        TABLE_SCHEMA = [\n            bigquery.SchemaField('config_name', 'STRING', mode='REQUIRED',\n                                 description=\"The name of the config file associated with the request.\"),\n            bigquery.SchemaField('dataset', 'STRING', mode='NULLABLE',\n                                 description=\"Represents the dataset field of the configuration.\"),\n            bigquery.SchemaField('selection', 'JSON', mode='REQUIRED',\n                                 description=\"Copy of selection section of the configuration.\"),\n            bigquery.SchemaField('location', 'STRING', mode='REQUIRED',\n                                 description=\"Location of the downloaded data.\"),\n            bigquery.SchemaField('area', 'STRING', mode='NULLABLE',\n                                 description=\"Represents area covered by the shard. \"\n                                 \"ST_GeogFromGeoJson(area): To convert a GeoJSON geometry object into a \"\n                                 \"GEOGRAPHY value. 
\"\n                                 \"ST_COVERS(geography_expression, ST_GEOGPOINT(longitude, latitude)): To check \"\n                                 \"if a point lies in the given area or not.\"),\n            bigquery.SchemaField('stage', 'STRING', mode='NULLABLE',\n                                 description=\"Current stage of request : 'fetch', 'download', 'retrieve', 'upload' \"\n                                 \"or None.\"),\n            bigquery.SchemaField('status', 'STRING', mode='REQUIRED',\n                                 description=\"Download status: 'scheduled', 'in-progress', 'success', or 'failure'.\"),\n            bigquery.SchemaField('error', 'STRING', mode='NULLABLE',\n                                 description=\"Cause of error, if any.\"),\n            bigquery.SchemaField('username', 'STRING', mode='REQUIRED',\n                                 description=\"Identifier for the user running the download.\"),\n            bigquery.SchemaField('size', 'FLOAT', mode='NULLABLE',\n                                 description=\"Shard size in GB.\"),\n            bigquery.SchemaField('scheduled_time', 'TIMESTAMP', mode='NULLABLE',\n                                 description=\"A UTC datetime when download was scheduled.\"),\n            bigquery.SchemaField('retrieve_start_time', 'TIMESTAMP', mode='NULLABLE',\n                                 description=\"A UTC datetime when the retrieve stage starts.\"),\n            bigquery.SchemaField('retrieve_end_time', 'TIMESTAMP', mode='NULLABLE',\n                                 description=\"A UTC datetime when the retrieve state ends.\"),\n            bigquery.SchemaField('fetch_start_time', 'TIMESTAMP', mode='NULLABLE',\n                                 description=\"A UTC datetime when the fetch state starts.\"),\n            bigquery.SchemaField('fetch_end_time', 'TIMESTAMP', mode='NULLABLE',\n                                 description=\"A UTC datetime when the fetch state ends.\"),\n         
   bigquery.SchemaField('download_start_time', 'TIMESTAMP', mode='NULLABLE',\n                                 description=\"A UTC datetime when the download state starts.\"),\n            bigquery.SchemaField('download_end_time', 'TIMESTAMP', mode='NULLABLE',\n                                 description=\"A UTC datetime when the download state ends.\"),\n            bigquery.SchemaField('upload_start_time', 'TIMESTAMP', mode='NULLABLE',\n                                 description=\"A UTC datetime when the upload state starts.\"),\n            bigquery.SchemaField('upload_end_time', 'TIMESTAMP', mode='NULLABLE',\n                                 description=\"A UTC datetime when the upload state ends.\"),\n        ]\n        table = bigquery.Table(self.location, schema=TABLE_SCHEMA)\n        with bigquery.Client() as client:\n            client.create_table(table, exists_ok=True)\n\n    def _read(self, location: str) -> DownloadStatus:\n        \"\"\"Reads the JSON data from a manifest.\"\"\"\n        with bigquery.Client() as client:\n            select_statement = f\"SELECT * FROM {self.location} WHERE location = @location\"\n\n            # Build the QueryJobConfig object with the parameters.\n            job_config = bigquery.QueryJobConfig()\n            job_config.query_parameters = [bigquery.ScalarQueryParameter('location', 'STRING', location)]\n\n            # Execute the select statement with the parameters.\n            query_job = client.query(select_statement, job_config=job_config)\n\n            # Wait for the query to execute.\n            result = query_job.result()\n            row = {}\n            if result.total_rows > 0:\n                records = result.to_dataframe().to_dict('records')\n                row = {n: to_json_serializable_type(v) for n, v in records[0].items()}\n            return DownloadStatus.from_dict(row)\n\n    # Added retry here to handle the concurrency issue in BigQuery.\n    # Eg: 400 Resources exceeded during query 
execution: Too many DML statements outstanding\n    # against table <table-name>, limit is 20\n    @retry_with_exponential_backoff\n    def _update(self, download_status: DownloadStatus) -> None:\n        \"\"\"Writes the JSON data to a manifest.\"\"\"\n        with bigquery.Client() as client:\n            status = DownloadStatus.to_dict(download_status)\n            table = client.get_table(self.location)\n            columns = [field.name for field in table.schema]\n            parameter_type_mapping = {field.name: field.field_type for field in table.schema}\n\n            update_dml = [f\"{col} = @{col}\" for col in columns]\n            insert_dml = [f\"@{col}\" for col in columns]\n            params = {col: status[col] for col in columns}\n\n            # Build the merge statement as a string with parameter placeholders.\n            merge_statement = f\"\"\"\n                MERGE {self.location} T\n                USING (\n                SELECT\n                    @location as location\n                ) S\n                ON T.location = S.location\n                WHEN MATCHED THEN\n                UPDATE SET\n                    {', '.join(update_dml)}\n                WHEN NOT MATCHED THEN\n                INSERT\n                    ({\", \".join(columns)})\n                VALUES\n                    ({', '.join(insert_dml)})\n            \"\"\"\n\n            logger.debug(merge_statement)\n\n            # Build the QueryJobConfig object with the parameters.\n            job_config = bigquery.QueryJobConfig()\n            job_config.query_parameters = [bigquery.ScalarQueryParameter(col, parameter_type_mapping[col], value)\n                                           for col, value in params.items()]\n\n            # Execute the merge statement with the parameters.\n            query_job = client.query(merge_statement, job_config=job_config)\n\n            # Wait for the query to execute.\n            query_job.result()\n\n            
logger.debug('Manifest written to.')\n            logger.debug(download_status)\n\n\nclass FirestoreManifest(Manifest):\n    \"\"\"A Firestore Manifest.\n    This Manifest implementation stores DownloadStatuses in a Firebase document store.\n    The document hierarchy for the manifest is as follows:\n      [manifest  <or manifest name, configurable from CLI>]\n      ├── doc_id (md5 hash of the path) { 'selection': {...}, 'location': ..., 'username': ... }\n      └── etc...\n    Where `[<name>]` indicates a collection and `<name> {...}` indicates a document.\n    \"\"\"\n\n    def _get_db(self) -> firestore.firestore.Client:\n        \"\"\"Acquire a firestore client, initializing the firebase app if necessary.\n        Will attempt to get the db client five times. If it's still unsuccessful, a\n        `ManifestException` will be raised.\n        \"\"\"\n        db = None\n        attempts = 0\n\n        while db is None:\n            try:\n                db = firestore.client()\n            except ValueError as e:\n                # The above call will fail with a value error when the firebase app is not initialized.\n                # Initialize the app here, and try again.\n                firebase_admin.initialize_app(options=self.get_firestore_config())\n                logger.info('Initialized Firebase App.')\n\n                if attempts > 4:\n                    raise ManifestException('Exceeded number of retries to get firestore client.') from e\n\n            time.sleep(get_wait_interval(attempts))\n\n            attempts += 1\n\n        return db\n\n    def _read(self, location: str) -> DownloadStatus:\n        \"\"\"Reads the JSON data from a manifest.\"\"\"\n\n        doc_id = generate_md5_hash(location)\n\n        # Update document with download status\n        download_doc_ref = (\n            self.root_document_for_store(doc_id)\n        )\n\n        result = download_doc_ref.get()\n        row = {}\n        if result.exists:\n            records = 
result.to_dict()\n            row = {n: to_json_serializable_type(v) for n, v in records.items()}\n        return DownloadStatus.from_dict(row)\n\n    def _update(self, download_status: DownloadStatus) -> None:\n        \"\"\"Update or create a download status record.\"\"\"\n        logger.debug('Updating Firestore Manifest.')\n\n        status = DownloadStatus.to_dict(download_status)\n        doc_id = generate_md5_hash(status['location'])\n\n        # Update document with download status\n        download_doc_ref = (\n            self.root_document_for_store(doc_id)\n        )\n\n        result: WriteResult = download_doc_ref.set(status)\n\n        logger.debug(f'Firestore manifest updated. '\n                     f'update_time={result.update_time}, '\n                     f'filename={download_status.location}.')\n\n    def root_document_for_store(self, store_scheme: str) -> DocumentReference:\n        \"\"\"Get the root manifest document given the user's config and current document's storage location.\"\"\"\n        # Get user-defined collection for manifest.\n        root_collection = self.get_firestore_config().get('collection', 'manifest')\n        return self._get_db().collection(root_collection).document(store_scheme)\n\n    def get_firestore_config(self) -> t.Dict:\n        \"\"\"Parse firestore Location format: 'fs://<collection-name>?projectId=<project-id>'\n        Users must specify a 'projectId' query parameter in the firestore location. If this argument\n        isn't passed in, users must set the `GOOGLE_CLOUD_PROJECT` environment variable.\n        Users may specify options to `firebase_admin.initialize_app()` via query arguments in the URL.\n        For more information about what options are available, consult this documentation:\n        https://firebase.google.com/docs/reference/admin/python/firebase_admin#initialize_app\n            Note: each query key-value pair may only appear once. 
If there are duplicates, the last pair\n            will be used.\n        Optionally, users may configure these options via the `FIREBASE_CONFIG` environment variable,\n        which is typically a path/to/a/file.json.\n        Examples:\n            >>> location = Location(\"fs://my-collection?projectId=my-project-id&storageBucket=foo\")\n            >>> FirestoreManifest(location).get_firestore_config()\n            {'collection': 'my-collection', 'projectId': 'my-project-id', 'storageBucket': 'foo'}\n        Raises:\n            ValueError: If query parameters are malformed.\n            AssertionError: If the 'projectId' query parameter is not set.\n        \"\"\"\n        parsed = urlparse(self.location)\n        query_params = {}\n        if parsed.query:\n            query_params = dict(parse_qsl(parsed.query, strict_parsing=True))\n        return {'collection': parsed.netloc, **query_params}\n\n\nclass MockManifest(Manifest):\n    \"\"\"In-memory mock manifest.\"\"\"\n\n    def __init__(self, location: Location) -> None:\n        super().__init__(location)\n        self.records = {}\n\n    def _read(self, location: str) -> DownloadStatus:\n        manifest = self.records\n        return DownloadStatus.from_dict(manifest.get(location, {}))\n\n    def _update(self, download_status: DownloadStatus) -> None:\n        status = DownloadStatus.to_dict(download_status)\n        self.records.update({status.get('location'): status})\n        logger.debug('Manifest updated.')\n        logger.debug(download_status)\n\n\nclass NoOpManifest(Manifest):\n    \"\"\"A manifest that performs no operations.\"\"\"\n\n    def _read(self, location: str) -> DownloadStatus:\n        return DownloadStatus()\n\n    def _update(self, download_status: DownloadStatus) -> None:\n        pass\n\n\n\"\"\"Exposed manifest implementations.\n\nUsers can choose their preferred manifest implementation by via the protocol of the Manifest Location.\nThe protocol corresponds to the keys of this 
ordered dictionary.\n\nIf no protocol is specified, we assume the user wants to write to the local file system.\nIf no key is found, the `NoOpManifest` option will be chosen. See `parsers:parse_manifest`.\n\"\"\"\nMANIFESTS = collections.OrderedDict({\n    'cli': ConsoleManifest,\n    'fs': FirestoreManifest,\n    'bq': BQManifest,\n    '': LocalManifest,\n})\n\nif __name__ == '__main__':\n    # Execute doc tests\n    import doctest\n\n    doctest.testmod()\n"
  },
  {
    "path": "weather_dl/download_pipeline/manifest_test.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport json\nimport random\nimport string\nimport tempfile\nimport typing as t\nimport unittest\n\nfrom .manifest import LocalManifest, Location, DownloadStatus, Status, Stage\n\n\ndef rand_str(max_len=32):\n    return ''.join([random.choice(string.printable) for _ in range(random.randint(0, max_len))])\n\n\ndef make_download_status(location: t.Optional[str] = None) -> DownloadStatus:\n    return DownloadStatus(\n        selection={},\n        location=rand_str() if location is None else location,\n        status=random.choice([Status.SCHEDULED, Status.IN_PROGRESS,\n                              Status.SUCCESS, Status.FAILURE]),\n        error=random.choice([None] + [rand_str(100) for _ in range(4)]),\n        username=random.choice(['user', 'alice', 'bob', 'root']),\n        stage=random.choice([Stage.FETCH, Stage.DOWNLOAD,\n                             Stage.UPLOAD, Stage.RETRIEVE, None]),\n        size=0.039322572,\n        scheduled_time=\"2023-02-07T17:15:26+00:00\",\n        retrieve_start_time=None,\n        retrieve_end_time=None,\n        fetch_start_time=\"2023-02-07T17:15:29+00:00\",\n        fetch_end_time=\"2023-02-07T17:21:37+00:00\",\n        download_start_time=\"2023-02-07T17:21:39+00:00\",\n        download_end_time=\"2023-02-07T17:21:56+00:00\",\n        upload_start_time=\"2023-02-07T17:21:59+00:00\",\n        
upload_end_time=\"2023-02-07T17:22:03+00:00\"\n    )\n\n\nclass LocalManifestTest(unittest.TestCase):\n    NUM_RUNS = 128\n\n    def test_empty_manifest_is_valid_json(self):\n        with tempfile.TemporaryDirectory() as dir_:\n            manifest = LocalManifest(Location(dir_))\n            with open(manifest.location) as file:\n                self.is_valid_json(file)\n\n    def does_not_overwrite_existing_manifest(self):\n        with tempfile.TemporaryDirectory() as dir_:\n            with open(f'{dir}/manifest.json', 'w') as file:\n                json.dump({'foo': 'bar'}, file)\n\n            manifest = LocalManifest(Location(dir_))\n\n            with open(manifest.location, 'r') as file:\n                manifest = json.load(file)\n                self.assertIn('foo', manifest)\n                self.assertEqual(manifest['foo'], 'bar')\n\n    def test_writes_valid_json(self):\n        with tempfile.TemporaryDirectory() as dir_:\n            manifest = LocalManifest(Location(dir_))\n            for _ in range(self.NUM_RUNS):\n                status = make_download_status()\n                manifest._update(status)\n                with open(manifest.location) as file:\n                    self.is_valid_json(file)\n\n    def test_overwrites_existing_statuses(self):\n        locations = ['a', 'b', 'c']\n        with tempfile.TemporaryDirectory() as dir_:\n            manifest = LocalManifest(Location(dir_))\n            for i in range(self.NUM_RUNS):\n                status = make_download_status(location=locations[i % 3])\n                manifest._update(status)\n                with open(manifest.location) as file:\n                    self.is_valid_json(file)\n\n            with open(manifest.location) as file:\n                manifest = json.load(file)\n            self.assertEqual(set(locations), set(manifest.keys()))\n\n    def is_valid_json(self, file: t.IO) -> None:\n        \"\"\"Fails test on error decoding JSON.\"\"\"\n        try:\n            
json.dumps(json.load(file))\n        except json.JSONDecodeError:\n            self.fail('JSON is invalid.')\n"
  },
  {
    "path": "weather_dl/download_pipeline/parsers.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Parsers for ECMWF download configuration.\"\"\"\n\nimport ast\nimport configparser\nimport copy as cp\nimport datetime\nimport json\nimport string\nimport textwrap\nimport typing as t\nimport numpy as np\nfrom collections import OrderedDict\nfrom dateutil.relativedelta import relativedelta\nfrom urllib.parse import urlparse\n\nfrom .clients import CLIENTS\nfrom .config import Config\nfrom .manifest import MANIFESTS, Manifest, Location, NoOpManifest\n\n\ndef date(candidate: str) -> datetime.date:\n    \"\"\"Converts ECMWF-format date strings into a `datetime.date`.\n\n    Accepted absolute date formats:\n    - YYYY-MM-DD\n    - YYYYMMDD\n    - YYYY-DDD, where DDD refers to the day of the year\n\n    For example:\n    - 2021-10-31\n    - 19700101\n    - 1950-007\n\n    See https://confluence.ecmwf.int/pages/viewpage.action?pageId=118817289 for date format spec.\n    Note: Name of month is not supported.\n    \"\"\"\n    converted = None\n\n    # Parse relative day value.\n    if candidate.startswith('-'):\n        return datetime.date.today() + datetime.timedelta(days=int(candidate))\n\n    accepted_formats = [\"%Y-%m-%d\", \"%Y%m%d\", \"%Y-%j\"]\n\n    for fmt in accepted_formats:\n        try:\n            converted = datetime.datetime.strptime(candidate, fmt).date()\n            break\n        except ValueError:\n            pass\n\n    if converted is None:\n   
     raise ValueError(\n            f\"Not a valid date: '{candidate}'. Please use valid relative or absolute format.\"\n        )\n\n    return converted\n\n\ndef time(candidate: str) -> datetime.time:\n    \"\"\"Converts ECMWF-format time strings into a `datetime.time`.\n\n    Accepted time formats:\n    - HH:MM\n    - HHMM\n    - HH\n\n    For example:\n    - 18:00\n    - 1820\n    - 18\n\n    Note: If MM is omitted it defaults to 00.\n    \"\"\"\n    converted = None\n\n    accepted_formats = [\"%H\", \"%H:%M\", \"%H%M\"]\n\n    for fmt in accepted_formats:\n        try:\n            converted = datetime.datetime.strptime(candidate, fmt).time()\n            break\n        except ValueError:\n            pass\n\n    if converted is None:\n        raise ValueError(\n            f\"Not a valid time: '{candidate}'. Please use valid format.\"\n        )\n\n    return converted\n\n\ndef day_month_year(candidate: t.Any) -> int:\n    \"\"\"Converts day, month and year strings into 'int'.\"\"\"\n    try:\n        if isinstance(candidate, str) or isinstance(candidate, int):\n            return int(candidate)\n        raise ValueError('must be a str or int.')\n    except ValueError as e:\n        raise ValueError(\n            f\"Not a valid day, month, or year value: {candidate}. Please use valid value.\"\n        ) from e\n\n\ndef date_range_converter(candidate: str) -> str:\n    \"\"\"Replace / with _ to avoid directory creation.\"\"\"\n    return candidate.replace('/', '_')\n\n\ndef parse_literal(candidate: t.Any) -> t.Any:\n    try:\n        # Support parsing ints with leading zeros, e.g. 
'01'\n        if isinstance(candidate, str) and candidate.isdigit():\n            return int(candidate)\n        return ast.literal_eval(candidate)\n    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):\n        return candidate\n\n\ndef validate(key: str, value: int) -> None:\n    \"\"\"Validates value based on the key.\"\"\"\n    if key == \"day\":\n        assert 1 <= value <= 31, \"Day value must be between 1 to 31.\"\n    if key == \"month\":\n        assert 1 <= value <= 12, \"Month value must be between 1 to 12.\"\n\n\ndef typecast(key: str, value: t.Any) -> t.Any:\n    \"\"\"Type the value to its appropriate datatype.\"\"\"\n    SWITCHER = {\n        'date': date,\n        'time': time,\n        'day': day_month_year,\n        'month': day_month_year,\n        'year': day_month_year,\n        'date_range': date_range_converter,\n    }\n    converted = SWITCHER.get(key, parse_literal)(value)\n    validate(key, converted)\n    return converted\n\n\ndef _read_config_file(file: t.IO) -> t.Dict:\n    \"\"\"Reads `*.json` or `*.cfg` files.\"\"\"\n    try:\n        return json.load(file)\n    except json.JSONDecodeError:\n        pass\n\n    file.seek(0)\n\n    try:\n        config = configparser.ConfigParser()\n        config.read_file(file)\n        config = {s: dict(config.items(s)) for s in config.sections()}\n        return config\n    except configparser.ParsingError:\n        return {}\n\n\ndef parse_config(file: t.IO) -> t.Dict:\n    \"\"\"Parses a `*.json` or `*.cfg` file into a configuration dictionary.\"\"\"\n    config = _read_config_file(file)\n    config_by_section = {s: _parse_lists(v, s) for s, v in config.items()}\n    config_with_nesting = parse_subsections(config_by_section)\n    return config_with_nesting\n\n\ndef parse_manifest(location: Location, pipeline_opts: t.Dict) -> Manifest:\n    \"\"\"Constructs a manifest object by parsing the location.\"\"\"\n    project_id__exists = 'project' in pipeline_opts\n    
project_id__not_set = 'projectId' not in location\n\n    # If the firestore location doesn't specify which project (and, the pipeline\n    # knows which project)...\n    if location.startswith('fs://') and project_id__not_set and project_id__exists:\n        # ...Set the project query param in the Firestore URI.\n        start_char = '&' if '?' in location else '?'\n        project = pipeline_opts.get('project')\n        location += f'{start_char}projectId={project}'\n\n    parsed = urlparse(location)\n    return MANIFESTS.get(parsed.scheme, NoOpManifest)(location)\n\n\ndef _splitlines(block: str) -> t.List[str]:\n    \"\"\"Converts a multi-line block into a list of strings.\"\"\"\n    return [line.strip() for line in block.strip().splitlines()]\n\n\ndef mars_range_value(token: str, key: str) -> t.Union[datetime.date, int, float]:\n    \"\"\"Converts a range token into either a date, int, or float.\"\"\"\n    # TODO(b/175432034): Recognize time values\n    try:\n        if key == 'year-month':\n            return datetime.datetime.strptime(token, \"%Y-%m\").date()\n        else:\n            return date(token)\n    except ValueError:\n        pass\n\n    if token.isdecimal():\n        return int(token)\n\n    try:\n        return float(token)\n    except ValueError:\n        raise ValueError(\"Token string must be an 'int', 'float', or 'datetime.date()'.\")\n\n\ndef mars_increment_value(token: str) -> t.Union[int, float]:\n    \"\"\"Converts an increment token into either an int or a float.\"\"\"\n    try:\n        return int(token)\n    except ValueError:\n        pass\n\n    try:\n        return float(token)\n    except ValueError:\n        raise ValueError(\"Token string must be an 'int' or a 'float'.\")\n\n\ndef parse_mars_syntax(block: str, key: str) -> t.List[str]:\n    \"\"\"Parses MARS list or range into a list of arguments; ranges are inclusive.\n\n    Types for the range and value are inferred.\n\n    Examples:\n        >>> 
parse_mars_syntax(\"10/to/12\")\n        ['10', '11', '12']\n        >>> parse_mars_syntax(\"12/to/10/by/-1\")\n        ['12', '11', '10']\n        >>> parse_mars_syntax(\"0.0/to/0.5/by/0.1\")\n        ['0.0', '0.1', '0.2', '0.30000000000000004', '0.4', '0.5']\n        >>> parse_mars_syntax(\"2020-01-07/to/2020-01-14/by/2\")\n        ['2020-01-07', '2020-01-09', '2020-01-11', '2020-01-13']\n        >>> parse_mars_syntax(\"2020-01-14/to/2020-01-07/by/-2\")\n        ['2020-01-14', '2020-01-12', '2020-01-10', '2020-01-08']\n\n    Returns:\n        A list of strings representing a range from start to finish, based on the\n        type of the values in the range.\n        If all range values are integers, it will return a list of strings of integers.\n        If range values are floats, it will return a list of strings of floats.\n        If the range values are dates, it will return a list of strings of dates in\n        YYYY-MM-DD format. (Note: here, the increment value should be an integer).\n    \"\"\"\n\n    # Split into tokens, omitting empty strings.\n    tokens = [b.strip() for b in block.split('/') if b != '']\n\n    # Return list if no range operators are present.\n    if 'to' not in tokens and 'by' not in tokens:\n        return tokens\n\n    # Parse range values, honoring 'to' and 'by' operators.\n    try:\n        to_idx = tokens.index('to')\n        assert to_idx != 0, \"There must be a start token.\"\n        start_token, end_token = tokens[to_idx - 1], tokens[to_idx + 1]\n        start, end = mars_range_value(start_token, key), mars_range_value(end_token, key)\n\n        # Parse increment token, or choose default increment.\n        increment_token = '1'\n        increment = 1\n        if 'by' in tokens:\n            increment_token = tokens[tokens.index('by') + 1]\n            increment = mars_increment_value(increment_token)\n    except (AssertionError, IndexError, ValueError):\n        raise SyntaxError(f\"Improper range syntax in '{block}'.\")\n\n   
 # Return a range of values with appropriate data type.\n    if (key == 'year-month' and isinstance(start, datetime.date) and isinstance(end, datetime.date)\n            and isinstance(increment, int)):\n        result = []\n        offset = 1 if start <= end else -1\n        if increment >= 0:\n            increment *= offset  # ensure increment has correct direction\n        current = start\n        while current <= end if offset > 0 else current >= end:\n            result.append(current.strftime(\"%Y-%m\"))\n            current += relativedelta(months=increment)\n        return result\n    elif isinstance(start, datetime.date) and isinstance(end, datetime.date) and key != 'year-month':\n        increment *= -1 if start > end and increment > 0 else 1\n        if not isinstance(increment, int):\n            raise ValueError(\n                f\"Increments on a date range must be integer number of days, '{increment_token}' is invalid.\"\n            )\n        return [d.strftime(\"%Y-%m-%d\") for d in date_range(start, end, increment)]\n    elif (isinstance(start, float) or isinstance(end, float)) and not isinstance(increment, datetime.date):\n        # Increment can be either an int or a float.\n        _round_places = 4\n        return [str(round(x, _round_places)).zfill(len(start_token))\n                for x in np.arange(start, end + increment, increment)]\n    elif isinstance(start, int) and isinstance(end, int) and isinstance(increment, int):\n        # Honor leading zeros.\n        offset = 1 if start <= end else -1\n        return [str(x).zfill(len(start_token)) for x in range(start, end + offset, increment)]\n    else:\n        raise ValueError(\n            f\"Range tokens (start='{start_token}', end='{end_token}', increment='{increment_token}')\"\n            f\" are inconsistent types.\"\n        )\n\n\ndef date_range(start: datetime.date, end: datetime.date, increment: int = 1) -> t.Iterable[datetime.date]:\n    \"\"\"Gets a range of dates, 
inclusive.\"\"\"\n    offset = 1 if start <= end else -1\n    return (start + datetime.timedelta(days=x) for x in range(0, (end - start).days + offset, increment))\n\n\ndef _parse_lists(config: dict, section: str = '') -> t.Dict:\n    \"\"\"Parses multiline blocks in *.cfg and *.json files as lists.\"\"\"\n    for key, val in config.items():\n        # Checks str type for backward compatibility since it also support \"padding\": 0 in json config\n        if not isinstance(val, str):\n            continue\n\n        if '/' in val and 'parameters' not in section and key != 'date_range':\n            config[key] = parse_mars_syntax(val, key)\n        elif '\\n' in val:\n            config[key] = _splitlines(val)\n\n    return config\n\n\ndef _number_of_replacements(s: t.Text):\n    format_names = [v[1] for v in string.Formatter().parse(s) if v[1] is not None]\n    num_empty_names = len([empty for empty in format_names if empty == ''])\n    if num_empty_names != 0:\n        num_empty_names -= 1\n    return len(set(format_names)) + num_empty_names\n\n\ndef parse_subsections(config: t.Dict) -> t.Dict:\n    \"\"\"Interprets [section.subsection] as nested dictionaries in `.cfg` files.\"\"\"\n    copy = cp.deepcopy(config)\n    for key, val in copy.items():\n        path = key.split('.')\n        runner = copy\n        parent = {}\n        p = None\n        for p in path:\n            if p not in runner:\n                runner[p] = {}\n            parent = runner\n            runner = runner[p]\n        parent[p] = val\n\n    for_cleanup = [key for key, _ in copy.items() if '.' 
in key]\n    for target in for_cleanup:\n        del copy[target]\n    return copy\n\n\ndef require(condition: bool, message: str, error_type: t.Type[Exception] = ValueError) -> None:\n    \"\"\"A assert-like helper that wraps text and throws an error.\"\"\"\n    if not condition:\n        raise error_type(textwrap.dedent(message))\n\n\ndef process_config(file: t.IO, config_name: str) -> Config:\n    \"\"\"Read the config file and prompt the user if it is improperly structured.\"\"\"\n    config = parse_config(file)\n\n    require(bool(config), \"Unable to parse configuration file.\")\n    require('parameters' in config,\n            \"\"\"\n            'parameters' section required in configuration file.\n\n            The 'parameters' section specifies the 'client', 'dataset', 'target_path', and\n            'partition_key' for the API client.\n\n            Please consult the documentation for more information.\"\"\")\n\n    params = config.get('parameters', {})\n    require('target_template' not in params,\n            \"\"\"\n            'target_template' is deprecated, use 'target_path' instead.\n\n            Please consult the documentation for more information.\"\"\")\n    require('target_path' in params,\n            \"\"\"\n            'parameters' section requires a 'target_path' key.\n\n            The 'target_path' is used to format the name of the output files. It\n            accepts Python 3.5+ string format symbols (e.g. '{}'). 
The number of symbols\n            should match the length of the 'partition_keys', as the 'partition_keys' args\n            are used to create the templates.\"\"\")\n    require('client' in params,\n            \"\"\"\n            'parameters' section requires a 'client' key.\n\n            Supported clients are {}\n            \"\"\".format(str(list(CLIENTS.keys()))))\n    require(params.get('client') in CLIENTS.keys(),\n            \"\"\"\n            Invalid 'client' parameter.\n\n            Supported clients are {}\n            \"\"\".format(str(list(CLIENTS.keys()))))\n    require('append_date_dirs' not in params,\n            \"\"\"\n            The current version of 'google-weather-tools' no longer supports 'append_date_dirs'!\n\n            Please refer to documentation for creating date-based directory hierarchy :\n            https://weather-tools.readthedocs.io/en/latest/Configuration.html#\"\"\"\n            \"\"\"creating-a-date-based-directory-hierarchy.\"\"\",\n            NotImplementedError)\n    require('target_filename' not in params,\n            \"\"\"\n            The current version of 'google-weather-tools' no longer supports 'target_filename'!\n\n            Please refer to documentation :\n            https://weather-tools.readthedocs.io/en/latest/Configuration.html#parameters-section.\"\"\",\n            NotImplementedError)\n\n    partition_keys = params.get('partition_keys', list())\n    if isinstance(partition_keys, str):\n        partition_keys = [partition_keys.strip()]\n\n    selection = config.get('selection', dict())\n    require(all((key in selection for key in partition_keys)),\n            \"\"\"\n            All 'partition_keys' must appear in the 'selection' section.\n\n            'partition_keys' specify how to split data for workers. 
Please consult\n            documentation for more information.\"\"\")\n\n    num_template_replacements = _number_of_replacements(params['target_path'])\n    num_partition_keys = len(partition_keys)\n\n    require(num_template_replacements == num_partition_keys,\n            \"\"\"\n            'target_path' has {0} replacements. Expected {1}, since there are {1}\n            partition keys.\n            \"\"\".format(num_template_replacements, num_partition_keys))\n\n    if 'day' in partition_keys:\n        require(selection['day'] != 'all',\n                \"\"\"If 'all' is used for a selection value, it cannot appear as a partition key.\"\"\")\n\n    if 'hdate' in selection:\n        require('date' in partition_keys,\n                \"\"\"\"If 'hdate' is specified in the 'selection' section,\n                then 'date' is required as a partition keys.\"\"\")\n\n    if 'date_range' in selection:\n        require('date_range' in partition_keys,\n                \"\"\"\"If 'date_range' is specified in the 'selection' section,\n                then it is also required as a partition keys.\"\"\")\n\n    # Ensure consistent lookup.\n    config['parameters']['partition_keys'] = partition_keys\n    # Add config file name.\n    config['parameters']['config_name'] = config_name\n\n    # Ensure the cartesian-cross can be taken on singleton values for the partition.\n    for key in partition_keys:\n        if not isinstance(selection[key], list):\n            selection[key] = [selection[key]]\n\n    return Config.from_dict(config)\n\n\ndef prepare_target_name(config: Config) -> str:\n    \"\"\"Returns name of target location.\"\"\"\n    partition_dict = OrderedDict((key, typecast(key, config.selection[key][0])) for key in config.partition_keys)\n    target = config.target_path.format(*partition_dict.values(), **partition_dict)\n\n    return target\n\n\ndef get_subsections(config: Config) -> t.List[t.Tuple[str, t.Dict]]:\n    \"\"\"Collect parameter subsections from main 
configuration.\n\n    If the `parameters` section contains subsections (e.g. '[parameters.1]',\n    '[parameters.2]'), collect the subsection key-value pairs. Otherwise,\n    return a single `('default', {})` entry (i.e. there are no subsections).\n\n    This is useful for specifying multiple API keys for your configuration.\n    For example:\n    ```\n      [parameters.alice]\n      api_key=KKKKK1\n      api_url=UUUUU1\n      [parameters.bob]\n      api_key=KKKKK2\n      api_url=UUUUU2\n      [parameters.eve]\n      api_key=KKKKK3\n      api_url=UUUUU3\n    ```\n    \"\"\"\n    return [(name, params) for name, params in config.kwargs.items()\n            if isinstance(params, dict)] or [('default', {})]\n\n\ndef all_equal(iterator):\n    iterator = iter(iterator)\n    try:\n        first = next(iterator)\n    except StopIteration:\n        return True\n    return all(first == x for x in iterator)\n\n\ndef validate_all_configs(configs: t.List[Config]) -> None:\n    clients = [conf.client for conf in configs]\n    require(all_equal(clients), f'All configs must request data from the same client, {clients[0]!r}.')\n\n    kwargs = [conf.kwargs for conf in configs]\n    require(all_equal(kwargs), 'Discrepancy in config parameters! Please check for consistency across all configs.')\n"
  },
  {
    "path": "weather_dl/download_pipeline/parsers_test.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport datetime\nimport io\nimport unittest\n\nfrom .manifest import MockManifest, Location\nfrom .parsers import (\n    date,\n    parse_config,\n    process_config,\n    _number_of_replacements,\n    parse_subsections,\n    prepare_target_name,\n)\nfrom .config import Config\n\n\nclass DateTest(unittest.TestCase):\n\n    def test_parses_relative_date(self):\n        self.assertEqual(date('-0'), datetime.date.today())\n        self.assertEqual(date('-2'), datetime.date.today() + datetime.timedelta(days=-2))\n\n    def test_parses_kebob_date(self):\n        self.assertEqual(date('2020-08-22'), datetime.date(year=2020, month=8, day=22))\n        self.assertEqual(date('1900-11-01'), datetime.date(year=1900, month=11, day=1))\n\n    def test_parses_smooshed_date(self):\n        self.assertEqual(date('20200822'), datetime.date(year=2020, month=8, day=22))\n        self.assertEqual(date('19001101'), datetime.date(year=1900, month=11, day=1))\n\n    def test_parses_year_and_day_of_year(self):\n        self.assertEqual(date('2020-235'), datetime.date(year=2020, month=8, day=22))\n        self.assertEqual(date('2021-007'), datetime.date(year=2021, month=1, day=7))\n        self.assertEqual(date('1900-305'), datetime.date(year=1900, month=11, day=1))\n\n    def test_throws_error(self):\n        with self.assertRaises(ValueError):\n            date('2020-08-22-12')\n\n        
with self.assertRaises(ValueError):\n            date('2020-0822')\n\n        with self.assertRaises(ValueError):\n            date('20-08-22')\n\n        with self.assertRaises(ValueError):\n            date('')\n\n\nclass ParseConfigTest(unittest.TestCase):\n\n    def _assert_no_newlines_in_section(self, dictionary) -> None:\n        for val in dictionary['section'].values():\n            self.assertNotIn('\\n', val)\n\n    def test_json(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\"}}') as f:\n            actual = parse_config(f)\n            self.assertDictEqual(actual, {'section': {'key': 'value'}})\n\n    def test_bad_json(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\", \"brokenKey\": }}') as f:\n            actual = parse_config(f)\n            self.assertDictEqual(actual, {})\n\n    def test_json_produces_lists(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": [0, 10, 20, 30, 40]}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], [0, 10, 20, 30, 40])\n\n    def test_json_parses_mars_list(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"1/2/3\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['1', '2', '3'])\n\n    def test_json_parses_mars_int_range(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"1/to/5\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['1', '2', '3', '4', '5'])\n\n    def test_json_parses_mars_int_range_padded(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"00/to/05\"}}') as f:\n            actual = parse_config(f)\n            
self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['00', '01', '02', '03', '04', '05'])\n\n    def test_json_parses_mars_int_range_incremented(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"1/to/5/by/2\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['1', '3', '5'])\n\n    def test_json_parses_mars_float_range(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"1.0/to/5.0\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['1.0', '2.0', '3.0', '4.0', '5.0'])\n\n    def test_json_parses_mars_float_range_incremented(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"1.0/to/5.0/by/2.0\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['1.0', '3.0', '5.0'])\n\n    def test_json_parses_mars_float_range_incremented_by_float(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"0.0/to/0.5/by/0.1\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertEqual(actual['section']['list'], ['0.0', '0.1', '0.2', '0.3', '0.4', '0.5'])\n\n    def test_json_parses_mars_date_range(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"2020-01-07/to/2020-01-09\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['2020-01-07', '2020-01-08', '2020-01-09'])\n\n    def test_json_parses_mars_relative_date_range(self):\n        with 
io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"-3/to/-1\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n\n            dates = [\n                datetime.date.today() + datetime.timedelta(-3),\n                datetime.date.today() + datetime.timedelta(-2),\n                datetime.date.today() + datetime.timedelta(-1),\n            ]\n\n            self.assertListEqual(actual['section']['list'], [d.strftime(\"%Y-%m-%d\") for d in dates])\n\n    def test_json_parses_mars_date_range_incremented(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"2020-01-07/to/2020-01-12/by/2\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['2020-01-07', '2020-01-09', '2020-01-11'])\n\n    def test_json_raises_syntax_error_missing_right(self):\n        with self.assertRaisesRegex(SyntaxError, \"Improper range syntax in '2020-01-07/to/'.\"):\n            with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"2020-01-07/to/\"}}') as f:\n                parse_config(f)\n\n    def test_json_raises_syntax_error_missing_left(self):\n        with self.assertRaisesRegex(SyntaxError, \"Improper range syntax in '/to/2020-01-07'.\"):\n            with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"/to/2020-01-07\"}}') as f:\n                parse_config(f)\n\n    def test_json_raises_syntax_error_missing_increment(self):\n        with self.assertRaisesRegex(SyntaxError, \"Improper range syntax in '2020-01-07/to/2020-01-11/by/'.\"):\n            with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"2020-01-07/to/2020-01-11/by/\"}}') as f:\n                parse_config(f)\n\n    def test_json_raises_syntax_error_no_range(self):\n        with self.assertRaisesRegex(SyntaxError, \"Improper range syntax in '2020-01-07/by/2020-01-11'.\"):\n         
   with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"2020-01-07/by/2020-01-11\"}}') as f:\n                parse_config(f)\n\n    def test_json_raises_value_error_date_types(self):\n        with self.assertRaisesRegex(\n            ValueError,\n            \"Increments on a date range must be integer number of days, '2.0' is invalid.\"\n        ):\n            with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"2020-01-07/to/2020-01-11/by/2.0\"}}') as f:\n                parse_config(f)\n\n    def test_json_raises_value_error_float_types(self):\n        with self.assertRaisesRegex(SyntaxError, \"Improper range syntax in '1.0/to/10.0/by/2020-01-07'.\"):\n            with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"1.0/to/10.0/by/2020-01-07\"}}') as f:\n                parse_config(f)\n\n    def test_json_raises_value_error_int_types(self):\n        with self.assertRaisesRegex(ValueError, \"inconsistent types.\"):\n            with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"1/to/10/by/2.0\"}}') as f:\n                parse_config(f)\n\n    def test_json_parses_accidental_extra_whitespace(self):\n        with io.StringIO('{\"section\": {\"key\": \"value\", \"list\": \"1/to/5\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['1', '2', '3', '4', '5'])\n\n    def test_json_parses_parameter_subsections(self):\n        with io.StringIO('{\"parameters\": {\"api_url\": \"https://google.com/\", \\\n                                          \"alice\": {\"api_key\": \"123\"}, \\\n                                          \"bob\": {\"api_key\": \"456\"}}}') as f:\n            actual = parse_config(f)\n            self.assertEqual(actual, {\n                'parameters': {\n                    'api_url': 'https://google.com/',\n                    'alice': {'api_key': '123'},\n                    'bob': 
{'api_key': '456'},\n                },\n            })\n\n    def test_cfg(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self.assertDictEqual(actual, {'section': {'key': 'value'}})\n\n    def test_bad_cfg(self):\n        with io.StringIO(\n                \"\"\"\n                key=value\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self.assertDictEqual(actual, {})\n\n    def test_cfg_produces_lists(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=00\n                    10\n                    20\n                    30\n                    40\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['00', '10', '20', '30', '40'])\n\n    def test_cfg_parses_mars_list(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=1/2/3\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['1', '2', '3'])\n\n    def test_cfg_parses_mars_int_range(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=1/to/5\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['1', '2', '3', '4', '5'])\n\n    def test_cfg_parses_mars_int_range_padded(self):\n        with io.StringIO(\n                \"\"\"\n                
[section]\n                key=value\n                list=00/to/05\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['00', '01', '02', '03', '04', '05'])\n\n    def test_cfg_parses_mars_int_range_incremented(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=1/to/5/by/2\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['1', '3', '5'])\n\n    def test_cfg_parses_mars_int_reverse_range_incremented(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=5/to/1/by/-2\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['5', '3', '1'])\n\n    def test_cfg_parses_mars_float_range(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=1.0/to/5.0\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['1.0', '2.0', '3.0', '4.0', '5.0'])\n\n    def test_cfg_parses_mars_float_range_incremented(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=1.0/to/5.0/by/2.0\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['1.0', '3.0', '5.0'])\n\n    
def test_cfg_parses_mars_float_range_incremented_by_float(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=0.0/to/0.5/by/0.1\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['0.0', '0.1', '0.2', '0.3', '0.4', '0.5'])\n\n    def test_cfg_parses_mars_float_reverse_range_incremented_by_float(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=0.5/to/0.0/by/-0.1\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['0.5', '0.4', '0.3', '0.2', '0.1', '0.0'])\n\n    def test_cfg_parses_mars_date_range(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=2020-01-07/to/2020-01-09\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['2020-01-07', '2020-01-08', '2020-01-09'])\n\n    def test_cfg_parses_mars_relative_date_range(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=-3/to/-1\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n\n            dates = [\n                datetime.date.today() + datetime.timedelta(-3),\n                datetime.date.today() + datetime.timedelta(-2),\n                datetime.date.today() + datetime.timedelta(-1),\n            ]\n\n            self.assertListEqual(actual['section']['list'], 
[d.strftime(\"%Y-%m-%d\") for d in dates])\n\n    def test_cfg_parses_mars_relative_date_reverse_range(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=-1/to/-3/by/-1\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            for key, val in actual['section'].items():\n                self.assertNotIn('\\n', val)\n\n            dates = [\n                datetime.date.today() + datetime.timedelta(-1),\n                datetime.date.today() + datetime.timedelta(-2),\n                datetime.date.today() + datetime.timedelta(-3),\n            ]\n\n            self.assertListEqual(actual['section']['list'], [d.strftime(\"%Y-%m-%d\") for d in dates])\n\n    def test_cfg_parses_mars_date_range_incremented(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=2020-01-07/to/2020-01-12/by/2\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['2020-01-07', '2020-01-09', '2020-01-11'])\n\n    def test_cfg_parses_mars_date_reverse_range_incremented(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=2020-01-12/to/2020-01-07/by/-2\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            for key, val in actual['section'].items():\n                self.assertNotIn('\\n', val)\n            self.assertListEqual(actual['section']['list'], ['2020-01-12', '2020-01-10', '2020-01-08'])\n\n    def test_cfg_raises_syntax_error_missing_right(self):\n        with self.assertRaisesRegex(SyntaxError, \"Improper range syntax in '2020-01-07/to/'.\"):\n            with io.StringIO(\n                    \"\"\"\n                    [section]\n 
                   key=value\n                    list=2020-01-07/to/\n                    \"\"\"\n            ) as f:\n                parse_config(f)\n\n    def test_cfg_raises_syntax_error_missing_left(self):\n        with self.assertRaisesRegex(SyntaxError, \"Improper range syntax in '/to/2020-01-07'.\"):\n            with io.StringIO(\n                    \"\"\"\n                    [section]\n                    key=value\n                    list=/to/2020-01-07\n                    \"\"\"\n            ) as f:\n                parse_config(f)\n\n    def test_cfg_raises_syntax_error_missing_increment(self):\n        with self.assertRaisesRegex(SyntaxError, \"Improper range syntax in '2020-01-07/to/2020-01-11/by/'.\"):\n            with io.StringIO(\n                    \"\"\"\n                    [section]\n                    key=value\n                    list=2020-01-07/to/2020-01-11/by/\n                    \"\"\"\n            ) as f:\n                parse_config(f)\n\n    def test_cfg_raises_syntax_error_no_range(self):\n        with self.assertRaisesRegex(SyntaxError, \"Improper range syntax in '2020-01-07/by/2020-01-11'.\"):\n            with io.StringIO(\n                    \"\"\"\n                    [section]\n                    key=value\n                    list=2020-01-07/by/2020-01-11\n                    \"\"\"\n            ) as f:\n                parse_config(f)\n\n    def test_cfg_raises_value_error_date_types(self):\n        with self.assertRaisesRegex(\n            ValueError,\n            \"Increments on a date range must be integer number of days, '2.0' is invalid.\"\n        ):\n            with io.StringIO(\n                    \"\"\"\n                    [section]\n                    key=value\n                    list=2020-01-07/to/2020-01-11/by/2.0\n                    \"\"\"\n            ) as f:\n                parse_config(f)\n\n    def test_cfg_raises_value_error_float_types(self):\n        with 
self.assertRaisesRegex(SyntaxError, \"Improper range syntax in '1.0/to/10.0/by/2020-01-07'.\"):\n            with io.StringIO(\n                    \"\"\"\n                    [section]\n                    key=value\n                    list=1.0/to/10.0/by/2020-01-07\n                    \"\"\"\n            ) as f:\n                parse_config(f)\n\n    def test_cfg_raises_value_error_int_types(self):\n        with self.assertRaisesRegex(ValueError, \"inconsistent types.\"):\n            with io.StringIO(\n                    \"\"\"\n                    [section]\n                    key=value\n                    list=1/to/10/by/2.0\n                    \"\"\"\n            ) as f:\n                parse_config(f)\n\n    def test_cfg_parses_accidental_extra_whitespace(self):\n        with io.StringIO(\n                \"\"\"\n                [section]\n                key=value\n                list=\n                    1/to/5\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(actual['section']['list'], ['1', '2', '3', '4', '5'])\n\n    def test_json_parse_year_mon_one(self):\n        with io.StringIO('{\"section\": {\"year-month\": \"2024-10/to/2024-10\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(\n                actual['section']['year-month'],\n                ['2024-10']\n            )\n\n    def test_json_parse_year_mon_one_by_one(self):\n        with io.StringIO('{\"section\": {\"year-month\": \"2024-10/to/2024-10/by/1\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(\n                actual['section']['year-month'],\n                ['2024-10']\n            )\n\n    def test_json_parse_year_mon_one_by_two(self):\n        with io.StringIO('{\"section\": 
{\"year-month\": \"2024-10/to/2024-10/by/2\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(\n                actual['section']['year-month'],\n                ['2024-10']\n            )\n\n    def test_json_parse_year_mon_six(self):\n        with io.StringIO('{\"section\": {\"year-month\": \"2024-10/to/2025-3\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(\n                actual['section']['year-month'],\n                ['2024-10', '2024-11', '2024-12', '2025-01', '2025-02', '2025-03']\n            )\n\n    def test_json_parse_year_mon_six_by_one(self):\n        with io.StringIO('{\"section\": {\"year-month\": \"2024-10/to/2025-3/by/1\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(\n                actual['section']['year-month'],\n                ['2024-10', '2024-11', '2024-12', '2025-01', '2025-02', '2025-03']\n            )\n\n    def test_json_parse_year_mon_six_by_three(self):\n        with io.StringIO('{\"section\": {\"year-month\": \"2024-10/to/2025-3/by/3\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(\n                actual['section']['year-month'],\n                ['2024-10', '2025-01']\n            )\n\n    def test_json_parse_year_mon_six_rev(self):\n        with io.StringIO('{\"section\": {\"year-month\": \"2025-3/to/2024-10\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(\n                actual['section']['year-month'],\n                ['2025-03', '2025-02', '2025-01', '2024-12', '2024-11', '2024-10']\n            )\n\n    def test_json_parse_year_mon_six_by_one_rev(self):\n        
with io.StringIO('{\"section\": {\"year-month\": \"2025-3/to/2024-10/by/-1\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(\n                actual['section']['year-month'],\n                ['2025-03', '2025-02', '2025-01', '2024-12', '2024-11', '2024-10']\n            )\n\n    def test_json_parse_year_mon_six_by_three_rev(self):\n        with io.StringIO('{\"section\": {\"year-month\": \"2025-3/to/2024-10/by/-3\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(\n                actual['section']['year-month'],\n                ['2025-03', '2024-12']\n            )\n\n    def test_json_parse_year_mon_eighteen(self):\n        with io.StringIO('{\"section\": {\"year-month\": \"2023-10/to/2025-3\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(\n                actual['section']['year-month'],\n                ['2023-10', '2023-11', '2023-12', '2024-01', '2024-02', '2024-03', '2024-04', '2024-05',\n                 '2024-06', '2024-07', '2024-08', '2024-09', '2024-10', '2024-11', '2024-12', '2025-01',\n                 '2025-02', '2025-03']\n            )\n\n    def test_json_parse_year_mon_eighteen_by_two(self):\n        with io.StringIO('{\"section\": {\"year-month\": \"2023-10/to/2025-3/by/2\"}}') as f:\n            actual = parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(\n                actual['section']['year-month'],\n                ['2023-10', '2023-12', '2024-02', '2024-04', '2024-06', '2024-08', '2024-10', '2024-12', '2025-02']\n            )\n\n    def test_json_parse_year_mon_eighteen_by_six(self):\n        with io.StringIO('{\"section\": {\"year-month\": \"2023-10/to/2025-3/by/6\"}}') as f:\n            actual = 
parse_config(f)\n            self._assert_no_newlines_in_section(actual)\n            self.assertListEqual(\n                actual['section']['year-month'],\n                ['2023-10', '2024-04', '2024-10']\n            )\n\n    def test_cfg_parses_parameter_subsections(self):\n        with io.StringIO(\n                \"\"\"\n                [parameters]\n                api_url=https://google.com/\n                [parameters.alice]\n                api_key=123\n                [parameters.bob]\n                api_key=456\n                \"\"\"\n        ) as f:\n            actual = parse_config(f)\n            self.assertEqual(actual, {\n                'parameters': {\n                    'api_url': 'https://google.com/',\n                    'alice': {'api_key': '123'},\n                    'bob': {'api_key': '456'},\n                },\n            })\n\n\nclass HelpersTest(unittest.TestCase):\n\n    CASES = [('', 0), ('{} blah', 1), ('{} {}', 2),\n             ('{0}, {1}', 2), ('%s hello', 0), ('hello {.2f}', 1),\n             ('ear5-{year}{year}-{month}', 2), ('era5-{year}/{year}-{}-{}', 3),\n             ('{year}{year}{year}{year}', 1)]\n\n    def test_number_of_replacements(self):\n        for s, want in self.CASES:\n            with self.subTest(s=s, want=want):\n                actual = _number_of_replacements(s)\n                self.assertEqual(actual, want)\n\n\nclass SubsectionsTest(unittest.TestCase):\n    def test_parses_config_subsections(self):\n        config = {\"parsers\": {'a': 1, 'b': 2}, \"parsers.1\": {'b': 3}}\n\n        actual = parse_subsections(config)\n\n        self.assertEqual(actual, {'parsers': {'a': 1, 'b': 2, '1': {'b': 3}}})\n\n\nclass ApiKeyCountingTest(unittest.TestCase):\n    def test_no_keys(self):\n        config = {\"parameters\": {'a': 1, 'b': 2}, \"parameters.1\": {'b': 3}}\n\n        actual = parse_subsections(config)\n\n        self.assertEqual(actual, {'parameters': {'a': 1, 'b': 2, '1': {'b': 3}}})\n\n    def 
test_api_keys(self):\n        config = {\"parameters\": {'a': 1, 'b': 2},\n                  \"parameters.param1\": {'api_key': 'key1'},\n                  \"parameters.param2\": {'api_key': 'key2'}}\n\n        actual = parse_subsections(config)\n\n        self.assertEqual(actual,\n                         {'parameters': {'a': 1, 'b': 2,\n                                         'param1': {'api_key': 'key1'},\n                                         'param2': {'api_key': 'key2'}}})\n\n\nclass ProcessConfigTest(unittest.TestCase):\n\n    def test_parse_config(self):\n        with self.assertRaises(ValueError) as ctx:\n            with io.StringIO(\n                    \"\"\"\n                    key=value\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n        self.assertEqual(\"Unable to parse configuration file.\", ctx.exception.args[0])\n\n    def test_require_params_section(self):\n        with self.assertRaises(ValueError) as ctx:\n            with io.StringIO(\n                    \"\"\"\n                    [selection]\n                    key=value\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n        self.assertIn(\n            \"'parameters' section required in configuration file.\",\n            ctx.exception.args[0])\n\n    def test_accepts_parameters_section(self):\n        with self.assertRaises(ValueError) as ctx:\n            with io.StringIO(\n                    \"\"\"\n                    [parameters]\n                    key=value\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n        self.assertNotIn(\n            \"'parameters' section required in configuration file.\",\n            ctx.exception.args[0])\n\n    def test_requires_target_path_param(self):\n        with self.assertRaises(ValueError) as ctx:\n            with io.StringIO(\n                    \"\"\"\n                    
[parameters]\n                    dataset=foo\n                    client=cds\n                    target=bar\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n        self.assertIn(\n            \"'parameters' section requires a 'target_path' key.\",\n            ctx.exception.args[0])\n\n    def test_requires_target_template_param_not_present(self):\n        with self.assertRaises(ValueError) as ctx:\n            with io.StringIO(\n                    \"\"\"\n                    [parameters]\n                    dataset=foo\n                    client=cds\n                    target_template=bar\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n        self.assertIn(\n            \"'target_template' is deprecated, use 'target_path' instead.\",\n            ctx.exception.args[0])\n\n    def test_accepts_target_path_param(self):\n        with self.assertRaises(ValueError) as ctx:\n            with io.StringIO(\n                    \"\"\"\n                    [parameters]\n                    dataset=foo\n                    client=cds\n                    target_path\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n        self.assertNotIn(\n            \"'parameters' section requires a 'target_path' key.\",\n            ctx.exception.args[0])\n\n    def test_requires_partition_keys_to_match_sections(self):\n        with self.assertRaises(ValueError) as ctx:\n            with io.StringIO(\n                    \"\"\"\n                    [parameters]\n                    dataset=foo\n                    client=cds\n                    target_path=bar-{}-{}\n                    partition_keys=\n                        year\n                        month\n                    [selection]\n                    day=\n                        01\n                        02\n                        03\n                    
decade=\n                        1950\n                        1960\n                        1970\n                        1980\n                        1990\n                        2000\n                        2010\n                        2020\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n        self.assertIn(\n            \"All 'partition_keys' must appear in the 'selection' section.\",\n            ctx.exception.args[0])\n\n    def test_accepts_partition_keys_matching_sections(self):\n        with io.StringIO(\n                \"\"\"\n                [parameters]\n                dataset=foo\n                client=cds\n                target_path=bar-{}-{}\n                partition_keys=\n                    year\n                    month\n                [selection]\n                month=\n                    01\n                    02\n                    03\n                year=\n                    1950\n                    1960\n                    1970\n                    1980\n                    1990\n                    2000\n                    2010\n                    2020\n                \"\"\"\n        ) as f:\n            config = process_config(f, 'test.cfg')\n            self.assertTrue(bool(config))\n\n    def test_accepts_partition_keys_not_present(self):\n        with io.StringIO(\n                \"\"\"\n                [parameters]\n                dataset=foo\n                client=cds\n                target_path=bar\n                [selection]\n                month=\n                    01\n                    02\n                    03\n                year=\n                    1950\n                    1960\n                    1970\n                    1980\n                    1990\n                    2000\n                    2010\n                    2020\n                \"\"\"\n        ) as f:\n            config = process_config(f, 'test.cfg')\n            
self.assertTrue(bool(config))\n\n    def test_treats_partition_keys_as_list(self):\n        with io.StringIO(\n                \"\"\"\n                [parameters]\n                dataset=foo\n                client=cds\n                target_path=bar-{}\n                partition_keys=month\n                [selection]\n                month=\n                    01\n                    02\n                    03\n                \"\"\"\n        ) as f:\n            config = process_config(f, 'test.cfg')\n            self.assertIsInstance(config.partition_keys, list)\n\n    def test_mismatched_template_partition_keys(self):\n        with self.assertRaises(ValueError) as ctx:\n            with io.StringIO(\n                    \"\"\"\n                    [parameters]\n                    dataset=foo\n                    client=cds\n                    target_path=bar-{}\n                    partition_keys=\n                        year\n                        month\n                    [selection]\n                    month=\n                        01\n                        02\n                        03\n                    year=\n                        1950\n                        1960\n                        1970\n                        1980\n                        1990\n                        2000\n                        2010\n                        2020\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n        self.assertIn(\n            \"'target_path' has 1 replacements. 
Expected 2\",\n            ctx.exception.args[0])\n\n    def test_append_date_dirs_raise_error(self):\n        with self.assertRaises(NotImplementedError) as ctx:\n            with io.StringIO(\n                    \"\"\"\n                    [parameters]\n                    dataset=foo\n                    client=cds\n                    target_path=somewhere/bar-{}\n                    append_date_dirs=true\n                    partition_keys=\n                        date\n                    [selection]\n                    date=2017-01-01/to/2017-01-01\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n        self.assertIn(\n            \"The current version of 'google-weather-tools' no longer supports 'append_date_dirs'!\"\n            \"\\n\\nPlease refer to documentation for creating date-based directory hierarchy :\\n\"\n            \"https://weather-tools.readthedocs.io/en/latest/Configuration.html\"\n            \"#creating-a-date-based-directory-hierarchy.\",\n            ctx.exception.args[0])\n\n    def test_target_filename_raise_error(self):\n        with self.assertRaises(NotImplementedError) as ctx:\n            with io.StringIO(\n                    \"\"\"\n                    [parameters]\n                    dataset=foo\n                    client=cds\n                    target_path=somewhere/\n                    target_filename=bar-{}\n                    partition_keys=\n                        date\n                    [selection]\n                    date=2017-01-01/to/2017-01-01\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n        self.assertIn(\n            \"The current version of 'google-weather-tools' no longer supports 'target_filename'!\"\n            \"\\n\\nPlease refer to documentation :\\n\"\n            \"https://weather-tools.readthedocs.io/en/latest/Configuration.html#parameters-section.\",\n            
ctx.exception.args[0])\n\n    def test_client_not_set(self):\n        with self.assertRaises(ValueError) as ctx:\n            with io.StringIO(\n                    \"\"\"\n                    [parameters]\n                    dataset=foo\n                    target_path=bar-{}\n                    partition_keys=\n                        year\n                    [selection]\n                    year=\n                        1969\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n        self.assertIn(\n            \"'parameters' section requires a 'client' key.\",\n            ctx.exception.args[0])\n\n    def test_client_invalid(self):\n        with self.assertRaises(ValueError) as ctx:\n            with io.StringIO(\n                    \"\"\"\n                    [parameters]\n                    dataset=foo\n                    client=nope\n                    target_path=bar-{}\n                    partition_keys=\n                        year\n                    [selection]\n                    year=\n                        1969\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n        self.assertIn(\n            \"Invalid 'client' parameter.\",\n            ctx.exception.args[0])\n\n    def test_partition_cannot_include_all(self):\n        with self.assertRaisesRegex(ValueError, 'cannot appear as a partition key.'):\n            with io.StringIO(\n                    \"\"\"\n                    [parameters]\n                    dataset=foo\n                    client=cds\n                    target_path=bar-{}-{}\n                    partition_keys=\n                        month\n                        day\n                    [selection]\n                    year=2012\n                    month=\n                        01\n                        02\n                        03\n                    day=all\n                    \"\"\"\n            ) as 
f:\n                process_config(f, 'test.cfg')\n\n    def test_partition_key_contains_date_range(self):\n        with self.assertRaisesRegex(ValueError, \"\"\"If 'date_range' is specified in the 'selection' section,\n                then it is also required as a partition keys.\"\"\"):\n            with io.StringIO(\n                    \"\"\"\n                    [parameters]\n                    dataset=foo\n                    client=ecpublic\n                    target_path=bar-{}\n                    partition_keys=\n                        time\n                    [selection]\n                    date_range=2017-01-01/to/2017-01-10\n                    time=00/12\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n    def test_partition_key_contains_date__in_case_of_hdate(self):\n        with self.assertRaisesRegex(ValueError, \"'date' is required as a partition keys.\"):\n            with io.StringIO(\n                    \"\"\"\n                    [parameters]\n                    dataset=foo\n                    client=ecpublic\n                    target_path=bar-{}\n                    partition_keys=\n                        step\n                    [selection]\n                    date=2020-01-02\n                    step=1/2/3/4\n                    hdate=1/to/6\n                    \"\"\"\n            ) as f:\n                process_config(f, 'test.cfg')\n\n    def test_singleton_partitions_are_converted_to_lists(self):\n        with io.StringIO(\n                \"\"\"\n                [parameters]\n                dataset=foo\n                client=cds\n                target_path=bar-{}-{}\n                partition_keys=\n                    month\n                    year\n                [selection]\n                month=01\n                year=2018\n                \"\"\"\n        ) as f:\n            config = process_config(f, 'test.cfg')\n            
self.assertEqual(config.selection['month'], ['01'])\n            self.assertEqual(config.selection['year'], ['2018'])\n\n\nclass PrepareTargetNameTest(unittest.TestCase):\n    TEST_CASES = [\n        dict(case='No date.',\n             config={\n                 'parameters': {\n                     'partition_keys': ['year', 'month'],\n                     'target_path': 'download-{}-{}.nc',\n                     'force_download': False\n                 },\n                 'selection': {\n                     'features': ['pressure'],\n                     'month': ['12'],\n                     'year': ['02']\n                 }\n             },\n             expected='download-2-12.nc'),\n        dict(case='Has date but no target directory.',\n             config={\n                 'parameters': {\n                     'partition_keys': ['date'],\n                     'target_path': 'download-{}.nc',\n                     'force_download': False\n                 },\n                 'selection': {\n                     'features': ['pressure'],\n                     'date': ['2017-01-15'],\n                 }\n             },\n             expected='download-2017-01-15.nc'),\n        dict(case='Has Directory, but no date',\n             config={\n                 'parameters': {\n                     'target_path': 'somewhere/download/{:02d}/{:02d}.nc',\n                     'partition_keys': ['year', 'month'],\n                     'force_download': False\n                 },\n                 'selection': {\n                     'features': ['pressure'],\n                     'month': ['02'],\n                     'year': ['02']\n                 }\n             },\n             expected='somewhere/download/02/02.nc'),\n        dict(case='Had date and target directory',\n             config={\n                 'parameters': {\n                     'partition_keys': ['date'],\n                     'target_path': 'somewhere/{date:%Y/%m/%d}-download.nc',\n     
                'force_download': False\n                 },\n                 'selection': {\n                     'date': ['2017-01-15'],\n                 }\n             },\n             expected='somewhere/2017/01/15-download.nc'),\n        dict(case='Had date, target directory, and additional params.',\n             config={\n                 'parameters': {\n                     'partition_keys': ['date', 'pressure_level'],\n                     'target_path': 'somewhere/{date:%Y/%m/%d}-pressure-{pressure_level}.nc',\n                     'force_download': False\n                 },\n                 'selection': {\n                     'features': ['pressure'],\n                     'pressure_level': ['500'],\n                     'date': ['2017-01-15'],\n                 }\n             },\n             expected='somewhere/2017/01/15-pressure-500.nc'),\n        dict(case='Has date and target directory, including parameters in path.',\n             config={\n                 'parameters': {\n                     'partition_keys': ['date', 'expver', 'pressure_level'],\n                     'target_path': 'somewhere/expver-{expver}/{date:%Y/%m/%d}-pressure-{pressure_level}.nc',\n                     'force_download': False\n                 },\n                 'selection': {\n                     'features': ['pressure'],\n                     'pressure_level': ['500'],\n                     'date': ['2017-01-15'],\n                     'expver': ['1'],\n                 }\n             },\n             expected='somewhere/expver-1/2017/01/15-pressure-500.nc'),\n        dict(case='Has date_range',\n             config={\n                 'parameters': {\n                     'partition_keys': ['date_range', 'time'],\n                     'target_path': 'somewhere/{date_range}-{time}.nc',\n                     'force_download': False\n                 },\n                 'selection': {\n                     'features': ['pressure'],\n                     
'date_range': ['2017-01-01/to/2017-01-10'],\n                     'time': ['00']\n                 }\n             },\n             expected='somewhere/2017-01-01_to_2017-01-10-00:00:00.nc'),\n\n    ]\n\n    def setUp(self) -> None:\n        self.dummy_manifest = MockManifest(Location('dummy-manifest'))\n\n    def test_target_name(self):\n        for it in self.TEST_CASES:\n            with self.subTest(msg=it['case'], **it):\n                actual = prepare_target_name(Config.from_dict(it['config']))\n                self.assertEqual(actual, it['expected'])\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "weather_dl/download_pipeline/partition.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport copy as cp\nimport dataclasses\nimport itertools\nimport logging\nimport math\nimport typing as t\n\nimport apache_beam as beam\n\nfrom .config import Config\nfrom .manifest import Manifest, NoOpManifest, Location\nfrom .parsers import prepare_target_name\nfrom .stores import Store, FSStore\nfrom .util import ichunked, generate_hdate\n\nPartition = t.Tuple[str, t.Dict, Config]\nIndex = t.Tuple[int]\n\nlogger = logging.getLogger(__name__)\n\n\n@dataclasses.dataclass\nclass PartitionConfig(beam.PTransform):\n    \"\"\"Partition a config into multiple data requests.\n\n    Partitioning involves four main operations: First, we fan-out shards based on\n    partition keys (a cross product of the values). Second, we filter out existing\n    downloads (unless we want to force downloads). Next, we add subsections to the\n    configs in a cycle (to ensure an even distribution of extra parameters). 
Last,\n    we assemble each partition into a single Config.\n\n    Attributes:\n        store: A cloud storage system, used for checking the existence of downloads.\n        subsections: A cycle of (name, parameter) tuples.\n        manifest: A download manifest to register preparation state.\n        scheduling: How to sort partitions from multiple configs: in order of each\n           config (default), or in \"fair\" order, where partitions from each config\n           are evenly rotated.\n        partition_chunks: The size of chunks of partition shards to use during the\n            fan-out stage. By default, this operation will aim to divide all the\n            partitions into groups of about 1000 sized-chunks.\n        num_groups: The number of groups (for fair scheduling). Default: 1.\n    \"\"\"\n\n    store: Store\n    subsections: itertools.cycle\n    manifest: Manifest\n    scheduling: str\n    partition_chunks: t.Optional[int] = None\n    update_manifest: bool = False\n    num_groups: int = 1\n\n    def expand(self, configs):\n        def loop_through_subsections(it: Config) -> Partition:\n            \"\"\"Assign a subsection to each config in a loop.\n\n            If the `parameters` section contains subsections (e.g. '[parameters.1]',\n            '[parameters.2]'), collect a repeating cycle of the subsection key-value\n            pairs. 
Otherwise, assign a default section to each config.\n\n            This is useful for specifying multiple API keys for your configuration.\n\n            For example:\n            ```\n              [parameters.alice]\n              api_key=KKKKK1\n              api_url=UUUUU1\n              [parameters.bob]\n              api_key=KKKKK2\n              api_url=UUUUU2\n              [parameters.eve]\n              api_key=KKKKK3\n              api_url=UUUUU3\n            ```\n            \"\"\"\n            name, params = next(self.subsections)\n            return name, params, it\n\n        if self.scheduling == 'fair':\n            config_idxs = (\n                    configs\n                    | beam.combiners.ToList()\n                    | 'Fair Fan-out' >> beam.FlatMap(prepare_fair_partition_index,\n                                                     chunk_size=self.partition_chunks,\n                                                     groups=self.num_groups)\n            )\n        else:\n            config_idxs = (\n                    configs\n                    | 'Fan-out' >> beam.FlatMap(prepare_partition_index, chunk_size=self.partition_chunks)\n            )\n\n        return (\n                config_idxs\n                | beam.Reshuffle()\n                | 'To configs' >> beam.FlatMapTuple(prepare_partitions_from_index)\n                | 'Skip existing' >> beam.Filter(new_downloads_only,\n                                                 store=self.store,\n                                                 manifest=self.manifest)\n                | 'Cycle subsections' >> beam.Map(loop_through_subsections)\n                | 'Assemble' >> beam.Map(assemble_config, manifest=self.manifest)\n            )\n\n\ndef _create_partition_config(option: t.Tuple, config: Config) -> Config:\n    \"\"\"Create a config for a single partition option.\n\n    Output a config dictionary, overriding the range of values for\n    each key with the partition instance 
in 'selection'.\n    Continuing the example from prepare_partitions, the selection section\n    would be:\n      { 'foo': ..., 'year': ['2020'], 'month': ['01'], ... }\n      { 'foo': ..., 'year': ['2020'], 'month': ['02'], ... }\n      { 'foo': ..., 'year': ['2020'], 'month': ['03'], ... }\n\n    Args:\n        option: A single item in the range of partition_keys.\n        config: The download config, including the parameters and selection sections.\n\n    Returns:\n        A configuration that selects a single download partition.\n    \"\"\"\n    copy = cp.deepcopy(config.selection)\n    out = cp.deepcopy(config)\n    for idx, key in enumerate(config.partition_keys):\n        copy[key] = [option[idx]]\n\n    # Replace hdate with actual value.\n    if 'hdate' in copy:\n        copy['hdate'] = [generate_hdate(copy['date'][0], v) for v in copy['hdate']]\n\n    out.selection = copy\n    return out\n\n\ndef skip_partition(config: Config, store: Store, manifest: Manifest) -> bool:\n    \"\"\"Return true if partition should be skipped.\"\"\"\n\n    if config.force_download:\n        return False\n\n    target = prepare_target_name(config)\n    if store.exists(target):\n        logger.info(f'file {target} found, skipping.')\n        manifest.skip(config.config_name, config.dataset, config.selection, target, config.user_id)\n        return True\n\n    return False\n\n\ndef prepare_partition_index(config: Config,\n                            chunk_size: t.Optional[int] = None) -> t.Iterator[t.Tuple[Config, t.List[Index]]]:\n    \"\"\"Produce indexes over client parameters, partitioning over `partition_keys`\n\n    This produces a Cartesian-Cross over the range of keys.\n\n    For example, if the keys were 'year' and 'month', it would produce\n    an iterable like:\n        ( (0, 0), (0, 1), (0, 2), ...)\n\n    After the indexes were converted back to keys, it would produce values like:\n        ( ('2020', '01'), ('2020', '02'), ('2020', '03'), ...)\n\n    Returns:\n   
     An iterator of index tuples.\n    \"\"\"\n    dims = [range(len(config.selection[key])) for key in config.partition_keys]\n    n_partitions = math.prod([len(d) for d in dims])\n    logger.info(f'Creating {n_partitions} partitions.')\n\n    if chunk_size is None:\n        chunk_size = 1000\n\n    if not dims:\n        yield config, []\n    else:\n        for option_idx in ichunked(itertools.product(*dims), chunk_size):\n            yield config, list(option_idx)\n\n\ndef prepare_partitions_from_index(config: Config, indexes: t.List[Index]) -> t.Iterator[Config]:\n    \"\"\"Convert a partition index into a config.\n\n    Returns:\n        A partition `Config` from an option index.\n    \"\"\"\n    if not indexes:\n        yield _create_partition_config((), config)\n    else:\n        for index in indexes:\n            option = tuple(\n                config.selection[config.partition_keys[key_idx]][val_idx] for key_idx, val_idx in enumerate(index)\n            )\n            yield _create_partition_config(option, config)\n\n\ndef new_downloads_only(candidate: Config, store: t.Optional[Store] = None,\n                       manifest: Manifest = NoOpManifest(Location('noop://in-memory'))) -> bool:\n    \"\"\"Predicate function to skip already downloaded partitions.\"\"\"\n    if store is None:\n        store = FSStore()\n    should_skip = skip_partition(candidate, store, manifest)\n    if should_skip:\n        beam.metrics.Metrics.counter('Prepare', 'skipped').inc()\n    return not should_skip\n\n\ndef assemble_config(partition: Partition, manifest: Manifest) -> Config:\n    \"\"\"Assemble the configuration for a single partition.\n\n    For each cross product of the 'selection' sections, the output dictionary\n    will overwrite parameters from the extra param subsections, evenly cycling\n    through each subsection.\n\n    For example:\n      { 'parameters': {... 'api_key': KKKKK1, ... }, ... }\n      { 'parameters': {... 'api_key': KKKKK2, ... }, ... }\n      { 'parameters': {... 
'api_key': KKKKK2, ... }, ... }\n      { 'parameters': {... 'api_key': KKKKK3, ... }, ... }\n      { 'parameters': {... 'api_key': KKKKK1, ... }, ... }\n      { 'parameters': {... 'api_key': KKKKK2, ... }, ... }\n      { 'parameters': {... 'api_key': KKKKK3, ... }, ... }\n      ...\n\n    Returns:\n        An `Config` assembled out of subsection parameters and config shards.\n    \"\"\"\n    name, params, out = partition\n    out.kwargs.update(params)\n    out.subsection_name = name\n\n    location = prepare_target_name(out)\n    user = out.user_id\n    manifest.schedule(out.config_name, out.dataset, out.selection, location, user)\n\n    logger.info(f'[{name}] Created partition {location!r}.')\n    beam.metrics.Metrics.counter('Subsection', name).inc()\n\n    return out\n\n\ndef cycle_iters(iters: t.List[t.Iterator], take: int = 1) -> t.Iterator:\n    \"\"\"Evenly cycle through a list of iterators.\n\n    Args:\n        iters: A list of iterators to evely cycle through.\n        take: Yield N items at a time. 
When not set to 1, this will yield\n          multiple items from the same collection.\n\n    Returns:\n        An iteration across several iterators in a round-robin order.\n    \"\"\"\n    while iters:\n        for i, it in enumerate(iters):\n            try:\n                for j in range(take):\n                    logger.debug(f'yielding item {j!r} from iterable {i!r}.')\n                    yield next(it)\n            except StopIteration:\n                iters.remove(it)\n\n\ndef prepare_fair_partition_index(configs: t.List[Config],\n                                 chunk_size: t.Optional[int],\n                                 groups: int) -> t.Iterator[t.Tuple[Config, t.List[Index]]]:\n    \"\"\"Given a list of all configs, evenly cycle through each partition chunked by the 'chunk_size'.\"\"\"\n    if chunk_size is None:\n        chunk_size = 1\n    iters = [prepare_partition_index(config, chunk_size) for config in configs]\n    yield from cycle_iters(iters, take=groups)\n"
  },
  {
    "path": "weather_dl/download_pipeline/partition_test.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport itertools\nimport json\nimport tempfile\nimport typing as t\nimport unittest\nfrom unittest.mock import MagicMock\n\nimport apache_beam as beam\nfrom xarray_beam._src.test_util import EagerPipeline\n\nfrom .manifest import MockManifest, Location, DownloadStatus, LocalManifest, Status, Stage\nfrom .parsers import get_subsections\nfrom .partition import skip_partition, PartitionConfig\nfrom .stores import InMemoryStore, Store\nfrom .config import Config\n\n\nclass OddFilesDoNotExistStore(InMemoryStore):\n    def __init__(self):\n        super().__init__()\n        self.count = 0\n\n    def exists(self, filename: str) -> bool:\n        ret = self.count % 2 == 0\n        self.count += 1\n        return ret\n\n\nclass PreparePartitionTest(unittest.TestCase):\n\n    def setUp(self) -> None:\n        self.dummy_manifest = MockManifest(Location('mock://dummy'))\n\n    def create_partition_configs(self, configs,\n                                 store: t.Optional[Store] = None,\n                                 schedule='in-order',\n                                 n_requests_per: int = 1) -> t.List[Config]:\n        subsections = get_subsections(configs[0])\n        params_cycle = itertools.cycle(subsections)\n\n        return (EagerPipeline()\n                | beam.Create(configs)\n                | PartitionConfig(store, params_cycle, self.dummy_manifest, schedule,\n 
                                 len(subsections) * n_requests_per))\n\n    def test_partition_single_key(self):\n        config = {\n            'parameters': {\n                'partition_keys': ['year'],\n                'target_path': 'download-{}.nc',\n            },\n            'selection': {\n                'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                'month': [str(i) for i in range(1, 13)],\n                'year': [str(i) for i in range(2015, 2021)]\n            }\n        }\n\n        config_obj = Config.from_dict(config)\n        actual = self.create_partition_configs([config_obj])\n\n        self.assertListEqual([d.selection for d in actual], [\n            {**config['selection'], **{'year': [str(i)]}}\n            for i in range(2015, 2021)\n        ])\n\n    def test_partition_multi_key(self):\n        config = {\n            'parameters': {\n                'partition_keys': ['year', 'month'],\n                'target_path': 'download-{}-{}.nc',\n            },\n            'selection': {\n                'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                'month': [str(i) for i in range(1, 3)],\n                'year': [str(i) for i in range(2015, 2017)]\n            }\n        }\n\n        config_obj = Config.from_dict(config)\n        actual = self.create_partition_configs([config_obj])\n\n        self.assertListEqual([d.selection for d in actual], [\n            {**config['selection'], **{'year': ['2015'], 'month': ['1']}},\n            {**config['selection'], **{'year': ['2015'], 'month': ['2']}},\n            {**config['selection'], **{'year': ['2016'], 'month': ['1']}},\n            {**config['selection'], **{'year': ['2016'], 'month': ['2']}},\n        ])\n\n    def test_partition_multi_key_single_values(self):\n        config = {\n            'parameters': {\n                'partition_keys': ['year', 'month'],\n                'target_path': 
'download-{}-{}.nc',\n            },\n            'selection': {\n                'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                'month': ['1'],\n                'year': ['2015'],\n            }\n        }\n\n        config_obj = Config.from_dict(config)\n        actual = self.create_partition_configs([config_obj])\n\n        self.assertListEqual([d.selection for d in actual], [\n            {**config['selection'], **{'year': ['2015'], 'month': ['1']}},\n        ])\n\n    def test_partition_multi_params_multi_key(self):\n        config = {\n            'parameters': dict(\n                partition_keys=['year', 'month'],\n                target_path='download-{}-{}.nc',\n                research={\n                    'api_key': 'KKKK1',\n                    'api_url': 'UUUU1'\n                },\n                cloud={\n                    'api_key': 'KKKK2',\n                    'api_url': 'UUUU2'\n                },\n                deepmind={\n                    'api_key': 'KKKK3',\n                    'api_url': 'UUUU3'\n                }\n            ),\n            'selection': {\n                'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                'month': [str(i) for i in range(1, 3)],\n                'year': [str(i) for i in range(2015, 2017)]\n            }\n        }\n\n        config_obj = Config.from_dict(config)\n        actual = self.create_partition_configs([config_obj])\n\n        expected = [Config.from_dict(it) for it in [\n            {'parameters': dict(config['parameters'], api_key='KKKK1', api_url='UUUU1', subsection_name='research'),\n             'selection': {**config['selection'],\n                           **{'year': ['2015'], 'month': ['1']}}},\n            {'parameters': dict(config['parameters'], api_key='KKKK2', api_url='UUUU2', subsection_name='cloud'),\n             'selection': {**config['selection'],\n                           **{'year': 
['2015'], 'month': ['2']}}},\n            {'parameters': dict(config['parameters'], api_key='KKKK3', api_url='UUUU3', subsection_name='deepmind'),\n             'selection': {**config['selection'],\n                           **{'year': ['2016'], 'month': ['1']}}},\n            {'parameters': dict(config['parameters'], api_key='KKKK1', api_url='UUUU1', subsection_name='research'),\n             'selection': {**config['selection'],\n                           **{'year': ['2016'], 'month': ['2']}}},\n        ]]\n\n        self.assertListEqual(actual, expected)\n\n    def test_prepare_partition_records_download_status_to_manifest(self):\n        config = {\n            'parameters': {\n                'partition_keys': ['year'],\n                'target_path': 'download-{}.nc',\n            },\n            'selection': {\n                'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                'month': [str(i) for i in range(1, 13)],\n                'year': [str(i) for i in range(2015, 2021)]\n            }\n        }\n\n        config_obj = Config.from_dict(config)\n\n        with tempfile.TemporaryDirectory() as tmpdir:\n            self.dummy_manifest = LocalManifest(Location(tmpdir))\n\n            self.create_partition_configs([config_obj])\n\n            with open(self.dummy_manifest.location, 'r') as f:\n                actual = json.load(f)\n\n            self.assertListEqual(\n                [d['selection'] for d in actual.values()], [\n                    json.dumps({**config['selection'], **{'year': [str(i)]}})\n                    for i in range(2015, 2021)\n                ])\n\n            self.assertTrue(\n                all([d['status'] == 'scheduled' for d in actual.values()])\n            )\n\n    def test_prepare_partition_records_download_status_to_manifest_for_already_downloaded_shard(self):\n        config = {\n            'parameters': {\n                'partition_keys': ['year'],\n                
'target_path': 'download-{}.nc',\n            },\n            'selection': {\n                'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                'month': [str(i) for i in range(1, 13)],\n                'year': [str(i) for i in range(2015, 2021)]\n            }\n        }\n\n        config_obj = Config.from_dict(config)\n\n        with tempfile.TemporaryDirectory() as tmpdir:\n            self.mock_store = InMemoryStore()\n            self.mock_store.open('download-2015.nc')\n            self.dummy_manifest = LocalManifest(Location(tmpdir))\n\n            self.create_partition_configs(configs=[config_obj], store=self.mock_store)\n\n            with open(self.dummy_manifest.location, 'r') as f:\n                actual = json.load(f)\n\n            self.assertListEqual(\n                [d['selection'] for d in actual.values()], [\n                    json.dumps({**config['selection'], **{'year': [str(i)]}})\n                    for i in range(2015, 2021)\n                ])\n\n            self.assertTrue(\n                all([d['status'] == 'scheduled' if d['location'] != 'download-2015.nc'\n                     else d['stage'] == 'upload' and d['status'] == 'success'\n                     for d in actual.values()])\n            )\n\n    def test_prepare_partition_update_download_status_for_downloaded_shard_missing_upload_entry(self):\n        config = {\n            'parameters': {\n                'partition_keys': ['year'],\n                'target_path': 'download-{}.nc',\n            },\n            'selection': {\n                'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                'month': [str(i) for i in range(1, 13)],\n                'year': [str(i) for i in range(2015, 2021)]\n            }\n        }\n\n        config_obj = Config.from_dict(config)\n\n        with tempfile.TemporaryDirectory() as tmpdir:\n            self.mock_store = InMemoryStore()\n            
self.mock_store.open('download-2015.nc')\n            self.dummy_manifest = LocalManifest(Location(tmpdir))\n            download_status = DownloadStatus(selection={**config['selection'], **{'year': [2015]}},\n                                             location='download-2015.nc',\n                                             status=Status.SUCCESS,\n                                             stage=Stage.DOWNLOAD)\n            self.dummy_manifest._update(download_status)\n\n            self.create_partition_configs(configs=[config_obj], store=self.mock_store)\n\n            with open(self.dummy_manifest.location, 'r') as f:\n                actual = json.load(f)\n\n            self.assertListEqual(\n                [d['selection'] for d in actual.values()], [\n                    json.dumps({**config['selection'], **{'year': [str(i)]}})\n                    for i in range(2015, 2021)\n                ])\n\n            self.assertTrue(\n                all([d['status'] == 'scheduled' if d['location'] != 'download-2015.nc'\n                     else d['stage'] == 'upload' and d['status'] == 'success'\n                     for d in actual.values()])\n            )\n\n    def test_prepare_partition_update_manifest_for_failed_upload_status_of_downloaded_shard(self):\n        config = {\n            'parameters': {\n                'partition_keys': ['year'],\n                'target_path': 'download-{}.nc',\n            },\n            'selection': {\n                'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                'month': [str(i) for i in range(1, 13)],\n                'year': [str(i) for i in range(2015, 2021)]\n            }\n        }\n\n        config_obj = Config.from_dict(config)\n\n        with tempfile.TemporaryDirectory() as tmpdir:\n            self.mock_store = InMemoryStore()\n            self.mock_store.open('download-2015.nc')\n            self.dummy_manifest = LocalManifest(Location(tmpdir))\n            
download_status = DownloadStatus(selection={**config['selection'], **{'year': [2015]}},\n                                             location='download-2015.nc',\n                                             status=Status.FAILURE,\n                                             error='error',\n                                             stage=Stage.UPLOAD)\n            self.dummy_manifest._update(download_status)\n\n            self.create_partition_configs(configs=[config_obj], store=self.mock_store)\n\n            with open(self.dummy_manifest.location, 'r') as f:\n                actual = json.load(f)\n\n            self.assertListEqual(\n                [d['selection'] for d in actual.values()], [\n                    json.dumps({**config['selection'], **{'year': [str(i)]}})\n                    for i in range(2015, 2021)\n                ])\n\n            self.assertTrue(\n                all([d['status'] == 'scheduled' if d['location'] != 'download-2015.nc'\n                     else d['stage'] == 'upload' and d['status'] == 'success'\n                     for d in actual.values()])\n            )\n\n    def test_skip_partitions__never_unbalances_licenses(self):\n        skip_odd_files = OddFilesDoNotExistStore()\n        config = {\n            'parameters': dict(\n                partition_keys=['year', 'month'],\n                target_path='download-{}-{}.nc',\n                research={\n                    'api_key': 'KKKK1',\n                    'api_url': 'UUUU1'\n                },\n                cloud={\n                    'api_key': 'KKKK2',\n                    'api_url': 'UUUU2'\n                }\n            ),\n            'selection': {\n                'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                'month': [str(i) for i in range(1, 3)],\n                'year': [str(i) for i in range(2016, 2020)]\n            }\n        }\n\n        config_obj = Config.from_dict(config)\n        actual = 
self.create_partition_configs([config_obj], store=skip_odd_files)\n        research_configs = [cfg for cfg in actual if cfg and t.cast('str', cfg.kwargs.get('api_url', \"\")).endswith('1')]\n        cloud_configs = [cfg for cfg in actual if cfg and t.cast('str', cfg.kwargs.get('api_url', \"\")).endswith('2')]\n\n        self.assertEqual(len(research_configs), len(cloud_configs))\n\n    def test_multi_config_partition_single_key(self):\n        configs = [\n            Config.from_dict({\n                'parameters': {\n                    'partition_keys': ['year'],\n                    'target_path': 'download-{}-%d.nc' % level,\n                },\n                'selection': {\n                    'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                    'month': [str(i) for i in range(1, 13)],\n                    'year': [str(i) for i in range(2015, 2021)],\n                    'level': [str(level)]\n                }\n            }) for level in range(500, 901, 100)\n        ]\n\n        actual = self.create_partition_configs(configs)\n\n        self.assertListEqual([d.selection for d in actual], [\n            {**configs[0].selection, **{'year': [str(i)], 'level': [str(level)]}}\n            for level in range(500, 901, 100)\n            for i in range(2015, 2021)\n        ])\n\n    def test_multi_config_partition_single_key_fair_schedule(self):\n        configs = [\n            Config.from_dict({\n                'parameters': {\n                    'partition_keys': ['year'],\n                    'target_path': 'download-{}-%d.nc' % level,\n                },\n                'selection': {\n                    'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                    'month': [str(i) for i in range(1, 13)],\n                    'year': [str(i) for i in range(2015, 2021)],\n                    'level': [str(level)]\n                }\n            }) for level in range(500, 901, 
100)\n        ]\n\n        actual = self.create_partition_configs(configs, schedule='fair')\n\n        self.assertListEqual([d.selection for d in actual], [\n            {**configs[0].selection, **{'year': [str(i)], 'level': [str(level)]}}\n            for i in range(2015, 2021)\n            for level in range(500, 901, 100)\n        ])\n\n    def test_multi_config_partition_multi_params_multi_key(self):\n        config_dicts = [\n            {\n                'parameters': dict(\n                    partition_keys=['year', 'month'],\n                    target_path='download-{}-{}-%d.nc' % level,\n                    research={\n                        'api_key': 'KKKK1',\n                        'api_url': 'UUUU1'\n                    },\n                    cloud={\n                        'api_key': 'KKKK2',\n                        'api_url': 'UUUU2'\n                    },\n                    deepmind={\n                        'api_key': 'KKKK3',\n                        'api_url': 'UUUU3'\n                    }\n                ),\n                'selection': {\n                    'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                    'month': [str(i) for i in range(1, 3)],\n                    'year': [str(i) for i in range(2015, 2017)],\n                    'level': [str(level)]\n                }\n            } for level in range(500, 901, 100)\n        ]\n\n        actual = self.create_partition_configs([Config.from_dict(c) for c in config_dicts])\n\n        combinations = [\n            {\n                'parameters': config['parameters'],\n                'selection': {**config['selection'], **{'year': [str(year)], 'month': [str(month)]}}\n            }\n            for config in config_dicts\n            for year in range(2015, 2017)\n            for month in range(1, 3)\n        ]\n\n        subsections = get_subsections(Config.from_dict(config_dicts[0]))\n        expected = []\n        for config, (name, 
params) in zip(combinations, itertools.cycle(subsections)):\n            config['parameters'].update(params)\n            config['parameters']['subsection_name'] = name\n            expected.append(Config.from_dict(config))\n\n        self.assertListEqual(actual, expected)\n\n    def test_multi_config_partition_multi_params_fair_schedule(self):\n        config_dicts = [\n            {\n                'parameters': dict(\n                    partition_keys=['year'],\n                    target_path='download-{}-%d.nc' % level,\n                    research={\n                        'api_key': 'KKKK1',\n                        'api_url': 'UUUU1'\n                    },\n                    cloud={\n                        'api_key': 'KKKK2',\n                        'api_url': 'UUUU2'\n                    },\n                    deepmind={\n                        'api_key': 'KKKK3',\n                        'api_url': 'UUUU3'\n                    }\n                ),\n                'selection': {\n                    'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                    'month': [str(1)],\n                    'year': [str(i) for i in range(2015, 2017)],\n                    'level': [str(level)]\n                }\n            } for level in range(500, 901, 100)\n        ]\n\n        # Three licenses, with one request per license (see groups)\n        actual = self.create_partition_configs([Config.from_dict(c) for c in config_dicts],\n                                               schedule='fair', n_requests_per=3)\n\n        combinations = [\n            {\n                'parameters': config['parameters'],\n                'selection': {**config['selection'], **{'year': [str(year)]}}\n            }\n            for config in config_dicts\n            for year in range(2015, 2017)\n        ]\n\n        subsections = get_subsections(Config.from_dict(config_dicts[0]))\n        expected = []\n        for config, (name, 
params) in zip(combinations, itertools.cycle(subsections)):\n            config['parameters'].update(params)\n            config['parameters']['subsection_name'] = name\n            expected.append(Config.from_dict(config))\n\n        self.assertListEqual(actual, expected)\n\n    def test_multi_config_partition_multi_params_keys_and_requests_with_fair_schedule(self):\n        self.maxDiff = None\n        config_dicts = [\n            {\n                'parameters': dict(\n                    partition_keys=['year', 'month'],\n                    target_path='download-{}-{}-%d.nc' % level,\n                    research={\n                        'api_key': 'KKKK1',\n                        'api_url': 'UUUU1'\n                    },\n                    cloud={\n                        'api_key': 'KKKK2',\n                        'api_url': 'UUUU2'\n                    },\n                    deepmind={\n                        'api_key': 'KKKK3',\n                        'api_url': 'UUUU3'\n                    }\n                ),\n                'selection': {\n                    'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                    'month': [str(i) for i in range(1, 3)],\n                    'year': [str(i) for i in range(2015, 2017)],\n                    'level': [str(level)]\n                }\n            } for level in range(500, 901, 100)\n        ]\n\n        # Three licenses, with two requests per license (see groups)\n        actual = self.create_partition_configs([Config.from_dict(c) for c in config_dicts],\n                                               schedule='fair', n_requests_per=3 * 2)\n\n        combinations = [\n            {\n                'parameters': config['parameters'],\n                'selection': {**config['selection'], **{'year': [str(year)], 'month': [str(month)]}}\n            }\n            for config in config_dicts\n            for year in range(2015, 2017)\n            for month in 
range(1, 3)\n        ]\n\n        subsections = get_subsections(Config.from_dict(config_dicts[0]))\n        expected = []\n        for config, (name, params) in zip(combinations, itertools.cycle(subsections)):\n            config['parameters'].update(params)\n            config['parameters']['subsection_name'] = name\n            expected.append(Config.from_dict(config))\n\n        self.assertListEqual(actual, expected)\n\n    def test_hdate_partition_single_key(self):\n        config = Config.from_dict({\n                'parameters': {\n                    'partition_keys': ['date'],\n                    'target_path': 'download-{}.nc',\n                },\n                'selection': {\n                    'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                    'date': ['2016-01-04', '2016-01-07', '2016-01-11'],\n                    'hdate': ['1', '2', '3'],\n                }\n            })\n\n        actual = self.create_partition_configs([config])\n\n        expected = [{'date': ['2016-01-04'], 'hdate': ['2015-01-04', '2014-01-04', '2013-01-04']},\n                    {'date': ['2016-01-07'], 'hdate': ['2015-01-07', '2014-01-07', '2013-01-07']},\n                    {'date': ['2016-01-11'], 'hdate': ['2015-01-11', '2014-01-11', '2013-01-11']},\n                    ]\n        self.assertListEqual([d.selection for d in actual], [{**config.selection, **e} for e in expected])\n\n    def test_partition_with_no_keys(self):\n        config = Config.from_dict({\n                'parameters': {\n                    'target_path': 'download.nc',\n                },\n                'selection': {\n                    'features': ['2m_temperature'],\n                    'date': ['2017-01-01', '2016-01-02', '2016-01-03'],\n                }\n            })\n\n        actual = self.create_partition_configs([config])\n\n        expected = [config]\n        self.assertListEqual(actual, expected)\n\n\nclass 
SkipPartitionsTest(unittest.TestCase):\n\n    def setUp(self) -> None:\n        self.mock_store = InMemoryStore()\n        self.dummy_manifest = MockManifest(Location('mock://dummy'))\n\n    def test_skip_partition_missing_force_download(self):\n        config = {\n            'parameters': {\n                'partition_keys': ['year', 'month'],\n                'target_path': 'download-{}-{}.nc',\n            },\n            'selection': {\n                'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                'month': [str(i) for i in range(1, 13)],\n                'year': [str(i) for i in range(2015, 2021)]\n            }\n        }\n\n        config_obj = Config.from_dict(config)\n        actual = skip_partition(config_obj, self.mock_store, self.dummy_manifest)\n\n        self.assertEqual(actual, False)\n\n    def test_skip_partition_force_download_true(self):\n        config = {\n            'parameters': {\n                'partition_keys': ['year', 'month'],\n                'target_path': 'download-{}-{}.nc',\n                'force_download': True\n            },\n            'selection': {\n                'features': ['pressure', 'temperature', 'wind_speed_U', 'wind_speed_V'],\n                'month': [str(i) for i in range(1, 13)],\n                'year': [str(i) for i in range(2015, 2021)]\n            }\n        }\n\n        config_obj = Config.from_dict(config)\n        actual = skip_partition(config_obj, self.mock_store, self.dummy_manifest)\n\n        self.assertEqual(actual, False)\n\n    def test_skip_partition_force_download_false(self):\n        config = {\n            'parameters': {\n                'partition_keys': ['year', 'month'],\n                'target_path': 'download-{}-{}.nc',\n                'force_download': False\n            },\n            'selection': {\n                'features': ['pressure'],\n                'month': ['12'],\n                'year': ['02']\n            }\n        
}\n\n        config_obj = Config.from_dict(config)\n\n        self.mock_store.exists = MagicMock(return_value=True)\n\n        actual = skip_partition(config_obj, self.mock_store, self.dummy_manifest)\n\n        self.assertEqual(actual, True)\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "weather_dl/download_pipeline/pipeline.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Primary ECMWF Downloader Workflow.\"\"\"\nimport argparse\nimport dataclasses\nimport getpass\nimport itertools\nimport logging\nimport os\nimport typing as t\n\nimport apache_beam as beam\nfrom apache_beam.options.pipeline_options import (\n    PipelineOptions,\n    StandardOptions,\n    WorkerOptions,\n)\n\nfrom .clients import CLIENTS\nfrom .config import Config\nfrom .fetcher import Fetcher\nfrom .manifest import (\n    Location,\n    LocalManifest,\n    Manifest,\n    NoOpManifest,\n)\nfrom .parsers import (\n    parse_manifest,\n    process_config,\n    get_subsections,\n    validate_all_configs,\n)\nfrom .partition import PartitionConfig\nfrom .stores import TempFileStore, LocalFileStore\n\nlogger = logging.getLogger(__name__)\n\n\ndef configure_logger(verbosity: int) -> None:\n    \"\"\"Configures logging from verbosity. 
Default verbosity will show errors.\"\"\"\n    level = 40 - verbosity * 10\n    logger = logging.getLogger(__package__)\n    fmt = '%(levelname)s %(asctime)s %(name)s: %(message)s'\n    datefmt = '%y-%m-%d %H:%M:%S'\n    formatter = logging.Formatter(fmt=fmt, datefmt=datefmt)\n    handler = logging.StreamHandler()\n    handler.setFormatter(formatter)\n    logger.root.addHandler(handler)\n    logger.setLevel(level)\n\n\n@dataclasses.dataclass\nclass PipelineArgs:\n    \"\"\"Options for download pipeline.\n\n    Attributes:\n        known_args: Parsed arguments. Includes user-defined args and defaults.\n        pipeline_options: The apache_beam pipeline options.\n        configs: The download configs / data requests.\n        client_name: The type of download client (e.g. Copernicus, Mars, or a fake).\n        store: A Store, which is responsible for where downloads end up.\n        manifest: A Manifest, which records download progress.\n        num_requesters_per_key: Number of requests per subsection (license).\n    \"\"\"\n    known_args: argparse.Namespace\n    pipeline_options: PipelineOptions\n    configs: t.List[Config]\n    client_name: str\n    store: None\n    manifest: Manifest\n    num_requesters_per_key: int\n\n\ndef pipeline(args: PipelineArgs) -> None:\n    \"\"\"Main pipeline entrypoint.\"\"\"\n    import builtins\n    import typing as t\n    logger.info(f\"Using '{args.num_requesters_per_key}' requests per subsection (license).\")\n\n    subsections = get_subsections(args.configs[0])\n\n    # Capping the max number of workers to N i.e. 
possible simultaneous requests + fudge factor\n    if args.pipeline_options.view_as(WorkerOptions).max_num_workers is None:\n        max_num_workers = len(subsections) * args.num_requesters_per_key + 10\n        args.pipeline_options.view_as(WorkerOptions).max_num_workers = max_num_workers\n        logger.info(f\"Capped the max number of workers to '{max_num_workers}'.\")\n\n    request_idxs = {name: itertools.cycle(range(args.num_requesters_per_key)) for name, _ in subsections}\n\n    def subsection_and_request(it: Config) -> t.Tuple[str, int]:\n        subsection = it.subsection_name\n        return subsection, builtins.next(request_idxs[subsection])\n\n    subsections_cycle = itertools.cycle(subsections)\n\n    partition = PartitionConfig(args.store,\n                                subsections_cycle,\n                                args.manifest,\n                                args.known_args.schedule,\n                                args.known_args.partition_chunks,\n                                args.known_args.update_manifest,\n                                len(subsections) * args.num_requesters_per_key)\n\n    with beam.Pipeline(options=args.pipeline_options) as p:\n        partitions = (\n                p\n                | 'Create Configs' >> beam.Create(args.configs)\n                | 'Prepare Partitions' >> partition\n        )\n        # When the --update_manifest flag is passed, the tool will only update the manifest\n        # for already downloaded shards and then exit.\n        if not args.known_args.update_manifest:\n            (\n                partitions\n                | 'GroupBy Request Limits' >> beam.GroupBy(subsection_and_request)\n                | 'Fetch Data' >> beam.ParDo(Fetcher(args.client_name,\n                                                     args.manifest,\n                                                     args.store,\n                                                     args.known_args.log_level))\n            
)\n\n\ndef run(argv: t.List[str], save_main_session: bool = True) -> PipelineArgs:\n    \"\"\"Parse user arguments and configure the pipeline.\"\"\"\n    parser = argparse.ArgumentParser(\n        prog='weather-dl',\n        description='Weather Downloader ingests weather data to cloud storage.'\n    )\n    parser.add_argument('config', type=str, nargs='+',\n                        help=\"path/to/configs.cfg, containing client and data information. Can take multiple configs.\"\n                             \"Accepts *.cfg and *.json files.\")\n    parser.add_argument('-f', '--force-download', action=\"store_true\", default=False,\n                        help=\"Force redownload of partitions that were previously downloaded.\")\n    parser.add_argument('-d', '--dry-run', action='store_true', default=False,\n                        help='Run pipeline steps without _actually_ downloading or writing to cloud storage.')\n    parser.add_argument('-l', '--local-run', action='store_true', default=False,\n                        help=\"Run pipeline locally, downloads to local hard drive.\")\n    parser.add_argument('-m', '--manifest-location', type=Location, default='cli://manifest',\n                        help=\"Location of the manifest. By default, it will use Cloud Logging (stdout for direct \"\n                             \"runner). You can set the name of the manifest as the hostname of a URL with the 'cli' \"\n                             \"protocol. For example, 'cli://manifest' will prefix all the manifest logs as \"\n                             \"'[manifest]'. In addition, users can specify either a Firestore collection URI \"\n                             \"('fs://<my-collection>?projectId=<my-project-id>'), or BigQuery table \"\n                             \"('bq://<project-id>.<dataset-name>.<table-name>') [Note: Tool will create the BQ table \"\n                             \"itself, if not already present. 
Or it will use the existing table but can report errors \"\n                             \"in case of schema mismatch.], or 'noop://<name>' for an in-memory location.\")\n    parser.add_argument('-n', '--num-requests-per-key', type=int, default=-1,\n                        help='Number of concurrent requests to make per API key. '\n                             'Default: make an educated guess per client & config. '\n                             'Please see the client documentation for more details.')\n    parser.add_argument('-p', '--partition-chunks', type=int, default=None,\n                        help='Group shards into chunks of this size when computing the partitions. Specifically, '\n                             'this controls how we chunk elements in a cartesian product, which affects '\n                             \"parallelization of that step. Default: chunks of 1000 elements for 'in-order' scheduling.\"\n                             \" Chunks of 1 element for 'fair' scheduling.\")\n    parser.add_argument('-s', '--schedule', choices=['in-order', 'fair'], default='in-order',\n                        help=\"When using multiple configs, decide how partitions are scheduled: 'in-order' implies \"\n                             \"that partitions will be processed in sequential order of each config; 'fair' means that \"\n                             \"partitions from each config will be interspersed evenly. \"\n                             \"Note: When using 'fair' scheduling, we recommend you set the '--partition-chunks' to a \"\n                             \"much smaller number. Default: 'in-order'.\")\n    parser.add_argument('--check-skip-in-dry-run', action='store_true', default=False,\n                        help=\"To enable file skipping logic in dry-run mode. 
Default: 'false'.\")\n    parser.add_argument('-u', '--update-manifest', action='store_true', default=False,\n                        help=\"Update the manifest for the already downloaded shards and exit. Default: 'false'.\")\n    parser.add_argument('--log-level', type=int, default=2,\n                        help='An integer to configure log level. Default: 2(INFO)')\n    parser.add_argument('--use-local-code', action='store_true', default=False,\n                        help='Supply local code to the Runner.')\n\n    known_args, pipeline_args = parser.parse_known_args(argv[1:])\n\n    configure_logger(known_args.log_level)  # 0 = error, 1 = warn, 2 = info, 3 = debug\n\n    configs = []\n    for cfg in known_args.config:\n        with open(cfg, 'r', encoding='utf-8') as f:\n            # configs/example.cfg -> example.cfg\n            config_name = os.path.split(cfg)[1]\n            config = process_config(f, config_name)\n\n        config.force_download = known_args.force_download\n        config.user_id = getpass.getuser()\n        configs.append(config)\n\n    # This enables support for updating just the manifest for multiple config (*.cfg)\n    # when running the tool with the '-u' or '--update-manifest' flag.\n    if not known_args.update_manifest:\n        validate_all_configs(configs)\n\n    if known_args.check_skip_in_dry_run and not known_args.dry_run:\n        raise RuntimeError('--check-skip-in-dry-run can only be used along with --dry-run flag.')\n\n    # We use the save_main_session option because one or more DoFn's in this\n    # workflow rely on global context (e.g., a module imported at module level).\n    save_main_session_args = ['--save_main_session'] + ['True' if save_main_session else 'False']\n    pipeline_options = PipelineOptions(pipeline_args + save_main_session_args)\n\n    client_name = config.client\n    store = None  # will default to using FileSystems()\n    manifest = parse_manifest(known_args.manifest_location, 
pipeline_options.get_all_options())\n\n    if known_args.dry_run:\n        client_name = 'fake'\n        if not known_args.check_skip_in_dry_run:\n            store = TempFileStore('dry_run')\n            logger.warning('File skipping logic is disabled by default in dry-run mode. '\n                           'To enable please pass the flag --check-skip-in-dry-run along with the '\n                           'dry run flag.')\n            for config in configs:\n                config.force_download = True\n        manifest = NoOpManifest(Location('noop://dry-run'))\n\n    if known_args.local_run:\n        local_dir = '{}/local_run'.format(os.getcwd())\n        store = LocalFileStore(local_dir)\n        pipeline_options.view_as(StandardOptions).runner = 'DirectRunner'\n        manifest = LocalManifest(Location(local_dir))\n\n    num_requesters_per_key = known_args.num_requests_per_key\n    known_args.log_level = 40 - known_args.log_level * 10\n    client = CLIENTS[client_name](configs[0], known_args.log_level)\n    if num_requesters_per_key == -1:\n        num_requesters_per_key = client.num_requests_per_key(config.dataset)\n\n    logger.warning(f'By using {client_name} datasets, '\n                   f'users agree to the terms and conditions specified in {client.license_url!r}')\n\n    return PipelineArgs(\n        known_args, pipeline_options, configs, client_name, store, manifest, num_requesters_per_key\n    )\n"
  },
  {
    "path": "weather_dl/download_pipeline/pipeline_test.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport copy\nimport dataclasses\nimport getpass\nimport os\nimport typing as t\nimport unittest\n\nfrom apache_beam.options.pipeline_options import PipelineOptions\n\nimport weather_dl\nfrom .config import Config\nfrom .manifest import Location, NoOpManifest, LocalManifest, ConsoleManifest\nfrom .pipeline import run, PipelineArgs\nfrom .stores import TempFileStore, LocalFileStore\n\nPATH_TO_CONFIG = os.path.join(os.path.dirname(list(weather_dl.__path__)[0]), 'configs', 'era5_example_config.cfg')\nCONFIG = {\n    'parameters': {'client': 'cds',\n                   'dataset': 'reanalysis-era5-pressure-levels',\n                   'target_path': 'gs://ecmwf-output-test/era5/{year:04d}/{month:02d}/{day:02d}'\n                                  '-pressure-{pressure_level}.nc',\n                   'partition_keys': ['year', 'month', 'day', 'pressure_level'],\n                   'force_download': False,\n                   'user_id': getpass.getuser(),\n                   'api_url': 'https://cds.climate.copernicus.eu/api',\n                   'api_key': '12345:1234567-ab12-34cd-9876-4o4fake90909',  # fake key for testing.\n                   },\n    'selection': {'product_type': 'reanalysis',\n                  'format': 'netcdf',\n                  'variable': ['divergence', 'fraction_of_cloud_cover', 'geopotential'],\n                  'pressure_level': 
['500'],\n                  'year': ['2015', '2016', '2017'], 'month': ['01'],\n                  'day': ['01', '15'],\n                  'time': ['00:00', '06:00', '12:00', '18:00']}\n}\nDEFAULT_ARGS = PipelineArgs(\n    known_args=argparse.Namespace(config=[PATH_TO_CONFIG],\n                                  force_download=False,\n                                  dry_run=False,\n                                  local_run=False,\n                                  manifest_location='cli://manifest',\n                                  num_requests_per_key=-1,\n                                  partition_chunks=None,\n                                  schedule='in-order',\n                                  check_skip_in_dry_run=False,\n                                  update_manifest=False,\n                                  log_level=20,\n                                  use_local_code=False),\n    pipeline_options=PipelineOptions('--save_main_session True'.split()),\n    configs=[Config.from_dict(CONFIG)],\n    client_name='cds',\n    store=None,\n    manifest=ConsoleManifest(Location('cli://manifest')),\n    num_requesters_per_key=5,\n)\n\n\ndef default_args(parameters: t.Optional[t.Dict] = None, selection: t.Optional[t.Dict] = None,\n                 known_args: t.Optional[t.Dict] = None, **kwargs) -> PipelineArgs:\n    if parameters is None:\n        parameters = {}\n    if selection is None:\n        selection = {}\n    if known_args is None:\n        known_args = {}\n    args = dataclasses.replace(DEFAULT_ARGS, **kwargs)\n    temp_config = copy.deepcopy(CONFIG)\n    temp_config['parameters'].update(parameters)\n    temp_config['selection'].update(selection)\n    args.configs = [Config.from_dict(temp_config)]\n    args.configs[0].config_name = 'era5_example_config.cfg'\n    args.configs[0].user_id = getpass.getuser()\n    args.configs[0].force_download = parameters.get('force_download', False)\n    args.known_args = copy.deepcopy(args.known_args)\n    for 
k, v in known_args.items():\n        setattr(args.known_args, k, v)\n    return args\n\n\nclass ParsePipelineArgs(unittest.TestCase):\n    DEFAULT_CMD = f'weather-dl {PATH_TO_CONFIG}'\n\n    def assert_pipeline(self, args, expected):\n        actual = run(args.split())\n        self.assertEqual(vars(actual.known_args), vars(expected.known_args))\n        self.assertEqual(\n            actual.pipeline_options.get_all_options(drop_default=True),\n            expected.pipeline_options.get_all_options(drop_default=True)\n        )\n        self.assertEqual(actual.configs, expected.configs)\n        self.assertEqual(actual.client_name, expected.client_name)\n        self.assertEqual(type(actual.store), type(expected.store))\n        self.assertEqual(actual.manifest, expected.manifest)\n        self.assertEqual(type(actual.manifest), type(expected.manifest))\n        self.assertEqual(actual.num_requesters_per_key, expected.num_requesters_per_key)\n\n    def test_happy_path(self):\n        self.assert_pipeline(self.DEFAULT_CMD, default_args())\n\n    def test_force_download(self):\n        self.assert_pipeline(\n            f'{self.DEFAULT_CMD} -f',\n            default_args(dict(force_download=True), known_args=dict(force_download=True))\n        )\n\n    def test_dry_run(self):\n        self.assert_pipeline(\n            f'{self.DEFAULT_CMD} -d',\n            default_args(\n                dict(force_download=True), known_args=dict(dry_run=True), client_name='fake',\n                store=TempFileStore('dry_run'), manifest=NoOpManifest(Location('noop://dry-run')),\n                num_requesters_per_key=1\n            )\n        )\n\n    def test_local_run(self):\n        self.assert_pipeline(\n            f'{self.DEFAULT_CMD} -l',\n            default_args(\n                known_args=dict(local_run=True), store=LocalFileStore(f'{os.getcwd()}/local_run'),\n                manifest=LocalManifest(Location(f'{os.getcwd()}/local_run')),\n                
pipeline_options=PipelineOptions('--runner DirectRunner --save_main_session True'.split())\n            )\n        )\n\n    def test_update_manifest(self):\n        self.assert_pipeline(\n            f'{self.DEFAULT_CMD} -u',\n            default_args(known_args=dict(update_manifest=True))\n        )\n\n    def test_user_specified_num_requests_per_key(self):\n        self.assert_pipeline(\n            f'{self.DEFAULT_CMD} -n 7',\n            default_args(\n                known_args=dict(num_requests_per_key=7), num_requesters_per_key=7\n            )\n        )\n\n    def test_check_skip_in_dry_run_raise_error_if_dry_run_flag_is_absent(self):\n        with self.assertRaisesRegex(RuntimeError, 'can only be used along with --dry-run flag.'):\n            run(f'{self.DEFAULT_CMD} --check-skip-in-dry-run'.split())\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "weather_dl/download_pipeline/stores.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Download destinations, or `Store`s.\"\"\"\n\nimport abc\nimport io\nimport os\nimport tempfile\nimport typing as t\n\nfrom apache_beam.io.filesystems import FileSystems\n\nfrom .util import retry_with_exponential_backoff\n\n\nclass Store(abc.ABC):\n    \"\"\"A interface to represent where downloads are stored.\n\n     Default implementation uses Apache Beam's Filesystems.\n     \"\"\"\n\n    @abc.abstractmethod\n    def open(self, filename: str, mode: str = 'r') -> t.IO:\n        pass\n\n    @abc.abstractmethod\n    def exists(self, filename: str) -> bool:\n        pass\n\n\nclass InMemoryStore(Store):\n    \"\"\"Store file data in memory.\"\"\"\n\n    def __init__(self):\n        self.store = {}\n\n    def open(self, filename: str, mode: str = 'r') -> t.IO:\n        \"\"\"Create or read in-memory data.\"\"\"\n        if 'b' in mode:\n            file = io.BytesIO()\n        else:\n            file = io.StringIO()\n        self.store[filename] = file\n        return file\n\n    def exists(self, filename: str) -> bool:\n        \"\"\"Return true if the 'file' exists in memory.\"\"\"\n        return filename in self.store\n\n\nclass TempFileStore(Store):\n    \"\"\"Store data into temporary files.\"\"\"\n\n    def __init__(self, directory: t.Optional[str] = None) -> None:\n        \"\"\"Optionally specify the directory that contains all temporary files.\"\"\"\n     
   self.dir = directory\n        if self.dir and not os.path.exists(self.dir):\n            os.makedirs(self.dir)\n\n    def open(self, filename: str, mode: str = 'r') -> t.IO:\n        \"\"\"Create a temporary file in the store directory.\"\"\"\n        return tempfile.TemporaryFile(mode, dir=self.dir)\n\n    def exists(self, filename: str) -> bool:\n        \"\"\"Return true if file exists.\"\"\"\n        return os.path.exists(filename)\n\n\nclass LocalFileStore(Store):\n    \"\"\"Store data into local files.\"\"\"\n\n    def __init__(self, directory: t.Optional[str] = None) -> None:\n        \"\"\"Optionally specify the directory that contains all downloaded files.\"\"\"\n        self.dir = directory\n        if self.dir and not os.path.exists(self.dir):\n            os.makedirs(self.dir)\n\n    def open(self, filename: str, mode: str = 'r') -> t.IO:\n        \"\"\"Open a local file from the store directory.\"\"\"\n        return open(os.sep.join([self.dir, filename]), mode)\n\n    def exists(self, filename: str) -> bool:\n        \"\"\"Returns true if local file exists.\"\"\"\n        return os.path.exists(os.sep.join([self.dir, filename]))\n\n\nclass FSStore(Store):\n    \"\"\"Store data into any store supported by Apache Beam's FileSystems.\"\"\"\n\n    @retry_with_exponential_backoff\n    def open(self, filename: str, mode: str = 'r') -> t.IO:\n        \"\"\"Open object in cloud bucket (or local file system) as a read or write channel.\n\n        To work with cloud storage systems, only a read or write channel can be openend\n        at one time. 
Data will be treated as bytes, not text (equivalent to `rb` or `wb`).\n\n        Further, append operations, or writes on existing objects, are disallowed (the\n        error thrown will depend on the implementation of the underlying cloud provider).\n        \"\"\"\n        if 'r' in mode and 'w' not in mode:\n            return FileSystems().open(filename)\n\n        if 'w' in mode and 'r' not in mode:\n            return FileSystems().create(filename)\n\n        raise ValueError(\n            f\"invalid mode {mode!r}: mode must have either 'r' or 'w', but not both.\"\n        )\n\n    def exists(self, filename: str) -> bool:\n        \"\"\"Returns true if object exists.\"\"\"\n        return FileSystems().exists(filename)\n"
  },
  {
    "path": "weather_dl/download_pipeline/stores_test.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport os\nimport tempfile\nimport unittest\n\nfrom .stores import FSStore\n\n\nclass FSStoreTest(unittest.TestCase):\n\n    def test_writes(self):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            target = f'{tmpdir}/my-file'\n            with FSStore().open(target, 'wb') as f:\n                f.write(b'{\"key\": 1}')\n\n            self.assertTrue(os.path.exists(target))\n\n    def test_reads(self):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            target = f'{tmpdir}/my-file'\n            with open(target, 'w') as f:\n                f.write('data')\n            with FSStore().open(target, 'rb') as f:\n                self.assertEqual(f.readlines(), [b'data'])\n\n    def test_reads__default_argument(self):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            target = f'{tmpdir}/my-file'\n            with open(target, 'w') as f:\n                f.write('data')\n            with FSStore().open(target) as f:\n                self.assertEqual(f.readlines(), [b'data'])\n\n    def test_asserts_bad_mode__both(self):\n        with self.assertRaisesRegex(ValueError, \"invalid mode 'rw':\"):\n            with tempfile.TemporaryDirectory() as tmpdir:\n                target = f'{tmpdir}/my-file'\n                with FSStore().open(target, 'rw') as f:\n                    f.read()\n\n    def 
test_asserts_bad_mode__neither(self):\n        with self.assertRaisesRegex(ValueError, \"invalid mode '':\"):\n            with tempfile.TemporaryDirectory() as tmpdir:\n                target = f'{tmpdir}/my-file'\n                with FSStore().open(target, '') as f:\n                    f.read()\n"
  },
  {
    "path": "weather_dl/download_pipeline/util.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport datetime\nfrom dateutil.relativedelta import relativedelta\nimport geojson\nimport hashlib\nimport itertools\nimport logging\nimport os\nimport shutil\nimport socket\nimport subprocess\nimport sys\nimport typing as t\n\nimport numpy as np\nimport pandas as pd\nfrom apache_beam.io.gcp import gcsio\nfrom apache_beam.utils import retry\nfrom xarray.core.utils import ensure_us_time_resolution\nfrom urllib.parse import urlparse\nfrom google.api_core.exceptions import BadRequest\n\nlogger = logging.getLogger(__name__)\n\nLATITUDE_RANGE = (-90, 90)\nLONGITUDE_RANGE = (-180, 180)\nGLOBAL_COVERAGE_AREA = [90, -180, -90, 180]\n\n\ndef _retry_if_valid_input_but_server_or_socket_error_and_timeout_filter(exception) -> bool:\n    if isinstance(exception, socket.timeout):\n        return True\n    if isinstance(exception, TimeoutError):\n        return True\n    # To handle the concurrency issue in BigQuery.\n    if isinstance(exception, BadRequest):\n        return True\n    return retry.retry_if_valid_input_but_server_error_and_timeout_filter(exception)\n\n\nclass _FakeClock:\n    def sleep(self, value):\n        pass\n\n\ndef retry_with_exponential_backoff(fun):\n    \"\"\"A retry decorator that doesn't apply during test time.\"\"\"\n    clock = retry.Clock()\n\n    # Use a fake clock only during test time...\n    if 'unittest' in sys.modules.keys():\n        clock = 
_FakeClock()\n\n    return retry.with_exponential_backoff(\n        retry_filter=_retry_if_valid_input_but_server_or_socket_error_and_timeout_filter,\n        clock=clock,\n    )(fun)\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef ichunked(iterable: t.Iterable, n: int) -> t.Iterator[t.Iterable]:\n    \"\"\"Yield evenly-sized chunks from an iterable.\"\"\"\n    input_ = iter(iterable)\n    try:\n        while True:\n            it = itertools.islice(input_, n)\n            # peek to check if 'it' has next item.\n            first = next(it)\n            yield itertools.chain([first], it)\n    except StopIteration:\n        pass\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef copy(src: str, dst: str) -> None:\n    \"\"\"Copy data via `gsutil` or local filesystem.\"\"\"\n    is_gs = src.startswith(\"gs://\") or dst.startswith(\"gs://\")\n    try:\n        if is_gs:\n            subprocess.run(['gcloud', 'storage', 'cp', src, dst], check=True,\n                           capture_output=True, text=True, input=\"n/n\")\n        else:\n            os.makedirs(os.path.dirname(dst) or '.', exist_ok=True)\n            shutil.copy(src, dst)\n    except Exception as e:\n        error_detail = getattr(e, \"stderr\", str(e)).strip()\n        msg = f\"Failed to copy {src!r} to {dst!r} due to {error_detail}\"\n        logger.error(msg)\n        raise EnvironmentError(msg) from e\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef to_json_serializable_type(value: t.Any) -> t.Any:\n    \"\"\"Returns the value with a type serializable to JSON\"\"\"\n    # Note: The order of processing is significant.\n    logger.debug('Serializing to JSON')\n\n    if pd.isna(value) or value is None:\n        return None\n    elif np.issubdtype(type(value), np.floating):\n        return float(value)\n    elif isinstance(value, np.ndarray):\n        # Will return a scaler if array is of size 1, else will return a list.\n        return value.tolist()\n    
elif isinstance(value, datetime.datetime) or isinstance(value, str) or isinstance(value, np.datetime64):\n        # Assume strings are ISO format timestamps...\n        try:\n            value = datetime.datetime.fromisoformat(value)\n        except ValueError:\n            # ... if they are not, assume serialization is already correct.\n            return value\n        except TypeError:\n            # ... maybe value is a numpy datetime ...\n            try:\n                value = ensure_us_time_resolution(value).astype(datetime.datetime)\n            except AttributeError:\n                # ... value is a datetime object, continue.\n                pass\n\n        # We use a string timestamp representation.\n        if value.tzname():\n            return value.isoformat()\n\n        # We assume here that naive timestamps are in UTC timezone.\n        return value.replace(tzinfo=datetime.timezone.utc).isoformat()\n    elif isinstance(value, np.timedelta64):\n        # Return time delta in seconds.\n        return float(value / np.timedelta64(1, 's'))\n    # This check must happen after processing np.timedelta64 and np.datetime64.\n    elif np.issubdtype(type(value), np.integer):\n        return int(value)\n\n    return value\n\n\ndef fetch_geo_polygon(area: t.Union[list, str]) -> str:\n    \"\"\"Calculates a geography polygon from an input area.\"\"\"\n    # Ref: https://confluence.ecmwf.int/pages/viewpage.action?pageId=151520973\n    if isinstance(area, str):\n        # European area\n        if area == 'E':\n            area = [73.5, -27, 33, 45]\n        # Global area\n        elif area == 'G':\n            area = GLOBAL_COVERAGE_AREA\n        else:\n            raise RuntimeError(f'Not a valid value for area in config: {area}.')\n\n    n, w, s, e = [float(x) for x in area]\n    if s < LATITUDE_RANGE[0]:\n        raise ValueError(f\"Invalid latitude value for south: '{s}'\")\n    if n > LATITUDE_RANGE[1]:\n        raise ValueError(f\"Invalid latitude value 
for north: '{n}'\")\n    if w < LONGITUDE_RANGE[0]:\n        raise ValueError(f\"Invalid longitude value for west: '{w}'\")\n    if e > LONGITUDE_RANGE[1]:\n        raise ValueError(f\"Invalid longitude value for east: '{e}'\")\n\n    # Define the coordinates of the bounding box.\n    coords = [[w, n], [w, s], [e, s], [e, n], [w, n]]\n\n    # Create the GeoJSON polygon object.\n    polygon = geojson.dumps(geojson.Polygon([coords]))\n    return polygon\n\n\ndef get_file_size(path: str) -> float:\n    parsed_gcs_path = urlparse(path)\n    if parsed_gcs_path.scheme != 'gs' or parsed_gcs_path.netloc == '':\n        return os.stat(path).st_size / (1024 ** 3) if os.path.exists(path) else 0\n    else:\n        return gcsio.GcsIO().size(path) / (1024 ** 3) if gcsio.GcsIO().exists(path) else 0\n\n\ndef get_wait_interval(num_retries: int = 0) -> float:\n    \"\"\"Returns next wait interval in seconds, using an exponential backoff algorithm.\"\"\"\n    if 0 == num_retries:\n        return 0\n    return 2 ** num_retries\n\n\ndef generate_md5_hash(input: str) -> str:\n    \"\"\"Generates md5 hash for the input string.\"\"\"\n    return hashlib.md5(input.encode('utf-8')).hexdigest()\n\n\ndef download_with_aria2(url: str, path: str) -> None:\n    \"\"\"Downloads a file from the given URL using the `aria2c` command-line utility,\n    with options set to improve download speed and reliability.\"\"\"\n    dir_path, file_name = os.path.split(path)\n    try:\n        subprocess.run(\n            ['aria2c', '-x', '16', '-s', '16', url, '-d', dir_path, '-o', file_name, '--allow-overwrite'],\n            check=True,\n            capture_output=True)\n    except subprocess.CalledProcessError as e:\n        logger.error(f'Failed download from server {url!r} to {path!r} due to {e.stderr.decode(\"utf-8\")}')\n        raise\n\n\ndef generate_hdate(date: str, subtract_year: str) -> str:\n    \"\"\"Generate a historical date by subtracting a specified number of years from the given date.\n    
If input date is leap day (Feb 29), return Feb 28 even if target hdate is also a leap year.\n    This is expected in ECMWF API.\n\n    Args:\n        date (str): The input date in the format 'YYYY-MM-DD'.\n        subtract_year (str): The number of years to subtract.\n\n    Returns:\n        str: The historical date in the format 'YYYY-MM-DD'.\n    \"\"\"\n    try:\n        input_date = datetime.datetime.strptime(date, \"%Y-%m-%d\")\n        # Check for leap day\n        if input_date.month == 2 and input_date.day == 29:\n            input_date = input_date - datetime.timedelta(days=1)\n        subtract_year = int(subtract_year)\n    except (ValueError, TypeError):\n        logger.error(\"Invalid input.\")\n        raise\n\n    hdate = input_date - relativedelta(years=subtract_year)\n    return hdate.strftime(\"%Y-%m-%d\")\n"
  },
  {
    "path": "weather_dl/download_pipeline/util_test.py",
    "content": "import unittest\n\nfrom .util import fetch_geo_polygon, generate_hdate, ichunked\n\n\n# TODO(#245): Duplicate tests; remove.\nclass IChunksTests(unittest.TestCase):\n    def setUp(self) -> None:\n        self.items = range(20)\n\n    def test_even_chunks(self):\n        actual = []\n        for chunk in ichunked(self.items, 4):\n            actual.append(list(chunk))\n\n        self.assertEqual(actual, [\n            [0, 1, 2, 3],\n            [4, 5, 6, 7],\n            [8, 9, 10, 11],\n            [12, 13, 14, 15],\n            [16, 17, 18, 19],\n        ])\n\n    def test_odd_chunks(self):\n        actual = []\n        for chunk in ichunked(self.items, 7):\n            actual.append(list(chunk))\n\n        self.assertEqual(actual, [\n            [0, 1, 2, 3, 4, 5, 6],\n            [7, 8, 9, 10, 11, 12, 13],\n            [14, 15, 16, 17, 18, 19]\n        ])\n\n\nclass TestFetchGeoPolygon(unittest.TestCase):\n    def test_valid_area(self):\n        # Test with valid area values.\n        area = ['40', '-75', '39', '-74']\n        expected_result = (\n            '{\"type\": \"Polygon\", \"coordinates\": '\n            '[[[-75.0, 40.0], [-75.0, 39.0], [-74.0, 39.0], [-74.0, 40.0], [-75.0, 40.0]]]}'\n        )\n        self.assertEqual(fetch_geo_polygon(area), expected_result)\n\n    def test_valid_string_area_value(self):\n        # Test with valid string area value.\n        area = 'E'\n        expected_result = (\n            '{\"type\": \"Polygon\", \"coordinates\": '\n            '[[[-27.0, 73.5], [-27.0, 33.0], [45.0, 33.0], [45.0, 73.5], [-27.0, 73.5]]]}'\n        )\n        self.assertEqual(fetch_geo_polygon(area), expected_result)\n\n    def test_invalid_string_area_value(self):\n        # Test with invalid string area value.\n        area = 'B'\n        with self.assertRaises(RuntimeError):\n            fetch_geo_polygon(area)\n\n    def test_invalid_latitude_south(self):\n        # Test with invalid south latitude value\n        area = [40, 
-75, -91, -74]\n        with self.assertRaises(ValueError):\n            fetch_geo_polygon(area)\n\n    def test_invalid_latitude_north(self):\n        # Test with invalid north latitude value\n        area = [91, -75, 39, -74]\n        with self.assertRaises(ValueError):\n            fetch_geo_polygon(area)\n\n    def test_invalid_longitude_west(self):\n        # Test with invalid west longitude value\n        area = [40, -181, 39, -74]\n        with self.assertRaises(ValueError):\n            fetch_geo_polygon(area)\n\n    def test_invalid_longitude_east(self):\n        # Test with invalid east longitude value\n        area = [40, -75, 39, 181]\n        with self.assertRaises(ValueError):\n            fetch_geo_polygon(area)\n\n\nclass TestGenerateHdate(unittest.TestCase):\n    def test_valid_hdate(self):\n        date = '2020-01-02'\n        substract_year = '4'\n        expected_result = '2016-01-02'\n        self.assertEqual(generate_hdate(date, substract_year), expected_result)\n\n        # Also test for leap day correctness\n        date = '2020-02-29'\n        substract_year = '3'\n        expected_result = '2017-02-28'\n        self.assertEqual(generate_hdate(date, substract_year), expected_result)\n\n        substract_year = '4'\n        expected_result = '2016-02-28'\n        self.assertEqual(generate_hdate(date, substract_year), expected_result)\n"
  },
  {
    "path": "weather_dl/setup.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom setuptools import setup, find_packages\n\nbeam_gcp_requirements = [\n    \"google-cloud-bigquery==2.34.4\",\n    \"google-cloud-bigquery-storage==2.14.1\",\n    \"google-cloud-bigtable==1.7.2\",\n    \"google-cloud-core==1.7.3\",\n    \"google-cloud-datastore==1.15.5\",\n    \"google-cloud-dlp==3.8.0\",\n    \"google-cloud-language==1.3.2\",\n    \"google-cloud-pubsub==2.13.4\",\n    \"google-cloud-pubsublite==1.4.2\",\n    \"google-cloud-recommendations-ai==0.2.0\",\n    \"google-cloud-spanner==1.19.3\",\n    \"google-cloud-videointelligence==1.16.3\",\n    \"google-cloud-vision==1.0.2\",\n    \"apache-beam[gcp]==2.40.0\",\n]\n\nbase_requirements = [\n    \"ecmwf-api-client==1.6.3\",\n    \"numpy>=1.19.1\",\n    \"pandas==1.5.1\",\n    \"xarray==2023.1.0\",\n    \"requests>=2.24.0\",\n    \"urllib3==1.26.5\",\n    \"google-cloud-firestore==2.6.0\",\n    \"firebase-admin==6.0.1\",\n    # \"gcloud\" should already be installed in the host image.\n    # If we install it here, we'll hit auth issues.\n]\n\nsetup(\n    name='download_pipeline',\n    packages=find_packages(),\n    version='0.1.28',\n    author='Anthromets',\n    author_email='anthromets-ecmwf@google.com',\n    url='https://weather-tools.readthedocs.io/en/latest/weather_dl/',\n    description='A tool to download weather data.',\n    install_requires=beam_gcp_requirements + base_requirements,\n)\n"
  },
  {
    "path": "weather_dl/weather-dl",
    "content": "#!/usr/bin/env python3\n# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport glob\nimport logging\nimport os\nimport subprocess\nimport sys\nimport tarfile\nimport tempfile\n\nimport weather_dl\n\nSDK_CONTAINER_IMAGE='gcr.io/weather-tools-prod/weather-tools:0.0.0'\n\nif __name__ == '__main__':\n    logging.getLogger().setLevel(logging.INFO)\n\n    site_pkg = weather_dl.__path__[0]\n\n    try:\n        from download_pipeline import cli\n    except ImportError:\n        # Install the subpackage.\n        subprocess.check_call(f'{sys.executable} -m pip -q install -e {site_pkg}'.split())\n\n        # Re-load sys.path\n        import site\n        from importlib import reload\n\n        reload(site)\n\n        # Re-attempt import. 
If this fails, the user probably has an older version of\n        # the package already installed on their machine that breaks this process.\n        # If that's the case, it's best to start from a clean virtual environment.\n        try:\n            from download_pipeline import cli\n        except ImportError as e:\n            raise ImportError('please re-install package in a clean python environment.') from e\n        \n    args = []\n\n    if \"DataflowRunner\" in sys.argv and  \"--sdk_container_image\" not in sys.argv:\n        args.extend(['--sdk_container_image',\n             os.getenv('SDK_CONTAINER_IMAGE', SDK_CONTAINER_IMAGE),\n             '--experiments',\n             'use_runner_v2'])\n        \n    if \"--use-local-code\" in sys.argv:\n        with tempfile.TemporaryDirectory() as tmpdir:\n            original_dir = os.getcwd()\n\n            # Convert subpackage to a tarball\n            os.chdir(site_pkg)\n            subprocess.check_call(\n                f'{sys.executable} ./setup.py -q sdist --dist-dir {tmpdir}'.split(),\n            )\n\n            os.chdir(original_dir)\n\n            # Set tarball as extra packages for Beam.\n            pkg_archive = glob.glob(os.path.join(tmpdir, '*.tar.gz'))[0]\n\n            with tarfile.open(pkg_archive, 'r') as tar:\n                assert any([f.endswith('.py') for f in tar.getnames()]), 'extra_package must include python files!'\n\n            # cleanup memory to prevent pickling error.\n            tar = None\n            weather_dl = None\n            args.extend(['--extra_package', pkg_archive])\n            cli(args)\n    else:\n        cli(args)\n"
  },
  {
    "path": "weather_dl_v2/README.md",
    "content": "## weather-dl-v2\n\n<!-- TODO: Create a setup script to ease deployment process. -->\n\n> **_NOTE:_**   weather-dl-v2 only supports Python 3.10\n\n### Sequence of steps:\n1) Refer to downloader_kubernetes/README.md\n2) Refer to license_deployment/README.md\n3) Refer to fastapi-server/README.md\n4) Refer to cli/README.md\n\n\n### To create Docker images of services\n\n```\nexport PROJECT_ID=<your-project-here>\nexport REPO=<repo>  # e.g. weather-tools\n```\n\nChoose any ONE service from below:\n\n1. CLI\n```\nexport SERVICE=weather-dl-v2-cli\nexport FOLDER=cli\n```\n\n2. weather-dl-v2-downloader\n```\nexport SERVICE=weather-dl-v2-downloader\nexport FOLDER=downloader_kubernetes\n```\n\n3. weather-dl-v2-license-dep\n```\nexport SERVICE=weather-dl-v2-license-dep\nexport FOLDER=license_deployment\n```\n\n4. weather-dl-v2-server\n```\nexport SERVICE=weather-dl-v2-server\nexport FOLDER=fastapi-server\n```\n\nFinally run:\n```\nexport VER=$(cat $FOLDER/VERSION.txt)\n\ngcloud builds submit --config=cloudbuild.yml --substitutions=_PROJECT_ID=$PROJECT_ID,_REPO=$REPO,_SERVICE=$SERVICE,_VER=$VER,_FOLDER=$FOLDER\n```"
  },
  {
    "path": "weather_dl_v2/__init__.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "weather_dl_v2/cli/CLI-Documentation.md",
    "content": "# CLI Documentation\nThe following doc provides cli commands and their various arguments and options.\n\nBase Command:\n```\nweather-dl-v2\n```\n\n## Ping\nPing the FastAPI server and check if it’s live and reachable.\n\n <summary><code>weather-dl-v2 ping</code></summary>\n\n##### Usage\n```\nweather-dl-v2 ping\n```\n\n\n<br>\n\n## Download\nManage download configs.\n\n\n### Add Downloads\n <summary><code>weather-dl-v2 download add</code> <br>\n Adds a new download config to specific licenses.  \n </summary>\n\n\n##### Arguments\n> `FILE_PATH` : Path to config file.\n\n##### Options\n> `-l/--license` (Required): License ID to which this download has to be added to.  \n> `-f/--force-download` : Force redownload of partitions that were previously downloaded.\n\n##### Usage\n```\nweather-dl-v2 download add /path/to/example.cfg -l L1 -l L2 [--force-download]\n```\n\n### List Downloads\n <summary><code>weather-dl-v2 download list</code> <br>\n List all the active downloads.\n </summary>\n\nThe list can also be filtered by client name or status.  \nAvailable filters:\n```\nFilter Key: client_name\nValues: cds, mars, ecpublic\n\nFilter Key: status\nValues: completed, failed, in-progress\n``` \n\n##### Options\n> `--filter` : Filter the list by some key and value. 
Format of filter filter_key=filter_value\n\n##### Usage\n```\nweather-dl-v2 download list\nweather-dl-v2 download list --filter client_name=cds\nweather-dl-v2 download list --filter status=success\nweather-dl-v2 download list --filter status=failed\nweather-dl-v2 download list --filter status=in-progress\nweather-dl-v2 download list --filter client_name=cds --filter status=success\n```\n\n### Download Get\n <summary><code>weather-dl-v2 download get</code> <br>\n Get a particular download by config name.\n </summary>\n\n##### Arguments\n> `CONFIG_NAME` : Name of the download config.\n\n##### Usage\n```\nweather-dl-v2 download get example.cfg\n```\n\n### Download Show\n <summary><code>weather-dl-v2 download show</code> <br>\n Get contents of a particular config by config name.\n </summary>\n\n##### Arguments\n> `CONFIG_NAME` : Name of the download config.\n\n##### Usage\n```\nweather-dl-v2 download show example.cfg\n```\n\n### Download Remove\n <summary><code>weather-dl-v2 download remove</code> <br>\n Remove a download by config name.\n </summary>\n\n##### Arguments\n> `CONFIG_NAME` : Name of the download config.\n\n##### Usage\n```\nweather-dl-v2 download remove example.cfg\n```\n\n### Download Refetch\n <summary><code>weather-dl-v2 download refetch</code> <br>\n Refetch all non-successful partitions of a config.\n </summary>\n\n##### Arguments\n> `CONFIG_NAME` : Name of the download config.\n\n##### Options\n> `-l/--license` (Required): License ID to which this download has to be added to.\n\n##### Usage\n```\nweather-dl-v2 download refetch example.cfg -l L1 -l L2\n```\n\n<br>\n\n## License\nManage licenses.\n\n### License Add\n <summary><code>weather-dl-v2 license add</code> <br>\n Add a new license. 
New licenses are added using a json file.\n </summary>\n\nThe json file should be in this format:\n```\n{\n\t\"license_id\": <license_id>,\n\t\"client_name\": <client_name>,\n\t\"number_of_requests\": <number_of_request>,\n\t\"secret_id\": <secret_manager_id>\n}\n```\nNOTE: `license_id` is case insensitive and has to be unique for each license.\n\n\n##### Arguments\n> `FILE_PATH` : Path to the license json.\n\n##### Usage\n```\nweather-dl-v2 license add /path/to/new-license.json\n```\n\n### License Get\n <summary><code>weather-dl-v2 license get</code> <br>\n Get a particular license by license ID.\n </summary>\n\n##### Arguments\n> `LICENSE` : License ID of the license to be fetched.\n\n##### Usage\n```\nweather-dl-v2 license get L1\n```\n\n### License Remove\n <summary><code>weather-dl-v2 license remove</code> <br>\n Remove a particular license by license ID.\n </summary>\n\n##### Arguments\n> `LICENSE` : License ID of the license to be removed.\n\n##### Usage\n```\nweather-dl-v2 license remove L1\n```\n\n### License List\n <summary><code>weather-dl-v2 license list</code> <br>\n List all the licenses available. \n </summary>\n\n The list can also be filtered by client name.\n\n##### Options\n> `--filter` : Filter the list by some key and value. Format of filter filter_key=filter_value.\n\n##### Usage\n```\nweather-dl-v2 license list\nweather-dl-v2 license list --filter client_name=cds\n```\n\n### License Update\n <summary><code>weather-dl-v2 license update</code> <br>\n Update an existing license using License ID and a license json.\n </summary>\n\n The json should be of the same format used to add a new license.\n\n##### Arguments\n> `LICENSE` : License ID of the license to be edited.  
\n> `FILE_PATH` : Path to the license json.\n\n##### Usage\n```\nweather-dl-v2 license update L1 /path/to/license.json\n```\n\n<br>\n\n## Queue\nManage all the license queues.\n\n### Queue List\n <summary><code>weather-dl-v2 queue list</code> <br>\n List all the queues.\n </summary>\n\n The list can also be filtered by client name.\n\n##### Options\n> `--filter` : Filter the list by some key and value. Format of filter filter_key=filter_value.\n\n##### Usage\n```\nweather-dl-v2 queue list\nweather-dl-v2 queue list --filter client_name=cds\n```\n\n### Queue Get\n <summary><code>weather-dl-v2 queue get</code> <br>\n Get a queue by license ID.\n </summary>\n\n The list can also be filtered by client name.\n\n##### Arguments\n> `LICENSE` : License ID of the queue to be fetched.\n\n##### Usage\n```\nweather-dl-v2 queue get L1\n```\n\n### Queue Edit\n <summary><code>weather-dl-v2 queue edit</code> <br>\n Edit the priority of configs inside queues.\n </summary>\n\nPriority can be edited in two ways:\n1. The new priority queue is passed using a priority json file that should follow the following format:\n```\n{\n\t\"priority\": [\"c1.cfg\", \"c3.cfg\", \"c2.cfg\"]\n}\n```\n2. A config file name and its absolute priority can be passed and it updates the priority for that particular config file in the mentioned license queue.\n\n##### Arguments\n> `LICENSE` : License ID of queue to be edited.\n\n##### Options\n> `-f/--file` : Path of the new priority json file.  \n> `-c/--config` : Config name for absolute priority.  \n> `-p/--priority`: Absolute priority for the config in a license queue. 
Lower values mean higher priority, with 0 having the highest priority.\n\n##### Usage\n```\nweather-dl-v2 queue edit L1 --file /path/to/priority.json\nweather-dl-v2 queue edit L1 --config example.cfg --priority 0\n```\n\n<br>\n\n## Config\nConfigurations for cli.\n\n### Config Show IP\n <summary><code>weather-dl-v2 config show-ip</code> <br>\nSee the current server IP address.\n </summary>\n\n##### Usage\n```\nweather-dl-v2 config show-ip\n```\n\n### Config Set IP\n <summary><code>weather-dl-v2 config set-ip</code> <br>\nUpdate the server IP address.\n </summary>\n\n##### Arguments\n> `NEW_IP` : New IP address. (Do not add port or protocol).\n\n##### Usage\n```\nweather-dl-v2 config set-ip 127.0.0.1\n```\n\n"
  },
  {
    "path": "weather_dl_v2/cli/Dockerfile",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nFROM continuumio/miniconda3:latest\n\nCOPY . .\n\n# Add the mamba solver for faster builds\nRUN conda install -n base conda-libmamba-solver\nRUN conda config --set solver libmamba\n\n# Create conda env using environment.yml\nRUN conda update conda -y\nRUN conda env create --name weather-dl-v2-cli --file=environment.yml\n\n# Activate the conda env and update the PATH\nARG CONDA_ENV_NAME=weather-dl-v2-cli\nRUN echo \"source activate ${CONDA_ENV_NAME}\" >> ~/.bashrc\nENV PATH /opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH\n\nRUN apt-get update -y\nRUN apt-get install nano -y\nRUN apt-get install vim -y\nRUN apt-get install curl -y\n\n# Install gsutil \nRUN curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-443.0.0-linux-arm.tar.gz\nRUN tar -xf google-cloud-cli-443.0.0-linux-arm.tar.gz\nRUN ./google-cloud-sdk/install.sh --quiet\nRUN echo \"if [ -f '/google-cloud-sdk/path.bash.inc' ]; then . '/google-cloud-sdk/path.bash.inc'; fi\" >> /root/.bashrc\nRUN echo \"if [ -f '/google-cloud-sdk/completion.bash.inc' ]; then . '/google-cloud-sdk/completion.bash.inc'; fi\" >> /root/.bashrc\n"
  },
  {
    "path": "weather_dl_v2/cli/README.md",
    "content": "# weather-dl-cli\nThis is a command line interface for talking to the weather-dl-v2 FastAPI server.\n\n- Due to our org level policy we can't expose external-ip using LoadBalancer Service\nwhile deploying our FastAPI server. Hence we need to deploy the CLI on a VM to interact\nthrough our fastapi server.\n\nReplace the FastAPI server pod's IP in cli_config.json.\n```\nPlease make approriate changes in cli_config.json, if required.\n```\n> Note: Command to get the Pod IP : `kubectl get pods -o wide`.\n>\n> Though note that in case of Pod restart IP might get change. So we need to look\n> for better solution for the same.\n\n## Create docker image for weather-dl-cli \nRefer instructions in weather_dl_v2/README.md\n\n## Create a VM using above created docker-image\n```\nexport ZONE=<zone> eg: us-west1-a\nexport SERVICE_ACCOUNT=<service account> # Let's keep this as Compute Engine Default Service Account\nexport IMAGE_PATH=<container-image-path> # The above created image-path\n\ngcloud compute instances create-with-container weather-dl-v2-cli \\\n    --project=$PROJECT_ID \\\n    --zone=$ZONE \\\n    --machine-type=e2-medium \\\n    --network-interface=network-tier=PREMIUM,subnet=default \\\n    --maintenance-policy=MIGRATE \\\n    --provisioning-model=STANDARD \\\n    --service-account=$SERVICE_ACCOUNT \\\n    --scopes=https://www.googleapis.com/auth/cloud-platform \\\n    --tags=http-server,https-server \\\n    --image=projects/cos-cloud/global/images/cos-stable-105-17412-101-24 \\\n    --boot-disk-size=10GB \\\n    --boot-disk-type=pd-balanced \\\n    --boot-disk-device-name=weather-dl-v2-cli \\\n    --container-image=$IMAGE_PATH \\\n    --container-restart-policy=on-failure \\\n    --container-tty \\\n    --no-shielded-secure-boot \\\n    --shielded-vtpm \\\n    --labels=goog-ec-src=vm_add-gcloud,container-vm=cos-stable-105-17412-101-24 \\\n    --metadata-from-file=startup-script=vm-startup.sh\n```\n\n## Use the cli after doing ssh in the above 
created VM\n```\nweather-dl-v2 --help\n```\n"
  },
  {
    "path": "weather_dl_v2/cli/VERSION.txt",
    "content": "1.0.4"
  },
  {
    "path": "weather_dl_v2/cli/app/__init__.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "weather_dl_v2/cli/app/cli_config.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport dataclasses\nimport json\nimport typing as t\n\nimport pkg_resources\n\nValues = t.Union[t.List[\"Values\"], t.Dict[str, \"Values\"], bool, int, float, str]  # pytype: disable=not-supported-yet\n\n\n@dataclasses.dataclass\nclass CliConfig:\n    pod_ip: str = \"\"\n    port: str = \"\"\n\n    @property\n    def BASE_URI(self) -> str:\n        # If pod IP is not present assume in dev environment.\n        if(self.pod_ip == \"<pod_ip>\"):\n            return \"http://127.0.0.1:8000\"\n        return f\"http://{self.pod_ip}:{self.port}\"\n\n    kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict)\n\n    @classmethod\n    def from_dict(cls, config: t.Dict):\n        config_instance = cls()\n\n        for key, value in config.items():\n            if hasattr(config_instance, key):\n                setattr(config_instance, key, value)\n            else:\n                config_instance.kwargs[key] = value\n\n        return config_instance\n\n\ncli_config = None\n\n\ndef get_config():\n    global cli_config\n\n    cli_config_json = pkg_resources.resource_filename('app', 'data/cli_config.json')\n\n    if cli_config is None:\n        with open(cli_config_json) as file:\n            firestore_dict = json.load(file)\n            cli_config = CliConfig.from_dict(firestore_dict)\n\n    return cli_config\n"
  },
  {
    "path": "weather_dl_v2/cli/app/data/cli_config.json",
    "content": "{\n    \"pod_ip\": \"<pod_ip>\",\n    \"port\": 8080\n}\n"
  },
  {
    "path": "weather_dl_v2/cli/app/main.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport logging\n\nimport requests\nimport typer\n\nfrom app.cli_config import get_config\nfrom app.subcommands import config, download, license, queue\nfrom app.utils import Loader\n\nlogger = logging.getLogger(__name__)\n\napp = typer.Typer(\n    help=\"weather-dl-v2 is a cli tool for communicating with FastAPI server.\"\n)\n\napp.add_typer(download.app, name=\"download\", help=\"Manage downloads.\")\napp.add_typer(queue.app, name=\"queue\", help=\"Manage queues.\")\napp.add_typer(license.app, name=\"license\", help=\"Manage licenses.\")\napp.add_typer(config.app, name=\"config\", help=\"Configurations for cli.\")\n\n\n@app.command(\"ping\", help=\"Check if FastAPI server is live and rechable.\")\ndef ping():\n    uri = f\"{get_config().BASE_URI}/\"\n    try:\n        with Loader(\"Sending request...\"):\n            x = requests.get(uri)\n    except Exception as e:\n        print(f\"error {e}\")\n        return\n    print(x.text)\n\n\nif __name__ == \"__main__\":\n    app()\n"
  },
  {
    "path": "weather_dl_v2/cli/app/services/__init__.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "weather_dl_v2/cli/app/services/download_service.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport abc\nimport json\nimport logging\nimport typing as t\n\nfrom app.cli_config import get_config\nfrom app.services.network_service import network_service\n\nlogger = logging.getLogger(__name__)\n\n\nclass DownloadService(abc.ABC):\n\n    @abc.abstractmethod\n    def _list_all_downloads(self):\n        pass\n\n    @abc.abstractmethod\n    def _list_all_downloads_by_filter(self, filter_dict: dict):\n        pass\n\n    @abc.abstractmethod\n    def _get_download_by_config(self, config_name: str):\n        pass\n\n    @abc.abstractmethod\n    def _show_config_content(self, config_name: str):\n        pass\n\n    @abc.abstractmethod\n    def _add_new_download(\n        self, file_path: str, licenses: t.List[str], force_download: bool, priority: int | None\n    ):\n        pass\n\n    @abc.abstractmethod\n    def _remove_download(self, config_name: str):\n        pass\n\n    @abc.abstractmethod\n    def _refetch_config_partitions(self, config_name: str, licenses: t.List[str], only_failed: bool):\n        pass\n\n\nclass DownloadServiceNetwork(DownloadService):\n\n    def __init__(self):\n        self.endpoint = f\"{get_config().BASE_URI}/download\"\n\n    def _list_all_downloads(self):\n        return network_service.get(\n            uri=self.endpoint, header={\"accept\": \"application/json\"}\n        )\n\n    def _list_all_downloads_by_filter(self, filter_dict: 
dict):\n        return network_service.get(\n            uri=self.endpoint,\n            header={\"accept\": \"application/json\"},\n            query=filter_dict,\n        )\n\n    def _get_download_by_config(self, config_name: str):\n        return network_service.get(\n            uri=f\"{self.endpoint}/{config_name}\",\n            header={\"accept\": \"application/json\"},\n        )\n\n    def _show_config_content(self, config_name: str):\n        return network_service.get(\n            uri=f\"{self.endpoint}/show/{config_name}\",\n            header={\"accept\": \"application/json\"},\n        )\n\n    def _add_new_download(\n        self, file_path: str, licenses: t.List[str], force_download: bool, priority: int | None\n    ):\n        try:\n            file = {\"file\": open(file_path, \"rb\")}\n        except FileNotFoundError:\n            return \"File not found.\"\n\n        return network_service.post(\n            uri=self.endpoint,\n            header={\"accept\": \"application/json\"},\n            file=file,\n            payload={\"licenses\": licenses},\n            query={\"force_download\": force_download, \"priority\": priority},\n        )\n\n    def _remove_download(self, config_name: str):\n        return network_service.delete(\n            uri=f\"{self.endpoint}/{config_name}\", header={\"accept\": \"application/json\"}\n        )\n\n    def _refetch_config_partitions(self, config_name: str, licenses: t.List[str], only_failed: bool):\n        return network_service.post(\n            uri=f\"{self.endpoint}/retry/{config_name}\",\n            header={\"accept\": \"application/json\"},\n            payload=json.dumps({\"licenses\": licenses}),\n            query={'only_failed': only_failed}\n        )\n\n\nclass DownloadServiceMock(DownloadService):\n    pass\n\n\ndef get_download_service(test: bool = False):\n    if test:\n        return DownloadServiceMock()\n    else:\n        return DownloadServiceNetwork()\n\n\ndownload_service = 
get_download_service()\n"
  },
  {
    "path": "weather_dl_v2/cli/app/services/license_service.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport abc\nimport json\nimport logging\n\nfrom app.cli_config import get_config\nfrom app.services.network_service import network_service\n\nlogger = logging.getLogger(__name__)\n\n\nclass LicenseService(abc.ABC):\n\n    @abc.abstractmethod\n    def _get_all_license(self):\n        pass\n\n    @abc.abstractmethod\n    def _get_all_license_by_client_name(self, client_name: str):\n        pass\n\n    @abc.abstractmethod\n    def _get_license_by_license_id(self, license_id: str):\n        pass\n\n    @abc.abstractmethod\n    def _add_license(self, license_dict: dict):\n        pass\n\n    @abc.abstractmethod\n    def _remove_license(self, license_id: str):\n        pass\n\n    @abc.abstractmethod\n    def _update_license(self, license_id: str, license_dict: dict):\n        pass\n\n    @abc.abstractmethod\n    def _redeploy_license_by_license_id(self, license_id: str):\n        pass\n\n    @abc.abstractmethod\n    def _redeploy_licenses_by_client(self, client_name: str):\n        pass\n\n\nclass LicenseServiceNetwork(LicenseService):\n\n    def __init__(self):\n        self.endpoint = f\"{get_config().BASE_URI}/license\"\n\n    def _get_all_license(self):\n        return network_service.get(\n            uri=self.endpoint, header={\"accept\": \"application/json\"}\n        )\n\n    def _get_all_license_by_client_name(self, client_name: str):\n        return 
network_service.get(\n            uri=self.endpoint,\n            header={\"accept\": \"application/json\"},\n            query={\"client_name\": client_name},\n        )\n\n    def _get_license_by_license_id(self, license_id: str):\n        return network_service.get(\n            uri=f\"{self.endpoint}/{license_id}\",\n            header={\"accept\": \"application/json\"},\n        )\n\n    def _add_license(self, license_dict: dict):\n        return network_service.post(\n            uri=self.endpoint,\n            header={\"accept\": \"application/json\"},\n            payload=json.dumps(license_dict),\n        )\n\n    def _remove_license(self, license_id: str):\n        return network_service.delete(\n            uri=f\"{self.endpoint}/{license_id}\",\n            header={\"accept\": \"application/json\"},\n        )\n\n    def _update_license(self, license_id: str, license_dict: dict):\n        return network_service.put(\n            uri=f\"{self.endpoint}/{license_id}\",\n            header={\"accept\": \"application/json\"},\n            payload=json.dumps(license_dict),\n        )\n\n    def _redeploy_license_by_license_id(self, license_id: str):\n        return network_service.patch(\n            uri=f\"{self.endpoint}/redeploy\",\n            header={\"accept\": \"application/json\"},\n            query={\"license_id\": license_id}\n        )\n\n    def _redeploy_licenses_by_client(self, client_name: str):\n        return network_service.patch(\n            uri=f\"{self.endpoint}/redeploy\",\n            header={\"accept\": \"application/json\"},\n            query={\"client_name\": client_name}\n        )\n\n\nclass LicenseServiceMock(LicenseService):\n    pass\n\n\ndef get_license_service(test: bool = False):\n    if test:\n        return LicenseServiceMock()\n    else:\n        return LicenseServiceNetwork()\n\n\nlicense_service = get_license_service()\n"
  },
  {
    "path": "weather_dl_v2/cli/app/services/network_service.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport json\nimport logging\n\nimport requests\n\nfrom app.utils import Loader, timeit\n\nlogger = logging.getLogger(__name__)\n\n\nclass NetworkService:\n\n    def parse_response(self, response: requests.Response):\n        try:\n            parsed = json.loads(response.text)\n        except Exception as e:\n            logger.info(f\"Parsing error: {e}.\")\n            logger.info(f\"Status code {response.status_code}\")\n            logger.info(f\"Response {response.text}\")\n            return\n\n        if isinstance(parsed, list):\n            print(f\"[Total {len(parsed)} items.]\")\n\n        return json.dumps(parsed, indent=3)\n\n    @timeit\n    def get(self, uri, header, query=None, payload=None):\n        try:\n            with Loader(\"Sending request...\"):\n                x = requests.get(uri, params=query, headers=header, data=payload)\n            return self.parse_response(x)\n        except requests.exceptions.RequestException as e:\n            logger.error(f\"request error: {e}\")\n            raise SystemExit(e)\n\n    @timeit\n    def post(self, uri, header, query=None, payload=None, file=None):\n        try:\n            with Loader(\"Sending request...\"):\n                x = requests.post(\n                    uri, params=query, headers=header, data=payload, files=file\n                )\n            return self.parse_response(x)\n       
 except requests.exceptions.RequestException as e:\n            logger.error(f\"request error: {e}\")\n            raise SystemExit(e)\n\n    @timeit\n    def put(self, uri, header, query=None, payload=None, file=None):\n        try:\n            with Loader(\"Sending request...\"):\n                x = requests.put(\n                    uri, params=query, headers=header, data=payload, files=file\n                )\n            return self.parse_response(x)\n        except requests.exceptions.RequestException as e:\n            logger.error(f\"request error: {e}\")\n            raise SystemExit(e)\n\n    @timeit\n    def delete(self, uri, header, query=None):\n        try:\n            with Loader(\"Sending request...\"):\n                x = requests.delete(uri, params=query, headers=header)\n            return self.parse_response(x)\n        except requests.exceptions.RequestException as e:\n            logger.error(f\"request error: {e}\")\n            raise SystemExit(e)\n\n    @timeit\n    def patch(self, uri, header, query=None):\n        try:\n            with Loader(\"Sending request...\"):\n                x = requests.patch(uri, params=query, headers=header)\n            return self.parse_response(x)\n        except requests.exceptions.RequestException as e:\n            logger.error(f\"request error: {e}\")\n            raise SystemExit(e)\n\n\nnetwork_service = NetworkService()\n"
  },
  {
    "path": "weather_dl_v2/cli/app/services/queue_service.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport abc\nimport json\nimport logging\nimport os\nimport typing as t\n\nfrom app.cli_config import get_config\nfrom app.services.network_service import network_service\n\nlogger = logging.getLogger(__name__)\n\n\nclass QueueService(abc.ABC):\n\n    @abc.abstractmethod\n    def _get_all_license_queues(self):\n        pass\n\n    @abc.abstractmethod\n    def _get_license_queue_by_client_name(self, client_name: str):\n        pass\n\n    @abc.abstractmethod\n    def _get_queue_by_license(self, license_id: str):\n        pass\n\n    @abc.abstractmethod\n    def _edit_license_queue(self, license_id: str, priority_list: t.List[str]):\n        pass\n\n    @abc.abstractmethod\n    def _edit_config_absolute_priority(\n        self, license_id: str, config_name: str, priority: int\n    ):\n        pass\n\n    @abc.abstractmethod\n    def _save_queue_to_file(self, license_id: str, dir: str):\n        pass\n\n\nclass QueueServiceNetwork(QueueService):\n\n    def __init__(self):\n        self.endpoint = f\"{get_config().BASE_URI}/queues\"\n\n    def _get_all_license_queues(self):\n        return network_service.get(\n            uri=self.endpoint, header={\"accept\": \"application/json\"}\n        )\n\n    def _get_license_queue_by_client_name(self, client_name: str):\n        return network_service.get(\n            uri=self.endpoint,\n            header={\"accept\": 
\"application/json\"},\n            query={\"client_name\": client_name},\n        )\n\n    def _get_queue_by_license(self, license_id: str):\n        return network_service.get(\n            uri=f\"{self.endpoint}/{license_id}\", header={\"accept\": \"application/json\"}\n        )\n\n    def _edit_license_queue(self, license_id: str, priority_list: t.List[str]):\n        return network_service.post(\n            uri=f\"{self.endpoint}/{license_id}\",\n            header={\"accept\": \"application/json\", \"Content-Type\": \"application/json\"},\n            payload=json.dumps(priority_list),\n        )\n\n    def _edit_config_absolute_priority(\n        self, license_id: str, config_name: str, priority: int\n    ):\n        return network_service.put(\n            uri=f\"{self.endpoint}/priority/{license_id}\",\n            header={\"accept\": \"application/json\"},\n            query={\"config_name\": config_name, \"priority\": priority},\n        )\n\n    def _save_queue_to_file(self, license_id: str, dir_path: str) -> str:\n        if not os.path.isdir(dir_path):\n            print(f\"{dir_path} is a not directory.\")\n            return None\n\n        response = self._get_queue_by_license(license_id)\n        parsed_response = json.loads(response)\n\n        if \"queue\" not in parsed_response:\n            print(response)\n            return None\n\n        json_data = {\"priority\": parsed_response[\"queue\"]}\n        file_path = os.path.join(dir_path, f\"{license_id}.json\")\n\n        with open(file_path, 'w') as json_file:\n            json.dump(json_data, json_file)\n\n        return file_path\n\n\nclass QueueServiceMock(QueueService):\n    pass\n\n\ndef get_queue_service(test: bool = False):\n    if test:\n        return QueueServiceMock()\n    else:\n        return QueueServiceNetwork()\n\n\nqueue_service = get_queue_service()\n"
  },
  {
    "path": "weather_dl_v2/cli/app/subcommands/__init__.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "weather_dl_v2/cli/app/subcommands/config.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport json\nimport subprocess\n\nimport pkg_resources\nimport typer\nfrom typing_extensions import Annotated\n\nfrom app.cli_config import get_config\nfrom app.utils import Validator, confirm_action\n\napp = typer.Typer()\n\n\nclass ConfigValidator(Validator):\n    pass\n\n\n@app.command(\"update\", help=\"Update the cli.\")\ndef update_cli():\n    confirm_action(\"Are you sure you want to update cli?\")\n    try:\n        print(\"Updating CLI. This will take some time...\")\n        subprocess.run(['pip', 'uninstall', 'weather-dl-v2', '-y', '-q'])\n        subprocess.run(['pip', 'install',\n                        'git+http://github.com/google/weather-tools#subdirectory=weather_dl_v2/cli'])\n        subprocess.run(['clear'])\n        print(\"CLI updated successfully. ✨\")\n    except Exception as e:\n        print(f\"Couldn't update CLI. Error: {e}.\")\n\n\n@app.command(\"show_ip\", help=\"See the current server IP address.\")\ndef show_server_ip():\n    print(f\"Current pod IP: {get_config().pod_ip}\")\n\n\n@app.command(\"set_ip\", help=\"Update the server IP address.\")\ndef update_server_ip(\n    new_ip: Annotated[\n        str, typer.Argument(help=\"New IP address. 
(Do not add port or protocol).\")\n    ],\n):\n    file_path = pkg_resources.resource_filename('app', 'data/cli_config.json')\n    cli_config = {}\n    with open(file_path, \"r\") as file:\n        cli_config = json.load(file)\n\n    old_ip = cli_config[\"pod_ip\"]\n    cli_config[\"pod_ip\"] = new_ip\n\n    with open(file_path, \"w\") as file:\n        json.dump(cli_config, file)\n\n    validator = ConfigValidator(valid_keys=[\"pod_ip\", \"port\"])\n\n    try:\n        cli_config = validator.validate_json(file_path=file_path)\n    except Exception as e:\n        print(f\"payload error: {e}\")\n        return\n\n    print(f\"Pod IP Updated {old_ip} -> {new_ip} .\")\n"
  },
  {
    "path": "weather_dl_v2/cli/app/subcommands/download.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom typing import List\n\nimport typer\nfrom typing_extensions import Annotated\n\nfrom app.services.download_service import download_service\nfrom app.utils import Validator, as_table, confirm_action\n\napp = typer.Typer(rich_markup_mode=\"markdown\")\n\n\nclass DowloadFilterValidator(Validator):\n    pass\n\n\ndownload_key_order = [\n    'config_name', 'client_name', 'partitioning_status', 'scheduled_shards', 'in-progress_shards',\n    'downloaded_shards', 'failed_shards', 'total_shards'\n]\n\n\n@app.command(\"list\", help=\"List out all the configs.\")\ndef get_downloads(\n    filter: Annotated[\n        List[str],\n        typer.Option(\n            help=\"\"\"Filter by some value. Format: filter_key=filter_value. 
Available filters \"\"\"\n            \"\"\"[key: client_name, values: cds, mars, ecpublic] \"\"\"\n            \"\"\"[key: status, values: completed, failed, in-progress]\"\"\"\n        ),\n    ] = []\n):\n    if len(filter) > 0:\n        validator = DowloadFilterValidator(valid_keys=[\"client_name\", \"status\"])\n\n        try:\n            filter_dict = validator.validate(filters=filter, allow_missing=True)\n        except Exception as e:\n            print(f\"filter error: {e}\")\n            return\n\n        print(as_table(download_service._list_all_downloads_by_filter(filter_dict), download_key_order))\n        return\n\n    print(as_table(download_service._list_all_downloads(), download_key_order))\n\n\n# TODO: Add support for submitting multiple configs using *.cfg notation.\n@app.command(\"add\", help=\"Submit new config to download.\")\ndef submit_download(\n    file_path: Annotated[\n        str, typer.Argument(help=\"File path of config to be uploaded.\")\n    ],\n    license: Annotated[List[str], typer.Option(\"--license\", \"-l\", help=\"License ID.\")],\n    force_download: Annotated[\n        bool,\n        typer.Option(\n            \"-f\",\n            \"--force-download\",\n            help=\"Force redownload of partitions that were previously downloaded.\",\n        ),\n    ] = False,\n    priority: Annotated[\n        int,\n        typer.Option(\n            \"-p\",\n            \"--priority\",\n            help=\"Set the priority for submitted config in ALL licenses. If not added, the config is added\" \\\n                \"at the end of the queue. 
Priority decreases in ascending order with 0 having highest priority.\",\n        ),\n    ] = None,\n):\n    print(download_service._add_new_download(file_path, license, force_download, priority))\n\n\n@app.command(\"get\", help=\"Get a particular config.\")\ndef get_download_by_config(\n    config_name: Annotated[str, typer.Argument(help=\"Config file name.\")]\n):\n    print(as_table(download_service._get_download_by_config(config_name), download_key_order))\n\n\n@app.command(\"show\", help=\"Show contents of a particular config.\")\ndef show_config(\n    config_name: Annotated[str, typer.Argument(help=\"Config file name.\")]\n):\n    print(download_service._show_config_content(config_name))\n\n\n@app.command(\"remove\", help=\"Remove existing config.\")\ndef remove_download(\n    config_name: Annotated[str, typer.Argument(help=\"Config file name.\")],\n    auto_confirm: Annotated[bool, typer.Option(\"-y\", help=\"Automically confirm any promt.\")] = False\n):\n    if not auto_confirm:\n        confirm_action(f\"Are you sure you want to remove {config_name}?\")\n    print(download_service._remove_download(config_name))\n\n\n@app.command(\n    \"refetch\", help=\"Reschedule all partitions of a config that are not successful.\"\n)\ndef refetch_config(\n    config_name: Annotated[str, typer.Argument(help=\"Config file name.\")],\n    license: Annotated[List[str], typer.Option(\"--license\", \"-l\", help=\"License ID.\")],\n    only_failed: Annotated[bool, typer.Option(\"--only_failed\", help=\"Only refetch failed partitions.\")] = False,\n    auto_confirm: Annotated[bool, typer.Option(\"-y\", help=\"Automically confirm any promt.\")] = False\n):\n    if not auto_confirm:\n        confirm_action(f\"Are you sure you want to refetch {config_name}?\")\n    print(download_service._refetch_config_partitions(config_name, license, only_failed))\n"
  },
  {
    "path": "weather_dl_v2/cli/app/subcommands/license.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport typer\nfrom typing_extensions import Annotated\n\nfrom app.services.license_service import license_service\nfrom app.utils import Validator, as_table, confirm_action\n\napp = typer.Typer()\n\n\nclass LicenseValidator(Validator):\n    pass\n\n\nlicense_key_order = [\n    'license_id', 'client_name', 'status', 'number_of_requests', 'secret_id', 'k8s_deployment_id'\n    ]\n\n\n@app.command(\"list\", help=\"List all licenses.\")\ndef get_all_license(\n    filter: Annotated[\n        str, typer.Option(help=\"Filter by some value. 
Format: filter_key=filter_value\")\n    ] = None\n):\n    if filter:\n        validator = LicenseValidator(valid_keys=[\"client_name\"])\n\n        try:\n            data = validator.validate(filters=[filter])\n            client_name = data[\"client_name\"]\n        except Exception as e:\n            print(f\"filter error: {e}\")\n            return\n\n        print(as_table(license_service._get_all_license_by_client_name(client_name), license_key_order))\n        return\n\n    print(as_table(license_service._get_all_license(), license_key_order))\n\n\n@app.command(\"get\", help=\"Get a particular license by ID.\")\ndef get_license(license: Annotated[str, typer.Argument(help=\"License ID.\")]):\n    print(as_table(license_service._get_license_by_license_id(license), license_key_order))\n\n\n@app.command(\"add\", help=\"Add new license.\")\ndef add_license(\n    file_path: Annotated[\n        str,\n        typer.Argument(\n            help=\"\"\"Input json file. Example json for new license-\"\"\"\n            \"\"\"{\"license_id\" : <str>, \"client_name\" : <str>, \"number_of_requests\" : <int>, \"secret_id\" : <str>}\"\"\"\n            \"\"\"\\nNOTE: license_id is case insensitive and has to be unique for each license.\"\"\"\n        ),\n    ],\n):\n    validator = LicenseValidator(\n        valid_keys=[\"license_id\", \"client_name\", \"number_of_requests\", \"secret_id\"]\n    )\n\n    try:\n        license_dict = validator.validate_json(file_path=file_path)\n    except Exception as e:\n        print(f\"payload error: {e}\")\n        return\n\n    print(license_service._add_license(license_dict))\n\n\n@app.command(\"remove\", help=\"Remove a license.\")\ndef remove_license(\n    license: Annotated[str, typer.Argument(help=\"License ID.\")],\n    auto_confirm: Annotated[bool, typer.Option(\"-y\", help=\"Automically confirm any promt.\")] = False\n):\n    if not auto_confirm:\n        confirm_action(f\"Are you sure you want to remove {license}?\")\n    
print(license_service._remove_license(license))\n\n\n@app.command(\"update\", help=\"Update existing license.\")\ndef update_license(\n    license: Annotated[str, typer.Argument(help=\"License ID.\")],\n    file_path: Annotated[\n        str,\n        typer.Argument(\n            help=\"\"\"Input json file. Example json for updated license- \"\"\"\n            \"\"\"{\"license_id\": <str>, \"client_name\" : <str>, \"number_of_requests\" : <int>, \"secret_id\" : <str>}\"\"\"\n        ),\n    ],\n    auto_confirm: Annotated[bool, typer.Option(\"-y\", help=\"Automically confirm any promt.\")] = False\n):\n    validator = LicenseValidator(\n        valid_keys=[\"license_id\", \"client_name\", \"number_of_requests\", \"secret_id\"]\n    )\n    try:\n        license_dict = validator.validate_json(file_path=file_path)\n    except Exception as e:\n        print(f\"payload error: {e}\")\n        return\n    if not auto_confirm:\n        confirm_action(f\"Are you sure you want to update {license}?\")\n    print(license_service._update_license(license, license_dict))\n\n\n@app.command(\"redeploy\",\n             help=\"\"\"Redeploy licenses.\"\"\"\n             \"\"\" CAUTION: Redeploying will cause licenses to stop whatever they are doing.\"\"\"\n             \"\"\" This can cause queues to be filled with stray requests from previous deployments.\"\"\"\n             )\ndef redeploy_license(\n    license_id: Annotated[\n        str,\n        typer.Option(\n            \"--license_id\",\n            help=\"\"\"Mention license_id of license to redeploy.\"\"\"\n            \"\"\" Send 'all' if want to redeploy all licenses.\"\"\"\n            )\n        ] = None,\n    client_name: Annotated[\n        str,\n        typer.Option(\n            \"--client_name\",\n            help=\"Redeploy all licenses of a particular client.\"\n            )\n        ] = None,\n    auto_confirm: Annotated[bool, typer.Option(\"-y\", help=\"Automically confirm any promt.\")] = False\n):\n    if 
license_id is not None and client_name is not None:\n        print(\"Can't pass both license_id and client_name. Please pass only one.\")\n        return\n\n    if license_id is None and client_name is None:\n        print(\"Please pass --license_id or --client_name.\")\n        return\n\n    if license_id is not None:\n        if not auto_confirm:\n            confirm_action(f\"Are you sure you want to redeploy {license_id}?\")\n        print(license_service._redeploy_license_by_license_id(license_id))\n        return\n\n    if client_name is not None:\n        if not auto_confirm:\n            confirm_action(f\"Are you sure you want to redeploy licenses from {client_name}?\")\n        print(license_service._redeploy_licenses_by_client(client_name))\n        return\n"
  },
  {
    "path": "weather_dl_v2/cli/app/subcommands/queue.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport typer\nfrom typing_extensions import Annotated\n\nfrom app.services.queue_service import queue_service\nfrom app.utils import Validator, as_table, confirm_action\n\napp = typer.Typer()\n\n\nclass QueueValidator(Validator):\n    pass\n\n\nqueue_key_order = ['license_id', 'client_name', 'queue']\n\n\n@app.command(\"list\", help=\"List all the license queues.\")\ndef get_all_license_queue(\n    filter: Annotated[\n        str, typer.Option(help=\"Filter by some value. Format: filter_key=filter_value\")\n    ] = None\n):\n    if filter:\n        validator = QueueValidator(valid_keys=[\"client_name\"])\n\n        try:\n            data = validator.validate(filters=[filter])\n            client_name = data[\"client_name\"]\n        except Exception as e:\n            print(f\"filter error: {e}\")\n            return\n\n        print(as_table(queue_service._get_license_queue_by_client_name(client_name), queue_key_order))\n        return\n\n    print(as_table(queue_service._get_all_license_queues(), queue_key_order))\n\n\n@app.command(\"get\", help=\"Get queue of particular license.\")\ndef get_license_queue(license: Annotated[str, typer.Argument(help=\"License ID\")]):\n    print(as_table(queue_service._get_queue_by_license(license), queue_key_order))\n\n\n@app.command(\n    \"edit\",\n    help=\"Edit existing license queue. 
Queue can edited via a priority\"\n    \"file or my moving a single config to a given priority.\",\n)\ndef modify_license_queue(\n    license: Annotated[str, typer.Argument(help=\"License ID.\")],\n    empty: Annotated[\n        bool,\n        typer.Option(\n            \"--empty\",\n            help=\"\"\"Empties the license queue. If this is passed, other options are ignored.\"\"\"\n        )\n    ] = False,\n    save_dir_path: Annotated[\n        str,\n        typer.Option(\n            \"--save_and_empty\",\n            help=\"\"\"Saves the license queue to a file and empties the queue.\"\"\"\n            \"\"\" Pass in path of directory. File will be saved as <license_id>.json .\"\"\"\n        )\n    ] = None,\n    file: Annotated[\n        str,\n        typer.Option(\n            \"--file\",\n            \"-f\",\n            help=\"\"\"File path of priority json file. Example json: {\"priority\": [\"c1.cfg\", \"c2.cfg\",...]}\"\"\",\n        ),\n    ] = None,\n    config: Annotated[\n        str, typer.Option(\"--config\", \"-c\", help=\"Config name for absolute priority.\")\n    ] = None,\n    priority: Annotated[\n        int,\n        typer.Option(\n            \"--priority\",\n            \"-p\",\n            help=\"Absolute priority for the config in a license queue.\"\n            \" Priority decreases in ascending order with 0 having highest priority.\",\n        ),\n    ] = None,\n    auto_confirm: Annotated[bool, typer.Option(\"-y\", help=\"Automically confirm any promt.\")] = False\n):\n\n    if empty and save_dir_path:\n        print(\"Both --empty and --save_and_empty can't be passed. 
Use only one.\")\n        return\n\n    if empty:\n        if not auto_confirm:\n            confirm_action(f\"Are you sure you want to empty queue for {license}?\")\n        print(\"Emptying license queue...\")\n        print(queue_service._edit_license_queue(license, []))\n        return\n\n    if save_dir_path:\n        if not auto_confirm:\n            confirm_action(f\"Are you sure you want to empty queue for {license}?\")\n        print(\"Saving and Emptying license queue...\")\n        file_path = queue_service._save_queue_to_file(license, save_dir_path)\n        print(f\"Queue saved at {file_path}.\")\n        print(queue_service._edit_license_queue(license, []))\n        return\n\n    if file is None and (config is None and priority is None):\n        print(\"Priority file or config name with absolute priority must be passed.\")\n        return\n\n    if file is not None and (config is not None or priority is not None):\n        print(\"--config & --priority can't be used along with --file argument.\")\n        return\n\n    if file is not None:\n        validator = QueueValidator(valid_keys=[\"priority\"])\n\n        try:\n            data = validator.validate_json(file_path=file)\n            priority_list = data[\"priority\"]\n        except Exception as e:\n            print(f\"key error: {e}\")\n            return\n        if not auto_confirm:\n            confirm_action(f\"Are you sure you want to edit {license} queue priority?\")\n        print(queue_service._edit_license_queue(license, priority_list))\n        return\n    elif config is not None and priority is not None:\n        if priority < 0:\n            print(\"Priority can not be negative.\")\n            return\n        if not auto_confirm:\n            confirm_action(f\"Are you sure you want to edit {license} queue priority?\")\n        print(queue_service._edit_config_absolute_priority(license, config, priority))\n        return\n    else:\n        print(\"--config & --priority arguments 
should be used together.\")\n        return\n"
  },
  {
    "path": "weather_dl_v2/cli/app/utils.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport abc\nimport dataclasses\nimport json\nimport logging\nimport typer\nimport typing as t\nfrom itertools import cycle\nfrom shutil import get_terminal_size\nfrom threading import Thread\nfrom time import sleep, time\n\nfrom tabulate import tabulate\n\nlogger = logging.getLogger(__name__)\n\n\ndef timeit(func):\n    def wrap_func(*args, **kwargs):\n        t1 = time()\n        result = func(*args, **kwargs)\n        t2 = time()\n        print(f\"[executed in {(t2-t1):.4f}s.]\")\n        return result\n\n    return wrap_func\n\n\ndef confirm_action(message: str = \"Are you sure you want to continue?\"):\n    _ = typer.confirm(message, abort=True)\n\n\ndef order_dict_fields(dictionary, key_order):\n    return {key: dictionary[key] for key in key_order if key in dictionary}\n\n\ndef as_table(response: str, key_order=None):\n    data = json.loads(response)\n\n    if not isinstance(data, list):\n        # convert response to list if not a list.\n        data = [data]\n\n    if len(data) == 0:\n        return \"\"\n\n    if key_order is None:\n        key_order = list(data[0].keys())\n\n    ordered_data = [order_dict_fields(d, key_order) for d in data]\n\n    # if any column has lists, convert that to a string.\n    rows = [\n        [\n            \",\\n\".join([f\"{i} {ele}\" for i, ele in enumerate(val)])\n            if isinstance(val, list)\n            else 
val\n            for _, val in x.items()\n        ]\n        for x in ordered_data\n    ]\n    rows.insert(0, list(key_order))\n    return tabulate(\n        rows, showindex=True, tablefmt=\"grid\", maxcolwidths=[16] * len(key_order)\n    )\n\n\nclass Loader:\n\n    def __init__(self, desc=\"Loading...\", end=\"\", timeout=0.1):\n        \"\"\"\n        A loader-like context manager\n\n        Args:\n            desc (str, optional): The loader's description. Defaults to \"Loading...\".\n            end (str, optional): Final print. Defaults to \"Done!\".\n            timeout (float, optional): Sleep time between prints. Defaults to 0.1.\n        \"\"\"\n        self.desc = desc\n        self.end = end\n        self.timeout = timeout\n\n        self._thread = Thread(target=self._animate, daemon=True)\n        self.steps = [\"⢿\", \"⣻\", \"⣽\", \"⣾\", \"⣷\", \"⣯\", \"⣟\", \"⡿\"]\n        self.done = False\n\n    def start(self):\n        self._thread.start()\n        return self\n\n    def _animate(self):\n        for c in cycle(self.steps):\n            if self.done:\n                break\n            print(f\"\\r{self.desc} {c}\", flush=True, end=\"\")\n            sleep(self.timeout)\n\n    def __enter__(self):\n        self.start()\n\n    def stop(self):\n        self.done = True\n        cols = get_terminal_size((80, 20)).columns\n        print(\"\\r\" + \" \" * cols, end=\"\", flush=True)\n\n    def __exit__(self, exc_type, exc_value, tb):\n        # handle exceptions with those variables ^\n        self.stop()\n\n\n@dataclasses.dataclass\nclass Validator(abc.ABC):\n    valid_keys: t.List[str]\n\n    def validate(\n        self, filters: t.List[str], show_valid_filters=True, allow_missing: bool = False\n    ):\n        filter_dict = {}\n\n        for filter in filters:\n            _filter = filter.split(\"=\")\n\n            if len(_filter) != 2:\n                if show_valid_filters:\n                    logger.info(f\"valid filters are: 
{self.valid_keys}.\")\n                raise ValueError(\"Incorrect Filter. Please Try again.\")\n\n            key, value = _filter\n            filter_dict[key] = value\n\n        data_set = set(filter_dict.keys())\n        valid_set = set(self.valid_keys)\n\n        if self._validate_keys(data_set, valid_set, allow_missing):\n            return filter_dict\n\n    def validate_json(self, file_path, allow_missing: bool = False):\n        try:\n            with open(file_path) as f:\n                data: dict = json.load(f)\n                data_keys = data.keys()\n\n                data_set = set(data_keys)\n                valid_set = set(self.valid_keys)\n\n                if self._validate_keys(data_set, valid_set, allow_missing):\n                    return data\n\n        except FileNotFoundError:\n            logger.info(\"file not found.\")\n            raise FileNotFoundError\n\n    def _validate_keys(self, data_set: set, valid_set: set, allow_missing: bool):\n        missing_keys = valid_set.difference(data_set)\n        invalid_keys = data_set.difference(valid_set)\n\n        if not allow_missing and len(missing_keys) > 0:\n            raise ValueError(f\"keys {missing_keys} are missing in file.\")\n\n        if len(invalid_keys) > 0:\n            raise ValueError(f\"keys {invalid_keys} are invalid keys.\")\n\n        if allow_missing or data_set == valid_set:\n            return True\n\n        return False\n"
  },
  {
    "path": "weather_dl_v2/cli/environment.yml",
    "content": "name: weather-dl-v2-cli\nchannels:\n  - conda-forge\ndependencies:\n  - python=3.10\n  - pip=23.0.1\n  - typer=0.9.0\n  - tabulate=0.9.0\n  - pip:\n    - requests\n    - ruff\n    - pytype\n    - pytest\n    - .\n"
  },
  {
    "path": "weather_dl_v2/cli/setup.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom setuptools import setup\n\nrequirements = [\"typer\", \"requests\", \"tabulate\"]\n\nsetup(\n    name=\"weather-dl-v2\",\n    packages=[\"app\", \"app.subcommands\", \"app.services\"],\n    install_requires=requirements,\n    version=\"1.0.3\",\n    author='Anthromets',\n    author_email='anthromets-ecmwf@google.com',\n    description=(\n        \"This cli tools helps in interacting with weather dl v2 fast API server.\"\n    ),\n    entry_points={\"console_scripts\": [\"weather-dl-v2=app.main:app\", \"dl-v2=app.main:app\"]},\n    package_data={'app': ['data/*.json']},\n)\n"
  },
  {
    "path": "weather_dl_v2/cli/vm-startup.sh",
    "content": "#! /bin/bash\n\ncommand=\"docker exec -it \\\\\\$(docker ps -qf name=weather-dl-v2-cli) /bin/bash\"\nsudo sh -c \"echo \\\"$command\\\" >> /etc/profile\""
  },
  {
    "path": "weather_dl_v2/cloudbuild.yml",
    "content": "steps:\n- name: 'gcr.io/cloud-builders/docker'\n  args: ['build', '-t', 'gcr.io/$_PROJECT_ID/$_REPO:$_SERVICE', '$_FOLDER']\n- name: 'gcr.io/cloud-builders/docker'\n  args: ['push', 'gcr.io/$_PROJECT_ID/$_REPO:$_SERVICE']\n- name: 'gcr.io/cloud-builders/docker'\n  args: ['tag', 'gcr.io/$_PROJECT_ID/$_REPO:$_SERVICE', 'gcr.io/$_PROJECT_ID/$_REPO:$_SERVICE-$_VER']\n- name: 'gcr.io/cloud-builders/docker'\n  args: ['push', 'gcr.io/$_PROJECT_ID/$_REPO:$_SERVICE-$_VER']\ntimeout: 79200s\noptions:\n  machineType: E2_HIGHCPU_32\n"
  },
  {
    "path": "weather_dl_v2/config.json",
    "content": "{\n    \"download_collection\": \"download\",\n    \"queues_collection\": \"queues\",\n    \"license_collection\": \"license\",\n    \"manifest_collection\": \"manifest\",\n    \"storage_bucket\": \"XXXXXXX\",\n    \"gcs_project\": \"XXXXXXX\",\n    \"license_deployment_image\": \"XXXXXXX\",\n    \"downloader_k8_image\": \"XXXXXXX\",\n    \"welcome_message\": \"Greetings from weather-dl v2 from weather-dl-v2-cluster-2!\"\n}"
  },
  {
    "path": "weather_dl_v2/downloader_kubernetes/Dockerfile",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n# ==============================================================================\nFROM continuumio/miniconda3:latest\n\n# Update miniconda\nRUN conda update conda -y\n\n# Add the mamba solver for faster builds\nRUN conda install -n base conda-libmamba-solver\nRUN conda config --set solver libmamba\n\n# Create conda env using environment.yml\nCOPY . .\nRUN conda env create -f environment.yml --debug\n\n# Activate the conda env and update the PATH\nARG CONDA_ENV_NAME=weather-dl-v2-downloader\nRUN echo \"source activate ${CONDA_ENV_NAME}\" >> ~/.bashrc\nENV PATH /opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH\n"
  },
  {
    "path": "weather_dl_v2/downloader_kubernetes/README.md",
    "content": "# Deployment / Usage Instructions\n\n### User authorization required to set up the environment:\n* roles/container.admin\n\n### Authorization needed for the tool to operate:\nWe are not configuring any service account here, so make sure that the Compute Engine default service account has the following roles:\n* roles/storage.admin\n* roles/bigquery.dataEditor\n* roles/bigquery.jobUser\n\n### Make changes in weather_dl_v2/config.json, if required [for running locally]\n```\nexport CONFIG_PATH=/path/to/weather_dl_v2/config.json\n```\n\n### Create docker image for downloader:\nRefer to the instructions in weather_dl_v2/README.md"
  },
  {
    "path": "weather_dl_v2/downloader_kubernetes/VERSION.txt",
    "content": "1.0.2"
  },
  {
    "path": "weather_dl_v2/downloader_kubernetes/downloader.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n\"\"\"\nThis program downloads ECMWF data & upload it into GCS.\n\"\"\"\nimport tempfile\nimport os\nimport sys\nimport time\nfrom manifest import FirestoreManifest, Stage\nfrom util import copy, download_with_aria2, download_with_wget\nimport datetime\n\n\ndef download(url: str, path: str) -> None:\n    \"\"\"Download data from client, with retries.\"\"\"\n    if path:\n        if os.path.exists(path):\n            # Empty the target file, if it already exists, otherwise the\n            # transfer below might be fooled into thinking we're resuming\n            # an interrupted download.\n            open(path, \"w\").close()\n\n        download_methods = [\n            download_with_aria2,\n            download_with_aria2,\n            download_with_wget,\n        ]\n        errors = []\n\n        for method in download_methods:\n            print(f\"Trying {method.__name__}.\")\n            try:\n                method(url, path)\n                return\n            except Exception as e:\n                print(f\"{method.__name__} failed. Error: {e}.\")\n                errors.append(str(e))\n                print(\"Waiting for 2 mins.\")\n                time.sleep(120)\n\n        err_msgs = \"\\n\".join(errors)\n        print(f\"Failed to download {url}. 
Error Msg: {err_msgs}.\")\n        raise Exception(\n            f\"Downloading failed for url {url} & path {path}.\\nError Msg: {err_msgs}.\"\n        )\n\n\ndef main(\n    config_name, dataset, selection, user_id, url, target_path, license_id\n) -> None:\n    \"\"\"Download data from a client to a temp file.\"\"\"\n\n    manifest = FirestoreManifest(license_id=license_id)\n    temp_name = \"\"\n    with manifest.transact(config_name, dataset, selection, target_path, user_id):\n        with tempfile.NamedTemporaryFile(delete=False) as temp:\n            temp_name = temp.name\n            manifest.set_stage(Stage.DOWNLOAD)\n            precise_download_start_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec=\"seconds\")\n            )\n            manifest.prev_stage_precise_start_time = precise_download_start_time\n            print(f\"Downloading data for {target_path!r}.\")\n            download(url, temp_name)\n            print(f\"Download completed for {target_path!r}.\")\n\n            manifest.set_stage(Stage.UPLOAD)\n            precise_upload_start_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec=\"seconds\")\n            )\n            manifest.prev_stage_precise_start_time = precise_upload_start_time\n            print(f\"Uploading to store for {target_path!r}.\")\n            copy(temp_name, target_path)\n            print(f\"Upload to store complete for {target_path!r}.\")\n    os.unlink(temp_name)\n\n\nif __name__ == \"__main__\":\n    main(*sys.argv[1:])\n"
  },
  {
    "path": "weather_dl_v2/downloader_kubernetes/downloader_config.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport dataclasses\nimport typing as t\nimport json\nimport os\nimport logging\n\nlogger = logging.getLogger(__name__)\n\nValues = t.Union[t.List[\"Values\"], t.Dict[str, \"Values\"], bool, int, float, str]  # pytype: disable=not-supported-yet\n\n\n@dataclasses.dataclass\nclass DownloaderConfig:\n    manifest_collection: str = \"\"\n    kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict)\n\n    @classmethod\n    def from_dict(cls, config: t.Dict):\n        config_instance = cls()\n\n        for key, value in config.items():\n            if hasattr(config_instance, key):\n                setattr(config_instance, key, value)\n            else:\n                config_instance.kwargs[key] = value\n\n        return config_instance\n\n\ndownloader_config = None\n\n\ndef get_config():\n    global downloader_config\n    if downloader_config:\n        return downloader_config\n\n    downloader_config_json = \"config/config.json\"\n    if not os.path.exists(downloader_config_json):\n        downloader_config_json = os.environ.get(\"CONFIG_PATH\", None)\n\n    if downloader_config_json is None:\n        logger.error(\"Couldn't load config file for downloader.\")\n        raise FileNotFoundError(\"Couldn't load config file for downloader.\")\n\n    with open(downloader_config_json) as file:\n        config_dict = json.load(file)\n        
downloader_config = DownloaderConfig.from_dict(config_dict)\n\n    return downloader_config\n"
  },
  {
    "path": "weather_dl_v2/downloader_kubernetes/environment.yml",
    "content": "name: weather-dl-v2-downloader\nchannels:\n  - conda-forge\ndependencies:\n  - python=3.10\n  - google-cloud-sdk=410.0.0\n  - aria2=1.36.0\n  - geojson=2.5.0=py_0\n  - xarray=2022.11.0\n  - google-apitools\n  - pip=22.3\n  - pip:\n    - apache_beam[gcp]==2.40.0\n    - firebase-admin\n    - google-cloud-pubsub\n    - kubernetes\n    - psutil\n"
  },
  {
    "path": "weather_dl_v2/downloader_kubernetes/manifest.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n\"\"\"Client interface for connecting to a manifest.\"\"\"\n\nimport abc\nimport dataclasses\nimport datetime\nimport enum\nimport json\nimport pandas as pd\nimport time\nimport traceback\nimport typing as t\n\nfrom util import (\n    to_json_serializable_type,\n    fetch_geo_polygon,\n    get_file_size,\n    get_wait_interval,\n    generate_md5_hash,\n    GLOBAL_COVERAGE_AREA,\n)\n\nimport firebase_admin\nfrom firebase_admin import credentials\nfrom firebase_admin import firestore\nfrom google.cloud.firestore_v1 import DocumentReference\nfrom google.cloud.firestore_v1.types import WriteResult\nfrom downloader_config import get_config\n\n\"\"\"An implementation-dependent Manifest URI.\"\"\"\nLocation = t.NewType(\"Location\", str)\n\n\nclass ManifestException(Exception):\n    \"\"\"Errors that occur in Manifest Clients.\"\"\"\n\n    pass\n\n\nclass Stage(enum.Enum):\n    \"\"\"A request can be either in one of the following stages at a time:\n\n    fetch : This represents request is currently in fetch stage i.e. request placed on the client's server\n        & waiting for some result before starting download (eg. MARS client).\n    download : This represents request is currently in download stage i.e. data is being downloading from client's\n        server to the worker's local file system.\n    upload : This represents request is currently in upload stage i.e. 
data is getting uploaded from worker's local\n        file system to target location (GCS path).\n    retrieve : In case of clients where there is no proper separation of fetch & download stages (eg. CDS client),\n        request will be in the retrieve stage i.e. fetch + download.\n    \"\"\"\n\n    RETRIEVE = \"retrieve\"\n    FETCH = \"fetch\"\n    DOWNLOAD = \"download\"\n    UPLOAD = \"upload\"\n\n\nclass Status(enum.Enum):\n    \"\"\"Depicts the request's state status:\n\n    scheduled : A request partition is created & scheduled for processing.\n        Note: Its corresponding state can be None only.\n    in-progress : This represents the request state is currently in-progress (i.e. running).\n        The next status would be \"success\" or \"failure\".\n    success : This represents the request state execution completed successfully without any error.\n    failure : This represents the request state execution failed.\n    \"\"\"\n\n    SCHEDULED = \"scheduled\"\n    IN_PROGRESS = \"in-progress\"\n    SUCCESS = \"success\"\n    FAILURE = \"failure\"\n\n\n@dataclasses.dataclass\nclass DownloadStatus:\n    \"\"\"Data recorded in `Manifest`s reflecting the status of a download.\"\"\"\n\n    \"\"\"The name of the config file associated with the request.\"\"\"\n    config_name: str = \"\"\n\n    \"\"\"Represents the dataset field of the configuration.\"\"\"\n    dataset: t.Optional[str] = \"\"\n\n    \"\"\"Copy of selection section of the configuration.\"\"\"\n    selection: t.Dict = dataclasses.field(default_factory=dict)\n\n    \"\"\"Location of the downloaded data.\"\"\"\n    location: str = \"\"\n\n    \"\"\"Represents area covered by the shard.\"\"\"\n    area: str = \"\"\n\n    \"\"\"Current stage of request : 'fetch', 'download', 'retrieve', 'upload' or None.\"\"\"\n    stage: t.Optional[Stage] = None\n\n    \"\"\"Download status: 'scheduled', 'in-progress', 'success', or 'failure'.\"\"\"\n    status: t.Optional[Status] = None\n\n    \"\"\"Cause of error, 
if any.\"\"\"\n    error: t.Optional[str] = \"\"\n\n    \"\"\"Identifier for the user running the download.\"\"\"\n    username: str = \"\"\n\n    \"\"\"Shard size in GB.\"\"\"\n    size: t.Optional[float] = 0\n\n    \"\"\"A UTC datetime when download was scheduled.\"\"\"\n    scheduled_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the retrieve stage starts.\"\"\"\n    retrieve_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the retrieve state ends.\"\"\"\n    retrieve_end_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the fetch state starts.\"\"\"\n    fetch_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the fetch state ends.\"\"\"\n    fetch_end_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the download state starts.\"\"\"\n    download_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the download state ends.\"\"\"\n    download_end_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the upload state starts.\"\"\"\n    upload_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the upload state ends.\"\"\"\n    upload_end_time: t.Optional[str] = \"\"\n\n    @classmethod\n    def from_dict(cls, download_status: t.Dict) -> \"DownloadStatus\":\n        \"\"\"Instantiate DownloadStatus dataclass from dict.\"\"\"\n        download_status_instance = cls()\n        for key, value in download_status.items():\n            if key == \"status\":\n                setattr(download_status_instance, key, Status(value))\n            elif key == \"stage\" and value is not None:\n                setattr(download_status_instance, key, Stage(value))\n            else:\n                setattr(download_status_instance, key, value)\n        return download_status_instance\n\n    @classmethod\n    def to_dict(cls, instance) -> t.Dict:\n        \"\"\"Return the fields of a dataclass instance as a manifest ingestible\n        dictionary mapping of field 
names to field values.\"\"\"\n        download_status_dict = {}\n        for field in dataclasses.fields(instance):\n            key = field.name\n            value = getattr(instance, field.name)\n            if isinstance(value, Status) or isinstance(value, Stage):\n                download_status_dict[key] = value.value\n            elif isinstance(value, pd.Timestamp):\n                download_status_dict[key] = value.isoformat()\n            elif key == \"selection\" and value is not None:\n                download_status_dict[key] = json.dumps(value)\n            else:\n                download_status_dict[key] = value\n        return download_status_dict\n\n\n@dataclasses.dataclass\nclass Manifest(abc.ABC):\n    \"\"\"Abstract manifest of download statuses.\n\n    Update download statuses to some storage medium.\n\n    This class lets one indicate that a download is `scheduled` or in a transaction process.\n    In the event of a transaction, a download will be updated with an `in-progress`, `success`\n    or `failure` status (with accompanying metadata).\n\n    Example:\n        ```\n        my_manifest = parse_manifest_location(Location('fs://some-firestore-collection'))\n\n        # Schedule data for download\n        my_manifest.schedule({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username')\n\n        # ...\n\n        # Initiate a transaction – it will record that the download is `in-progess`\n        with my_manifest.transact({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username') as tx:\n            # download logic here\n            pass\n\n            # ...\n\n            # on error, will record the download as a `failure` before propagating the error.  
By default, it will\n            # record download as a `success`.\n        ```\n\n    Attributes:\n        status: The current `DownloadStatus` of the Manifest.\n    \"\"\"\n\n    # To reduce the impact of _read() and _update() calls\n    # on the start time of the stage.\n    license_id: str = \"\"\n    prev_stage_precise_start_time: t.Optional[str] = None\n    status: t.Optional[DownloadStatus] = None\n\n    # This is overridden in subclass.\n    def __post_init__(self):\n        \"\"\"Initialize the manifest.\"\"\"\n        pass\n\n    def schedule(\n        self,\n        config_name: str,\n        dataset: str,\n        selection: t.Dict,\n        location: str,\n        user: str,\n    ) -> None:\n        \"\"\"Indicate that a job has been scheduled for download.\n\n        'scheduled' jobs occur before 'in-progress', 'success' or 'finished'.\n        \"\"\"\n        scheduled_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec=\"seconds\")\n        )\n        self.status = DownloadStatus(\n            config_name=config_name,\n            dataset=dataset if dataset else None,\n            selection=selection,\n            location=location,\n            area=fetch_geo_polygon(selection.get(\"area\", GLOBAL_COVERAGE_AREA)),\n            username=user,\n            stage=None,\n            status=Status.SCHEDULED,\n            error=None,\n            size=None,\n            scheduled_time=scheduled_time,\n            retrieve_start_time=None,\n            retrieve_end_time=None,\n            fetch_start_time=None,\n            fetch_end_time=None,\n            download_start_time=None,\n            download_end_time=None,\n            upload_start_time=None,\n            upload_end_time=None,\n        )\n        self._update(self.status)\n\n    def skip(\n        self,\n        config_name: str,\n        dataset: str,\n        selection: t.Dict,\n        location: str,\n        
user: str,\n    ) -> None:\n        \"\"\"Updates the manifest to mark the shards that were skipped in the current job\n        as 'upload' stage and 'success' status, indicating that they have already been downloaded.\n        \"\"\"\n        old_status = self._read(location)\n        # The manifest needs to be updated for a skipped shard if its entry is not present, or\n        # if the stage is not 'upload', or if the stage is 'upload' but the status is not 'success'.\n        if (\n            old_status.location != location\n            or old_status.stage != Stage.UPLOAD\n            or old_status.status != Status.SUCCESS\n        ):\n            current_utc_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec=\"seconds\")\n            )\n\n            size = get_file_size(location)\n\n            status = DownloadStatus(\n                config_name=config_name,\n                dataset=dataset if dataset else None,\n                selection=selection,\n                location=location,\n                area=fetch_geo_polygon(selection.get(\"area\", GLOBAL_COVERAGE_AREA)),\n                username=user,\n                stage=Stage.UPLOAD,\n                status=Status.SUCCESS,\n                error=None,\n                size=size,\n                scheduled_time=None,\n                retrieve_start_time=None,\n                retrieve_end_time=None,\n                fetch_start_time=None,\n                fetch_end_time=None,\n                download_start_time=None,\n                download_end_time=None,\n                upload_start_time=current_utc_time,\n                upload_end_time=current_utc_time,\n            )\n            self._update(status)\n            print(\n                f\"Manifest updated for skipped shard: {location!r} -- {DownloadStatus.to_dict(status)!r}.\"\n            )\n\n    def _set_for_transaction(\n        self,\n        
config_name: str,\n        dataset: str,\n        selection: t.Dict,\n        location: str,\n        user: str,\n    ) -> None:\n        \"\"\"Reset Manifest state in preparation for a new transaction.\"\"\"\n        self.status = dataclasses.replace(self._read(location))\n        self.status.config_name = config_name\n        self.status.dataset = dataset if dataset else None\n        self.status.selection = selection\n        self.status.location = location\n        self.status.username = user\n\n    def __enter__(self) -> None:\n        pass\n\n    def __exit__(self, exc_type, exc_inst, exc_tb) -> None:\n        \"\"\"Record end status of a transaction as either 'success' or 'failure'.\"\"\"\n        if exc_type is None:\n            status = Status.SUCCESS\n            error = None\n        else:\n            status = Status.FAILURE\n            # For explanation, see https://docs.python.org/3/library/traceback.html#traceback.format_exception\n            error = f\"license_id: {self.license_id} \"\n            error += \"\\n\".join(traceback.format_exception(exc_type, exc_inst, exc_tb))\n\n        new_status = dataclasses.replace(self.status)\n        new_status.error = error\n        new_status.status = status\n        current_utc_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec=\"seconds\")\n        )\n\n        # This is necessary for setting the precise start time of the previous stage\n        # and end time of the final stage, as well as handling the case of Status.FAILURE.\n        if new_status.stage == Stage.FETCH:\n            new_status.fetch_start_time = self.prev_stage_precise_start_time\n            new_status.fetch_end_time = current_utc_time\n        elif new_status.stage == Stage.RETRIEVE:\n            new_status.retrieve_start_time = self.prev_stage_precise_start_time\n            new_status.retrieve_end_time = current_utc_time\n        elif new_status.stage 
== Stage.DOWNLOAD:\n            new_status.download_start_time = self.prev_stage_precise_start_time\n            new_status.download_end_time = current_utc_time\n        else:\n            new_status.upload_start_time = self.prev_stage_precise_start_time\n            new_status.upload_end_time = current_utc_time\n\n        new_status.size = get_file_size(new_status.location)\n\n        self.status = new_status\n\n        self._update(self.status)\n\n    def transact(\n        self,\n        config_name: str,\n        dataset: str,\n        selection: t.Dict,\n        location: str,\n        user: str,\n    ) -> \"Manifest\":\n        \"\"\"Create a download transaction.\"\"\"\n        self._set_for_transaction(config_name, dataset, selection, location, user)\n        return self\n\n    def set_stage(self, stage: Stage) -> None:\n        \"\"\"Sets the current stage in manifest.\"\"\"\n        new_status = dataclasses.replace(self.status)\n        new_status.stage = stage\n        new_status.status = Status.IN_PROGRESS\n        current_utc_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec=\"seconds\")\n        )\n\n        if stage == Stage.DOWNLOAD:\n            new_status.download_start_time = current_utc_time\n        else:\n            new_status.download_start_time = self.prev_stage_precise_start_time\n            new_status.download_end_time = current_utc_time\n            new_status.upload_start_time = current_utc_time\n\n        self.status = new_status\n        self._update(self.status)\n\n    @abc.abstractmethod\n    def _read(self, location: str) -> DownloadStatus:\n        pass\n\n    @abc.abstractmethod\n    def _update(self, download_status: DownloadStatus) -> None:\n        pass\n\n\nclass FirestoreManifest(Manifest):\n    \"\"\"A Firestore Manifest.\n    This Manifest implementation stores DownloadStatuses in a Firebase document store.\n    The document hierarchy for 
the manifest is as follows:\n      [manifest  <or manifest name, configurable from CLI>]\n      ├── doc_id (md5 hash of the path) { 'selection': {...}, 'location': ..., 'username': ... }\n      └── etc...\n    Where `[<name>]` indicates a collection and `<name> {...}` indicates a document.\n    \"\"\"\n\n    def _get_db(self) -> firestore.firestore.Client:\n        \"\"\"Acquire a firestore client, initializing the firebase app if necessary.\n        Will attempt to get the db client five times. If it's still unsuccessful, a\n        `ManifestException` will be raised.\n        \"\"\"\n        db = None\n        attempts = 0\n\n        while db is None:\n            try:\n                db = firestore.client()\n            except ValueError as e:\n                # The above call will fail with a value error when the firebase app is not initialized.\n                # Initialize the app here, and try again.\n                # Use the application default credentials.\n                cred = credentials.ApplicationDefault()\n\n                firebase_admin.initialize_app(cred)\n                print(\"Initialized Firebase App.\")\n\n                if attempts > 4:\n                    raise ManifestException(\n                        \"Exceeded number of retries to get firestore client.\"\n                    ) from e\n\n            time.sleep(get_wait_interval(attempts))\n\n            attempts += 1\n\n        return db\n\n    def _read(self, location: str) -> DownloadStatus:\n        \"\"\"Reads the JSON data from a manifest.\"\"\"\n\n        doc_id = generate_md5_hash(location)\n\n        # Update document with download status\n        download_doc_ref = self.root_document_for_store(doc_id)\n\n        result = download_doc_ref.get()\n        row = {}\n        if result.exists:\n            records = result.to_dict()\n            row = {n: to_json_serializable_type(v) for n, v in records.items()}\n        return DownloadStatus.from_dict(row)\n\n    def 
_update(self, download_status: DownloadStatus) -> None:\n        \"\"\"Update or create a download status record.\"\"\"\n        print(\"Updating Firestore Manifest.\")\n\n        status = DownloadStatus.to_dict(download_status)\n        doc_id = generate_md5_hash(status[\"location\"])\n\n        # Update document with download status\n        download_doc_ref = self.root_document_for_store(doc_id)\n\n        result: WriteResult = download_doc_ref.set(status)\n\n        print(\n            f\"Firestore manifest updated. \"\n            f\"update_time={result.update_time}, \"\n            f\"filename={download_status.location}.\"\n        )\n\n    def root_document_for_store(self, store_scheme: str) -> DocumentReference:\n        \"\"\"Get the root manifest document given the user's config and current document's storage location.\"\"\"\n        return (\n            self._get_db()\n            .collection(get_config().manifest_collection)\n            .document(store_scheme)\n        )\n"
  },
  {
    "path": "weather_dl_v2/downloader_kubernetes/util.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport datetime\nimport geojson\nimport hashlib\nimport itertools\nimport os\nimport socket\nimport subprocess\nimport sys\nimport typing as t\n\nimport numpy as np\nimport pandas as pd\nfrom apache_beam.io.gcp import gcsio\nfrom apache_beam.utils import retry\nfrom xarray.core.utils import ensure_us_time_resolution\nfrom urllib.parse import urlparse\nfrom google.api_core.exceptions import BadRequest\n\n\nLATITUDE_RANGE = (-90, 90)\nLONGITUDE_RANGE = (-180, 180)\nGLOBAL_COVERAGE_AREA = [90, -180, -90, 180]\n\n\ndef _retry_if_valid_input_but_server_or_socket_error_and_timeout_filter(\n    exception,\n) -> bool:\n    if isinstance(exception, socket.timeout):\n        return True\n    if isinstance(exception, TimeoutError):\n        return True\n    # To handle the concurrency issue in BigQuery.\n    if isinstance(exception, BadRequest):\n        return True\n    return retry.retry_if_valid_input_but_server_error_and_timeout_filter(exception)\n\n\nclass _FakeClock:\n\n    def sleep(self, value):\n        pass\n\n\ndef retry_with_exponential_backoff(fun):\n    \"\"\"A retry decorator that doesn't apply during test time.\"\"\"\n    clock = retry.Clock()\n\n    # Use a fake clock only during test time...\n    if \"unittest\" in sys.modules.keys():\n        clock = _FakeClock()\n\n    return retry.with_exponential_backoff(\n        
retry_filter=_retry_if_valid_input_but_server_or_socket_error_and_timeout_filter,\n        clock=clock,\n    )(fun)\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef ichunked(iterable: t.Iterable, n: int) -> t.Iterator[t.Iterable]:\n    \"\"\"Yield evenly-sized chunks from an iterable.\"\"\"\n    input_ = iter(iterable)\n    try:\n        while True:\n            it = itertools.islice(input_, n)\n            # peek to check if 'it' has next item.\n            first = next(it)\n            yield itertools.chain([first], it)\n    except StopIteration:\n        pass\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef copy(src: str, dst: str) -> None:\n    \"\"\"Copy data via `gcloud storage cp`.\"\"\"\n    try:\n        subprocess.run([\"gcloud\", \"storage\", \"cp\", src, dst], check=True, capture_output=True)\n    except subprocess.CalledProcessError as e:\n        print(\n            f'Failed to copy file {src!r} to {dst!r} due to {e.stderr.decode(\"utf-8\")}'\n        )\n        raise\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef to_json_serializable_type(value: t.Any) -> t.Any:\n    \"\"\"Returns the value with a type serializable to JSON\"\"\"\n    # Note: The order of processing is significant.\n    print(\"Serializing to JSON\")\n\n    if pd.isna(value) or value is None:\n        return None\n    elif np.issubdtype(type(value), np.floating):\n        return float(value)\n    elif isinstance(value, np.ndarray):\n        # Will return a scaler if array is of size 1, else will return a list.\n        return value.tolist()\n    elif (\n        isinstance(value, datetime.datetime)\n        or isinstance(value, str)\n        or isinstance(value, np.datetime64)\n    ):\n        # Assume strings are ISO format timestamps...\n        try:\n            value = datetime.datetime.fromisoformat(value)\n        except ValueError:\n            # ... 
if they are not, assume serialization is already correct.\n            return value\n        except TypeError:\n            # ... maybe value is a numpy datetime ...\n            try:\n                value = ensure_us_time_resolution(value).astype(datetime.datetime)\n            except AttributeError:\n                # ... value is a datetime object, continue.\n                pass\n\n        # We use a string timestamp representation.\n        if value.tzname():\n            return value.isoformat()\n\n        # We assume here that naive timestamps are in UTC timezone.\n        return value.replace(tzinfo=datetime.timezone.utc).isoformat()\n    elif isinstance(value, np.timedelta64):\n        # Return time delta in seconds.\n        return float(value / np.timedelta64(1, \"s\"))\n    # This check must happen after processing np.timedelta64 and np.datetime64.\n    elif np.issubdtype(type(value), np.integer):\n        return int(value)\n\n    return value\n\n\ndef fetch_geo_polygon(area: t.Union[list, str]) -> str:\n    \"\"\"Calculates a geography polygon from an input area.\"\"\"\n    # Ref: https://confluence.ecmwf.int/pages/viewpage.action?pageId=151520973\n    if isinstance(area, str):\n        # European area\n        if area == \"E\":\n            area = [73.5, -27, 33, 45]\n        # Global area\n        elif area == \"G\":\n            area = GLOBAL_COVERAGE_AREA\n        else:\n            raise RuntimeError(f\"Not a valid value for area in config: {area}.\")\n\n    n, w, s, e = [float(x) for x in area]\n    if s < LATITUDE_RANGE[0]:\n        raise ValueError(f\"Invalid latitude value for south: '{s}'\")\n    if n > LATITUDE_RANGE[1]:\n        raise ValueError(f\"Invalid latitude value for north: '{n}'\")\n    if w < LONGITUDE_RANGE[0]:\n        raise ValueError(f\"Invalid longitude value for west: '{w}'\")\n    if e > LONGITUDE_RANGE[1]:\n        raise ValueError(f\"Invalid longitude value for east: '{e}'\")\n\n    # Define the coordinates of the 
bounding box.\n    coords = [[w, n], [w, s], [e, s], [e, n], [w, n]]\n\n    # Create the GeoJSON polygon object.\n    polygon = geojson.dumps(geojson.Polygon([coords]))\n    return polygon\n\n\ndef get_file_size(path: str) -> float:\n    parsed_gcs_path = urlparse(path)\n    if parsed_gcs_path.scheme != \"gs\" or parsed_gcs_path.netloc == \"\":\n        return os.stat(path).st_size / (1024**3) if os.path.exists(path) else 0\n    else:\n        return (\n            gcsio.GcsIO().size(path) / (1024**3) if gcsio.GcsIO().exists(path) else 0\n        )\n\n\ndef get_wait_interval(num_retries: int = 0) -> float:\n    \"\"\"Returns next wait interval in seconds, using an exponential backoff algorithm.\"\"\"\n    if 0 == num_retries:\n        return 0\n    return 2**num_retries\n\n\ndef generate_md5_hash(input: str) -> str:\n    \"\"\"Generates md5 hash for the input string.\"\"\"\n    return hashlib.md5(input.encode(\"utf-8\")).hexdigest()\n\n\ndef download_with_aria2(url: str, path: str) -> None:\n    \"\"\"Downloads a file from the given URL using the `aria2c` command-line utility,\n    with options set to improve download speed and reliability.\"\"\"\n    dir_path, file_name = os.path.split(path)\n    try:\n        subprocess.run(\n            [\n                \"aria2c\",\n                \"-x\",\n                \"16\",\n                \"-s\",\n                \"16\",\n                url,\n                \"-d\",\n                dir_path,\n                \"-o\",\n                file_name,\n                \"--allow-overwrite\",\n            ],\n            check=True,\n            capture_output=True,\n        )\n    except subprocess.CalledProcessError as e:\n        print(\n            f'Failed download from server {url!r} to {path!r} due to {e.stderr.decode(\"utf-8\")}'\n        )\n        raise\n\n\ndef download_with_wget(url: str, path: str) -> None:\n    \"\"\"Downloads a file from given URL using `wget` command.\"\"\"\n    try:\n        
subprocess.run([\"wget\", url, \"-O\", path], check=True, capture_output=True)\n    except subprocess.CalledProcessError as e:\n        print(\n            f'Failed download from server {url!r} to {path!r} due to {e.stderr.decode(\"utf-8\")}.'\n        )\n        raise\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/API-Interactions.md",
    "content": "# API Interactions\n| Command | Type | Endpoint |\n|---|---|---|\n| `weather-dl-v2 ping` | `get` | `/` |\n| Download  |   |   |\n| `weather-dl-v2 download add <path> -l <license_id> [--force-download]` | `post` | `/download?force_download={value}` |\n| `weather-dl-v2 download list` | `get` | `/download/` |\n| `weather-dl-v2 download list --filter client_name=<name>` | `get` | `/download?client_name={name}` |\n| `weather-dl-v2 download get <config_name>` | `get` | `/download/{config_name}` |\n| `weather-dl-v2 download show <config_name>` | `get` | `/download/show/{config_name}` |\n| `weather-dl-v2 download remove <config_name>` | `delete` | `/download/{config_name}` |\n| `weather-dl-v2 download refetch <config_name> -l <license_id>` | `post` | `/download/refetch/{config_name}` |\n|  License |   |   |\n| `weather-dl-v2 license add <path>` | `post` | `/license/` |\n| `weather-dl-v2 license get <license_id>` | `get` | `/license/{license_id}` |\n| `weather-dl-v2 license remove <license_id>` | `delete` | `/license/{license_id}` |\n| `weather-dl-v2 license list` | `get` | `/license/` |\n| `weather-dl-v2 license list --filter client_name=<name>` | `get` | `/license?client_name={name}` |\n| `weather-dl-v2 license edit <license_id> <path>` | `put` | `/license/{license_id}` |\n| Queue  |   |   |\n| `weather-dl-v2 queue list` | `get` | `/queues/` |\n| `weather-dl-v2 queue list --filter client_name=<name>` | `get` | `/queues?client_name={name}` |\n| `weather-dl-v2 queue get <license_id>` | `get` | `/queues/{license_id}` |\n| `weather-dl-v2 queue edit <license_id> --config <config_name> --priority <number>` | `post` | `/queues/{license_id}` |\n| `weather-dl-v2 queue edit <license_id> --file <path>` | `put` | `/queues/priority/{license_id}` |"
  },
  {
    "path": "weather_dl_v2/fastapi-server/Dockerfile",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nFROM continuumio/miniconda3:latest\n\nEXPOSE 8080\n\n# Update miniconda\nRUN conda update conda -y\n\n# Add the mamba solver for faster builds\nRUN conda install -n base conda-libmamba-solver\nRUN conda config --set solver libmamba\n\nCOPY . .\n# Create conda env using environment.yml\nRUN conda env create -f environment.yml --debug\n\n# Activate the conda env and update the PATH\nARG CONDA_ENV_NAME=weather-dl-v2-server\nRUN echo \"source activate ${CONDA_ENV_NAME}\" >> ~/.bashrc\nENV PATH /opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH\n\n# Use the ping endpoint as a healthcheck,\n# so Docker knows if the API is still running ok or needs to be restarted\nHEALTHCHECK --interval=21s --timeout=3s --start-period=10s CMD curl --fail http://localhost:8080/ping || exit 1\n\nCMD [\"uvicorn\", \"main:app\", \"--host\", \"0.0.0.0\", \"--port\", \"8080\"]\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/README.md",
    "content": "# Deployment Instructions & General Notes\n\n### User authorization required to set up the environment:\n* roles/container.admin\n\n### Authorization needed for the tool to operate:\nWe are not configuring any service account here hence make sure that compute engine default service account have roles:\n* roles/pubsub.subscriber\n* roles/storage.admin\n* roles/bigquery.dataEditor\n* roles/bigquery.jobUser\n\n### Install kubectl:\n```\napt-get update\n\napt-get install -y kubectl\n```\n \n### Create cluster:\n```\nexport PROJECT_ID=anthromet-ingestion\nexport REGION=us-west1\nexport ZONE=us-west1-a\nexport CLUSTER_NAME=weather-dl-v2-cluster-2\nexport DOWNLOAD_NODE_POOL=downloader-pool\n\ngcloud beta container --project $PROJECT_ID clusters create $CLUSTER_NAME --zone $ZONE --no-enable-basic-auth --cluster-version \"1.29.5-gke.1091002\" --release-channel \"regular\" --machine-type \"e2-standard-2\" --image-type \"COS_CONTAINERD\" --disk-type \"pd-balanced\" --disk-size \"200\" --node-labels preemptible=false --metadata disable-legacy-endpoints=true --scopes \"https://www.googleapis.com/auth/cloud-platform\" --max-pods-per-node \"16\" --num-nodes \"3\" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM,STORAGE,POD,DEPLOYMENT,STATEFULSET,DAEMONSET,HPA,CADVISOR,KUBELET --enable-ip-alias --network \"projects/$PROJECT_ID/global/networks/default\" --subnetwork \"projects/$PROJECT_ID/regions/$REGION/subnetworks/default\" --no-enable-intra-node-visibility --default-max-pods-per-node \"110\" --enable-autoscaling --min-nodes \"1\" --max-nodes \"200\" --location-policy \"BALANCED\" --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --binauthz-evaluation-mode=DISABLED --enable-managed-prometheus --enable-shielded-nodes --node-locations $ZONE\n\ngcloud beta container --project $PROJECT_ID node-pools create 
$DOWNLOAD_NODE_POOL --cluster $CLUSTER_NAME --zone $ZONE --machine-type \"e2-standard-8\" --image-type \"COS_CONTAINERD\" --disk-type \"pd-balanced\" --disk-size \"1000\" --node-labels preemptible=false --metadata disable-legacy-endpoints=true --scopes \"https://www.googleapis.com/auth/cloud-platform\" --num-nodes \"1\" --enable-autoscaling --min-nodes \"1\" --max-nodes \"100\" --location-policy \"BALANCED\" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --max-pods-per-node \"16\" --node-locations $ZONE\n```\n\n### Connect to Cluster:\n```\ngcloud container clusters get-credentials $CLUSTER_NAME --zone $ZONE --project $PROJECT_ID\n```\n\n### How to create environment:\n```\nconda env create --name weather-dl-v2-server --file=environment.yml\n\nconda activate weather-dl-v2-server\n```\n\n### Make changes in weather_dl_v2/config.json, if required [for running locally]\n```\nexport CONFIG_PATH=/path/to/weather_dl_v2/config.json\n```\n\n### To run fastapi server:\n```\nuvicorn main:app --reload\n```\n\n* Open your browser at http://127.0.0.1:8000.\n\n\n### Create docker image for server:\nRefer to the instructions in weather_dl_v2/README.md\n\n### Add path of created server image in server.yaml:\n```\nPlease write down the fastAPI server's docker image path at Line 42 of server.yaml.\n```\n\n### Create ConfigMap of common configurations for services:\nMake necessary changes to weather_dl_v2/config.json and run the following command.  \nConfigMap is used for:\n- Having a common configuration file for all services.\n- Decoupling docker image and config files.\n```\nkubectl create configmap dl-v2-config --from-file=/path/to/weather_dl_v2/config.json\n```\n\n### Deploy fastapi server on kubernetes:\n```\nkubectl apply -f server.yaml --force\n```\n\n## General Commands\n### For viewing the current pods:\n```\nkubectl get pods\n```\n\n### For deleting existing deployment:\n```\nkubectl delete -f server.yaml --force\n```\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/VERSION.txt",
    "content": "1.0.12"
  },
  {
    "path": "weather_dl_v2/fastapi-server/__init__.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/config_processing/config.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport calendar\nimport copy\nimport dataclasses\nimport itertools\nimport typing as t\n\nValues = t.Union[t.List[\"Values\"], t.Dict[str, \"Values\"], bool, int, float, str]  # pytype: disable=not-supported-yet\n\n\n@dataclasses.dataclass\nclass Config:\n    \"\"\"Contains pipeline parameters.\n\n    Attributes:\n        config_name:\n            Name of the config file.\n        client:\n            Name of the Weather-API-client. Supported clients are mentioned in the 'CLIENTS' variable.\n        dataset (optional):\n            Name of the target dataset. Allowed options are dictated by the client.\n        partition_keys (optional):\n            Choose the keys from the selection section to partition the data request.\n            This will compute a cartesian cross product of the selected keys\n            and assign each as their own download.\n        target_path:\n            Download artifact filename template. Can make use of Python's standard string formatting.\n            It can contain format symbols to be replaced by partition keys;\n            if this is used, the total number of format symbols must match the number of partition keys.\n        subsection_name:\n            Name of the particular subsection. 
'default' if there is no subsection.\n        force_download:\n            Force redownload of partitions that were previously downloaded.\n        user_id:\n            Username from the environment variables.\n        kwargs (optional):\n            For representing subsections or any other parameters.\n        selection:\n            Contains parameters used to select desired data.\n    \"\"\"\n\n    config_name: str = \"\"\n    client: str = \"\"\n    dataset: t.Optional[str] = \"\"\n    target_path: str = \"\"\n    partition_keys: t.Optional[t.List[str]] = dataclasses.field(default_factory=list)\n    subsection_name: str = \"default\"\n    force_download: bool = False\n    user_id: str = \"unknown\"\n    kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict)\n    selection: t.Dict[str, Values] = dataclasses.field(default_factory=dict)\n\n    @classmethod\n    def from_dict(cls, config: t.Dict) -> \"Config\":\n        config_instance = cls()\n        for section_key, section_value in config.items():\n            if section_key == \"parameters\":\n                for key, value in section_value.items():\n                    if hasattr(config_instance, key):\n                        setattr(config_instance, key, value)\n                    else:\n                        config_instance.kwargs[key] = value\n            if section_key == \"selection\":\n                config_instance.selection = section_value\n        return config_instance\n\n\ndef optimize_selection_partition(selection: t.Dict) -> t.Dict:\n    \"\"\"Compute right-hand-side values for the selection section of a single partition.\n\n    Used to support custom syntax and optimizations, such as 'all'.\n    \"\"\"\n    selection_ = copy.deepcopy(selection)\n\n    if \"date_range\" in selection_.keys():\n        selection_[\"date\"] = selection_[\"date_range\"][0]\n        del selection_[\"date_range\"]\n\n    if \"day\" in selection_.keys() and selection_[\"day\"] == 
\"all\":\n        years, months = selection_[\"year\"], selection_[\"month\"]\n\n        multiples_error = \"When using day='all' in selection, '/' is not allowed in {type}.\"\n\n        if isinstance(years, str):\n            years = [years]\n\n        if isinstance(months, str):\n            months = [months]\n\n        date_ranges = []\n\n        # Generating dates for every year-month.\n        for year, month in itertools.product(years, months):\n\n            if isinstance(year, str):\n                assert \"/\" not in year, multiples_error.format(type=\"year\")\n\n            if isinstance(month, str):\n                assert \"/\" not in month, multiples_error.format(type=\"month\")\n\n            year, month = int(year), int(month)\n\n            _, n_days_in_month = calendar.monthrange(year, month)\n\n            date_range = [f'{year:04d}-{month:02d}-{day:02d}' for day in range(1, n_days_in_month + 1)]\n\n            date_ranges.extend(date_range)\n\n        selection_[\"date\"] = date_ranges\n        del selection_[\"day\"]\n        del selection_[\"month\"]\n        del selection_[\"year\"]\n\n    return selection_\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/config_processing/manifest.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n\"\"\"Client interface for connecting to a manifest.\"\"\"\n\nimport abc\nimport dataclasses\nimport logging\nimport datetime\nimport enum\nimport json\nimport pandas as pd\nimport time\nimport traceback\nimport typing as t\n\nfrom .util import (\n    to_json_serializable_type,\n    fetch_geo_polygon,\n    get_file_size,\n    get_wait_interval,\n    generate_md5_hash,\n    GLOBAL_COVERAGE_AREA,\n)\n\nimport firebase_admin\nfrom firebase_admin import credentials\nfrom firebase_admin import firestore\nfrom google.cloud.firestore_v1 import DocumentReference\nfrom google.cloud.firestore_v1.types import WriteResult\nfrom server_config import get_config\nfrom database.session import Database\n\n\"\"\"An implementation-dependent Manifest URI.\"\"\"\nLocation = t.NewType(\"Location\", str)\n\nlogger = logging.getLogger(__name__)\n\n\nclass ManifestException(Exception):\n    \"\"\"Errors that occur in Manifest Clients.\"\"\"\n\n    pass\n\n\nclass Stage(enum.Enum):\n    \"\"\"A request can be either in one of the following stages at a time:\n\n    fetch : This represents request is currently in fetch stage i.e. request placed on the client's server\n        & waiting for some result before starting download (eg. MARS client).\n    download : This represents request is currently in download stage i.e. 
data is being downloaded from client's\n        server to the worker's local file system.\n    upload : This represents request is currently in upload stage i.e. data is getting uploaded from worker's local\n        file system to target location (GCS path).\n    retrieve : In case of clients where there is no proper separation of fetch & download stages (eg. CDS client),\n        request will be in the retrieve stage i.e. fetch + download.\n    \"\"\"\n\n    RETRIEVE = \"retrieve\"\n    FETCH = \"fetch\"\n    DOWNLOAD = \"download\"\n    UPLOAD = \"upload\"\n\n\nclass Status(enum.Enum):\n    \"\"\"Depicts the request's state status:\n\n    scheduled : A request partition is created & scheduled for processing.\n        Note: Its corresponding state can be None only.\n    in-progress : This represents the request state is currently in-progress (i.e. running).\n        The next status would be \"success\" or \"failure\".\n    success : This represents the request state execution completed successfully without any error.\n    failure : This represents the request state execution failed.\n    \"\"\"\n\n    SCHEDULED = \"scheduled\"\n    IN_PROGRESS = \"in-progress\"\n    SUCCESS = \"success\"\n    FAILURE = \"failure\"\n\n\n@dataclasses.dataclass\nclass DownloadStatus:\n    \"\"\"Data recorded in `Manifest`s reflecting the status of a download.\"\"\"\n\n    \"\"\"The name of the config file associated with the request.\"\"\"\n    config_name: str = \"\"\n\n    \"\"\"Represents the dataset field of the configuration.\"\"\"\n    dataset: t.Optional[str] = \"\"\n\n    \"\"\"Copy of selection section of the configuration.\"\"\"\n    selection: t.Dict = dataclasses.field(default_factory=dict)\n\n    \"\"\"Location of the downloaded data.\"\"\"\n    location: str = \"\"\n\n    \"\"\"Represents area covered by the shard.\"\"\"\n    area: str = \"\"\n\n    \"\"\"Current stage of request : 'fetch', 'download', 'retrieve', 'upload' or None.\"\"\"\n    stage: t.Optional[Stage] = 
None\n\n    \"\"\"Download status: 'scheduled', 'in-progress', 'success', or 'failure'.\"\"\"\n    status: t.Optional[Status] = None\n\n    \"\"\"Cause of error, if any.\"\"\"\n    error: t.Optional[str] = \"\"\n\n    \"\"\"Identifier for the user running the download.\"\"\"\n    username: str = \"\"\n\n    \"\"\"Shard size in GB.\"\"\"\n    size: t.Optional[float] = 0\n\n    \"\"\"A UTC datetime when download was scheduled.\"\"\"\n    scheduled_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the retrieve stage starts.\"\"\"\n    retrieve_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the retrieve state ends.\"\"\"\n    retrieve_end_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the fetch state starts.\"\"\"\n    fetch_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the fetch state ends.\"\"\"\n    fetch_end_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the download state starts.\"\"\"\n    download_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the download state ends.\"\"\"\n    download_end_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the upload state starts.\"\"\"\n    upload_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the upload state ends.\"\"\"\n    upload_end_time: t.Optional[str] = \"\"\n\n    @classmethod\n    def from_dict(cls, download_status: t.Dict) -> \"DownloadStatus\":\n        \"\"\"Instantiate DownloadStatus dataclass from dict.\"\"\"\n        download_status_instance = cls()\n        for key, value in download_status.items():\n            if key == \"status\":\n                setattr(download_status_instance, key, Status(value))\n            elif key == \"stage\" and value is not None:\n                setattr(download_status_instance, key, Stage(value))\n            else:\n                setattr(download_status_instance, key, value)\n        return download_status_instance\n\n    @classmethod\n    
def to_dict(cls, instance) -> t.Dict:\n        \"\"\"Return the fields of a dataclass instance as a manifest ingestible\n        dictionary mapping of field names to field values.\"\"\"\n        download_status_dict = {}\n        for field in dataclasses.fields(instance):\n            key = field.name\n            value = getattr(instance, field.name)\n            if isinstance(value, Status) or isinstance(value, Stage):\n                download_status_dict[key] = value.value\n            elif isinstance(value, pd.Timestamp):\n                download_status_dict[key] = value.isoformat()\n            elif key == \"selection\" and value is not None:\n                download_status_dict[key] = json.dumps(value)\n            else:\n                download_status_dict[key] = value\n        return download_status_dict\n\n\n@dataclasses.dataclass\nclass Manifest(abc.ABC):\n    \"\"\"Abstract manifest of download statuses.\n\n    Update download statuses to some storage medium.\n\n    This class lets one indicate that a download is `scheduled` or in a transaction process.\n    In the event of a transaction, a download will be updated with an `in-progress`, `success`\n    or `failure` status (with accompanying metadata).\n\n    Example:\n        ```\n        my_manifest = parse_manifest_location(Location('fs://some-firestore-collection'))\n\n        # Schedule data for download\n        my_manifest.schedule({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username')\n\n        # ...\n\n        # Initiate a transaction – it will record that the download is `in-progress`\n        with my_manifest.transact({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username') as tx:\n            # download logic here\n            pass\n\n            # ...\n\n            # on error, will record the download as a `failure` before propagating the error.  
By default, it will\n            # record download as a `success`.\n        ```\n\n    Attributes:\n        status: The current `DownloadStatus` of the Manifest.\n    \"\"\"\n\n    # To reduce the impact of _read() and _update() calls\n    # on the start time of the stage.\n    prev_stage_precise_start_time: t.Optional[str] = None\n    status: t.Optional[DownloadStatus] = None\n\n    # This is overridden in subclass.\n    def __post_init__(self):\n        \"\"\"Initialize the manifest.\"\"\"\n        pass\n\n    def schedule(\n        self,\n        config_name: str,\n        dataset: str,\n        selection: t.Dict,\n        location: str,\n        user: str,\n    ) -> None:\n        \"\"\"Indicate that a job has been scheduled for download.\n\n        'scheduled' jobs occur before 'in-progress', 'success' or 'finished'.\n        \"\"\"\n        scheduled_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec=\"seconds\")\n        )\n        self.status = DownloadStatus(\n            config_name=config_name,\n            dataset=dataset if dataset else None,\n            selection=selection,\n            location=location,\n            area=fetch_geo_polygon(selection.get(\"area\", GLOBAL_COVERAGE_AREA)),\n            username=user,\n            stage=None,\n            status=Status.SCHEDULED,\n            error=None,\n            size=None,\n            scheduled_time=scheduled_time,\n            retrieve_start_time=None,\n            retrieve_end_time=None,\n            fetch_start_time=None,\n            fetch_end_time=None,\n            download_start_time=None,\n            download_end_time=None,\n            upload_start_time=None,\n            upload_end_time=None,\n        )\n        self._update(self.status)\n\n    def skip(\n        self,\n        config_name: str,\n        dataset: str,\n        selection: t.Dict,\n        location: str,\n        user: str,\n    ) -> 
None:\n        \"\"\"Updates the manifest to mark the shards that were skipped in the current job\n        as 'upload' stage and 'success' status, indicating that they have already been downloaded.\n        \"\"\"\n        old_status = self._read(location)\n        # The manifest needs to be updated for a skipped shard if its entry is not present, or\n        # if the stage is not 'upload', or if the stage is 'upload' but the status is not 'success'.\n        if (\n            old_status.location != location\n            or old_status.stage != Stage.UPLOAD\n            or old_status.status != Status.SUCCESS\n        ):\n            current_utc_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec=\"seconds\")\n            )\n\n            size = get_file_size(location)\n\n            status = DownloadStatus(\n                config_name=config_name,\n                dataset=dataset if dataset else None,\n                selection=selection,\n                location=location,\n                area=fetch_geo_polygon(selection.get(\"area\", GLOBAL_COVERAGE_AREA)),\n                username=user,\n                stage=Stage.UPLOAD,\n                status=Status.SUCCESS,\n                error=None,\n                size=size,\n                scheduled_time=None,\n                retrieve_start_time=None,\n                retrieve_end_time=None,\n                fetch_start_time=None,\n                fetch_end_time=None,\n                download_start_time=None,\n                download_end_time=None,\n                upload_start_time=current_utc_time,\n                upload_end_time=current_utc_time,\n            )\n            self._update(status)\n            logger.info(\n                f\"Manifest updated for skipped shard: {location!r} -- {DownloadStatus.to_dict(status)!r}.\"\n            )\n\n    def _set_for_transaction(\n        self,\n        config_name: 
str,\n        dataset: str,\n        selection: t.Dict,\n        location: str,\n        user: str,\n    ) -> None:\n        \"\"\"Reset Manifest state in preparation for a new transaction.\"\"\"\n        self.status = dataclasses.replace(self._read(location))\n        self.status.config_name = config_name\n        self.status.dataset = dataset if dataset else None\n        self.status.selection = selection\n        self.status.location = location\n        self.status.username = user\n\n    def __enter__(self) -> None:\n        pass\n\n    def __exit__(self, exc_type, exc_inst, exc_tb) -> None:\n        \"\"\"Record end status of a transaction as either 'success' or 'failure'.\"\"\"\n        if exc_type is None:\n            status = Status.SUCCESS\n            error = None\n        else:\n            status = Status.FAILURE\n            # For explanation, see https://docs.python.org/3/library/traceback.html#traceback.format_exception\n            error = \"\\n\".join(traceback.format_exception(exc_type, exc_inst, exc_tb))\n\n        new_status = dataclasses.replace(self.status)\n        new_status.error = error\n        new_status.status = status\n        current_utc_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec=\"seconds\")\n        )\n\n        # This is necessary for setting the precise start time of the previous stage\n        # and end time of the final stage, as well as handling the case of Status.FAILURE.\n        if new_status.stage == Stage.FETCH:\n            new_status.fetch_start_time = self.prev_stage_precise_start_time\n            new_status.fetch_end_time = current_utc_time\n        elif new_status.stage == Stage.RETRIEVE:\n            new_status.retrieve_start_time = self.prev_stage_precise_start_time\n            new_status.retrieve_end_time = current_utc_time\n        elif new_status.stage == Stage.DOWNLOAD:\n            new_status.download_start_time = 
self.prev_stage_precise_start_time\n            new_status.download_end_time = current_utc_time\n        else:\n            new_status.upload_start_time = self.prev_stage_precise_start_time\n            new_status.upload_end_time = current_utc_time\n\n        new_status.size = get_file_size(new_status.location)\n\n        self.status = new_status\n\n        self._update(self.status)\n\n    def transact(\n        self,\n        config_name: str,\n        dataset: str,\n        selection: t.Dict,\n        location: str,\n        user: str,\n    ) -> \"Manifest\":\n        \"\"\"Create a download transaction.\"\"\"\n        self._set_for_transaction(config_name, dataset, selection, location, user)\n        return self\n\n    def set_stage(self, stage: Stage) -> None:\n        \"\"\"Sets the current stage in manifest.\"\"\"\n        prev_stage = self.status.stage\n        new_status = dataclasses.replace(self.status)\n        new_status.stage = stage\n        new_status.status = Status.IN_PROGRESS\n        current_utc_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec=\"seconds\")\n        )\n\n        if stage == Stage.FETCH:\n            new_status.fetch_start_time = current_utc_time\n        elif stage == Stage.RETRIEVE:\n            new_status.retrieve_start_time = current_utc_time\n        elif stage == Stage.DOWNLOAD:\n            new_status.fetch_start_time = self.prev_stage_precise_start_time\n            new_status.fetch_end_time = current_utc_time\n            new_status.download_start_time = current_utc_time\n        else:\n            if prev_stage == Stage.DOWNLOAD:\n                new_status.download_start_time = self.prev_stage_precise_start_time\n                new_status.download_end_time = current_utc_time\n            else:\n                new_status.retrieve_start_time = self.prev_stage_precise_start_time\n                new_status.retrieve_end_time = 
current_utc_time\n            new_status.upload_start_time = current_utc_time\n\n        self.status = new_status\n        self._update(self.status)\n\n    @abc.abstractmethod\n    def _read(self, location: str) -> DownloadStatus:\n        pass\n\n    @abc.abstractmethod\n    def _update(self, download_status: DownloadStatus) -> None:\n        pass\n\n\nclass FirestoreManifest(Manifest, Database):\n    \"\"\"A Firestore Manifest.\n    This Manifest implementation stores DownloadStatuses in a Firebase document store.\n    The document hierarchy for the manifest is as follows:\n      [manifest  <or manifest name, configurable from CLI>]\n      ├── doc_id (md5 hash of the path) { 'selection': {...}, 'location': ..., 'username': ... }\n      └── etc...\n    Where `[<name>]` indicates a collection and `<name> {...}` indicates a document.\n    \"\"\"\n\n    def _get_db(self) -> firestore.firestore.Client:\n        \"\"\"Acquire a firestore client, initializing the firebase app if necessary.\n        Will attempt to get the db client five times. 
If it's still unsuccessful, a\n        `ManifestException` will be raised.\n        \"\"\"\n        db = None\n        attempts = 0\n\n        while db is None:\n            try:\n                db = firestore.client()\n            except ValueError as e:\n                # The above call will fail with a value error when the firebase app is not initialized.\n                # Initialize the app here, and try again.\n                # Use the application default credentials.\n                cred = credentials.ApplicationDefault()\n\n                firebase_admin.initialize_app(cred)\n                logger.info(\"Initialized Firebase App.\")\n\n                if attempts > 4:\n                    raise ManifestException(\n                        \"Exceeded number of retries to get firestore client.\"\n                    ) from e\n\n            time.sleep(get_wait_interval(attempts))\n\n            attempts += 1\n\n        return db\n\n    def _read(self, location: str) -> DownloadStatus:\n        \"\"\"Reads the JSON data from a manifest.\"\"\"\n\n        doc_id = generate_md5_hash(location)\n\n        # Update document with download status\n        download_doc_ref = self.root_document_for_store(doc_id)\n\n        result = download_doc_ref.get()\n        row = {}\n        if result.exists:\n            records = result.to_dict()\n            row = {n: to_json_serializable_type(v) for n, v in records.items()}\n        return DownloadStatus.from_dict(row)\n\n    def _update(self, download_status: DownloadStatus) -> None:\n        \"\"\"Update or create a download status record.\"\"\"\n        logger.info(\"Updating Firestore Manifest.\")\n\n        status = DownloadStatus.to_dict(download_status)\n        doc_id = generate_md5_hash(status[\"location\"])\n\n        # Update document with download status\n        download_doc_ref = self.root_document_for_store(doc_id)\n\n        result: WriteResult = download_doc_ref.set(status)\n\n        logger.info(\n          
  f\"Firestore manifest updated. \"\n            f\"update_time={result.update_time}, \"\n            f\"filename={download_status.location}.\"\n        )\n\n    def root_document_for_store(self, store_scheme: str) -> DocumentReference:\n        \"\"\"Get the root manifest document given the user's config and current document's storage location.\"\"\"\n        root_collection = get_config().manifest_collection\n        return self._get_db().collection(root_collection).document(store_scheme)\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/config_processing/parsers.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n\"\"\"Parsers for ECMWF download configuration.\"\"\"\n\nimport ast\nimport configparser\nimport copy as cp\nimport datetime\nimport json\nimport string\nimport textwrap\nimport typing as t\nimport numpy as np\nfrom collections import OrderedDict\nfrom dateutil.relativedelta import relativedelta\nfrom .config import Config\n\nCLIENTS = [\"cds\", \"mars\", \"ecpublic\"]\n\n\ndef date(candidate: str) -> datetime.date:\n    \"\"\"Converts ECMWF-format date strings into a `datetime.date`.\n\n    Accepted absolute date formats:\n    - YYYY-MM-DD\n    - YYYYMMDD\n    - YYYY-DDD, where DDD refers to the day of the year\n\n    For example:\n    - 2021-10-31\n    - 19700101\n    - 1950-007\n\n    See https://confluence.ecmwf.int/pages/viewpage.action?pageId=118817289 for date format spec.\n    Note: Name of month is not supported.\n    \"\"\"\n    converted = None\n\n    # Parse relative day value.\n    if candidate.startswith(\"-\"):\n        return datetime.date.today() + datetime.timedelta(days=int(candidate))\n\n    accepted_formats = [\"%Y-%m-%d\", \"%Y%m%d\", \"%Y-%j\"]\n\n    for fmt in accepted_formats:\n        try:\n            converted = datetime.datetime.strptime(candidate, fmt).date()\n            break\n        except ValueError:\n            pass\n\n    if converted is None:\n        raise ValueError(\n            f\"Not a valid date: '{candidate}'. 
Please use valid relative or absolute format.\"\n        )\n\n    return converted\n\n\ndef time(candidate: str) -> datetime.time:\n    \"\"\"Converts ECMWF-format time strings into a `datetime.time`.\n\n    Accepted time formats:\n    - HH:MM\n    - HHMM\n    - HH\n\n    For example:\n    - 18:00\n    - 1820\n    - 18\n\n    Note: If MM is omitted it defaults to 00.\n    \"\"\"\n    converted = None\n\n    accepted_formats = [\"%H\", \"%H:%M\", \"%H%M\"]\n\n    for fmt in accepted_formats:\n        try:\n            converted = datetime.datetime.strptime(candidate, fmt).time()\n            break\n        except ValueError:\n            pass\n\n    if converted is None:\n        raise ValueError(f\"Not a valid time: '{candidate}'. Please use valid format.\")\n\n    return converted\n\n\ndef day_month_year(candidate: t.Any) -> int:\n    \"\"\"Converts day, month and year strings into 'int'.\"\"\"\n    try:\n        if isinstance(candidate, str) or isinstance(candidate, int):\n            return int(candidate)\n        raise ValueError(\"must be a str or int.\")\n    except ValueError as e:\n        raise ValueError(\n            f\"Not a valid day, month, or year value: {candidate}. Please use valid value.\"\n        ) from e\n\n\ndef date_range_converter(candidate: str) -> str:\n    \"\"\"Replace / with _ to avoid directory creation.\"\"\"\n    return candidate.replace('/', '_')\n\n\ndef parse_literal(candidate: t.Any) -> t.Any:\n    try:\n        # Support parsing ints with leading zeros, e.g. 
'01'\n        if isinstance(candidate, str) and candidate.isdigit():\n            return int(candidate)\n        return ast.literal_eval(candidate)\n    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):\n        return candidate\n\n\ndef validate(key: str, value: int) -> None:\n    \"\"\"Validates value based on the key.\"\"\"\n    if key == \"day\":\n        assert 1 <= value <= 31, \"Day value must be between 1 to 31.\"\n    if key == \"month\":\n        assert 1 <= value <= 12, \"Month value must be between 1 to 12.\"\n\n\ndef typecast(key: str, value: t.Any) -> t.Any:\n    \"\"\"Type the value to its appropriate datatype.\"\"\"\n    SWITCHER = {\n        \"date\": date,\n        \"time\": time,\n        \"day\": day_month_year,\n        \"month\": day_month_year,\n        \"year\": day_month_year,\n        'date_range': date_range_converter,\n    }\n    converted = SWITCHER.get(key, parse_literal)(value)\n    validate(key, converted)\n    return converted\n\n\ndef _read_config_file(file: t.IO) -> t.Dict:\n    \"\"\"Reads `*.json` or `*.cfg` files.\"\"\"\n    try:\n        return json.load(file)\n    except json.JSONDecodeError:\n        pass\n\n    file.seek(0)\n\n    try:\n        config = configparser.ConfigParser()\n        config.read_file(file)\n        config = {s: dict(config.items(s)) for s in config.sections()}\n        return config\n    except configparser.ParsingError:\n        return {}\n\n\ndef parse_config(file: t.IO) -> t.Dict:\n    \"\"\"Parses a `*.json` or `*.cfg` file into a configuration dictionary.\"\"\"\n    config = _read_config_file(file)\n    config_by_section = {s: _parse_lists(v, s) for s, v in config.items()}\n    config_with_nesting = parse_subsections(config_by_section)\n    return config_with_nesting\n\n\ndef _splitlines(block: str) -> t.List[str]:\n    \"\"\"Converts a multi-line block into a list of strings.\"\"\"\n    return [line.strip() for line in block.strip().splitlines()]\n\n\ndef 
mars_range_value(token: str, key: str) -> t.Union[datetime.date, int, float]:\n    \"\"\"Converts a range token into either a date, int, or float.\"\"\"\n    try:\n        if key == 'year-month':\n            return datetime.datetime.strptime(token, \"%Y-%m\").date()\n        else:\n            return date(token)\n    except ValueError:\n        pass\n\n    if token.isdecimal():\n        return int(token)\n\n    try:\n        return float(token)\n    except ValueError:\n        raise ValueError(\n            \"Token string must be an 'int', 'float', or 'datetime.date()'.\"\n        )\n\n\ndef mars_increment_value(token: str) -> t.Union[int, float]:\n    \"\"\"Converts an increment token into either an int or a float.\"\"\"\n    try:\n        return int(token)\n    except ValueError:\n        pass\n\n    try:\n        return float(token)\n    except ValueError:\n        raise ValueError(\"Token string must be an 'int' or a 'float'.\")\n\n\ndef parse_mars_syntax(block: str, key: str) -> t.List[str]:\n    \"\"\"Parses MARS list or range into a list of arguments; ranges are inclusive.\n\n    Types for the range and value are inferred.\n\n    Examples:\n        >>> parse_mars_syntax(\"10/to/12\")\n        ['10', '11', '12']\n        >>> parse_mars_syntax(\"12/to/10/by/-1\")\n        ['12', '11', '10']\n        >>> parse_mars_syntax(\"0.0/to/0.5/by/0.1\")\n        ['0.0', '0.1', '0.2', '0.30000000000000004', '0.4', '0.5']\n        >>> parse_mars_syntax(\"2020-01-07/to/2020-01-14/by/2\")\n        ['2020-01-07', '2020-01-09', '2020-01-11', '2020-01-13']\n        >>> parse_mars_syntax(\"2020-01-14/to/2020-01-07/by/-2\")\n        ['2020-01-14', '2020-01-12', '2020-01-10', '2020-01-08']\n\n    Returns:\n        A list of strings representing a range from start to finish, based on the\n        type of the values in the range.\n        If all range values are integers, it will return a list of strings of integers.\n        If range values are floats, it will return a list of 
strings of floats.\n        If the range values are dates, it will return a list of strings of dates in\n        YYYY-MM-DD format. (Note: here, the increment value should be an integer).\n    \"\"\"\n\n    # Split into tokens, omitting empty strings.\n    tokens = [b.strip() for b in block.split(\"/\") if b != \"\"]\n\n    # Return list if no range operators are present.\n    if \"to\" not in tokens and \"by\" not in tokens:\n        return tokens\n\n    # Parse range values, honoring 'to' and 'by' operators.\n    try:\n        to_idx = tokens.index(\"to\")\n        assert to_idx != 0, \"There must be a start token.\"\n        start_token, end_token = tokens[to_idx - 1], tokens[to_idx + 1]\n        start, end = mars_range_value(start_token, key), mars_range_value(end_token, key)\n\n        # Parse increment token, or choose default increment.\n        increment_token = \"1\"\n        increment = 1\n        if \"by\" in tokens:\n            increment_token = tokens[tokens.index(\"by\") + 1]\n            increment = mars_increment_value(increment_token)\n    except (AssertionError, IndexError, ValueError):\n        raise SyntaxError(f\"Improper range syntax in '{block}'.\")\n\n    # Return a range of values with appropriate data type.\n    if (key == 'year-month' and isinstance(start, datetime.date) and isinstance(end, datetime.date)\n            and isinstance(increment, int)):\n        result = []\n        offset = 1 if start <= end else -1\n        if increment >= 0:\n            increment *= offset  # ensure increment has correct direction\n        current = start\n        while current <= end if offset > 0 else current >= end:\n            result.append(current.strftime(\"%Y-%m\"))\n            current += relativedelta(months=increment)\n        return result\n    elif isinstance(start, datetime.date) and isinstance(end, datetime.date) and key != 'year-month':\n        increment *= -1 if start > end and increment > 0 else 1\n        if not isinstance(increment, 
int):\n            raise ValueError(\n                f\"Increments on a date range must be integer number of days, '{increment_token}' is invalid.\"\n            )\n        return [d.strftime(\"%Y-%m-%d\") for d in date_range(start, end, increment)]\n    elif (isinstance(start, float) or isinstance(end, float)) and not isinstance(\n        increment, datetime.date\n    ):\n        # Increment can be either an int or a float.\n        _round_places = 4\n        return [\n            str(round(x, _round_places)).zfill(len(start_token))\n            for x in np.arange(start, end + increment, increment)\n        ]\n    elif isinstance(start, int) and isinstance(end, int) and isinstance(increment, int):\n        # Honor leading zeros.\n        offset = 1 if start <= end else -1\n        return [\n            str(x).zfill(len(start_token))\n            for x in range(start, end + offset, increment)\n        ]\n    else:\n        raise ValueError(\n            f\"Range tokens (start='{start_token}', end='{end_token}', increment='{increment_token}')\"\n            f\" are inconsistent types.\"\n        )\n\n\ndef date_range(\n    start: datetime.date, end: datetime.date, increment: int = 1\n) -> t.Iterable[datetime.date]:\n    \"\"\"Gets a range of dates, inclusive.\"\"\"\n    offset = 1 if start <= end else -1\n    return (\n        start + datetime.timedelta(days=x)\n        for x in range(0, (end - start).days + offset, increment)\n    )\n\n\ndef _parse_lists(config: dict, section: str = \"\") -> t.Dict:\n    \"\"\"Parses multiline blocks in *.cfg and *.json files as lists.\"\"\"\n    for key, val in config.items():\n        # Checks str type for backward compatibility since it also support \"padding\": 0 in json config\n        if not isinstance(val, str):\n            continue\n\n        if \"/\" in val and \"parameters\" not in section and key != \"date_range\":\n            config[key] = parse_mars_syntax(val, key)\n        elif \"\\n\" in val:\n            
config[key] = _splitlines(val)\n\n    return config\n\n\ndef _number_of_replacements(s: t.Text):\n    format_names = [v[1] for v in string.Formatter().parse(s) if v[1] is not None]\n    num_empty_names = len([empty for empty in format_names if empty == \"\"])\n    if num_empty_names != 0:\n        num_empty_names -= 1\n    return len(set(format_names)) + num_empty_names\n\n\ndef parse_subsections(config: t.Dict) -> t.Dict:\n    \"\"\"Interprets [section.subsection] as nested dictionaries in `.cfg` files.\"\"\"\n    copy = cp.deepcopy(config)\n    for key, val in copy.items():\n        path = key.split(\".\")\n        runner = copy\n        parent = {}\n        p = None\n        for p in path:\n            if p not in runner:\n                runner[p] = {}\n            parent = runner\n            runner = runner[p]\n        parent[p] = val\n\n    for_cleanup = [key for key, _ in copy.items() if \".\" in key]\n    for target in for_cleanup:\n        del copy[target]\n    return copy\n\n\ndef require(\n    condition: bool, message: str, error_type: t.Type[Exception] = ValueError\n) -> None:\n    \"\"\"A assert-like helper that wraps text and throws an error.\"\"\"\n    if not condition:\n        raise error_type(textwrap.dedent(message))\n\n\ndef process_config(file: t.IO, config_name: str) -> Config:\n    \"\"\"Read the config file and prompt the user if it is improperly structured.\"\"\"\n    config = parse_config(file)\n\n    require(bool(config), \"Unable to parse configuration file.\")\n    require(\n        \"parameters\" in config,\n        \"\"\"\n            'parameters' section required in configuration file.\n\n            The 'parameters' section specifies the 'client', 'dataset', 'target_path', and\n            'partition_key' for the API client.\n\n            Please consult the documentation for more information.\"\"\",\n    )\n\n    params = config.get(\"parameters\", {})\n    require(\n        \"target_template\" not in params,\n        \"\"\"\n     
       'target_template' is deprecated, use 'target_path' instead.\n\n            Please consult the documentation for more information.\"\"\",\n    )\n    require(\n        \"target_path\" in params,\n        \"\"\"\n            'parameters' section requires a 'target_path' key.\n\n            The 'target_path' is used to format the name of the output files. It\n            accepts Python 3.5+ string format symbols (e.g. '{}'). The number of symbols\n            should match the length of the 'partition_keys', as the 'partition_keys' args\n            are used to create the templates.\"\"\",\n    )\n    require(\n        \"client\" in params,\n        \"\"\"\n            'parameters' section requires a 'client' key.\n\n            Supported clients are {}\n            \"\"\".format(\n            str(CLIENTS)\n        ),\n    )\n    require(\n        params.get(\"client\") in CLIENTS,\n        \"\"\"\n            Invalid 'client' parameter.\n\n            Supported clients are {}\n            \"\"\".format(\n            str(CLIENTS)\n        ),\n    )\n    require(\n        \"append_date_dirs\" not in params,\n        \"\"\"\n            The current version of 'google-weather-tools' no longer supports 'append_date_dirs'!\n\n            Please refer to documentation for creating date-based directory hierarchy :\n            https://weather-tools.readthedocs.io/en/latest/Configuration.html#\"\"\"\n        \"\"\"creating-a-date-based-directory-hierarchy.\"\"\",\n        NotImplementedError,\n    )\n    require(\n        \"target_filename\" not in params,\n        \"\"\"\n            The current version of 'google-weather-tools' no longer supports 'target_filename'!\n\n            Please refer to documentation :\n            https://weather-tools.readthedocs.io/en/latest/Configuration.html#parameters-section.\"\"\",\n        NotImplementedError,\n    )\n\n    partition_keys = params.get(\"partition_keys\", list())\n    if isinstance(partition_keys, str):\n        
partition_keys = [partition_keys.strip()]\n\n    selection = config.get(\"selection\", dict())\n    require(\n        all((key in selection for key in partition_keys)),\n        \"\"\"\n            All 'partition_keys' must appear in the 'selection' section.\n\n            'partition_keys' specify how to split data for workers. Please consult\n            documentation for more information.\"\"\",\n    )\n\n    num_template_replacements = _number_of_replacements(params[\"target_path\"])\n    num_partition_keys = len(partition_keys)\n\n    require(\n        num_template_replacements == num_partition_keys,\n        \"\"\"\n            'target_path' has {0} replacements. Expected {1}, since there are {1}\n            partition keys.\n            \"\"\".format(\n            num_template_replacements, num_partition_keys\n        ),\n    )\n\n    if \"day\" in partition_keys:\n        require(\n            selection[\"day\"] != \"all\",\n            \"\"\"If 'all' is used for a selection value, it cannot appear as a partition key.\"\"\",\n        )\n\n    if 'hdate' in selection:\n        require('date' in partition_keys,\n                \"\"\"\"If 'hdate' is specified in the 'selection' section,\n                then 'date' is required as a partition keys.\"\"\")\n\n    if 'date_range' in selection:\n        require('date_range' in partition_keys,\n                \"\"\"\"If 'date_range' is specified in the 'selection' section,\n                then it is also required as a partition keys.\"\"\")\n\n    # Ensure consistent lookup.\n    config[\"parameters\"][\"partition_keys\"] = partition_keys\n    # Add config file name.\n    config[\"parameters\"][\"config_name\"] = config_name\n\n    # Ensure the cartesian-cross can be taken on singleton values for the partition.\n    for key in partition_keys:\n        if not isinstance(selection[key], list):\n            selection[key] = [selection[key]]\n\n    return Config.from_dict(config)\n\n\ndef prepare_target_name(config: 
Config) -> str:\n    \"\"\"Returns name of target location.\"\"\"\n    partition_dict = OrderedDict(\n        (key, typecast(key, config.selection[key][0])) for key in config.partition_keys\n    )\n    target = config.target_path.format(*partition_dict.values(), **partition_dict)\n\n    return target\n\n\ndef get_subsections(config: Config) -> t.List[t.Tuple[str, t.Dict]]:\n    \"\"\"Collect parameter subsections from main configuration.\n\n    If the `parameters` section contains subsections (e.g. '[parameters.1]',\n    '[parameters.2]'), collect the subsection key-value pairs. Otherwise,\n    return a single ('default', {}) pair (i.e. there are no subsections).\n\n    This is useful for specifying multiple API keys for your configuration.\n    For example:\n    ```\n      [parameters.alice]\n      api_key=KKKKK1\n      api_url=UUUUU1\n      [parameters.bob]\n      api_key=KKKKK2\n      api_url=UUUUU2\n      [parameters.eve]\n      api_key=KKKKK3\n      api_url=UUUUU3\n    ```\n    \"\"\"\n    return [\n        (name, params)\n        for name, params in config.kwargs.items()\n        if isinstance(params, dict)\n    ] or [(\"default\", {})]\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/config_processing/partition.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport logging\nimport copy as cp\nimport dataclasses\nimport itertools\nimport typing as t\n\nfrom .manifest import Manifest\nfrom .parsers import prepare_target_name\nfrom .config import Config\nfrom .stores import Store, FSStore\nfrom .util import generate_hdate\n\nlogger = logging.getLogger(__name__)\n\n\n@dataclasses.dataclass\nclass PartitionConfig:\n    \"\"\"Partition a config into multiple data requests.\n\n    Partitioning involves four main operations: First, we fan-out shards based on\n    partition keys (a cross product of the values). Second, we filter out existing\n    downloads (unless we want to force downloads). Last, we assemble each partition\n    into a single Config.\n\n    Attributes:\n        store: A cloud storage system, used for checking the existence of downloads.\n        manifest: A download manifest to register preparation state.\n    \"\"\"\n\n    config: Config\n    store: Store\n    manifest: Manifest\n\n    def _create_partition_config(self, option: t.Tuple) -> Config:\n        \"\"\"Create a config for a single partition option.\n\n        Output a config dictionary, overriding the range of values for\n        each key with the partition instance in 'selection'.\n        Continuing the example from prepare_partitions, the selection section\n        would be:\n        { 'foo': ..., 'year': ['2020'], 'month': ['01'], ... 
}\n        { 'foo': ..., 'year': ['2020'], 'month': ['02'], ... }\n        { 'foo': ..., 'year': ['2020'], 'month': ['03'], ... }\n\n        Args:\n            option: A single item in the range of partition_keys.\n            config: The download config, including the parameters and selection sections.\n\n        Returns:\n            A configuration with that selects a single download partition.\n        \"\"\"\n        copy = cp.deepcopy(self.config.selection)\n        out = cp.deepcopy(self.config)\n        for idx, key in enumerate(self.config.partition_keys):\n            copy[key] = [option[idx]]\n\n        # Replace hdate with actual value.\n        if 'hdate' in copy:\n            copy['hdate'] = [generate_hdate(copy['date'][0], v) for v in copy['hdate']]\n\n        out.selection = copy\n        return out\n\n    def skip_partition(self, config: Config) -> bool:\n        \"\"\"Return true if partition should be skipped.\"\"\"\n\n        if config.force_download:\n            return False\n\n        target = prepare_target_name(config)\n        if self.store.exists(target):\n            logger.info(f\"file {target} found, skipping.\")\n            self.manifest.skip(\n                config.config_name,\n                config.dataset,\n                config.selection,\n                target,\n                config.user_id,\n            )\n            return True\n\n        return False\n\n    def prepare_partitions(self) -> t.Iterator[Config]:\n        \"\"\"Iterate over client parameters, partitioning over `partition_keys`.\n\n        This produces a Cartesian-Cross over the range of keys.\n\n        For example, if the keys were 'year' and 'month', it would produce\n        an iterable like:\n            ( ('2020', '01'), ('2020', '02'), ('2020', '03'), ...)\n\n        Returns:\n            An iterator of `Config`s.\n        \"\"\"\n        for option in itertools.product(\n            *[self.config.selection[key] for key in 
self.config.partition_keys]\n        ):\n            yield self._create_partition_config(option)\n\n    def new_downloads_only(self, candidate: Config) -> bool:\n        \"\"\"Predicate function to skip already downloaded partitions.\"\"\"\n        if self.store is None:\n            self.store = FSStore()\n        should_skip = self.skip_partition(candidate)\n\n        return not should_skip\n\n    def update_manifest_collection(self, partition: Config) -> Config:\n        \"\"\"Updates the DB.\"\"\"\n        location = prepare_target_name(partition)\n        self.manifest.schedule(\n            partition.config_name,\n            partition.dataset,\n            partition.selection,\n            location,\n            partition.user_id,\n        )\n        logger.info(f\"Created partition {location!r}.\")\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/config_processing/pipeline.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport getpass\nimport logging\nimport os\nfrom .parsers import process_config\nfrom .partition import PartitionConfig\nfrom .manifest import FirestoreManifest\nfrom database.download_handler import get_download_handler\nfrom database.queue_handler import get_queue_handler\nfrom fastapi.concurrency import run_in_threadpool\n\nlogger = logging.getLogger(__name__)\n\ndownload_handler = get_download_handler()\nqueue_handler = get_queue_handler()\n\n\ndef _do_partitions(partition_obj: PartitionConfig):\n    for partition in partition_obj.prepare_partitions():\n        # Skip existing downloads\n        if partition_obj.new_downloads_only(partition):\n            partition_obj.update_manifest_collection(partition)\n\n\n# TODO: Make partitioning faster.\nasync def start_processing_config(config_file, licenses, force_download, priority = None):\n    config = {}\n    manifest = FirestoreManifest()\n\n    with open(config_file, \"r\", encoding=\"utf-8\") as f:\n        # configs/example.cfg -> example.cfg\n        config_name = os.path.split(config_file)[1]\n        config = process_config(f, config_name)\n\n    config.force_download = force_download\n    config.user_id = getpass.getuser()\n\n    partition_obj = PartitionConfig(config, None, manifest)\n\n    # Make entry in 'download' & 'queues' collection.\n    await download_handler._start_download(config_name, 
config.client)\n    await download_handler._mark_partitioning_status(\n        config_name, \"Partitioning in-progress.\"\n    )\n    try:\n        # Prepare partitions\n        await run_in_threadpool(_do_partitions, partition_obj)\n        await download_handler._mark_partitioning_status(\n            config_name, \"Partitioning completed.\"\n        )\n        for license_id in licenses:\n            await queue_handler._update_config_priority_in_license(license_id, config_name, priority)\n    except Exception as e:\n        error_str = f\"Partitioning failed for {config_name} due to {e}.\"\n        logger.error(error_str)\n        await download_handler._mark_partitioning_status(config_name, error_str)\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/config_processing/stores.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n\"\"\"Download destinations, or `Store`s.\"\"\"\n\nimport abc\nimport io\nimport os\nimport tempfile\nimport typing as t\n\nfrom apache_beam.io.filesystems import FileSystems\n\n\nclass Store(abc.ABC):\n    \"\"\"A interface to represent where downloads are stored.\n\n    Default implementation uses Apache Beam's Filesystems.\n    \"\"\"\n\n    @abc.abstractmethod\n    def open(self, filename: str, mode: str = \"r\") -> t.IO:\n        pass\n\n    @abc.abstractmethod\n    def exists(self, filename: str) -> bool:\n        pass\n\n\nclass InMemoryStore(Store):\n    \"\"\"Store file data in memory.\"\"\"\n\n    def __init__(self):\n        self.store = {}\n\n    def open(self, filename: str, mode: str = \"r\") -> t.IO:\n        \"\"\"Create or read in-memory data.\"\"\"\n        if \"b\" in mode:\n            file = io.BytesIO()\n        else:\n            file = io.StringIO()\n        self.store[filename] = file\n        return file\n\n    def exists(self, filename: str) -> bool:\n        \"\"\"Return true if the 'file' exists in memory.\"\"\"\n        return filename in self.store\n\n\nclass TempFileStore(Store):\n    \"\"\"Store data into temporary files.\"\"\"\n\n    def __init__(self, directory: t.Optional[str] = None) -> None:\n        \"\"\"Optionally specify the directory that contains all temporary files.\"\"\"\n        self.dir = directory\n        if 
self.dir and not os.path.exists(self.dir):\n            os.makedirs(self.dir)\n\n    def open(self, filename: str, mode: str = \"r\") -> t.IO:\n        \"\"\"Create a temporary file in the store directory.\"\"\"\n        return tempfile.TemporaryFile(mode, dir=self.dir)\n\n    def exists(self, filename: str) -> bool:\n        \"\"\"Return true if file exists.\"\"\"\n        return os.path.exists(filename)\n\n\nclass LocalFileStore(Store):\n    \"\"\"Store data into local files.\"\"\"\n\n    def __init__(self, directory: t.Optional[str] = None) -> None:\n        \"\"\"Optionally specify the directory that contains all downloaded files.\"\"\"\n        self.dir = directory\n        if self.dir and not os.path.exists(self.dir):\n            os.makedirs(self.dir)\n\n    def open(self, filename: str, mode: str = \"r\") -> t.IO:\n        \"\"\"Open a local file from the store directory.\"\"\"\n        return open(os.sep.join([self.dir, filename]), mode)\n\n    def exists(self, filename: str) -> bool:\n        \"\"\"Returns true if local file exists.\"\"\"\n        return os.path.exists(os.sep.join([self.dir, filename]))\n\n\nclass FSStore(Store):\n    \"\"\"Store data into any store supported by Apache Beam's FileSystems.\"\"\"\n\n    def open(self, filename: str, mode: str = \"r\") -> t.IO:\n        \"\"\"Open object in cloud bucket (or local file system) as a read or write channel.\n\n        To work with cloud storage systems, only a read or write channel can be openend\n        at one time. 
Data will be treated as bytes, not text (equivalent to `rb` or `wb`).\n\n        Further, append operations, or writes on existing objects, are disallowed (the\n        error thrown will depend on the implementation of the underlying cloud provider).\n        \"\"\"\n        if \"r\" in mode and \"w\" not in mode:\n            return FileSystems().open(filename)\n\n        if \"w\" in mode and \"r\" not in mode:\n            return FileSystems().create(filename)\n\n        raise ValueError(\n            f\"invalid mode {mode!r}: mode must have either 'r' or 'w', but not both.\"\n        )\n\n    def exists(self, filename: str) -> bool:\n        \"\"\"Returns true if object exists.\"\"\"\n        return FileSystems().exists(filename)\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/config_processing/util.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom dateutil.relativedelta import relativedelta\nimport logging\nimport datetime\nimport geojson\nimport hashlib\nimport itertools\nimport os\nimport socket\nimport subprocess\nimport sys\nimport typing as t\n\nimport numpy as np\nimport pandas as pd\nfrom apache_beam.io.gcp import gcsio\nfrom apache_beam.utils import retry\nfrom xarray.core.utils import ensure_us_time_resolution\nfrom urllib.parse import urlparse\nfrom google.api_core.exceptions import BadRequest\n\n\nLATITUDE_RANGE = (-90, 90)\nLONGITUDE_RANGE = (-180, 180)\nGLOBAL_COVERAGE_AREA = [90, -180, -90, 180]\n\nlogger = logging.getLogger(__name__)\n\n\ndef _retry_if_valid_input_but_server_or_socket_error_and_timeout_filter(\n    exception,\n) -> bool:\n    if isinstance(exception, socket.timeout):\n        return True\n    if isinstance(exception, TimeoutError):\n        return True\n    # To handle the concurrency issue in BigQuery.\n    if isinstance(exception, BadRequest):\n        return True\n    return retry.retry_if_valid_input_but_server_error_and_timeout_filter(exception)\n\n\nclass _FakeClock:\n\n    def sleep(self, value):\n        pass\n\n\ndef retry_with_exponential_backoff(fun):\n    \"\"\"A retry decorator that doesn't apply during test time.\"\"\"\n    clock = retry.Clock()\n\n    # Use a fake clock only during test time...\n    if \"unittest\" in sys.modules.keys():\n        clock = 
_FakeClock()\n\n    return retry.with_exponential_backoff(\n        retry_filter=_retry_if_valid_input_but_server_or_socket_error_and_timeout_filter,\n        clock=clock,\n    )(fun)\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef ichunked(iterable: t.Iterable, n: int) -> t.Iterator[t.Iterable]:\n    \"\"\"Yield evenly-sized chunks from an iterable.\"\"\"\n    input_ = iter(iterable)\n    try:\n        while True:\n            it = itertools.islice(input_, n)\n            # peek to check if 'it' has next item.\n            first = next(it)\n            yield itertools.chain([first], it)\n    except StopIteration:\n        pass\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef copy(src: str, dst: str) -> None:\n    \"\"\"Copy data via `gcloud storage cp`.\"\"\"\n    try:\n        subprocess.run([\"gcloud\", \"storage\", \"cp\", src, dst], check=True, capture_output=True)\n    except subprocess.CalledProcessError as e:\n        logger.info(\n            f'Failed to copy file {src!r} to {dst!r} due to {e.stderr.decode(\"utf-8\")}.'\n        )\n        raise\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef to_json_serializable_type(value: t.Any) -> t.Any:\n    \"\"\"Returns the value with a type serializable to JSON\"\"\"\n    # Note: The order of processing is significant.\n    logger.info(\"Serializing to JSON.\")\n\n    if pd.isna(value) or value is None:\n        return None\n    elif np.issubdtype(type(value), np.floating):\n        return float(value)\n    elif isinstance(value, np.ndarray):\n        # Will return a scaler if array is of size 1, else will return a list.\n        return value.tolist()\n    elif (\n        isinstance(value, datetime.datetime)\n        or isinstance(value, str)\n        or isinstance(value, np.datetime64)\n    ):\n        # Assume strings are ISO format timestamps...\n        try:\n            value = datetime.datetime.fromisoformat(value)\n        except ValueError:\n            # 
... if they are not, assume serialization is already correct.\n            return value\n        except TypeError:\n            # ... maybe value is a numpy datetime ...\n            try:\n                value = ensure_us_time_resolution(value).astype(datetime.datetime)\n            except AttributeError:\n                # ... value is a datetime object, continue.\n                pass\n\n        # We use a string timestamp representation.\n        if value.tzname():\n            return value.isoformat()\n\n        # We assume here that naive timestamps are in UTC timezone.\n        return value.replace(tzinfo=datetime.timezone.utc).isoformat()\n    elif isinstance(value, np.timedelta64):\n        # Return time delta in seconds.\n        return float(value / np.timedelta64(1, \"s\"))\n    # This check must happen after processing np.timedelta64 and np.datetime64.\n    elif np.issubdtype(type(value), np.integer):\n        return int(value)\n\n    return value\n\n\ndef fetch_geo_polygon(area: t.Union[list, str]) -> str:\n    \"\"\"Calculates a geography polygon from an input area.\"\"\"\n    # Ref: https://confluence.ecmwf.int/pages/viewpage.action?pageId=151520973\n    if isinstance(area, str):\n        # European area\n        if area == \"E\":\n            area = [73.5, -27, 33, 45]\n        # Global area\n        elif area == \"G\":\n            area = GLOBAL_COVERAGE_AREA\n        else:\n            raise RuntimeError(f\"Not a valid value for area in config: {area}.\")\n\n    n, w, s, e = [float(x) for x in area]\n    if s < LATITUDE_RANGE[0]:\n        raise ValueError(f\"Invalid latitude value for south: '{s}'\")\n    if n > LATITUDE_RANGE[1]:\n        raise ValueError(f\"Invalid latitude value for north: '{n}'\")\n    if w < LONGITUDE_RANGE[0]:\n        raise ValueError(f\"Invalid longitude value for west: '{w}'\")\n    if e > LONGITUDE_RANGE[1]:\n        raise ValueError(f\"Invalid longitude value for east: '{e}'\")\n\n    # Define the coordinates of the 
bounding box.\n    coords = [[w, n], [w, s], [e, s], [e, n], [w, n]]\n\n    # Create the GeoJSON polygon object.\n    polygon = geojson.dumps(geojson.Polygon([coords]))\n    return polygon\n\n\ndef get_file_size(path: str) -> float:\n    parsed_gcs_path = urlparse(path)\n    if parsed_gcs_path.scheme != \"gs\" or parsed_gcs_path.netloc == \"\":\n        return os.stat(path).st_size / (1024**3) if os.path.exists(path) else 0\n    else:\n        return (\n            gcsio.GcsIO().size(path) / (1024**3) if gcsio.GcsIO().exists(path) else 0\n        )\n\n\ndef get_wait_interval(num_retries: int = 0) -> float:\n    \"\"\"Returns next wait interval in seconds, using an exponential backoff algorithm.\"\"\"\n    if 0 == num_retries:\n        return 0\n    return 2**num_retries\n\n\ndef generate_md5_hash(input: str) -> str:\n    \"\"\"Generates md5 hash for the input string.\"\"\"\n    return hashlib.md5(input.encode(\"utf-8\")).hexdigest()\n\n\ndef download_with_aria2(url: str, path: str) -> None:\n    \"\"\"Downloads a file from the given URL using the `aria2c` command-line utility,\n    with options set to improve download speed and reliability.\"\"\"\n    dir_path, file_name = os.path.split(path)\n    try:\n        subprocess.run(\n            [\n                \"aria2c\",\n                \"-x\",\n                \"16\",\n                \"-s\",\n                \"16\",\n                url,\n                \"-d\",\n                dir_path,\n                \"-o\",\n                file_name,\n                \"--allow-overwrite\",\n            ],\n            check=True,\n            capture_output=True,\n        )\n    except subprocess.CalledProcessError as e:\n        logger.info(\n            f'Failed download from server {url!r} to {path!r} due to {e.stderr.decode(\"utf-8\")}.'\n        )\n        raise\n\ndef generate_hdate(date: str, subtract_year: str) -> str:\n    \"\"\"Generate a historical date by subtracting a specified number of years from the given 
date.\n    If input date is leap day (Feb 29), return Feb 28 even if target hdate is also a leap year.\n    This is expected in ECMWF API.\n\n    Args:\n        date (str): The input date in the format 'YYYY-MM-DD'.\n        subtract_year (str): The number of years to subtract.\n\n    Returns:\n        str: The historical date in the format 'YYYY-MM-DD'.\n    \"\"\"\n    try:\n        input_date = datetime.datetime.strptime(date, \"%Y-%m-%d\")\n        # Check for leap day\n        if input_date.month == 2 and input_date.day == 29:\n            input_date = input_date - datetime.timedelta(days=1)\n        subtract_year = int(subtract_year)\n    except (ValueError, TypeError):\n        logger.error(\"Invalid input.\")\n        raise\n\n    hdate = input_date - relativedelta(years=subtract_year)\n    return hdate.strftime(\"%Y-%m-%d\")\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/database/__init__.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/database/download_handler.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport abc\nimport logging\nfrom firebase_admin import firestore\nfrom google.cloud.firestore_v1 import DocumentSnapshot, FieldFilter\nfrom google.cloud.firestore_v1.types import WriteResult\nfrom database.session import get_async_client\nfrom server_config import get_config\n\nlogger = logging.getLogger(__name__)\n\n\ndef get_download_handler():\n    return DownloadHandlerFirestore(db=get_async_client())\n\n\ndef get_mock_download_handler():\n    return DownloadHandlerMock()\n\n\nclass DownloadHandler(abc.ABC):\n\n    @abc.abstractmethod\n    async def _start_download(self, config_name: str, client_name: str) -> None:\n        pass\n\n    @abc.abstractmethod\n    async def _stop_download(self, config_name: str) -> None:\n        pass\n\n    @abc.abstractmethod\n    async def _mark_partitioning_status(self, config_name: str, status: str) -> None:\n        pass\n\n    @abc.abstractmethod\n    async def _check_download_exists(self, config_name: str) -> bool:\n        pass\n\n    @abc.abstractmethod\n    async def _get_downloads(self, client_name: str) -> list:\n        pass\n\n    @abc.abstractmethod\n    async def _get_download_by_config_name(self, config_name: str):\n        pass\n\n\nclass DownloadHandlerMock(DownloadHandler):\n\n    def __init__(self):\n        pass\n\n    async def _start_download(self, config_name: str, client_name: str) -> None:\n        
logger.info(\n            f\"Added {config_name} in 'download' collection. Update_time: 000000.\"\n        )\n\n    async def _stop_download(self, config_name: str) -> None:\n        logger.info(\n            f\"Removed {config_name} in 'download' collection. Update_time: 000000.\"\n        )\n\n    async def _mark_partitioning_status(self, config_name: str, status: str) -> None:\n        logger.info(\n            f\"Updated {config_name} in 'download' collection. Update_time: 000000.\"\n        )\n\n    async def _check_download_exists(self, config_name: str) -> bool:\n        if config_name == \"not_exist\":\n            return False\n        elif config_name == \"not_exist.cfg\":\n            return False\n        else:\n            return True\n\n    async def _get_downloads(self, client_name: str) -> list:\n        return [{\"config_name\": \"example.cfg\", \"client_name\": \"client\", \"status\": \"partitioning completed.\"}]\n\n    async def _get_download_by_config_name(self, config_name: str):\n        if config_name == \"not_exist\":\n            return None\n        return {\"config_name\": \"example.cfg\", \"client_name\": \"client\", \"status\": \"partitioning completed.\"}\n\n\nclass DownloadHandlerFirestore(DownloadHandler):\n\n    def __init__(self, db: firestore.firestore.Client):\n        self.db = db\n        self.collection = get_config().download_collection\n\n    async def _start_download(self, config_name: str, client_name: str) -> None:\n        result: WriteResult = (\n            await self.db.collection(self.collection)\n            .document(config_name)\n            .set({\"config_name\": config_name, \"client_name\": client_name})\n        )\n\n        logger.info(\n            f\"Added {config_name} in 'download' collection. 
Update_time: {result.update_time}.\"\n        )\n\n    async def _stop_download(self, config_name: str) -> None:\n        timestamp = (\n            await self.db.collection(self.collection).document(config_name).delete()\n        )\n        logger.info(\n            f\"Removed {config_name} in 'download' collection. Update_time: {timestamp}.\"\n        )\n\n    async def _mark_partitioning_status(self, config_name: str, status: str) -> None:\n        timestamp = (\n            await self.db.collection(self.collection)\n            .document(config_name)\n            .update({\"status\": status})\n        )\n        logger.info(\n            f\"Updated {config_name} in 'download' collection. Update_time: {timestamp}.\"\n        )\n\n    async def _check_download_exists(self, config_name: str) -> bool:\n        result: DocumentSnapshot = (\n            await self.db.collection(self.collection).document(config_name).get()\n        )\n        return result.exists\n\n    async def _get_downloads(self, client_name: str) -> list:\n        docs = []\n        if client_name:\n            docs = (\n                self.db.collection(self.collection)\n                .where(filter=FieldFilter(\"client_name\", \"==\", client_name))\n                .stream()\n            )\n        else:\n            docs = self.db.collection(self.collection).stream()\n\n        return [doc.to_dict() async for doc in docs]\n\n    async def _get_download_by_config_name(self, config_name: str):\n        result: DocumentSnapshot = (\n            await self.db.collection(self.collection).document(config_name).get()\n        )\n        if result.exists:\n            return result.to_dict()\n        else:\n            return None\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/database/license_handler.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport abc\nimport logging\nfrom firebase_admin import firestore\nfrom google.cloud.firestore_v1 import DocumentSnapshot, FieldFilter\nfrom google.cloud.firestore_v1.types import WriteResult\nfrom database.session import get_async_client\nfrom server_config import get_config\n\n\nlogger = logging.getLogger(__name__)\n\n\ndef get_license_handler():\n    return LicenseHandlerFirestore(db=get_async_client())\n\n\ndef get_mock_license_handler():\n    return LicenseHandlerMock()\n\n\nclass LicenseHandler(abc.ABC):\n\n    @abc.abstractmethod\n    async def _add_license(self, license_dict: dict) -> str:\n        pass\n\n    @abc.abstractmethod\n    async def _delete_license(self, license_id: str) -> None:\n        pass\n\n    @abc.abstractmethod\n    async def _check_license_exists(self, license_id: str) -> bool:\n        pass\n\n    @abc.abstractmethod\n    async def _get_license_by_license_id(self, license_id: str) -> dict:\n        pass\n\n    @abc.abstractmethod\n    async def _get_license_by_client_name(self, client_name: str) -> list:\n        pass\n\n    @abc.abstractmethod\n    async def _get_licenses(self) -> list:\n        pass\n\n    @abc.abstractmethod\n    async def _update_license(self, license_id: str, license_dict: dict) -> None:\n        pass\n\n    @abc.abstractmethod\n    async def _get_license_without_deployment(self) -> list:\n        pass\n\n    
@abc.abstractmethod\n    async def _mark_license_status(self, license_id: str, status: str) -> None:\n        pass\n\nclass LicenseHandlerMock(LicenseHandler):\n\n    def __init__(self):\n        pass\n\n    async def _add_license(self, license_dict: dict) -> str:\n        license_id = \"L1\"\n        logger.info(f\"Added {license_id} in 'license' collection. Update_time: 00000.\")\n        return license_id\n\n    async def _delete_license(self, license_id: str) -> None:\n        logger.info(\n            f\"Removed {license_id} in 'license' collection. Update_time: 00000.\"\n        )\n\n    async def _update_license(self, license_id: str, license_dict: dict) -> None:\n        logger.info(\n            f\"Updated {license_id} in 'license' collection. Update_time: 00000.\"\n        )\n\n    async def _check_license_exists(self, license_id: str) -> bool:\n        if license_id == \"not_exist\":\n            return False\n        elif license_id == \"no-exists\":\n            return False\n        else:\n            return True\n\n    async def _get_license_by_license_id(self, license_id: str) -> dict:\n        if license_id == \"not_exist\":\n            return None\n        return {\n            \"license_id\": license_id,\n            \"secret_id\": \"xxxx\",\n            \"client_name\": \"dummy_client\",\n            \"k8s_deployment_id\": \"k1\",\n            \"number_of_requets\": 100,\n        }\n\n    async def _get_license_by_client_name(self, client_name: str) -> list:\n        return [{\n            \"license_id\": \"L1\",\n            \"secret_id\": \"xxxx\",\n            \"client_name\": client_name,\n            \"k8s_deployment_id\": \"k1\",\n            \"number_of_requets\": 100,\n        }]\n\n    async def _get_licenses(self) -> list:\n        return [{\n            \"license_id\": \"L1\",\n            \"secret_id\": \"xxxx\",\n            \"client_name\": \"dummy_client\",\n            \"k8s_deployment_id\": \"k1\",\n            
\"number_of_requets\": 100,\n        }]\n\n    async def _get_license_without_deployment(self) -> list:\n        return []\n\n\nclass LicenseHandlerFirestore(LicenseHandler):\n\n    def __init__(self, db: firestore.firestore.AsyncClient):\n        self.db = db\n        self.collection = get_config().license_collection\n\n    async def _add_license(self, license_dict: dict) -> str:\n        license_dict[\"license_id\"] = license_dict[\"license_id\"].lower()\n        license_id = license_dict[\"license_id\"]\n\n        result: WriteResult = (\n            await self.db.collection(self.collection)\n            .document(license_id)\n            .set(license_dict)\n        )\n        logger.info(\n            f\"Added {license_id} in 'license' collection. Update_time: {result.update_time}.\"\n        )\n        return license_id\n\n    async def _delete_license(self, license_id: str) -> None:\n        timestamp = (\n            await self.db.collection(self.collection).document(license_id).delete()\n        )\n        logger.info(\n            f\"Removed {license_id} in 'license' collection. Update_time: {timestamp}.\"\n        )\n\n    async def _update_license(self, license_id: str, license_dict: dict) -> None:\n        result: WriteResult = (\n            await self.db.collection(self.collection)\n            .document(license_id)\n            .update(license_dict)\n        )\n        logger.info(\n            f\"Updated {license_id} in 'license' collection. 
Update_time: {result.update_time}.\"\n        )\n\n    async def _check_license_exists(self, license_id: str) -> bool:\n        result: DocumentSnapshot = (\n            await self.db.collection(self.collection).document(license_id).get()\n        )\n        return result.exists\n\n    async def _get_license_by_license_id(self, license_id: str) -> dict:\n        result: DocumentSnapshot = (\n            await self.db.collection(self.collection).document(license_id).get()\n        )\n        return result.to_dict()\n\n    async def _get_license_by_client_name(self, client_name: str) -> list:\n        docs = (\n            self.db.collection(self.collection)\n            .where(filter=FieldFilter(\"client_name\", \"==\", client_name))\n            .stream()\n        )\n        return [doc.to_dict() async for doc in docs]\n\n    async def _get_licenses(self) -> list:\n        docs = self.db.collection(self.collection).stream()\n        return [doc.to_dict() async for doc in docs]\n\n    async def _get_license_without_deployment(self) -> list:\n        docs = (\n            self.db.collection(self.collection)\n            .where(filter=FieldFilter(\"k8s_deployment_id\", \"==\", \"\"))\n            .stream()\n        )\n        return [doc.to_dict() async for doc in docs]\n\n    async def _mark_license_status(self, license_id: str, status: str) -> None:\n        timestamp = (\n            await self.db.collection(self.collection)\n            .document(license_id)\n            .update({\"status\": status})\n        )\n        logger.info(\n            f\"Updated {license_id} in 'license' collection. Update_time: {timestamp}.\"\n        )\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/database/manifest_handler.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport abc\nimport logging\nfrom firebase_admin import firestore\nfrom google.cloud.firestore_v1.base_query import FieldFilter, Or, And\nfrom server_config import get_config\nfrom database.session import get_async_client\n\nlogger = logging.getLogger(__name__)\n\n\ndef get_manifest_handler():\n    return ManifestHandlerFirestore(db=get_async_client())\n\n\ndef get_mock_manifest_handler():\n    return ManifestHandlerMock()\n\n\nclass ManifestHandler(abc.ABC):\n\n    @abc.abstractmethod\n    async def _get_download_success_count(self, config_name: str) -> int:\n        pass\n\n    @abc.abstractmethod\n    async def _get_download_failure_count(self, config_name: str) -> int:\n        pass\n\n    @abc.abstractmethod\n    async def _get_download_scheduled_count(self, config_name: str) -> int:\n        pass\n\n    @abc.abstractmethod\n    async def _get_download_inprogress_count(self, config_name: str) -> int:\n        pass\n\n    @abc.abstractmethod\n    async def _get_download_total_count(self, config_name: str) -> int:\n        pass\n\n    @abc.abstractmethod\n    async def _get_non_successfull_downloads(self, config_name: str) -> list:\n        pass\n\n    @abc.abstractmethod\n    async def _get_failed_downloads(self, config_name: str) -> list:\n        pass\n\n\nclass ManifestHandlerMock(ManifestHandler):\n\n    async def _get_download_failure_count(self, 
config_name: str) -> int:\n        return 0\n\n    async def _get_download_inprogress_count(self, config_name: str) -> int:\n        return 0\n\n    async def _get_download_scheduled_count(self, config_name: str) -> int:\n        return 0\n\n    async def _get_download_success_count(self, config_name: str) -> int:\n        return 0\n\n    async def _get_download_total_count(self, config_name: str) -> int:\n        return 0\n\n    async def _get_non_successfull_downloads(self, config_name: str) -> list:\n        return []\n\n    async def _get_failed_downloads(self, config_name: str) -> list:\n        return []\n\n\nclass ManifestHandlerFirestore(ManifestHandler):\n\n    def __init__(self, db: firestore.firestore.Client):\n        self.db = db\n        self.collection = get_config().manifest_collection\n\n    async def _get_download_success_count(self, config_name: str) -> int:\n        result = (\n            await self.db.collection(self.collection)\n            .where(filter=FieldFilter(\"config_name\", \"==\", config_name))\n            .where(filter=FieldFilter(\"stage\", \"==\", \"upload\"))\n            .where(filter=FieldFilter(\"status\", \"==\", \"success\"))\n            .count()\n            .get()\n        )\n\n        count = result[0][0].value\n\n        return count\n\n    async def _get_download_failure_count(self, config_name: str) -> int:\n        result = (\n            await self.db.collection(self.collection)\n            .where(filter=FieldFilter(\"config_name\", \"==\", config_name))\n            .where(filter=FieldFilter(\"status\", \"==\", \"failure\"))\n            .count()\n            .get()\n        )\n\n        count = result[0][0].value\n\n        return count\n\n    async def _get_download_scheduled_count(self, config_name: str) -> int:\n        result = (\n            await self.db.collection(self.collection)\n            .where(filter=FieldFilter(\"config_name\", \"==\", config_name))\n            
.where(filter=FieldFilter(\"status\", \"==\", \"scheduled\"))\n            .count()\n            .get()\n        )\n\n        count = result[0][0].value\n\n        return count\n\n    async def _get_download_inprogress_count(self, config_name: str) -> int:\n        and_filter = And(\n            filters=[\n                FieldFilter(\"status\", \"==\", \"success\"),\n                FieldFilter(\"stage\", \"!=\", \"upload\"),\n            ]\n        )\n        or_filter = Or(filters=[\n            FieldFilter(\"status\", \"==\", \"in-progress\"),\n            FieldFilter(\"status\", \"==\", \"processing\"),\n            and_filter]\n        )\n\n        result = (\n            await self.db.collection(self.collection)\n            .where(filter=FieldFilter(\"config_name\", \"==\", config_name))\n            .where(filter=or_filter)\n            .count()\n            .get()\n        )\n\n        count = result[0][0].value\n\n        return count\n\n    async def _get_download_total_count(self, config_name: str) -> int:\n        result = (\n            await self.db.collection(self.collection)\n            .where(filter=FieldFilter(\"config_name\", \"==\", config_name))\n            .count()\n            .get()\n        )\n\n        count = result[0][0].value\n\n        return count\n\n    async def _get_non_successfull_downloads(self, config_name: str) -> list:\n        or_filter = Or(\n            filters=[\n                FieldFilter(\"stage\", \"==\", \"fetch\"),\n                FieldFilter(\"stage\", \"==\", \"download\"),\n                FieldFilter(\"stage\", \"==\", None),\n                And(\n                    filters=[\n                        FieldFilter(\"status\", \"!=\", \"success\"),\n                        FieldFilter(\"stage\", \"==\", \"upload\"),\n                    ]\n                ),\n            ]\n        )\n\n        docs = (\n            self.db.collection(self.collection)\n            .where(filter=FieldFilter(\"config_name\", 
\"==\", config_name))\n            .where(filter=or_filter)\n            .stream()\n        )\n        return [doc.to_dict() async for doc in docs]\n\n    async def _get_failed_downloads(self, config_name: str) -> list:\n        docs = (\n            self.db.collection(self.collection)\n            .where(filter=FieldFilter(\"config_name\", \"==\", config_name))\n            .where(filter=FieldFilter(\"status\", \"==\", \"failure\"))\n            .stream()\n        )\n        return [doc.to_dict() async for doc in docs]\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/database/queue_handler.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport abc\nimport logging\nfrom firebase_admin import firestore\nfrom google.cloud.firestore_v1 import DocumentSnapshot, FieldFilter\nfrom google.cloud.firestore_v1.types import WriteResult\nfrom database.session import get_async_client\nfrom server_config import get_config\n\nlogger = logging.getLogger(__name__)\n\n\ndef get_queue_handler():\n    return QueueHandlerFirestore(db=get_async_client())\n\n\ndef get_mock_queue_handler():\n    return QueueHandlerMock()\n\n\nclass QueueHandler(abc.ABC):\n\n    @abc.abstractmethod\n    async def _create_license_queue(self, license_id: str, client_name: str) -> None:\n        pass\n\n    @abc.abstractmethod\n    async def _remove_license_queue(self, license_id: str) -> None:\n        pass\n\n    @abc.abstractmethod\n    async def _get_queues(self) -> list:\n        pass\n\n    @abc.abstractmethod\n    async def _get_queue_by_license_id(self, license_id: str) -> dict:\n        pass\n\n    @abc.abstractmethod\n    async def _get_queue_by_client_name(self, client_name: str) -> list:\n        pass\n\n    @abc.abstractmethod\n    async def _update_license_queue(self, license_id: str, priority_list: list) -> None:\n        pass\n\n    @abc.abstractmethod\n    async def _update_queues_on_start_download(\n        self, config_name: str, licenses: list\n    ) -> None:\n        pass\n\n    @abc.abstractmethod\n    async def 
_update_queues_on_stop_download(self, config_name: str) -> None:\n        pass\n\n    @abc.abstractmethod\n    async def _update_config_priority_in_license(\n        self, license_id: str, config_name: str, priority: int\n    ) -> None:\n        pass\n\n    @abc.abstractmethod\n    async def _update_client_name_in_license_queue(\n        self, license_id: str, client_name: str\n    ) -> None:\n        pass\n\n\nclass QueueHandlerMock(QueueHandler):\n\n    def __init__(self):\n        pass\n\n    async def _create_license_queue(self, license_id: str, client_name: str) -> None:\n        logger.info(\n            f\"Added {license_id} queue in 'queues' collection. Update_time: 000000.\"\n        )\n\n    async def _remove_license_queue(self, license_id: str) -> None:\n        logger.info(\n            f\"Removed {license_id} queue in 'queues' collection. Update_time: 000000.\"\n        )\n\n    async def _get_queues(self) -> list:\n        return [{\"client_name\": \"dummy_client\", \"license_id\": \"L1\", \"queue\": []}]\n\n    async def _get_queue_by_license_id(self, license_id: str) -> dict:\n        if license_id == \"not_exist\":\n            return None\n        return {\"client_name\": \"dummy_client\", \"license_id\": license_id, \"queue\": []}\n\n    async def _get_queue_by_client_name(self, client_name: str) -> list:\n        return [{\"client_name\": client_name, \"license_id\": \"L1\", \"queue\": []}]\n\n    async def _update_license_queue(self, license_id: str, priority_list: list) -> None:\n        logger.info(\n            f\"Updated {license_id} queue in 'queues' collection. Update_time: 00000.\"\n        )\n\n    async def _update_queues_on_start_download(\n        self, config_name: str, licenses: list\n    ) -> None:\n        logger.info(\n            f\"Updated {license} queue in 'queues' collection. 
Update_time: 00000.\"\n        )\n\n    async def _update_queues_on_stop_download(self, config_name: str) -> None:\n        logger.info(\n            \"Updated snapshot.id queue in 'queues' collection. Update_time: 00000.\"\n        )\n\n    async def _update_config_priority_in_license(\n        self, license_id: str, config_name: str, priority: int\n    ) -> None:\n        logger.info(\n            \"Updated snapshot.id queue in 'queues' collection. Update_time: 00000.\"\n        )\n\n    async def _update_client_name_in_license_queue(\n        self, license_id: str, client_name: str\n    ) -> None:\n        logger.info(\n            \"Updated snapshot.id queue in 'queues' collection. Update_time: 00000.\"\n        )\n\n\nclass QueueHandlerFirestore(QueueHandler):\n\n    def __init__(self, db: firestore.firestore.Client):\n        self.db = db\n        self.collection = get_config().queues_collection\n\n    async def _create_license_queue(self, license_id: str, client_name: str) -> None:\n        result: WriteResult = (\n            await self.db.collection(self.collection)\n            .document(license_id)\n            .set({\"license_id\": license_id, \"client_name\": client_name, \"queue\": []})\n        )\n        logger.info(\n            f\"Added {license_id} queue in 'queues' collection. Update_time: {result.update_time}.\"\n        )\n\n    async def _remove_license_queue(self, license_id: str) -> None:\n        timestamp = (\n            await self.db.collection(self.collection).document(license_id).delete()\n        )\n        logger.info(\n            f\"Removed {license_id} queue in 'queues' collection. 
Update_time: {timestamp}.\"\n        )\n\n    async def _get_queues(self) -> list:\n        docs = self.db.collection(self.collection).stream()\n        return [doc.to_dict() async for doc in docs]\n\n    async def _get_queue_by_license_id(self, license_id: str) -> dict:\n        result: DocumentSnapshot = (\n            await self.db.collection(self.collection).document(license_id).get()\n        )\n        return result.to_dict()\n\n    async def _get_queue_by_client_name(self, client_name: str) -> list:\n        docs = (\n            self.db.collection(self.collection)\n            .where(filter=FieldFilter(\"client_name\", \"==\", client_name))\n            .stream()\n        )\n        return [doc.to_dict() async for doc in docs]\n\n    async def _update_license_queue(self, license_id: str, priority_list: list) -> None:\n        result: WriteResult = (\n            await self.db.collection(self.collection)\n            .document(license_id)\n            .update({\"queue\": priority_list})\n        )\n        logger.info(\n            f\"Updated {license_id} queue in 'queues' collection. Update_time: {result.update_time}.\"\n        )\n\n    async def _update_queues_on_start_download(\n        self, config_name: str, licenses: list\n    ) -> None:\n        for license in licenses:\n            result: WriteResult = (\n                await self.db.collection(self.collection)\n                .document(license)\n                .update({\"queue\": firestore.ArrayUnion([config_name])})\n            )\n            logger.info(\n                f\"Updated {license} queue in 'queues' collection. 
Update_time: {result.update_time}.\"\n            )\n\n    async def _update_queues_on_stop_download(self, config_name: str) -> None:\n        snapshot_list = await self.db.collection(self.collection).get()\n        for snapshot in snapshot_list:\n            result: WriteResult = (\n                await self.db.collection(self.collection)\n                .document(snapshot.id)\n                .update({\"queue\": firestore.ArrayRemove([config_name])})\n            )\n            logger.info(\n                f\"Updated {snapshot.id} queue in 'queues' collection. Update_time: {result.update_time}.\"\n            )\n\n    async def _update_config_priority_in_license(\n        self, license_id: str, config_name: str, priority: int | None\n    ) -> None:\n        snapshot: DocumentSnapshot = (\n            await self.db.collection(self.collection).document(license_id).get()\n        )\n        priority_list = snapshot.to_dict()[\"queue\"]\n        new_priority_list = [c for c in priority_list if c != config_name]\n        if priority is None:\n            # If no priority is given, insert at the end.\n            priority = len(new_priority_list)\n        new_priority_list.insert(priority, config_name)\n        result: WriteResult = (\n            await self.db.collection(self.collection)\n            .document(license_id)\n            .update({\"queue\": new_priority_list})\n        )\n        logger.info(\n            f\"Updated {snapshot.id} queue in 'queues' collection. Update_time: {result.update_time}.\"\n        )\n\n    async def _update_client_name_in_license_queue(\n        self, license_id: str, client_name: str\n    ) -> None:\n        result: WriteResult = (\n            await self.db.collection(self.collection)\n            .document(license_id)\n            .update({\"client_name\": client_name})\n        )\n        logger.info(\n            f\"Updated {license_id} queue in 'queues' collection. Update_time: {result.update_time}.\"\n        )\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/database/session.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport time\nimport abc\nimport logging\nimport firebase_admin\nfrom google.cloud import firestore\nfrom firebase_admin import credentials\nfrom config_processing.util import get_wait_interval\nfrom server_config import get_config\nfrom gcloud import storage\n\nlogger = logging.getLogger(__name__)\n\n\nclass Database(abc.ABC):\n\n    @abc.abstractmethod\n    def _get_db(self):\n        pass\n\n\ndb: firestore.AsyncClient = None\n\ndef get_async_client() -> firestore.AsyncClient:\n    global db\n    attempts = 0\n\n    while db is None:\n        try:\n            db = firestore.AsyncClient()\n        except ValueError as e:\n            # The above call will fail with a value error when the firebase app is not initialized.\n            # Initialize the app here, and try again.\n            # Use the application default credentials.\n            cred = credentials.ApplicationDefault()\n\n            firebase_admin.initialize_app(cred)\n            logger.info(\"Initialized Firebase App.\")\n\n            if attempts > 4:\n                raise RuntimeError(\n                    \"Exceeded number of retries to get firestore client.\"\n                ) from e\n\n        time.sleep(get_wait_interval(attempts))\n\n        attempts += 1\n\n    return db\n\n\ndef get_gcs_client() -> storage.Client:\n    try:\n        gcs = 
storage.Client(project=get_config().gcs_project)\n    except ValueError as e:\n        logger.error(f\"Error initializing GCS client: {e}.\")\n\n    return gcs\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/database/storage_handler.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport abc\nimport os\nimport logging\nimport tempfile\nimport contextlib\nimport typing as t\nfrom google.cloud import storage\nfrom database.session import get_gcs_client\nfrom server_config import get_config\n\n\nlogger = logging.getLogger(__name__)\n\n\ndef get_storage_handler():\n    return StorageHandlerGCS(client=get_gcs_client())\n\n\nclass StorageHandler(abc.ABC):\n\n    @abc.abstractmethod\n    def _upload_file(self, file_path) -> str:\n        pass\n\n    @abc.abstractmethod\n    def _open_local(self, file_name) -> t.Iterator[str]:\n        pass\n\n\nclass StorageHandlerMock(StorageHandler):\n\n    def __init__(self) -> None:\n        pass\n\n    def _upload_file(self, file_path) -> None:\n        pass\n\n    def _open_local(self, file_name) -> t.Iterator[str]:\n        pass\n\n\nclass StorageHandlerGCS(StorageHandler):\n\n    def __init__(self, client: storage.Client) -> None:\n        self.client = client\n        self.bucket = self.client.get_bucket(get_config().storage_bucket)\n\n    def _upload_file(self, file_path) -> str:\n        filename = os.path.basename(file_path).split(\"/\")[-1]\n\n        blob = self.bucket.blob(filename)\n        blob.upload_from_filename(file_path)\n\n        logger.info(f\"Uploaded {filename} to {self.bucket}.\")\n        return blob.public_url\n\n    @contextlib.contextmanager\n    def _open_local(self, file_name) -> 
t.Iterator[str]:\n        blob = self.bucket.blob(file_name)\n        with tempfile.NamedTemporaryFile() as dest_file:\n            blob.download_to_filename(dest_file.name)\n            yield dest_file.name\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/environment.yml",
    "content": "name: weather-dl-v2-server\nchannels:\n  - conda-forge\ndependencies:\n  - python=3.10\n  - xarray\n  - geojson\n  - pip=22.3\n  - google-cloud-sdk=410.0.0\n  - pip:\n    - kubernetes\n    - fastapi[all]==0.97.0\n    - python-multipart\n    - numpy\n    - apache-beam[gcp]\n    - aiohttp\n    - firebase-admin\n    - gcloud\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/example.cfg",
    "content": "[parameters]\nclient=mars\n\ntarget_path=gs://<bucket-path>/test-weather-dl-v2/{date}T00z.gb\npartition_keys=\n  date\n  # step\n\n# API Keys & Subsections go here...\n\n[selection]\nclass=od\ntype=pf\nstream=enfo\nexpver=0001\nlevtype=pl\nlevelist=100\n# params:\n# (z) Geopotential 129, (t) Temperature 130,\n# (u) U component of wind 131, (v) V component of wind 132,\n# (q) Specific humidity 133,  (w) vertical velocity 135,\n# (vo) Vorticity (relative) 138, (d) Divergence 155,\n# (r) Relative humidity 157\nparam=129.128\n#\n# next: 2019-01-01/to/existing\n#\ndate=2019-07-18/to/2019-07-20\ntime=0000\nstep=0/to/2\nnumber=1/to/2\ngrid=F640\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/license_dep/deployment_creator.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport logging\nfrom os import path\nimport yaml\nfrom kubernetes import client, config\nfrom server_config import get_config\n\nlogger = logging.getLogger(__name__)\n\n\ndef create_license_deployment(license_id: str) -> str:\n    \"\"\"Creates a kubernetes workflow of type Job for downloading the data.\"\"\"\n    config.load_config()\n\n    with open(path.join(path.dirname(__file__), \"license_deployment.yaml\")) as f:\n        deployment_manifest = yaml.safe_load(f)\n        deployment_name = f\"weather-dl-v2-license-dep-{license_id}\".lower()\n\n        # Update the deployment name with a unique identifier\n        deployment_manifest[\"metadata\"][\"name\"] = deployment_name\n        deployment_manifest[\"spec\"][\"template\"][\"spec\"][\"containers\"][0][\"args\"] = [\n            \"--license\",\n            license_id,\n        ]\n        deployment_manifest[\"spec\"][\"template\"][\"spec\"][\"containers\"][0][\n            \"image\"\n        ] = get_config().license_deployment_image\n\n        # Create an instance of the Kubernetes API client\n        batch_api = client.BatchV1Api()\n        # Create the deployment in the specified namespace\n        response = batch_api.create_namespaced_job(\n            body=deployment_manifest, namespace=\"default\"\n        )\n\n        logger.info(f\"Deployment created successfully: {response.metadata.name}.\")\n       
 return deployment_name\n\n\ndef terminate_license_deployment(license_id: str) -> None:\n    # Load Kubernetes configuration\n    config.load_config()\n\n    # Create an instance of the Kubernetes API client\n    batch_v1 = client.BatchV1Api()\n\n    # Specify the name and namespace of the deployment to delete\n    job_name = f\"weather-dl-v2-license-dep-{license_id}\".lower()\n\n    # Delete the deployment\n    batch_v1.delete_namespaced_job(\n        name=job_name,\n        namespace=\"default\",\n        body=client.V1DeleteOptions(\n            propagation_policy='Foreground'\n        )\n    )\n\n    logger.info(f\"Deployment '{job_name}' deleted successfully.\")\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/license_dep/license_deployment.yaml",
    "content": "# weather-dl-v2-license-dep Job\n# Defines the Kubernetes Job that runs the app in a pod on any worker node\napiVersion: batch/v1\nkind: Job\nmetadata:\n  name: weather-dl-v2-license-dep\nspec:\n  backoffLimit: 5\n  podReplacementPolicy: Failed\n  template:\n    spec:\n      restartPolicy: OnFailure\n      containers:\n        - name: weather-dl-v2-license-dep\n          image: XXXXXXX\n          imagePullPolicy: Always\n          args: []\n          resources:\n            requests:\n              cpu: \"1500m\"                  # CPU: 1.5 vCPU\n              memory: \"2Gi\"                 # RAM: 2 GiB\n              ephemeral-storage: \"10Gi\"     # Storage: 10 GiB\n          volumeMounts:\n          - name: config-volume\n            mountPath: ./config\n      terminationGracePeriodSeconds: 172800   # 48 hours\n      volumes:\n        - name: config-volume\n          configMap:\n            name: dl-v2-config"
  },
  {
    "path": "weather_dl_v2/fastapi-server/logging.conf",
    "content": "[loggers]\nkeys=root,server\n\n[handlers]\nkeys=consoleHandler,detailedConsoleHandler\n\n[formatters]\nkeys=normalFormatter,detailedFormatter\n\n[logger_root]\nlevel=INFO\nhandlers=consoleHandler\n\n[logger_server]\nlevel=DEBUG\nhandlers=detailedConsoleHandler\nqualname=server\npropagate=0\n\n[handler_consoleHandler]\nclass=StreamHandler\nlevel=DEBUG\nformatter=normalFormatter\nargs=(sys.stdout,)\n\n[handler_detailedConsoleHandler]\nclass=StreamHandler\nlevel=DEBUG\nformatter=detailedFormatter\nargs=(sys.stdout,)\n\n[formatter_normalFormatter]\nformat=%(asctime)s loglevel=%(levelname)-6s logger=%(name)s %(funcName)s() msg:%(message)s\n\n[formatter_detailedFormatter]\nformat=%(asctime)s loglevel=%(levelname)-6s logger=%(name)s %(funcName)s() msg:%(message)s   call_trace=%(pathname)s L%(lineno)-4d"
  },
  {
    "path": "weather_dl_v2/fastapi-server/main.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport logging\nimport os\nimport logging.config\nfrom contextlib import asynccontextmanager\nfrom fastapi import FastAPI\nfrom routers import license, download, queues\nfrom database.license_handler import get_license_handler\nfrom routers.license import get_create_deployment\nfrom server_config import get_config\n\nROOT_DIR = os.path.dirname(os.path.abspath(__file__))\n\n# set up logger.\nlogging.config.fileConfig(\"logging.conf\", disable_existing_loggers=False)\nlogger = logging.getLogger(__name__)\n\n\nasync def create_pending_license_deployments():\n    \"\"\"Creates license deployments for Licenses whose deployments do not exist.\"\"\"\n    license_handler = get_license_handler()\n    create_deployment = get_create_deployment()\n    license_list = await license_handler._get_license_without_deployment()\n\n    for _license in license_list:\n        license_id = _license[\"license_id\"]\n        try:\n            logger.info(f\"Creating license deployment for {license_id}.\")\n            await create_deployment(license_id, license_handler)\n        except Exception as e:\n            logger.error(f\"License deployment failed for {license_id}. 
Exception: {e}.\")\n\n\n@asynccontextmanager\nasync def lifespan(app: FastAPI):\n    logger.info(\"Started FastAPI server.\")\n    # Boot up\n    # Make directory to store the uploaded config files.\n    os.makedirs(os.path.join(os.getcwd(), \"config_files\"), exist_ok=True)\n    # Retrieve license information & create license deployment if needed.\n    await create_pending_license_deployments()\n    # TODO: Automatically create required indexes on firestore collections on server startup.\n    yield\n    # Clean up\n\n\napp = FastAPI(lifespan=lifespan)\n\napp.include_router(license.router)\napp.include_router(download.router)\napp.include_router(queues.router)\n\n\n@app.get(\"/\")\nasync def main():\n    return {\"msg\": get_config().welcome_message}\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/routers/download.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport asyncio\nimport logging\nimport os\nimport shutil\nimport json\n\nfrom enum import Enum\nfrom config_processing.parsers import parse_config, process_config\nfrom config_processing.config import Config\nfrom fastapi import APIRouter, HTTPException, BackgroundTasks, UploadFile, Depends, Body\nfrom config_processing.pipeline import start_processing_config\nfrom database.download_handler import DownloadHandler, get_download_handler\nfrom database.queue_handler import QueueHandler, get_queue_handler\nfrom database.license_handler import LicenseHandler, get_license_handler\nfrom database.manifest_handler import ManifestHandler, get_manifest_handler\nfrom database.storage_handler import StorageHandler, get_storage_handler\nfrom config_processing.manifest import FirestoreManifest, Manifest\nfrom fastapi.concurrency import run_in_threadpool\nfrom routers.license import mark_license_active\n\nlogger = logging.getLogger(__name__)\n\nrouter = APIRouter(\n    prefix=\"/download\",\n    tags=[\"download\"],\n    responses={404: {\"description\": \"Not found\"}},\n)\n\n\nasync def fetch_config_stats(\n    config_name: str, client_name: str, status: str, manifest_handler: ManifestHandler\n):\n    \"\"\"Get all the config stats in parallel.\"\"\"\n\n    success_coroutine = manifest_handler._get_download_success_count(config_name)\n    scheduled_coroutine = 
manifest_handler._get_download_scheduled_count(config_name)\n    failure_coroutine = manifest_handler._get_download_failure_count(config_name)\n    inprogress_coroutine = manifest_handler._get_download_inprogress_count(config_name)\n    total_coroutine = manifest_handler._get_download_total_count(config_name)\n\n    (\n        success_count,\n        scheduled_count,\n        failure_count,\n        inprogress_count,\n        total_count,\n    ) = await asyncio.gather(\n        success_coroutine,\n        scheduled_coroutine,\n        failure_coroutine,\n        inprogress_coroutine,\n        total_coroutine,\n    )\n\n    return {\n        \"config_name\": config_name,\n        \"client_name\": client_name,\n        \"partitioning_status\": status,\n        \"scheduled_shards\": scheduled_count,\n        \"in-progress_shards\": inprogress_count,\n        \"downloaded_shards\": success_count,\n        \"failed_shards\": failure_count,\n        \"total_shards\": total_count,\n    }\n\n\ndef get_fetch_config_stats():\n    return fetch_config_stats\n\n\ndef get_fetch_config_stats_mock():\n    async def fetch_config_stats(\n        config_name: str, client_name: str, status: str, manifest_handler: ManifestHandler\n    ):\n        return {\n            \"config_name\": config_name,\n            \"client_name\": client_name,\n            \"scheduled_shards\": 0,\n            \"in-progress_shards\": 0,\n            \"downloaded_shards\": 0,\n            \"failed_shards\": 0,\n            \"total_shards\": 0,\n        }\n\n    return fetch_config_stats\n\n\ndef get_upload():\n    def upload(file: UploadFile):\n        dest = os.path.join(os.getcwd(), \"config_files\", file.filename)\n        with open(dest, \"wb+\") as dest_:\n            shutil.copyfileobj(file.file, dest_)\n\n        logger.info(f\"Uploading {file.filename} to gcs bucket.\")\n        storage_handler: StorageHandler = get_storage_handler()\n        storage_handler._upload_file(dest)\n        return 
dest\n\n    return upload\n\n\ndef get_upload_mock():\n    def upload(file: UploadFile):\n        return f\"{os.getcwd()}/tests/test_data/{file.filename}\"\n\n    return upload\n\n\ndef get_reschedule_partitions():\n    def invoke_manifest_schedule(\n        partition_list: list, config: Config, manifest: Manifest\n    ):\n        for partition in partition_list:\n            logger.info(f\"Rescheduling partition {partition}.\")\n            manifest.schedule(\n                config.config_name,\n                config.dataset,\n                json.loads(partition[\"selection\"]),\n                partition[\"location\"],\n                partition[\"username\"],\n            )\n\n    async def reschedule_partitions(config_name: str, licenses: list, only_failed: bool = False):\n        manifest_handler: ManifestHandler = get_manifest_handler()\n        download_handler: DownloadHandler = get_download_handler()\n        queue_handler: QueueHandler = get_queue_handler()\n        storage_handler: StorageHandler = get_storage_handler()\n\n        if only_failed:\n            partition_list = await manifest_handler._get_failed_downloads(config_name)\n        else:\n            partition_list = await manifest_handler._get_non_successfull_downloads(\n                config_name\n            )\n\n        config = None\n        manifest = FirestoreManifest()\n\n        with storage_handler._open_local(config_name) as local_path:\n            with open(local_path, \"r\", encoding=\"utf-8\") as f:\n                config = process_config(f, config_name)\n\n        await download_handler._mark_partitioning_status(\n            config_name, \"Partitioning in-progress.\"\n        )\n\n        try:\n            if config is None:\n                logger.error(\n                    f\"Failed reschedule_partitions. Could not open {config_name}.\"\n                )\n                raise FileNotFoundError(\n                    f\"Failed reschedule_partitions. 
Could not open {config_name}.\"\n                )\n\n            await run_in_threadpool(\n                invoke_manifest_schedule, partition_list, config, manifest\n            )\n            await download_handler._mark_partitioning_status(\n                config_name, \"Partitioning completed.\"\n            )\n            if len(licenses) > 0:\n                await queue_handler._update_queues_on_start_download(config_name, licenses)\n        except Exception as e:\n            error_str = f\"Partitioning failed for {config_name} due to {e}.\"\n            logger.error(error_str)\n            await download_handler._mark_partitioning_status(config_name, error_str)\n\n    return reschedule_partitions\n\n\ndef get_reschedule_partitions_mock():\n    async def reschedule_partitions(config_name: str, licenses: list, only_failed: bool = False):\n        pass\n\n    return reschedule_partitions\n\n\n# Can submit a config to the server.\n@router.post(\"/\")\nasync def submit_download(\n    file: UploadFile | None = None,\n    licenses: list = [],\n    force_download: bool = False,\n    priority: int | None = None,\n    background_tasks: BackgroundTasks = BackgroundTasks(),\n    download_handler: DownloadHandler = Depends(get_download_handler),\n    license_handler: LicenseHandler = Depends(get_license_handler),\n    upload=Depends(get_upload),\n):\n    if not file:\n        logger.error(\"No upload file sent.\")\n        raise HTTPException(status_code=404, detail=\"No upload file sent.\")\n    else:\n        if await download_handler._check_download_exists(file.filename):\n            logger.error(\n                f\"Please stop the ongoing download of the config file '{file.filename}' \"\n                \"before attempting to start a new download.\"\n            )\n            raise HTTPException(\n                status_code=400,\n                detail=f\"Please stop the ongoing download of the config file '{file.filename}' \"\n                \"before 
attempting to start a new download.\",\n            )\n\n        for license_id in licenses:\n            if not await license_handler._check_license_exists(license_id):\n                logger.info(f\"No such license {license_id}.\")\n                raise HTTPException(\n                    status_code=404, detail=f\"No such license {license_id}.\"\n                )\n            await mark_license_active(license_id, license_handler)\n        try:\n            dest = upload(file)\n            # Start processing config.\n            background_tasks.add_task(\n                start_processing_config, dest, licenses, force_download, priority\n            )\n            return {\n                \"message\": f\"file '{file.filename}' saved at '{dest}' successfully.\"\n            }\n        except Exception as e:\n            logger.error(f\"Failed to save file '{file.filename} due to {e}.\")\n            raise HTTPException(\n                status_code=500, detail=f\"Failed to save file '{file.filename}'.\"\n            )\n\n\nclass DownloadStatus(str, Enum):\n    COMPLETED = \"completed\"\n    FAILED = \"failed\"\n    IN_PROGRESS = \"in-progress\"\n\n\n@router.get(\"/show/{config_name}\")\nasync def show_download_config(\n    config_name: str,\n    download_handler: DownloadHandler = Depends(get_download_handler),\n    storage_handler: StorageHandler = Depends(get_storage_handler),\n):\n    if not await download_handler._check_download_exists(config_name):\n        logger.error(f\"No such download config {config_name} to show.\")\n        raise HTTPException(\n            status_code=404,\n            detail=f\"No such download config {config_name} to show.\",\n        )\n\n    contents = None\n\n    with storage_handler._open_local(config_name) as local_path:\n        with open(local_path, \"r\", encoding=\"utf-8\") as f:\n            contents = parse_config(f)\n            logger.info(f\"Contents of {config_name}: {contents}.\")\n\n    return {\"config_name\": 
config_name, \"contents\": contents}\n\n\n# Can check the current status of the submitted config.\n# List status for all the downloads + handle filters\n@router.get(\"/\")\nasync def get_downloads(\n    client_name: str | None = None,\n    status: DownloadStatus | None = None,\n    download_handler: DownloadHandler = Depends(get_download_handler),\n    manifest_handler: ManifestHandler = Depends(get_manifest_handler),\n    fetch_config_stats=Depends(get_fetch_config_stats),\n):\n    downloads = await download_handler._get_downloads(client_name)\n    coroutines = []\n\n    for download in downloads:\n        coroutines.append(\n            fetch_config_stats(\n                download[\"config_name\"],\n                download[\"client_name\"],\n                download[\"status\"],\n                manifest_handler,\n            )\n        )\n\n    config_details = await asyncio.gather(*coroutines)\n\n    if status is None:\n        return config_details\n\n    if status.value == DownloadStatus.COMPLETED:\n        return list(\n            filter(\n                lambda detail: detail[\"downloaded_shards\"] == detail[\"total_shards\"],\n                config_details,\n            )\n        )\n    elif status.value == DownloadStatus.FAILED:\n        return list(filter(lambda detail: detail[\"failed_shards\"] > 0, config_details))\n    elif status.value == DownloadStatus.IN_PROGRESS:\n        return list(\n            filter(\n                lambda detail: detail[\"downloaded_shards\"] != detail[\"total_shards\"],\n                config_details,\n            )\n        )\n    else:\n        return config_details\n\n\n# Get status of particular download\n@router.get(\"/{config_name}\")\nasync def get_download_by_config_name(\n    config_name: str,\n    download_handler: DownloadHandler = Depends(get_download_handler),\n    manifest_handler: ManifestHandler = Depends(get_manifest_handler),\n    fetch_config_stats=Depends(get_fetch_config_stats),\n):\n    download 
= await download_handler._get_download_by_config_name(config_name)\n\n    if download is None:\n        logger.error(f\"Download config {config_name} not found in weather-dl v2.\")\n        raise HTTPException(\n            status_code=404,\n            detail=f\"Download config {config_name} not found in weather-dl v2.\",\n        )\n\n    return await fetch_config_stats(\n        download[\"config_name\"],\n        download[\"client_name\"],\n        download[\"status\"],\n        manifest_handler,\n    )\n\n\n# Stop & remove the execution of the config.\n@router.delete(\"/{config_name}\")\nasync def delete_download(\n    config_name: str,\n    download_handler: DownloadHandler = Depends(get_download_handler),\n    queue_handler: QueueHandler = Depends(get_queue_handler),\n):\n    if not await download_handler._check_download_exists(config_name):\n        logger.error(f\"No such download config {config_name} to stop & remove.\")\n        raise HTTPException(\n            status_code=404,\n            detail=f\"No such download config {config_name} to stop & remove.\",\n        )\n\n    await download_handler._stop_download(config_name)\n    await queue_handler._update_queues_on_stop_download(config_name)\n    return {\n        \"config_name\": config_name,\n        \"message\": \"Download config stopped & removed successfully.\",\n    }\n\n\n@router.post(\"/retry/{config_name}\")\nasync def retry_config(\n    config_name: str,\n    licenses: list = Body(embed=True),\n    only_failed: bool = False,\n    background_tasks: BackgroundTasks = BackgroundTasks(),\n    download_handler: DownloadHandler = Depends(get_download_handler),\n    license_handler: LicenseHandler = Depends(get_license_handler),\n    reschedule_partitions=Depends(get_reschedule_partitions),\n):\n    if not await download_handler._check_download_exists(config_name):\n        logger.error(f\"No such download config {config_name} to retry.\")\n        raise HTTPException(\n            
status_code=404,\n            detail=f\"No such download config {config_name} to retry.\",\n        )\n\n    for license_id in licenses:\n        if not await license_handler._check_license_exists(license_id):\n            logger.info(f\"No such license {license_id}.\")\n            raise HTTPException(\n                status_code=404, detail=f\"No such license {license_id}.\"\n            )\n        await mark_license_active(license_id, license_handler)\n\n    background_tasks.add_task(reschedule_partitions, config_name, licenses, only_failed)\n\n    return {\"msg\": \"Refetch initiated successfully.\"}\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/routers/license.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport logging\nimport re\nfrom fastapi import APIRouter, HTTPException, BackgroundTasks, Depends\nfrom pydantic import BaseModel\nfrom license_dep.deployment_creator import create_license_deployment, terminate_license_deployment\nfrom database.license_handler import LicenseHandler, get_license_handler\nfrom database.queue_handler import QueueHandler, get_queue_handler\n\nlogger = logging.getLogger(__name__)\n\n\nclass License(BaseModel):\n    license_id: str\n    client_name: str\n    number_of_requests: int\n    secret_id: str\n\n\nclass LicenseInternal(License):\n    k8s_deployment_id: str\n\n\n# Can perform CRUD on license table -- helps in handling API KEY expiry.\nrouter = APIRouter(\n    prefix=\"/license\",\n    tags=[\"license\"],\n    responses={404: {\"description\": \"Not found\"}},\n)\n\nasync def mark_license_active(license_id: str, license_handler: LicenseHandler):\n    logger.info(f\"Marking {license_id} active.\")\n    await license_handler._mark_license_status(license_id, 'License Active.')\n\n# Add/Update k8s deployment ID for existing license (internally).\nasync def update_license_internal(\n    license_id: str,\n    k8s_deployment_id: str,\n    license_handler: LicenseHandler,\n):\n    if not await license_handler._check_license_exists(license_id):\n        logger.info(f\"No such license {license_id} to update.\")\n        raise 
HTTPException(\n            status_code=404, detail=f\"No such license {license_id} to update.\"\n        )\n    license_dict = {\"k8s_deployment_id\": k8s_deployment_id}\n\n    await license_handler._update_license(license_id, license_dict)\n    return {\"license_id\": license_id, \"message\": \"License updated successfully.\"}\n\n\ndef get_create_deployment():\n    async def create_deployment(license_id: str, license_handler: LicenseHandler):\n        k8s_deployment_id = create_license_deployment(license_id)\n        await update_license_internal(license_id, k8s_deployment_id, license_handler)\n\n    return create_deployment\n\n\ndef get_create_deployment_mock():\n    async def create_deployment_mock(license_id: str, license_handler: LicenseHandler):\n        logger.info(\"create deployment mock.\")\n\n    return create_deployment_mock\n\n\ndef get_terminate_license_deployment():\n    return terminate_license_deployment\n\n\ndef get_terminate_license_deployment_mock():\n    def get_terminate_license_deployment_mock(license_id):\n        logger.info(f\"terminating license deployment for {license_id}.\")\n\n    return get_terminate_license_deployment_mock\n\n\n# List all the license + handle filters of {client_name}\n@router.get(\"/\")\nasync def get_licenses(\n    client_name: str | None = None,\n    license_handler: LicenseHandler = Depends(get_license_handler),\n):\n    if client_name:\n        result = await license_handler._get_license_by_client_name(client_name)\n    else:\n        result = await license_handler._get_licenses()\n    return result\n\n\n# Get particular license\n@router.get(\"/{license_id}\")\nasync def get_license_by_license_id(\n    license_id: str, license_handler: LicenseHandler = Depends(get_license_handler)\n):\n    result = await license_handler._get_license_by_license_id(license_id)\n    if not result:\n        logger.info(f\"License {license_id} not found.\")\n        raise HTTPException(status_code=404, detail=f\"License {license_id} 
not found.\")\n    return result\n\n\n# Update existing license\n@router.put(\"/{license_id}\")\nasync def update_license(\n    license_id: str,\n    license: License,\n    license_handler: LicenseHandler = Depends(get_license_handler),\n    queue_handler: QueueHandler = Depends(get_queue_handler),\n    create_deployment=Depends(get_create_deployment),\n    terminate_license_deployment=Depends(get_terminate_license_deployment),\n):\n    if not await license_handler._check_license_exists(license_id):\n        logger.error(f\"No such license {license_id} to update.\")\n        raise HTTPException(\n            status_code=404, detail=f\"No such license {license_id} to update.\"\n        )\n\n    license_dict = license.dict()\n    await license_handler._update_license(license_id, license_dict)\n    await mark_license_active(license_id, license_handler)\n    await queue_handler._update_client_name_in_license_queue(\n        license_id, license_dict[\"client_name\"]\n    )\n\n    terminate_license_deployment(license_id)\n    await create_deployment(license_id, license_handler)\n    return {\"license_id\": license_id, \"name\": \"License updated successfully.\"}\n\n\n# Add new license\n@router.post(\"/\")\nasync def add_license(\n    license: License,\n    background_tasks: BackgroundTasks = BackgroundTasks(),\n    license_handler: LicenseHandler = Depends(get_license_handler),\n    queue_handler: QueueHandler = Depends(get_queue_handler),\n    create_deployment=Depends(get_create_deployment),\n):\n    license_id = license.license_id.lower()\n\n    # Check if license id is in correct format.\n    LICENSE_REGEX = re.compile(\n        r\"[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\"\n    )\n    if not bool(LICENSE_REGEX.fullmatch(license_id)):\n        logger.error(\n            \"\"\"Invalid format for license_id. 
License id must consist of lower case alphanumeric\"\"\"\n            \"\"\" characters, '-' or '.', and must start and end with an alphanumeric character\"\"\"\n        )\n        raise HTTPException(\n            status_code=400,\n            detail=\"\"\"Invalid format for license_id. License id must consist of lower case alphanumeric\"\"\"\n            \"\"\" characters, '-' or '.', and must start and end with an alphanumeric character\"\"\",\n        )\n\n    if await license_handler._check_license_exists(license_id):\n        logger.error(f\"License with license_id {license_id} already exist.\")\n        raise HTTPException(\n            status_code=409,\n            detail=f\"License with license_id {license_id} already exist.\",\n        )\n\n    license_dict = license.dict()\n    license_dict[\"k8s_deployment_id\"] = \"\"\n    license_id = await license_handler._add_license(license_dict)\n    await mark_license_active(license_id, license_handler)\n    await queue_handler._create_license_queue(license_id, license_dict[\"client_name\"])\n    background_tasks.add_task(create_deployment, license_id, license_handler)\n    return {\"license_id\": license_id, \"message\": \"License added successfully.\"}\n\n\n# Remove license\n@router.delete(\"/{license_id}\")\nasync def delete_license(\n    license_id: str,\n    background_tasks: BackgroundTasks = BackgroundTasks(),\n    license_handler: LicenseHandler = Depends(get_license_handler),\n    queue_handler: QueueHandler = Depends(get_queue_handler),\n    terminate_license_deployment=Depends(get_terminate_license_deployment),\n):\n    if not await license_handler._check_license_exists(license_id):\n        logger.error(f\"No such license {license_id} to delete.\")\n        raise HTTPException(\n            status_code=404, detail=f\"No such license {license_id} to delete.\"\n        )\n    await license_handler._delete_license(license_id)\n    await queue_handler._remove_license_queue(license_id)\n    
background_tasks.add_task(terminate_license_deployment, license_id)\n    return {\"license_id\": license_id, \"message\": \"License removed successfully.\"}\n\n# TODO: Create a better response for this route.\n@router.patch(\"/redeploy\")\nasync def redeploy_licenses(\n    license_id: str = None,\n    client_name: str = None,\n    license_handler: LicenseHandler = Depends(get_license_handler),\n    terminate_license_deployment=Depends(get_terminate_license_deployment),\n    create_deployment=Depends(get_create_deployment),\n):\n    licenses = []\n    if license_id is not None:\n        if license_id == \"all\":\n            licenses = await license_handler._get_licenses()\n        else:\n            license = await license_handler._get_license_by_license_id(license_id)\n            licenses = [license]\n\n    if client_name is not None:\n        licenses = await license_handler._get_license_by_client_name(client_name)\n\n    if len(licenses) == 0:\n        return {\"message\": \"No license found.\"}\n\n    for license in licenses:\n        license_id = license['license_id']\n        logger.info(f\"Terminating deployment {license['k8s_deployment_id']}.\")\n        try:\n            terminate_license_deployment(license_id)\n        except Exception as e:\n            logger.error(f\"Couldn't terminate Deployment {license_id}. Error: {e}.\")\n\n        logger.info(f\"Creating deployment for {license_id}.\")\n        try:\n            await create_deployment(license_id, license_handler)\n            await mark_license_active(license_id, license_handler)\n        except Exception as e:\n            logger.error(f\"Couldn't create Deployment {license_id}. Error: {e}.\")\n\n    return {\"message\": \"Licenses redeployed.\"}\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/routers/queues.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport logging\n\nfrom fastapi import APIRouter, HTTPException, Depends\nfrom database.queue_handler import QueueHandler, get_queue_handler\nfrom database.license_handler import LicenseHandler, get_license_handler\nfrom database.download_handler import DownloadHandler, get_download_handler\n\nlogger = logging.getLogger(__name__)\n\nrouter = APIRouter(\n    prefix=\"/queues\",\n    tags=[\"queues\"],\n    responses={404: {\"description\": \"Not found\"}},\n)\n\n\n# Users can change the execution order of config per license basis.\n# List the licenses priority + {client_name} filter\n@router.get(\"/\")\nasync def get_all_license_queue(\n    client_name: str | None = None,\n    queue_handler: QueueHandler = Depends(get_queue_handler),\n):\n    if client_name:\n        result = await queue_handler._get_queue_by_client_name(client_name)\n    else:\n        result = await queue_handler._get_queues()\n    return result\n\n\n# Get particular license priority\n@router.get(\"/{license_id}\")\nasync def get_license_queue(\n    license_id: str, queue_handler: QueueHandler = Depends(get_queue_handler)\n):\n    result = await queue_handler._get_queue_by_license_id(license_id)\n    if not result:\n        logger.error(f\"License priority for {license_id} not found.\")\n        raise HTTPException(\n            status_code=404, detail=f\"License priority for {license_id} not 
found.\"\n        )\n    return result\n\n\n# Change priority queue of particular license\n@router.post(\"/{license_id}\")\nasync def modify_license_queue(\n    license_id: str,\n    priority_list: list | None = [],\n    queue_handler: QueueHandler = Depends(get_queue_handler),\n    license_handler: LicenseHandler = Depends(get_license_handler),\n    download_handler: DownloadHandler = Depends(get_download_handler),\n):\n    if not await license_handler._check_license_exists(license_id):\n        logger.error(f\"License {license_id} not found.\")\n        raise HTTPException(status_code=404, detail=f\"License {license_id} not found.\")\n\n    for config_name in priority_list:\n        config = await download_handler._get_download_by_config_name(config_name)\n        if config is None:\n            logger.error(f\"Download config {config_name} not found in weather-dl v2.\")\n            raise HTTPException(\n                status_code=404,\n                detail=f\"Download config {config_name} not found in weather-dl v2.\",\n            )\n    try:\n        await queue_handler._update_license_queue(license_id, priority_list)\n        return {\"message\": f\"'{license_id}' license priority updated successfully.\"}\n    except Exception as e:\n        logger.error(f\"Failed to update '{license_id}' license priority due to {e}.\")\n        raise HTTPException(\n            status_code=404, detail=f\"Failed to update '{license_id}' license priority.\"\n        )\n\n\n# Change config's priority in particular license\n@router.put(\"/priority/{license_id}\")\nasync def modify_config_priority_in_license(\n    license_id: str,\n    config_name: str,\n    priority: int,\n    queue_handler: QueueHandler = Depends(get_queue_handler),\n    license_handler: LicenseHandler = Depends(get_license_handler),\n    download_handler: DownloadHandler = Depends(get_download_handler),\n):\n    if not await license_handler._check_license_exists(license_id):\n        
logger.error(f\"License {license_id} not found.\")\n        raise HTTPException(status_code=404, detail=f\"License {license_id} not found.\")\n\n    config = await download_handler._get_download_by_config_name(config_name)\n    if config is None:\n        logger.error(f\"Download config {config_name} not found in weather-dl v2.\")\n        raise HTTPException(\n            status_code=404,\n            detail=f\"Download config {config_name} not found in weather-dl v2.\",\n        )\n\n    try:\n        await queue_handler._update_config_priority_in_license(\n            license_id, config_name, priority\n        )\n        return {\n            \"message\": f\"'{license_id}' license -- '{config_name}' priority updated successfully.\"\n        }\n    except Exception as e:\n        logger.error(f\"Failed to update '{license_id}' license priority due to {e}.\")\n        raise HTTPException(\n            status_code=404, detail=f\"Failed to update '{license_id}' license priority.\"\n        )\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/server.yaml",
    "content": "# Due to our org level policy we can't expose external-ip. \n# In case your project don't have any such restriction a\n# then no need to create a nginx-server on VM to access this fastapi server\n# instead create the LoadBalancer Service given below.\n# \n# # weather-dl server LoadBalancer Service\n# # Enables the pods in a deployment to be accessible from outside the cluster\n# apiVersion: v1\n# kind: Service\n# metadata:\n#   name: weather-dl-v2-server-service\n# spec:\n#   selector:\n#     app: weather-dl-v2-server-api\n#   ports:\n#     - protocol: \"TCP\"\n#       port: 8080\n#       targetPort: 8080\n#   type: LoadBalancer\n\n---\n# weather-dl-server-api Deployment\n# Defines the deployment of the app running in a pod on any worker node\napiVersion: apps/v1\nkind: Deployment\nmetadata:\n  name: weather-dl-v2-server-api\n  labels:\n    app: weather-dl-v2-server-api\nspec:\n  replicas: 1\n  selector:\n    matchLabels:\n      app: weather-dl-v2-server-api\n  template:\n    metadata:\n      labels:\n        app: weather-dl-v2-server-api\n    spec:\n      nodeSelector:\n        cloud.google.com/gke-nodepool: default-pool\n      containers:\n        - name: weather-dl-v2-server-api\n          image: XXXXX\n          ports:\n            - containerPort: 8080\n          imagePullPolicy: Always\n          volumeMounts:\n            - name: config-volume\n              mountPath: ./config\n      volumes:\n        - name: config-volume\n          configMap:\n            name: dl-v2-config\n          # resources:\n          #   # You must specify requests for CPU to autoscale\n          #   # based on CPU utilization\n          #   requests:\n          #     cpu: \"250m\"\n---\nkind: Role\napiVersion: rbac.authorization.k8s.io/v1\nmetadata:\n  name: weather-dl-v2-server-api\nrules:\n  - apiGroups:\n      - \"\"\n      - \"apps\"\n      - \"batch\"\n    resources:\n      - endpoints\n      - deployments\n      - pods\n      - jobs\n    verbs:\n      - 
get\n      - list\n      - watch\n      - create\n      - delete\n---\nkind: RoleBinding\napiVersion: rbac.authorization.k8s.io/v1\nmetadata:\n  name: weather-dl-v2-server-api\n  namespace: default\nsubjects:\n  - kind: ServiceAccount\n    name: default\n    namespace: default\nroleRef:\n  apiGroup: rbac.authorization.k8s.io\n  kind: Role\n  name: weather-dl-v2-server-api\n---"
  },
  {
    "path": "weather_dl_v2/fastapi-server/server_config.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport dataclasses\nimport typing as t\nimport json\nimport os\nimport logging\n\nlogger = logging.getLogger(__name__)\n\nValues = t.Union[t.List[\"Values\"], t.Dict[str, \"Values\"], bool, int, float, str]  # pytype: disable=not-supported-yet\n\n\n@dataclasses.dataclass\nclass ServerConfig:\n    download_collection: str = \"\"\n    queues_collection: str = \"\"\n    license_collection: str = \"\"\n    manifest_collection: str = \"\"\n    storage_bucket: str = \"\"\n    gcs_project: str = \"\"\n    license_deployment_image: str = \"\"\n    welcome_message: str = \"\"\n    kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict)\n\n    @classmethod\n    def from_dict(cls, config: t.Dict):\n        config_instance = cls()\n\n        for key, value in config.items():\n            if hasattr(config_instance, key):\n                setattr(config_instance, key, value)\n            else:\n                config_instance.kwargs[key] = value\n\n        return config_instance\n\n\nserver_config = None\n\n\ndef get_config():\n    global server_config\n    if server_config:\n        return server_config\n\n    server_config_json = \"config/config.json\"\n    if not os.path.exists(server_config_json):\n        server_config_json = os.environ.get(\"CONFIG_PATH\", None)\n\n    if server_config_json is None:\n        logger.error(\"Couldn't load config 
file for fastAPI server.\")\n        raise FileNotFoundError(\"Couldn't load config file for fastAPI server.\")\n\n    with open(server_config_json) as file:\n        config_dict = json.load(file)\n        server_config = ServerConfig.from_dict(config_dict)\n\n    return server_config\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/tests/__init__.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/tests/integration/__init__.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/tests/integration/test_download.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport logging\nimport os\nfrom fastapi.testclient import TestClient\nfrom main import app, ROOT_DIR\nfrom database.download_handler import get_download_handler, get_mock_download_handler\nfrom database.license_handler import get_license_handler, get_mock_license_handler\nfrom database.queue_handler import get_queue_handler, get_mock_queue_handler\nfrom routers.download import get_upload, get_upload_mock, get_fetch_config_stats, get_fetch_config_stats_mock\n\nclient = TestClient(app)\n\nlogger = logging.getLogger(__name__)\n\napp.dependency_overrides[get_download_handler] = get_mock_download_handler\napp.dependency_overrides[get_license_handler] = get_mock_license_handler\napp.dependency_overrides[get_queue_handler] = get_mock_queue_handler\napp.dependency_overrides[get_upload] = get_upload_mock\napp.dependency_overrides[get_fetch_config_stats] = get_fetch_config_stats_mock\n\n\ndef _get_download(headers, query, code, expected):\n    response = client.get(\"/download\", headers=headers, params=query)\n\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_get_downloads_basic():\n    headers = {}\n    query = {}\n    code = 200\n    expected = [{\n        \"config_name\": \"example.cfg\",\n        \"client_name\": \"client\",\n        \"downloaded_shards\": 0,\n        \"scheduled_shards\": 0,\n        \"failed_shards\": 0,\n 
       \"in-progress_shards\": 0,\n        \"total_shards\": 0,\n    }]\n\n    _get_download(headers, query, code, expected)\n\n\ndef _submit_download(headers, file_path, licenses, code, expected):\n    file = None\n    try:\n        file = {\"file\": open(file_path, \"rb\")}\n    except FileNotFoundError:\n        logger.info(\"file not found.\")\n\n    payload = {\"licenses\": licenses}\n\n    response = client.post(\"/download\", headers=headers, files=file, data=payload)\n\n    logger.info(f\"resp {response.json()}\")\n\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_submit_download_basic():\n    header = {\n        \"accept\": \"application/json\",\n    }\n    file_path = os.path.join(ROOT_DIR, \"tests/test_data/not_exist.cfg\")\n    licenses = [\"L1\"]\n    code = 200\n    expected = {\n        \"message\": f\"file 'not_exist.cfg' saved at '{os.getcwd()}/tests/test_data/not_exist.cfg' \"\n        \"successfully.\"\n    }\n\n    _submit_download(header, file_path, licenses, code, expected)\n\n\ndef test_submit_download_file_not_uploaded():\n    header = {\n        \"accept\": \"application/json\",\n    }\n    file_path = os.path.join(ROOT_DIR, \"tests/test_data/wrong_file.cfg\")\n    licenses = [\"L1\"]\n    code = 404\n    expected = {\"detail\": \"No upload file sent.\"}\n\n    _submit_download(header, file_path, licenses, code, expected)\n\n\ndef test_submit_download_file_alreadys_exist():\n    header = {\n        \"accept\": \"application/json\",\n    }\n    file_path = os.path.join(ROOT_DIR, \"tests/test_data/example.cfg\")\n    licenses = [\"L1\"]\n    code = 400\n    expected = {\n        \"detail\": \"Please stop the ongoing download of the config file 'example.cfg' before attempting to start a new download.\"  # noqa: E501\n    }\n\n    _submit_download(header, file_path, licenses, code, expected)\n\n\ndef _get_download_by_config(headers, config_name, code, expected):\n    response = 
client.get(f\"/download/{config_name}\", headers=headers)\n\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_get_download_by_config_basic():\n    headers = {}\n    config_name = \"example.cfg\"\n    code = 200\n    expected = {\n        \"config_name\": config_name,\n        \"client_name\": \"client\",\n        \"downloaded_shards\": 0,\n        \"scheduled_shards\": 0,\n        \"failed_shards\": 0,\n        \"in-progress_shards\": 0,\n        \"total_shards\": 0,\n    }\n\n    _get_download_by_config(headers, config_name, code, expected)\n\n\ndef test_get_download_by_config_wrong_config():\n    headers = {}\n    config_name = \"not_exist\"\n    code = 404\n    expected = {\"detail\": \"Download config not_exist not found in weather-dl v2.\"}\n\n    _get_download_by_config(headers, config_name, code, expected)\n\n\ndef _delete_download_by_config(headers, config_name, code, expected):\n    response = client.delete(f\"/download/{config_name}\", headers=headers)\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_delete_download_by_config_basic():\n    headers = {}\n    config_name = \"dummy_config\"\n    code = 200\n    expected = {\n        \"config_name\": \"dummy_config\",\n        \"message\": \"Download config stopped & removed successfully.\",\n    }\n\n    _delete_download_by_config(headers, config_name, code, expected)\n\n\ndef test_delete_download_by_config_wrong_config():\n    headers = {}\n    config_name = \"not_exist\"\n    code = 404\n    expected = {\"detail\": \"No such download config not_exist to stop & remove.\"}\n\n    _delete_download_by_config(headers, config_name, code, expected)\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/tests/integration/test_license.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport logging\nimport json\nfrom fastapi.testclient import TestClient\nfrom main import app\nfrom database.download_handler import get_download_handler, get_mock_download_handler\nfrom database.license_handler import get_license_handler, get_mock_license_handler\nfrom routers.license import (\n    get_create_deployment,\n    get_create_deployment_mock,\n    get_terminate_license_deployment,\n    get_terminate_license_deployment_mock,\n)\nfrom database.queue_handler import get_queue_handler, get_mock_queue_handler\n\nclient = TestClient(app)\n\nlogger = logging.getLogger(__name__)\n\napp.dependency_overrides[get_download_handler] = get_mock_download_handler\napp.dependency_overrides[get_license_handler] = get_mock_license_handler\napp.dependency_overrides[get_queue_handler] = get_mock_queue_handler\napp.dependency_overrides[get_create_deployment] = get_create_deployment_mock\napp.dependency_overrides[\n    get_terminate_license_deployment\n] = get_terminate_license_deployment_mock\n\n\ndef _get_license(headers, query, code, expected):\n    response = client.get(\"/license\", headers=headers, params=query)\n\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_get_license_basic():\n    headers = {}\n    query = {}\n    code = 200\n    expected = [{\n        \"license_id\": \"L1\",\n        \"secret_id\": \"xxxx\",\n        
\"client_name\": \"dummy_client\",\n        \"k8s_deployment_id\": \"k1\",\n        \"number_of_requets\": 100,\n    }]\n\n    _get_license(headers, query, code, expected)\n\n\ndef test_get_license_client_name():\n    headers = {}\n    client_name = \"dummy_client\"\n    query = {\"client_name\": client_name}\n    code = 200\n    expected = [{\n        \"license_id\": \"L1\",\n        \"secret_id\": \"xxxx\",\n        \"client_name\": client_name,\n        \"k8s_deployment_id\": \"k1\",\n        \"number_of_requets\": 100,\n    }]\n\n    _get_license(headers, query, code, expected)\n\n\ndef _add_license(headers, payload, code, expected):\n    response = client.post(\n        \"/license\",\n        headers=headers,\n        data=json.dumps(payload),\n        params={\"license_id\": \"L1\"},\n    )\n\n    print(f\"test add license {response.json()}\")\n\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_add_license_basic():\n    headers = {\"accept\": \"application/json\", \"Content-Type\": \"application/json\"}\n    license = {\n        \"license_id\": \"no-exists\",\n        \"client_name\": \"dummy_client\",\n        \"number_of_requests\": 0,\n        \"secret_id\": \"xxxx\",\n    }\n    payload = license\n    code = 200\n    expected = {\"license_id\": \"L1\", \"message\": \"License added successfully.\"}\n\n    _add_license(headers, payload, code, expected)\n\n\ndef _get_license_by_license_id(headers, license_id, code, expected):\n    response = client.get(f\"/license/{license_id}\", headers=headers)\n\n    logger.info(f\"response {response.json()}\")\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_get_license_by_license_id():\n    headers = {\"accept\": \"application/json\", \"Content-Type\": \"application/json\"}\n    license_id = \"L1\"\n    code = 200\n    expected = {\n        \"license_id\": license_id,\n        \"secret_id\": \"xxxx\",\n        \"client_name\": 
\"dummy_client\",\n        \"k8s_deployment_id\": \"k1\",\n        \"number_of_requets\": 100,\n    }\n\n    _get_license_by_license_id(headers, license_id, code, expected)\n\n\ndef test_get_license_wrong_license():\n    headers = {}\n    license_id = \"not_exist\"\n    code = 404\n    expected = {\n        \"detail\": \"License not_exist not found.\",\n    }\n\n    _get_license_by_license_id(headers, license_id, code, expected)\n\n\ndef _update_license(headers, license_id, license, code, expected):\n    response = client.put(\n        f\"/license/{license_id}\", headers=headers, data=json.dumps(license)\n    )\n\n    print(f\"_update license {response.json()}\")\n\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_update_license_basic():\n    headers = {}\n    license_id = \"L1\"\n    license = {\n        \"license_id\": \"L1\",\n        \"client_name\": \"dummy_client\",\n        \"number_of_requests\": 0,\n        \"secret_id\": \"xxxx\",\n    }\n    code = 200\n    expected = {\"license_id\": license_id, \"name\": \"License updated successfully.\"}\n\n    _update_license(headers, license_id, license, code, expected)\n\n\ndef test_update_license_wrong_license_id():\n    headers = {}\n    license_id = \"no-exists\"\n    license = {\n        \"license_id\": \"no-exists\",\n        \"client_name\": \"dummy_client\",\n        \"number_of_requests\": 0,\n        \"secret_id\": \"xxxx\",\n    }\n    code = 404\n    expected = {\"detail\": \"No such license no-exists to update.\"}\n\n    _update_license(headers, license_id, license, code, expected)\n\n\ndef _delete_license(headers, license_id, code, expected):\n    response = client.delete(f\"/license/{license_id}\", headers=headers)\n\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_delete_license_basic():\n    headers = {}\n    license_id = \"L1\"\n    code = 200\n    expected = {\"license_id\": license_id, \"message\": 
\"License removed successfully.\"}\n\n    _delete_license(headers, license_id, code, expected)\n\n\ndef test_delete_license_wrong_license():\n    headers = {}\n    license_id = \"not_exist\"\n    code = 404\n    expected = {\"detail\": \"No such license not_exist to delete.\"}\n\n    _delete_license(headers, license_id, code, expected)\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/tests/integration/test_queues.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport logging\nfrom main import app\nfrom fastapi.testclient import TestClient\nfrom database.download_handler import get_download_handler, get_mock_download_handler\nfrom database.license_handler import get_license_handler, get_mock_license_handler\nfrom database.queue_handler import get_queue_handler, get_mock_queue_handler\n\nclient = TestClient(app)\n\nlogger = logging.getLogger(__name__)\n\napp.dependency_overrides[get_download_handler] = get_mock_download_handler\napp.dependency_overrides[get_license_handler] = get_mock_license_handler\napp.dependency_overrides[get_queue_handler] = get_mock_queue_handler\n\n\ndef _get_all_queue(headers, query, code, expected):\n    response = client.get(\"/queues\", headers=headers, params=query)\n\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_get_all_queues():\n    headers = {}\n    query = {}\n    code = 200\n    expected = [{\"client_name\": \"dummy_client\", \"license_id\": \"L1\", \"queue\": []}]\n\n    _get_all_queue(headers, query, code, expected)\n\n\ndef test_get_client_queues():\n    headers = {}\n    client_name = \"dummy_client\"\n    query = {\"client_name\": client_name}\n    code = 200\n    expected = [{\"client_name\": client_name, \"license_id\": \"L1\", \"queue\": []}]\n\n    _get_all_queue(headers, query, code, expected)\n\n\ndef 
_get_queue_by_license(headers, license_id, code, expected):\n    response = client.get(f\"/queues/{license_id}\", headers=headers)\n\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_get_queue_by_license_basic():\n    headers = {}\n    license_id = \"L1\"\n    code = 200\n    expected = {\"client_name\": \"dummy_client\", \"license_id\": license_id, \"queue\": []}\n\n    _get_queue_by_license(headers, license_id, code, expected)\n\n\ndef test_get_queue_by_license_wrong_license():\n    headers = {}\n    license_id = \"not_exist\"\n    code = 404\n    expected = {\"detail\": 'License priority for not_exist not found.'}\n\n    _get_queue_by_license(headers, license_id, code, expected)\n\n\ndef _modify_license_queue(headers, license_id, priority_list, code, expected):\n    response = client.post(f\"/queues/{license_id}\", headers=headers, data=priority_list)\n\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_modify_license_queue_basic():\n    headers = {}\n    license_id = \"L1\"\n    priority_list = []\n    code = 200\n    expected = {\"message\": f\"'{license_id}' license priority updated successfully.\"}\n\n    _modify_license_queue(headers, license_id, priority_list, code, expected)\n\n\ndef test_modify_license_queue_wrong_license_id():\n    headers = {}\n    license_id = \"not_exist\"\n    priority_list = []\n    code = 404\n    expected = {\"detail\": 'License not_exist not found.'}\n\n    _modify_license_queue(headers, license_id, priority_list, code, expected)\n\n\ndef _modify_config_priority_in_license(headers, license_id, query, code, expected):\n    response = client.put(f\"/queues/priority/{license_id}\", params=query)\n\n    logger.info(f\"response {response.json()}\")\n\n    assert response.status_code == code\n    assert response.json() == expected\n\n\ndef test_modify_config_priority_in_license_basic():\n    headers = {}\n    license_id = \"L1\"\n    query = 
{\"config_name\": \"example.cfg\", \"priority\": 0}\n    code = 200\n    expected = {\n        \"message\": f\"'{license_id}' license -- 'example.cfg' priority updated successfully.\"\n    }\n\n    _modify_config_priority_in_license(headers, license_id, query, code, expected)\n\n\ndef test_modify_config_priority_in_license_wrong_license():\n    headers = {}\n    license_id = \"not_exist\"\n    query = {\"config_name\": \"example.cfg\", \"priority\": 0}\n    code = 404\n    expected = {\"detail\": 'License not_exist not found.'}\n\n    _modify_config_priority_in_license(headers, license_id, query, code, expected)\n\n\ndef test_modify_config_priority_in_license_wrong_config():\n    headers = {}\n    license_id = \"not_exist\"\n    query = {\"config_name\": \"wrong.cfg\", \"priority\": 0}\n    code = 404\n    expected = {\"detail\": 'License not_exist not found.'}\n\n    _modify_config_priority_in_license(headers, license_id, query, code, expected)\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/tests/test_data/example.cfg",
    "content": "[parameters]\nclient=mars\n\ntarget_path=gs://<bucket-path>/test-weather-dl-v2/{date}T00z.gb\npartition_keys=\n  date\n  # step\n\n# API Keys & Subsections go here...\n\n[selection]\nclass=od\ntype=pf\nstream=enfo\nexpver=0001\nlevtype=pl\nlevelist=100\n# params:\n# (z) Geopotential 129, (t) Temperature 130,\n# (u) U component of wind 131, (v) V component of wind 132,\n# (q) Specific humidity 133,  (w) vertical velocity 135,\n# (vo) Vorticity (relative) 138, (d) Divergence 155,\n# (r) Relative humidity 157\nparam=129.128\n#\n# next: 2019-01-01/to/existing\n#\ndate=2019-07-18/to/2019-07-20\ntime=0000\nstep=0/to/2\nnumber=1/to/2\ngrid=F640\n"
  },
  {
    "path": "weather_dl_v2/fastapi-server/tests/test_data/not_exist.cfg",
    "content": "[parameters]\nclient=mars\n\ntarget_path=gs://<bucket-path>/test-weather-dl-v2/{date}T00z.gb\npartition_keys=\n  date\n  # step\n\n# API Keys & Subsections go here...\n\n[selection]\nclass=od\ntype=pf\nstream=enfo\nexpver=0001\nlevtype=pl\nlevelist=100\n# params:\n# (z) Geopotential 129, (t) Temperature 130,\n# (u) U component of wind 131, (v) V component of wind 132,\n# (q) Specific humidity 133,  (w) vertical velocity 135,\n# (vo) Vorticity (relative) 138, (d) Divergence 155,\n# (r) Relative humidity 157\nparam=129.128\n#\n# next: 2019-01-01/to/existing\n#\ndate=2019-07-18/to/2019-07-20\ntime=0000\nstep=0/to/2\nnumber=1/to/2\ngrid=F640\n"
  },
  {
    "path": "weather_dl_v2/license_deployment/Dockerfile",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nFROM continuumio/miniconda3:latest\n\n# Update miniconda\nRUN conda update conda -y\n\n# Add the mamba solver for faster builds\nRUN conda install -n base conda-libmamba-solver\nRUN conda config --set solver libmamba\n\nCOPY . .\n# Create conda env using environment.yml\nRUN conda env create -f environment.yml --debug\n\n# Activate the conda env and update the PATH\nARG CONDA_ENV_NAME=weather-dl-v2-license-dep\nRUN echo \"source activate ${CONDA_ENV_NAME}\" >> ~/.bashrc\nENV PATH /opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH\n\nENTRYPOINT [\"python\", \"-u\", \"fetch.py\"]\n"
  },
  {
    "path": "weather_dl_v2/license_deployment/README.md",
    "content": "# Deployment Instructions & General Notes\n\n### How to create the environment\n```\nconda env create --name weather-dl-v2-license-dep --file=environment.yml\n\nconda activate weather-dl-v2-license-dep\n```\n\n### Make changes in weather_dl_v2/config.json, if required [for running locally]\n```\nexport CONFIG_PATH=/path/to/weather_dl_v2/config.json\n```\n\n### Create the Docker image for license deployment\nRefer to the instructions in weather_dl_v2/README.md."
  },
  {
    "path": "weather_dl_v2/license_deployment/VERSION.txt",
    "content": "1.0.9\n"
  },
  {
    "path": "weather_dl_v2/license_deployment/__init__.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "weather_dl_v2/license_deployment/clients.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n\"\"\"ECMWF Downloader Clients.\"\"\"\n\nimport abc\nimport collections\nimport contextlib\nimport datetime\nimport io\nimport logging\nimport os\nimport time\nimport typing as t\nimport warnings\nfrom urllib.parse import urljoin\n\nfrom cdsapi import api as cds_api\nimport urllib3\nfrom ecmwfapi import api\n\nfrom config import optimize_selection_partition\nfrom manifest import Manifest, Stage\nfrom util import download_with_aria2, retry_with_exponential_backoff\n\nwarnings.simplefilter(\"ignore\", category=urllib3.connectionpool.InsecureRequestWarning)\n\n\nclass Client(abc.ABC):\n    \"\"\"Weather data provider client interface.\n\n    Defines methods and properties required to efficiently interact with weather\n    data providers.\n\n    Attributes:\n        config: A config that contains pipeline parameters, such as API keys.\n        level: Default log level for the client.\n    \"\"\"\n\n    def __init__(self, dataset: str, level: int = logging.INFO) -> None:\n        \"\"\"Clients are initialized with the general CLI configuration.\"\"\"\n        self.dataset = dataset\n        self.logger = logging.getLogger(f\"{__name__}.{type(self).__name__}\")\n        self.logger.setLevel(level)\n\n    @abc.abstractmethod\n    def retrieve(\n        self, dataset: str, selection: t.Dict, output: str, manifest: Manifest\n    ) -> None:\n        \"\"\"Download from data 
source.\"\"\"\n        pass\n\n    @classmethod\n    @abc.abstractmethod\n    def num_requests_per_key(cls, dataset: str) -> int:\n        \"\"\"Specifies the number of workers to be used per api key for the dataset.\"\"\"\n        pass\n\n    @property\n    @abc.abstractmethod\n    def license_url(self):\n        \"\"\"Specifies the License URL.\"\"\"\n        pass\n\n\nclass SplitCDSRequest():\n    \"\"\"Extended CDS class that separates fetch and download stage.\"\"\"\n    def __init__(self, *args, **kwargs):\n        self.__cds_client = cds_api.Client(*args, **kwargs)\n\n    @retry_with_exponential_backoff\n    def _download(self, url, path: str, size: int) -> None:\n        self.__cds_client.info(\"Downloading %s to %s (%s)\", url, path, cds_api.bytes_to_string(size))\n        start = time.time()\n\n        download_with_aria2(url, path)\n\n        elapsed = time.time() - start\n        if elapsed:\n            self.__cds_client.info(\"Download rate %s/s\", cds_api.bytes_to_string(size / elapsed))\n\n    def fetch(self, request: t.Dict, dataset: str) -> t.Dict:\n        result = self.__cds_client.retrieve(dataset, request)\n        return {\"href\": result.location, \"size\": result.content_length}\n\n    def download(self, result: cds_api.Result, target: t.Optional[str] = None) -> None:\n        if target:\n            if os.path.exists(target):\n                # Empty the target file, if it already exists, otherwise the\n                # transfer below might be fooled into thinking we're resuming\n                # an interrupted download.\n                open(target, \"w\").close()\n\n            self._download(result[\"href\"], target, result[\"size\"])\n\n\nclass CdsClient(Client):\n    \"\"\"A client to access weather data from the Cloud Data Store (CDS).\n\n    Datasets on CDS can be found at:\n      https://cds.climate.copernicus.eu/cdsapp#!/search?type=dataset\n\n    The parameters section of the input `config` requires two values: `api_url` and\n  
  `api_key`. Or, these values can be set as the environment variables: `CDSAPI_URL`\n    and `CDSAPI_KEY`. These can be acquired from the following URL, which requires\n    creating a free account: https://cds.climate.copernicus.eu/api-how-to\n\n    The CDS global queues for data access has dynamic rate limits. These can be viewed\n    live here: https://cds.climate.copernicus.eu/live/limits.\n\n    Attributes:\n        config: A config that contains pipeline parameters, such as API keys.\n        level: Default log level for the client.\n    \"\"\"\n\n    \"\"\"Name patterns of datasets that are hosted internally on CDS servers.\"\"\"\n    cds_hosted_datasets = {\"reanalysis-era\"}\n\n    def retrieve(self, dataset: str, selection: t.Dict, manifest: Manifest) -> None:\n        c = CDSClientExtended(\n            url=os.environ.get(\"CLIENT_URL\"),\n            key=os.environ.get(\"CLIENT_KEY\"),\n            debug_callback=self.logger.debug,\n            info_callback=self.logger.info,\n            warning_callback=self.logger.warning,\n            error_callback=self.logger.error,\n        )\n        selection_ = optimize_selection_partition(selection)\n        with StdoutLogger(self.logger, level=logging.DEBUG):\n            manifest.set_stage(Stage.FETCH)\n            precise_fetch_start_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec=\"seconds\")\n            )\n            manifest.prev_stage_precise_start_time = precise_fetch_start_time\n            result = c.fetch(selection_, dataset)\n            return result\n\n    @property\n    def license_url(self):\n        return \"https://cds.climate.copernicus.eu/api/v2/terms/static/licence-to-use-copernicus-products.pdf\"\n\n    @classmethod\n    def num_requests_per_key(cls, dataset: str) -> int:\n        \"\"\"Number of requests per key from the CDS API.\n\n        CDS has dynamic, data-specific limits, defined 
here:\n          https://cds.climate.copernicus.eu/live/limits\n\n        Typically, the reanalysis dataset allows for 3-5 simultaneous requests.\n        For all standard CDS data (backed on disk drives), it's common that 2\n        requests are allowed, though this is dynamically set, too.\n\n        If the Beam pipeline encounters a user request limit error, please cancel\n        all outstanding requests (per each user account) at the following link:\n        https://cds.climate.copernicus.eu/cdsapp#!/yourrequests\n        \"\"\"\n        # TODO(#15): Parse live CDS limits API to set data-specific limits.\n        for internal_set in cls.cds_hosted_datasets:\n            if dataset.startswith(internal_set):\n                return 5\n        return 2\n\n\nclass StdoutLogger(io.StringIO):\n    \"\"\"Special logger to redirect stdout to logs.\"\"\"\n\n    def __init__(self, logger_: logging.Logger, level: int = logging.INFO):\n        super().__init__()\n        self.logger = logger_\n        self.level = level\n        self._redirector = contextlib.redirect_stdout(self)\n\n    def log(self, msg) -> None:\n        self.logger.log(self.level, msg)\n\n    def write(self, msg):\n        if msg and not msg.isspace():\n            self.logger.log(self.level, msg)\n\n    def __enter__(self):\n        self._redirector.__enter__()\n        return self\n\n    def __exit__(self, exc_type, exc_value, traceback):\n        # let contextlib do any exception handling here\n        self._redirector.__exit__(exc_type, exc_value, traceback)\n\n\nclass SplitMARSRequest(api.APIRequest):\n    \"\"\"Extended MARS APIRequest class that separates fetch and download stage.\"\"\"\n\n    @retry_with_exponential_backoff\n    def _download(self, url, path: str, size: int) -> None:\n        self.log(\"Transferring %s into %s\" % (self._bytename(size), path))\n        self.log(\"From %s\" % (url,))\n\n        download_with_aria2(url, path)\n\n    def fetch(self, request: t.Dict, dataset: str) 
-> t.Dict:\n        status = None\n\n        self.connection.submit(\"%s/%s/requests\" % (self.url, self.service), request)\n        self.log(\"Request submitted\")\n        self.log(\"Request id: \" + self.connection.last.get(\"name\"))\n        if self.connection.status != status:\n            status = self.connection.status\n            self.log(\"Request is %s\" % (status,))\n\n        while not self.connection.ready():\n            if self.connection.status != status:\n                status = self.connection.status\n                self.log(\"Request is %s\" % (status,))\n            self.connection.wait()\n\n        if self.connection.status != status:\n            status = self.connection.status\n            self.log(\"Request is %s\" % (status,))\n\n        result = self.connection.result()\n        return result\n\n    def download(self, result: t.Dict, target: t.Optional[str] = None) -> None:\n        if target:\n            if os.path.exists(target):\n                # Empty the target file, if it already exists, otherwise the\n                # transfer below might be fooled into thinking we're resuming\n                # an interrupted download.\n                open(target, \"w\").close()\n\n            self._download(urljoin(self.url, result[\"href\"]), target, result[\"size\"])\n        self.connection.cleanup()\n\n\nclass SplitRequestMixin:\n    c = None\n\n    def fetch(self, req: t.Dict, dataset: t.Optional[str] = None) -> t.Dict:\n        return self.c.fetch(req, dataset)\n\n    def download(self, res: t.Dict, target: str) -> None:\n        self.c.download(res, target)\n\n\nclass CDSClientExtended(SplitRequestMixin):\n    \"\"\"Extended CDS Client class that separates fetch and download stage.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        self.c = SplitCDSRequest(*args, **kwargs)\n\n\nclass MARSECMWFServiceExtended(api.ECMWFService, SplitRequestMixin):\n    \"\"\"Extended MARS ECMFService class that separates fetch and download 
stage.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n        self.c = SplitMARSRequest(\n            self.url,\n            \"services/%s\" % (self.service,),\n            email=self.email,\n            key=self.key,\n            log=self.log,\n            verbose=self.verbose,\n            quiet=self.quiet,\n        )\n\n\nclass PublicECMWFServerExtended(api.ECMWFDataServer, SplitRequestMixin):\n\n    def __init__(self, *args, dataset=\"\", **kwargs):\n        super().__init__(*args, **kwargs)\n        self.c = SplitMARSRequest(\n            self.url,\n            \"datasets/%s\" % (dataset,),\n            email=self.email,\n            key=self.key,\n            log=self.log,\n            verbose=self.verbose,\n        )\n\n\nclass MarsClient(Client):\n    \"\"\"A client to access data from the Meteorological Archival and Retrieval System (MARS).\n\n    See https://www.ecmwf.int/en/forecasts/datasets for a summary of datasets available\n    on MARS. Most notably, MARS provides access to ECMWF's Operational Archive\n    https://www.ecmwf.int/en/forecasts/dataset/operational-archive.\n\n    The client config must contain three parameters to authenticate access to the MARS archive:\n    `api_key`, `api_url`, and `api_email`. 
These can also be configured by setting the\n    commensurate environment variables: `MARSAPI_KEY`, `MARSAPI_URL`, and `MARSAPI_EMAIL`.\n    These credentials can be looked up after registering for an ECMWF account\n    (https://apps.ecmwf.int/registration/) and visiting: https://api.ecmwf.int/v1/key/.\n\n    MARS server activity can be observed at https://apps.ecmwf.int/mars-activity/.\n\n    Attributes:\n        config: A config that contains pipeline parameters, such as API keys.\n        level: Default log level for the client.\n    \"\"\"\n\n    def retrieve(self, dataset: str, selection: t.Dict, manifest: Manifest) -> None:\n        c = MARSECMWFServiceExtended(\n            \"mars\",\n            key=os.environ.get(\"CLIENT_KEY\"),\n            url=os.environ.get(\"CLIENT_URL\"),\n            email=os.environ.get(\"CLIENT_EMAIL\"),\n            log=self.logger.debug,\n            verbose=True,\n        )\n        selection_ = optimize_selection_partition(selection)\n        with StdoutLogger(self.logger, level=logging.DEBUG):\n            manifest.set_stage(Stage.FETCH)\n            precise_fetch_start_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec=\"seconds\")\n            )\n            manifest.prev_stage_precise_start_time = precise_fetch_start_time\n            result = c.fetch(req=selection_)\n            return result\n\n    @property\n    def license_url(self):\n        return \"https://apps.ecmwf.int/datasets/licences/general/\"\n\n    @classmethod\n    def num_requests_per_key(cls, dataset: str) -> int:\n        \"\"\"Number of requests per key (or user) for the Mars API.\n\n        Mars allows 2 active requests per user and 20 queued requests per user, as of Sept 27, 2021.\n        To ensure we never hit a rate limit error during download, we only make use of the active\n        requests.\n        See: 
https://confluence.ecmwf.int/display/UDOC/Total+number+of+requests+a+user+can+submit+-+Web+API+FAQ\n\n        Queued requests can _only_ be canceled manually from a web dashboard. If the\n        `ERROR 101 (USER_QUEUED_LIMIT_EXCEEDED)` error occurs in the Beam pipeline, then go to\n        http://apps.ecmwf.int/webmars/joblist/ and cancel queued jobs.\n        \"\"\"\n        return 2\n\n\nclass ECMWFPublicClient(Client):\n    \"\"\"A client for ECMWF's public datasets, like TIGGE.\"\"\"\n\n    def retrieve(self, dataset: str, selection: t.Dict, manifest: Manifest) -> None:\n        c = PublicECMWFServerExtended(\n            url=os.environ.get(\"CLIENT_URL\"),\n            key=os.environ.get(\"CLIENT_KEY\"),\n            email=os.environ.get(\"CLIENT_EMAIL\"),\n            log=self.logger.debug,\n            verbose=True,\n            dataset=dataset,\n        )\n        selection_ = optimize_selection_partition(selection)\n        with StdoutLogger(self.logger, level=logging.DEBUG):\n            manifest.set_stage(Stage.FETCH)\n            precise_fetch_start_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec=\"seconds\")\n            )\n            manifest.prev_stage_precise_start_time = precise_fetch_start_time\n            result = c.fetch(req=selection_)\n            return result\n\n    @classmethod\n    def num_requests_per_key(cls, dataset: str) -> int:\n        # Experimentally validated request limit.\n        return 5\n\n    @property\n    def license_url(self):\n        if not self.dataset:\n            raise ValueError(\"must specify a dataset for this client!\")\n        return f\"https://apps.ecmwf.int/datasets/data/{self.dataset.lower()}/licence/\"\n\n\nclass FakeClient(Client):\n    \"\"\"A client that writes the selection arguments to the output file.\"\"\"\n\n    def retrieve(self, dataset: str, selection: t.Dict, manifest: Manifest) -> None:\n       
 manifest.set_stage(Stage.RETRIEVE)\n        precise_retrieve_start_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec=\"seconds\")\n        )\n        manifest.prev_stage_precise_start_time = precise_retrieve_start_time\n        self.logger.debug(f\"Downloading {dataset}.\")\n\n    @property\n    def license_url(self):\n        return \"lorem ipsum\"\n\n    @classmethod\n    def num_requests_per_key(cls, dataset: str) -> int:\n        return 1\n\n\nCLIENTS = collections.OrderedDict(\n    cds=CdsClient,\n    mars=MarsClient,\n    ecpublic=ECMWFPublicClient,\n    fake=FakeClient,\n)\n"
  },
  {
    "path": "weather_dl_v2/license_deployment/config.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport calendar\nimport copy\nimport dataclasses\nimport itertools\nimport typing as t\n\nValues = t.Union[t.List[\"Values\"], t.Dict[str, \"Values\"], bool, int, float, str]  # pytype: disable=not-supported-yet\n\n\n@dataclasses.dataclass\nclass Config:\n    \"\"\"Contains pipeline parameters.\n\n    Attributes:\n        config_name:\n            Name of the config file.\n        client:\n            Name of the Weather-API-client. Supported clients are mentioned in the 'CLIENTS' variable.\n        dataset (optional):\n            Name of the target dataset. Allowed options are dictated by the client.\n        partition_keys (optional):\n            Choose the keys from the selection section to partition the data request.\n            This will compute a cartesian cross product of the selected keys\n            and assign each as their own download.\n        target_path:\n            Download artifact filename template. Can make use of Python's standard string formatting.\n            It can contain format symbols to be replaced by partition keys;\n            if this is used, the total number of format symbols must match the number of partition keys.\n        subsection_name:\n            Name of the particular subsection. 
'default' if there is no subsection.\n        force_download:\n            Force redownload of partitions that were previously downloaded.\n        user_id:\n            Username from the environment variables.\n        kwargs (optional):\n            For representing subsections or any other parameters.\n        selection:\n            Contains parameters used to select desired data.\n    \"\"\"\n\n    config_name: str = \"\"\n    client: str = \"\"\n    dataset: t.Optional[str] = \"\"\n    target_path: str = \"\"\n    partition_keys: t.Optional[t.List[str]] = dataclasses.field(default_factory=list)\n    subsection_name: str = \"default\"\n    force_download: bool = False\n    user_id: str = \"unknown\"\n    kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict)\n    selection: t.Dict[str, Values] = dataclasses.field(default_factory=dict)\n\n    @classmethod\n    def from_dict(cls, config: t.Dict) -> \"Config\":\n        config_instance = cls()\n        for section_key, section_value in config.items():\n            if section_key == \"parameters\":\n                for key, value in section_value.items():\n                    if hasattr(config_instance, key):\n                        setattr(config_instance, key, value)\n                    else:\n                        config_instance.kwargs[key] = value\n            if section_key == \"selection\":\n                config_instance.selection = section_value\n        return config_instance\n\n\ndef optimize_selection_partition(selection: t.Dict) -> t.Dict:\n    \"\"\"Compute right-hand-side values for the selection section of a single partition.\n\n    Used to support custom syntax and optimizations, such as 'all'.\n    \"\"\"\n    selection_ = copy.deepcopy(selection)\n\n    if \"date_range\" in selection_.keys():\n        selection_[\"date\"] = selection_[\"date_range\"][0]\n        del selection_[\"date_range\"]\n\n    if \"day\" in selection_.keys() and selection_[\"day\"] == 
\"all\":\n        years, months = selection_[\"year\"], selection_[\"month\"]\n\n        multiples_error = \"When using day='all' in selection, '/' is not allowed in {type}.\"\n\n        if isinstance(years, str):\n            years = [years]\n\n        if isinstance(months, str):\n            months = [months]\n\n        date_ranges = []\n\n        # Generating dates for every year-month.\n        for year, month in itertools.product(years, months):\n\n            if isinstance(year, str):\n                assert \"/\" not in year, multiples_error.format(type=\"year\")\n\n            if isinstance(month, str):\n                assert \"/\" not in month, multiples_error.format(type=\"month\")\n\n            year, month = int(year), int(month)\n\n            _, n_days_in_month = calendar.monthrange(year, month)\n\n            date_range = [f'{year:04d}-{month:02d}-{day:02d}' for day in range(1, n_days_in_month + 1)]\n\n            date_ranges.extend(date_range)\n\n        selection_[\"date\"] = date_ranges\n        del selection_[\"day\"]\n        del selection_[\"month\"]\n        del selection_[\"year\"]\n\n    return selection_\n"
  },
  {
    "path": "weather_dl_v2/license_deployment/database.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport abc\nimport time\nimport logging\nimport firebase_admin\nfrom firebase_admin import firestore\nfrom firebase_admin import credentials\nfrom google.cloud.firestore_v1 import DocumentSnapshot, DocumentReference\nfrom google.cloud.firestore_v1.types import WriteResult\nfrom google.cloud.firestore_v1.base_query import FieldFilter, And\nfrom util import get_wait_interval\nfrom deployment_config import get_config\n\nlogger = logging.getLogger(__name__)\n\n\nclass Database(abc.ABC):\n\n    @abc.abstractmethod\n    def _get_db(self):\n        pass\n\n\nclass CRUDOperations(abc.ABC):\n\n    @abc.abstractmethod\n    def _initialize_license_deployment(self, license_id: str) -> dict:\n        pass\n\n    @abc.abstractmethod\n    def _get_config_from_queue_by_license_id(self, license_id: str) -> dict:\n        pass\n\n    @abc.abstractmethod\n    def _remove_config_from_license_queue(\n        self, license_id: str, config_name: str\n    ) -> None:\n        pass\n\n    @abc.abstractmethod\n    def _empty_license_queue(self, license_id: str) -> None:\n        pass\n\n    @abc.abstractmethod\n    def _get_partition_from_manifest(self, config_name: str) -> str:\n        pass\n\n    @abc.abstractmethod\n    def _mark_license_status(self, license_id: str, status: str) -> None:\n        pass\n\nclass FirestoreClient(Database, CRUDOperations):\n\n    def _get_db(self) -> 
firestore.firestore.Client:\n        \"\"\"Acquire a firestore client, initializing the firebase app if necessary.\n        Will attempt to get the db client five times. If it's still unsuccessful, a\n        `ManifestException` will be raised.\n        \"\"\"\n        db = None\n        attempts = 0\n\n        while db is None:\n            try:\n                db = firestore.client()\n            except ValueError as e:\n                # The above call will fail with a value error when the firebase app is not initialized.\n                # Initialize the app here, and try again.\n                # Use the application default credentials.\n                cred = credentials.ApplicationDefault()\n\n                firebase_admin.initialize_app(cred)\n                logger.info(\"Initialized Firebase App.\")\n\n                if attempts > 4:\n                    raise RuntimeError(\n                        \"Exceeded number of retries to get firestore client.\"\n                    ) from e\n\n            time.sleep(get_wait_interval(attempts))\n\n            attempts += 1\n\n        return db\n\n    def _initialize_license_deployment(self, license_id: str) -> dict:\n        result: DocumentSnapshot = (\n            self._get_db()\n            .collection(get_config().license_collection)\n            .document(license_id)\n            .get()\n        )\n        return result.to_dict()\n\n    def _get_config_from_queue_by_license_id(self, license_id: str) -> str | None:\n        result: DocumentSnapshot = (\n            self._get_db()\n            .collection(get_config().queues_collection)\n            .document(license_id)\n            .get([\"queue\"])\n        )\n        if result.exists:\n            queue = result.to_dict()[\"queue\"]\n            if len(queue) > 0:\n                return queue[0]\n        return None\n\n    def _get_partition_from_manifest(self, config_name: str) -> str | None:\n        transaction = self._get_db().transaction()\n       
 return get_partition_from_manifest(transaction, config_name)\n\n    def _remove_config_from_license_queue(\n        self, license_id: str, config_name: str\n    ) -> None:\n        result: WriteResult = (\n            self._get_db()\n            .collection(get_config().queues_collection)\n            .document(license_id)\n            .update({\"queue\": firestore.ArrayRemove([config_name])})\n        )\n        logger.info(\n            f\"Updated {license_id} queue in 'queues' collection. Update_time: {result.update_time}.\"\n        )\n\n    def _empty_license_queue(self, license_id: str) -> None:\n        result: WriteResult = (\n            self._get_db()\n            .collection(get_config().queues_collection)\n            .document(license_id)\n            .update({\"queue\": []})\n        )\n        logger.info(\n            f\"Updated {license_id} queue in 'queues' collection. Update_time: {result.update_time}.\"\n        )\n\n    def _mark_license_status(self, license_id: str, status: str) -> None:\n        timestamp = (\n            self._get_db()\n            .collection(get_config().license_collection)\n            .document(license_id)\n            .update({\"status\": status})\n        )\n        logger.info(\n            f\"Updated {license_id} in 'license' collection. 
Update_time: {timestamp}.\"\n        )\n\n# TODO: Firestore transaction fails after reading a document 20 times with roll over.\n# This happens when too many licenses try to access the same partition document.\n# Find some alternative approach to handle this.\n@firestore.transactional\ndef get_partition_from_manifest(transaction, config_name: str) -> str | None:\n    db_client = FirestoreClient()\n    filter_1 = FieldFilter(\"config_name\", \"==\", config_name)\n    filter_2 = FieldFilter(\"status\", \"==\", \"scheduled\")\n    and_filter = And(filters=[filter_1, filter_2])\n\n    snapshot = (\n        db_client._get_db()\n        .collection(get_config().manifest_collection)\n        .where(filter=and_filter)\n        .limit(1)\n        .get(transaction=transaction)\n    )\n    if len(snapshot) > 0:\n        snapshot = snapshot[0]\n    else:\n        return None\n\n    ref: DocumentReference = (\n        db_client._get_db()\n        .collection(get_config().manifest_collection)\n        .document(snapshot.id)\n    )\n    transaction.update(ref, {\"status\": \"processing\"})\n\n    return snapshot.to_dict()\n"
  },
  {
    "path": "weather_dl_v2/license_deployment/deployment_config.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport dataclasses\nimport typing as t\nimport json\nimport os\nimport logging\n\nlogger = logging.getLogger(__name__)\n\nValues = t.Union[t.List[\"Values\"], t.Dict[str, \"Values\"], bool, int, float, str]  # pytype: disable=not-supported-yet\n\n\n@dataclasses.dataclass\nclass DeploymentConfig:\n    download_collection: str = \"\"\n    queues_collection: str = \"\"\n    license_collection: str = \"\"\n    manifest_collection: str = \"\"\n    downloader_k8_image: str = \"\"\n    kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict)\n\n    @classmethod\n    def from_dict(cls, config: t.Dict):\n        config_instance = cls()\n\n        for key, value in config.items():\n            if hasattr(config_instance, key):\n                setattr(config_instance, key, value)\n            else:\n                config_instance.kwargs[key] = value\n\n        return config_instance\n\n\ndeployment_config = None\n\n\ndef get_config():\n    global deployment_config\n    if deployment_config:\n        return deployment_config\n\n    deployment_config_json = \"config/config.json\"\n    if not os.path.exists(deployment_config_json):\n        deployment_config_json = os.environ.get(\"CONFIG_PATH\", None)\n\n    if deployment_config_json is None:\n        logger.error(\"Couldn't load config file for license deployment.\")\n        raise 
FileNotFoundError(\"Couldn't load config file for license deployment.\")\n\n    with open(deployment_config_json) as file:\n        config_dict = json.load(file)\n        deployment_config = DeploymentConfig.from_dict(config_dict)\n\n    return deployment_config\n"
  },
  {
    "path": "weather_dl_v2/license_deployment/downloader.yaml",
    "content": "apiVersion: batch/v1\nkind: Job\nmetadata:\n  name: downloader-with-ttl\nspec:\n  ttlSecondsAfterFinished: 0\n  backoffLimit: 0\n  template:\n    spec:\n      nodeSelector:\n        cloud.google.com/gke-nodepool: downloader-pool\n      containers:\n      - name: downloader\n        image: XXXXXXX\n        imagePullPolicy: Always\n        command: []\n        resources:\n          requests:\n            cpu: \"1000m\"                  # CPU: 1 vCPU\n            memory: \"2Gi\"                 # RAM: 2 GiB\n            ephemeral-storage: \"100Gi\"    # Storage: 100 GiB\n        volumeMounts:\n        - name: data\n          mountPath: /data\n        - name: config-volume\n          mountPath: ./config\n      restartPolicy: Never\n      volumes:\n      - name: data\n        emptyDir:\n          sizeLimit: 100Gi\n      - name: config-volume\n        configMap:\n          name: dl-v2-config"
  },
  {
    "path": "weather_dl_v2/license_deployment/environment.yml",
    "content": "name: weather-dl-v2-license-dep\nchannels:\n  - conda-forge\ndependencies:\n  - python=3.10\n  - geojson\n  - ecmwf-api-client=1.6.3\n  - pip=22.3\n  - cdsapi=0.7.5\n  - pip:\n    - kubernetes\n    - google-cloud-logging\n    - google-cloud-secret-manager\n    - aiohttp\n    - numpy\n    - xarray\n    - apache-beam[gcp]\n    - firebase-admin\n"
  },
  {
    "path": "weather_dl_v2/license_deployment/fetch.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom concurrent.futures import ThreadPoolExecutor\nfrom google.cloud import secretmanager\nimport json\nimport logging\nimport time\nimport sys\nimport os\nimport google.cloud.logging\n\nfrom database import FirestoreClient\nfrom job_creator import create_download_job\nfrom clients import CLIENTS\nfrom manifest import FirestoreManifest\nfrom util import exceptionit, ThreadSafeDict, GracefulKiller\n\ndb_client = FirestoreClient()\nlog_client = google.cloud.logging.Client()\nlog_client.setup_logging()\nsecretmanager_client = secretmanager.SecretManagerServiceClient()\nCONFIG_MAX_ERROR_COUNT = 10\n\nlogger = logging.getLogger(__name__)\n\ndef create_job(request, result):\n    res = {\n        \"config_name\": request[\"config_name\"],\n        \"dataset\": request[\"dataset\"],\n        \"selection\": json.loads(request[\"selection\"]),\n        \"user_id\": request[\"username\"],\n        \"url\": result[\"href\"],\n        \"target_path\": request[\"location\"],\n        \"license_id\": license_id,\n    }\n\n    data_str = json.dumps(res)\n    logger.info(f\"Creating download job for res: {data_str}\")\n    create_download_job(data_str)\n\n\n@exceptionit\ndef make_fetch_request(request, error_map: ThreadSafeDict):\n    client = CLIENTS[client_name](request[\"dataset\"])\n    manifest = FirestoreManifest(license_id=license_id)\n    logger.info(\n        f\"By using 
{client_name} datasets, \"\n        f\"users agree to the terms and conditions specified in {client.license_url!r}.\"\n    )\n\n    target = request[\"location\"]\n    selection = json.loads(request[\"selection\"])\n\n    logger.info(f\"Fetching data for {target!r}.\")\n\n    config_name = request[\"config_name\"]\n\n    if not error_map.has_key(config_name):\n        error_map[config_name] = 0\n\n    if error_map[config_name] >= CONFIG_MAX_ERROR_COUNT:\n        logger.info(f\"Error count for config {config_name} exceeded CONFIG_MAX_ERROR_COUNT ({CONFIG_MAX_ERROR_COUNT}).\")\n        error_map.remove(config_name)\n        logger.info(f\"Removing config {config_name} from license queue.\")\n        # Remove config from this license queue.\n        db_client._remove_config_from_license_queue(license_id=license_id, config_name=config_name)\n        return\n\n    # Wait for exponential time based on error count.\n    if error_map[config_name] > 0:\n        logger.info(f\"Error count for  config {config_name}: {error_map[config_name]}.\")\n        sleep_time = error_map.exponential_time(config_name)\n        logger.info(f\"Sleeping for {sleep_time} secs.\")\n        time.sleep(sleep_time)\n\n    try:\n        with manifest.transact(\n            request[\"config_name\"],\n            request[\"dataset\"],\n            selection,\n            target,\n            request[\"username\"],\n        ):\n            result = client.retrieve(request[\"dataset\"], selection, manifest)\n    except Exception as e:\n        # We are handling this as generic case as CDS client throws generic exceptions.\n\n        # License expired.\n        if \"Access token expired\" in str(e):\n            logger.error(f\"{license_id} expired. Emptying queue! 
error: {e}.\")\n            db_client._empty_license_queue(license_id=license_id)\n            db_client._mark_license_status(license_id, \"License Expired.\")\n            return\n\n        if \"Access token disabled'\" in str(e):\n            logger.error(f\"{license_id} disabled. Emptying queue! error: {e}.\")\n            db_client._empty_license_queue(license_id=license_id)\n            db_client._mark_license_status(license_id, \"License Disabled.\")\n            return\n\n        # License queue full on client side.\n        if \"USER_QUEUED_LIMIT_EXCEEDED\" in str(e) or \\\n            \"Too many queued requests\" in str(e):\n            logger.error(f\"{license_id} queue full. Emptying queue! error: {e}.\")\n            db_client._empty_license_queue(license_id=license_id)\n            db_client._mark_license_status(license_id, \"License Queue Full.\")\n            return\n\n        # Increment error count for a config.\n        logger.error(f\"Partition fetching failed. Error {e}.\")\n        error_map.increment(config_name)\n        return\n\n    # If any partition in successful reset the error count.\n    error_map[config_name] = 0\n    create_job(request, result)\n\n\ndef fetch_request_from_db():\n    request = None\n    config_name = db_client._get_config_from_queue_by_license_id(license_id)\n    if config_name:\n        try:\n            logger.info(f\"Fetching partition for {config_name}.\")\n            request = db_client._get_partition_from_manifest(config_name)\n            if not request:\n                db_client._remove_config_from_license_queue(license_id, config_name)\n        except Exception as e:\n            logger.error(\n                f\"Error in fetch_request_from_db for {config_name}. 
error: {e}.\"\n            )\n    return request\n\n\ndef main():\n    logger.info(\"Started looking at the request.\")\n    error_map = ThreadSafeDict()\n    killer = GracefulKiller()\n    with ThreadPoolExecutor(concurrency_limit) as executor:\n        # Disclaimer: A license will pick always pick concurrency_limit + 1\n        # parition. One extra parition will be kept in threadpool task queue.\n\n        log_count = 0\n        while True:\n            # Check if SIGTERM was recived for graceful termination.\n            if not killer.kill_now:\n                # Fetch a request from the database.\n                request = fetch_request_from_db()\n            else:\n                logger.warning('SIGTERM recieved. Stopping further requets processing.')\n                break\n\n            if request is not None:\n                executor.submit(make_fetch_request, request, error_map)\n            else:\n                logger.info(\"No request available. Waiting...\")\n                time.sleep(30)\n\n            # Each license should not pick more partitions than it's\n            # concurrency_limit. We limit the threadpool queue size to just 1\n            # to prevent the license from picking more partitions than\n            # it's concurrency_limit. When an executor is freed up, the task\n            # in queue is picked and license fetches another task.\n            while executor._work_queue.qsize() >= 1:\n                # To prevent flooding of this log, we log this every 60 seconds.\n                if log_count%60 == 0:\n                    logger.info(\"Worker busy. Waiting...\")\n                # Reset log_count if it goes beyond 3600.\n                log_count = 1 if log_count >= 3600 else log_count + 1\n                time.sleep(1)\n\n        logger.warning('Graceful Termination. 
Waiting for remaining requests to complete.')\n\n        # Making sure all pending requests are completed.\n        executor.shutdown(wait=True)\n\n        logger.warning('Graceful Termination. Completed all pending requests.')\n\n        # We want mark the pod as failed as we want to start a new pod which will\n        # continue to fetch requests.\n        raise RuntimeError('License Deployment was Graceful Terminated. ' \\\n                           'Raising Error to mark the pod as failed.')\n\n\ndef boot_up(license: str) -> None:\n    global license_id, client_name, concurrency_limit\n\n    result = db_client._initialize_license_deployment(license)\n    license_id = license\n    client_name = result[\"client_name\"]\n    concurrency_limit = result[\"number_of_requests\"]\n\n    response = secretmanager_client.access_secret_version(\n        request={\"name\": result[\"secret_id\"]}\n    )\n    payload = response.payload.data.decode(\"UTF-8\")\n    secret_dict = json.loads(payload)\n\n    os.environ.setdefault(\"CLIENT_URL\", secret_dict.get(\"api_url\", \"\"))\n    os.environ.setdefault(\"CLIENT_KEY\", secret_dict.get(\"api_key\", \"\"))\n    os.environ.setdefault(\"CLIENT_EMAIL\", secret_dict.get(\"api_email\", \"\"))\n\n\nif __name__ == \"__main__\":\n    try:\n        license = sys.argv[2]\n\n        logger.info(f\"Deployment for license: {license}.\")\n        boot_up(license)\n        main()\n    except Exception as e:\n        logger.info(f\"License error: {e}.\")\n        raise e\n\n    logger.info('License deployment shutting down.')\n"
  },
  {
    "path": "weather_dl_v2/license_deployment/job_creator.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom os import path\nimport yaml\nimport json\nimport uuid\nfrom kubernetes import client, config\nfrom deployment_config import get_config\n\n\ndef create_download_job(message):\n    \"\"\"Creates a kubernetes workflow of type Job for downloading the data.\"\"\"\n    parsed_message = json.loads(message)\n    (\n        config_name,\n        dataset,\n        selection,\n        user_id,\n        url,\n        target_path,\n        license_id,\n    ) = parsed_message.values()\n    selection = str(selection).replace(\" \", \"\")\n    config.load_config()\n\n    with open(path.join(path.dirname(__file__), \"downloader.yaml\")) as f:\n        dep = yaml.safe_load(f)\n        uid = uuid.uuid4()\n        dep[\"metadata\"][\"name\"] = f\"downloader-job-id-{uid}\"\n        dep[\"spec\"][\"template\"][\"spec\"][\"containers\"][0][\"command\"] = [\n            \"python\",\n            \"downloader.py\",\n            config_name,\n            dataset,\n            selection,\n            user_id,\n            url,\n            target_path,\n            license_id,\n        ]\n        dep[\"spec\"][\"template\"][\"spec\"][\"containers\"][0][\n            \"image\"\n        ] = get_config().downloader_k8_image\n        batch_api = client.BatchV1Api()\n        batch_api.create_namespaced_job(body=dep, namespace=\"default\")\n"
  },
  {
    "path": "weather_dl_v2/license_deployment/manifest.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\n\"\"\"Client interface for connecting to a manifest.\"\"\"\n\nimport abc\nimport logging\nimport dataclasses\nimport datetime\nimport enum\nimport json\nimport pandas as pd\nimport time\nimport traceback\nimport typing as t\n\nfrom util import (\n    to_json_serializable_type,\n    fetch_geo_polygon,\n    get_file_size,\n    get_wait_interval,\n    generate_md5_hash,\n    GLOBAL_COVERAGE_AREA,\n)\n\nimport firebase_admin\nfrom firebase_admin import credentials\nfrom firebase_admin import firestore\nfrom google.cloud.firestore_v1 import DocumentReference\nfrom google.cloud.firestore_v1.types import WriteResult\nfrom deployment_config import get_config\nfrom database import Database\n\nlogger = logging.getLogger(__name__)\n\n\"\"\"An implementation-dependent Manifest URI.\"\"\"\nLocation = t.NewType(\"Location\", str)\n\n\nclass ManifestException(Exception):\n    \"\"\"Errors that occur in Manifest Clients.\"\"\"\n\n    pass\n\n\nclass Stage(enum.Enum):\n    \"\"\"A request can be either in one of the following stages at a time:\n\n    fetch : This represents request is currently in fetch stage i.e. request placed on the client's server\n        & waiting for some result before starting download (eg. MARS client).\n    download : This represents request is currently in download stage i.e. 
data is being downloading from client's\n        server to the worker's local file system.\n    upload : This represents request is currently in upload stage i.e. data is getting uploaded from worker's local\n        file system to target location (GCS path).\n    retrieve : In case of clients where there is no proper separation of fetch & download stages (eg. CDS client),\n        request will be in the retrieve stage i.e. fetch + download.\n    \"\"\"\n\n    RETRIEVE = \"retrieve\"\n    FETCH = \"fetch\"\n    DOWNLOAD = \"download\"\n    UPLOAD = \"upload\"\n\n\nclass Status(enum.Enum):\n    \"\"\"Depicts the request's state status:\n\n    scheduled : A request partition is created & scheduled for processing.\n        Note: Its corresponding state can be None only.\n    processing: This represents that the request picked by license deployment.\n    in-progress : This represents the request state is currently in-progress (i.e. running).\n        The next status would be \"success\" or \"failure\".\n    success : This represents the request state execution completed successfully without any error.\n    failure : This represents the request state execution failed.\n    \"\"\"\n\n    PROCESSING = \"processing\"\n    SCHEDULED = \"scheduled\"\n    IN_PROGRESS = \"in-progress\"\n    SUCCESS = \"success\"\n    FAILURE = \"failure\"\n\n\n@dataclasses.dataclass\nclass DownloadStatus:\n    \"\"\"Data recorded in `Manifest`s reflecting the status of a download.\"\"\"\n\n    \"\"\"The name of the config file associated with the request.\"\"\"\n    config_name: str = \"\"\n\n    \"\"\"Represents the dataset field of the configuration.\"\"\"\n    dataset: t.Optional[str] = \"\"\n\n    \"\"\"Copy of selection section of the configuration.\"\"\"\n    selection: t.Dict = dataclasses.field(default_factory=dict)\n\n    \"\"\"Location of the downloaded data.\"\"\"\n    location: str = \"\"\n\n    \"\"\"Represents area covered by the shard.\"\"\"\n    area: str = \"\"\n\n    
\"\"\"Current stage of request : 'fetch', 'download', 'retrieve', 'upload' or None.\"\"\"\n    stage: t.Optional[Stage] = None\n\n    \"\"\"Download status: 'scheduled', 'in-progress', 'success', or 'failure'.\"\"\"\n    status: t.Optional[Status] = None\n\n    \"\"\"Cause of error, if any.\"\"\"\n    error: t.Optional[str] = \"\"\n\n    \"\"\"Identifier for the user running the download.\"\"\"\n    username: str = \"\"\n\n    \"\"\"Shard size in GB.\"\"\"\n    size: t.Optional[float] = 0\n\n    \"\"\"A UTC datetime when download was scheduled.\"\"\"\n    scheduled_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the retrieve stage starts.\"\"\"\n    retrieve_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the retrieve state ends.\"\"\"\n    retrieve_end_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the fetch state starts.\"\"\"\n    fetch_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the fetch state ends.\"\"\"\n    fetch_end_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the download state starts.\"\"\"\n    download_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the download state ends.\"\"\"\n    download_end_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the upload state starts.\"\"\"\n    upload_start_time: t.Optional[str] = \"\"\n\n    \"\"\"A UTC datetime when the upload state ends.\"\"\"\n    upload_end_time: t.Optional[str] = \"\"\n\n    @classmethod\n    def from_dict(cls, download_status: t.Dict) -> \"DownloadStatus\":\n        \"\"\"Instantiate DownloadStatus dataclass from dict.\"\"\"\n        download_status_instance = cls()\n        for key, value in download_status.items():\n            if key == \"status\":\n                setattr(download_status_instance, key, Status(value))\n            elif key == \"stage\" and value is not None:\n                setattr(download_status_instance, key, Stage(value))\n            else:\n      
          setattr(download_status_instance, key, value)\n        return download_status_instance\n\n    @classmethod\n    def to_dict(cls, instance) -> t.Dict:\n        \"\"\"Return the fields of a dataclass instance as a manifest ingestible\n        dictionary mapping of field names to field values.\"\"\"\n        download_status_dict = {}\n        for field in dataclasses.fields(instance):\n            key = field.name\n            value = getattr(instance, field.name)\n            if isinstance(value, Status) or isinstance(value, Stage):\n                download_status_dict[key] = value.value\n            elif isinstance(value, pd.Timestamp):\n                download_status_dict[key] = value.isoformat()\n            elif key == \"selection\" and value is not None:\n                download_status_dict[key] = json.dumps(value)\n            else:\n                download_status_dict[key] = value\n        return download_status_dict\n\n\n@dataclasses.dataclass\nclass Manifest(abc.ABC):\n    \"\"\"Abstract manifest of download statuses.\n\n    Update download statuses to some storage medium.\n\n    This class lets one indicate that a download is `scheduled` or in a transaction process.\n    In the event of a transaction, a download will be updated with an `in-progress`, `success`\n    or `failure` status (with accompanying metadata).\n\n    Example:\n        ```\n        my_manifest = parse_manifest_location(Location('fs://some-firestore-collection'))\n\n        # Schedule data for download\n        my_manifest.schedule({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username')\n\n        # ...\n\n        # Initiate a transaction – it will record that the download is `in-progess`\n        with my_manifest.transact({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username') as tx:\n            # download logic here\n            pass\n\n            # ...\n\n            # on error, will record the download as a `failure` before propagating the error.  
By default, it will\n            # record download as a `success`.\n        ```\n\n    Attributes:\n        status: The current `DownloadStatus` of the Manifest.\n    \"\"\"\n\n    # To reduce the impact of _read() and _update() calls\n    # on the start time of the stage.\n    license_id: str = \"\"\n    prev_stage_precise_start_time: t.Optional[str] = None\n    status: t.Optional[DownloadStatus] = None\n\n    # This is overridden in subclass.\n    def __post_init__(self):\n        \"\"\"Initialize the manifest.\"\"\"\n        pass\n\n    def schedule(\n        self,\n        config_name: str,\n        dataset: str,\n        selection: t.Dict,\n        location: str,\n        user: str,\n    ) -> None:\n        \"\"\"Indicate that a job has been scheduled for download.\n\n        'scheduled' jobs occur before 'in-progress', 'success' or 'finished'.\n        \"\"\"\n        scheduled_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec=\"seconds\")\n        )\n        self.status = DownloadStatus(\n            config_name=config_name,\n            dataset=dataset if dataset else None,\n            selection=selection,\n            location=location,\n            area=fetch_geo_polygon(selection.get(\"area\", GLOBAL_COVERAGE_AREA)),\n            username=user,\n            stage=None,\n            status=Status.SCHEDULED,\n            error=None,\n            size=None,\n            scheduled_time=scheduled_time,\n            retrieve_start_time=None,\n            retrieve_end_time=None,\n            fetch_start_time=None,\n            fetch_end_time=None,\n            download_start_time=None,\n            download_end_time=None,\n            upload_start_time=None,\n            upload_end_time=None,\n        )\n        self._update(self.status)\n\n    def skip(\n        self,\n        config_name: str,\n        dataset: str,\n        selection: t.Dict,\n        location: str,\n        
user: str,\n    ) -> None:\n        \"\"\"Updates the manifest to mark the shards that were skipped in the current job\n        as 'upload' stage and 'success' status, indicating that they have already been downloaded.\n        \"\"\"\n        old_status = self._read(location)\n        # The manifest needs to be updated for a skipped shard if its entry is not present, or\n        # if the stage is not 'upload', or if the stage is 'upload' but the status is not 'success'.\n        if (\n            old_status.location != location\n            or old_status.stage != Stage.UPLOAD\n            or old_status.status != Status.SUCCESS\n        ):\n            current_utc_time = (\n                datetime.datetime.utcnow()\n                .replace(tzinfo=datetime.timezone.utc)\n                .isoformat(timespec=\"seconds\")\n            )\n\n            size = get_file_size(location)\n\n            status = DownloadStatus(\n                config_name=config_name,\n                dataset=dataset if dataset else None,\n                selection=selection,\n                location=location,\n                area=fetch_geo_polygon(selection.get(\"area\", GLOBAL_COVERAGE_AREA)),\n                username=user,\n                stage=Stage.UPLOAD,\n                status=Status.SUCCESS,\n                error=None,\n                size=size,\n                scheduled_time=None,\n                retrieve_start_time=None,\n                retrieve_end_time=None,\n                fetch_start_time=None,\n                fetch_end_time=None,\n                download_start_time=None,\n                download_end_time=None,\n                upload_start_time=current_utc_time,\n                upload_end_time=current_utc_time,\n            )\n            self._update(status)\n            logger.info(\n                f\"Manifest updated for skipped shard: {location!r} -- {DownloadStatus.to_dict(status)!r}.\"\n            )\n\n    def _set_for_transaction(\n        self,\n     
   config_name: str,\n        dataset: str,\n        selection: t.Dict,\n        location: str,\n        user: str,\n    ) -> None:\n        \"\"\"Reset Manifest state in preparation for a new transaction.\"\"\"\n        self.status = dataclasses.replace(self._read(location))\n        self.status.config_name = config_name\n        self.status.dataset = dataset if dataset else None\n        self.status.selection = selection\n        self.status.location = location\n        self.status.username = user\n\n    def __enter__(self) -> None:\n        pass\n\n    def __exit__(self, exc_type, exc_inst, exc_tb) -> None:\n        \"\"\"Record end status of a transaction as either 'success' or 'failure'.\"\"\"\n        if exc_type is None:\n            status = Status.SUCCESS\n            error = None\n        else:\n            status = Status.FAILURE\n            # For explanation, see https://docs.python.org/3/library/traceback.html#traceback.format_exception\n            error = f\"license_id: {self.license_id} \"\n            error += \"\\n\".join(traceback.format_exception(exc_type, exc_inst, exc_tb))\n\n        new_status = dataclasses.replace(self.status)\n        new_status.error = error\n        new_status.status = status\n        current_utc_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec=\"seconds\")\n        )\n\n        # This is necessary for setting the precise start time of the previous stage\n        # and end time of the final stage, as well as handling the case of Status.FAILURE.\n        if new_status.stage == Stage.FETCH:\n            new_status.fetch_start_time = self.prev_stage_precise_start_time\n            new_status.fetch_end_time = current_utc_time\n        elif new_status.stage == Stage.RETRIEVE:\n            new_status.retrieve_start_time = self.prev_stage_precise_start_time\n            new_status.retrieve_end_time = current_utc_time\n        elif 
new_status.stage == Stage.DOWNLOAD:\n            new_status.download_start_time = self.prev_stage_precise_start_time\n            new_status.download_end_time = current_utc_time\n        else:\n            new_status.upload_start_time = self.prev_stage_precise_start_time\n            new_status.upload_end_time = current_utc_time\n\n        new_status.size = get_file_size(new_status.location)\n\n        self.status = new_status\n\n        self._update(self.status)\n\n    def transact(\n        self,\n        config_name: str,\n        dataset: str,\n        selection: t.Dict,\n        location: str,\n        user: str,\n    ) -> \"Manifest\":\n        \"\"\"Create a download transaction.\"\"\"\n        self._set_for_transaction(config_name, dataset, selection, location, user)\n        return self\n\n    def set_stage(self, stage: Stage) -> None:\n        \"\"\"Sets the current stage in manifest.\"\"\"\n        prev_stage = self.status.stage\n        new_status = dataclasses.replace(self.status)\n        new_status.stage = stage\n        new_status.status = Status.IN_PROGRESS\n        current_utc_time = (\n            datetime.datetime.utcnow()\n            .replace(tzinfo=datetime.timezone.utc)\n            .isoformat(timespec=\"seconds\")\n        )\n\n        if stage == Stage.FETCH:\n            new_status.fetch_start_time = current_utc_time\n        elif stage == Stage.RETRIEVE:\n            new_status.retrieve_start_time = current_utc_time\n        elif stage == Stage.DOWNLOAD:\n            new_status.fetch_start_time = self.prev_stage_precise_start_time\n            new_status.fetch_end_time = current_utc_time\n            new_status.download_start_time = current_utc_time\n        else:\n            if prev_stage == Stage.DOWNLOAD:\n                new_status.download_start_time = self.prev_stage_precise_start_time\n                new_status.download_end_time = current_utc_time\n            else:\n                new_status.retrieve_start_time = 
self.prev_stage_precise_start_time\n                new_status.retrieve_end_time = current_utc_time\n            new_status.upload_start_time = current_utc_time\n\n        self.status = new_status\n        self._update(self.status)\n\n    @abc.abstractmethod\n    def _read(self, location: str) -> DownloadStatus:\n        pass\n\n    @abc.abstractmethod\n    def _update(self, download_status: DownloadStatus) -> None:\n        pass\n\n\nclass FirestoreManifest(Manifest, Database):\n    \"\"\"A Firestore Manifest.\n    This Manifest implementation stores DownloadStatuses in a Firebase document store.\n    The document hierarchy for the manifest is as follows:\n      [manifest  <or manifest name, configurable from CLI>]\n      ├── doc_id (md5 hash of the path) { 'selection': {...}, 'location': ..., 'username': ... }\n      └── etc...\n    Where `[<name>]` indicates a collection and `<name> {...}` indicates a document.\n    \"\"\"\n\n    def _get_db(self) -> firestore.firestore.Client:\n        \"\"\"Acquire a firestore client, initializing the firebase app if necessary.\n        Will attempt to get the db client five times. 
If it's still unsuccessful, a\n        `ManifestException` will be raised.\n        \"\"\"\n        db = None\n        attempts = 0\n\n        while db is None:\n            try:\n                db = firestore.client()\n            except ValueError as e:\n                # The above call will fail with a value error when the firebase app is not initialized.\n                # Initialize the app here, and try again.\n                # Use the application default credentials.\n                cred = credentials.ApplicationDefault()\n\n                firebase_admin.initialize_app(cred)\n                logger.info(\"Initialized Firebase App.\")\n\n                if attempts > 4:\n                    raise ManifestException(\n                        \"Exceeded number of retries to get firestore client.\"\n                    ) from e\n\n            time.sleep(get_wait_interval(attempts))\n\n            attempts += 1\n\n        return db\n\n    def _read(self, location: str) -> DownloadStatus:\n        \"\"\"Reads the JSON data from a manifest.\"\"\"\n\n        doc_id = generate_md5_hash(location)\n\n        # Update document with download status\n        download_doc_ref = self.root_document_for_store(doc_id)\n\n        result = download_doc_ref.get()\n        row = {}\n        if result.exists:\n            records = result.to_dict()\n            row = {n: to_json_serializable_type(v) for n, v in records.items()}\n        return DownloadStatus.from_dict(row)\n\n    def _update(self, download_status: DownloadStatus) -> None:\n        \"\"\"Update or create a download status record.\"\"\"\n        logger.info(\"Updating Firestore Manifest.\")\n\n        status = DownloadStatus.to_dict(download_status)\n        doc_id = generate_md5_hash(status[\"location\"])\n\n        # Update document with download status.\n        download_doc_ref = self.root_document_for_store(doc_id)\n\n        result: WriteResult = download_doc_ref.set(status)\n\n        logger.info(\n         
   \"Firestore manifest updated. \" +\n            f\"update_time={result.update_time}, \" +\n            f\"status={status['status']} \" +\n            f\"stage={status['stage']} \" +\n            f\"filename={download_status.location}.\"\n        )\n\n    def root_document_for_store(self, store_scheme: str) -> DocumentReference:\n        \"\"\"Get the root manifest document given the user's config and current document's storage location.\"\"\"\n        return (\n            self._get_db()\n            .collection(get_config().manifest_collection)\n            .document(store_scheme)\n        )\n"
  },
  {
    "path": "weather_dl_v2/license_deployment/util.py",
    "content": "# Copyright 2023 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nimport datetime\nimport logging\nimport geojson\nimport hashlib\nimport itertools\nimport os\nimport signal\nimport socket\nimport subprocess\nimport sys\nimport typing as t\n\nimport numpy as np\nimport pandas as pd\nfrom apache_beam.io.gcp import gcsio\nfrom apache_beam.utils import retry\nfrom xarray.core.utils import ensure_us_time_resolution\nfrom urllib.parse import urlparse\nfrom google.api_core.exceptions import BadRequest\nfrom threading import Lock\n\nlogger = logging.getLogger(__name__)\nlogger.setLevel(logging.INFO)\n\nLATITUDE_RANGE = (-90, 90)\nLONGITUDE_RANGE = (-180, 180)\nGLOBAL_COVERAGE_AREA = [90, -180, -90, 180]\n\n\ndef exceptionit(func):\n    def inner_function(*args, **kwargs):\n        try:\n            func(*args, **kwargs)\n        except Exception as e:\n            logger.error(f\"exception in {func.__name__}  {e.__class__.__name__} {e}.\")\n\n    return inner_function\n\n\ndef _retry_if_valid_input_but_server_or_socket_error_and_timeout_filter(\n    exception,\n) -> bool:\n    if isinstance(exception, socket.timeout):\n        return True\n    if isinstance(exception, TimeoutError):\n        return True\n    # To handle the concurrency issue in BigQuery.\n    if isinstance(exception, BadRequest):\n        return True\n    return retry.retry_if_valid_input_but_server_error_and_timeout_filter(exception)\n\nclass GracefulKiller:\n  
\"\"\"Class to check for SIGTERM signal.\n  Used to handle gracefull termination. If ever SIGTERM is recived by\n  the process GracefulKiller.kill_now will be `true`.\"\"\"\n\n  kill_now = False\n  def __init__(self):\n    signal.signal(signal.SIGINT, self.exit_gracefully)\n    signal.signal(signal.SIGTERM, self.exit_gracefully)\n\n  def exit_gracefully(self, signum, frame):\n    logger.warning('SIGTERM recieved.')\n    self.kill_now = True\n\n\nclass _FakeClock:\n\n    def sleep(self, value):\n        pass\n\n\ndef retry_with_exponential_backoff(fun):\n    \"\"\"A retry decorator that doesn't apply during test time.\"\"\"\n    clock = retry.Clock()\n\n    # Use a fake clock only during test time...\n    if \"unittest\" in sys.modules.keys():\n        clock = _FakeClock()\n\n    return retry.with_exponential_backoff(\n        retry_filter=_retry_if_valid_input_but_server_or_socket_error_and_timeout_filter,\n        clock=clock,\n    )(fun)\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef ichunked(iterable: t.Iterable, n: int) -> t.Iterator[t.Iterable]:\n    \"\"\"Yield evenly-sized chunks from an iterable.\"\"\"\n    input_ = iter(iterable)\n    try:\n        while True:\n            it = itertools.islice(input_, n)\n            # peek to check if 'it' has next item.\n            first = next(it)\n            yield itertools.chain([first], it)\n    except StopIteration:\n        pass\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef copy(src: str, dst: str) -> None:\n    \"\"\"Copy data via `gcloud storage cp`.\"\"\"\n    try:\n        subprocess.run([\"gcloud\", \"storage\", \"cp\", src, dst], check=True, capture_output=True)\n    except subprocess.CalledProcessError as e:\n        logger.info(\n            f'Failed to copy file {src!r} to {dst!r} due to {e.stderr.decode(\"utf-8\")}.'\n        )\n        raise\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef to_json_serializable_type(value: t.Any) -> t.Any:\n    
\"\"\"Returns the value with a type serializable to JSON\"\"\"\n    # Note: The order of processing is significant.\n    logger.info(\"Serializing to JSON.\")\n\n    if pd.isna(value) or value is None:\n        return None\n    elif np.issubdtype(type(value), np.floating):\n        return float(value)\n    elif isinstance(value, np.ndarray):\n        # Will return a scaler if array is of size 1, else will return a list.\n        return value.tolist()\n    elif (\n        isinstance(value, datetime.datetime)\n        or isinstance(value, str)\n        or isinstance(value, np.datetime64)\n    ):\n        # Assume strings are ISO format timestamps...\n        try:\n            value = datetime.datetime.fromisoformat(value)\n        except ValueError:\n            # ... if they are not, assume serialization is already correct.\n            return value\n        except TypeError:\n            # ... maybe value is a numpy datetime ...\n            try:\n                value = ensure_us_time_resolution(value).astype(datetime.datetime)\n            except AttributeError:\n                # ... 
value is a datetime object, continue.\n                pass\n\n        # We use a string timestamp representation.\n        if value.tzname():\n            return value.isoformat()\n\n        # We assume here that naive timestamps are in UTC timezone.\n        return value.replace(tzinfo=datetime.timezone.utc).isoformat()\n    elif isinstance(value, np.timedelta64):\n        # Return time delta in seconds.\n        return float(value / np.timedelta64(1, \"s\"))\n    # This check must happen after processing np.timedelta64 and np.datetime64.\n    elif np.issubdtype(type(value), np.integer):\n        return int(value)\n\n    return value\n\n\ndef fetch_geo_polygon(area: t.Union[list, str]) -> str:\n    \"\"\"Calculates a geography polygon from an input area.\"\"\"\n    # Ref: https://confluence.ecmwf.int/pages/viewpage.action?pageId=151520973\n    if isinstance(area, str):\n        # European area\n        if area == \"E\":\n            area = [73.5, -27, 33, 45]\n        # Global area\n        elif area == \"G\":\n            area = GLOBAL_COVERAGE_AREA\n        else:\n            raise RuntimeError(f\"Not a valid value for area in config: {area}.\")\n\n    n, w, s, e = [float(x) for x in area]\n    if s < LATITUDE_RANGE[0]:\n        raise ValueError(f\"Invalid latitude value for south: '{s}'\")\n    if n > LATITUDE_RANGE[1]:\n        raise ValueError(f\"Invalid latitude value for north: '{n}'\")\n    if w < LONGITUDE_RANGE[0]:\n        raise ValueError(f\"Invalid longitude value for west: '{w}'\")\n    if e > LONGITUDE_RANGE[1]:\n        raise ValueError(f\"Invalid longitude value for east: '{e}'\")\n\n    # Define the coordinates of the bounding box.\n    coords = [[w, n], [w, s], [e, s], [e, n], [w, n]]\n\n    # Create the GeoJSON polygon object.\n    polygon = geojson.dumps(geojson.Polygon([coords]))\n    return polygon\n\n\ndef get_file_size(path: str) -> float:\n    parsed_gcs_path = urlparse(path)\n    if parsed_gcs_path.scheme != \"gs\" or 
parsed_gcs_path.netloc == \"\":\n        return os.stat(path).st_size / (1024**3) if os.path.exists(path) else 0\n    else:\n        return (\n            gcsio.GcsIO().size(path) / (1024**3) if gcsio.GcsIO().exists(path) else 0\n        )\n\n\ndef get_wait_interval(num_retries: int = 0) -> float:\n    \"\"\"Returns next wait interval in seconds, using an exponential backoff algorithm.\"\"\"\n    if 0 == num_retries:\n        return 0\n    return 2**num_retries\n\n\ndef generate_md5_hash(input: str) -> str:\n    \"\"\"Generates md5 hash for the input string.\"\"\"\n    return hashlib.md5(input.encode(\"utf-8\")).hexdigest()\n\n\ndef download_with_aria2(url: str, path: str) -> None:\n    \"\"\"Downloads a file from the given URL using the `aria2c` command-line utility,\n    with options set to improve download speed and reliability.\"\"\"\n    dir_path, file_name = os.path.split(path)\n    try:\n        subprocess.run(\n            [\n                \"aria2c\",\n                \"-x\",\n                \"16\",\n                \"-s\",\n                \"16\",\n                url,\n                \"-d\",\n                dir_path,\n                \"-o\",\n                file_name,\n                \"--allow-overwrite\",\n            ],\n            check=True,\n            capture_output=True,\n        )\n    except subprocess.CalledProcessError as e:\n        logger.info(\n            f'Failed download from server {url!r} to {path!r} due to {e.stderr.decode(\"utf-8\")}.'\n        )\n        raise\n\nclass ThreadSafeDict:\n    \"\"\"A thread safe dict with crud operations.\"\"\"\n\n\n    def __init__(self) -> None:\n        self._dict = {}\n        self._lock = Lock()\n        self.initial_delay = 1\n        self.factor = 0.5\n\n\n    def __getitem__(self, key):\n        val = None\n        with self._lock:\n            val = self._dict[key]\n        return val\n\n\n    def __setitem__(self, key, value):\n        with self._lock:\n            self._dict[key] = 
value\n\n\n    def remove(self, key):\n        with self._lock:\n            self._dict.__delitem__(key)\n\n\n    def has_key(self, key):\n        present = False\n        with self._lock:\n            present = key in self._dict\n        return present\n\n\n    def increment(self, key, delta=1):\n        with self._lock:\n            if key in self._dict:\n                self._dict[key] += delta\n\n\n    def decrement(self, key, delta=1):\n        with self._lock:\n            if key in self._dict:\n                self._dict[key] -= delta\n\n\n    def find_exponential_delay(self, n: int) -> int:\n        delay = self.initial_delay\n        for _ in range(n):\n            delay += delay*self.factor\n        return delay\n\n\n    def exponential_time(self, key):\n        \"\"\"Returns exponential time based on dict value. Time in seconds.\"\"\"\n        delay = 0\n        with self._lock:\n            if key in self._dict:\n                delay = self.find_exponential_delay(self._dict[key])\n        return delay * 60\n"
  },
  {
    "path": "weather_mv/MANIFEST.in",
    "content": "global-exclude *_test.py\ninclude README.md\nprune test_data\n"
  },
  {
    "path": "weather_mv/README.md",
    "content": "# ⛅️ `weather-mv` – Weather Mover\n\nWeather Mover loads weather data from cloud storage into analytics engines,\nlike [Google BigQuery](https://cloud.google.com/bigquery) (_alpha_).\n\n## Features\n\n* **Rapid Queryability**: After geospatial data is in BigQuery, data wrangling becomes as simple as writing SQL. This\n  allows for rapid data exploration, visualization, and model pipeline prototyping.\n* **Simple Versioning**: All rows in the table come with a `data_import_time` column. This provides some notion of how\n  the data is versioned. Downstream analysis can adapt to data ingested at different times by updating a `WHERE` clause.\n* **Parallel Upload**: Each file will be processed in parallel. With Dataflow autoscaling, even large datasets can be\n  processed in a reasonable amount of time.\n* **Streaming support**: When running the mover in streaming mode, it will automatically process files as they appear in\n  cloud buckets via PubSub.\n* _(new)_ **Grib Regridding**: `weather-mv regrid` uses [MetView](https://metview.readthedocs.io/en/latest/) to\n  interpolate Grib files to a\n  [range of grids.](https://metview.readthedocs.io/en/latest/metview/using_metview/regrid_intro.html?highlight=grid#grid)\n* _(new)_ **Earth Engine Ingestion**: `weather-mv earthengine` ingests weather data into [Google Earth Engine](https://earthengine.google.com/).\n\n## Usage\n\n```\nusage: weather-mv [-h] {bigquery,bq,regrid,rg,earthengine,ee} ...\n\nWeather Mover loads weather data from cloud storage into analytics engines.\n\npositional arguments:\n  {bigquery,bq,regrid,rg,earthengine,ee}\n                        help for subcommand\n    bigquery (bq)       Move data into Google BigQuery\n    regrid (rg)         Copy and regrid grib data with MetView.\n    earthengine (ee)    Move data into Google Earth Engine\n\noptional arguments:\n  -h, --help            show this help message and exit\n```\n\nThe weather mover makes use of subcommands to distinguish between tasks. 
The above tasks are currently supported.\n\n_Common options_\n\n* `-i, --uris`: (required) URI glob pattern matching input weather data, e.g. 'gs://ecmwf/era5/era5-2015-*.gb'.\n* `--topic`: A Pub/Sub topic for GCS OBJECT_FINALIZE events, or equivalent, of a cloud bucket. E.g.\n  'projects/<PROJECT_ID>/topics/<TOPIC_ID>'. Cannot be used with `--subscription`.\n* `--subscription`: A Pub/Sub subscription for GCS OBJECT_FINALIZE events, or equivalent, of a cloud bucket. Cannot be \n  used with `--topic`.\n* `--window_size`: Output file's window size in minutes. Only used with the `topic` flag. Default: 1.0 minute.\n* `--num_shards`: Number of shards to use when writing windowed elements to cloud storage. Only used with the `topic`\n  flag. Default: 5 shards.\n* `-d, --dry-run`: Preview the load into BigQuery. Default: off.\n* `--log-level`: An integer to configure log level. Default: 2(INFO).\n* `--use-local-code`: Supply local code to the Runner. Default: False.\n\nInvoke with `-h` or `--help` to see the full range of options.\n\n### `weather-mv bigquery`\n\n```\nusage: weather-mv bigquery [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SIZE] [--num_shards NUM_SHARDS] [-d]\n                           -o OUTPUT_TABLE --geo_data_parquet_path GEO_DATA_PARQUET [-v variables [variables ...]]\n                           [-a area [area ...]] [--import_time IMPORT_TIME] [--infer_schema]\n                           [--xarray_open_dataset_kwargs XARRAY_OPEN_DATASET_KWARGS]\n                           [--tif_metadata_for_start_time TIF_METADATA_FOR_START_TIME]\n                           [--tif_metadata_for_end_time TIF_METADATA_FOR_END_TIME] [-s]\n                           [--rows_chunk_size rows_chunk_size] [--skip_creating_polygon]\n                           [--skip_creating_geo_data_parquet]\n```\n\nThe `bigquery` subcommand loads weather data into BigQuery. 
In addition to the common options above, users may specify\ncommand-specific options:\n\n_Command options_:\n\n* `-o, --output_table`: (required) Full name of destination BigQuery table. Ex: my_project.my_dataset.my_table\n* `--geo_data_parquet_path`: (required) A path to dump the geo data parquet. This parquet consists of columns:\n  latitude, longitude, geo_point, and geo_polygon. We calculate all of this information\n  upfront so that we do not need to process it every time we process a set of files.\n* `-v, --variables`:  Target variables (or coordinates) for the BigQuery schema. Default: will import all data variables\n  as columns.\n* `-a, --area`:  Target area in [N, W, S, E]. Default: Will include all available area.\n* `--import_time`: When writing data to BigQuery, record that data import occurred at this time\n  (format: YYYY-MM-DD HH:MM:SS.usec+offset). Default: now in UTC.\n* `--infer_schema`: Download one file in the URI pattern and infer a schema from that file. Default: off.\n* `--xarray_open_dataset_kwargs`: Keyword-args to pass into `xarray.open_dataset()` in the form of a JSON string.\n* `--rows_chunk_size`: The size of the chunk of rows to be loaded into memory for processing.\n  Depending on your system's memory, use this to tune how many rows to process. Default: 1_000_000.\n* `--tif_metadata_for_start_time` : Metadata that contains tif file's start/initialization time. Applicable only for tif files.\n* `--tif_metadata_for_end_time` : Metadata that contains tif file's end/forecast time. Applicable only for tif files (optional).\n* `-s, --skip_region_validation` : Skip validation of regions for data migration. Default: off.\n* `--disable_grib_schema_normalization` : To disable grib's schema normalization. Default: off.\n* `--skip_creating_polygon` : Do not ingest grid points as polygons in BigQuery. Default: Ingest grid points as Polygon in \n  BigQuery. 
Note: This feature relies on the assumption that the provided grid has an equal distance between consecutive \n  points of latitude and longitude.\n* `--skip_creating_geo_data_parquet`: Skip the generation of the geo data parquet if it already exists at the given --geo_data_parquet_path.\n  Please note that the geo data parquet is mandatory for ingesting data into BigQuery. Default: Create geo data parquet file.\n\nInvoke with `bq -h` or `bigquery --help` to see the full range of options.\n\n> Note: In case of grib files, by default its schema will be normalized and the name of the data variables will look \n> like `<level>_<height>_<attrs['GRIB_stepType']>_<key>`.\n> \n> This solves the issue of skipping over some of the data due to: https://github.com/ecmwf/cfgrib#filter-heterogeneous-grib-files.\n\n_Usage examples_:\n\n```bash\nweather-mv bigquery --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --temp_location \"gs://$BUCKET/tmp\" \\  # Needed for batch writes to BigQuery\n           --direct_num_workers 2\n```\n\nUsing the subcommand alias `bq`:\n\n```bash\nweather-mv bq --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --temp_location \"gs://$BUCKET/tmp\" \\  # Needed for batch writes to BigQuery\n           --direct_num_workers 2\n```\n\nPreview load with a dry run:\n\n```bash\nweather-mv bq --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --temp_location \"gs://$BUCKET/tmp\" \\  # Needed for batch writes to BigQuery\n           --direct_num_workers 2 \\\n           --dry-run\n```\n\nIngest grid points with skip creating polygon in BigQuery:\n\n```bash\nweather-mv bq --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --temp_location \"gs://$BUCKET/tmp\" \\  # Needed for batch writes to BigQuery\n           
--direct_num_workers 2 \\\n           --skip_creating_polygon\n```\n\nLoad COG's (.tif) files:\n\n```bash\nweather-mv bq --uris \"gs://your-bucket/*.tif\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --temp_location \"gs://$BUCKET/tmp\" \\  # Needed for batch writes to BigQuery\n           --direct_num_workers 2 \\\n           --tif_metadata_for_start_time start_time \\\n           --tif_metadata_for_end_time end_time\n```\n\nUpload only a subset of variables:\n\n```bash\nweather-mv bq --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --variables u10 v10 t \\\n           --temp_location \"gs://$BUCKET/tmp\" \\\n           --direct_num_workers 2\n```\n\nUpload all variables, but for a specific geographic region (for example, the continental US):\n\n```bash\nweather-mv bq --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --area 49 -124 24 -66 \\\n           --temp_location \"gs://$BUCKET/tmp\" \\\n           --direct_num_workers 2\n```\n\nUpload a zarr file:\n\n```bash\nweather-mv bq --uris \"gs://your-bucket/*.zarr\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --temp_location \"gs://$BUCKET/tmp\" \\\n           --use-local-code \\\n           --zarr \\\n           --direct_num_workers 2\n```\n\nUpload a specific date range's data from the zarr file:\n\n```bash\nweather-mv bq --uris \"gs://your-bucket/*.zarr\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --temp_location \"gs://$BUCKET/tmp\" \\\n           --use-local-code \\\n           --zarr \\\n           --zarr_kwargs '{\"start_date\": \"2021-07-18\", \"end_date\": \"2021-07-19\"}' \\\n           --direct_num_workers 2\n```\n\nUpload a specific date range's data from the file:\n\n```bash\nweather-mv bq --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n          
 --temp_location \"gs://$BUCKET/tmp\" \\\n           --use-local-code \\\n           --xarray_open_dataset_kwargs '{\"start_date\": \"2021-07-18\", \"end_date\": \"2021-07-19\"}'\n```\n\nControl how weather data is opened with XArray:\n\n```bash\nweather-mv bq --uris \"gs://your-bucket/*.grib\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --xarray_open_dataset_kwargs '{\"engine\": \"cfgrib\", \"indexpath\": \"\", \"backend_kwargs\": {\"filter_by_keys\": {\"typeOfLevel\": \"surface\", \"edition\": 1}}}' \\\n           --temp_location \"gs://$BUCKET/tmp\" \\\n           --direct_num_workers 2\n```\n\nUsing DataflowRunner:\n\n```bash\nweather-mv bq --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --region  $REGION \\\n           --temp_location \"gs://$BUCKET/tmp\" \\\n           --job_name $JOB_NAME \n```\n\nUsing DataflowRunner with local code for the pipeline:\n\n```bash\nweather-mv bq --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --region  $REGION \\\n           --temp_location \"gs://$BUCKET/tmp\" \\\n           --job_name $JOB_NAME \\\n           --use-local-code\n```\n\nFor a full list of how to configure the Dataflow pipeline, please review\n[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).\n\n### `weather-mv regrid`\n\n```\nusage: weather-mv regrid [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SIZE] [--num_shards NUM_SHARDS] [-d]\n                         --output_path OUTPUT_PATH [--regrid_kwargs REGRID_KWARGS] [--to_netcdf]\n```\n\nThe `regrid` subcommand makes a regridded copy of the input data with MetView.\n\nTo use this capability of the weather mover, please use the `[regrid]` extra when installing:\n\n```shell\npip 
install google-weather-tools[regrid]\n```\n\n> **Warning**: MetView requires a decent amount of disk space in order to perform any regrid operation! Intermediary \n> regridding steps will write temporary grib data to disk. Thus, please make use of the `--disk_size_gb` Dataflow \n> option. A good rule of thumb would be to consume `30 + 2.5x` GBs of disk, where `x` is the size of each source data\n> file. \n> \n> TODO([#191](https://github.com/google/weather-tools/issues/191)): Find smaller disk space bound.\n\nIn addition to the common options above, users may specify command-specific options:\n\n_Command options_:\n\n* `-o, --output_path`: (required) The destination path for the regridded files.\n* `-k, --regrid_kwargs`: Keyword-args to pass into `metview.regrid()` in the form of a JSON string. Will default to\n  '{\"grid\": [0.25, 0.25]}'.\n* `--to_netcdf`: Write output file in NetCDF via XArray. Default: off\n* `-bz2`, `--apply_bz2_compression`: Enable bzip2 (.bz2) compression for the regridded file. Default: off.\n\nFor a full range of grid options, please\nconsult [this documentation.](https://metview.readthedocs.io/en/latest/metview/using_metview/regrid_intro.html?highlight=grid#grid)\n\nInvoke with `rg -h` or `regrid --help` to see the full range of options.\n\n> Note: Currently, `regrid` doesn't work out-of-the-box! 
Until [#172](https://github.com/google/weather-tools/issues/172)\n> is fixed, users will have to use a workaround in order to ensure MetView is installed in their runner environment\n> (instructions are below).\n\n_Usage examples_:\n\n```bash\nweather-mv regrid --uris \"gs://your-bucket/*.gb\" \\\n           --output_path \"gs://regrid-bucket/\" \n \n```\n\nUsing the subcommand alias 'rg':\n\n```bash\nweather-mv rg --uris \"gs://your-bucket/*.gb\" \\\n           --output_path \"gs://regrid-bucket/\" \n \n```\n\nPreview regrid with a dry run:\n\n```bash\nweather-mv rg --uris \"gs://your-bucket/*.gb\" \\\n           --output_path \"gs://regrid-bucket/\" \\\n           --dry-run\n \n```\n\nInterpolate to a finer grid resolution:\n\n```bash\nweather-mv rg --uris \"gs://your-bucket/*.gb\" \\\n           --output_path \"gs://regrid-bucket/\" \\\n           --regrid_kwargs '{\"grid\": [0.1, 0.1]}'\n \n```\n\nInterpolate to a high-resolution octahedral gaussian grid:\n\n```bash\nweather-mv rg --uris \"gs://your-bucket/*.gb\" \\\n           --output_path \"gs://regrid-bucket/\" \\\n           --regrid_kwargs '{\"grid\": \"O1280\"}'\n \n```\n\nConvert gribs to NetCDF on copy:\n\n```bash\nweather-mv rg --uris \"gs://your-bucket/*.gb\" \\\n           --output_path \"gs://regrid-bucket/\" \\\n           --to_netcdf\n```\n\nUsing DataflowRunner:\n\n```bash\nweather-mv rg --uris \"gs://your-bucket/*.nc\" \\\n           --output_path \"gs://regrid-bucket/\" \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --region  $REGION \\\n           --temp_location \"gs://$BUCKET/tmp\" \\\n           --experiment=use_runner_v2 \\\n           --sdk_container_image=\"gcr.io/$PROJECT/$REPO:latest\"  \\\n           --job_name $JOB_NAME \n```\n\nUsing DataflowRunner, with added disk per VM:\n\n```bash\nweather-mv rg --uris \"gs://your-bucket/*.nc\" \\\n           --output_path \"gs://regrid-bucket/\" \\\n           --runner DataflowRunner \\\n           
--project $PROJECT \\\n           --region  $REGION \\\n           --disk_size_gb 250 \\\n           --temp_location \"gs://$BUCKET/tmp\" \\\n           --experiment=use_runner_v2 \\\n           --sdk_container_image=\"gcr.io/$PROJECT/$REPO:latest\"  \\\n           --job_name $JOB_NAME \n```\n\nFor a full list of how to configure the Dataflow pipeline, please review\n[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).\n\n### `weather-mv earthengine`\n\n```\nusage: weather-mv earthengine [-h] -i URIS --asset_location ASSET_LOCATION --ee_asset EE_ASSET\n                           [--ee_asset_type ASSET_TYPE] [--disable_grib_schema_normalization] [--use_personal_account] [-s]\n                           [--xarray_open_dataset_kwargs XARRAY_OPEN_DATASET_KWARGS]\n                           [--service_account my-service-account@...gserviceaccount.com --private_key PRIVATE_KEY_LOCATION]\n                           [--ee_qps EE_QPS] [--ee_latency EE_LATENCY] [--ee_max_concurrent EE_MAX_CONCURRENT]\n```\n\nThe `earthengine` subcommand ingests weather data into Earth Engine. It includes a caching function that allows it to\nskip ingestion for assets that have already been created in Earth Engine or for which the asset file already exists in\nthe GCS bucket. In addition to the common options above, users may specify command-specific options:\n\n_Command options_:\n\n* `--asset_location`: (required) Bucket location at which asset files will be pushed.\n* `--ee_asset`: (required) The asset folder path in earth engine project where the asset files will be pushed.\n  It should be in format: `projects/<project-id>/assets/<asset-folder>`. Make sure that <asset-folder> is there\n  under <project-id> in earth engine assets. e.g. projects/my-gcp-project/assets/my/foo/bar.\n* `--ee_asset_type`: The type of asset to ingest in the earth engine. 
Default: IMAGE.\n  Supported types are `IMAGE` and `TABLE`.\\\n  `IMAGE`: Uploads georeferenced raster datasets in GeoTIFF format.\\\n  `TABLE`: Uploads the datasets in the CSV format. Useful in case of point data (sparse data). \n* `--disable_grib_schema_normalization`:  Restricts merging of grib datasets. Default: False\n* `-u, --use_personal_account`: To use personal account for earth engine authentication.\n* `--service_account`: Service account address when using a private key for earth engine authentication.\n* `--private_key`: To use a private key for earth engine authentication. Only used with the `service_account` flag.\n* `--xarray_open_dataset_kwargs`: Keyword-args to pass into `xarray.open_dataset()` in the form of a JSON string.\n* `-s, --skip_region_validation` : Skip validation of regions for data migration. Default: off.\n* `-f, --force`: A flag that allows overwriting of existing asset files in the GCS bucket. Default: off, which means\n  that the ingestion of URIs for which assets files (GeoTiff/CSV) already exist in the GCS bucket will be skipped.\n* `--ee_qps`: Maximum queries per second allowed by EE for your project. Default: 10.\n* `--ee_latency`: The expected latency per request, in seconds. Default: 0.5.\n* `--ee_max_concurrent`: Maximum concurrent api requests to EE allowed for your project. Default: 10.\n* `--band_names_mapping`: A JSON file which contains the band names for the TIFF file.\n* `--initialization_time_regex`: A Regex string to get the initialization time from the filename.\n* `--forecast_time_regex`: A Regex string to get the forecast/end time from the filename.\n* `--group_common_hypercubes`: A flag that allows to split up large grib files into multiple level-wise ImageCollections / COGS.\n* `--use_deflate`: A flag that allows you to use deflate algorithm for better compression. Using deflate compression\n  takes extra time in COG creation. 
Default: False.\n* `--use_metrics`: A flag that allows you to add Beam metrics to the pipeline. Default: False.\n* `--use_monitoring_metrics`: A flag that allows you to add Google Cloud Monitoring metrics to the pipeline. Default: False.\n\nInvoke with `ee -h` or `earthengine --help` to see the full range of options.\n\n_Usage examples_:\n\n```bash\nweather-mv earthengine --uris \"gs://your-bucket/*.grib\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\  # Needed to store assets generated from *.grib\n           --ee_asset \"projects/$PROJECT/assets/test_dir\"\n```\n\nUsing the subcommand alias `ee`:\n\n```bash\nweather-mv ee --uris \"gs://your-bucket/*.grib\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\  # Needed to store assets generated from *.grib\n           --ee_asset \"projects/$PROJECT/assets/test_dir\"\n```\n\nPreview ingestion with a dry run:\n\n```bash\nweather-mv ee --uris \"gs://your-bucket/*.grib\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\  # Needed to store assets generated from *.grib\n           --ee_asset \"projects/$PROJECT/assets/test_dir\" \\\n           --dry-run\n```\n\nAuthenticate earth engine using personal account:\n\n```bash\nweather-mv ee --uris \"gs://your-bucket/*.grib\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\  # Needed to store assets generated from *.grib\n           --ee_asset \"projects/$PROJECT/assets/test_dir\" \\\n           --use_personal_account\n```\n\nAuthenticate earth engine using a private key:\n\n```bash\nweather-mv ee --uris \"gs://your-bucket/*.grib\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\  # Needed to store assets generated from *.grib\n           --ee_asset \"projects/$PROJECT/assets/test_dir\" \\\n           --service_account \"my-service-account@...gserviceaccount.com\" \\\n           --private_key \"path/to/private_key.json\"\n```\n\nIngest asset as table in earth engine:\n\n```bash\nweather-mv ee --uris \"gs://your-bucket/*.grib\" 
\\\n           --asset_location \"gs://$BUCKET/assets\" \\  # Needed to store assets generated from *.grib\n           --ee_asset \"projects/$PROJECT/assets/test_dir\" \\\n           --ee_asset_type \"TABLE\"\n```\n\nRestrict merging all bands or grib normalization:\n\n```bash\nweather-mv ee --uris \"gs://your-bucket/*.grib\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\  # Needed to store assets generated from *.grib\n           --ee_asset \"projects/$PROJECT/assets/test_dir\" \\\n           --disable_grib_schema_normalization\n```\n\nControl how weather data is opened with XArray:\n\n```bash\nweather-mv ee --uris \"gs://your-bucket/*.grib\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\  # Needed to store assets generated from *.grib\n           --ee_asset \"projects/$PROJECT/assets/test_dir\" \\\n           --xarray_open_dataset_kwargs '{\"engine\": \"cfgrib\", \"indexpath\": \"\", \"backend_kwargs\": {\"filter_by_keys\": {\"typeOfLevel\": \"surface\", \"edition\": 1}}}' \\\n           --temp_location \"gs://$BUCKET/tmp\"\n```\n\nLimit EE requests:\n\n```bash\nweather-mv ee --uris \"gs://your-bucket/*.grib\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\  # Needed to store assets generated from *.grib\n           --ee_asset \"projects/$PROJECT/assets/test_dir\" \\\n           --ee_qps 10 \\\n           --ee_latency 0.5 \\\n           --ee_max_concurrent 10\n```\n\nCustom Band names:\n\n```bash\nweather-mv ee --uris \"gs://your-bucket/*.tif\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\ # Needed to store assets generated from *.tif\n           --ee_asset \"projects/$PROJECT/assets/test_dir\" \\\n           --band_names_mapping \"filename.json\"\n```\n\nGetting initialization and forecast/end date-time from the filename:\n\n```bash\nweather-mv ee --uris \"gs://your-bucket/*.tif\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\ # Needed to store assets generated from *.tif\n           --ee_asset 
\"projects/$PROJECT/assets/test_dir\" \\\n           --initialization_time_regex \"$REGEX\" \\\n           --forecast_time_regex \"$REGEX\"\n```\n\nExample:\n\n```bash\nweather-mv ee --uris \"gs://tmp-gs-bucket/3B-HHR-E_MS_MRG_3IMERG_20220901-S000000-E002959_0000_V06C_30min.tiff\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\ # Needed to store assets generated from *.tif\n           --ee_asset \"projects/$PROJECT/assets/test_dir\" \\\n           --initialization_time_regex \"3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S%H%M%S-*tiff\" \\\n           --forecast_time_regex \"3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S*-E%H%M%S*tiff\"\n```\n\nIngesting a file into Earth Engine in a levelwise manner:\n\n```bash\nweather-mv ee --uris \"gs://your-bucket/*.tif\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\ # Needed to store assets generated from *.tif\n           --ee_asset \"projects/$PROJECT/assets/test_dir\" \\\n           --group_common_hypercubes\n```\n\nUsing DataflowRunner:\n\n```bash\nweather-mv ee --uris \"gs://your-bucket/*.grib\" \\\n           --asset_location \"gs://$BUCKET/assets\" \\  # Needed to store assets generated from *.grib\n           --ee_asset \"projects/$PROJECT/assets/test_dir\" \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --region  $REGION \\\n           --temp_location \"gs://$BUCKET/tmp\" \\\n           --job_name $JOB_NAME\n```\n\nFor a full list of how to configure the Dataflow pipeline, please review\n[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).\n\n## Streaming ingestion\n\n`weather-mv` optionally provides the ability to react\nto [Pub/Sub events for objects added to GCS](https://cloud.google.com/storage/docs/pubsub-notifications). This can be\nused to automate ingestion into BigQuery as soon as weather data is disseminated. Another common use case is to\nautomatically create a down-sampled version of a dataset with `regrid`. 
To set up the Weather Mover with streaming\ningestion, use the `--topic` or `--subscription` flag  (see \"Common options\" above).\n\nObjects that don't match the `--uris` glob pattern will be filtered out of ingestion. This way, a bucket can contain\nmultiple types of data yet only have subsets processed with `weather-mv`.\n\n> It's worth noting: when setting up PubSub, **make sure to create a topic for GCS `OBJECT_FINALIZE` events only.**\n\n_Usage examples_:\n\n```shell\nweather-mv bq --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --topic \"projects/$PROJECT/topics/$TOPIC_ID\" \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --temp_location gs://$BUCKET/tmp \\\n           --job_name $JOB_NAME \n```\n\nWindow incoming data every five minutes instead of every minute.\n\n```shell\nweather-mv bq --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --topic \"projects/$PROJECT/topics/$TOPIC_ID\" \\\n           --window_size 5 \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --temp_location gs://$BUCKET/tmp \\\n           --job_name $JOB_NAME \n```\n\nIncrease the number of shards per window.\n\n```shell\nweather-mv bq --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --topic \"projects/$PROJECT/topics/$TOPIC_ID\" \\\n           --num_shards 10 \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --temp_location gs://$BUCKET/tmp \\\n           --job_name $JOB_NAME \n```\n\n### BigQuery\n\nData is written into BigQuery using streaming inserts. It may\ntake [up to 90 minutes](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability)\nfor buffers to persist into storage. 
However, weather data will be available for querying immediately.\n\n> Note: It's recommended that you specify variables to ingest (`-v, --variables`) instead of inferring the schema for\n> streaming pipelines. Not all variables will be distributed with every file, especially when they are in Grib format.\n\n## Private Network Configuration\n\nWhile running `weather-mv` pipeline in GCP, there is a possibility that you may receive following error -\n\"Quotas were exceeded: IN_USE_ADDRESSES\"\n\nThis error occurs when GCP is trying to add new worker-instances and finds that, “Public IP” quota (assigned to your\nproject) is exhausted.\n\nTo solve this, we recommend using private IP while running your dataflow pipelines.\n\n```shell\nweather-mv bq --uris \"gs://your-bucket/*.nc\" \\\n           --output_table $PROJECT.$DATASET_ID.$TABLE_ID \\\n           --temp_location gs://$BUCKET/tmp \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --region $REGION \\\n           --no_use_public_ips \\\n           --network=$NETWORK \\\n           --subnetwork=regions/$REGION/subnetworks/$SUBNETWORK\n```\n\n_Common options_:\n\n* `--no_use_public_ips`: To make Dataflow workers use private IP addresses for all communication, specify the\n  command-line flag: --no_use_public_ips. 
Make sure that the specified network or subnetwork has Private Google Access\n  enabled.\n* `--network`: The Compute Engine network for launching Compute Engine instances to run your pipeline.\n* `--subnetwork`:  The Compute Engine subnetwork for launching Compute Engine instances to run your pipeline.\n\nFor more information regarding how to configure Private IP, please refer\nto [Private IP Configuration Guide for Dataflow Pipeline Execution](../Private-IP-Configuration.md)\n.\n\nFor more information regarding Pipeline options, please refer\nto [pipeline-options](https://cloud.google.com/dataflow/docs/reference/pipeline-options).\n\n## Custom Dataflow Container for ECMWF dependencies (like MetView)\n\nIt's difficult to install all necessary system dependencies on a Dataflow worker with a pure python solution. For\nexample, MetView requires binaries to be installed on the system machine, which are broken in the standard debian\ninstall channels (they are only maintained via `conda-forge`).\n\nThus, to include such dependencies, we've provided steps for you to build\na [Beam container environment](https://beam.apache.org/documentation/runtime/environments/). In the near future, we'll\narrange things so you don't have to worry about any of these extra\nsteps ([#172](https://github.com/google/weather-tools/issues/172)). See [these instructions](../Runtime-Container.md) \nto learn how to build a custom image for this project.\n\nCurrently, this image is necessary for the `weather-mv regrid` command, but no other commands. To deploy this tool,\nplease do the following:\n\n1. Host a container image of the included Dockerfile in your repository of choice (instructions for building images in\n   GCS are in the next section).\n2. 
Add the following two flags to your regrid pipeline.\n   ```\n   --experiment=use_runner_v2 \\\n   --sdk_container_image=$CONTAINER_URL\n   ```\n   For example, the full Dataflow command, assuming you follow the next section's instructions, should look like:\n\n   ```bash\n   weather-mv rg --uris \"gs://your-bucket/*.nc\" \\\n              --output_path \"gs://regrid-bucket/\" \\\n              --runner DataflowRunner \\\n              --project $PROJECT \\\n              --region  $REGION \\\n              --temp_location \"gs://$BUCKET/tmp\" \\\n              --experiment=use_runner_v2 \\\n              --sdk_container_image=\"gcr.io/$PROJECT/$REPO:latest\" \\\n              --job_name $JOB_NAME \n   ```\n"
  },
  {
    "path": "weather_mv/__init__.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "weather_mv/loader_pipeline/__init__.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport logging\nimport sys\n\nfrom .pipeline import run, pipeline\n\n\ndef cli(extra=[]):\n    logging.getLogger().setLevel(logging.INFO)\n    pipeline(*run(sys.argv + extra))\n"
  },
  {
    "path": "weather_mv/loader_pipeline/bq.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport dataclasses\nimport datetime\nimport itertools\nimport json\nimport logging\nimport math\nimport os\nimport pandas as pd\nimport tempfile\nimport typing as t\nfrom pprint import pformat\n\nimport apache_beam as beam\nimport geojson\nimport numpy as np\nimport xarray as xr\nimport xarray_beam as xbeam\nfrom apache_beam.io import WriteToBigQuery, BigQueryDisposition\nfrom apache_beam.options.pipeline_options import PipelineOptions\nfrom apache_beam.transforms import window\nfrom google.cloud import bigquery\nfrom xarray.core.utils import ensure_us_time_resolution\n\nfrom .sinks import ToDataSink, open_dataset, copy, open_local\nfrom .util import (\n    to_json_serializable_type,\n    validate_region,\n    _only_target_vars,\n    get_coordinates,\n    BQ_EXCLUDE_COORDS,\n)\n\nlogger = logging.getLogger(__name__)\n\nDEFAULT_IMPORT_TIME = datetime.datetime.utcfromtimestamp(0).replace(tzinfo=datetime.timezone.utc).isoformat()\nDATA_IMPORT_TIME_COLUMN = 'data_import_time'\nDATA_URI_COLUMN = 'data_uri'\nDATA_FIRST_STEP = 'data_first_step'\nGEO_POINT_COLUMN = 'geo_point'\nGEO_POLYGON_COLUMN = 'geo_polygon'\nLATITUDE_RANGE = (-90, 90)\nLONGITUDE_RANGE = (-180, 180)\n\n\n@dataclasses.dataclass\nclass ToBigQuery(ToDataSink):\n    \"\"\"Load weather data into Google BigQuery.\n\n    A sink that loads de-normalized weather data into BigQuery. 
First, this sink will\n    create a BigQuery table from user input (either from `variables` or by inferring the\n    schema). Next, it will convert the weather data into rows and then write each row to\n    the BigQuery table.\n\n    During a batch job, this transform will use the BigQueryWriter's file processing\n    step, which requires that a `temp_location` is passed into the main CLI. This\n    transform will perform streaming writes to BigQuery during a streaming Beam job. See\n    `these docs`_ for more.\n\n    Attributes:\n        output_table: The destination for where data should be written in BigQuery\n        geo_data_parquet_path: A path to dump the geo data parquet. This parquet consists of columns:\n          latitude, longitude, geo_point, and geo_polygon. We calculate all of this information\n          upfront so that we do not need to process it every time we process a set of files.\n        variables: Target variables (or coordinates) for the BigQuery schema. By default,\n          all data variables will be imported as columns.\n        area: Target area in [N, W, S, E]; by default, all available area is included.\n        import_time: The time when data was imported. This is used as a simple way to\n          version data — variables can be distinguished based on import time. 
If None,\n          the system will recompute the current time upon row extraction for each file.\n        infer_schema: If true, this sink will attempt to read in an example data file\n          read all its variables, and generate a BigQuery schema.\n        xarray_open_dataset_kwargs: A dictionary of kwargs to pass to xr.open_dataset().\n        tif_metadata_for_start_time: If the input is a .tif file, parse the tif metadata at\n          this location for a start time / initialization time.\n        tif_metadata_for_end_time: If the input is a .tif file, parse the tif metadata at\n          this location for a end/forecast time.\n        skip_region_validation: Turn off validation that checks if all Cloud resources\n          are in the same region.\n        skip_creating_geo_data_parquet: Skip the generation of the geo data parquet if it already\n          exists at the given --geo_data_parquet_path. Please note that the geo data parquet is mandatory\n          for ingesting data into BigQuery.\n        disable_grib_schema_normalization: Turn off grib's schema normalization; Default: normalization enabled.\n        rows_chunk_size: The size of the chunk of rows to be loaded into memory for processing.\n          Depending on your system's memory, use this to tune how much rows to process.\n\n    .. 
_these docs: https://beam.apache.org/documentation/io/built-in/google-bigquery/#setting-the-insertion-method\n    \"\"\"\n    output_table: str\n    geo_data_parquet_path: str\n    variables: t.List[str]\n    area: t.List[float]\n    import_time: t.Optional[datetime.datetime]\n    infer_schema: bool\n    xarray_open_dataset_kwargs: t.Dict\n    tif_metadata_for_start_time: t.Optional[str]\n    tif_metadata_for_end_time: t.Optional[str]\n    skip_region_validation: bool\n    disable_grib_schema_normalization: bool\n    rows_chunk_size: int = 1_000_000\n    skip_creating_polygon: bool = False\n    skip_creating_geo_data_parquet: bool = False\n    lat_grid_resolution: t.Optional[float] = None\n    lon_grid_resolution: t.Optional[float] = None\n\n    @classmethod\n    def add_parser_arguments(cls, subparser: argparse.ArgumentParser):\n        subparser.add_argument('-o', '--output_table', type=str, required=True,\n                               help=\"Full name of destination BigQuery table (<project>.<dataset>.<table>). Table \"\n                                    \"will be created if it doesn't exist.\")\n        subparser.add_argument('--geo_data_parquet_path', type=str, required=True,\n                               help=\"A path to dump the geo data parquet.\")\n        subparser.add_argument('--skip_creating_geo_data_parquet', action='store_true', default=False,\n                               help=\"Skip the generation of geo data parquet if it already exists at given \"\n                                    \"--geo_data_parquet_path. Please note that the geo data parquet is manditory for \"\n                                    \" ingesting data into BigQuery. Default: off.\")\n        subparser.add_argument('-v', '--variables', metavar='variables', type=str, nargs='+', default=list(),\n                               help='Target variables (or coordinates) for the BigQuery schema. 
Default: will import '\n                                    'all data variables as columns.')\n        subparser.add_argument('-a', '--area', metavar='area', type=float, nargs='+', default=list(),\n                               help='Target area in [N, W, S, E]. Default: Will include all available area.')\n        subparser.add_argument('--skip_creating_polygon', action='store_true',\n                               help='Not ingest grid points as polygons in BigQuery. Default: Ingest grid points as '\n                                    'Polygon in BigQuery. Note: This feature relies on the assumption that the '\n                                    'provided grid has an equal distance between consecutive points of latitude and '\n                                    'longitude.')\n        subparser.add_argument('--import_time', type=str, default=datetime.datetime.utcnow().isoformat(),\n                               help=(\"When writing data to BigQuery, record that data import occurred at this \"\n                                     \"time (format: YYYY-MM-DD HH:MM:SS.usec+offset). Default: now in UTC.\"))\n        subparser.add_argument('--infer_schema', action='store_true', default=False,\n                               help='Download one file in the URI pattern and infer a schema from that file. Default: '\n                                    'off')\n        subparser.add_argument('--xarray_open_dataset_kwargs', type=json.loads, default='{}',\n                               help='Keyword-args to pass into `xarray.open_dataset()` in the form of a JSON string.')\n        subparser.add_argument('--tif_metadata_for_start_time', type=str, default=None,\n                               help='Metadata that contains tif file\\'s start/initialization time. 
'\n                                    'Applicable only for tif files.')\n        subparser.add_argument('--tif_metadata_for_end_time', type=str, default=None,\n                               help='Metadata that contains tif file\\'s end/forecast time. '\n                                    'Applicable only for tif files.')\n        subparser.add_argument('-s', '--skip_region_validation', action='store_true', default=False,\n                               help='Skip validation of regions for data migration. Default: off')\n        subparser.add_argument('--rows_chunk_size', type=int, default=1_000_000,\n                               help=\"The size of the chunk of rows to be loaded into memory for processing. \"\n                                    \"Depending on your system's memory, use this to tune how much rows to process.\")\n        subparser.add_argument('--disable_grib_schema_normalization', action='store_true', default=False,\n                               help=\"To disable grib's schema normalization. 
Default: off\")\n\n    @classmethod\n    def validate_arguments(cls, known_args: argparse.Namespace, pipeline_args: t.List[str]) -> None:\n        pipeline_options = PipelineOptions(pipeline_args)\n        pipeline_options_dict = pipeline_options.get_all_options()\n\n        if known_args.area:\n            assert len(known_args.area) == 4, 'Must specify exactly 4 lat/long values for area: N, W, S, E boundaries.'\n\n        # Add a check for group_common_hypercubes.\n        if pipeline_options_dict.get('group_common_hypercubes'):\n            raise RuntimeError('--group_common_hypercubes can be specified only for earth engine ingestions.')\n\n        # Check that all arguments are supplied for COG input.\n        _, uri_extension = os.path.splitext(known_args.uris)\n        if (uri_extension in ['.tif', '.tiff'] and not known_args.tif_metadata_for_start_time):\n            raise RuntimeError(\"'--tif_metadata_for_start_time' is required for tif files.\")\n        elif uri_extension not in ['.tif', '.tiff'] and (\n            known_args.tif_metadata_for_start_time\n            or known_args.tif_metadata_for_end_time\n        ):\n            raise RuntimeError(\"'--tif_metadata_for_start_time' and \"\n                               \"'--tif_metadata_for_end_time' can be specified only for tif files.\")\n\n        if not known_args.geo_data_parquet_path.endswith(\".parquet\"):\n            raise RuntimeError(f\"'--geo_data_parquet_path' {known_args.geo_data_parquet_path} must \"\n                                \"end with '.parquet'.\")\n\n        # Check that Cloud resource regions are consistent.\n        if not (known_args.dry_run or known_args.skip_region_validation):\n            # Program execution will terminate on failure of region validation.\n            logger.info('Validating regions for data migration. 
This might take a few seconds...')\n            validate_region(known_args.output_table, temp_location=pipeline_options_dict.get('temp_location'),\n                            region=pipeline_options_dict.get('region'))\n            logger.info('Region validation completed successfully.')\n\n    def generate_parquet(\n        self,\n        parquet_path: str,\n        lats: t.List,\n        lons: t.List,\n        lat_grid_resolution: float,\n        lon_grid_resolution: float,\n        skip_creating_polygon: bool = False,\n    ):\n        \"\"\"Generates geo data parquet.\"\"\"\n        logger.info(\"Generating geo data parquet ...\")\n        # Generate Cartesian product of latitudes and longitudes.\n        lat_lon_pairs = itertools.product(lats, lons)\n        # Create a temp parquet file for writing.\n        with tempfile.NamedTemporaryFile(suffix='.parquet', mode='w+', newline='') as temp:\n            # Define header.\n            header = ['latitude', 'longitude', GEO_POINT_COLUMN, GEO_POLYGON_COLUMN]\n            data = []\n            for lat, lon in lat_lon_pairs:\n                lat = float(lat)\n                lon = float(lon)\n                row = [lat, lon]\n                sanitized_lon = (((lon % 360) + 540) % 360) - 180\n\n                # Fetch the geo point.\n                geo_point = fetch_geo_point(lat, sanitized_lon)\n                row.append(geo_point)\n\n                # Fetch the geo polygon if not skipped.\n                if not skip_creating_polygon:\n                    geo_polygon = fetch_geo_polygon(lat, sanitized_lon, lat_grid_resolution, lon_grid_resolution)\n                    row.append(geo_polygon)\n                else:\n                    row.append(None)\n                data.append(row)\n\n            df = pd.DataFrame(data, columns=header)\n            # Write DataFrame to parquet.\n            df.to_parquet(temp.name, index=False)\n            logger.info(f\"geo data parquet generated successfully. 
Uploading to {parquet_path}.\")\n            copy(temp.name, parquet_path)\n            logger.info(f\"geo data parquet uploaded successfully at {parquet_path}.\")\n\n    def __post_init__(self):\n        \"\"\"Initializes Sink by creating a BigQuery table based on user input.\"\"\"\n        if self.zarr:\n            self.xarray_open_dataset_kwargs = self.zarr_kwargs\n        with open_dataset(self.first_uri, self.xarray_open_dataset_kwargs,\n                          self.disable_grib_schema_normalization, self.tif_metadata_for_start_time,\n                          self.tif_metadata_for_end_time, is_zarr=self.zarr) as open_ds:\n\n            if not self.skip_creating_polygon:\n                logger.warning(\"Assumes that equal distance between consecutive points of latitude \"\n                               \"and longitude for the entire grid.\")\n                # Find the grid_resolution.\n                if open_ds['latitude'].size > 1 and open_ds['longitude'].size > 1:\n                    latitude_length = len(open_ds['latitude'])\n                    longitude_length = len(open_ds['longitude'])\n\n                    latitude_range = np.ptp(open_ds[\"latitude\"].values)\n                    longitude_range = np.ptp(open_ds[\"longitude\"].values)\n\n                    self.lat_grid_resolution = abs(latitude_range / latitude_length) / 2\n                    self.lon_grid_resolution = abs(longitude_range / longitude_length) / 2\n\n                else:\n                    self.skip_creating_polygon = True\n                    logger.warning(\"Polygon can't be genereated as provided dataset has a only single grid point.\")\n            else:\n                logger.info(\"Polygon is not created as '--skip_creating_polygon' flag passed.\")\n\n            if not self.skip_creating_geo_data_parquet:\n                if self.area:\n                    n, w, s, e = self.area\n                    open_ds = open_ds.sel(latitude=slice(n, s), longitude=slice(w, 
e))\n\n                lats = open_ds[\"latitude\"].values.tolist()\n                lons = open_ds[\"longitude\"].values.tolist()\n                self.generate_parquet(\n                    self.geo_data_parquet_path,\n                    [lats] if isinstance(lats, float) else lats,\n                    [lons] if isinstance(lons, float) else lons,\n                    self.lat_grid_resolution,\n                    self.lon_grid_resolution,\n                    self.skip_creating_polygon,\n                )\n            else:\n                logger.info(\"geo data parquet is not created as '--skip_creating_geo_data_parquet' flag passed.\")\n\n            # Define table from user input\n            if self.variables and not self.infer_schema and not open_ds.attrs['is_normalized']:\n                logger.info('Creating schema from input variables.')\n                table_schema = to_table_schema(\n                    [('latitude', 'FLOAT64'), ('longitude', 'FLOAT64'), ('time', 'TIMESTAMP')] +\n                    [(var, 'FLOAT64') for var in self.variables]\n                )\n            else:\n                logger.info('Inferring schema from data.')\n                ds: xr.Dataset = _only_target_vars(open_ds, self.variables)\n                table_schema = dataset_to_table_schema(ds)\n\n        if self.dry_run:\n            logger.debug('Created the BigQuery table with schema...')\n            logger.debug(f'\\n{pformat(table_schema)}')\n            return\n\n        # Create the table in BigQuery\n        try:\n            table = bigquery.Table(self.output_table, schema=table_schema)\n            self.table = bigquery.Client().create_table(table, exists_ok=True)\n        except Exception as e:\n            logger.error(f'Unable to create table in BigQuery: {e}')\n            raise\n\n    def prepare_coordinates(self, uri: str) -> t.Iterator[t.Tuple[str, t.Dict]]:\n        \"\"\"Open the dataset, filter by area, and prepare chunks of coordinates for parallel 
ingestion into BigQuery.\"\"\"\n        logger.info(f'Preparing coordinates for: {uri!r}.')\n\n        with open_dataset(uri, self.xarray_open_dataset_kwargs, self.disable_grib_schema_normalization,\n                          self.tif_metadata_for_start_time, self.tif_metadata_for_end_time, is_zarr=self.zarr) as ds:\n            data_ds: xr.Dataset = _only_target_vars(ds, self.variables)\n            for coordinate in get_coordinates(data_ds, uri):\n                yield uri, coordinate\n\n    def extract_rows(self, uri: str, coordinate: t.Dict) -> t.Iterator[t.Dict]:\n        \"\"\"Reads an asset and coordinates, then yields its rows as a mapping of column names to values.\"\"\"\n        logger.info(f'Extracting rows for {coordinate!r} of {uri!r}.')\n\n        # Re-calculate import time for streaming extractions.\n        if not self.import_time:\n            self.import_time = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)\n\n        with open_dataset(uri, self.xarray_open_dataset_kwargs, self.disable_grib_schema_normalization,\n                          self.tif_metadata_for_start_time, self.tif_metadata_for_end_time, is_zarr=self.zarr) as ds:\n            data_ds: xr.Dataset = _only_target_vars(ds, self.variables)\n            if self.area:\n                n, w, s, e = self.area\n                data_ds = data_ds.sel(latitude=slice(n, s), longitude=slice(w, e))\n                logger.info(f'Data filtered by area, size: {data_ds.nbytes}')\n            yield from self.to_rows(coordinate, data_ds, uri)\n\n    def to_rows(self, coordinate: t.Dict, ds: xr.Dataset, uri: str) -> t.Iterator[t.Dict]:\n        first_ts_raw = (\n            ds.time[0].values if isinstance(ds.time.values, np.ndarray)\n            else ds.time.values\n        )\n        first_time_step = to_json_serializable_type(first_ts_raw)\n        with open_local(self.geo_data_parquet_path) as master_lat_lon:\n            selected_ds = ds.loc[coordinate]\n\n            # Ensure that 
the latitude and longitude dimensions are in sync with the geo data parquet.\n            if not BQ_EXCLUDE_COORDS - set(selected_ds.dims.keys()):\n                selected_ds = selected_ds.transpose('latitude', 'longitude')\n\n            vector_df = pd.read_parquet(master_lat_lon)\n            if self.skip_creating_polygon:\n                vector_df[GEO_POLYGON_COLUMN] = None\n\n            # Add indexed coordinates.\n            for k, v in coordinate.items():\n                vector_df[k] = to_json_serializable_type(v)\n\n            # Add un-indexed coordinates.\n            # Filter out excluded coordinates from coords.\n            filtered_coords = (c for c in selected_ds.coords if c not in BQ_EXCLUDE_COORDS)\n            for c in filtered_coords:\n                if c not in coordinate and (not self.variables or c in self.variables):\n                    vector_df[c] = to_json_serializable_type(ensure_us_time_resolution(selected_ds[c].values))\n\n            # We are not directly assigning values to dataframe because we need to consider 'None'.\n            # Vectorized operations are generally more faster and efficient than iterating over rows.\n            # Furthermore, pd.Series does not enforces length consistency so just added a safety check.\n            for var in selected_ds.data_vars:\n                values = to_json_serializable_type(ensure_us_time_resolution(selected_ds[var].values.ravel()))\n                if len(values) != len(vector_df):\n                    raise ValueError(\n                        f\"Length of values {len(values)} does not match number of rows in DataFrame {len(vector_df)}.\"\n                    )\n                vector_df[var] = pd.Series(values, dtype=object)\n\n            vector_df[DATA_IMPORT_TIME_COLUMN] = self.import_time\n            vector_df[DATA_URI_COLUMN] = uri\n            vector_df[DATA_FIRST_STEP] = first_time_step\n            num_chunks = math.ceil(len(vector_df) / self.rows_chunk_size)\n            
logger.info(f\"{uri!r} -- {coordinate!r}'s vector_df divided into {num_chunks} chunk(s).\")\n            for i in range(num_chunks):\n                chunk = vector_df[i * self.rows_chunk_size:(i + 1) * self.rows_chunk_size]\n                rows = chunk.to_dict('records')\n                logger.info(f\"{uri!r} -- {coordinate!r}'s rows for {i} chunk converted to dict.\")\n                yield from rows\n\n    def chunks_to_rows(self, _, ds: xr.Dataset) -> t.Iterator[t.Dict]:\n        uri = ds.attrs.get(DATA_URI_COLUMN, '')\n        # Re-calculate import time for streaming extractions.\n        if not self.import_time or self.zarr:\n            self.import_time = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)\n\n        for coordinate in get_coordinates(ds, uri):\n            yield from self.to_rows(coordinate, ds, uri)\n\n    def expand(self, paths):\n        \"\"\"Extract rows of variables from data paths into a BigQuery table.\"\"\"\n        if not self.zarr:\n            extracted_rows = (\n                paths\n                | 'PrepareCoordinates' >> beam.FlatMap(self.prepare_coordinates)\n                | beam.Reshuffle()\n                | 'ExtractRows' >> beam.FlatMapTuple(self.extract_rows)\n            )\n        else:\n            xarray_open_dataset_kwargs = self.xarray_open_dataset_kwargs.copy()\n            xarray_open_dataset_kwargs.pop('chunks')\n            start_date = xarray_open_dataset_kwargs.pop('start_date', None)\n            end_date = xarray_open_dataset_kwargs.pop('end_date', None)\n            ds, chunks = xbeam.open_zarr(self.first_uri, **xarray_open_dataset_kwargs)\n\n            if start_date is not None and end_date is not None:\n                ds = ds.sel(time=slice(start_date, end_date))\n\n            ds.attrs[DATA_URI_COLUMN] = self.first_uri\n            extracted_rows = (\n                paths\n                | 'OpenChunks' >> xbeam.DatasetToChunks(ds, chunks)\n                | 'ExtractRows' >> 
beam.FlatMapTuple(self.chunks_to_rows)\n                | 'Window' >> beam.WindowInto(window.FixedWindows(60))\n                | 'AddTimestamp' >> beam.Map(timestamp_row)\n            )\n\n        if self.dry_run:\n            return extracted_rows | 'Log Rows' >> beam.Map(logger.info)\n        return (\n            extracted_rows\n            | 'WriteToBigQuery' >> WriteToBigQuery(\n                project=self.table.project,\n                dataset=self.table.dataset_id,\n                table=self.table.table_id,\n                write_disposition=BigQueryDisposition.WRITE_APPEND,\n                create_disposition=BigQueryDisposition.CREATE_NEVER)\n        )\n\n\ndef map_dtype_to_sql_type(var_type: np.dtype) -> str:\n    \"\"\"Maps a np.dtype to a suitable BigQuery column type.\"\"\"\n    if var_type in {np.dtype('float64'), np.dtype('float32'), np.dtype('timedelta64[ns]')}:\n        return 'FLOAT64'\n    elif var_type in {np.dtype('<M8[ns]')}:\n        return 'TIMESTAMP'\n    elif var_type in {np.dtype('int8'), np.dtype('int16'), np.dtype('int32'), np.dtype('int64')}:\n        return 'INT64'\n    raise ValueError(f\"Unknown mapping from '{var_type}' to SQL type\")\n\n\ndef dataset_to_table_schema(ds: xr.Dataset) -> t.List[bigquery.SchemaField]:\n    \"\"\"Returns a BigQuery table schema able to store the data in 'ds'.\"\"\"\n    # Get the columns and data types for all variables in the dataframe\n    columns = [\n        (str(col), map_dtype_to_sql_type(ds.variables[col].dtype))\n        for col in ds.variables.keys() if ds.variables[col].size != 0\n    ]\n\n    return to_table_schema(columns)\n\n\ndef to_table_schema(columns: t.List[t.Tuple[str, str]]) -> t.List[bigquery.SchemaField]:\n    # Fields are all Nullable because data may have NANs. 
We treat these as null.\n    fields = [\n        bigquery.SchemaField(column, var_type, mode='NULLABLE')\n        for column, var_type in columns\n    ]\n\n    # Add an extra columns for recording import metadata.\n    fields.append(bigquery.SchemaField(DATA_IMPORT_TIME_COLUMN, 'TIMESTAMP', mode='NULLABLE'))\n    fields.append(bigquery.SchemaField(DATA_URI_COLUMN, 'STRING', mode='NULLABLE'))\n    fields.append(bigquery.SchemaField(DATA_FIRST_STEP, 'TIMESTAMP', mode='NULLABLE'))\n    fields.append(bigquery.SchemaField(GEO_POINT_COLUMN, 'GEOGRAPHY', mode='NULLABLE'))\n    fields.append(bigquery.SchemaField(GEO_POLYGON_COLUMN, 'GEOGRAPHY', mode='NULLABLE'))\n\n    return fields\n\n\ndef timestamp_row(it: t.Dict) -> window.TimestampedValue:\n    \"\"\"Associate an extracted row with the import_time timestamp.\"\"\"\n    timestamp = it[DATA_IMPORT_TIME_COLUMN].timestamp()\n    return window.TimestampedValue(it, timestamp)\n\n\ndef fetch_geo_point(lat: float, long: float) -> str:\n    \"\"\"Calculates a geography point from an input latitude and longitude.\"\"\"\n    if lat > LATITUDE_RANGE[1] or lat < LATITUDE_RANGE[0]:\n        raise ValueError(f\"Invalid latitude value '{lat}'\")\n    if long > LONGITUDE_RANGE[1] or long < LONGITUDE_RANGE[0]:\n        raise ValueError(f\"Invalid longitude value '{long}'\")\n    point = geojson.dumps(geojson.Point((long, lat)))\n    return point\n\n\ndef fetch_geo_polygon(latitude: float, longitude: float, lat_grid_resolution: float, lon_grid_resolution: float) -> str:\n    \"\"\"Create a Polygon based on latitude, longitude and resolution.\n\n    Example ::\n        * - . - *\n        |       |\n        .   •   .\n        |       |\n        * - . 
- *\n    In order to create the polygon, we require the `*` point as indicated in the above example.\n    To determine the position of the `*` point, we find the `.` point.\n    The `get_lat_lon_range` function gives the `.` point and `bound_point` gives the `*` point.\n    \"\"\"\n    lat_lon_bound = bound_point(latitude, longitude, lat_grid_resolution, lon_grid_resolution)\n    polygon = geojson.dumps(geojson.Polygon([[\n        (lat_lon_bound[0][0], lat_lon_bound[0][1]),  # lower_left\n        (lat_lon_bound[1][0], lat_lon_bound[1][1]),  # upper_left\n        (lat_lon_bound[2][0], lat_lon_bound[2][1]),  # upper_right\n        (lat_lon_bound[3][0], lat_lon_bound[3][1]),  # lower_right\n        (lat_lon_bound[0][0], lat_lon_bound[0][1]),  # lower_left\n    ]]))\n    return polygon\n\n\ndef bound_point(latitude: float, longitude: float, lat_grid_resolution: float, lon_grid_resolution: float) -> t.List:\n    \"\"\"Calculate the bound point based on latitude, longitude and grid resolution.\n\n    Example ::\n        * - . - *\n        |       |\n        .   •   .\n        |       |\n        * - . 
- *\n    This function gives the `*` point in the above example.\n    \"\"\"\n    lat_in_bound = latitude in [90.0, -90.0]\n    lon_in_bound = longitude in [-180.0, 180.0]\n\n    lat_range = get_lat_lon_range(latitude, \"latitude\", lat_in_bound,\n                                  lat_grid_resolution, lon_grid_resolution)\n    lon_range = get_lat_lon_range(longitude, \"longitude\", lon_in_bound,\n                                  lat_grid_resolution, lon_grid_resolution)\n    lower_left = [lon_range[1], lat_range[1]]\n    upper_left = [lon_range[1], lat_range[0]]\n    upper_right = [lon_range[0], lat_range[0]]\n    lower_right = [lon_range[0], lat_range[1]]\n    return [lower_left, upper_left, upper_right, lower_right]\n\n\ndef get_lat_lon_range(value: float, lat_lon: str, is_point_out_of_bound: bool,\n                      lat_grid_resolution: float, lon_grid_resolution: float) -> t.List:\n    \"\"\"Calculate the latitude or longitude range around a point from its value and the grid resolution.\n\n    Example ::\n        * - . - *\n        |       |\n        .   •   .\n        |       |\n        * - . - *\n    This function gives the `.` point in the above example.\n    \"\"\"\n    if lat_lon == 'latitude':\n        if is_point_out_of_bound:\n            return [-90 + lat_grid_resolution, 90 - lat_grid_resolution]\n        else:\n            return [value + lat_grid_resolution, value - lat_grid_resolution]\n    else:\n        if is_point_out_of_bound:\n            return [-180 + lon_grid_resolution, 180 - lon_grid_resolution]\n        else:\n            return [value + lon_grid_resolution, value - lon_grid_resolution]\n"
  },
  {
    "path": "weather_mv/loader_pipeline/bq_test.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport datetime\nimport json\nimport logging\nimport os\nimport tempfile\nimport typing as t\nimport unittest\n\nimport geojson\nimport numpy as np\nimport pandas as pd\nimport simplejson\nimport xarray as xr\nfrom apache_beam.testing.test_pipeline import TestPipeline\nfrom apache_beam.testing.util import assert_that, is_not_empty\nfrom google.cloud.bigquery import SchemaField\n\nfrom .bq import (\n    DEFAULT_IMPORT_TIME,\n    dataset_to_table_schema,\n    fetch_geo_point,\n    fetch_geo_polygon,\n    ToBigQuery,\n)\nfrom .sinks_test import TestDataBase, _handle_missing_grib_be\nfrom .util import _only_target_vars\n\nlogger = logging.getLogger(__name__)\n\n\nclass SchemaCreationTests(TestDataBase):\n\n    def setUp(self) -> None:\n        super().setUp()\n        self.test_dataset = {\n            \"coords\": {\"a\": {\"dims\": (\"a\",), \"data\": [pd.Timestamp(0)], \"attrs\": {}}},\n            \"attrs\": {\"is_normalized\": False},\n            \"dims\": \"a\",\n            \"data_vars\": {\n                \"b\": {\"dims\": (\"a\",), \"data\": [np.float32(1.0)]},\n                \"c\": {\"dims\": (\"a\",), \"data\": [np.float64(2.0)]},\n                \"d\": {\"dims\": (\"a\",), \"data\": [3.0]},\n            }\n        }\n        self.test_dataset__with_schema_normalization = {\n            \"coords\": {\"a\": {\"dims\": (\"a\",), \"data\": [pd.Timestamp(0)], 
\"attrs\": {}}},\n            \"attrs\": {\"is_normalized\": True},\n            \"dims\": \"a\",\n            \"data_vars\": {\n                \"e_0_00_instant_b\": {\"dims\": (\"a\",), \"data\": [np.float32(1.0)]},\n                \"e_0_00_instant_c\": {\"dims\": (\"a\",), \"data\": [np.float64(2.0)]},\n                \"e_0_00_instant_d\": {\"dims\": (\"a\",), \"data\": [3.0]},\n            }\n        }\n\n    def test_schema_generation(self):\n        ds = xr.Dataset.from_dict(self.test_dataset)\n        schema = dataset_to_table_schema(ds)\n        expected_schema = [\n            SchemaField('a', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('b', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('c', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('d', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('data_import_time', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None),\n            SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None),\n            SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None)\n        ]\n        self.assertListEqual(schema, expected_schema)\n\n    def test_schema_generation__with_schema_normalization(self):\n        ds = xr.Dataset.from_dict(self.test_dataset__with_schema_normalization)\n        schema = dataset_to_table_schema(ds)\n        expected_schema = [\n            SchemaField('a', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('e_0_00_instant_b', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('e_0_00_instant_c', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('e_0_00_instant_d', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('data_import_time', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            
SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None),\n            SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None),\n            SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None)\n        ]\n        self.assertListEqual(schema, expected_schema)\n\n    def test_schema_generation__with_target_columns(self):\n        target_variables = ['c', 'd']\n        ds = _only_target_vars(xr.Dataset.from_dict(self.test_dataset), target_variables)\n        schema = dataset_to_table_schema(ds)\n        expected_schema = [\n            SchemaField('a', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('c', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('d', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('data_import_time', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None),\n            SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None),\n            SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None)\n        ]\n        self.assertListEqual(schema, expected_schema)\n\n    def test_schema_generation__with_target_columns__with_schema_normalization(self):\n        target_variables = ['c', 'd']\n        ds = _only_target_vars(xr.Dataset.from_dict(self.test_dataset__with_schema_normalization), target_variables)\n        schema = dataset_to_table_schema(ds)\n        expected_schema = [\n            SchemaField('a', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('e_0_00_instant_c', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('e_0_00_instant_d', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('data_import_time', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            
SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None),\n            SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None),\n            SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None)\n        ]\n        self.assertListEqual(schema, expected_schema)\n\n    def test_schema_generation__no_targets_specified(self):\n        target_variables = []  # intentionally empty\n        ds = _only_target_vars(xr.Dataset.from_dict(self.test_dataset), target_variables)\n        schema = dataset_to_table_schema(ds)\n        expected_schema = [\n            SchemaField('a', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('b', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('c', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('d', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('data_import_time', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None),\n            SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None),\n            SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None)\n        ]\n        self.assertListEqual(schema, expected_schema)\n\n    def test_schema_generation__no_targets_specified__with_schema_normalization(self):\n        target_variables = []  # intentionally empty\n        ds = _only_target_vars(xr.Dataset.from_dict(self.test_dataset__with_schema_normalization), target_variables)\n        schema = dataset_to_table_schema(ds)\n        expected_schema = [\n            SchemaField('a', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('e_0_00_instant_b', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('e_0_00_instant_c', 'FLOAT64', 'NULLABLE', None, (), None),\n   
         SchemaField('e_0_00_instant_d', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('data_import_time', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None),\n            SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None),\n            SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None)\n        ]\n        self.assertListEqual(schema, expected_schema)\n\n    def test_schema_generation__missing_target(self):\n        with self.assertRaisesRegex(AssertionError, 'Target variable must be in original dataset.'):\n            target_variables = ['a', 'foobar', 'd']\n            _only_target_vars(xr.Dataset.from_dict(self.test_dataset), target_variables)\n\n    def test_schema_generation__missing_target__with_schema_normalization(self):\n        with self.assertRaisesRegex(AssertionError, 'Target variable must be in original dataset.'):\n            target_variables = ['a', 'foobar', 'd']\n            _only_target_vars(xr.Dataset.from_dict(self.test_dataset__with_schema_normalization), target_variables)\n\n    @_handle_missing_grib_be\n    def test_schema_generation__non_index_coords(self):\n        test_single_var = xr.open_dataset(\n            f'{self.test_data_folder}/test_data_grib_single_timestep',\n            engine='cfgrib'\n        )\n        schema = dataset_to_table_schema(test_single_var)\n        expected_schema = [\n            SchemaField('number', 'INT64', 'NULLABLE', None, (), None),\n            SchemaField('time', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('step', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('surface', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('latitude', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('longitude', 'FLOAT64', 'NULLABLE', None, (), 
None),\n            SchemaField('valid_time', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('z', 'FLOAT64', 'NULLABLE', None, (), None),\n            SchemaField('data_import_time', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None),\n            SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None),\n            SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None),\n            SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None)\n\n        ]\n        self.assertListEqual(schema, expected_schema)\n\n\nclass ExtractRowsTestBase(TestDataBase):\n\n    def extract(self, data_path, *, variables=None, area=None, open_dataset_kwargs=None,\n                import_time=DEFAULT_IMPORT_TIME, disable_grib_schema_normalization=False,\n                tif_metadata_for_start_time=None, tif_metadata_for_end_time=None, zarr: bool = False, zarr_kwargs=None,\n                skip_creating_polygon: bool = False, geo_data_parquet_path,\n                skip_creating_geo_data_parquet : bool = False) -> t.Iterator[t.Dict]:\n        if zarr_kwargs is None:\n            zarr_kwargs = {}\n        op = ToBigQuery.from_kwargs(\n            first_uri=data_path, dry_run=True, zarr=zarr, zarr_kwargs=zarr_kwargs,\n            output_table='foo.bar.baz', variables=variables, area=area,\n            xarray_open_dataset_kwargs=open_dataset_kwargs, import_time=import_time, infer_schema=False,\n            tif_metadata_for_start_time=tif_metadata_for_start_time,\n            tif_metadata_for_end_time=tif_metadata_for_end_time, skip_region_validation=True,\n            disable_grib_schema_normalization=disable_grib_schema_normalization, rows_chunk_size=1_000_000,\n            skip_creating_polygon=skip_creating_polygon,\n            geo_data_parquet_path=geo_data_parquet_path,\n            skip_creating_geo_data_parquet=skip_creating_geo_data_parquet)\n        coords = 
op.prepare_coordinates(data_path)\n        for uri, chunk in coords:\n            yield from op.extract_rows(uri, chunk)\n\n    def assertGeopointEqual(self, actual: str, expected: str) -> None:\n        expected_json, actual_json = geojson.loads(expected), geojson.loads(actual)\n        self.assertEqual(actual_json['type'], expected_json['type'])\n        self.assertTrue(np.allclose(actual_json['coordinates'], expected_json['coordinates']))\n\n    def assertRowsEqual(self, actual: t.Dict, expected: t.Dict):\n        self.assertEqual(expected.keys(), actual.keys())\n        for key in expected.keys():\n            if isinstance(expected[key], str):\n                # Handle Geopoint JSON strings...\n                try:\n                    self.assertGeopointEqual(actual[key], expected[key])\n                except (simplejson.JSONDecodeError, json.JSONDecodeError, KeyError):\n                    self.assertEqual(actual[key], expected[key])\n                continue\n            self.assertAlmostEqual(actual[key], expected[key], places=4)\n            self.assertNotIsInstance(actual[key], np.dtype)\n            self.assertNotIsInstance(actual[key], np.float64)\n            self.assertNotIsInstance(actual[key], np.float32)\n\n\nclass ExtractRowsTest(ExtractRowsTestBase):\n\n    def setUp(self) -> None:\n        super().setUp()\n        self.test_data_path = f'{self.test_data_folder}/test_data_20180101.nc'\n        self.geo_data_parquet_path = f'{self.test_data_folder}/test_data_20180101_geo_data.parquet'\n\n    def test_01_extract_rows(self):\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_polygon=False,\n                skip_creating_geo_data_parquet=False\n            )\n        )\n        expected = {\n            'd2m': 242.3035430908203,\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            
'data_first_step': '2018-01-02T06:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 49.0,\n            'longitude': -108.0,\n            'time': '2018-01-02T06:00:00+00:00',\n            'u10': 3.4776244163513184,\n            'v10': 0.03294110298156738,\n            'geo_point': geojson.dumps(geojson.Point((-108.0, 49.0))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (-108.098837, 48.900826), (-108.098837, 49.099174),\n                        (-107.901163, 49.099174), (-107.901163, 48.900826),\n                        (-108.098837, 48.900826)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    def test_02_extract_rows__with_subset_variables(self):\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=True,\n                skip_creating_polygon=True,\n                variables=['u10']\n            )\n        )\n        expected = {\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '2018-01-02T06:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 49.0,\n            'longitude': -108.0,\n            'time': '2018-01-02T06:00:00+00:00',\n            'u10': 3.4776244163513184,\n            'geo_point': geojson.dumps(geojson.Point((-108.0, 49.0))),\n            'geo_polygon': None\n        }\n        self.assertRowsEqual(actual, expected)\n\n    def test_03_extract_rows__specific_area(self):\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                area=[45, -103, 33, -92],\n                geo_data_parquet_path='./geo_data.parquet',\n                skip_creating_polygon=True\n            )\n        )\n        expected = {\n            'd2m': 246.19993591308594,\n            'data_import_time': 
'1970-01-01T00:00:00+00:00',\n            'data_first_step': '2018-01-02T06:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 45.0,\n            'longitude': -103.0,\n            'time': '2018-01-02T06:00:00+00:00',\n            'u10': 2.73445987701416,\n            'v10': 0.08277571201324463,\n            'geo_point': geojson.dumps(geojson.Point((-103.0, 45.0))),\n            'geo_polygon': None\n        }\n        self.assertRowsEqual(actual, expected)\n\n    def test_04_extract_rows__specific_area_float_points(self):\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path='./geo_data.parquet',\n                area=[45.34, -103.45, 33.34, -92.87]\n            )\n        )\n        expected = {\n            'd2m': 246.47116088867188,\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '2018-01-02T06:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 45.20000076293945,\n            'longitude': -103.4000015258789,\n            'time': '2018-01-02T06:00:00+00:00',\n            'u10': 3.94743275642395,\n            'v10': -0.19749987125396729,\n            'geo_point': geojson.dumps(geojson.Point((-103.400002, 45.200001))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (-103.498839, 45.100827), (-103.498839, 45.299174),\n                        (-103.301164, 45.299174), (-103.301164, 45.100827),\n                        (-103.498839, 45.100827)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    def test_05_extract_rows_raises_error_when_geo_data_parquet_dimensions_mismatch(self):\n        with self.assertRaisesRegex(ValueError, 'Length of values '):\n            next(\n                self.extract(\n                    self.test_data_path,\n                    area=[45, -103, 33, -92],\n                    
geo_data_parquet_path=self.geo_data_parquet_path,\n                    skip_creating_geo_data_parquet=True,\n                    skip_creating_polygon=True\n                )\n            )\n\n    def test_06_extract_rows__specify_import_time(self):\n        now = datetime.datetime.utcnow().isoformat()\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=True,\n                import_time=now\n            )\n        )\n        expected = {\n            'd2m': 242.3035430908203,\n            'data_import_time': now,\n            'data_first_step': '2018-01-02T06:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 49.0,\n            'longitude': -108.0,\n            'time': '2018-01-02T06:00:00+00:00',\n            'u10': 3.4776244163513184,\n            'v10': 0.03294110298156738,\n            'geo_point': geojson.dumps(geojson.Point((-108.0, 49.0))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (-108.098837, 48.900826), (-108.098837, 49.099174),\n                        (-107.901163, 49.099174), (-107.901163, 48.900826),\n                        (-108.098837, 48.900826)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    def test_07_extract_rows_single_point(self):\n        self.test_data_path = f'{self.test_data_folder}/test_data_single_point.nc'\n        self.geo_data_parquet_path = f'{self.test_data_folder}/test_data_single_point_geo_data.parquet'\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=False,\n            )\n        )\n        expected = {\n            'd2m': 242.3035430908203,\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            
'data_first_step': '2018-01-02T06:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 49.0,\n            'longitude': -108.0,\n            'time': '2018-01-02T06:00:00+00:00',\n            'u10': 3.4776244163513184,\n            'v10': 0.03294110298156738,\n            'geo_point': geojson.dumps(geojson.Point((-108.0, 49.0))),\n            'geo_polygon': None\n        }\n        self.assertRowsEqual(actual, expected)\n\n    def test_08_extract_rows_nan(self):\n        self.test_data_path = f'{self.test_data_folder}/test_data_has_nan.nc'\n        self.geo_data_parquet_path = f'{self.test_data_folder}/test_data_has_nan_geo_data.parquet'\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=False,\n            )\n        )\n        expected = {\n            'd2m': 242.3035430908203,\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '2018-01-02T06:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 49.0,\n            'longitude': -108.0,\n            'time': '2018-01-02T06:00:00+00:00',\n            'u10': None,\n            'v10': 0.03294110298156738,\n            'geo_point': geojson.dumps(geojson.Point((-108.0, 49.0))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (-108.098837, 48.900826), (-108.098837, 49.099174),\n                        (-107.901163, 49.099174), (-107.901163, 48.900826),\n                        (-108.098837, 48.900826)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    def test_09_extract_rows__with_valid_lat_long_with_point(self):\n        valid_lat_long = [[-90, 0], [-90, 1], [-45, -180], [-45, -45], [0, 0], [45, 45], [45, -180], [90, -1],\n                          [90, 0]]\n        actual_val = [\n            '{\"type\": 
\"Point\", \"coordinates\": [0, -90]}',\n            '{\"type\": \"Point\", \"coordinates\": [1, -90]}',\n            '{\"type\": \"Point\", \"coordinates\": [-180, -45]}',\n            '{\"type\": \"Point\", \"coordinates\": [-45, -45]}',\n            '{\"type\": \"Point\", \"coordinates\": [0, 0]}',\n            '{\"type\": \"Point\", \"coordinates\": [45, 45]}',\n            '{\"type\": \"Point\", \"coordinates\": [-180, 45]}',\n            '{\"type\": \"Point\", \"coordinates\": [-1, 90]}',\n            '{\"type\": \"Point\", \"coordinates\": [0, 90]}'\n        ]\n        for actual, (lat, long) in zip(actual_val, valid_lat_long):\n            with self.subTest():\n                expected = fetch_geo_point(lat, long)\n                self.assertEqual(actual, expected)\n\n    def test_10_extract_rows__with_valid_lat_long_with_polygon(self):\n        valid_lat_long = [[-90, 0], [-90, -180], [-45, -180], [-45, 180], [0, 0], [90, 180], [45, -180], [-90, 180],\n                          [90, 1], [0, 180], [1, -180], [90, -180]]\n        actual_val = [\n            '{\"type\": \"Polygon\", \"coordinates\": [[[-1, 89], [-1, -89], [1, -89], [1, 89], [-1, 89]]]}',\n            '{\"type\": \"Polygon\", \"coordinates\": [[[179, 89], [179, -89], [-179, -89], [-179, 89], [179, 89]]]}',\n            '{\"type\": \"Polygon\", \"coordinates\": [[[179, -46], [179, -44], [-179, -44], [-179, -46], [179, -46]]]}',\n            '{\"type\": \"Polygon\", \"coordinates\": [[[179, -46], [179, -44], [-179, -44], [-179, -46], [179, -46]]]}',\n            '{\"type\": \"Polygon\", \"coordinates\": [[[-1, -1], [-1, 1], [1, 1], [1, -1], [-1, -1]]]}',\n            '{\"type\": \"Polygon\", \"coordinates\": [[[179, 89], [179, -89], [-179, -89], [-179, 89], [179, 89]]]}',\n            '{\"type\": \"Polygon\", \"coordinates\": [[[179, 44], [179, 46], [-179, 46], [-179, 44], [179, 44]]]}',\n            '{\"type\": \"Polygon\", \"coordinates\": [[[179, 89], [179, -89], [-179, -89], [-179, 89], 
[179, 89]]]}',\n            '{\"type\": \"Polygon\", \"coordinates\": [[[0, 89], [0, -89], [2, -89], [2, 89], [0, 89]]]}',\n            '{\"type\": \"Polygon\", \"coordinates\": [[[179, -1], [179, 1], [-179, 1], [-179, -1], [179, -1]]]}',\n            '{\"type\": \"Polygon\", \"coordinates\": [[[179, 0], [179, 2], [-179, 2], [-179, 0], [179, 0]]]}',\n            '{\"type\": \"Polygon\", \"coordinates\": [[[179, 89], [179, -89], [-179, -89], [-179, 89], [179, 89]]]}'\n        ]\n        lat_grid_resolution = 1\n        lon_grid_resolution = 1\n        for actual, (lat, long) in zip(actual_val, valid_lat_long):\n            with self.subTest():\n                expected = fetch_geo_polygon(lat, long, lat_grid_resolution, lon_grid_resolution)\n                self.assertEqual(actual, expected)\n\n    def test_11_extract_rows__with_invalid_lat_lon(self):\n        invalid_lat_long = [[-100, -2000], [-100, -500], [100, 500], [100, 2000]]\n        for (lat, long) in invalid_lat_long:\n            with self.subTest():\n                with self.assertRaises(ValueError):\n                    fetch_geo_point(lat, long)\n\n    def test_12_extract_rows_zarr(self):\n        input_path = os.path.join(self.test_data_folder, 'test_data.zarr')\n        geo_data_parquet_path = os.path.join(self.test_data_folder, 'test_data_zarr_geo_data.parquet')\n        actual = next(\n            self.extract(\n                input_path,\n                geo_data_parquet_path=geo_data_parquet_path,\n                skip_creating_geo_data_parquet=False,\n                zarr=True\n            )\n        )\n        expected = {\n            'cape': 0.623291015625,\n            'd2m': 237.5404052734375,\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '1959-01-01T00:00:00+00:00',\n            'data_uri': input_path,\n            'latitude': 90,\n            'longitude': 0,\n            'time': '1959-01-01T00:00:00+00:00',\n            'geo_point': 
geojson.dumps(geojson.Point((0.0, 90.0))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (-0.124913, 89.875173), (-0.124913, -89.875173),\n                        (0.124913, -89.875173), (0.124913, 89.875173),\n                        (-0.124913, 89.875173)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    def test_13_droping_variable_while_opening_zarr(self):\n        input_path = os.path.join(self.test_data_folder, 'test_data.zarr')\n        geo_data_parquet_path = os.path.join(self.test_data_folder, 'test_data_zarr_geo_data.parquet')\n        actual = next(\n            self.extract(\n                input_path,\n                geo_data_parquet_path=geo_data_parquet_path,\n                skip_creating_geo_data_parquet=True,\n                zarr=True,\n                zarr_kwargs={'drop_variables': ['cape']}\n            )\n        )\n        expected = {\n            'd2m': 237.5404052734375,\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '1959-01-01T00:00:00+00:00',\n            'data_uri': input_path,\n            'latitude': 90,\n            'longitude': 0,\n            'time': '1959-01-01T00:00:00+00:00',\n            'geo_point': geojson.dumps(geojson.Point((0.0, 90.0))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (-0.124913, 89.875173), (-0.124913, -89.875173),\n                        (0.124913, -89.875173), (0.124913, 89.875173),\n                        (-0.124913, 89.875173)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n\nclass ExtractRowsTifSupportTest(ExtractRowsTestBase):\n\n    def setUp(self) -> None:\n        super().setUp()\n        self.test_data_path = f'{self.test_data_folder}/test_data_tif_time.tif'\n        self.geo_data_parquet_path = f'{self.test_data_folder}/test_data_tif_time_geo_data.parquet'\n\n    def test_01_extract_rows_with_end_time(self):\n        actual = next(\n 
           self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=False,\n                tif_metadata_for_start_time='start_time',\n                tif_metadata_for_end_time='end_time'\n            )\n        )\n        expected = {\n            'dewpoint_temperature_2m': 281.09349060058594,\n            'temperature_2m': 296.8329772949219,\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '2020-07-01T00:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 42.09783344918844,\n            'longitude': -123.66686981141397,\n            'time': '2020-07-01T00:00:00+00:00',\n            'valid_time': '2020-07-01T00:00:00+00:00',\n            'geo_point': geojson.dumps(geojson.Point((-123.66687, 42.097833))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (-123.669853, 42.095605), (-123.669853, 42.100066),\n                        (-123.663885, 42.100066), (-123.663885, 42.095605),\n                        (-123.669853, 42.095605)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    def test_02_extract_rows_without_end_time(self):\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=True,\n                tif_metadata_for_start_time='start_time'\n            )\n        )\n        expected = {\n            'dewpoint_temperature_2m': 281.09349060058594,\n            'temperature_2m': 296.8329772949219,\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '2020-07-01T00:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 42.09783344918844,\n            'longitude': -123.66686981141397,\n            'time': 
'2020-07-01T00:00:00+00:00',\n            'geo_point': geojson.dumps(geojson.Point((-123.66687, 42.097833))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (-123.669853, 42.095605), (-123.669853, 42.100066),\n                        (-123.663885, 42.100066), (-123.663885, 42.095605),\n                        (-123.669853, 42.095605)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n\nclass ExtractRowsGribSupportTest(ExtractRowsTestBase):\n\n    def setUp(self) -> None:\n        super().setUp()\n        self.test_data_path = f'{self.test_data_folder}/test_data_grib_single_timestep'\n        self.geo_data_parquet_path = f'{self.test_data_folder}/test_data_grib_single_timestep_geo_data.parquet'\n\n    @_handle_missing_grib_be\n    def test_01_extract_rows(self):\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=False,\n                disable_grib_schema_normalization=True\n            )\n        )\n        expected = {\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '2021-10-18T06:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 90.0,\n            'longitude': -180.0,\n            'number': 0,\n            'step': 0.0,\n            'surface': 0.0,\n            'time': '2021-10-18T06:00:00+00:00',\n            'valid_time': '2021-10-18T06:00:00+00:00',\n            'z': 1.42578125,\n            'geo_point': geojson.dumps(geojson.Point((-180.0, 90.0))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (179.950014, 89.950028), (179.950014, -89.950028),\n                        (-179.950014, -89.950028), (-179.950014, 89.950028),\n                        (179.950014, 89.950028)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    
@_handle_missing_grib_be\n    def test_02_extract_rows__with_vars__excludes_non_index_coords__without_schema_normalization(self):\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=True,\n                disable_grib_schema_normalization=True, variables=['z']))\n        expected = {\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '2021-10-18T06:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 90.0,\n            'longitude': -180.0,\n            'z': 1.42578125,\n            'geo_point': geojson.dumps(geojson.Point((-180.0, 90.0))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (179.950014, 89.950028), (179.950014, -89.950028),\n                        (-179.950014, -89.950028), (-179.950014, 89.950028),\n                        (179.950014, 89.950028)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    @_handle_missing_grib_be\n    def test_03_extract_rows__with_vars__includes_coordinates_in_vars__without_schema_normalization(self):\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=True,\n                disable_grib_schema_normalization=True,\n                variables=['z', 'step']\n            )\n        )\n        expected = {\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '2021-10-18T06:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 90.0,\n            'longitude': -180.0,\n            'step': 0,\n            'z': 1.42578125,\n            'geo_point': geojson.dumps(geojson.Point((-180.0, 90.0))),\n            'geo_polygon': 
geojson.dumps(geojson.Polygon([\n                        (179.950014, 89.950028), (179.950014, -89.950028),\n                        (-179.950014, -89.950028), (-179.950014, 89.950028),\n                        (179.950014, 89.950028)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    @_handle_missing_grib_be\n    def test_04_extract_rows__with_vars__excludes_non_index_coords__with_schema_normalization(self):\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=True,\n                variables=['z']\n            )\n        )\n        expected = {\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '2021-10-18T06:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 90.0,\n            'longitude': -180.0,\n            'surface_0_00_instant_z': 1.42578125,\n            'geo_point': geojson.dumps(geojson.Point((-180.0, 90.0))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (179.950014, 89.950028), (179.950014, -89.950028),\n                        (-179.950014, -89.950028), (-179.950014, 89.950028),\n                        (179.950014, 89.950028)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    @_handle_missing_grib_be\n    def test_05_extract_rows__with_vars__includes_coordinates_in_vars__with_schema_normalization(self):\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=True,\n                variables=['z', 'step']\n            )\n        )\n        expected = {\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '2021-10-18T06:00:00+00:00',\n            'data_uri': 
self.test_data_path,\n            'latitude': 90.0,\n            'longitude': -180.0,\n            'step': 0,\n            'surface_0_00_instant_z': 1.42578125,\n            'geo_point': geojson.dumps(geojson.Point((-180.0, 90.0))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (179.950014, 89.950028), (179.950014, -89.950028),\n                        (-179.950014, -89.950028), (-179.950014, 89.950028),\n                        (179.950014, 89.950028)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    @_handle_missing_grib_be\n    def test_06_multiple_editions__without_schema_normalization(self):\n        self.test_data_path = f'{self.test_data_folder}/test_data_grib_multiple_edition_single_timestep.bz2'\n        self.geo_data_parquet_path = (\n            f'{self.test_data_folder}/test_data_grib_multiple_edition_single_timestep_geo_data.parquet'\n        )\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=False,\n                disable_grib_schema_normalization=True\n            )\n        )\n        expected = {\n            'cape': 0.0,\n            'cbh': None,\n            'cp': 0.0,\n            'crr': 0.0,\n            'd2m': 248.3846893310547,\n            'data_first_step': '2021-12-10T12:00:00+00:00',\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'depthBelowLandLayer': 0.0,\n            'dsrp': 0.0,\n            'fdir': 0.0,\n            'hcc': 0.0,\n            'hcct': None,\n            'hwbt0': 0.0,\n            'i10fg': 7.41250467300415,\n            'latitude': 90.0,\n            'longitude': -180.0,\n            'lsp': 1.1444091796875e-05,\n            'mcc': 0.0,\n            'msl': 99867.3125,\n            'number': 0,\n            'p3020': 20306.701171875,\n       
     'sd': 0.0,\n            'sf': 1.049041748046875e-05,\n            'sp': 99867.15625,\n            'step': 28800.0,\n            'stl1': 251.02520751953125,\n            'surface': 0.0,\n            'swvl1': -1.9539930654413618e-13,\n            't2m': 251.18968200683594,\n            'tcc': 0.9609375,\n            'tcrw': 0.0,\n            'tcw': 2.314192295074463,\n            'tcwv': 2.314192295074463,\n            'time': '2021-12-10T12:00:00+00:00',\n            'tp': 1.1444091796875e-05,\n            'tsr': 0.0,\n            'u10': -4.6668853759765625,\n            'u100': -7.6197662353515625,\n            'u200': -9.176498413085938,\n            'v10': -3.2414093017578125,\n            'v100': -4.1650390625,\n            'v200': -3.6647186279296875,\n            'valid_time': '2021-12-10T20:00:00+00:00',\n            'geo_point': geojson.dumps(geojson.Point((-180.0, 90.0))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (179.950014, 89.950028), (179.950014, -89.950028),\n                        (-179.950014, -89.950028), (-179.950014, 89.950028),\n                        (179.950014, 89.950028)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    @_handle_missing_grib_be\n    def test_07_multiple_editions__with_schema_normalization(self):\n        self.test_data_path = f'{self.test_data_folder}/test_data_grib_multiple_edition_single_timestep.bz2'\n        self.geo_data_parquet_path = (\n            f'{self.test_data_folder}/test_data_grib_multiple_edition_single_timestep_geo_data.parquet'\n        )\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=True,\n            )\n        )\n        expected = {\n            'surface_0_00_instant_cape': 0.0,\n            'surface_0_00_instant_cbh': None,\n            'surface_0_00_instant_cp': 0.0,\n           
 'surface_0_00_instant_crr': 0.0,\n            'surface_0_00_instant_d2m': 248.3846893310547,\n            'data_first_step': '2021-12-10T12:00:00+00:00',\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'surface_0_00_instant_dsrp': 0.0,\n            'surface_0_00_instant_fdir': 0.0,\n            'surface_0_00_instant_hcc': 0.0,\n            'surface_0_00_instant_hcct': None,\n            'surface_0_00_instant_hwbt0': 0.0,\n            'surface_0_00_instant_i10fg': 7.41250467300415,\n            'latitude': 90.0,\n            'longitude': -180.0,\n            'surface_0_00_instant_lsp': 1.1444091796875e-05,\n            'surface_0_00_instant_mcc': 0.0,\n            'surface_0_00_instant_msl': 99867.3125,\n            'number': 0,\n            'surface_0_00_instant_p3020': 20306.701171875,\n            'surface_0_00_instant_sd': 0.0,\n            'surface_0_00_instant_sf': 1.049041748046875e-05,\n            'surface_0_00_instant_sp': 99867.15625,\n            'step': 28800.0,\n            'depthBelowLandLayer_0_00_instant_stl1': 251.02520751953125,\n            'depthBelowLandLayer_0_00_instant_swvl1': -1.9539930654413618e-13,\n            'depthBelowLandLayer_7_00_instant_stl2': 253.54124450683594,\n            'entireAtmosphere_0_00_instant_litoti': 0.0,\n            'surface_0_00_instant_t2m': 251.18968200683594,\n            'surface_0_00_instant_tcc': 0.9609375,\n            'surface_0_00_instant_tcrw': 0.0,\n            'surface_0_00_instant_tcw': 2.314192295074463,\n            'surface_0_00_instant_tcwv': 2.314192295074463,\n            'time': '2021-12-10T12:00:00+00:00',\n            'surface_0_00_instant_tp': 1.1444091796875e-05,\n            'surface_0_00_instant_tsr': 0.0,\n            'surface_0_00_instant_u10': -4.6668853759765625,\n            'surface_0_00_instant_u100': -7.6197662353515625,\n            'surface_0_00_instant_u200': -9.176498413085938,\n            
'surface_0_00_instant_v10': -3.2414093017578125,\n            'surface_0_00_instant_v100': -4.1650390625,\n            'surface_0_00_instant_v200': -3.6647186279296875,\n            'surface_0_00_instant_ptype': 5.0,\n            'surface_0_00_instant_tprate': 0.0,\n            'surface_0_00_instant_ceil': 179.17018127441406,\n            'valid_time': '2021-12-10T20:00:00+00:00',\n            'geo_point': geojson.dumps(geojson.Point((-180.0, 90.0))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (179.950014, 89.950028), (179.950014, -89.950028),\n                        (-179.950014, -89.950028), (-179.950014, 89.950028),\n                        (179.950014, 89.950028)]))\n        }\n        self.assertRowsEqual(actual, expected)\n\n    @_handle_missing_grib_be\n    def test_08_multiple_editions__with_vars__includes_coordinates_in_vars__with_schema_normalization(self):\n        self.test_data_path = f'{self.test_data_folder}/test_data_grib_multiple_edition_single_timestep.bz2'\n        self.geo_data_parquet_path = (\n            f'{self.test_data_folder}/test_data_grib_multiple_edition_single_timestep_geo_data.parquet'\n        )\n        actual = next(\n            self.extract(\n                self.test_data_path,\n                geo_data_parquet_path=self.geo_data_parquet_path,\n                skip_creating_geo_data_parquet=True,\n                variables=['p3020', 'depthBelowLandLayer', 'step']\n            )\n        )\n        expected = {\n            'data_import_time': '1970-01-01T00:00:00+00:00',\n            'data_first_step': '2021-12-10T12:00:00+00:00',\n            'data_uri': self.test_data_path,\n            'latitude': 90.0,\n            'longitude': -180.0,\n            'step': 28800.0,\n            'surface_0_00_instant_p3020': 20306.701171875,\n            'depthBelowLandLayer_0_00_instant_swvl1': -1.9539930654413618e-13,\n            'depthBelowLandLayer_0_00_instant_stl1': 251.02520751953125,\n      
      'depthBelowLandLayer_7_00_instant_stl2': 253.54124450683594,\n            'geo_point': geojson.dumps(geojson.Point((-180.0, 90.0))),\n            'geo_polygon': geojson.dumps(geojson.Polygon([\n                        (179.950014, 89.950028), (179.950014, -89.950028),\n                        (-179.950014, -89.950028), (-179.950014, 89.950028),\n                        (179.950014, 89.950028)]))\n\n        }\n        self.assertRowsEqual(actual, expected)\n\n\nclass ExtractRowsFromZarrTest(ExtractRowsTestBase):\n\n    def setUp(self) -> None:\n        super().setUp()\n        self.tmpdir = tempfile.TemporaryDirectory()\n\n    def tearDown(self) -> None:\n        super().tearDown()\n        self.tmpdir.cleanup()\n\n    def test_01_extracts_rows(self):\n        input_zarr = os.path.join(self.tmpdir.name, 'air_temp.zarr')\n\n        ds = (\n            xr.tutorial.open_dataset('air_temperature', cache_dir=self.test_data_folder)\n            .isel(time=slice(0, 4), lat=slice(0, 4), lon=slice(0, 4))\n            .rename(dict(lon='longitude', lat='latitude'))\n        )\n        ds.to_zarr(input_zarr)\n\n        op = ToBigQuery.from_kwargs(\n            first_uri=input_zarr, zarr_kwargs=dict(chunks=None, consolidated=True), dry_run=True, zarr=True,\n            output_table='foo.bar.baz',\n            variables=list(), area=list(), xarray_open_dataset_kwargs=dict(), import_time=None, infer_schema=False,\n            tif_metadata_for_start_time=None, tif_metadata_for_end_time=None, skip_region_validation=True,\n            disable_grib_schema_normalization=False,\n            geo_data_parquet_path=os.path.join(self.tmpdir.name, 'geo_data.parquet'),\n        )\n\n        with TestPipeline() as p:\n            result = p | op\n            assert_that(result, is_not_empty())\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "weather_mv/loader_pipeline/ee.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport csv\nimport dataclasses\nimport json\nimport logging\nimport math\nimport os\nimport re\nimport shutil\nimport subprocess\nimport tempfile\nimport time\nimport typing as t\nfrom multiprocessing import Process, Queue\n\nimport apache_beam as beam\nimport ee\nimport numpy as np\nfrom apache_beam.io.filesystems import FileSystems\nfrom apache_beam.metrics import metric\nfrom apache_beam.io.gcp.gcsio import WRITE_CHUNK_SIZE\nfrom apache_beam.options.pipeline_options import PipelineOptions\nfrom apache_beam.utils import retry\nfrom google.auth import compute_engine, default, credentials\nfrom google.auth.transport import requests\nfrom google.auth.transport.requests import AuthorizedSession\nfrom rasterio.io import MemoryFile\n\nfrom .sinks import ToDataSink, open_dataset, open_local, KwargsFactoryMixin, copy\nfrom .util import make_attrs_ee_compatible, RateLimit, validate_region, get_utc_timestamp\nfrom .metrics import timeit, AddTimer, AddMetrics\n\nlogger = logging.getLogger(__name__)\n\nCOMPUTE_ENGINE_STR = 'Metadata-Flavor: Google'\n# For EE ingestion retry logic.\nINITIAL_DELAY = 1.0  # Initial delay in seconds.\nMAX_DELAY = 600  # Maximum delay before giving up in seconds.\nNUM_RETRIES = 10  # Number of tries with exponential backoff.\nTASK_QUEUE_WAIT_TIME = 120  # Task queue wait time in seconds.\nASSET_TYPE_TO_EXTENSION_MAPPING = {\n    
'IMAGE': '.tiff',\n    'TABLE': '.csv'\n}\nROWS_PER_WRITE = 10_000  # Number of rows per feature collection write.\n\n\ndef is_compute_engine() -> bool:\n    \"\"\"Determines if the application in running in Compute Engine Environment.\"\"\"\n    command = ['curl', 'metadata.google.internal', '-i']\n    result = subprocess.run(command, stdout=subprocess.PIPE)\n    result_output = result.stdout.decode('utf-8')\n    return COMPUTE_ENGINE_STR in result_output\n\n\ndef get_creds(use_personal_account: bool, service_account: str, private_key: str) -> credentials.Credentials:\n    \"\"\"Fetches credentials for authentication.\n\n    If the `use_personal_account` argument is true then it will authenticate with pop-up\n    browser window using personal account. Otherwise, if the application is running\n    in compute engine, it will use credentials of service account bound to the VM.\n    Otherwise, it will try to use user credentials.\n\n    Args:\n        use_personal_account: A flag to use personal account for ee authentication.\n        service_account: Service account address when using a private key for earth engine authentication.\n        private_key: A private key path to authenticate earth engine using private key.\n\n    Returns:\n        cred: Credentials object.\n    \"\"\"\n    # TODO(Issue #197): Test private key authentication.\n    if service_account and private_key:\n        try:\n            with open_local(private_key) as local_path:\n                creds = ee.ServiceAccountCredentials(service_account, local_path)\n        except Exception:\n            raise RuntimeError(f'Unable to open the private key {private_key}.')\n    elif use_personal_account:\n        ee.Authenticate()\n        creds, _ = default()\n    elif is_compute_engine():\n        creds = compute_engine.Credentials()\n    else:\n        creds, _ = default()\n\n    creds.refresh(requests.Request())\n    return creds\n\n\ndef ee_initialize(use_personal_account: bool = False,\n             
     enforce_high_volume: bool = False,\n                  service_account: t.Optional[str] = None,\n                  private_key: t.Optional[str] = None,\n                  project_id: t.Optional[str] = None) -> None:\n    \"\"\"Initializes earth engine with the high volume API when using a compute engine VM.\n\n    Args:\n        use_personal_account: A flag to use personal account for ee authentication. Default: False.\n        enforce_high_volume: A flag to use the high volume API when using a compute engine VM. Default: False.\n        service_account: Service account address when using a private key for earth engine authentication.\n        private_key: A private key path to authenticate earth engine using private key. Default: None.\n        Project ID: An identifier that represents the name of a project present in Earth Engine.\n    Raises:\n        RuntimeError: Earth Engine did not initialize.\n    \"\"\"\n    creds = get_creds(use_personal_account, service_account, private_key)\n    on_compute_engine = is_compute_engine()\n    # Using the high volume api.\n    if on_compute_engine:\n        if project_id is None and use_personal_account:\n            raise RuntimeError('Project_name should not be None!')\n        params = {'credentials': creds, 'opt_url': 'https://earthengine-highvolume.googleapis.com'}\n        if project_id:\n            params['project'] = project_id\n        ee.Initialize(**params)\n\n    # Only the compute engine service service account can access the high volume api.\n    elif enforce_high_volume and not on_compute_engine:\n        raise RuntimeError(\n            'Must run on a compute engine VM to use the high volume earth engine api.'\n        )\n    else:\n        ee.Initialize(creds)\n\n\nclass SetupEarthEngine(RateLimit):\n    \"\"\"A base class to setup the earth engine.\"\"\"\n\n    def __init__(self,\n                 ee_qps: int,\n                 ee_latency: float,\n                 ee_max_concurrent: int,\n             
    private_key: str,\n                 service_account: str,\n                 use_personal_account: bool,\n                 use_metrics: bool):\n        super().__init__(global_rate_limit_qps=ee_qps,\n                         latency_per_request=ee_latency,\n                         max_concurrent_requests=ee_max_concurrent,\n                         use_metrics=use_metrics)\n        self._has_setup = False\n        self.private_key = private_key\n        self.service_account = service_account\n        self.use_personal_account = use_personal_account\n        self.use_metrics = use_metrics\n\n    def setup(self, project_id):\n        \"\"\"Makes sure ee is set up on every worker.\"\"\"\n        ee_initialize(use_personal_account=self.use_personal_account,\n                      service_account=self.service_account,\n                      private_key=self.private_key,\n                      project_id=project_id)\n        self._has_setup = True\n\n    def check_setup(self, project_id: t.Optional[str] = None):\n        \"\"\"Ensures that setup has been called.\"\"\"\n        if not self._has_setup:\n            try:\n                # This throws an exception if ee is not initialized.\n                ee.data.getAlgorithms()\n                self._has_setup = True\n            except ee.EEException:\n                self.setup(project_id)\n\n    def process(self, *args, **kwargs):\n        \"\"\"Checks that setup has been called then call the process implementation.\"\"\"\n        self.check_setup()\n\n\ndef get_ee_safe_name(uri: str) -> str:\n    \"\"\"Extracts file name and converts it into an EE-safe name\"\"\"\n    basename = os.path.basename(uri)\n    # Strip the extension from the basename.\n    basename, _ = os.path.splitext(basename)\n    # An asset ID can only contain letters, numbers, hyphens, and underscores.\n    # Converting everything else to underscore.\n    asset_name = re.sub(r'[^a-zA-Z0-9-_]+', r'_', basename)\n    return 
asset_name\n\n\n@dataclasses.dataclass\nclass AssetData:\n    \"\"\"A class for holding the asset data.\n\n    Attributes:\n        name: The EE-safe name of the asset.\n        target_path: The location of the asset in GCS.\n        channel_names: A list of channel names in the asset.\n        start_time: Image start time in floating point seconds since epoch.\n        end_time: Image end time in floating point seconds since epoch.\n        properties: A dictionary of asset metadata.\n    \"\"\"\n    name: str\n    target_path: str\n    channel_names: t.List[str]\n    start_time: float\n    end_time: float\n    properties: t.Dict[str, t.Union[str, float, int]]\n\n\n@dataclasses.dataclass\nclass ToEarthEngine(ToDataSink):\n    \"\"\"Loads weather data into Google Earth Engine.\n\n    A sink that loads dataset (either normalized or read using user-provided kwargs).\n    This sink will read each channel data and merge them into a single dataset if the\n    `disable_grib_schema_normalization` flag is not specified. It will read the\n    dataset and create an asset. Next, it will write the asset to the specified\n    bucket path and initiate the earth engine upload request.\n\n    When using the default service account bound to the VM, it is required to register the\n    service account with EE from `here`_. See `this doc`_ for more detail.\n\n    Attributes:\n        asset_location: The bucket location at which asset files will be pushed.\n        ee_asset: The asset folder path in earth engine project where the asset files will be pushed.\n        ee_asset_type: The type of asset to ingest in the earth engine. 
Default: IMAGE.\n        xarray_open_dataset_kwargs: A dictionary of kwargs to pass to xr.open_dataset().\n        disable_grib_schema_normalization: A flag to turn grib schema normalization off; Default: on.\n        skip_region_validation: Turn off validation that checks if all Cloud resources\n          are in the same region.\n        use_personal_account: A flag to authenticate earth engine using personal account. Default: False.\n\n    .. _here: https://signup.earthengine.google.com/#!/service_accounts\n    .. _this doc: https://developers.google.com/earth-engine/guides/service_account\n    \"\"\"\n    asset_location: str\n    ee_asset: str\n    ee_asset_type: str\n    xarray_open_dataset_kwargs: t.Dict\n    disable_grib_schema_normalization: bool\n    skip_region_validation: bool\n    use_personal_account: bool\n    force: bool\n    service_account: str\n    private_key: str\n    ee_qps: int\n    ee_latency: float\n    ee_max_concurrent: int\n    group_common_hypercubes: bool\n    band_names_mapping: str\n    initialization_time_regex: str\n    forecast_time_regex: str\n    ingest_as_virtual_asset: bool\n    use_deflate: bool\n    use_metrics: bool\n    use_monitoring_metrics: bool\n    topic: str\n    # Pipeline arguments.\n    job_name: str\n    project: str\n    region: str\n\n    @classmethod\n    def add_parser_arguments(cls, subparser: argparse.ArgumentParser):\n        subparser.add_argument('--asset_location', type=str, required=True, default=None,\n                               help='The GCS location where the asset files will be pushed.')\n        subparser.add_argument('--ee_asset', type=str, required=True, default=None,\n                               help='The asset folder path in earth engine project where the asset files'\n                               ' will be pushed.')\n        subparser.add_argument('--ee_asset_type', type=str, choices=['IMAGE', 'TABLE'], default='IMAGE',\n                               help='The type of asset to ingest 
in the earth engine.')\n        subparser.add_argument('--xarray_open_dataset_kwargs', type=json.loads, default='{}',\n                               help='Keyword-args to pass into `xarray.open_dataset()` in the form of a JSON string.')\n        subparser.add_argument('--disable_grib_schema_normalization', action='store_true', default=False,\n                               help='To disable merge of grib datasets. Default: False')\n        subparser.add_argument('-s', '--skip-region-validation', action='store_true', default=False,\n                               help='Skip validation of regions for data migration. Default: off')\n        subparser.add_argument('-u', '--use_personal_account', action='store_true', default=False,\n                               help='To use personal account for earth engine authentication.')\n        subparser.add_argument('-f', '--force', action='store_true', default=False,\n                               help='A flag that allows overwriting of existing asset files in the GCS bucket.'\n                                    ' Default: off, which means that the ingestion of URIs for which assets files'\n                                    ' (GeoTiff/CSV) already exist in the GCS bucket will be skipped.')\n        subparser.add_argument('--service_account', type=str, default=None,\n                               help='Service account address when using a private key for earth engine authentication.')\n        subparser.add_argument('--private_key', type=str, default=None,\n                               help='To use a private key for earth engine authentication.')\n        subparser.add_argument('--ee_qps', type=int, default=10,\n                               help='Maximum queries per second allowed by EE for your project. Default: 10')\n        subparser.add_argument('--ee_latency', type=float, default=0.5,\n                               help='The expected latency per requests, in seconds. 
Default: 0.5')\n        subparser.add_argument('--ee_max_concurrent', type=int, default=10,\n                               help='Maximum concurrent api requests to EE allowed for your project. Default: 10')\n        subparser.add_argument('--group_common_hypercubes', action='store_true', default=False,\n                               help='To group common hypercubes into image collections when loading grib data.')\n        subparser.add_argument('--band_names_mapping', type=str, default=None,\n                               help='A JSON file which contains the band names for the TIFF file.')\n        subparser.add_argument('--initialization_time_regex', type=str, default=None,\n                               help='A Regex string to get the initialization time from the filename.')\n        subparser.add_argument('--forecast_time_regex', type=str, default=None,\n                               help='A Regex string to get the forecast/end time from the filename.')\n        subparser.add_argument('--ingest_as_virtual_asset', action='store_true', default=False,\n                               help='To ingest image as a virtual asset. Default: False')\n        subparser.add_argument('--use_deflate', action='store_true', default=False,\n                               help='To use deflate compression algorithm. Default: False')\n        subparser.add_argument('--use_metrics', action='store_true', default=False,\n                               help='If you want to add Beam metrics to your pipeline. Default: False')\n        subparser.add_argument('--use_monitoring_metrics', action='store_true', default=False,\n                               help='If you want to add GCP Monitoring metrics to your pipeline. 
Default: False')\n\n    @classmethod\n    def validate_arguments(cls, known_args: argparse.Namespace, pipeline_args: t.List[str]) -> None:\n        pipeline_options = PipelineOptions(pipeline_args)\n        pipeline_options_dict = pipeline_options.get_all_options()\n\n        if known_args.zarr:\n            raise RuntimeError('Reading Zarr is not (yet) supported.')\n\n        # Check that ee_asset is in correct format.\n        if not re.match(\"^projects/.+/assets.*\", known_args.ee_asset):\n            raise RuntimeError(\"'--ee_asset' is required to be in format: projects/+/assets/*.\")\n\n        # Check that both service_account and private_key are provided, or none is.\n        if bool(known_args.service_account) ^ bool(known_args.private_key):\n            raise RuntimeError(\"'--service_account' and '--private_key' both are required.\")\n\n        # Check that either personal or service account is asked to use.\n        if known_args.use_personal_account and known_args.service_account:\n            raise RuntimeError(\"Both personal and service account cannot be used at once.\")\n\n        if known_args.ee_qps and known_args.ee_qps < 1:\n            raise RuntimeError(\"Queries per second should not be less than 1.\")\n\n        if known_args.ee_latency and known_args.ee_latency < 0.001:\n            raise RuntimeError(\"Latency per request should not be less than 0.001.\")\n\n        if known_args.ee_max_concurrent and known_args.ee_max_concurrent < 1:\n            raise RuntimeError(\"Maximum concurrent requests should not be less than 1.\")\n\n        # Check that when ingesting as a virtual asset, asset type is image.\n        if known_args.ingest_as_virtual_asset and known_args.ee_asset_type != \"IMAGE\":\n            raise RuntimeError(\"Only assets with IMAGE type can be ingested as a virtual asset.\")\n\n        # Check that Cloud resource regions are consistent.\n        if not (known_args.dry_run or known_args.skip_region_validation):\n           
 # Program execution will terminate on failure of region validation.\n            logger.info('Validating regions for data migration. This might take a few seconds...')\n            validate_region(temp_location=pipeline_options_dict.get('temp_location'),\n                            region=pipeline_options_dict.get('region'))\n            logger.info('Region validation completed successfully.')\n\n        # Check for the band_names_mapping json file.\n        if known_args.band_names_mapping:\n            if not os.path.exists(known_args.band_names_mapping):\n                raise RuntimeError(\"--band_names_mapping file does not exist.\")\n            _, band_names_mapping_extension = os.path.splitext(known_args.band_names_mapping)\n            if not band_names_mapping_extension == '.json':\n                raise RuntimeError(\"--band_names_mapping should contain a json file as input.\")\n\n        # Check the initialization_time_regex and forecast_time_regex strings.\n        if bool(known_args.initialization_time_regex) ^ bool(known_args.forecast_time_regex):\n            raise RuntimeError(\"Both --initialization_time_regex & --forecast_time_regex flags need to be present\")\n\n        logger.info(f\"Add metrics to pipeline: {known_args.use_metrics}\")\n        logger.info(f\"Add Google Cloud Monitoring metrics to pipeline: {known_args.use_monitoring_metrics}\")\n\n    def expand(self, paths):\n        \"\"\"Converts input data files into assets and uploads them into the earth engine.\"\"\"\n        band_names_dict = {}\n        if self.band_names_mapping:\n            with open(self.band_names_mapping, 'r', encoding='utf-8') as f:\n                band_names_dict = json.load(f)\n\n        if self.use_metrics:\n            paths = paths | 'AddTimer' >> beam.ParDo(AddTimer.from_kwargs(**vars(self)))\n\n        if not self.dry_run:\n            output = (\n                paths\n                | 'FilterFiles' >> FilterFilesTransform.from_kwargs(**vars(self))\n 
               | 'ReshuffleFiles' >> beam.Reshuffle()\n                | 'ConvertToAsset' >> beam.ParDo(\n                    ConvertToAsset.from_kwargs(band_names_dict=band_names_dict, **vars(self))\n                    )\n                | 'IngestIntoEE' >> IngestIntoEETransform.from_kwargs(**vars(self))\n            )\n\n            if self.use_metrics:\n                output | 'AddMetrics' >> AddMetrics.from_kwargs(**vars(self))\n        else:\n            (\n                paths\n                | 'Log Files' >> beam.Map(logger.info)\n            )\n\n\nclass FilterFilesTransform(SetupEarthEngine, KwargsFactoryMixin):\n    \"\"\"Filters out paths for which the assets that are already in the earth engine.\n\n    Attributes:\n        ee_asset: The asset folder path in earth engine project where the asset files will be pushed.\n        ee_qps: Maximum queries per second allowed by EE for your project.\n        ee_latency: The expected latency per requests, in seconds.\n        ee_max_concurrent: Maximum concurrent api requests to EE allowed for your project.\n        force: A flag that allows overwriting of existing asset files in the GCS bucket.\n        private_key: A private key path to authenticate earth engine using private key. Default: None.\n        service_account: Service account address when using a private key for earth engine authentication.\n        use_personal_account: A flag to authenticate earth engine using personal account. 
Default: False.\n    \"\"\"\n\n    def __init__(self,\n                 asset_location: str,\n                 ee_asset: str,\n                 ee_asset_type: str,\n                 ee_qps: int,\n                 ee_latency: float,\n                 ee_max_concurrent: int,\n                 force: bool,\n                 private_key: str,\n                 service_account: str,\n                 use_personal_account: bool,\n                 use_metrics: bool):\n        \"\"\"Sets up rate limit and initializes the earth engine.\"\"\"\n        super().__init__(ee_qps=ee_qps,\n                         ee_latency=ee_latency,\n                         ee_max_concurrent=ee_max_concurrent,\n                         private_key=private_key,\n                         service_account=service_account,\n                         use_personal_account=use_personal_account,\n                         use_metrics=use_metrics)\n        self.asset_location = asset_location\n        self.ee_asset = ee_asset\n        self.ee_asset_type = ee_asset_type\n        self.force_overwrite = force\n        self.use_metrics = use_metrics\n\n    @timeit('FilterFileTransform')\n    def process(self, uri: str) -> t.Iterator[str]:\n        \"\"\"Yields uri if the asset does not already exist.\"\"\"\n        project_id = self.ee_asset.split('/')[1]\n        self.check_setup(project_id)\n        asset_name = get_ee_safe_name(uri)\n\n        # Checks if the asset is already present in the GCS bucket or not.\n        target_path = os.path.join(\n            self.asset_location, f'{asset_name}{ASSET_TYPE_TO_EXTENSION_MAPPING[self.ee_asset_type]}')\n        if not self.force_overwrite and FileSystems.exists(target_path):\n            logger.info(f'Asset file {target_path} already exists in GCS bucket. 
Skipping...')\n            return\n\n        asset_id = os.path.join(self.ee_asset, asset_name)\n        try:\n            ee.data.getAsset(asset_id)\n            logger.info(f'Asset {asset_id} already exists in EE. Skipping...')\n        except ee.EEException:\n            yield uri\n\n\n@dataclasses.dataclass\nclass ConvertToAsset(beam.DoFn, KwargsFactoryMixin):\n    \"\"\"Writes asset after extracting input data and uploads it to GCS.\n\n    Attributes:\n        ee_asset_type: The type of asset to ingest in the earth engine. Default: IMAGE.\n        asset_location: The bucket location at which asset files will be pushed.\n        xarray_open_dataset_kwargs: A dictionary of kwargs to pass to xr.open_dataset().\n        disable_grib_schema_normalization: A flag to turn grib schema normalization off; Default: on.\n    \"\"\"\n\n    asset_location: str\n    ee_asset_type: str = 'IMAGE'\n    xarray_open_dataset_kwargs: t.Optional[t.Dict] = None\n    disable_grib_schema_normalization: bool = False\n    group_common_hypercubes: t.Optional[bool] = False\n    band_names_dict: t.Optional[t.Dict] = None\n    initialization_time_regex: t.Optional[str] = None\n    forecast_time_regex: t.Optional[str] = None\n    use_deflate: t.Optional[bool] = False\n    use_metrics: t.Optional[bool] = False\n\n    def add_to_queue(self, queue: Queue, item: t.Any):\n        \"\"\"Adds a new item to the queue.\n\n        It will wait until the queue has a room to add a new item.\n        \"\"\"\n        while queue.full():\n            pass\n        queue.put_nowait(item)\n\n    def convert_to_asset(self, queue: Queue, uri: str):\n        \"\"\"Converts source data into EE asset (GeoTiff or CSV) and uploads it to the bucket.\"\"\"\n        child_logger = logging.getLogger(__name__)\n        child_logger.info(f'Converting {uri!r} to COGs...')\n\n        job_start_time = get_utc_timestamp()\n\n        with open_dataset(uri,\n                          self.xarray_open_dataset_kwargs,\n           
               self.disable_grib_schema_normalization,\n                          initialization_time_regex=self.initialization_time_regex,\n                          forecast_time_regex=self.forecast_time_regex,\n                          group_common_hypercubes=self.group_common_hypercubes) as ds_list:\n            if not isinstance(ds_list, list):\n                ds_list = [ds_list]\n\n            for ds in ds_list:\n                attrs = ds.attrs\n                data = list(ds.values())\n                asset_name = get_ee_safe_name(uri)\n                channel_names = [\n                    self.band_names_dict.get(da.name, da.name) if self.band_names_dict\n                    else da.name for da in data\n                ]\n\n                dtype, crs, transform = (attrs.pop(key) for key in ['dtype', 'crs', 'transform'])\n                # Adding job_start_time to properites.\n                attrs[\"job_start_time\"] = job_start_time\n                # Make attrs EE ingestable.\n                attrs = make_attrs_ee_compatible(attrs)\n                start_time, end_time = (attrs.get(key) for key in ('start_time', 'end_time'))\n\n                if self.group_common_hypercubes:\n                    level, height = (attrs.pop(key) for key in ['level', 'height'])\n                    safe_level_name = get_ee_safe_name(level)\n                    asset_name = f'{asset_name}_{safe_level_name}'\n\n                compression = 'lzw'\n                predictor = 'NO'\n                if self.use_deflate:\n                    compression = 'deflate'\n                    # Depending on dtype select predictor value.\n                    # Predictor is a method of storing only the difference from the\n                    # previous value instead of the actual value.\n                    predictor = 2 if np.issubdtype(dtype, np.integer) else 3\n\n                # For tiff ingestions.\n                if self.ee_asset_type == 'IMAGE':\n                    
file_name = f'{asset_name}.tiff'\n\n                    with MemoryFile() as memfile:\n                        with memfile.open(driver='COG',\n                                          dtype=dtype,\n                                          width=data[0].data.shape[1],\n                                          height=data[0].data.shape[0],\n                                          count=len(data),\n                                          nodata=np.nan,\n                                          crs=crs,\n                                          transform=transform,\n                                          compress=compression,\n                                          predictor=predictor) as f:\n                            for i, da in enumerate(data):\n                                f.write(da, i+1)\n                                # Making the channel name EE-safe before adding it as a band name.\n                                f.set_band_description(i+1, get_ee_safe_name(channel_names[i]))\n                                f.update_tags(i+1, band_name=channel_names[i])\n                                f.update_tags(i+1, **da.attrs)\n\n                            # Write attributes as tags in tiff.\n                            f.update_tags(**attrs)\n\n                        # Copy in-memory tiff to gcs.\n                        target_path = os.path.join(self.asset_location, file_name)\n                        with FileSystems().create(target_path) as dst:\n                            shutil.copyfileobj(memfile, dst, WRITE_CHUNK_SIZE)\n                            child_logger.info(f\"Uploaded {uri!r}'s COG to {target_path}\")\n\n                # For feature collection ingestions.\n                elif self.ee_asset_type == 'TABLE':\n                    channel_names = []\n                    file_name = f'{asset_name}.csv'\n\n                    shape = math.prod(list(ds.dims.values()))\n                    # Names of dimensions, coordinates and data 
variables.\n                    dims = list(ds.dims)\n                    coords = [c for c in list(ds.coords) if c not in dims]\n                    vars = list(ds.data_vars)\n                    header = dims + coords + vars\n\n                    # Data of dimensions, coordinates and data variables.\n                    dims_data = [ds[dim].data for dim in dims]\n                    coords_data = [np.full((shape,), ds[coord].data) for coord in coords]\n                    vars_data = [ds[var].data.flatten() for var in vars]\n                    data = coords_data + vars_data\n\n                    dims_shape = [len(ds[dim].data) for dim in dims]\n\n                    def get_dims_data(index: int) -> t.List[t.Any]:\n                        \"\"\"Returns dimensions for the given flattened index.\"\"\"\n                        return [\n                            dim[int(index/math.prod(dims_shape[i+1:])) % len(dim)] for (i, dim) in enumerate(dims_data)\n                        ]\n\n                    # Copy CSV to gcs.\n                    target_path = os.path.join(self.asset_location, file_name)\n                    with tempfile.NamedTemporaryFile() as temp:\n                        with open(temp.name, 'w', newline='') as f:\n                            writer = csv.writer(f)\n                            writer.writerows([header])\n                            # Write rows in batches.\n                            for i in range(0, shape, ROWS_PER_WRITE):\n                                writer.writerows(\n                                    [get_dims_data(i) + list(row) for row in zip(\n                                        *[d[i:i + ROWS_PER_WRITE] for d in data]\n                                    )]\n                                )\n\n                        copy(temp.name, target_path)\n\n                asset_data = AssetData(\n                    name=asset_name,\n                    target_path=target_path,\n                    
channel_names=channel_names,\n                    start_time=start_time,\n                    end_time=end_time,\n                    properties=attrs\n                )\n\n                self.add_to_queue(queue, asset_data)\n            self.add_to_queue(queue, None)  # Indicates end of the subprocess.\n\n    @timeit('ConvertToAsset')\n    def process(self, uri: str) -> t.Iterator[AssetData]:\n        \"\"\"Opens grib files and yields AssetData.\n\n        We observed that the convert-to-cog process increases memory usage over time because xarray (v2022.11.0) is not\n        releasing memory as expected while opening any dataset. So we will perform the convert-to-asset process in an\n        isolated process so that the memory consumed while processing will be cleared after the process is killed.\n\n        The process puts the asset data into the queue which the main process will consume. Queue buffer size is limited\n        so the process will be able to put another item in a queue only after the main process has consumed the queue\n        item, that way it makes sure that no queue item is dropped due to queue buffer size.\n        \"\"\"\n        queue = Queue(maxsize=1)\n        process = Process(target=self.convert_to_asset, args=(queue, uri))\n        process.start()\n\n        while True:\n            if not queue.empty():\n                asset_data = queue.get_nowait()\n\n                # Not needed now but keeping this check for backwards compatibility.\n                if asset_data is None:\n                    break\n                yield asset_data\n\n            # When the convert-to-asset process terminates unexpectedly...\n            if not process.is_alive():\n                logger.warning(f'Failed to convert {uri!r} to asset!')\n                break\n\n        process.terminate()\n\n\nclass IngestIntoEETransform(SetupEarthEngine, KwargsFactoryMixin):\n    \"\"\"Ingests asset into earth engine and yields asset id.\n\n    Attributes:\n      
  ee_asset: The asset folder path in earth engine project where the asset files will be pushed.\n        ee_asset_type: The type of asset to ingest in the earth engine. Default: IMAGE.\n        ee_qps: Maximum queries per second allowed by EE for your project.\n        ee_latency: The expected latency per requests, in seconds.\n        ee_max_concurrent: Maximum concurrent api requests to EE allowed for your project.\n        private_key: A private key path to authenticate earth engine using private key. Default: None.\n        service_account: Service account address when using a private key for earth engine authentication.\n        use_personal_account: A flag to authenticate earth engine using personal account. Default: False.\n    \"\"\"\n\n    def __init__(self,\n                 ee_asset: str,\n                 ee_asset_type: str,\n                 ee_qps: int,\n                 ee_latency: float,\n                 ee_max_concurrent: int,\n                 private_key: str,\n                 service_account: str,\n                 use_personal_account: bool,\n                 ingest_as_virtual_asset: bool,\n                 use_metrics: bool):\n        \"\"\"Sets up rate limit.\"\"\"\n        super().__init__(ee_qps=ee_qps,\n                         ee_latency=ee_latency,\n                         ee_max_concurrent=ee_max_concurrent,\n                         private_key=private_key,\n                         service_account=service_account,\n                         use_personal_account=use_personal_account,\n                         use_metrics=use_metrics)\n        self.ee_asset = ee_asset\n        self.ee_asset_type = ee_asset_type\n        self.ingest_as_virtual_asset = ingest_as_virtual_asset\n        self.use_metrics = use_metrics\n\n    def get_project_id(self) -> str:\n        return self.ee_asset.split('/')[1]\n\n    def ee_tasks_remaining(self) -> int:\n        \"\"\"Returns the remaining number of tasks in the tassk queue of earth engine.\"\"\"\n  
      return len([task for task in ee.data.getTaskList()\n                    if task['state'] in ['UNSUBMITTED', 'READY', 'RUNNING']])\n\n    def wait_for_task_queue(self) -> None:\n        \"\"\"Waits until the task queue has space.\n\n        Ingestion of table in the earth engine creates a task and every project has a limited task queue size. This\n        function checks the task queue size and waits until the task queue has some space.\n        \"\"\"\n        while self.ee_tasks_remaining() >= self._num_shards:\n            time.sleep(TASK_QUEUE_WAIT_TIME)\n\n    @retry.with_exponential_backoff(\n        num_retries=NUM_RETRIES,\n        logger=logger.warning,\n        initial_delay_secs=INITIAL_DELAY,\n        max_delay_secs=MAX_DELAY\n    )\n    def start_ingestion(self, asset_data: AssetData) -> t.Optional[str]:\n        \"\"\"Creates COG-backed asset in earth engine. Returns the asset id.\"\"\"\n        project_id = self.get_project_id()\n        self.check_setup(project_id)\n        asset_name = os.path.join(self.ee_asset, asset_data.name)\n        asset_data.properties['ingestion_time'] = get_utc_timestamp()\n\n        try:\n            logger.info(f\"Uploading asset {asset_data.target_path} to Asset ID '{asset_name}'.\")\n\n            if self.ee_asset_type == 'IMAGE':  # Ingest an image.\n\n                creds = get_creds(self.use_personal_account, self.service_account, self.private_key)\n                session = AuthorizedSession(creds)\n\n                image_manifest = {\n                    'name': asset_name,\n                    'tilesets': [\n                        {\n                            'id': '0',\n                            'sources': [{'uris': [asset_data.target_path]}]\n                        }\n                    ],\n                    'startTime': asset_data.start_time,\n                    'endTime': asset_data.end_time,\n                    'properties': asset_data.properties\n                }\n\n                
headers = {\n                    'Content-Type': 'application/json',\n                    'x-goog-user-project': project_id,\n                }\n\n                data = json.dumps({'imageManifest': image_manifest, 'overwrite': True})\n\n                if self.ingest_as_virtual_asset:  # as a virtual image.\n                    # Makes an api call to register the virtual asset.\n                    url = (\n                        f'https://earthengine-highvolume.googleapis.com/v1/projects/{project_id}/'\n                        f'image:import?overwrite=true&mode=VIRTUAL'\n                    )\n                else:  # as a COG based image.\n                    url = (\n                        f'https://earthengine-highvolume.googleapis.com/v1alpha/projects/{project_id}/'\n                        f'image:importExternal'\n                    )\n                # Send API request\n                response = session.post(url=url, data=data, headers=headers)\n                logger.info(f\"EE Asset ingestion response for {asset_name}: {response.text}\")\n\n                if response.status_code != 200:\n                    logger.info(f\"Failed to ingest asset '{asset_name}' in Earth Engine: {response.text}\")\n                    raise ee.EEException(response.text)\n                if self.ingest_as_virtual_asset:\n                    response_json = response.json()\n                    ingestion_state = (\n                        response_json\n                        .get(\"metadata\", {})\n                        .get(\"state\", \"STATE_UNSPECIFIED\")\n                        .upper()\n                    )\n                    if ingestion_state != \"SUCCEEDED\":\n                        raise ee.EEException(response.text)\n                return asset_name\n            elif self.ee_asset_type == 'TABLE':  # ingest a feature collection.\n                self.wait_for_task_queue()\n                task_id = ee.data.newTaskId(1)[0]\n                
ee.data.startTableIngestion(task_id, {\n                    'name': asset_name,\n                    'sources': [{\n                        'uris': [asset_data.target_path]\n                    }],\n                    'startTime': asset_data.start_time,\n                    'endTime': asset_data.end_time,\n                    'properties': asset_data.properties\n                })\n                return asset_name\n        except ee.EEException as e:\n            if \"Could not parse a valid CRS from the first overview of the GeoTIFF\" in repr(e):\n                logger.error(f\"Failed to create asset '{asset_name}' in earth engine: {e}. Moving on...\")\n                return \"\"\n\n            if (\n                \"whose type does not match the type of the same property of existing \"\n                \"assets in the same collection\"\n                in repr(e)\n            ):\n                logger.error(f\"Failed to ingest asset '{asset_name}' due to property mismatch: {e} Moving on...\")\n                return \"\"\n\n            if \"The metadata of the TIFF could not be read in the first 10000000 bytes.\" in repr(e):\n                logger.error(f\"Faild to ingest asset '{asset_name}', check the tiff file: {e} Moving on...\")\n                return \"\"\n\n            logger.error(f\"Failed to create asset '{asset_name}' in earth engine: {e}\")\n            # We do have logic for skipping the already created assets in FilterFilesTransform but\n            # somehow we are observing that streaming pipeline reports \"Cannot overwrite ...\" error\n            # so this will act as a quick fix for this issue.\n            if f\"Cannot overwrite asset '{asset_name}'\" in repr(e):\n                ee.data.deleteAsset(asset_name)\n            raise\n\n    @timeit('IngestIntoEE')\n    def process(self, asset_data: AssetData) -> t.Iterator[t.Tuple[str, float]]:\n        \"\"\"Uploads an asset into the earth engine.\"\"\"\n        asset_name = 
self.start_ingestion(asset_data)\n        if asset_name:\n            metric.Metrics.counter('Success', 'IngestIntoEE').inc()\n            asset_start_time = asset_data.start_time\n            yield asset_name, asset_start_time\n"
  },
  {
    "path": "weather_mv/loader_pipeline/ee_test.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport logging\nimport os\nimport tempfile\nimport unittest\n\nfrom .ee import (\n    get_ee_safe_name,\n    ConvertToAsset\n)\nfrom .sinks_test import TestDataBase\n\nlogger = logging.getLogger(__name__)\n\n\nclass AssetNameCreationTests(unittest.TestCase):\n\n    def test_asset_name_creation(self):\n        uri = 'weather_mv/test_data/grib_multiple_edition_single_timestep.bz2'\n        expected = 'grib_multiple_edition_single_timestep'\n\n        actual = get_ee_safe_name(uri)\n\n        self.assertEqual(actual, expected)\n\n    def test_asset_name_creation__with_special_chars(self):\n        uri = 'weather_mv/test_data/grib@2nd-edition&timestep#1.bz2'\n        expected = 'grib_2nd-edition_timestep_1'\n\n        actual = get_ee_safe_name(uri)\n\n        self.assertEqual(actual, expected)\n\n    def test_asset_name_creation__with_missing_filename(self):\n        uri = 'weather_mv/test_data/'\n        expected = ''\n\n        actual = get_ee_safe_name(uri)\n\n        self.assertEqual(actual, expected)\n\n    def test_asset_name_creation__with_only_filename(self):\n        uri = 'grib@2nd-edition&timestep#1.bz2'\n        expected = 'grib_2nd-edition_timestep_1'\n\n        actual = get_ee_safe_name(uri)\n\n        self.assertEqual(actual, expected)\n\n\nclass ConvertToAssetTests(TestDataBase):\n\n    def setUp(self) -> None:\n        super().setUp()\n        self.tmpdir 
= tempfile.TemporaryDirectory()\n        self.convert_to_image_asset = ConvertToAsset(asset_location=self.tmpdir.name)\n        self.convert_to_table_asset = ConvertToAsset(asset_location=self.tmpdir.name, ee_asset_type='TABLE')\n\n    def tearDown(self):\n        self.tmpdir.cleanup()\n\n    def test_convert_to_image_asset(self):\n        data_path = f'{self.test_data_folder}/test_data_grib_single_timestep'\n        asset_path = os.path.join(self.tmpdir.name, 'test_data_grib_single_timestep.tiff')\n\n        next(self.convert_to_image_asset.process(data_path))\n\n        # The size of tiff is expected to be more than grib.\n        self.assertTrue(os.path.getsize(asset_path) > os.path.getsize(data_path))\n\n    def test_convert_to_image_asset__with_multiple_grib_edition(self):\n        data_path = f'{self.test_data_folder}/test_data_grib_multiple_edition_single_timestep.bz2'\n        asset_path = os.path.join(self.tmpdir.name, 'test_data_grib_multiple_edition_single_timestep.tiff')\n\n        next(self.convert_to_image_asset.process(data_path))\n\n        # The size of tiff is expected to be more than grib.\n        self.assertTrue(os.path.getsize(asset_path) > os.path.getsize(data_path))\n\n    def test_convert_to_table_asset(self):\n        data_path = f'{self.test_data_folder}/test_data_grib_single_timestep'\n        asset_path = os.path.join(self.tmpdir.name, 'test_data_grib_single_timestep.csv')\n\n        next(self.convert_to_table_asset.process(data_path))\n\n        # The size of csv is expected to be more than grib.\n        self.assertTrue(os.path.getsize(asset_path) > os.path.getsize(data_path))\n\n    def test_convert_to_table_asset__with_multiple_grib_edition(self):\n        data_path = f'{self.test_data_folder}/test_data_grib_multiple_edition_single_timestep.bz2'\n        asset_path = os.path.join(self.tmpdir.name, 'test_data_grib_multiple_edition_single_timestep.csv')\n\n        next(self.convert_to_table_asset.process(data_path))\n\n        # The 
size of csv is expected to be more than grib.\n        self.assertTrue(os.path.getsize(asset_path) > os.path.getsize(data_path))\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "weather_mv/loader_pipeline/execution_test.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport unittest\nimport weather_mv\n\nfrom .pipeline import run, pipeline\n\n\nclass ExecutionTest(unittest.TestCase):\n\n    TEST_DATA_FOLDER = f'{next(iter(weather_mv.__path__))}/test_data'\n    DEFAULT_CMD = 'weather-mv bq --output_table myproject.mydataset.mytable --temp_location \"gs://mybucket/tmp\" ' \\\n                  '--geo_data_parquet_path ./geo_data.parquet --dry-run '\n    LOCAL_DATA_SOURCE = f'{TEST_DATA_FOLDER}/test_data_single_point.nc'\n    TEST_CASES = [\n        ('local data source and local execution', f'{DEFAULT_CMD} --uris {LOCAL_DATA_SOURCE} --direct_num_workers 2'),\n    ]\n\n    def test_run(self):\n        for msg, args in self.TEST_CASES:\n            with self.subTest(msg):\n                pipeline(*run(args.split()))\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "weather_mv/loader_pipeline/metrics.py",
    "content": "# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\"\"\"Utilities for adding metrics to beam pipeline.\"\"\"\n\nimport copy\nimport dataclasses\nimport datetime\nimport inspect\nimport json\nimport logging\nimport time\nimport typing as t\nfrom collections import OrderedDict\n\nimport apache_beam as beam\nfrom apache_beam.metrics import metric\nfrom apache_beam.transforms import window, trigger\nfrom functools import wraps\nfrom google.cloud import monitoring_v3\n\nfrom .sinks import get_file_time, KwargsFactoryMixin\n\nlogger = logging.getLogger(__name__)\n\n# For Metrics API retry logic.\nINITIAL_DELAY = 1.0  # Initial delay in seconds.\nMAX_DELAY = 600  # Maximum delay before giving up in seconds.\nNUM_RETRIES = 10  # Number of tries with exponential backoff.\nTASK_QUEUE_WAIT_TIME = 120  # Task queue wait time in seconds.\n\n\ndef timeit(func_name: str, keyed_fn: bool = False):\n    \"\"\"Decorator to add time it takes for an element to be processed by a stage.\n\n    Args:\n        func_name: A unique name of the stage.\n        keyed_fn (optional): This has to be passed true if the input is adding keys to the element.\n\n        For example a stage like\n\n        class Shard(beam.DoFn):\n            @timeit('Sharding', keyed_fn=True)\n            def process(self,element):\n                key = randrange(10)\n                yield key, element\n\n        We are passing `keyed_fn=True` as we are adding a key 
to our element. Usually keys are added\n        to later group the element by a `GroupBy` stage.\n    \"\"\"\n\n    def decorator(func):\n        @wraps(func)\n        def wrapper(self, *args, **kwargs):\n            # If metrics are turned off, don't do anything.\n            if not hasattr(self, \"use_metrics\") or (\n                hasattr(self, \"use_metrics\") and not self.use_metrics\n            ):\n                for result in func(self, *args, **kwargs):\n                    yield result\n\n                return\n\n            # Only the first timer wrapper will have no time_dict.\n            # All subsequent wrappers can extract out the dict.\n            # args 0 would be a tuple.\n            if len(args[0]) == 1:\n                raise ValueError(\"time_dict not found.\")\n\n            element, time_dict = args[0]\n            args = (element,) + args[1:]\n\n            if not isinstance(time_dict, OrderedDict):\n                raise ValueError(\"time_dict not found.\")\n\n            # If the function is a generator, yield the output\n            # otherwise raise an error.\n            if inspect.isgeneratorfunction(func):\n                for result in func(self, *args, **kwargs):\n                    new_time_dict = copy.deepcopy(time_dict)\n                    if func_name in new_time_dict:\n                        del new_time_dict[func_name]\n                    new_time_dict[func_name] = time.time()\n                    if keyed_fn:\n                        (key, element) = result\n                        yield key, (element, new_time_dict)\n                    else:\n                        yield result, new_time_dict\n            else:\n                raise ValueError(\"Function is not a generator.\")\n\n        return wrapper\n\n    return decorator\n\n\n@dataclasses.dataclass\nclass AddTimer(beam.DoFn, KwargsFactoryMixin):\n    \"\"\"DoFn to add a time_dict with uri, file time in GCS bucket, when it was\n    picked up in PCollection. 
This dict will contain each stage_names as keys\n    and the timestamp when it finished that step's execution.\"\"\"\n\n    topic: t.Optional[str] = None\n\n    def process(self, element) -> t.Iterator[t.Any]:\n        time_dict = OrderedDict(\n            [\n                (\"uri\", element),\n                (\"bucket\", get_file_time(element) if self.topic else time.time()),\n                (\"pickup\", time.time()),\n            ]\n        )\n        yield element, time_dict\n\n\nclass AddBeamMetrics(beam.DoFn):\n    \"\"\"DoFn to add Element Processing Time metric to beam. Expects PCollection\n    to contain a time_dict.\"\"\"\n\n    def __init__(self, asset_start_time_format: str):\n        super().__init__()\n        self.element_processing_time = metric.Metrics.distribution(\n            \"Time\", \"element_processing_time_ms\"\n        )\n        self.data_latency_time = metric.Metrics.distribution(\n            \"Time\", \"data_latency_time_ms\"\n        )\n        self.asset_start_time_format = asset_start_time_format\n\n    def process(self, element):\n        try:\n            if len(element) == 0:\n                raise ValueError(\"time_dict not found.\")\n            (asset_name, asset_start_time), time_dict = element\n            if not isinstance(time_dict, OrderedDict):\n                raise ValueError(\"time_dict not found.\")\n\n            uri = time_dict.pop(\"uri\")\n\n            # Time for a file to get ingested into EE from when it appeared in bucket.\n            # When the pipeline is in batch mode, it will be from when the file\n            # was picked up by the pipeline.\n            element_processing_time = (\n                time_dict[\"IngestIntoEE\"] - time_dict[\"bucket\"]\n            ) * 1000\n            self.element_processing_time.update(int(element_processing_time))\n\n            # Adding data latency.\n            if asset_start_time:\n                current_time = time.time()\n                asset_start_time = 
datetime.datetime.strptime(\n                    asset_start_time, self.asset_start_time_format\n                ).timestamp()\n\n                # Converting seconds to milli seconds.\n                data_latency_ms = (current_time - asset_start_time) * 1000\n                self.data_latency_time.update(int(data_latency_ms))\n\n                # Logging file init to bucket time as well.\n                time_dict.update({\"FileInit\": asset_start_time})\n                time_dict.move_to_end(\"FileInit\", last=False)\n\n            # Logging time taken by each step...\n            step_intervals = {\n                f\"{current_step} -> {next_step}\": round(next_time - current_time)\n                for (current_step, current_time), (next_step, next_time)\n                in zip(time_dict.items(), list(time_dict.items())[1:])\n            }\n            logger.info(\n                f\"Step intervals for {uri}:{asset_name} :: {json.dumps(step_intervals, indent=4)}\"\n            )\n\n            yield (\"custom_metrics\", (data_latency_ms / 1000, element_processing_time / 1000))\n        except Exception as e:\n            logger.warning(\n                f\"Some error occured while adding metrics. 
Error {e}\"\n            )\n\n\n@dataclasses.dataclass\nclass CreateTimeSeries(beam.DoFn):\n    \"\"\"DoFn to write metrics TimeSeries data in Google Cloud Monitoring.\"\"\"\n\n    job_name: str\n    project: str\n    region: str\n\n    def create_time_series_object(self, metric_name: str, metric_value: float):\n        \"\"\"Returns a Metrics TimeSeries object.\"\"\"\n        series = monitoring_v3.TimeSeries()\n        series.metric.type = f\"custom.googleapis.com/{metric_name}\"\n        series.metric.labels[\"description\"] = metric_name\n        series.resource.type = \"dataflow_job\"\n        series.resource.labels[\"job_name\"] = self.job_name\n        series.resource.labels[\"project_id\"] = self.project\n        series.resource.labels[\"region\"] = self.region\n\n        now = time.time()\n        seconds = int(now)\n        nanos = int((now - seconds) * 10**9)\n        interval = monitoring_v3.TimeInterval(\n            {\"end_time\": {\"seconds\": seconds, \"nanos\": nanos}}\n        )\n\n        point = monitoring_v3.Point(\n            {\"interval\": interval, \"value\": {\"double_value\": metric_value}}\n        )\n        series.points = [point]\n        return series\n\n    def process(self, element: t.Any):\n        _, metric_values = element\n        data_latency_times = [x[0] for x in metric_values]\n        element_processing_times = [x[1] for x in metric_values]\n\n        logger.info(f\"data_latency_time values: {data_latency_times}\")\n        data_latency_max_series = self.create_time_series_object(\n            \"data_latency_time_max\", max(data_latency_times)\n        )\n        data_latency_mean_series = self.create_time_series_object(\n            \"data_latency_time_mean\",\n            sum(data_latency_times) / len(data_latency_times),\n        )\n\n        logger.info(\n            f\"element_processing_time values: {element_processing_times}\"\n        )\n        element_processing_max_series = self.create_time_series_object(\n      
      \"element_processing_time_max\", max(element_processing_times)\n        )\n        element_processing_mean_series = self.create_time_series_object(\n            \"element_processing_time_mean\",\n            sum(element_processing_times) / len(element_processing_times),\n        )\n\n        client = monitoring_v3.MetricServiceClient()\n        client.create_time_series(\n            name=f\"projects/{self.project}\",\n            time_series=[\n                data_latency_max_series,\n                data_latency_mean_series,\n                element_processing_max_series,\n                element_processing_mean_series,\n            ],\n        )\n\n\n@dataclasses.dataclass\nclass AddMetrics(beam.PTransform, KwargsFactoryMixin):\n    \"\"\"A custom transform to add metrics to the pipeline.\"\"\"\n\n    job_name: str\n    project: str\n    region: str\n    use_monitoring_metrics: bool\n    asset_start_time_format: str = \"%Y-%m-%dT%H:%M:%SZ\"\n\n    def expand(self, pcoll: beam.PCollection):\n        metrics = pcoll | \"AddBeamMetrics\" >> beam.ParDo(AddBeamMetrics(self.asset_start_time_format))\n        if self.use_monitoring_metrics:\n            (\n                metrics\n                | \"AddTimestamps\"\n                >> beam.Map(\n                    lambda element: window.TimestampedValue(element, time.time())\n                )\n                | \"Window\"\n                >> beam.WindowInto(\n                    window.GlobalWindows(),\n                    trigger=trigger.Repeatedly(trigger.AfterProcessingTime(5)),\n                    accumulation_mode=trigger.AccumulationMode.DISCARDING,\n                )\n                | \"GroupByKeyAndWindow\" >> beam.GroupByKey(lambda element: element)\n                | \"CreateTimeSeries\"\n                >> beam.ParDo(\n                    CreateTimeSeries(self.job_name, self.project, self.region)\n                )\n            )\n"
  },
  {
    "path": "weather_mv/loader_pipeline/pipeline.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Pipeline for loading weather data into analysis-ready mediums, like Google BigQuery.\"\"\"\n\nimport argparse\nimport json\nimport logging\nimport typing as t\nimport warnings\n\nimport apache_beam as beam\nfrom apache_beam.io.filesystems import FileSystems\nfrom apache_beam.options.pipeline_options import PipelineOptions\n\nfrom .bq import ToBigQuery\nfrom .regrid import Regrid\nfrom .ee import ToEarthEngine\nfrom .streaming import GroupMessagesByFixedWindows, ParsePaths\n\nlogger = logging.getLogger(__name__)\nSDK_CONTAINER_IMAGE = 'gcr.io/weather-tools-prod/weather-tools:0.0.0'\n\n\ndef configure_logger(verbosity: int) -> None:\n    \"\"\"Configures logging from verbosity. 
Default verbosity will show errors.\"\"\"\n    level = (40 - verbosity * 10)\n    logging.getLogger(__package__).setLevel(level)\n    logger.setLevel(level)\n\n\ndef pattern_to_uris(match_pattern: str, is_zarr: bool = False) -> t.Iterable[str]:\n    if is_zarr:\n        yield match_pattern\n        return\n\n    for match in FileSystems().match([match_pattern]):\n        yield from [x.path for x in match.metadata_list]\n\n\ndef pipeline(known_args: argparse.Namespace, pipeline_args: t.List[str]) -> None:\n    all_uris = list(pattern_to_uris(known_args.uris, known_args.zarr))\n    if not all_uris:\n        raise FileNotFoundError(f\"File pattern '{known_args.uris}' matched no objects\")\n\n    # First URI is useful to get an example data shard. It also can be a Zarr path.\n    known_args.first_uri = next(iter(all_uris))\n\n    with beam.Pipeline(argv=pipeline_args) as p:\n        if known_args.zarr:\n            paths = p\n        elif known_args.topic or known_args.subscription:\n            paths = (\n                    p\n                    # Windowing is based on this code sample:\n                    # https://cloud.google.com/pubsub/docs/pubsub-dataflow#code_sample\n                    | 'ReadUploadEvent' >> beam.io.ReadFromPubSub(known_args.topic, known_args.subscription)\n                    | 'WindowInto' >> GroupMessagesByFixedWindows(known_args.window_size, known_args.num_shards)\n                    | 'ParsePaths' >> beam.ParDo(ParsePaths(known_args.uris))\n            )\n        else:\n            paths = p | 'Create' >> beam.Create(all_uris)\n\n        if known_args.subcommand == 'bigquery' or known_args.subcommand == 'bq':\n            paths | \"MoveToBigQuery\" >> ToBigQuery.from_kwargs(**vars(known_args))\n        elif known_args.subcommand == 'regrid' or known_args.subcommand == 'rg':\n            paths | \"Regrid\" >> Regrid.from_kwargs(**vars(known_args))\n        elif known_args.subcommand == 'earthengine' or known_args.subcommand == 'ee':\n   
         pipeline_options = PipelineOptions(pipeline_args)\n            pipeline_options_dict = pipeline_options.get_all_options()\n            # all_args stores all arguments passed to the pipeline.\n            # This is necessary because pipeline_args are later used by\n            # the CreateTimeSeries DoFn in the AddMetrics transform.\n            all_args = {**pipeline_options_dict, **vars(known_args)}\n            paths | \"MoveToEarthEngine\" >> ToEarthEngine.from_kwargs(**all_args)\n        else:\n            raise ValueError('invalid subcommand!')\n\n    logger.info('Pipeline is finished.')\n\n\ndef run(argv: t.List[str]) -> t.Tuple[argparse.Namespace, t.List[str]]:\n    \"\"\"Main entrypoint & pipeline definition.\"\"\"\n    parser = argparse.ArgumentParser(\n        prog='weather-mv',\n        description='Weather Mover loads weather data from cloud storage into analytics engines.'\n    )\n\n    # Common arguments to all commands\n    base = argparse.ArgumentParser(add_help=False)\n    base.add_argument('-i', '--uris', type=str, required=True,\n                      help=\"URI glob pattern matching input weather data, e.g. 'gs://ecmwf/era5/era5-2015-*.gb'. Or, \"\n                           \"a path to a Zarr.\")\n    base.add_argument('--topic', type=str,\n                      help=\"A Pub/Sub topic for GCS OBJECT_FINALIZE events, or equivalent, of a cloud bucket. \"\n                           \"E.g. 'projects/<PROJECT_ID>/topics/<TOPIC_ID>'. Cannot be used with `--subscription`.\")\n    base.add_argument('--subscription', type=str,\n                      help='A Pub/Sub subscription for GCS OBJECT_FINALIZE events, or equivalent, of a cloud bucket. '\n                           'Cannot be used with `--topic`.')\n    base.add_argument(\"--window_size\", type=float, default=1.0,\n                      help=\"Output file's window size in minutes. Only used with the `topic` flag. 
Default: 1.0 \"\n                           \"minute.\")\n    base.add_argument('--num_shards', type=int, default=5,\n                      help='Number of shards to use when writing windowed elements to cloud storage. Only used with '\n                           'the `topic` flag. Default: 5 shards.')\n    base.add_argument('--zarr', action='store_true', default=False,\n                      help=\"Treat the input URI as a Zarr. If the URI ends with '.zarr', this will be set to True. \"\n                           \"Default: off\")\n    base.add_argument('--zarr_kwargs', type=json.loads, default='{}',\n                      help='Keyword arguments to pass into `xarray.open_zarr()`, as a JSON string. '\n                           'Default: `{\"chunks\": null, \"consolidated\": true}`.')\n    base.add_argument('-d', '--dry-run', action='store_true', default=False,\n                      help='Preview the weather-mv job. Default: off')\n    base.add_argument('--log-level', type=int, default=2,\n                      help='An integer to configure log level. 
Default: 2(INFO)')\n    base.add_argument('--use-local-code', action='store_true', default=False, help='Supply local code to the Runner.')\n\n    subparsers = parser.add_subparsers(help='help for subcommand', dest='subcommand')\n\n    # BigQuery command registration\n    bq_parser = subparsers.add_parser('bigquery', aliases=['bq'], parents=[base],\n                                      help='Move data into Google BigQuery')\n    ToBigQuery.add_parser_arguments(bq_parser)\n\n    # Regrid command registration\n    rg_parser = subparsers.add_parser('regrid', aliases=['rg'], parents=[base],\n                                      help='Copy and regrid grib data with MetView.')\n    Regrid.add_parser_arguments(rg_parser)\n\n    # EarthEngine command registration\n    ee_parser = subparsers.add_parser('earthengine', aliases=['ee'], parents=[base],\n                                      help='Move data into Google EarthEngine')\n    ToEarthEngine.add_parser_arguments(ee_parser)\n\n    known_args, pipeline_args = parser.parse_known_args(argv[1:])\n\n    configure_logger(known_args.log_level)  # 0 = error, 1 = warn, 2 = info, 3 = debug\n\n    # Validate Zarr arguments\n    if known_args.uris.endswith('.zarr'):\n        known_args.zarr = True\n\n    if known_args.zarr_kwargs and not known_args.zarr:\n        raise ValueError('`--zarr_kwargs` argument is only allowed with valid Zarr input URI.')\n\n    if known_args.zarr_kwargs:\n        if not known_args.zarr_kwargs.get('start_date') or not known_args.zarr_kwargs.get('end_date'):\n            warnings.warn('`--zarr_kwargs` not contains both `start_date` and `end_date`'\n                          'so whole zarr-dataset will ingested.')\n\n    if known_args.zarr:\n        known_args.zarr_kwargs['chunks'] = known_args.zarr_kwargs.get('chunks', None)\n        known_args.zarr_kwargs['consolidated'] = known_args.zarr_kwargs.get('consolidated', True)\n\n    # Validate subcommand\n    if known_args.subcommand == 'bigquery' or 
known_args.subcommand == 'bq':\n        ToBigQuery.validate_arguments(known_args, pipeline_args)\n    elif known_args.subcommand == 'regrid' or known_args.subcommand == 'rg':\n        Regrid.validate_arguments(known_args, pipeline_args)\n    elif known_args.subcommand == 'earthengine' or known_args.subcommand == 'ee':\n        ToEarthEngine.validate_arguments(known_args, pipeline_args)\n\n    # If a Pub/Sub is used, then the pipeline must be a streaming pipeline.\n    if known_args.topic or known_args.subscription:\n        if known_args.topic and known_args.subscription:\n            raise ValueError('only one argument can be provided at a time: `topic` or `subscription`.')\n\n        if known_args.zarr:\n            raise ValueError('streaming updates to a Zarr file is not (yet) supported.')\n\n        pipeline_args.extend('--streaming true'.split())\n\n        # make sure we re-compute utcnow() every time rows are extracted from a file.\n        known_args.import_time = None\n\n    # We use the save_main_session option because one or more DoFn's in this\n    # workflow rely on global context (e.g., a module imported at module level).\n    pipeline_args.extend('--save_main_session true'.split())\n\n    return known_args, pipeline_args\n"
  },
  {
    "path": "weather_mv/loader_pipeline/pipeline_test.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport json\nimport unittest\n\nimport weather_mv\nfrom .pipeline import run, pipeline\n\n\nclass CLITests(unittest.TestCase):\n    def setUp(self) -> None:\n        self.test_data_folder = f'{next(iter(weather_mv.__path__))}/test_data'\n        self.base_cli_args = (\n            'weather-mv bq '\n            f'-i {self.test_data_folder}/test_data_2018*.nc '\n            '-o myproject.mydataset.mytable '\n            '--import_time 2022-02-04T22:22:12.125893 '\n            '--geo_data_parquet_path geo_data.parquet '\n            '-s'\n        ).split()\n        self.tif_base_cli_args = (\n            'weather-mv bq '\n            f'-i {self.test_data_folder}/test_data_tif_time.tif '\n            '-o myproject.mydataset.mytable '\n            '--import_time 2022-02-04T22:22:12.125893 '\n            '-s --geo_data_parquet_path geo_data.parquet'\n        ).split()\n        self.bq_base_cli_args = (\n            'weather-mv bq '\n            f'-i {self.test_data_folder}/test_data_2018*.nc '\n            '-o myproject.mydataset.mytable '\n            '--import_time 2022-02-04T22:22:12.125893 '\n            '-s'\n        ).split()\n        self.ee_cli_args = (\n            'weather-mv ee '\n            '-i weather_mv/test_data/test_data_2018*.nc '\n            '--asset_location gs://bucket/my-assets/ '\n            '--ee_asset \"projects/my-project/assets/asset_dir'\n       
 ).split()\n        self.rg_cli_args = (\n            'weather-mv rg '\n            '-i weather_mv/test_data/test_data_2018*.nc '\n            '-o weather_mv/test_data/output/ '\n        ).split()\n        self.base_cli_known_args = {\n            'subcommand': 'bq',\n            'uris': f'{self.test_data_folder}/test_data_2018*.nc',\n            'output_table': 'myproject.mydataset.mytable',\n            'dry_run': False,\n            'skip_region_validation': True,\n            'import_time': '2022-02-04T22:22:12.125893',\n            'infer_schema': False,\n            'num_shards': 5,\n            'topic': None,\n            'subscription': None,\n            'variables': [],\n            'window_size': 1.0,\n            'xarray_open_dataset_kwargs': {},\n            'rows_chunk_size': 1_000_000,\n            'disable_grib_schema_normalization': False,\n            'tif_metadata_for_start_time': None,\n            'tif_metadata_for_end_time': None,\n            'zarr': False,\n            'zarr_kwargs': {},\n            'log_level': 2,\n            'use_local_code': False,\n            'skip_creating_polygon': False,\n            'geo_data_parquet_path': 'geo_data.parquet',\n            'skip_creating_geo_data_parquet': False\n        }\n\n\nclass TestCLI(CLITests):\n\n    def test_dry_runs_are_allowed(self):\n        known_args, _ = run(self.base_cli_args + '--dry-run'.split())\n        self.assertEqual(known_args.dry_run, True)\n\n    def test_log_level_arg(self):\n        known_args, _ = run(self.base_cli_args + '--log-level 3'.split())\n        self.assertEqual(known_args.log_level, 3)\n\n    def test_tif_metadata_for_datetime_raise_error_for_non_tif_file(self):\n        with self.assertRaisesRegex(RuntimeError, 'can be specified only for tif files.'):\n            run(self.base_cli_args + '--tif_metadata_for_start_time start_time '\n                '--tif_metadata_for_end_time end_time'.split())\n\n    def 
test_tif_metadata_for_datetime_raise_error_if_flag_is_absent(self):\n        with self.assertRaisesRegex(RuntimeError, 'is required for tif files.'):\n            run(self.tif_base_cli_args)\n\n    def test_area_only_allows_four(self):\n        with self.assertRaisesRegex(AssertionError, 'Must specify exactly 4 lat/long .* N, W, S, E'):\n            run(self.base_cli_args + '--area 1 2 3'.split())\n\n        with self.assertRaisesRegex(AssertionError, 'Must specify exactly 4 lat/long .* N, W, S, E'):\n            run(self.base_cli_args + '--area 1 2 3 4 5'.split())\n\n        known_args, pipeline_args = run(self.base_cli_args + '--area 1 2 3 4'.split())\n        self.assertEqual(pipeline_args, ['--save_main_session', 'true'])\n        self.assertEqual(vars(known_args), {\n            **self.base_cli_known_args,\n            'area': [1, 2, 3, 4]\n        })\n\n    def test_topic_creates_a_streaming_pipeline(self):\n        _, pipeline_args = run(self.base_cli_args + '--topic projects/myproject/topics/my-topic'.split())\n        self.assertEqual(pipeline_args, ['--streaming', 'true', '--save_main_session', 'true'])\n\n    def test_subscription_creates_a_streaming_pipeline(self):\n        _, pipeline_args = run(self.base_cli_args + '--subscription projects/myproject/topics/my-topic'.split())\n        self.assertEqual(pipeline_args, ['--streaming', 'true', '--save_main_session', 'true'])\n\n    def test_accepts_json_string_for_xarray_open(self):\n        xarray_kwargs = dict(engine='cfgrib', backend_kwargs={'filter_by_keys': {'edition': 1}})\n        json_kwargs = json.dumps(xarray_kwargs)\n        known_args, _ = run(\n            self.base_cli_args + [\"--xarray_open_dataset_kwargs\", f\"{json_kwargs}\"]\n        )\n        self.assertEqual(known_args.xarray_open_dataset_kwargs, xarray_kwargs)\n\n    def test_ee_does_not_yet_support_zarr(self):\n        with self.assertRaisesRegex(RuntimeError, 'Reading Zarr'):\n            run(self.ee_cli_args + 
'--zarr'.split())\n\n    def test_rg_zarr_cant_output_netcdf(self):\n        with self.assertRaisesRegex(ValueError, 'only Zarr-to-Zarr'):\n            run(self.rg_cli_args + '--zarr --to_netcdf'.split())\n\n    def test_rg_happy_path(self):\n        run(self.rg_cli_args + ['--zarr'])\n\n    def test_zarr_kwargs_must_come_with_zarr(self):\n        with self.assertRaisesRegex(ValueError, 'allowed with valid Zarr input URI'):\n            run(self.base_cli_args + ['--zarr_kwargs', json.dumps({\"time\": 100})])\n\n    def test_topic_and_subscription__mutually_exclusive(self):\n        with self.assertRaisesRegex(ValueError, '`topic` or `subscription`'):\n            run(self.base_cli_args + '--topic foo --subscription bar'.split())\n\n    def test_geo_data_parquet_path_must_be_parquet(self):\n        with self.assertRaisesRegex(RuntimeError, \"must end with '.parquet'\"):\n            run(self.bq_base_cli_args + '--geo_data_parquet_path test_geo.txt'.split())\n\nclass IntegrationTest(CLITests):\n    def test_dry_runs_are_allowed(self):\n        pipeline(*run(self.base_cli_args + '--dry-run'.split()))\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "weather_mv/loader_pipeline/regrid.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport argparse\nimport contextlib\nimport dataclasses\nimport glob\nimport json\nimport logging\nimport os.path\nimport shutil\nimport subprocess\nimport tempfile\nimport typing as t\nimport warnings\n\nimport apache_beam as beam\nimport dask\nimport xarray as xr\nimport xarray_beam as xbeam\nfrom apache_beam.io.filesystems import FileSystems\n\nfrom .sinks import ToDataSink, open_local, copy\n\nlogger = logging.getLogger(__name__)\n\ntry:\n    import metview as mv\n    Fieldset = mv.bindings.Fieldset\nexcept (ModuleNotFoundError, ImportError, FileNotFoundError, ValueError):\n    logger.error('Metview could not be imported.')\n    mv = None  # noqa\n    Fieldset = t.Any\n\n\ndef _clear_metview():\n    \"\"\"Clear the metview temporary directory.\n\n    By default, caches are cleared when the MetView _process_ ends.\n    This method is necessary to free space sooner than that, namely\n    after invoking MetView functions.\n    \"\"\"\n    cache_dirs = glob.glob(f'{tempfile.gettempdir()}/mv.*')\n    for cache_dir in cache_dirs:\n        shutil.rmtree(cache_dir)\n        os.makedirs(cache_dir)\n\n\n@contextlib.contextmanager\ndef _metview_op() -> t.Iterator[None]:\n    \"\"\"Perform operation with MetView, including error handling and cleanup.\"\"\"\n    try:\n        yield\n    except (ModuleNotFoundError, ImportError, FileNotFoundError) as e:\n        raise 
ImportError('Please install MetView with Anaconda:\\n'\n                          '`conda install metview-batch -c conda-forge`') from e\n    finally:\n        _clear_metview()\n\n\nclass MapChunkAsFieldset(beam.PTransform):\n    \"\"\"Apply an operation with MetView on a xarray.Dataset as if it's a metview.Fieldset.\n\n    This transform will handle converting to and from xr.Datasets to mv.Fieldsets. This\n    allows the user to perform any MetView or Fieldset operation within the overridable\n    `apply()` method.\n\n    > Warning: This cannot process large Datasets without a decent amount of disk space!\n    \"\"\"\n\n    def apply(self, key: xbeam.Key, fs: Fieldset) -> t.Tuple[xbeam.Key, Fieldset]:\n        return key, fs\n\n    def _apply(self, key: xbeam.Key, ds: xr.Dataset) -> t.Tuple[xbeam.Key, xr.Dataset]:\n        # Clear metadata so ecCodes doesn't mess up the conversion. Instead of default grib fields,\n        # ecCodes will use the parameter ID. Thus, the fields will appear in the final, regridded\n        # dataset.\n        for dv in ds.data_vars:\n            for to_del in ['GRIB_cfName', 'GRIB_shortName', 'GRIB_cfVarName']:\n                if to_del in ds[dv].attrs:\n                    del ds[dv].attrs[to_del]\n\n        with _metview_op():\n            # mv.dataset_to_fieldset() will error on input where there is only 1 value\n            # in a dimension. ECMWF's cfgrib is in its alpha version.\n            try:\n                fs = mv.dataset_to_fieldset(ds)\n            except ValueError as e:\n                raise ValueError(\n                    'please change `zarr_input_chunk`s so that there are no'\n                    'single element dimensions (e.g. 
{\"time\": 1} is not allowed).'\n                ) from e\n\n            # Apply any & all MetView or FieldSet operations.\n            kout, fs_out = self.apply(key, fs)\n\n            return kout, fs_out.to_dataset().compute()\n\n    def expand(self, pcoll):\n        return pcoll | beam.MapTuple(self._apply)\n\n\n@dataclasses.dataclass\nclass RegridChunk(MapChunkAsFieldset):\n    \"\"\"Regrid a xarray.Dataset with MetView.\n\n    Attributes:\n        regrid_kwargs: A dictionary of keyword-args to be passed into `mv.regrid()`\n            (excluding the dataset).\n        zarr_input_chunks: (Optional) When regridding Zarr data, how the input\n            dataset should be chunked upon open.\n    \"\"\"\n    regrid_kwargs: t.Dict\n    zarr_input_chunks: t.Optional[t.Dict] = None\n\n    def template(self, source_ds: xr.Dataset) -> xr.Dataset:\n        \"\"\"Calculate the output Zarr template by regridding (a tiny slice of) the input dataset.\"\"\"\n        # Silence Dask warning...\n        with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n            zeros = source_ds.chunk().pipe(xr.zeros_like)\n\n            # If the chunked source dataset is small (less than 10 MB), just regrid it!\n            if (zeros.nbytes / 1024 / 1024) < 10:\n                _, ds = self._apply(xbeam.Key(), zeros)\n                return ds.chunk()\n\n            # source_ds is probably very big! 
Let's shrink it by a non-spatial dimension\n            # so calculating the template will be tractable...\n\n            # Get a single timeslice of the zeros Dataset (or equivalent chunkable dimension).\n            # We don't know for sure that 'time' is in the Zarr dataset, so here we make our\n            # best attempt to find a good slice.\n            t0 = None\n            for dim in ['time', *(self.zarr_input_chunks or {}).keys()]:\n                if dim in zeros:\n                    t0 = zeros.isel({dim: 0}, drop=True)\n                    break\n            if t0 is None:\n                raise ValueError('cannot infer any dimension when creating a Zarr template. '\n                                 'Please define at least one chunk in `--zarr_input_chunks`.')\n\n            _, ds = self._apply(xbeam.Key(), t0)\n            # Regrid the single time, then expand the Dataset to span all times.\n            tmpl = (\n                ds\n                .chunk()\n                .expand_dims({dim: zeros[dim]}, 0)\n            )\n\n            return tmpl\n\n    def apply(self, key: xbeam.Key, fs: Fieldset) -> t.Tuple[xbeam.Key, Fieldset]:\n        return key, mv.regrid(data=fs, **self.regrid_kwargs)\n\n\n@dataclasses.dataclass\nclass Regrid(ToDataSink):\n    \"\"\"Regrid data using MetView.\n\n    See https://metview.readthedocs.io/en/latest/metview/using_metview/regrid_intro.html\n    for an in-depth intro on regridding with MetView.\n\n    Attributes:\n        output_path: URI for regridding target. Can be a glob pattern of NetCDF or Grib files; optionally,\n            it can be a Zarr corpus is supported.\n        regrid_kwargs: A dictionary of keyword-args to be passed into `mv.regrid()` (excluding the dataset).\n        to_netcdf: When set, it raw data output will be written as NetCDF. 
Cannot use with Zarr datasets.\n        zarr_input_chunks: (Optional) When regridding Zarr data, how the input dataset should be chunked upon open.\n        zarr_output_chunks: (Optional, recommended) When regridding Zarr data, how the output Zarr dataset should be\n            divided into chunks.\n    \"\"\"\n    output_path: str\n    regrid_kwargs: t.Dict\n    force_regrid: bool = False\n    to_netcdf: bool = False\n    apply_bz2_compression: bool = False\n    zarr_input_chunks: t.Optional[t.Dict] = None\n    zarr_output_chunks: t.Optional[t.Dict] = None\n\n    @classmethod\n    def add_parser_arguments(cls, subparser: argparse.ArgumentParser) -> None:\n        subparser.add_argument('-o', '--output_path', type=str, required=True,\n                               help='The destination path for the regridded files.')\n        subparser.add_argument('-k', '--regrid_kwargs', type=json.loads, default='{\"grid\": [0.25, 0.25]}',\n                               help=\"\"\"Keyword-args to pass into `metview.regrid()` in the form of a JSON string. \"\"\"\n                                    \"\"\"Will default to '{\"grid\": [0.25, 0.25]}'.\"\"\")\n        subparser.add_argument('--force_regrid', action='store_true', default=False,\n                               help='Force regrid all files even if file is present at output_path.')\n        subparser.add_argument('--to_netcdf', action='store_true', default=False,\n                               help='Write output file in NetCDF via XArray. Default: off')\n        subparser.add_argument('-bz2', '--apply_bz2_compression', action='store_true', default=False,\n                               help='Enable bzip2 (.bz2) compression for the regridded file. Default: off.')\n        subparser.add_argument('-zi', '--zarr_input_chunks', type=json.loads, default=None,\n                               help='When reading a Zarr, break up the data into chunks. 
Takes a JSON string.')\n        subparser.add_argument('-zo', '--zarr_output_chunks', type=json.loads, default=None,\n                               help='When writing a Zarr, write the data with chunks. Takes a JSON string.')\n\n    @classmethod\n    def validate_arguments(cls, known_args: argparse.Namespace, pipeline_options: t.List[str]) -> None:\n        if known_args.zarr and known_args.to_netcdf:\n            raise ValueError('only Zarr-to-Zarr regridding is allowed!')\n\n        if not known_args.zarr and (known_args.zarr_input_chunks or known_args.zarr_output_chunks):\n            raise ValueError('chunks can only be set when input URI is a Zarr.')\n\n        if known_args.zarr:\n            # Encourage use of correct output_path format.\n            _, out_ext = os.path.splitext(known_args.output_path)\n            if out_ext not in ['', '.zarr']:\n                warnings.warn('if input is a Zarr, the output_path must also be a Zarr.', RuntimeWarning)\n\n    def target_from(self, uri: str) -> str:\n        \"\"\"Create the target path from the input URI.\n\n        In the case of Zarr, the output will be treated like a valid path.\n        For NetCDF, this will change the extension to '.nc'.\n        \"\"\"\n        if self.zarr:\n            return self.output_path\n\n        base = os.path.basename(uri)\n        in_dest = os.path.join(self.output_path, base)\n\n        if not self.to_netcdf:\n            return in_dest\n\n        # If we convert to NetCDF, change the extension.\n        no_ext, _ = os.path.splitext(in_dest)\n        return f'{no_ext}.nc'\n\n    def is_grib_file_corrupt(self, local_grib: str) -> bool:\n        try:\n            # Run grib_ls command to check the file\n            subprocess.check_output(['grib_ls', local_grib])\n            return False\n        except subprocess.CalledProcessError as e:\n            logger.info(f\"Encountered error while reading GRIB: {e}.\")\n            return True\n\n    def path_exists(self, path: 
str, force_regrid: bool = False) -> bool:\n        \"\"\"Check if path exists. Pass force_regrid to skip checking.\"\"\"\n        if force_regrid:\n            return False\n        matches = FileSystems().match([path])\n        assert len(matches) == 1\n        return len(matches[0].metadata_list) > 0\n\n    def apply(self, uri: str) -> None:\n        logger.info(f'Regridding from {uri!r} to {self.target_from(uri)!r}.')\n\n        if self.dry_run:\n            return\n\n        if self.path_exists(self.target_from(uri), self.force_regrid):\n            logger.info(f\"Skipping {uri}.\")\n            return\n\n        with _metview_op():\n            try:\n                logger.info(f'Copying grib from {uri!r} to local disk.')\n\n                with open_local(uri) as local_grib:\n                    logger.info(f\"Checking for {uri}'s validity...\")\n                    if self.is_grib_file_corrupt(local_grib):\n                        logger.error(f\"Corrupt GRIB file found: {uri}.\")\n                        return\n                    logger.info(f\"No issues found with {uri}.\")\n\n                    logger.info(f'Regridding {uri!r} using {self.regrid_kwargs}.')\n                    fs = mv.bindings.Fieldset(path=local_grib)\n                    fieldset = mv.regrid(data=fs, **self.regrid_kwargs)\n\n                with tempfile.NamedTemporaryFile() as src:\n                    logger.info(f'Writing {self.target_from(uri)!r} to local disk.')\n                    if self.to_netcdf:\n                        fieldset.to_dataset().to_netcdf(src.name)\n                    else:\n                        mv.write(src.name, fieldset)\n\n                    src.flush()\n\n                    _clear_metview()\n\n                    logger.info(f'Uploading {self.target_from(uri)!r}.')\n\n                    if self.apply_bz2_compression:\n                        logger.info(\n                            f'Applying bzip2 compression before copying to 
{self.target_from(uri)!r} ...'\n                        )\n                        subprocess.run(f\"bzip2 -k {src.name}\".split())\n\n                        copy(src.name + '.bz2', self.target_from(uri))\n\n                        logger.info(f'Cleaning up {src.name}.bz2 ...')\n                        os.unlink(src.name + '.bz2')  # Deleting the tempfile.bz2 file.\n                    else:\n                        copy(src.name, self.target_from(uri))\n            except Exception as e:\n                logger.info(f'Regrid failed for {uri!r}. Error: {str(e)}')\n\n    def expand(self, paths):\n        if not self.zarr:\n            paths | beam.Map(self.apply)\n            return\n\n        # Since `chunks=None` here, data will be opened lazily upon access.\n        # This is used to get the Zarr metadata without loading the data.\n        source_ds = xr.open_zarr(self.first_uri, **self.zarr_kwargs)\n\n        regrid_op = RegridChunk(self.regrid_kwargs, self.zarr_input_chunks)\n\n        regridded = (\n                paths\n                | xbeam.DatasetToChunks(source_ds, self.zarr_input_chunks)\n                | 'RegridChunk' >> regrid_op\n        )\n\n        tmpl = paths | beam.Create([source_ds]) | 'CalcZarrTemplate' >> beam.Map(regrid_op.template)\n\n        to_write = regridded\n        if self.zarr_output_chunks:\n            to_write |= xbeam.ConsolidateChunks(self.zarr_output_chunks)\n\n        to_write | xbeam.ChunksToZarr(self.output_path, beam.pvalue.AsSingleton(tmpl), self.zarr_output_chunks)\n"
  },
  {
    "path": "weather_mv/loader_pipeline/regrid_test.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport dataclasses\nimport glob\nimport os.path\nimport tempfile\nimport unittest\n\nimport numpy as np\nimport xarray as xr\nfrom apache_beam.testing.test_pipeline import TestPipeline\nfrom cfgrib.xarray_to_grib import to_grib\n\nfrom .regrid import Regrid\nfrom .sinks_test import TestDataBase\n\ntry:\n    import metview  # noqa\nexcept (ModuleNotFoundError, ImportError, FileNotFoundError):\n    raise unittest.SkipTest('MetView dependency is not installed. 
Skipping tests...')\n\n\ndef make_skin_temperature_dataset() -> xr.Dataset:\n    ds = xr.DataArray(\n        np.full((4, 5, 6), 300.),\n        coords=[\n            np.arange(0, 4),\n            np.linspace(90., -90., 5),\n            np.linspace(0., 360., 6, endpoint=False),\n        ],\n        dims=['time', 'latitude', 'longitude'],\n    ).to_dataset(name='skin_temperature')\n    ds.skin_temperature.attrs['GRIB_shortName'] = 'skt'\n    ds.skin_temperature.attrs['GRIB_gridType'] = 'regular_ll'\n    return ds\n\n\ndef metview_cache_exists() -> bool:\n    caches = glob.glob(f'{tempfile.gettempdir()}/mv.*/')\n    return any(os.path.isfile(p) for p in caches)\n\n\nclass RegridTest(TestDataBase):\n\n    # TODO(alxr): Test the quality of the regridding...\n\n    def setUp(self) -> None:\n        super().setUp()\n        self.tmpdir = tempfile.TemporaryDirectory()\n        self.input_dir = os.path.join(self.tmpdir.name, 'input')\n        self.input_grib = os.path.join(self.input_dir, 'test.gb')\n        os.mkdir(self.input_dir)\n        self.Op = Regrid(\n            output_path=self.tmpdir.name,\n            first_uri=self.input_grib,\n            regrid_kwargs={'grid': [0.25, 0.25]},\n            dry_run=False,\n            zarr=False,\n            zarr_kwargs={},\n        )\n\n    def tearDown(self) -> None:\n        self.tmpdir.cleanup()\n\n    def test_target_name(self):\n        actual = self.Op.target_from('path/to/data/called/foobar.gb')\n        self.assertEqual(actual, f'{self.tmpdir.name}/foobar.gb')\n\n    def test_target_name__to_netCDF__changes_ext(self):\n        Op = dataclasses.replace(self.Op, to_netcdf=True)\n        actual = Op.target_from('path/to/data/called/foobar.gb')\n        self.assertEqual(actual, f'{self.tmpdir.name}/foobar.nc')\n\n    def test_apply__creates_a_file(self):\n        to_grib(make_skin_temperature_dataset(), self.input_grib)\n\n        self.assertTrue(os.path.exists(self.input_grib))\n\n        
self.Op.apply(self.input_grib)\n\n        self.assertTrue(os.path.exists(f'{self.tmpdir.name}/test.gb'))\n        self.assertFalse(metview_cache_exists())\n\n    def test_apply__works_when_called_twice(self):\n        for _ in range(2):\n            self.test_apply__creates_a_file()\n\n    def test_apply__to_netCDF__creates_a_netCDF_file(self):\n        to_grib(make_skin_temperature_dataset(), self.input_grib)\n        self.assertTrue(os.path.exists(self.input_grib))\n\n        Op = dataclasses.replace(self.Op, to_netcdf=True)\n        Op.apply(self.input_grib)\n\n        expected = f'{self.tmpdir.name}/test.nc'\n        self.assertTrue(os.path.exists(expected))\n\n        try:\n            xr.open_dataset(expected)\n        except:  # noqa\n            self.fail('Cannot open netCDF with Xarray.')\n\n    def test_zarr__coarsen(self):\n        input_zarr = os.path.join(self.input_dir, 'input.zarr')\n        output_zarr = os.path.join(self.input_dir, 'output.zarr')\n\n        xr.open_dataset(os.path.join(self.test_data_folder, 'test_data_20180101.nc')).to_zarr(input_zarr)\n        self.assertTrue(os.path.exists(input_zarr))\n\n        Op = dataclasses.replace(\n            self.Op,\n            first_uri=input_zarr,\n            output_path=output_zarr,\n            zarr_input_chunks={\"time\": 25},\n            zarr=True\n        )\n\n        with TestPipeline() as p:\n            p | Op\n\n        self.assertTrue(os.path.exists(output_zarr))\n        try:\n            xr.open_zarr(output_zarr)\n        except:  # noqa\n            self.fail('Cannot open Zarr with Xarray.')\n\n    def test_corrupt_grib_file(self):\n        correct_file_path = os.path.join(self.test_data_folder, 'test_data_grib_single_timestep')\n        corrupt_file_path = os.path.join(self.test_data_folder, 'test_data_corrupt_grib')\n\n        self.assertFalse(self.Op.is_grib_file_corrupt(correct_file_path))\n        self.assertTrue(self.Op.is_grib_file_corrupt(corrupt_file_path))\n\n\nif __name__ 
== '__main__':\n    unittest.main()\n"
  },
  {
    "path": "weather_mv/loader_pipeline/sinks.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport abc\nimport argparse\nimport contextlib\nimport dataclasses\nimport datetime\nimport inspect\nimport logging\nimport os\nimport re\nimport shutil\nimport subprocess\nimport tempfile\nimport time\nimport typing as t\nfrom urllib.parse import urlparse\n\nimport apache_beam as beam\nimport cfgrib\nimport numpy as np\nimport rasterio\nimport rioxarray\nimport xarray as xr\nfrom apache_beam.io.filesystem import CompressionTypes, FileSystem, CompressedFile, DEFAULT_READ_BUFFER_SIZE\nfrom apache_beam.utils import retry\nfrom google.cloud import storage\nfrom pyproj import Transformer\n\nTIF_TRANSFORM_CRS_TO = \"EPSG:4326\"\n# A constant for all the things in the coords key set that aren't the level name.\nDEFAULT_COORD_KEYS = frozenset(('latitude', 'time', 'step', 'valid_time', 'longitude', 'number'))\nDEFAULT_TIME_ORDER_LIST = ['%Y', '%m', '%d', '%H', '%M', '%S']\n# For uploading / downloading retry logic.\nINITIAL_DELAY = 1.0  # Initial delay in seconds.\nMAX_DELAY = 600  # Maximum delay before giving up in seconds.\nNUM_RETRIES = 10  # Number of tries with exponential backoff.\n\nlogger = logging.getLogger(__name__)\n\n\nclass KwargsFactoryMixin:\n    \"\"\"Adds a factory method to classes or dataclasses for key-word args.\"\"\"\n    @classmethod\n    def from_kwargs(cls, **kwargs):\n        if dataclasses.is_dataclass(cls):\n            fields = [f.name for f in 
dataclasses.fields(cls)]\n        else:\n            fields = inspect.signature(cls.__init__).parameters.keys()\n\n        return cls(**{k: v for k, v, in kwargs.items() if k in fields})\n\n\n@dataclasses.dataclass\nclass ToDataSink(abc.ABC, beam.PTransform, KwargsFactoryMixin):\n    first_uri: str\n    dry_run: bool\n    zarr: bool\n    zarr_kwargs: t.Dict\n\n    @classmethod\n    @abc.abstractmethod\n    def add_parser_arguments(cls, subparser: argparse.ArgumentParser) -> None:\n        pass\n\n    @classmethod\n    @abc.abstractmethod\n    def validate_arguments(cls, known_args: argparse.Namespace, pipeline_options: t.List[str]) -> None:\n        pass\n\n\ndef _make_grib_dataset_inmem(grib_ds: xr.Dataset) -> xr.Dataset:\n    \"\"\"Copies all the vars in-memory to reduce disk seeks every time a single row is processed.\n\n    This also removes the need to keep the backing temp source file around.\n    \"\"\"\n    data_ds = grib_ds.copy(deep=True)\n    for v in grib_ds.variables:\n        if v not in data_ds.coords:\n            data_ds[v].variable.values = grib_ds[v].variable.values\n    return data_ds\n\n\ndef match_datetime(file_name: str, regex_expression: str) -> datetime.datetime:\n    \"\"\"Matches the regex string given and extracts the datetime object.\n\n    Args:\n        file_name: File name from which you want to extract datetime.\n        regex_expression: Regex expression for extracting datetime from the filename.\n\n    Returns:\n        A datetime object after extracting from the filename.\n    \"\"\"\n\n    def rearrange_time_list(order_list: t.List, time_list: t.List) -> t.List:\n        if order_list == DEFAULT_TIME_ORDER_LIST:\n            return time_list\n        new_time_list = []\n        for i, j in zip(order_list, time_list):\n            dst = DEFAULT_TIME_ORDER_LIST.index(i)\n            new_time_list.insert(dst, j)\n        return new_time_list\n\n    char_to_replace = {\n        '%Y': ['([0-9]{4})', [0, 1978]],\n        '%m': 
['([0-9]{2})', [1, 1]],\n        '%d': ['([0-9]{2})', [2, 1]],\n        '%H': ['([0-9]{2})', [3, 0]],\n        '%M': ['([0-9]{2})', [4, 0]],\n        '%S': ['([0-9]{2})', [5, 0]],\n        '*': ['.*']\n    }\n\n    missing_idx_list = []\n    temp_expression = regex_expression\n\n    for key, value in char_to_replace.items():\n        if key != '*' and regex_expression.find(key) == -1:\n            missing_idx_list.append(value[1])\n        else:\n            temp_expression = temp_expression.replace(key, value[0])\n\n    regex_matches = re.findall(temp_expression, file_name)[0]\n\n    order_list = [f'%{char}' for char in re.findall(r'%(\\w{1})', regex_expression)]\n\n    time_list = list(map(int, regex_matches))\n    time_list = rearrange_time_list(order_list, time_list)\n\n    if missing_idx_list:\n        for [idx, val] in missing_idx_list:\n            time_list.insert(idx, val)\n\n    return datetime.datetime(*time_list)\n\n\ndef _preprocess_tif(\n    ds: xr.Dataset,\n    tif_metadata_for_start_time: str,\n    tif_metadata_for_end_time: str,\n    uri: str,\n    initialization_time_regex: str,\n    forecast_time_regex: str\n) -> xr.Dataset:\n    \"\"\"Transforms (y, x) coordinates into (lat, long) and adds bands data in data variables.\n\n    This also retrieves datetime from tif's metadata and stores it into dataset.\n    \"\"\"\n\n    def _replace_dataarray_names_with_long_names(ds: xr.Dataset):\n        rename_dict = {var_name: ds[var_name].attrs.get('long_name', var_name) for var_name in ds.variables}\n        return ds.rename(rename_dict)\n\n    y, x = np.meshgrid(ds['y'], ds['x'])\n    transformer = Transformer.from_crs(ds.spatial_ref.crs_wkt, TIF_TRANSFORM_CRS_TO, always_xy=True)\n    lon, lat = transformer.transform(x, y)\n\n    ds['y'] = lat[0, :]\n    ds['x'] = lon[:, 0]\n    ds = ds.rename({'y': 'latitude', 'x': 'longitude'})\n\n    ds = ds.squeeze().drop_vars('spatial_ref')\n\n    ds = _replace_dataarray_names_with_long_names(ds)\n\n    end_time = 
None\n    start_time = None\n    if initialization_time_regex and forecast_time_regex:\n        try:\n            start_time = match_datetime(uri, initialization_time_regex)\n        except Exception:\n            raise RuntimeError(\"Wrong regex passed in --initialization_time_regex.\")\n        try:\n            end_time = match_datetime(uri, forecast_time_regex)\n        except Exception:\n            raise RuntimeError(\"Wrong regex passed in --forecast_time_regex.\")\n        ds.attrs['start_time'] = start_time\n        ds.attrs['end_time'] = end_time\n\n    init_time = None\n    forecast_time = None\n    coords = {}\n    try:\n        # if start_time/end_time is in integer milliseconds\n        init_time = (int(start_time.timestamp()) if start_time is not None\n                     else int(ds.attrs[tif_metadata_for_start_time]) / 1000.0)\n        coords['time'] = datetime.datetime.utcfromtimestamp(init_time)\n\n        if tif_metadata_for_end_time:\n            forecast_time = (int(end_time.timestamp()) if end_time is not None\n                             else int(ds.attrs[tif_metadata_for_end_time]) / 1000.0)\n            coords['valid_time'] = datetime.datetime.utcfromtimestamp(forecast_time)\n\n        ds = ds.assign_coords(coords)\n    except KeyError as e:\n        raise RuntimeError(f\"Invalid datetime metadata of tif: {e}.\")\n    except ValueError:\n        try:\n            # if start_time/end_time is in UTC string format\n            init_time = (int(start_time.timestamp()) if start_time is not None\n                         else datetime.datetime.strptime(ds.attrs[tif_metadata_for_start_time],\n                                                         '%Y-%m-%dT%H:%M:%SZ'))\n            coords['time'] = init_time\n\n            if tif_metadata_for_end_time:\n                forecast_time = (int(end_time.timestamp()) if end_time is not None\n                                 else datetime.datetime.strptime(ds.attrs[tif_metadata_for_end_time],\n      
                                                           '%Y-%m-%dT%H:%M:%SZ'))\n                coords['valid_time'] = forecast_time\n\n            ds = ds.assign_coords(coords)\n        except ValueError as e:\n            raise RuntimeError(f\"Invalid datetime value in tif's metadata: {e}.\")\n\n    return ds\n\n\ndef _to_utc_timestring(np_time: np.datetime64) -> str:\n    \"\"\"Turn a numpy datetime64 into UTC timestring.\"\"\"\n    timestamp = float((np_time - np.datetime64(0, 's')) / np.timedelta64(1, 's'))\n    return datetime.datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%dT%H:%M:%SZ')\n\n\ndef _add_is_normalized_attr(ds: xr.Dataset, value: bool) -> xr.Dataset:\n    \"\"\"Adds is_normalized to the attrs of the xarray.Dataset.\n\n    This attribute represents if the dataset is the merged dataset (i.e. created by combining N datasets,\n    specifically for normalizing grib's schema) or not.\n    \"\"\"\n    ds.attrs['is_normalized'] = value\n    return ds\n\n\ndef _is_3d_da(da):\n    \"\"\"Checks whether data array is 3d or not.\"\"\"\n    return len(da.shape) == 3\n\n\ndef __normalize_grib_dataset(filename: str,\n                             group_common_hypercubes: t.Optional[bool] = False) -> t.Union[xr.Dataset,\n                                                                                           t.List[xr.Dataset]]:\n    \"\"\"Reads a list of datasets and merge them into a single dataset.\"\"\"\n    _level_data_dict = {}\n\n    list_ds = cfgrib.open_datasets(filename)\n    ds_attrs = list_ds[0].attrs\n    dv_units_dict = {}\n\n    for ds in list_ds:\n        coords_set = set(ds.coords.keys())\n        level_set = coords_set.difference(DEFAULT_COORD_KEYS)\n        level = level_set.pop()\n\n        # Now look at what data vars are in each level.\n        for key in ds.data_vars.keys():\n            da = ds[key]  # The data array\n            attrs = da.attrs  # The metadata for this dataset.\n\n            # Also figure out the forecast hour 
for this file.\n            forecast_hour = int(da.step.values / np.timedelta64(1, 'h'))\n\n            # We are going to treat the time field as start_time and the\n            # valid_time field as the end_time for EE purposes. Also, get the\n            # times into UTC timestrings.\n            start_time = _to_utc_timestring(da.time.values)\n            end_time = _to_utc_timestring(da.valid_time.values)\n\n            attrs['forecast_hour'] = forecast_hour  # Stick the forecast hour in the metadata as well, that's useful.\n            attrs['start_time'] = start_time\n            attrs['end_time'] = end_time\n\n            if group_common_hypercubes:\n                attrs['level'] = level  # Adding the level in the metadata, will remove in further steps.\n                attrs['is_normalized'] = True  # Adding the 'is_normalized' attribute in the metadata.\n\n            if level not in _level_data_dict:\n                _level_data_dict[level] = []\n\n            no_of_levels = da.shape[0] if _is_3d_da(da) else 1\n\n            # Deal with the randomness that is 3d data interspersed with 2d.\n            # For 3d data, we need to extract ds for each value of level.\n            for sub_c in range(no_of_levels):\n                copied_da = da.copy(deep=True)\n                height = copied_da.coords[level].data.flatten()[sub_c]\n\n                # Some heights are super small, but we can't have decimal points\n                # in channel names & schema fields for Earth Engine & BigQuery respectively , so mostly cut off the\n                # fractional part, unless we are forced to keep it. 
If so,\n                # replace the decimal point with yet another underscore.\n                if height >= 10:\n                    height_string = f'{height:.0f}'\n                else:\n                    height_string = f'{height:.2f}'.replace('.', '_')\n\n                channel_name = f'{level}_{height_string}_{attrs[\"GRIB_stepType\"]}_{key}'\n                logger.debug('Found channel %s', channel_name)\n\n                # Add the height as a metadata field, that seems useful.\n                copied_da.attrs['height'] = height_string\n\n                # Add the units of each band as a metadata field.\n                dv_units_dict['unit_'+channel_name] = None\n                if 'units' in attrs:\n                    dv_units_dict['unit_'+channel_name] = attrs['units']\n\n                copied_da.name = channel_name\n                if _is_3d_da(da):\n                    copied_da = copied_da.sel({level: height})\n                copied_da = copied_da.drop_vars(level)\n\n                _level_data_dict[level].append(copied_da)\n\n    _data_array_list = []\n    _data_array_list = [xr.merge(list_da) for list_da in _level_data_dict.values()]\n\n    if not group_common_hypercubes:\n        # Stick the forecast hour, start_time, end_time, data variables units\n        # in the ds attrs as well, that's useful.\n        ds_attrs['forecast_hour'] = _data_array_list[0].attrs['forecast_hour']\n        ds_attrs['start_time'] = _data_array_list[0].attrs['start_time']\n        ds_attrs['end_time'] = _data_array_list[0].attrs['end_time']\n        ds_attrs.update(**dv_units_dict)\n\n        merged_dataset = xr.merge(_data_array_list)\n        merged_dataset.attrs.clear()\n        merged_dataset.attrs.update(ds_attrs)\n        return merged_dataset\n\n    return _data_array_list\n\n\ndef __open_dataset_file(filename: str,\n                        uri_extension: str,\n                        disable_grib_schema_normalization: bool,\n                        
open_dataset_kwargs: t.Optional[t.Dict] = None,\n                        group_common_hypercubes: t.Optional[bool] = False) -> t.Union[xr.Dataset, t.List[xr.Dataset]]:\n    \"\"\"Opens the dataset at 'uri' and returns a xarray.Dataset.\"\"\"\n    # add a flag to group common hypercubes\n    if group_common_hypercubes:\n        return __normalize_grib_dataset(filename, group_common_hypercubes)\n\n    if open_dataset_kwargs:\n        return _add_is_normalized_attr(xr.open_dataset(filename, **open_dataset_kwargs), False)\n\n    # If URI extension is .tif, try opening file by specifying engine=\"rasterio\".\n    if uri_extension in ['.tif', '.tiff']:\n        return _add_is_normalized_attr(rioxarray.open_rasterio(filename, band_as_variable=True), False)\n\n    # If no open kwargs are available and URI extension is other than tif, make educated guesses about the dataset.\n    try:\n        return _add_is_normalized_attr(xr.open_dataset(filename), False)\n    except ValueError as e:\n        e_str = str(e)\n        if not (\"Consider explicitly selecting one of the installed engines\" in e_str and \"cfgrib\" in e_str):\n            raise\n\n    if not disable_grib_schema_normalization:\n        logger.warning(\"Assuming grib.\")\n        logger.info(\"Normalizing the grib schema, name of the data variables will look like \"\n                    \"'<level>_<height>_<attrs['GRIB_stepType']>_<key>'.\")\n        return _add_is_normalized_attr(__normalize_grib_dataset(filename), True)\n\n    # Trying with explicit engine for cfgrib.\n    try:\n        return _add_is_normalized_attr(\n            xr.open_dataset(filename, engine='cfgrib', backend_kwargs={'indexpath': ''}),\n            False)\n    except ValueError as e:\n        if \"multiple values for key 'edition'\" not in str(e):\n            raise\n    logger.warning(\"Assuming grib edition 1.\")\n    # Try with edition 1\n    # Note: picking edition 1 for now as it seems to get the most data/variables for ECMWF realtime 
data.\n    return _add_is_normalized_attr(\n        xr.open_dataset(filename, engine='cfgrib', backend_kwargs={'filter_by_keys': {'edition': 1}, 'indexpath': ''}),\n        False)\n\n\n@retry.with_exponential_backoff(\n    num_retries=NUM_RETRIES,\n    logger=logger.warning,\n    initial_delay_secs=INITIAL_DELAY,\n    max_delay_secs=MAX_DELAY\n)\ndef copy(src: str, dst: str) -> None:\n    \"\"\"Copy data via `gsutil` or local filesystem.\"\"\"\n    is_gs = src.startswith(\"gs://\") or dst.startswith(\"gs://\")\n    try:\n        if is_gs:\n            subprocess.run(['gcloud', 'storage', 'cp', src, dst], check=True,\n                           capture_output=True, text=True, input=\"n/n\")\n        else:\n            os.makedirs(os.path.dirname(dst) or '.', exist_ok=True)\n            shutil.copy(src, dst)\n    except Exception as e:\n        error_detail = getattr(e, \"stderr\", str(e)).strip()\n        msg = f\"Failed to copy {src!r} to {dst!r} due to {error_detail}\"\n        logger.error(msg)\n        raise EnvironmentError(msg) from e\n\n\n@contextlib.contextmanager\ndef open_local(uri: str) -> t.Iterator[str]:\n    \"\"\"Copy a cloud object (e.g. a netcdf, grib, or tif file) from cloud storage, like GCS, to local file.\"\"\"\n    with tempfile.NamedTemporaryFile() as dest_file:\n        # Transfer data with gsutil.\n        copy(uri, dest_file.name)\n\n        # Check if data is compressed. 
Decompress the data using the same methods that beam's\n        # FileSystems interface uses.\n        compression_type = FileSystem._get_compression_type(uri, CompressionTypes.AUTO)\n        if compression_type == CompressionTypes.UNCOMPRESSED:\n            yield dest_file.name\n            return\n\n        dest_file.seek(0)\n        with tempfile.NamedTemporaryFile() as dest_uncompressed:\n            with CompressedFile(open(dest_file.name, 'rb'), compression_type=compression_type) as dcomp:\n                shutil.copyfileobj(dcomp, dest_uncompressed, DEFAULT_READ_BUFFER_SIZE)\n                dest_uncompressed.seek(0)  # Reposition the file pointer to the start.\n                yield dest_uncompressed.name\n\n\n@contextlib.contextmanager\ndef open_dataset(uri: str,\n                 open_dataset_kwargs: t.Optional[t.Dict] = None,\n                 disable_grib_schema_normalization: bool = False,\n                 tif_metadata_for_start_time: t.Optional[str] = None,\n                 tif_metadata_for_end_time: t.Optional[str] = None,\n                 initialization_time_regex: t.Optional[str] = None,\n                 forecast_time_regex: t.Optional[str] = None,\n                 group_common_hypercubes: t.Optional[bool] = False,\n                 is_zarr: bool = False) -> t.Iterator[xr.Dataset]:\n    \"\"\"Open the dataset at 'uri' and return a xarray.Dataset.\"\"\"\n    try:\n        local_open_dataset_kwargs = start_date = end_date = None\n        if open_dataset_kwargs is not None:\n            local_open_dataset_kwargs = open_dataset_kwargs.copy()\n            start_date = local_open_dataset_kwargs.pop('start_date', None)\n            end_date = local_open_dataset_kwargs.pop('end_date', None)\n\n        if is_zarr:\n            ds: xr.Dataset = _add_is_normalized_attr(xr.open_dataset(uri, engine='zarr',\n                                                                     **local_open_dataset_kwargs), False)\n            if start_date is not None and 
end_date is not None:\n                ds = ds.sel(time=slice(start_date, end_date))\n            beam.metrics.Metrics.counter('Success', 'ReadNetcdfData').inc()\n            yield ds\n            ds.close()\n            return\n        with open_local(uri) as local_path:\n            _, uri_extension = os.path.splitext(uri)\n            xr_datasets: xr.Dataset = __open_dataset_file(local_path,\n                                                          uri_extension,\n                                                          disable_grib_schema_normalization,\n                                                          local_open_dataset_kwargs,\n                                                          group_common_hypercubes)\n            # Extracting dtype, crs and transform from the dataset.\n            rasterio_error = False\n            try:\n                with rasterio.open(local_path, 'r') as f:\n                    dtype, crs, transform = (f.profile.get(key) for key in ['dtype', 'crs', 'transform'])\n            except rasterio.errors.RasterioIOError:\n                rasterio_error = True\n                logger.warning('Cannot parse projection and data type information for Dataset %r.', uri)\n\n            if group_common_hypercubes:\n                total_size_in_bytes = 0\n\n                for xr_dataset in xr_datasets:\n                    if not rasterio_error:\n                        xr_dataset.attrs.update({'dtype': dtype, 'crs': crs, 'transform': transform})\n                    total_size_in_bytes += xr_dataset.nbytes\n\n                logger.info(f'opened dataset size: {total_size_in_bytes}')\n            else:\n                xr_dataset = xr_datasets\n                if start_date is not None and end_date is not None:\n                    xr_dataset = xr_datasets.sel(time=slice(start_date, end_date))\n                if uri_extension in ['.tif', '.tiff']:\n                    xr_dataset = _preprocess_tif(xr_dataset,\n                       
                          tif_metadata_for_start_time,\n                                                 tif_metadata_for_end_time,\n                                                 uri,\n                                                 initialization_time_regex,\n                                                 forecast_time_regex)\n\n                if not rasterio_error:\n                    # Extracting dtype, crs and transform from the dataset & storing them as attributes.\n                    xr_dataset.attrs.update({'dtype': dtype, 'crs': crs, 'transform': transform})\n                logger.info(f'opened dataset size: {xr_dataset.nbytes}')\n\n            beam.metrics.Metrics.counter('Success', 'ReadNetcdfData').inc()\n            yield xr_datasets if group_common_hypercubes else xr_dataset\n\n            # Releasing any resources linked to the object(s).\n            if group_common_hypercubes:\n                for xr_dataset in xr_datasets:\n                    xr_dataset.close()\n            else:\n                xr_dataset.close()\n\n    except Exception as e:\n        beam.metrics.Metrics.counter('Failure', 'ReadNetcdfData').inc()\n        logger.error(f'Unable to open file {uri!r}: {e}')\n        raise\n\n\ndef get_file_time(element: t.Any) -> int:\n    \"\"\"Calculates element file's write timestamp in UTC.\"\"\"\n    try:\n        element_parsed = urlparse(element)\n        if element_parsed.scheme == \"gs\":  # For file in Google cloud storage.\n            client = storage.Client()\n            bucket = client.get_bucket(element_parsed.netloc)\n            blob = bucket.get_blob(element_parsed.path[1:])\n\n            updated_time = int(blob.updated.timestamp())\n        else:  # For file in local.\n            file_stats = os.stat(element)\n            updated_time = int(time.mktime(time.gmtime(file_stats.st_mtime)))\n\n        return updated_time\n    except Exception as e:\n        raise ValueError(\n            f\"Error fetching raw data file 
bucket time for {element!r}. Error: {str(e)}\"\n        )\n"
  },
  {
    "path": "weather_mv/loader_pipeline/sinks_test.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport contextlib\nimport datetime\nfrom functools import wraps\nimport numpy as np\nimport os\nimport tempfile\nimport tracemalloc\nimport unittest\nimport xarray as xr\n\nimport weather_mv\nfrom .sinks import match_datetime, open_dataset\n\n\nclass TestDataBase(unittest.TestCase):\n    def setUp(self) -> None:\n        self.test_data_folder = f'{next(iter(weather_mv.__path__))}/test_data'\n\n\ndef _handle_missing_grib_be(f):\n    @wraps(f)\n    def decorated(*args, **kwargs):\n        try:\n            return f(*args, **kwargs)\n        except ValueError as e:\n            # Some setups may not have Cfgrib installed properly. 
Ignore tests for these cases.\n            e_str = str(e)\n            if \"Consider explicitly selecting one of the installed engines\" not in e_str or \"cfgrib\" in e_str:\n                raise\n\n    return decorated\n\n\n@contextlib.contextmanager\ndef limit_memory(max_memory=30):\n    ''''Measure memory consumption of the function.\n        'memory limit' in MB\n    '''\n    try:\n        tracemalloc.start()\n        yield\n    finally:\n        current, peak = tracemalloc.get_traced_memory()\n        tracemalloc.stop()\n        assert peak / 1024 ** 2 <= max_memory, f\"Memory usage {peak / 1024 ** 2} exceeded {max_memory} MB limit.\"\n\n\n@contextlib.contextmanager\ndef write_netcdf():\n    \"\"\"Generates temporary netCDF file using xarray.\"\"\"\n    lat_dim = 3210\n    lon_dim = 3440\n    lat = np.linspace(-90, 90, lat_dim)\n    lon = np.linspace(-180, 180, lon_dim)\n    data_arr = np.random.uniform(low=0, high=0.1, size=(5, lat_dim, lon_dim))\n\n    ds = xr.Dataset(\n        {\"var_1\": (('time', 'lat', 'lon'), data_arr)},\n        coords={\n            \"lat\": lat,\n            \"lon\": lon,\n        })\n    with tempfile.NamedTemporaryFile() as fp:\n        ds.to_netcdf(fp.name)\n        yield fp.name\n\n\nclass OpenDatasetTest(TestDataBase):\n\n    def setUp(self) -> None:\n        super().setUp()\n        self.test_data_path = os.path.join(self.test_data_folder, 'test_data_20180101.nc')\n        self.test_grib_path = os.path.join(self.test_data_folder, 'test_data_grib_single_timestep')\n        self.test_tif_path = os.path.join(self.test_data_folder, 'test_data_tif_time.tif')\n        self.test_zarr_path = os.path.join(self.test_data_folder, 'test_data.zarr')\n\n    def test_opens_grib_files(self):\n        with open_dataset(self.test_grib_path) as ds1:\n            self.assertIsNotNone(ds1)\n            self.assertDictContainsSubset({'is_normalized': True}, ds1.attrs)\n        with open_dataset(self.test_grib_path, 
disable_grib_schema_normalization=True) as ds2:\n            self.assertIsNotNone(ds2)\n            self.assertDictContainsSubset({'is_normalized': False}, ds2.attrs)\n\n    def test_accepts_xarray_kwargs(self):\n        with open_dataset(self.test_data_path) as ds1:\n            self.assertIn('d2m', ds1)\n            self.assertDictContainsSubset({'is_normalized': False}, ds1.attrs)\n        with open_dataset(self.test_data_path, {'drop_variables': 'd2m'}) as ds2:\n            self.assertNotIn('d2m', ds2)\n            self.assertDictContainsSubset({'is_normalized': False}, ds2.attrs)\n\n    def test_opens_tif_files(self):\n        with open_dataset(self.test_tif_path, tif_metadata_for_start_time='start_time',\n                            tif_metadata_for_end_time='end_time') as ds:\n            self.assertIsNotNone(ds)\n            self.assertDictContainsSubset({'is_normalized': False}, ds.attrs)\n\n    def test_opens_zarr(self):\n        with open_dataset(self.test_zarr_path, is_zarr=True, open_dataset_kwargs={}) as ds:\n            self.assertIsNotNone(ds)\n            self.assertEqual(list(ds.data_vars), ['cape', 'd2m'])\n\n    def test_open_dataset__fits_memory_bounds(self):\n        with write_netcdf() as test_netcdf_path:\n            with limit_memory(max_memory=30):\n                with open_dataset(test_netcdf_path) as _:\n                    pass\n\n    def test_group_common_hypercubes(self):\n        with open_dataset(self.test_grib_path,\n                          group_common_hypercubes=True) as ds:\n            self.assertEqual(isinstance(ds, list), True)\n\n\nclass DatetimeTest(unittest.TestCase):\n\n    def test_datetime_regex_string(self):\n        file_name = '3B-HHR-E_MS_MRG_3IMERG_20220901-S000000-E002959_0000_V06C_30min.tiff'\n\n        regex_str = '3B-HHR-E_MS_MRG_3IMERG_%Y%m%d-S%H%M%S-*.tiff'\n\n        expected = datetime.datetime.strptime('2022-09-01 00:00:00', '%Y-%m-%d %H:%M:%S')\n        actual = match_datetime(file_name, 
regex_str)\n\n        self.assertEqual(actual, expected)\n\n    def test_datetime_regex_string_with_missing_parameters(self):\n        file_name = '3B-HHR-E_MS_MRG_3IMERG_0901-S000000-E002959_0000_V06C_30min.tiff'\n\n        regex_str = '3B-HHR-E_MS_MRG_3IMERG_%m%d-S%H%M%S-*.tiff'\n\n        expected = datetime.datetime.strptime('1978-09-01 00:00:00', '%Y-%m-%d %H:%M:%S')\n        actual = match_datetime(file_name, regex_str)\n\n        self.assertEqual(actual, expected)\n\n    def test_datetime_regex_string_with_different_order(self):\n        file_name = '3B-HHR-E_MS_MRG_3IMERG_09012022-S000000-E002959_0000_V06C_30min.tiff'\n\n        regex_str = '3B-HHR-E_MS_MRG_3IMERG_%m%d%Y-S%H%M%S-*.tiff'\n\n        expected = datetime.datetime.strptime('2022-09-01 00:00:00', '%Y-%m-%d %H:%M:%S')\n        actual = match_datetime(file_name, regex_str)\n\n        self.assertEqual(actual, expected)\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "weather_mv/loader_pipeline/streaming.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Window and parse Pub/Sub streams of real-time weather data added to cloud storage.\n\nExample windowing code borrowed from:\n  https://cloud.google.com/pubsub/docs/pubsub-dataflow#code_sample\n\"\"\"\n\nimport datetime\nimport fnmatch\nimport json\nimport logging\nimport random\nimport typing as t\nfrom urllib.parse import urlparse\n\nimport apache_beam as beam\nfrom apache_beam.transforms.window import FixedWindows\n\nlogger = logging.getLogger(__name__)\n\n\nclass GroupMessagesByFixedWindows(beam.PTransform):\n    \"\"\"A composite transform that groups Pub/Sub messages based on publish time\n    and outputs a list of tuples, each containing a message and its publish time.\n    \"\"\"\n\n    def __init__(self, window_size: int, num_shards: int = 5):\n        # Set window size to 60 seconds.\n        self.window_size = int(window_size * 60)\n        self.num_shards = num_shards\n\n    def expand(self, pcoll):\n        return (\n                pcoll\n                # Bind window info to each element using element timestamp (or publish time).\n                | \"Window into fixed intervals\" >> beam.WindowInto(FixedWindows(self.window_size))\n                | \"Add timestamp to windowed elements\" >> beam.ParDo(AddTimestamp())\n                # Assign a random key to each windowed element based on the number of shards.\n                | \"Add key\" >> 
beam.WithKeys(lambda _: random.randint(0, self.num_shards - 1))\n                # Group windowed elements by key. All the elements in the same window must fit\n                # memory for this. If not, you need to use `beam.util.BatchElements`.\n                | \"Group by key\" >> beam.GroupByKey()\n        )\n\n\nclass AddTimestamp(beam.DoFn):\n    \"\"\"Processes each windowed element by extracting the message body and its\n    publish time into a tuple.\n    \"\"\"\n\n    def process(self, element, publish_time=beam.DoFn.TimestampParam) -> t.Iterable[t.Tuple[str, str]]:\n        yield (\n            element.decode(\"utf-8\"),\n            datetime.datetime.utcfromtimestamp(float(publish_time)).strftime(\n                \"%Y-%m-%d %H:%M:%S.%f\"\n            ),\n        )\n\n\nclass ParsePaths(beam.DoFn):\n    \"\"\"Parse paths to real-time weather data from windowed-batches.\"\"\"\n\n    def __init__(self, uri_pattern: str):\n        self.uri_pattern = uri_pattern\n        self.protocol = f'{urlparse(uri_pattern).scheme}://'\n        super().__init__()\n\n    @classmethod\n    def try_parse_message(cls, message_body: t.Union[str, t.Dict]) -> t.Dict:\n        \"\"\"Robustly parse message body, which will be JSON in the vast majority of cases,\n         but might be a dictionary.\"\"\"\n        try:\n            return json.loads(message_body)\n        except (json.JSONDecodeError, TypeError):\n            if isinstance(message_body, dict):\n                return message_body\n            raise\n\n    def to_object_path(self, payload: t.Dict) -> str:\n        \"\"\"Parse cloud object from Pub/Sub topic payload.\"\"\"\n        return f'{self.protocol}{payload[\"bucket\"]}/{payload[\"name\"]}'\n\n    def should_skip(self, message_body: t.Dict) -> bool:\n        \"\"\"Returns true if Pub/Sub topic does *not* match the target file URI pattern.\"\"\"\n        try:\n            return not fnmatch.fnmatch(self.to_object_path(message_body), self.uri_pattern)\n        
except KeyError:\n            return True\n\n    def process(self, key_value, window=beam.DoFn.WindowParam) -> t.Iterable[str]:\n        \"\"\"Yield paths to real-time weather data in cloud storage.\"\"\"\n\n        shard_id, batch = key_value\n\n        logger.debug(f'Processing shard {shard_id!r}.')\n\n        for message_body, publish_time in batch:\n            logger.debug(message_body)\n\n            parsed_msg = self.try_parse_message(message_body)\n\n            target = self.to_object_path(parsed_msg)\n            logger.info(f'Parsed path {target!r}...')\n\n            if self.should_skip(parsed_msg):\n                logger.info(f'skipping {target!r}.')\n                continue\n\n            yield target\n"
  },
  {
    "path": "weather_mv/loader_pipeline/streaming_test.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport json\nimport unittest\n\nfrom .streaming import ParsePaths\n\n\nclass ParsePathsTests(unittest.TestCase):\n    def setUp(self) -> None:\n        self.parser = ParsePaths('gs://XXXX/tmp/*')\n        self.test_input = \"\"\"{\"bucket\": \"XXXX\", \"name\": \"tmp/T1D10091200101309001\"}\"\"\"\n\n        media_link = 'https://www.googleapis.com/download/storage/v1/b/XXXX/o/tmp%2FT1D10091200101309001?generation' \\\n                     '=1635366553038121&alt=media'\n        self.real_input = f\"\"\"\n        {{\n            \"kind\": \"storage#object\",\n            \"id\": \"XXXX/tmp/T1D10091200101309001/1635366553038121\",\n            \"selfLink\": \"https://www.googleapis.com/storage/v1/b/XXXX/o/tmp%2FT1D10091200101309001\",\n            \"name\": \"tmp/T1D10091200101309001\",\n            \"bucket\": \"XXXX\",\n            \"generation\": \"1635366553038121\",\n            \"metageneration\": \"1\",\n            \"contentType\": \"application/octet-stream\",\n            \"timeCreated\": \"2021-10-27T20:29:13.152Z\",\n            \"updated\": \"2021-10-27T20:29:13.152Z\",\n            \"storageClass\": \"STANDARD\",\n            \"timeStorageClassUpdated\": \"2021-10-27T20:29:13.152Z\",\n            \"size\": \"9725508\",\n            \"md5Hash\": \"qrMcuK4nTr9uCD7aJJqtkA==\",\n            \"mediaLink\": \"{media_link}\",\n            \"crc32c\": 
\"zKlm3w==\",\n            \"etag\": \"CKna4pO36/MCEAE=\"\n        }}\"\"\"\n\n    def test_parse_message(self):\n        actual = ParsePaths.try_parse_message(self.test_input)\n        self.assertEqual(actual, {'bucket': 'XXXX', 'name': 'tmp/T1D10091200101309001'})\n\n    def test_parse_message__already_is_dict(self):\n        actual = ParsePaths.try_parse_message({'bucket': 'XXXX', 'name': 'tmp/T1D10091200101309001'})\n        self.assertEqual(actual, {'bucket': 'XXXX', 'name': 'tmp/T1D10091200101309001'})\n\n    def test_parse_message__bad_json(self):\n        with self.assertRaises(json.JSONDecodeError):\n            ParsePaths.try_parse_message(\"\"\"{\"foo\": 1, \"bar\": 2\"\"\")\n\n    def test_parse_message__round_trip(self):\n        parsed = ParsePaths.try_parse_message(self.real_input)\n        converted = json.dumps(parsed)\n        re_parsed = ParsePaths.try_parse_message(converted)\n\n        self.assertEqual(parsed, re_parsed)\n\n    def test_should_skip(self):\n        parsed = self.parser.try_parse_message(self.real_input)\n        self.assertFalse(self.parser.should_skip(parsed))\n\n    def test_should_skip__missing_values(self):\n        missing_field = \"\"\"{\"name\": \"tmp/T1D10091200101309001\"}\"\"\"\n        parsed = self.parser.try_parse_message(missing_field)\n        self.assertTrue(self.parser.should_skip(parsed))\n\n    def test_should_skip__mismatch_pattern(self):\n        self.parser = ParsePaths('gs://XXXX/foo/*')\n        parsed = self.parser.try_parse_message(self.test_input)\n        self.assertTrue(self.parser.should_skip(parsed))\n"
  },
  {
    "path": "weather_mv/loader_pipeline/util.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport abc\nimport datetime\nimport inspect\nimport itertools\nimport json\nimport logging\nimport math\nimport re\nimport signal\nimport sys\nimport tempfile\nimport time\nimport traceback\nimport typing as t\nimport uuid\nfrom functools import partial\nfrom urllib.parse import urlparse\nimport apache_beam as beam\nimport numpy as np\nimport pandas as pd\nimport xarray as xr\nfrom google.api_core.exceptions import BadRequest\nfrom google.api_core.exceptions import NotFound\nfrom google.cloud import bigquery, storage\nfrom xarray.core.utils import ensure_us_time_resolution\n\nfrom .sinks import DEFAULT_COORD_KEYS\nfrom .metrics import timeit\n\nlogger = logging.getLogger(__name__)\n\nCANARY_BUCKET_NAME = 'anthromet_canary_bucket'\nCANARY_RECORD = {'foo': 'bar'}\nCANARY_RECORD_FILE_NAME = 'canary_record.json'\nCANARY_OUTPUT_TABLE_SUFFIX = '_anthromet_canary_table'\nCANARY_TABLE_SCHEMA = [bigquery.SchemaField('name', 'STRING', mode='NULLABLE')]\nBQ_EXCLUDE_COORDS = {'longitude', 'latitude'}\n\n\ndef make_attrs_ee_compatible(attrs: t.Dict) -> t.Dict:\n    \"\"\"Scans EEE asset attributes and makes them EE compatible.\n\n    EE asset attribute names must contain only the following characters: A..Z,\n    a..z, 0..9 or '_'. Maximum length is 110 characters. 
Attribute values must be\n    string or number.\n\n    If an attribute name is more than 110 characters, it will consider the first\n    110 characters as the attribute name.\n    \"\"\"\n    new_attrs = {}\n\n    for k, v in attrs.items():\n        if len(k) > 110:  # Truncate attribute name with > 110 characters.\n            k = k[:110]\n        # Replace unaccepted characters with underscores.\n        k = re.sub(r'[^a-zA-Z0-9-_]+', r'_', k)\n\n        if type(v) not in [int, float]:\n            v = str(v)\n            if len(v) > 1024:\n                v = f'{v[:1021]}...'  # Since 1 char = 1 byte.\n\n        v = to_json_serializable_type(v)\n        new_attrs[k] = v\n\n    return new_attrs\n\n\n# TODO(#245): Group with common utilities (duplicated)\ndef to_json_serializable_type(value: t.Any) -> t.Any:\n    \"\"\"Returns the value with a type serializable to JSON\"\"\"\n    # Note: The order of processing is significant.\n    logger.debug('Serializing to JSON')\n\n    # pd.isna() returns ndarray if input is not scalar therefore checking if value is scalar.\n    if (np.isscalar(value) and pd.isna(value)) or value is None:\n        return None\n    elif np.issubdtype(type(value), np.floating):\n        return float(value)\n    elif isinstance(value, set):\n        value = list(value)\n        return np.where(pd.isna(value), None, value).tolist()\n    elif isinstance(value, np.ndarray):\n        # Will return a scalar if array is of size 1, else will return a list.\n        # Replace all NaNs, NaTs with None.\n        return np.where(pd.isna(value), None, value).tolist()\n    elif isinstance(value, datetime.datetime) or isinstance(value, str) or isinstance(value, np.datetime64):\n        # Assume strings are ISO format timestamps...\n        try:\n            value = datetime.datetime.fromisoformat(value)\n        except ValueError:\n            # ... 
if they are not, assume serialization is already correct.\n            return value\n        except TypeError:\n            # ... maybe value is a numpy datetime ...\n            try:\n                value = ensure_us_time_resolution(value).astype(datetime.datetime)\n            except AttributeError:\n                # ... value is a datetime object, continue.\n                pass\n\n        # We use a string timestamp representation.\n        if value.tzname():\n            return value.isoformat()\n\n        # We assume here that naive timestamps are in UTC timezone.\n        return value.replace(tzinfo=datetime.timezone.utc).isoformat()\n    elif isinstance(value, datetime.timedelta):\n        return value.total_seconds()\n    elif isinstance(value, np.timedelta64):\n        # Return time delta in seconds.\n        return float(value / np.timedelta64(1, 's'))\n    # This check must happen after processing np.timedelta64 and np.datetime64.\n    elif np.issubdtype(type(value), np.integer):\n        return int(value)\n\n    return value\n\n\ndef _check_for_coords_vars(ds_data_var: str, target_var: str) -> bool:\n    \"\"\"Checks if the dataset's data variable matches with the target variables (or coordinates)\n    specified by the user.\"\"\"\n    return ds_data_var.endswith('_'+target_var) or ds_data_var.startswith(target_var+'_')\n\ndef get_utc_timestamp() -> float:\n    \"\"\"Returns the current UTC Timestamp.\"\"\"\n    return datetime.datetime.now().timestamp()\n\ndef _only_target_coordinate_vars(ds: xr.Dataset, data_vars: t.List[str]) -> t.List[str]:\n    \"\"\"If the user specifies target fields in the dataset, get all the matching coords & data vars.\"\"\"\n\n    # If the dataset is not the merged dataset (created for normalizing grib's schema),\n    # return the target fields specified by the user as it is.\n    if not ds.attrs['is_normalized']:\n        return data_vars\n\n    keep_coords_vars = []\n\n    for dv in data_vars:\n        
keep_coords_vars.extend([v for v in ds.data_vars if _check_for_coords_vars(v, dv)])\n        keep_coords_vars.extend([v for v in DEFAULT_COORD_KEYS if v in dv])\n\n    return keep_coords_vars\n\n\ndef _only_target_vars(ds: xr.Dataset, data_vars: t.Optional[t.List[str]] = None) -> xr.Dataset:\n    \"\"\"If the user specifies target fields in the dataset, create a schema only from those fields.\"\"\"\n\n    # If there are no restrictions on data vars, include the whole dataset.\n    if not data_vars:\n        logger.info(f'target data_vars empty; using whole dataset; size: {ds.nbytes}')\n        return ds\n\n    if not ds.attrs['is_normalized']:\n        assert all([dv in ds.data_vars or dv in ds.coords for dv in data_vars]), 'Target variable must be in original '\\\n                                                                                'dataset. '\n\n        dropped_ds = ds.drop_vars([v for v in ds.data_vars if v not in data_vars])\n    else:\n        drop_vars = []\n        check_target_variable = []\n        for dv in data_vars:\n            searched_data_vars = [_check_for_coords_vars(v, dv) for v in ds.data_vars]\n            searched_coords = [] if dv not in DEFAULT_COORD_KEYS else [dv in ds.coords]\n            check_target_variable.append(any(searched_data_vars) or any(searched_coords))\n\n        assert all(check_target_variable), 'Target variable must be in original dataset.'\n\n        for v in ds.data_vars:\n            searched_data_vars = [_check_for_coords_vars(v, dv) for dv in data_vars]\n            if not any(searched_data_vars):\n                drop_vars.append(v)\n\n        dropped_ds = ds.drop_vars(drop_vars)\n\n    logger.info(f'target-only dataset size: {dropped_ds.nbytes}')\n\n    return dropped_ds\n\n\ndef ichunked(iterable: t.Iterable, n: int) -> t.Iterator[t.Iterable]:\n    \"\"\"Yield evenly-sized chunks from an iterable.\"\"\"\n    input_ = iter(iterable)\n    try:\n        while True:\n            it = itertools.islice(input_, 
n)\n            # peek to check if 'it' has next item.\n            first = next(it)\n            yield itertools.chain([first], it)\n    except StopIteration:\n        pass\n\n\ndef get_coordinates(ds: xr.Dataset, uri: str = '') -> t.Iterator[t.Dict]:\n    \"\"\"Generates normalized coordinate dictionaries that can be used to index Datasets with `.loc[]`.\"\"\"\n    # Creates flattened iterator of all coordinate positions in the Dataset.\n    #\n    # Example: (datetime.datetime('2018-01-02T22:00:00+00:00'))\n\n    # Filter out excluded coordinates from coords_indexes.\n    filtered_coords_indexes = [c for c in ds.coords.indexes if c not in BQ_EXCLUDE_COORDS]\n\n    # Filter out excluded coordinates from coords_dims.\n    filtered_coords_dims = {c: ds.coords.dims [c] for c in ds.coords.dims if c not in BQ_EXCLUDE_COORDS}\n    coords = itertools.product(\n        *(\n            (\n                v\n                for v in ensure_us_time_resolution(ds[c].variable.values).tolist()\n            )\n            for c in filtered_coords_indexes\n        )\n    )\n    # Give dictionary keys to a coordinate index.\n    #\n    # Example:\n    #   {'time': datetime.datetime('2018-01-02T23:00:00+00:00')}\n    idx = 0\n    total_coords = math.prod(filtered_coords_dims.values())\n    for idx, it in enumerate(coords):\n        logger.info(f'Processed {idx+1} / {total_coords} coordinates for {uri!r}...')\n        yield dict(zip(filtered_coords_indexes, it))\n\n    logger.info(f'Finished processing all {idx+1} coordinates.')\n\n\ndef _cleanup_bigquery(bigquery_client: bigquery.Client,\n                      canary_output_table: str,\n                      sig: t.Optional[t.Any] = None,\n                      frame: t.Optional[t.Any] = None) -> None:\n    \"\"\"Deletes the bigquery table.\"\"\"\n    if bigquery_client:\n        bigquery_client.delete_table(canary_output_table, not_found_ok=True)\n    if sig:\n        traceback.print_stack(frame)\n        sys.exit(0)\n\n\ndef 
_cleanup_bucket(storage_client: storage.Client,\n                    canary_bucket_name: str,\n                    sig: t.Optional[t.Any] = None,\n                    frame: t.Optional[t.Any] = None) -> None:\n    \"\"\"Deletes the bucket.\"\"\"\n    try:\n        storage_client.get_bucket(canary_bucket_name).delete(force=True)\n    except NotFound:\n        pass\n    if sig:\n        traceback.print_stack(frame)\n        sys.exit(0)\n\n\ndef validate_region(output_table: t.Optional[str] = None,\n                    temp_location: t.Optional[str] = None,\n                    region: t.Optional[str] = None) -> None:\n    \"\"\"Validates against incompatible-region scenarios by performing a sanity check.\"\"\"\n    if not region and not temp_location:\n        raise ValueError('Invalid GCS location: None.')\n\n    bucket_region = region\n    storage_client = storage.Client()\n    canary_bucket_name = CANARY_BUCKET_NAME + str(uuid.uuid4())\n\n    # Clean up if the operation gets cut off midway.\n    # TODO: Should we handle some other signals?\n    do_bucket_cleanup = partial(_cleanup_bucket, storage_client, canary_bucket_name)\n    original_sigint_handler = signal.getsignal(signal.SIGINT)\n    original_sigtstp_handler = signal.getsignal(signal.SIGTSTP)\n    signal.signal(signal.SIGINT, do_bucket_cleanup)\n    signal.signal(signal.SIGTSTP, do_bucket_cleanup)\n\n    if output_table:\n        table_region = None\n        bigquery_client = bigquery.Client()\n        canary_output_table = output_table + CANARY_OUTPUT_TABLE_SUFFIX + str(uuid.uuid4())\n        do_bigquery_cleanup = partial(_cleanup_bigquery, bigquery_client, canary_output_table)\n        signal.signal(signal.SIGINT, do_bigquery_cleanup)\n        signal.signal(signal.SIGTSTP, do_bigquery_cleanup)\n\n    if temp_location:\n        parsed_temp_location = urlparse(temp_location)\n        if parsed_temp_location.scheme != 'gs' or parsed_temp_location.netloc == '':\n            raise ValueError(f'Invalid GCS location: 
{temp_location!r}.')\n        bucket_name = parsed_temp_location.netloc\n        bucket_region = storage_client.get_bucket(bucket_name).location\n\n    try:\n        bucket = storage_client.create_bucket(canary_bucket_name, location=bucket_region)\n        with tempfile.NamedTemporaryFile(mode='w+') as temp:\n            json.dump(CANARY_RECORD, temp)\n            temp.flush()\n            blob = bucket.blob(CANARY_RECORD_FILE_NAME)\n            blob.upload_from_filename(temp.name)\n\n        if output_table:\n            table = bigquery.Table(canary_output_table, schema=CANARY_TABLE_SCHEMA)\n            table = bigquery_client.create_table(table, exists_ok=True)\n            table_region = table.location\n\n            load_job = bigquery_client.load_table_from_uri(\n                f'gs://{canary_bucket_name}/{CANARY_RECORD_FILE_NAME}',\n                canary_output_table,\n            )\n            load_job.result()\n    except BadRequest:\n        if output_table:\n            raise RuntimeError(f'Can\\'t migrate from source: {bucket_region} to destination: {table_region}')\n        raise RuntimeError(f'Can\\'t upload to destination: {bucket_region}')\n    finally:\n        _cleanup_bucket(storage_client, canary_bucket_name)\n        if output_table:\n            _cleanup_bigquery(bigquery_client, canary_output_table)\n        signal.signal(signal.SIGINT, original_sigint_handler)\n        signal.signal(signal.SIGINT, original_sigtstp_handler)\n\n\ndef _shard(elem, num_shards: int):\n    return (np.random.randint(0, num_shards), elem)\n\n\nclass Shard(beam.DoFn):\n    \"\"\"DoFn to shard elements into groups.\"\"\"\n    def __init__(self, num_shards: int, use_metrics: bool):\n        super().__init__()\n        self.num_shards = num_shards\n        self.use_metrics = use_metrics\n\n    @timeit('Sharding', keyed_fn=True)\n    def process(self, element, *args, **kwargs):\n        yield _shard(element, num_shards=self.num_shards)\n\nclass 
RateLimit(beam.PTransform, abc.ABC):\n    \"\"\"PTransform to extend to apply a global rate limit to an operation.\n\n    The input PCollection can be of any type and the output will be whatever is\n    returned by the `process` method.\n    \"\"\"\n\n    def __init__(self,\n                 global_rate_limit_qps: int,\n                 latency_per_request: float,\n                 max_concurrent_requests: int,\n                 use_metrics: bool):\n        \"\"\"Creates a RateLimit object.\n\n        global_rate_limit_qps and latency_per_request are used to determine how the\n        data should be sharded via:\n        global_rate_limit_qps * latency_per_request.total_seconds()\n\n        For example, global_rate_limit_qps = 500 and latency_per_request=.5 seconds.\n        Then the data will be sharded into 500*.5=250 groups.  Each group can be\n        processed in parallel and will call the 'process' function at most once\n        every latency_per_request.\n\n        It is important to note that the max QPS may not be reached based on how many\n        workers are scheduled.\n\n        Args:\n            global_rate_limit_qps: QPS to rate limit requests across all workers to.\n            latency_per_request: The expected latency per request.\n            max_concurrent_requests: Maximum allowed concurrent api requests to EE.\n        \"\"\"\n\n        self._rate_limit = global_rate_limit_qps\n        self._latency_per_request = datetime.timedelta(seconds=latency_per_request)\n        self._num_shards = max(1, min(int(self._rate_limit * self._latency_per_request.total_seconds()),\n                                      max_concurrent_requests))\n        self.use_metrics = use_metrics\n\n    @abc.abstractmethod\n    def process(self, elem: t.Any):\n        \"\"\"Process is the operation that will be rate limited.\n\n        Results will be yielded each time the process method is called.\n\n        Args:\n            elem: The individual element to 
process.\n\n        Returns:\n            Output can be anything, output will be the output of the RateLimit\n            PTransform.\n        \"\"\"\n        pass\n\n    def expand(self, pcol: beam.PCollection):\n        return (pcol\n                | beam.ParDo(Shard(num_shards=self._num_shards, use_metrics=self.use_metrics))\n                | beam.GroupByKey()\n                | beam.ParDo(\n                    _RateLimitDoFn(self.process, self._latency_per_request)))\n\n\nclass _RateLimitDoFn(beam.DoFn):\n    \"\"\"DoFn that ratelimits calls to rate_limit_fn.\"\"\"\n\n    def __init__(self, rate_limit_fn: t.Callable, wait_time: datetime.timedelta):\n        self._rate_limit_fn = rate_limit_fn\n        self._wait_time = wait_time\n        self._is_generator = inspect.isgeneratorfunction(self._rate_limit_fn)  # type: ignore\n\n    def process(self, keyed_elem: t.Tuple[t.Any, t.Iterable[t.Any]]):\n        shard, elems = keyed_elem\n        logger.info(f'processing shard: {shard}')\n\n        start_time = datetime.datetime.now()\n        end_time = None\n        for elem in elems:\n            if end_time is not None and (end_time - start_time) < self._wait_time:\n                logger.info(f'previous operation took: {(end_time - start_time).total_seconds()}')\n                wait_time = (self._wait_time - (end_time - start_time))\n                logger.info(f'wating: {wait_time.total_seconds()}')\n                time.sleep(wait_time.total_seconds())\n            start_time = datetime.datetime.now()\n            if self._is_generator:\n                yield from self._rate_limit_fn(elem)\n            else:\n                yield self._rate_limit_fn(elem)\n            end_time = datetime.datetime.now()\n"
  },
  {
    "path": "weather_mv/loader_pipeline/util_test.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nimport itertools\nimport unittest\nfrom collections import Counter\nfrom datetime import datetime, timezone, timedelta\n\nimport xarray\nimport xarray as xr\nimport numpy as np\n\nfrom .sinks_test import TestDataBase\nfrom .util import (\n    get_coordinates,\n    ichunked,\n    make_attrs_ee_compatible,\n    to_json_serializable_type,\n)\n\n\nclass GetCoordinatesTest(TestDataBase):\n    def setUp(self) -> None:\n        super().setUp()\n        self.test_data_path = f'{self.test_data_folder}/test_data_20180101.nc'\n\n    def test_gets_indexed_coordinates(self):\n        ds = xr.open_dataset(self.test_data_path)\n        self.assertEqual(\n            next(get_coordinates(ds)),\n            {'time': datetime.fromisoformat('2018-01-02T06:00:00+00:00').replace(tzinfo=None)}\n        )\n\n    def test_no_duplicate_coordinates(self):\n        ds = xr.open_dataset(self.test_data_path)\n\n        # Assert that all the coordinates are unique.\n        counts = Counter([tuple(c.values()) for c in get_coordinates(ds)])\n        self.assertTrue(all((c == 1 for c in counts.values())))\n\n\nclass IChunksTests(TestDataBase):\n    def setUp(self) -> None:\n        super().setUp()\n        test_data_path = f'{self.test_data_folder}/test_data_20180101.nc'\n        self.items = range(20)\n        self.coords = get_coordinates(xarray.open_dataset(test_data_path), test_data_path)\n\n    
def test_even_chunks(self):\n        actual = []\n        for chunk in ichunked(self.items, 4):\n            actual.append(list(chunk))\n\n        self.assertEqual(actual, [\n            [0, 1, 2, 3],\n            [4, 5, 6, 7],\n            [8, 9, 10, 11],\n            [12, 13, 14, 15],\n            [16, 17, 18, 19],\n        ])\n\n    def test_odd_chunks(self):\n        actual = []\n        for chunk in ichunked(self.items, 7):\n            actual.append(list(chunk))\n\n        self.assertEqual(actual, [\n            [0, 1, 2, 3, 4, 5, 6],\n            [7, 8, 9, 10, 11, 12, 13],\n            [14, 15, 16, 17, 18, 19]\n        ])\n\n    def test_get_coordinates(self):\n        actual = []\n        for chunk in ichunked(itertools.islice(self.coords, 4), 3):\n            actual.append(list(chunk))\n\n        self.assertEqual(\n            actual,\n            [\n                [\n                    {\n                        'time': datetime.fromisoformat('2018-01-02T06:00:00+00:00').replace(tzinfo=None)\n                    },\n                    {\n                        'time': datetime.fromisoformat('2018-01-02T07:00:00+00:00').replace(tzinfo=None)\n                    },\n                    {\n                        'time': datetime.fromisoformat('2018-01-02T08:00:00+00:00').replace(tzinfo=None)\n                    },\n                ],\n                [\n                    {\n                        'time': datetime.fromisoformat('2018-01-02T09:00:00+00:00').replace(tzinfo=None)\n                     }\n                ]\n            ]\n        )\n\n\nclass MakeAttrsEeCompatibleTests(TestDataBase):\n    def test_make_attrs_ee_compatible_a1(self):\n        attrs = {\n            'int_attr': 48,\n            'float_attr': 48.48,\n            'str_attr': '48.48',\n            'str_long_attr': 'Lorem ipsum dolor sit amet, consectetur '\n            'adipiscing elit. Fusce bibendum odio ac lorem tristique, sed '\n            'tincidunt orci ultricies. 
Vivamus eu rhoncus metus. Praesent '\n            'vitae imperdiet sapien. Donec vel ipsum sapien. Aliquam '\n            'suscipit suscipit turpis, a vehicula neque. Maecenas '\n            'hendrerit, mauris eu consequat aliquam, nunc elit lacinia '\n            'elit, vel accumsan ipsum ex a tellus. Pellentesque habitant '\n            'morbi tristique senectus et netus et malesuada fames ac '\n            'turpis egestas. Fusce a felis vel dolor lobortis vestibulum '\n            'ac ac velit. Etiam vitae nibh sed justo hendrerit feugiat. '\n            'Sed vulputate, turpis eget fringilla euismod, urna magna '\n            'consequat turpis, at aliquam metus dolor vel tortor. Sed sit '\n            'amet dolor quis libero venenatis porttitor a non odio. Morbi '\n            'interdum tellus non neque placerat, vel fermentum turpis '\n            'bibendum. In efficitur nunc ac leo eleifend commodo. Maecenas '\n            'in tincidunt diam. In consectetur eget sapien a suscipit. '\n            'Nulla porttitor ullamcorper tellus sit amet ornare. Aliquam '\n            'in nibh at mauris tincidunt bibendum a a elit.',\n            'bool_attr': True,\n            'none_attr': None,\n            'key_long_raesent_id_tincidunt_velit_Integer_eget_sapien_tincidunt_'\n            'iaculis_nulla_vitae_consectetur_metus_Vestibul': 'long_string'\n        }\n        expected = {\n            'int_attr': 48,\n            'float_attr': 48.48,\n            'str_attr': '48.48',\n            'str_long_attr': 'Lorem ipsum dolor sit amet, consectetur '\n            'adipiscing elit. Fusce bibendum odio ac lorem tristique, sed '\n            'tincidunt orci ultricies. Vivamus eu rhoncus metus. Praesent '\n            'vitae imperdiet sapien. Donec vel ipsum sapien. Aliquam '\n            'suscipit suscipit turpis, a vehicula neque. Maecenas '\n            'hendrerit, mauris eu consequat aliquam, nunc elit lacinia '\n            'elit, vel accumsan ipsum ex a tellus. 
Pellentesque habitant '\n            'morbi tristique senectus et netus et malesuada fames ac '\n            'turpis egestas. Fusce a felis vel dolor lobortis vestibulum '\n            'ac ac velit. Etiam vitae nibh sed justo hendrerit feugiat. '\n            'Sed vulputate, turpis eget fringilla euismod, urna magna '\n            'consequat turpis, at aliquam metus dolor vel tortor. Sed sit '\n            'amet dolor quis libero venenatis porttitor a non odio. Morbi '\n            'interdum tellus non neque placerat, vel fermentum turpis '\n            'bibendum. In efficitur nunc ac leo eleifend commodo. Maecenas '\n            'in tincidunt diam. In consectetur eget sapien a suscipit. '\n            'Nulla porttitor ullamcorper tellus sit amet ornare. Aliquam '\n            'in nibh at mauris tincidunt bibendum a a ...',\n            'bool_attr': 'True',\n            'none_attr': 'None',\n            'key_long_raesent_id_tincidunt_velit_Integer_eget_sapien_tincidunt_'\n            'iaculis_nulla_vitae_consectetur_metus_Vestib': 'long_string'\n        }\n\n        actual = make_attrs_ee_compatible(attrs)\n\n        self.assertDictEqual(actual, expected)\n\n    def test_make_attrs_ee_compatible_a2(self):\n        attrs = {\n            'list_attr': ['attr1', 'attr1'],\n            'tuple_attr': ('attr1', 'attr2'),\n            'dict_attr': {\n                'attr1': 1,\n                'attr2': 'two',\n                'attr3': 3.0,\n                'attr4': True\n            }\n        }\n        expected = {\n            'list_attr': \"['attr1', 'attr1']\",\n            'tuple_attr': \"('attr1', 'attr2')\",\n            'dict_attr': \"{'attr1': 1, 'attr2': 'two', 'attr3': 3.0, \"\n            \"'attr4': True}\"\n        }\n\n        actual = make_attrs_ee_compatible(attrs)\n\n        self.assertDictEqual(actual, expected)\n\n\nclass ToJsonSerializableTypeTests(unittest.TestCase):\n\n    def _convert(self, value):\n        return 
to_json_serializable_type(value)\n\n    def test_to_json_serializable_type_none(self):\n        self.assertIsNone(self._convert(None))\n        self.assertIsNone(self._convert(float('NaN')))\n        self.assertIsNone(self._convert(np.NaN))\n        self.assertIsNone(self._convert(np.datetime64('NaT')))\n        self.assertIsNotNone(self._convert(np.array([])))\n\n    def test_to_json_serializable_type_float(self):\n        self.assertIsInstance(self._convert(np.float32('0.1')), float)\n        self.assertIsInstance(self._convert(np.float32('1')), float)\n        self.assertIsInstance(self._convert(np.float16('0.1')), float)\n        self.assertIsInstance(self._convert(np.single('0.1')), float)\n        self.assertIsInstance(self._convert(np.double('0.1')), float)\n        self.assertNotIsInstance(self._convert(1), float)\n        self.assertNotIsInstance(self._convert(np.csingle('0.1')), float)\n        self.assertNotIsInstance(self._convert(np.cdouble('0.1')), float)\n        self.assertNotIsInstance(self._convert(np.intc('1')), float)\n\n    def test_to_json_serializable_type_int(self):\n        self.assertIsInstance(self._convert(np.int16('1')), int)\n        self.assertEqual(self._convert(np.int16('1')), int(1))\n        self.assertEqual(self._convert(np.int32('1')), int(1))\n        self.assertEqual(self._convert(np.int64('1')), int(1))\n        self.assertEqual(self._convert(np.int64(-10_000)), -10_000)\n        self.assertEqual(self._convert(np.uint16(25)), 25)\n\n    def test_to_json_serializable_type_set(self):\n        self.assertEqual(self._convert(set({})), [])\n        self.assertEqual(self._convert(set({1, 2, 3})), [1, 2, 3])\n        self.assertEqual(self._convert(set({1})), [1])\n        self.assertEqual(self._convert(set({None})), [None])\n        self.assertEqual(self._convert(set({float('NaN')})), [None])\n\n    def test_to_json_serializable_type_ndarray(self):\n        self.assertIsInstance(self._convert(np.array(list(range(10)))), list)\n      
  self.assertEqual(self._convert(np.array(list(range(10)))), list(range(10)))\n        self.assertEqual(self._convert(np.array([1])), [1])\n        self.assertEqual(self._convert(np.array([[1, 2, 3], [4, 5, 6]])), [[1, 2, 3], [4, 5, 6]])\n        self.assertEqual(self._convert(np.array(1)), 1)\n\n    def test_to_json_serializable_type_datetime(self):\n        input_date = '2000-01-01T00:00:00+00:00'\n        now = datetime.now()\n\n        self.assertEqual(self._convert(datetime.fromisoformat(input_date)), input_date)\n        self.assertEqual(self._convert(now), now.replace(tzinfo=timezone.utc).isoformat())\n        self.assertEqual(self._convert(input_date), input_date)\n        self.assertEqual(self._convert(np.datetime64(input_date)), input_date)\n        self.assertEqual(self._convert(np.datetime64(1, 'Y')), '1971-01-01T00:00:00+00:00')\n        self.assertEqual(self._convert(np.datetime64(30, 'Y')), input_date)\n        self.assertEqual(self._convert(np.timedelta64(1, 'm')), float(60))\n        self.assertEqual(self._convert(timedelta(seconds=1)), float(1))\n        self.assertEqual(self._convert(timedelta(minutes=1)), float(60))\n        self.assertEqual(self._convert(timedelta(days=1)), float(86400))\n"
  },
  {
    "path": "weather_mv/setup.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Setup weather-mv.\n\nThis setup.py script makes use of Apache Beam's recommended way to install non-python dependencies to worker images.\nThis is employed to enable a portable installation of cfgrib, which requires ecCodes.\n\nPlease see this documentation and example code:\n- https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/#nonpython\n- https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/complete/juliaset/setup.py\n\"\"\"\n\nfrom setuptools import setup, find_packages\n\nbeam_gcp_requirements = [\n    \"google-cloud-bigquery==2.34.4\",\n    \"google-cloud-bigquery-storage==2.14.1\",\n    \"google-cloud-bigtable==1.7.2\",\n    \"google-cloud-core==1.7.3\",\n    \"google-cloud-datastore==1.15.5\",\n    \"google-cloud-dlp==3.8.0\",\n    \"google-cloud-language==1.3.2\",\n    \"google-cloud-pubsub==2.13.4\",\n    \"google-cloud-pubsublite==1.4.2\",\n    \"google-cloud-recommendations-ai==0.2.0\",\n    \"google-cloud-spanner==1.19.3\",\n    \"google-cloud-videointelligence==1.16.3\",\n    \"google-cloud-vision==1.0.2\",\n    \"apache-beam[gcp]==2.40.0\",\n]\n\nbase_requirements = [\n    \"dataclasses\",\n    \"numpy==1.22.4\",\n    \"pandas==1.5.1\",\n    \"xarray==2023.1.0\",\n    \"xarray-beam==0.6.2\",\n    \"cfgrib==0.9.10.2\",\n    \"netcdf4==1.6.1\",\n    \"geojson==2.5.0\",\n    \"simplejson==3.17.6\",\n    
\"rioxarray==0.13.4\",\n    \"metview==1.13.1\",\n    \"rasterio==1.3.1\",\n    \"earthengine-api>=0.1.263\",\n    \"pyproj==3.4.0\",  # requires separate binary installation!\n    \"gdal==3.5.1\",  # requires separate binary installation!\n    \"gcsfs==2022.11.0\",\n    \"zarr==2.15.0\",\n]\n\nsetup(\n    name='loader_pipeline',\n    packages=find_packages(),\n    author='Anthromets',\n    author_email='anthromets-ecmwf@google.com',\n    version='0.2.43',\n    url='https://weather-tools.readthedocs.io/en/latest/weather_mv/',\n    description='A tool to load weather data into BigQuery.',\n    install_requires=beam_gcp_requirements + base_requirements,\n)\n"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/.zattrs",
    "content": "{\n    \"Conventions\": \"CF-1.6\",\n    \"history\": \"2022-12-07 15:15:12 GMT by grib_to_netcdf-2.25.1: /opt/ecmwf/mars-client/bin/grib_to_netcdf.bin -S param -o /cache/data1/adaptor.mars.internal-1670426111.8966048-4371-13-ae423cb0-986b-4dc9-abe5-76478aea4cef.nc /cache/tmp/ae423cb0-986b-4dc9-abe5-76478aea4cef-adaptor.mars.internal-1670426109.7807662-4371-20-tmp.grib\"\n}"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/.zgroup",
    "content": "{\n    \"zarr_format\": 2\n}"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/.zmetadata",
    "content": "{\n    \"metadata\": {\n        \".zattrs\": {\n            \"Conventions\": \"CF-1.6\",\n            \"history\": \"2022-12-07 15:15:12 GMT by grib_to_netcdf-2.25.1: /opt/ecmwf/mars-client/bin/grib_to_netcdf.bin -S param -o /cache/data1/adaptor.mars.internal-1670426111.8966048-4371-13-ae423cb0-986b-4dc9-abe5-76478aea4cef.nc /cache/tmp/ae423cb0-986b-4dc9-abe5-76478aea4cef-adaptor.mars.internal-1670426109.7807662-4371-20-tmp.grib\"\n        },\n        \".zgroup\": {\n            \"zarr_format\": 2\n        },\n        \"cape/.zarray\": {\n            \"chunks\": [\n                24,\n                721,\n                1440\n            ],\n            \"compressor\": {\n                \"blocksize\": 0,\n                \"clevel\": 5,\n                \"cname\": \"lz4\",\n                \"id\": \"blosc\",\n                \"shuffle\": 1\n            },\n            \"dtype\": \"<i2\",\n            \"fill_value\": -32767,\n            \"filters\": null,\n            \"order\": \"C\",\n            \"shape\": [\n                24,\n                721,\n                1440\n            ],\n            \"zarr_format\": 2\n        },\n        \"cape/.zattrs\": {\n            \"_ARRAY_DIMENSIONS\": [\n                \"time\",\n                \"latitude\",\n                \"longitude\"\n            ],\n            \"add_offset\": 3403.948056704256,\n            \"long_name\": \"Convective available potential energy\",\n            \"missing_value\": -32767,\n            \"scale_factor\": 0.10388659148825782,\n            \"units\": \"J kg**-1\"\n        },\n        \"d2m/.zarray\": {\n            \"chunks\": [\n                24,\n                721,\n                1440\n            ],\n            \"compressor\": {\n                \"blocksize\": 0,\n                \"clevel\": 5,\n                \"cname\": \"lz4\",\n                \"id\": \"blosc\",\n                \"shuffle\": 1\n            },\n            \"dtype\": \"<i2\",\n        
    \"fill_value\": -32767,\n            \"filters\": null,\n            \"order\": \"C\",\n            \"shape\": [\n                24,\n                721,\n                1440\n            ],\n            \"zarr_format\": 2\n        },\n        \"d2m/.zattrs\": {\n            \"_ARRAY_DIMENSIONS\": [\n                \"time\",\n                \"latitude\",\n                \"longitude\"\n            ],\n            \"add_offset\": 253.12335326621286,\n            \"long_name\": \"2 metre dewpoint temperature\",\n            \"missing_value\": -32767,\n            \"scale_factor\": 0.0014471540977416674,\n            \"units\": \"K\"\n        },\n        \"latitude/.zarray\": {\n            \"chunks\": [\n                721\n            ],\n            \"compressor\": {\n                \"blocksize\": 0,\n                \"clevel\": 5,\n                \"cname\": \"lz4\",\n                \"id\": \"blosc\",\n                \"shuffle\": 1\n            },\n            \"dtype\": \"<f4\",\n            \"fill_value\": \"NaN\",\n            \"filters\": null,\n            \"order\": \"C\",\n            \"shape\": [\n                721\n            ],\n            \"zarr_format\": 2\n        },\n        \"latitude/.zattrs\": {\n            \"_ARRAY_DIMENSIONS\": [\n                \"latitude\"\n            ],\n            \"long_name\": \"latitude\",\n            \"units\": \"degrees_north\"\n        },\n        \"longitude/.zarray\": {\n            \"chunks\": [\n                1440\n            ],\n            \"compressor\": {\n                \"blocksize\": 0,\n                \"clevel\": 5,\n                \"cname\": \"lz4\",\n                \"id\": \"blosc\",\n                \"shuffle\": 1\n            },\n            \"dtype\": \"<f4\",\n            \"fill_value\": \"NaN\",\n            \"filters\": null,\n            \"order\": \"C\",\n            \"shape\": [\n                1440\n            ],\n            \"zarr_format\": 2\n        },\n        
\"longitude/.zattrs\": {\n            \"_ARRAY_DIMENSIONS\": [\n                \"longitude\"\n            ],\n            \"long_name\": \"longitude\",\n            \"units\": \"degrees_east\"\n        },\n        \"time/.zarray\": {\n            \"chunks\": [\n                24\n            ],\n            \"compressor\": {\n                \"blocksize\": 0,\n                \"clevel\": 5,\n                \"cname\": \"lz4\",\n                \"id\": \"blosc\",\n                \"shuffle\": 1\n            },\n            \"dtype\": \"<i4\",\n            \"fill_value\": null,\n            \"filters\": null,\n            \"order\": \"C\",\n            \"shape\": [\n                24\n            ],\n            \"zarr_format\": 2\n        },\n        \"time/.zattrs\": {\n            \"_ARRAY_DIMENSIONS\": [\n                \"time\"\n            ],\n            \"calendar\": \"gregorian\",\n            \"long_name\": \"time\",\n            \"units\": \"hours since 1900-01-01\"\n        }\n    },\n    \"zarr_consolidated_format\": 1\n}"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/cape/.zarray",
    "content": "{\n    \"chunks\": [\n        24,\n        721,\n        1440\n    ],\n    \"compressor\": {\n        \"blocksize\": 0,\n        \"clevel\": 5,\n        \"cname\": \"lz4\",\n        \"id\": \"blosc\",\n        \"shuffle\": 1\n    },\n    \"dtype\": \"<i2\",\n    \"fill_value\": -32767,\n    \"filters\": null,\n    \"order\": \"C\",\n    \"shape\": [\n        24,\n        721,\n        1440\n    ],\n    \"zarr_format\": 2\n}"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/cape/.zattrs",
    "content": "{\n    \"_ARRAY_DIMENSIONS\": [\n        \"time\",\n        \"latitude\",\n        \"longitude\"\n    ],\n    \"add_offset\": 3403.948056704256,\n    \"long_name\": \"Convective available potential energy\",\n    \"missing_value\": -32767,\n    \"scale_factor\": 0.10388659148825782,\n    \"units\": \"J kg**-1\"\n}"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/d2m/.zarray",
    "content": "{\n    \"chunks\": [\n        24,\n        721,\n        1440\n    ],\n    \"compressor\": {\n        \"blocksize\": 0,\n        \"clevel\": 5,\n        \"cname\": \"lz4\",\n        \"id\": \"blosc\",\n        \"shuffle\": 1\n    },\n    \"dtype\": \"<i2\",\n    \"fill_value\": -32767,\n    \"filters\": null,\n    \"order\": \"C\",\n    \"shape\": [\n        24,\n        721,\n        1440\n    ],\n    \"zarr_format\": 2\n}"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/d2m/.zattrs",
    "content": "{\n    \"_ARRAY_DIMENSIONS\": [\n        \"time\",\n        \"latitude\",\n        \"longitude\"\n    ],\n    \"add_offset\": 253.12335326621286,\n    \"long_name\": \"2 metre dewpoint temperature\",\n    \"missing_value\": -32767,\n    \"scale_factor\": 0.0014471540977416674,\n    \"units\": \"K\"\n}"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/latitude/.zarray",
    "content": "{\n    \"chunks\": [\n        721\n    ],\n    \"compressor\": {\n        \"blocksize\": 0,\n        \"clevel\": 5,\n        \"cname\": \"lz4\",\n        \"id\": \"blosc\",\n        \"shuffle\": 1\n    },\n    \"dtype\": \"<f4\",\n    \"fill_value\": \"NaN\",\n    \"filters\": null,\n    \"order\": \"C\",\n    \"shape\": [\n        721\n    ],\n    \"zarr_format\": 2\n}"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/latitude/.zattrs",
    "content": "{\n    \"_ARRAY_DIMENSIONS\": [\n        \"latitude\"\n    ],\n    \"long_name\": \"latitude\",\n    \"units\": \"degrees_north\"\n}"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/longitude/.zarray",
    "content": "{\n    \"chunks\": [\n        1440\n    ],\n    \"compressor\": {\n        \"blocksize\": 0,\n        \"clevel\": 5,\n        \"cname\": \"lz4\",\n        \"id\": \"blosc\",\n        \"shuffle\": 1\n    },\n    \"dtype\": \"<f4\",\n    \"fill_value\": \"NaN\",\n    \"filters\": null,\n    \"order\": \"C\",\n    \"shape\": [\n        1440\n    ],\n    \"zarr_format\": 2\n}"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/longitude/.zattrs",
    "content": "{\n    \"_ARRAY_DIMENSIONS\": [\n        \"longitude\"\n    ],\n    \"long_name\": \"longitude\",\n    \"units\": \"degrees_east\"\n}"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/time/.zarray",
    "content": "{\n    \"chunks\": [\n        24\n    ],\n    \"compressor\": {\n        \"blocksize\": 0,\n        \"clevel\": 5,\n        \"cname\": \"lz4\",\n        \"id\": \"blosc\",\n        \"shuffle\": 1\n    },\n    \"dtype\": \"<i4\",\n    \"fill_value\": null,\n    \"filters\": null,\n    \"order\": \"C\",\n    \"shape\": [\n        24\n    ],\n    \"zarr_format\": 2\n}"
  },
  {
    "path": "weather_mv/test_data/test_data.zarr/time/.zattrs",
    "content": "{\n    \"_ARRAY_DIMENSIONS\": [\n        \"time\"\n    ],\n    \"calendar\": \"gregorian\",\n    \"long_name\": \"time\",\n    \"units\": \"hours since 1900-01-01\"\n}"
  },
  {
    "path": "weather_mv/weather-mv",
    "content": "#!/usr/bin/env python3\n# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport glob\nimport logging\nimport os\nimport subprocess\nimport sys\nimport tarfile\nimport tempfile\n\nimport weather_mv\n\nSDK_CONTAINER_IMAGE='gcr.io/weather-tools-prod/weather-tools:0.0.0'\n\nif __name__ == '__main__':\n    logging.getLogger().setLevel(logging.INFO)\n\n    site_pkg = weather_mv.__path__[0]\n\n    try:\n        from loader_pipeline import cli\n    except ImportError:\n        # Install the subpackage.\n        subprocess.check_call(f'{sys.executable} -m pip -q install -e {site_pkg}'.split())\n\n        # Re-load sys.path\n        import site\n        from importlib import reload\n        reload(site)\n\n        # Re-attempt import. 
If this fails, the user probably has an older version of\n        # the package already installed on their machine that breaks this process.\n        # If that's the case, it's best to start from a clean virtual environment.\n        try:\n            from loader_pipeline import cli\n        except ImportError as e:\n            raise ImportError('please re-install package in a clean python environment.') from e\n\n    args = []\n\n    if \"DataflowRunner\" in sys.argv and  \"--sdk_container_image\" not in sys.argv:\n        args.extend(['--sdk_container_image',\n             os.getenv('SDK_CONTAINER_IMAGE', SDK_CONTAINER_IMAGE),\n             '--experiments',\n             'use_runner_v2'])\n        \n    if \"--use-local-code\" in sys.argv:\n        with tempfile.TemporaryDirectory() as tmpdir:\n            original_dir = os.getcwd()\n\n            # Convert subpackage to a tarball\n            os.chdir(site_pkg)\n            subprocess.check_call(\n                f'{sys.executable} ./setup.py -q sdist --dist-dir {tmpdir}'.split(),\n            )\n\n            os.chdir(original_dir)\n\n            # Set tarball as extra packages for Beam.\n            pkg_archive = glob.glob(os.path.join(tmpdir, '*.tar.gz'))[0]\n\n            with tarfile.open(pkg_archive, 'r') as tar:\n                assert any([f.endswith('.py') for f in tar.getnames()]), 'extra_package must include python files!'\n\n            # cleanup memory to prevent pickling error.\n            tar = None\n            weather_mv = None\n            args.extend(['--extra_package', pkg_archive])\n            cli(args)\n    else:\n        cli(args)\n"
  },
  {
    "path": "weather_sp/MANIFEST.in",
    "content": "global-exclude *_test.py\ninclude README.md\nglobal-exclude test_data/*\n"
  },
  {
    "path": "weather_sp/README.md",
    "content": "# 🌪 `weather-sp` – Weather Splitter\n\nSplits NetCDF and Grib files into several files by variable or other dimension. (_alpha_).\n\n## Features\n\n* **Format-Aware Processing**: Care is taken to ensure that each input file format is handled appropriately. For\n  example, for Grib files, overlapping variables with different levels are kept separate. In addition, buckets with\n  mixtures of NetCDF and Grib files can be processed on the same run.\n* **Arbitrary Splits**: You can split weather data by almost any dimension.\n\n## Usage\n\n```\nusage: weather-sp [-h] -i INPUT_PATTERN (--output-template OUTPUT_TEMPLATE | --output-dir OUTPUT_DIR)\n                  [--formatting FORMATTING] [-d] [-f]\n```\n\n_Common options_:\n\n* `-i, --input-pattern`: Pattern for input weather data.\n* `--output-template`: Template of output file path using Python formatting, see [Output section](#output) below. Mutually exclusive\n  with `--output-dir`.\n* `--output-dir`: Path to base folder for output files, see [Output section](#output) below. Mutually exclusive\n  with `--output-template`.\n* `--formatting`: Used only with `--output-dir` to specify splitting and output format\n   using Python formatting, see [Output section](#output) below.\n* `-f, --force`: Force re-splitting of the pipeline. Turns off skipping of already split data.\n* `-d, --dry-run`: Test the input file matching and the output file scheme without splitting.\n* `--log-level`: An integer to configure log level. Default: 2 (INFO).\n* `--use-local-code`: Supply local code to the Runner. Default: False.\n* `-w, --where`: Optional GRIB filter expression to apply during file splitting using grib_copy. This allows filtering GRIB messages based on key-value pairs, such as level, type of level, or date. Example: 'typeOfLevel=isobaricInhPa,level=1000'. This flag is only applicable to GRIB files and is specifically supported by the GribSplitterV2 implementation. 
More details can be found [here](https://confluence.ecmwf.int/display/ECC/grib_copy).\n* `--topic`: A Pub/Sub topic for GCS OBJECT_FINALIZE events, or equivalent, of a cloud bucket. E.g. 'projects/<PROJECT_ID>/topics/<TOPIC_ID>'. Cannot be used with `--subscription`.\n* `--subscription`: A Pub/Sub subscription for GCS OBJECT_FINALIZE events, or equivalent, of a cloud bucket. Cannot be used with `--topic`.\n* `--window_size`: Output file's window size in minutes. Only used with the `--topic` flag. Default: 1.0 minute.\n* `--num_shards`: Number of shards to use when writing windowed elements to cloud storage. Only used with the `--topic` flag. Default: 5 shards.\n\nInvoke with `-h` or `--help` to see the full range of options.\n\n_Usage examples_:\n\n```bash\nweather-sp --input-pattern 'gs://test-tmp/era5/2017/**' \\\n           --output-dir 'gs://test-tmp/era5/splits' \\\n           --formatting '.{typeOfLevel}'\n```\n\nPreview splits with a dry run:\n\n```bash\nweather-sp --input-pattern 'gs://test-tmp/era5/2017/**' \\\n           --output-dir 'gs://test-tmp/era5/splits' \\\n           --formatting '.{typeOfLevel}' \\\n           --dry-run\n```\n\nUsing DataflowRunner:\n\n```bash\nweather-sp --input-pattern 'gs://test-tmp/era5/2015/**' \\\n           --output-dir 'gs://test-tmp/era5/splits' \\\n           --formatting '.{typeOfLevel}' \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --temp_location gs://$BUCKET/tmp  \\\n           --job_name $JOB_NAME\n```\n\nUsing DataflowRunner with local code for the pipeline:\n\n```bash\nweather-sp --input-pattern 'gs://test-tmp/era5/2015/**' \\\n           --output-dir 'gs://test-tmp/era5/splits' \\\n           --formatting '.{typeOfLevel}' \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --temp_location gs://$BUCKET/tmp  \\\n           --job_name $JOB_NAME \\\n           --use-local-code\n```\n\nUsing DataflowRunner with the `--where` flag:\n\n```bash\nweather-sp 
--input-pattern 'gs://test-tmp/2015/*.gb' \\\n           --output-template 'gs://temp/domain_{domain}/class_{class}/stream_{stream}/expver_{expver}/levtype_{levtype}/date_{date}/time_{time}/step_{step}/{shortName}.gb' \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --temp_location gs://$BUCKET/tmp  \\\n           --job_name $JOB_NAME \\\n           --where \"typeOfLevel=surface,shortName=lcc\"\n```\n\nUsing ecCodes-powered grib splitting on Dataflow (this is often more robust, especially when splitting multiple \ndimensions at once):\n\n```bash\nweather-sp --input-pattern 'gs://test-tmp/era5/2017/**' \\\n           --output-dir 'gs://test-tmp/era5/splits' \\\n           --formatting '.{typeOfLevel}' \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --temp_location gs://$BUCKET/tmp  \\\n           --experiment=use_runner_v2 \\\n           --sdk_container_image=\"gcr.io/$PROJECT/$REPO:latest\"  \\\n           --job_name $JOB_NAME\n```\n\n### Streaming mode example\n\nRun a streaming pipeline that reads file paths from a Pub/Sub topic and splits the files:\n\n```bash\nweather-sp --input-pattern 'gs://bucket/data/**' \\\n           --topic 'projects/my-project/topics/my-topic' \\\n            --window_size 5 \\\n           --num_shards 10 \\\n           --output-template 'gs://output/grid_0p1/domain_{domain}/class_{class}/{shortName}.gb' \\\n           --runner DataflowRunner \\\n           --project $PROJECT \\\n           --region us-west4 \\\n           --temp_location gs://$BUCKET/tmp \\\n           --experiment use_runner_v2 \\\n           --sdk_container_image=\"gcr.io/$PROJECT/$REPO:latest\" \\\n           --job_name $JOB_NAME\n```\n_Consult [this documentation](../Runtime-Container.md) for steps on how to create a sufficient image._\n_See the `weather-mv` documentation for more information about the container image._\n\nFor a full list of how to configure the Dataflow pipeline, please 
review\n[this table](https://cloud.google.com/dataflow/docs/reference/pipeline-options).\n\n## Specifying input files\n\nThe input file pattern matching is done by the Apache Beam filesystems module;\nsee [this documentation](https://beam.apache.org/releases/pydoc/2.12.0/apache_beam.io.filesystems.html#apache_beam.io.filesystems.FileSystems.match)\nunder 'pattern syntax'.\n\nNotably, `**` is equivalent to `.*`, so to match all files in a directory, use:\n\n```bash\n--input-pattern 'gs://test-tmp/era5/2017/**'\n```\n\nOn the other hand, to match a specific pattern, use:\n\n```bash\n--input-pattern 'gs://test-tmp/era5/2017/*/*.nc'\n```\n\n## Output & Split Dimensions\n\nThe base output file names are specified using the `--output-template` or `--output-dir` flags. These flags are mutually\nexclusive, and one of them is required. \\\nThe output formatting also specifies which dimensions to split by using Python formatting.\nFor example, adding `{time}` in the output template or formatting will cause the file to be split along the time dimension.\n\n### Available dimensions to split\n\nHow a file can be split depends on the file type.\n\n#### GRIB\nGRIB files can be split along any dimension that is available in the file's metadata. \\\nExamples: 'typeOfLevel', 'level', 'step', 'shortName', 'gridType', 'time', 'forecastTime' \\\nAny available dimensions can be combined when splitting.\n\n#### NetCDF\nNetCDF files are already in a hypercube format and can only be split by one of the dimensions and by data variable.\nSince splitting by latitude or longitude would lead to a large number of small files, this is not supported,\nand it is recommended to use the `weather-mv` tool instead. 
\\\nSupported splits for NetCDF files are thus 'variable' to split by data variable, and any dimension \nother than latitude and longitude.\n\n\n### Output directory\n\nBased on the output directory path, the directory structure of the input pattern is replicated.\n\nTo create the directory structure, the common path of the input pattern is removed from the input file path and replaced\nwith the output path. \\\nThe formatting specified by the `--formatting` flag is added between file name and ending and is\nused to determine along which dimensions to split.\n\nExample:\n\n```bash\n--input-pattern 'gs://test-input/era5/2020/**' \\\n--output-dir 'gs://test-output/splits' \\\n--formatting '.{variable}'\n```\n\nFor a file `gs://test-input/era5/2020/02/01.nc` the output file pattern is\n`gs://test-output/splits/2020/02/01.{variable}.nc` and if the temperature is a variable in that data, the output file\nfor that split will be `gs://test-output/splits/2020/02/01.t.nc`.\n\nExample:\n\n```bash\n--input-pattern 'gs://test-input/era5/2020/**' \\\n--output-dir 'gs://test-output/splits' \\\n--formatting '_{date}:{time}_{level}hPa'\n```\nFor a file `gs://test-input/era5/2020/02/01.grib` the output file pattern is\n`gs://test-output/splits/2020/02/01_{date}:{time}_{level}hPa.grib` and for date = 20200101, time = 1200, level = 400,\n the output file for that split will be `gs://test-output/splits/2020/02/01_20200101:1200_400hPa.grib`.\n\n\n### Output template with Python-style formatting\n\nUsing Python-style substitution (e.g. `{1}`) allows for more flexibility when creating the output files. The\nsubstitutions are based on the directory structure of the input file, where each `{<x>}` stands for one directory name,\ncounting backwards from the end, i.e. the file name is `{0}`, the immediate directory in which it is located is `{1}`,\nand so on. In addition, you need to supply the split dimensions in the output template. 
These will be filled by values\nfound in each file.\n\nExample:\n\n```bash\n--input-pattern 'gs://test-input/era5/2020/**' \\\n--output-template 'gs://test-output/splits/{2}.{0}.{1}T00.{variable}.nc'\n```\n\nFor a file `gs://test-input/era5/2020/02/01.nc` the output file pattern is\n`gs://test-output/splits/2020.01.02T00.{variable}.nc` and if the temperature is a variable in that data, the output\nfile for that split will be `gs://test-output/splits/2020.01.02T00.t.nc`.\n\n## Dry run\n\nTo verify the input file matching and the output naming scheme, `weather-sp` can be run with the `--dry-run` option.\nThis does not read the files, so it will not check whether the files are readable and in the correct format. It will\nonly list the input files with the corresponding output file schemes.\n"
  },
  {
    "path": "weather_sp/__init__.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "weather_sp/setup.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom setuptools import setup, find_packages\n\nbeam_gcp_requirements = [\n    \"google-cloud-bigquery==2.34.4\",\n    \"google-cloud-bigquery-storage==2.14.1\",\n    \"google-cloud-bigtable==1.7.2\",\n    \"google-cloud-core==1.7.3\",\n    \"google-cloud-datastore==1.15.5\",\n    \"google-cloud-dlp==3.8.0\",\n    \"google-cloud-language==1.3.2\",\n    \"google-cloud-pubsub==2.13.4\",\n    \"google-cloud-pubsublite==1.4.2\",\n    \"google-cloud-recommendations-ai==0.2.0\",\n    \"google-cloud-spanner==1.19.3\",\n    \"google-cloud-videointelligence==1.16.3\",\n    \"google-cloud-vision==1.0.2\",\n    \"apache-beam[gcp]==2.40.0\",\n]\n\nbase_requirements = [\n    \"pygrib==2.1.4\",\n    \"eccodes\",\n    \"numpy>=1.20.3\",\n    \"xarray==2023.1.0\",\n    \"scipy==1.9.3\",\n]\n\nsetup(\n    name='splitter_pipeline',\n    packages=find_packages(),\n    author='Anthromets',\n    author_email='anthromets-ecmwf@google.com',\n    version='0.3.8',\n    url='https://weather-tools.readthedocs.io/en/latest/weather_sp/',\n    description='A tool to split weather data files into per-variable files.',\n    install_requires=beam_gcp_requirements + base_requirements,\n)\n"
  },
  {
    "path": "weather_sp/splitter_pipeline/__init__.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .pipeline import run\n\n\ndef cli(extra=[]):\n    import sys\n    run(sys.argv + extra)\n"
  },
  {
    "path": "weather_sp/splitter_pipeline/file_name_utils.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom dataclasses import dataclass\nimport logging\nimport os\nimport string\nimport typing as t\n\nlogger = logging.getLogger(__name__)\n\nGRIB_FILE_ENDINGS = ('.grib', '.grb', '.grb2', '.grib2', '.gb')\nNETCDF_FILE_ENDINGS = ('.nc', '.cd')\n\n\n@dataclass\nclass OutFileInfo:\n    \"\"\"Holds data required to construct an output file name.\n\n    Attributes:\n        file_name_template: base output path, may contain python-style formatting\n                            marks. This can be a base directory or a full name.\n        formatting: added after file_name_template to add formatting. Only used\n                    when using --output-dir.\n        ending: file ending.\n        template_folders: list of input file directory structure. 
Only used with\n                          --output-template\n    \"\"\"\n    file_name_template: str\n    formatting: str\n    ending: str\n    template_folders: t.List[str]\n\n    def __repr__(self):\n        return self.unformatted_output_path()\n\n    def unformatted_output_path(self):\n        \"\"\"Construct output file name with formatting marks.\"\"\"\n        return self.file_name_template + self.formatting + self.ending\n\n    def split_dims(self) -> t.List[str]:\n        all_format = list(filter(None, [field[1] for field in string.Formatter().parse(\n            self.unformatted_output_path())]))\n        return [key for key in all_format if not key.isdigit()]\n\n    def formatted_output_path(self, splits: t.Dict[str, str]) -> str:\n        \"\"\"Construct output file name with formatting applied\"\"\"\n        return self.unformatted_output_path().format(*self.template_folders, **splits)\n\n\ndef get_output_file_info(filename: str,\n                         input_base_dir: str = '',\n                         out_pattern: t.Optional[str] = None,\n                         out_dir: t.Optional[str] = None,\n                         formatting: str = '') -> OutFileInfo:\n    \"\"\"Construct the base output file name by applying the out_pattern to the\n    filename.\n\n    Example:\n        filename = 'gs://my_bucket/data_to_split/2020/01/21.nc'\n        out_pattern = 'gs://my_bucket/splits/{2}-{1}-{0}_old_data.'\n        resulting output base = 'gs://my_bucket/splits/2020-01-21_old_data.'\n        resulting file ending = '.nc'\n\n    Args:\n        filename: input file to be split\n        out_pattern: pattern to apply when creating output file\n        out_dir: directory to replace input base directory\n        formatting: output formatting of split fields. 
Required when using\n            out_dir, ignored when using out_pattern.\n        input_base_dir: used if out_pattern does not contain any '{}'\n            substitutions.\n            The output file is then created by replacing this part of the input\n            name with the output pattern.\n    \"\"\"\n    split_name, ending = os.path.splitext(filename)\n    if ending in GRIB_FILE_ENDINGS or ending in NETCDF_FILE_ENDINGS:\n        filename = split_name\n    else:\n        ending = ''\n\n    if out_dir and not formatting:\n        raise ValueError('No formatting specified when using --output-dir.')\n    if out_dir:\n        return OutFileInfo(\n            f'{filename.replace(input_base_dir, out_dir)}',\n            formatting,\n            ending,\n            []\n        )\n\n    if out_pattern:\n        in_sections = []\n        path = filename\n        while path:\n            path, tail = os.path.split(path)\n            in_sections.append(tail)\n        # setting formatting and ending to empty strings since they are\n        # part of the specified pattern.\n        return OutFileInfo(out_pattern, '', '', in_sections)\n\n    raise ValueError('no output specified.')\n"
  },
  {
    "path": "weather_sp/splitter_pipeline/file_name_utils_test.py",
    "content": "# Copyright 2022 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport unittest\n\nfrom .file_name_utils import get_output_file_info, OutFileInfo\n\n\nclass FileNameUtilsTest(unittest.TestCase):\n\n    def test_get_output_file_info_pattern(self):\n        actual = get_output_file_info(filename='gs://my_bucket/data_to_split/2020/01/21.nc',\n                                      out_pattern='gs://my_bucket/splits/{2}-{1}-{0}_old_data.{variable}',\n                                      out_dir='',\n                                      input_base_dir='ignored')\n        expected = OutFileInfo(\n            file_name_template='gs://my_bucket/splits/{2}-{1}-{0}_old_data.{variable}',\n            template_folders=['21', '01', '2020',\n                              'data_to_split', 'my_bucket', 'gs:'],\n            ending='',\n            formatting='')\n        self.assertEqual(actual, expected)\n\n    def test_get_output_file_info_dir(self):\n        actual = get_output_file_info(filename='gs://my_bucket/data_to_split/2020/01/21.nc',\n                                      out_pattern='',\n                                      out_dir='gs://my_bucket/splits/',\n                                      input_base_dir='gs://my_bucket/data_to_split/',\n                                      formatting='_{foo}')\n        expected = OutFileInfo(\n            file_name_template='gs://my_bucket/splits/2020/01/21',\n            
template_folders=[],\n            ending='.nc',\n            formatting='_{foo}')\n        self.assertEqual(actual, expected)\n\n    def test_get_output_file_info_no_fileending(self):\n        actual = get_output_file_info(filename='gs://my_bucket/data_to_split/2020/01/21',\n                                      out_pattern='gs://my_bucket/splits/{2}-{1}-{0}_old_data.',\n                                      out_dir='',\n                                      input_base_dir='ignored')\n        expected = OutFileInfo(\n            file_name_template='gs://my_bucket/splits/{2}-{1}-{0}_old_data.',\n            template_folders=['21', '01', '2020',\n                              'data_to_split', 'my_bucket', 'gs:'],\n            ending='',\n            formatting='')\n        self.assertEqual(actual, expected)\n\n    def test_get_output_file_info_filecontainsdots(self):\n        actual = get_output_file_info(filename='gs://my_bucket/data_to_split/2020/01/21.T00z.stuff',\n                                      out_dir='gs://my_bucket/splits/',\n                                      input_base_dir='gs://my_bucket/data_to_split/',\n                                      formatting='.{foo}')\n        expected = OutFileInfo(\n            file_name_template='gs://my_bucket/splits/2020/01/21.T00z.stuff',\n            template_folders=[],\n            ending='',\n            formatting='.{foo}')\n        self.assertEqual(actual, expected)\n\n    def test_get_output_file_info_dir_no_formatting(self):\n        with self.assertRaises(ValueError):\n            get_output_file_info(filename='gs://my_bucket/data_to_split/2020/01/21.nc',\n                                 out_pattern='',\n                                 out_dir='gs://my_bucket/splits/',\n                                 input_base_dir='gs://my_bucket/data_to_split/')\n\n    def test_output_pattern_ignores_formatting(self):\n        actual = get_output_file_info(filename='gs://my_bucket/data_to_split/2020/01/21.nc',\n    
                                  out_pattern='gs://my_bucket/splits/{2}-{1}-{0}_old_data.{variable}',\n                                      out_dir=None,\n                                      input_base_dir='ignored',\n                                      formatting='_{time}_{level}hPa')\n        expected = OutFileInfo(\n            file_name_template='gs://my_bucket/splits/{2}-{1}-{0}_old_data.{variable}',\n            template_folders=['21', '01', '2020',\n                              'data_to_split', 'my_bucket', 'gs:'],\n            ending='',\n            formatting='')\n        self.assertEqual(actual, expected)\n\n    def test_split_dims(self):\n        actual = get_output_file_info(filename='gs://my_bucket/data_to_split/2020/01/21.nc',\n                                      out_pattern='gs://my_bucket/splits/{2}-{1}-{0}_old_data.{variable}',\n                                      out_dir=None,\n                                      input_base_dir='ignored')\n        self.assertEqual(actual.split_dims(), ['variable'])\n"
  },
  {
    "path": "weather_sp/splitter_pipeline/file_splitters.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport abc\nimport itertools\nimport logging\nimport os\nimport re\nimport shutil\nimport string\nimport subprocess\nimport tempfile\nimport typing as t\nfrom contextlib import contextmanager\n\nimport apache_beam.metrics as metrics\nimport numpy as np\nimport pygrib\nimport xarray as xr\n\nfrom apache_beam.io.filesystem import CompressionTypes, FileSystem, CompressedFile, DEFAULT_READ_BUFFER_SIZE\nfrom apache_beam.io.filesystems import FileSystems\nfrom apache_beam.utils import retry\n\nfrom .file_name_utils import OutFileInfo\n\n# For uploading / downloading retry logic.\nINITIAL_DELAY = 1.0  # Initial delay in seconds.\nMAX_DELAY = 600  # Maximum delay before giving up in seconds.\nNUM_RETRIES = 10  # Number of tries with exponential backoff.\n\nlogger = logging.getLogger(__name__)\n\n\n# TODO(#245): Group with common utilities (duplicated)\n@retry.with_exponential_backoff(\n    num_retries=NUM_RETRIES,\n    logger=logger.warning,\n    initial_delay_secs=INITIAL_DELAY,\n    max_delay_secs=MAX_DELAY\n)\ndef copy(src: str, dst: str) -> None:\n    \"\"\"Copy data via `gsutil` or local filesystem.\"\"\"\n    is_gs = src.startswith(\"gs://\") or dst.startswith(\"gs://\")\n    try:\n        if is_gs:\n            subprocess.run(['gcloud', 'storage', 'cp', src, dst], check=True,\n                           capture_output=True, text=True, input=\"n/n\")\n        else:\n   
         os.makedirs(os.path.dirname(dst) or '.', exist_ok=True)\n            shutil.copy(src, dst)\n    except Exception as e:\n        error_detail = getattr(e, \"stderr\", str(e)).strip()\n        msg = f\"Failed to copy {src!r} to {dst!r} due to {error_detail}\"\n        logger.error(msg)\n        raise EnvironmentError(msg) from e\n\n\nclass FileSplitter(abc.ABC):\n    \"\"\"Base class for weather file splitters.\"\"\"\n\n    def __init__(self, input_path: str, output_info: OutFileInfo,\n                 force_split: bool = False, logging_level: int = logging.INFO,\n                 grib_filter_expression: t.Optional[str] = None):\n        self.input_path = input_path\n        self.output_info = output_info\n        self.force_split = force_split\n        self.logger = logging.getLogger(f'{__name__}.{type(self).__name__}')\n        self.logger.setLevel(logging_level)\n        self.logger.debug('Splitter for path=%s, output base=%s',\n                          self.input_path, self.output_info)\n        self.grib_filter_expression = grib_filter_expression\n\n    @abc.abstractmethod\n    def split_data(self) -> None:\n        raise NotImplementedError()\n\n    @contextmanager\n    def _copy_to_local_file(self) -> t.Iterator[t.IO]:\n        self.logger.info(f'Copying {self.input_path!r} locally.')\n        with tempfile.NamedTemporaryFile() as dest_file:\n            copy(self.input_path, dest_file.name)\n\n            # Check if data is compressed. 
Decompress the data using the same methods that beam's\n            # FileSystems interface uses.\n            compression_type = FileSystem._get_compression_type(self.input_path, CompressionTypes.AUTO)\n            if compression_type == CompressionTypes.UNCOMPRESSED:\n                yield dest_file\n                return\n\n            dest_file.seek(0)\n            with tempfile.NamedTemporaryFile() as dest_uncompressed:\n                with CompressedFile(open(dest_file.name, 'rb'), compression_type=compression_type) as dcomp:\n                    shutil.copyfileobj(dcomp, dest_uncompressed, DEFAULT_READ_BUFFER_SIZE)\n                    dest_uncompressed.seek(0)  # Reposition the file pointer to the start.\n                    yield dest_uncompressed\n\n    def should_skip(self):\n        \"\"\"Skip splitting if the data was already split.\"\"\"\n        if self.force_split:\n            return False\n\n        for match in FileSystems().match([\n            self.output_info.formatted_output_path(\n                {var: '*' for var in self.output_info.split_dims()}),\n        ]):\n            if len(match.metadata_list) > 0:\n                return True\n        return False\n\n    def should_skip_file(self, output_path: str) -> bool:\n        \"\"\"Skip splitting if the data file was already split.\n\n        TODO(#287): Consider making this the default skipping implementation...\n        \"\"\"\n        if self.force_split:\n            return False\n\n        matches = FileSystems().match([output_path])\n        assert len(matches) == 1\n        if len(matches[0].metadata_list) > 0:\n            return True\n        return False\n\n\nclass GribSplitter(FileSplitter):\n\n    def split_data(self) -> None:\n        if not self.output_info.split_dims():\n            raise ValueError('No splitting specified in template.')\n\n        if self.should_skip():\n            metrics.Metrics.counter('file_splitters', 'skipped').inc()\n            
self.logger.info('Skipping %s, file already split.',\n                             repr(self.input_path))\n            return\n\n        # Here, we keep a map of open file objects (`outputs`). We need these since\n        # each output grib file (named `key`) will include multiple `grb` messages\n        # each. By writing data to the cache of open file objects, we can keep a\n        # minimal amount of data in memory at a time.\n        outputs = dict()\n        with self._open_grib_locally() as grbs:\n            self.logger.info('Splitting & uploading %r...', self.input_path)\n            try:\n                for grb in grbs:\n                    # Iterate through the split dimensions of the grib message in order to\n                    # produce the right output file.\n                    splits = dict()\n                    for dim in self.output_info.split_dims():\n                        try:\n                            splits[dim] = getattr(grb, dim)\n                        except RuntimeError:\n                            self.logger.error(\n                                'Variable not found in grib: %s', dim)\n                    key = self.output_info.formatted_output_path(splits)\n\n                    # Append the current grib message to a set number of output files.\n                    # If the target shard doesn't exist, create it.\n                    if key not in outputs:\n                        outputs[key] = FileSystems.create(key)\n                    outputs[key].write(grb.tostring())\n                    outputs[key].flush()\n\n                    # Delete the grib message from memory – *and disk* – before moving on to the next\n                    # grib message. 
See the pygrib sources for more info.\n                    # https://github.com/jswhit/pygrib/blob/v2.1.4rel/src/pygrib/_pygrib.pyx#L759\n                    del grb\n            finally:\n                for out in outputs.values():\n                    out.close()\n            self.logger.info('Split %s into %d files',\n                             self.input_path, len(outputs))\n\n    @contextmanager\n    def _open_grib_locally(self) -> t.Iterator[t.Iterator[pygrib.gribmessage]]:\n        with self._copy_to_local_file() as local_file:\n            with pygrib.open(local_file.name) as gb:\n                yield gb\n\n\nclass GribSplitterV2(GribSplitter):\n    \"\"\"Splitter that makes use of `grib_copy` util for high performance splitting.\n\n    See https://confluence.ecmwf.int/display/ECC/grib_copy.\n    \"\"\"\n\n    def replace_non_numeric_bracket(self, match: re.Match) -> str:\n        value = match.group(1)\n        return f\"[{value}]\" if not value.isdigit() else \"{\" + value + \"}\"\n\n    def split_data(self) -> None:\n        if not self.output_info.split_dims():\n            raise ValueError('No splitting specified in template.')\n\n        grib_copy_cmd = shutil.which('grib_copy')\n        grib_get_cmd = shutil.which('grib_get')\n        uniq_cmd = shutil.which('uniq')\n        for cmd, name in [(grib_get_cmd, 'grib_copy'), (grib_get_cmd, 'grib_get'), (uniq_cmd, 'uniq')]:\n            if not cmd:\n                raise EnvironmentError(f'binary {name!r} is not available in the current environment!')\n\n        unformatted_output_path = self.output_info.unformatted_output_path()\n        prefix, _ = os.path.split(next(iter(string.Formatter().parse(unformatted_output_path)))[0])\n        _, tail = unformatted_output_path.split(prefix)\n\n        # Replace { with [ and } with ] only for non-numeric values inside {} of tail\n        output_str = re.sub(r'\\{(\\w+)\\}', self.replace_non_numeric_bracket, tail)\n        output_template = 
output_str.format(*self.output_info.template_folders)\n\n        slash = '/'\n        delimiter = 'DELIMITER'\n        flat_output_template = output_template.replace('/', delimiter)\n        split_dims = self.output_info.split_dims()\n        # Construct a string where each split dimension is \"dim:s\".\n        # This ensures dims like time are represented as 0600 instead of 600.\n        split_dims_arg = ','.join(f'{dim}:s' for dim in split_dims)\n        with self._copy_to_local_file() as local_file:\n            self.logger.info('Skipping as needed...')\n            # Append -w flag to filter GRIB messages matching the given expression\n            if self.grib_filter_expression:\n                grib_get_args = [grib_get_cmd, '-p', split_dims_arg, '-w', self.grib_filter_expression, local_file.name]\n            else:\n                grib_get_args = [grib_get_cmd, '-p', split_dims_arg, local_file.name]\n            grib_get_process = subprocess.Popen(grib_get_args, stdout=subprocess.PIPE)\n            uniq_output = subprocess.check_output((uniq_cmd,), stdin=grib_get_process.stdout)\n            output_paths = []\n            skipped_paths = []\n            for line in uniq_output.decode('utf-8').rstrip('\\n').split('\\n'):\n                splits = dict(zip(split_dims, line.split(' ')))\n                output_path = self.output_info.formatted_output_path(splits)\n                if self.should_skip_file(output_path):\n                    skipped_paths.append(output_path)\n                    continue\n                output_paths.append(output_path)\n            if not output_paths:\n                metrics.Metrics.counter('file_splitters', 'skipped').inc()\n                self.logger.info('Skipping %s, file already split into: %s',\n                                 repr(self.input_path), ', '.join(skipped_paths))\n                return\n\n            with tempfile.TemporaryDirectory() as tmpdir:\n                self.logger.info('Performing split.')\n      
          dest = os.path.join(tmpdir, flat_output_template)\n                if self.grib_filter_expression:\n                    subprocess.run([grib_copy_cmd, \"-w\",\n                                    self.grib_filter_expression,\n                                    local_file.name, dest], check=True)\n                else:\n                    subprocess.run([grib_copy_cmd, local_file.name, dest],\n                                   check=True)\n\n                self.logger.info('Uploading %r...', self.input_path)\n                for flat_target in os.listdir(tmpdir):\n                    dest_file_path = f'{prefix}{flat_target.replace(delimiter, slash)}'\n                    self.logger.info([prefix, dest_file_path, local_file.name,\n                                      self.output_info.unformatted_output_path()])\n\n                    copy(os.path.join(tmpdir, flat_target), dest_file_path)\n                self.logger.info('Finished uploading %r', self.input_path)\n\n\nclass NetCdfSplitter(FileSplitter):\n\n    _UNSUPPORTED_DIMENSIONS = ('latitude', 'longitude', 'lat', 'lon')\n\n    def split_data(self) -> None:\n        if not self.output_info.split_dims():\n            raise ValueError('No splitting specified in template.')\n        if any(dim in self._UNSUPPORTED_DIMENSIONS for dim in self.output_info.split_dims()):\n            raise ValueError('Unsupported split dimension (lat, lng).')\n        if self.should_skip():\n            metrics.Metrics.counter('file_splitters', 'skipped').inc()\n            self.logger.info('Skipping %s, file already split.',\n                             repr(self.input_path))\n            return\n\n        with self._open_dataset_locally() as dataset:\n            if any(split not in dataset.dims and split not in ('variable') for split in self.output_info.split_dims()):\n                raise ValueError(\n                    'netcdf split: requested dimension not in dataset')\n            iterlists = []\n            if 
'variable' in self.output_info.split_dims():\n                iterlists.append([dataset[var].to_dataset()\n                                  for var in dataset.data_vars])\n            else:\n                iterlists.append([dataset])\n            filtered_split_dims = [\n                x for x in self.output_info.split_dims() if x not in ('variable', self._UNSUPPORTED_DIMENSIONS)]\n            for dim in filtered_split_dims:\n                iterlists.append(dataset[dim])\n            combinations = itertools.product(*iterlists)\n            self.logger.info('Splitting & uploading %r...', self.input_path)\n            for comb in combinations:\n                selected = comb[0]\n                for da in comb[1:]:\n                    for dim in da.coords:\n                        selected = selected.sel({dim: getattr(da, dim)})\n                self._write_dataset(selected, filtered_split_dims)\n            self.logger.info('Finished splitting & uploading %r.', self.input_path)\n\n    @contextmanager\n    def _open_dataset_locally(self) -> t.Iterator[xr.Dataset]:\n        with self._copy_to_local_file() as local_file:\n            ds = xr.open_dataset(local_file.name, engine='netcdf4')\n            yield ds\n            ds.close()\n\n    def _write_dataset(self, dataset: xr.Dataset, split_dims: t.List[str]) -> None:\n        \"\"\"Write destination NetCDF file in NETCDF4 format.\"\"\"\n        # Here, we need to write the file locally, since only the scipy engine supports file objects or\n        # returning bytes. 
Further, the scipy engine does not support NETCDF4 (which is HDF5 compliant).\n        # Storing data in HDF5 is advantageous since it allows opening NetCDF files with buffered readers.\n        with tempfile.NamedTemporaryFile() as tmp:\n            dataset.to_netcdf(path=tmp.name, engine='netcdf4', format='NETCDF4')\n            copy(tmp.name, self._get_output_for_dataset(dataset, split_dims))\n\n    def _get_output_for_dataset(self, dataset: xr.Dataset, split_dims: t.List[str]) -> str:\n        splits = {'variable': list(dataset.data_vars.keys())[0]}\n        for dim in split_dims:\n            value = dataset[dim].values\n            if dim == 'time':\n                value = np.datetime_as_string(value, unit='m')\n            splits[dim] = value\n        return self.output_info.formatted_output_path(splits)\n\n\nclass DrySplitter(FileSplitter):\n\n    def split_data(self) -> None:\n        if not self.output_info.split_dims():\n            raise ValueError('No splitting specified in template.')\n        self.logger.info('input file: %s - output scheme: %s',\n                         self.input_path, self.output_info.formatted_output_path(self._get_keys()))\n\n    def _get_keys(self) -> t.Dict[str, str]:\n        return {name: name for name in self.output_info.split_dims()}\n\n\ndef get_splitter(file_path: str,\n                 output_info: OutFileInfo,\n                 dry_run: bool,\n                 force_split: bool = False,\n                 logging_level: int = logging.INFO,\n                 grib_filter_expression: t.Optional[str] = None) -> FileSplitter:\n    if dry_run:\n        logger.info('Using splitter: DrySplitter')\n        return DrySplitter(file_path, output_info, logging_level=logging_level)\n\n    with FileSystems.open(file_path) as f:\n        header = f.read(4)\n\n    if b'GRIB' in header:\n        metrics.Metrics.counter('get_splitter', 'grib').inc()\n\n        # Decide which version of the grib splitter to use depending on if ecCodes is 
installed.\n        # Prefer the v2 grib splitter, which should be more robust -- especially when splitting by\n        # multiple dimensions at once.\n        cmd = shutil.which('grib_copy')\n        if cmd:\n            logger.info('Using splitter: GribSplitterV2')\n            return GribSplitterV2(file_path, output_info, force_split,\n                                  logging_level, grib_filter_expression)\n        else:\n            logger.info('Using splitter: GribSplitter')\n            return GribSplitter(file_path, output_info, force_split,\n                                logging_level)\n\n    # See the NetCDF Spec docs:\n    # https://docs.unidata.ucar.edu/netcdf-c/current/faq.html#How-can-I-tell-which-format-a-netCDF-file-uses\n    if b'CDF' in header or b'HDF' in header:\n        metrics.Metrics.counter('get_splitter', 'netcdf').inc()\n        logger.info('Using splitter: NetCdfSplitter')\n        return NetCdfSplitter(file_path, output_info, force_split)\n\n    raise ValueError(\n        f'cannot determine if file {file_path!r} is Grib or NetCDF.')\n"
  },
  {
    "path": "weather_sp/splitter_pipeline/file_splitters_test.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport shutil\nfrom collections import defaultdict\n\nimport h5py\nimport numpy as np\nimport pygrib\nimport pytest\nimport xarray as xr\nfrom unittest.mock import patch\n\nimport weather_sp\nfrom .file_name_utils import OutFileInfo\nfrom .file_name_utils import get_output_file_info\nfrom .file_splitters import (\n    DrySplitter,\n    GribSplitter,\n    GribSplitterV2,\n    NetCdfSplitter,\n    get_splitter,\n)\n\n\n@pytest.fixture()\ndef data_dir():\n    data_dir = f'{next(iter(weather_sp.__path__))}/test_data'\n    yield data_dir\n    split_dir = f'{data_dir}/split_files/'\n    if os.path.exists(split_dir):\n        shutil.rmtree(split_dir)\n\n\n@pytest.fixture(params=[GribSplitter, GribSplitterV2])\ndef grib_splitter(request):\n    return request.param\n\n\nclass TestGetSplitter:\n\n    def test_get_splitter_grib(self, data_dir):\n        splitter = get_splitter(f'{data_dir}/era5_sample.grib',\n                                OutFileInfo(\n                                    file_name_template='some_out_{split}',\n                                    ending='.grib',\n                                    formatting='',\n                                    template_folders=[]),\n                                dry_run=False)\n        assert isinstance(splitter, GribSplitter)\n\n    def test_get_splitter_nc(self, data_dir):\n        splitter = 
get_splitter(f'{data_dir}/era5_sample.nc',\n                                OutFileInfo(\n                                    file_name_template='some_out_{split}',\n                                    ending='.nc',\n                                    formatting='',\n                                    template_folders=[]),\n                                dry_run=False)\n\n        assert isinstance(splitter, NetCdfSplitter)\n\n    def test_get_splitter_undetermined_grib(self, data_dir):\n        splitter = get_splitter(f'{data_dir}/era5_sample_grib',\n                                OutFileInfo(\n                                    file_name_template='some_out_{split}',\n                                    ending='',\n                                    formatting='',\n                                    template_folders=[]),\n                                dry_run=False)\n        assert isinstance(splitter, GribSplitter)\n\n    def test_get_splitter_dryrun(self):\n        splitter = get_splitter('some/file/path/data.grib',\n                                OutFileInfo(\n                                    file_name_template='some_out_{split}',\n                                    ending='.grib',\n                                    formatting='',\n                                    template_folders=[]),\n                                dry_run=True)\n        assert isinstance(splitter, DrySplitter)\n\n\nclass TestGribSplitter:\n    def test_get_output_file_path(self, grib_splitter):\n        splitter = grib_splitter(\n            'path/to/input',\n            OutFileInfo(\n                file_name_template='path/output/file.{typeOfLevel}_{shortName}',\n                ending='.grib',\n                formatting='',\n                template_folders=[])\n        )\n        out = splitter.output_info.formatted_output_path(\n            {'typeOfLevel': 'surface', 'shortName': 'cc'})\n        assert out == 'path/output/file.surface_cc.grib'\n\n    def 
test_split_data_with_filter(self, data_dir):\n        input_path = f'{data_dir}/era5_sample.grib'\n        output_base = f'{data_dir}/split_files/era5_sample'\n        splitter = GribSplitterV2(\n            input_path,\n            OutFileInfo(\n                output_base,\n                formatting='_{typeOfLevel}_{shortName}',\n                ending='.grib',\n                template_folders=[]),\n            grib_filter_expression=\"typeOfLevel=isobaricInhPa,level=200\"\n        )\n        splitter.split_data()\n        assert os.path.exists(f'{data_dir}/split_files/') is True\n\n        short_names = ['d', 'cc', 'z', 'r']\n        input_data = defaultdict(list)\n        split_data = defaultdict(list)\n\n        input_grbs = pygrib.open(input_path)\n        for grb in input_grbs:\n            if grb.typeOfLevel == 'isobaricInhPa' and grb.level == 200:\n                input_data[grb.shortName].append(grb.values)\n\n        for sn in short_names:\n            split_file = f'{data_dir}/split_files/era5_sample_isobaricInhPa_{sn}.grib'\n            split_grbs = pygrib.open(split_file)\n            for grb in split_grbs:\n                split_data[sn].append(grb.values)\n\n        for sn in short_names:\n            orig = np.array(input_data[sn])\n            split = np.array(split_data[sn])\n            assert orig.shape == split.shape\n            np.testing.assert_allclose(orig, split)\n\n    def test_split_data(self, data_dir, grib_splitter):\n        input_path = f'{data_dir}/era5_sample.grib'\n        output_base = f'{data_dir}/split_files/era5_sample'\n        splitter = grib_splitter(\n            input_path,\n            OutFileInfo(\n                output_base,\n                formatting='_{typeOfLevel}_{shortName}',\n                ending='.grib',\n                template_folders=[])\n        )\n        splitter.split_data()\n        assert os.path.exists(f'{data_dir}/split_files/') is True\n\n        short_names = ['z', 'r', 'cc', 'd']\n        
input_data = defaultdict(list)\n        split_data = defaultdict(list)\n\n        input_grbs = pygrib.open(input_path)\n        for grb in input_grbs:\n            input_data[grb.shortName].append(grb.values)\n\n        for sn in short_names:\n            split_file = f'{data_dir}/split_files/era5_sample_isobaricInhPa_{sn}.grib'\n            split_grbs = pygrib.open(split_file)\n            for grb in split_grbs:\n                split_data[sn].append(grb.values)\n\n        for sn in short_names:\n            orig = np.array(input_data[sn])\n            split = np.array(split_data[sn])\n            assert orig.shape == split.shape\n            np.testing.assert_allclose(orig, split)\n\n    def test_skips_existing_split(self, data_dir, grib_splitter):\n        input_path = f'{data_dir}/era5_sample.grib'\n        splitter = grib_splitter(\n            input_path,\n            OutFileInfo(f'{data_dir}/split_files/era5_sample',\n                        formatting='_{typeOfLevel}_{shortName}',\n                        ending='.grib',\n                        template_folders=[])\n        )\n        assert not splitter.should_skip()\n        splitter.split_data()\n        assert os.path.exists(f'{data_dir}/split_files/')\n        assert splitter.should_skip()\n\n    @patch('weather_sp.splitter_pipeline.file_splitters.FileSplitter.should_skip_file')\n    def test_skips_existing_split_with_filter(self, mock_should_skip_file, data_dir):\n        input_path = f'{data_dir}/era5_sample.grib'\n        splitter = GribSplitterV2(\n            input_path,\n            OutFileInfo(f'{data_dir}/split_files/era5_sample',\n                        formatting='_{typeOfLevel}_{shortName}',\n                        ending='.grib',\n                        template_folders=[]),\n            grib_filter_expression=\"typeOfLevel=isobaricInhPa,level=200\"\n        )\n        splitter.split_data()\n        assert mock_should_skip_file.call_count == 8\n\n    def 
test_does_not_skip__if_forced(self, data_dir, grib_splitter):\n        input_path = f'{data_dir}/era5_sample.grib'\n        output_base = f'{data_dir}/split_files/era5_sample'\n        splitter = grib_splitter(\n            input_path,\n            OutFileInfo(\n                output_base,\n                formatting='_{levelType}_{shortName}',\n                ending='.grib',\n                template_folders=[]),\n            force_split=True\n        )\n        assert not splitter.should_skip()\n        splitter.split_data()\n        assert os.path.exists(f'{data_dir}/split_files/')\n        assert not splitter.should_skip()\n\n    @pytest.mark.limit_memory('30 MB')\n    def test_split__fits_memory_bounds(self, data_dir, grib_splitter):\n        input_path = f'{data_dir}/era5_sample.grib'\n        output_base = f'{data_dir}/split_files/era5_sample'\n        splitter = grib_splitter(\n            input_path,\n            OutFileInfo(\n                output_base,\n                formatting='_{typeOfLevel}_{shortName}',\n                ending='.grib',\n                template_folders=[])\n        )\n        splitter.split_data()\n\n\nclass TestNetCdfSplitter:\n\n    def test_get_output_file_path(self):\n        splitter = NetCdfSplitter(\n            'path/to/input',\n            OutFileInfo('path/output/file_{variable}',\n                        ending='.nc',\n                        formatting='',\n                        template_folders=[]))\n        out = splitter.output_info.formatted_output_path({'variable': 'cc'})\n        assert out == 'path/output/file_cc.nc'\n\n    def test_split_data(self, data_dir):\n        input_path = f'{data_dir}/era5_sample.nc'\n        output_base = f'{data_dir}/split_files/era5_sample'\n        splitter = NetCdfSplitter(\n            input_path,\n            OutFileInfo(output_base,\n                        formatting='_{time}_{variable}',\n                        ending='.nc',\n                        
template_folders=[]))\n\n        splitter.split_data()\n        assert os.path.exists(f'{data_dir}/split_files/')\n        input_data = xr.open_dataset(input_path, engine='netcdf4')\n        for time in ['2015-01-15T00:00', '2015-01-15T06:00', '2015-01-15T12:00', '2015-01-15T18:00']:\n            expect = input_data.sel(time=time)\n            for sn in ['d', 'cc', 'z']:\n                split_file = f'{data_dir}/split_files/era5_sample_{time}_{sn}.nc'\n                split_data = xr.open_dataset(split_file, engine='netcdf4')\n                xr.testing.assert_allclose(expect[sn], split_data[sn])\n\n    def test_split_data__not_in_dims_raises(self, data_dir):\n        input_path = f'{data_dir}/era5_sample.nc'\n        output_base = f'{data_dir}/split_files/era5_sample'\n        splitter = NetCdfSplitter(input_path,\n                                  OutFileInfo(output_base,\n                                              formatting='_{level}',\n                                              ending='.nc',\n                                              template_folders=[]))\n        with pytest.raises(ValueError):\n            splitter.split_data()\n\n    def test_split_data__unsupported_dim_raises(self, data_dir):\n        input_path = f'{data_dir}/era5_sample.nc'\n        output_base = f'{data_dir}/split_files/era5_sample'\n        splitter = NetCdfSplitter(input_path,\n                                  OutFileInfo(output_base,\n                                              formatting='_{longitude}',\n                                              ending='.nc',\n                                              template_folders=[]))\n        with pytest.raises(ValueError):\n            splitter.split_data()\n\n    def test_skips_existing_split(self, data_dir):\n        input_path = f'{data_dir}/era5_sample.nc'\n        output_base = f'{data_dir}/split_files/era5_sample'\n        splitter = NetCdfSplitter(input_path,\n                                  
OutFileInfo(output_base,\n                                              formatting='_{variable}',\n                                              ending='.nc',\n                                              template_folders=[]))\n        assert not splitter.should_skip()\n        splitter.split_data()\n        assert os.path.exists(f'{data_dir}/split_files/')\n        assert splitter.should_skip()\n\n    def test_does_not_skip__if_forced(self, data_dir):\n        input_path = f'{data_dir}/era5_sample.nc'\n        output_base = f'{data_dir}/split_files/era5_sample'\n        splitter = NetCdfSplitter(input_path,\n                                  OutFileInfo(\n                                      output_base,\n                                      formatting='_{variable}',\n                                      ending='.nc',\n                                      template_folders=[]),\n                                  force_split=True)\n        assert not splitter.should_skip()\n        splitter.split_data()\n        assert os.path.exists(f'{data_dir}/split_files/')\n        assert not splitter.should_skip()\n\n    @pytest.mark.limit_memory('25 MB')\n    def test_split_data__fits_memory_bounds(self, data_dir):\n        input_path = f'{data_dir}/era5_sample.nc'\n        output_base = f'{data_dir}/split_files/era5_sample'\n        splitter = NetCdfSplitter(\n            input_path,\n            OutFileInfo(output_base,\n                        formatting='_{time}_{variable}',\n                        ending='.nc',\n                        template_folders=[]))\n\n        splitter.split_data()\n\n    def test_split_data__is_netcdf4(self, data_dir):\n        input_path = f'{data_dir}/era5_sample.nc'\n        assert not h5py.is_hdf5(input_path)\n        output_base = f'{data_dir}/split_files/era5_sample'\n        splitter = NetCdfSplitter(\n            input_path,\n            OutFileInfo(output_base,\n                        formatting='_{time}_{variable}',\n            
            ending='.nc',\n                        template_folders=[]))\n\n        splitter.split_data()\n        assert os.path.exists(f'{data_dir}/split_files/')\n        for time in ['2015-01-15T00:00', '2015-01-15T06:00', '2015-01-15T12:00', '2015-01-15T18:00']:\n            for sn in ['d', 'cc', 'z']:\n                split_file = f'{data_dir}/split_files/era5_sample_{time}_{sn}.nc'\n                assert h5py.is_hdf5(split_file)\n\n\nclass TestDrySplitter:\n\n    def test_path_with_output_pattern(self):\n        input_path = 'a/b/c/d/file.nc'\n        out_pattern = 'gs://my_bucket/splits/{2}-{1}-{0}_old_data.{variable}.cd'\n        out_info = get_output_file_info(\n            filename=input_path, out_pattern=out_pattern)\n        splitter = DrySplitter(input_path, out_info)\n        keys = splitter._get_keys()\n        assert keys == {'variable': 'variable'}\n        out_file = splitter.output_info.formatted_output_path(keys)\n        assert out_file == 'gs://my_bucket/splits/c-d-file_old_data.variable.cd'\n\n    def test_path_with_output_pattern_no_formatting(self):\n        # OutFileInfo using pattern but without any formatting marks.\n        splitter = DrySplitter(\"input_path/file.grib\",\n                               OutFileInfo(\"some/out/no/formatting\",\n                                           formatting='',\n                                           ending='',\n                                           template_folders=[]))\n        with pytest.raises(ValueError):\n            splitter.split_data()\n"
  },
  {
    "path": "weather_sp/splitter_pipeline/pipeline.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\nimport logging\nimport os\nimport typing as t\n\nimport apache_beam as beam\nimport apache_beam.metrics as metrics\nfrom apache_beam.io.fileio import MatchFiles, ReadMatches\nfrom apache_beam.options.pipeline_options import PipelineOptions, SetupOptions\nfrom apache_beam.io.gcp.pubsub import ReadFromPubSub\n\nfrom .file_name_utils import OutFileInfo, get_output_file_info\nfrom .file_splitters import get_splitter\nfrom .streaming import GroupMessagesByFixedWindows, ParsePaths\n\nlogger = logging.getLogger(__name__)\nSDK_CONTAINER_IMAGE='gcr.io/weather-tools-prod/weather-tools:0.0.0'\n\n\ndef configure_logger(verbosity: int) -> None:\n    \"\"\"Configures logging from verbosity. 
Default verbosity will show errors.\"\"\"\n    level = 40 - verbosity * 10\n    logging.getLogger(__package__).setLevel(level)\n    logger.setLevel(level)\n\n\ndef split_file(input_file: str,\n               input_base_dir: str,\n               output_template: t.Optional[str],\n               output_dir: t.Optional[str],\n               formatting: str,\n               dry_run: bool,\n               force_split: bool = False,\n               logging_level: int = logging.INFO,\n               grib_filter_expression: t.Optional[str] = None):\n    output_base_name = get_output_base_name(input_path=input_file,\n                                            input_base=input_base_dir,\n                                            output_template=output_template,\n                                            output_dir=output_dir,\n                                            formatting=formatting)\n    logger.info('Splitting file: %s. Output base name: %s',\n                input_file, output_base_name)\n    metrics.Metrics.counter('pipeline', 'splitting file').inc()\n    level = 40 - logging_level * 10\n    splitter = get_splitter(input_file,\n                            output_base_name,\n                            dry_run,\n                            force_split,\n                            level,\n                            grib_filter_expression)\n    splitter.split_data()\n\n\ndef _get_base_input_directory(input_pattern: str) -> str:\n    base_dir = input_pattern\n    for x in ['*', '?', '[']:\n        base_dir = base_dir.split(x, maxsplit=1)[0]\n    # Go one directory up to include the last common directory in output path.\n    return os.path.dirname(os.path.dirname(base_dir))\n\n\ndef get_output_base_name(input_path: str,\n                         input_base: str,\n                         output_template: t.Optional[str],\n                         output_dir: t.Optional[str],\n                         formatting: str) -> OutFileInfo:\n    return 
get_output_file_info(input_path,\n                                input_base_dir=input_base,\n                                out_pattern=output_template,\n                                out_dir=output_dir,\n                                formatting=formatting)\n\n\ndef run(argv: t.List[str], save_main_session: bool = True):\n    \"\"\"Main entrypoint & pipeline definition.\"\"\"\n    parser = argparse.ArgumentParser(\n        prog='weather-sp',\n        description='Split weather data file into files by variable or other dimension.'\n    )\n    parser.add_argument('-i', '--input-pattern', type=str, required=True,\n                        help='Pattern for input weather data.')\n    parser.add_argument('--use-local-code', action='store_true', default=False, help='Supply local code to the Runner.')\n    output_options = parser.add_mutually_exclusive_group(required=True)\n    output_options.add_argument(\n        '--output-template', type=str,\n        help='Template specifying path to output files using '\n        'python-style formatting substitution of input '\n        'directory names. '\n        'For `input_pattern a/b/c/**` and file `a/b/c/file.grib`, '\n        'a template with formatting `/somewhere/{1}-{0}.{level}_{shortName}.grib` '\n        'will give `somewhere/c-file.level_shortName.grib`'\n    )\n    output_options.add_argument(\n        '--output-dir', type=str,\n        help='Output directory that will replace the common path of the '\n        'input_pattern. '\n        'For `input_pattern a/b/c/**` and file `a/b/c/file.nc`, '\n        '`outputdir /x/y/z` will create '\n        'output files like `/x/y/z/c/file_variable.nc`'\n    )\n    parser.add_argument(\n        '--formatting', type=str, default='',\n        help='Used in combination with `output-dir`: specifies how to '\n        'split the data and format the output file. 
'\n        'Example: `_{time}_{level}hPa`'\n    )\n    parser.add_argument('-d', '--dry-run', action='store_true', default=False,\n                        help='Test the input file matching and the output file scheme without splitting.')\n    parser.add_argument('-f', '--force', action='store_true', default=False,\n                        help='Force re-splitting of the pipeline. Turns off skipping of already split data.')\n    parser.add_argument('--log-level', type=int, default=2,\n                        help='An integer to configure log level. Default: 2(INFO)')\n    parser.add_argument('-w', '--where', type=str, default=None,\n                        help='Optional GRIB filter expression to apply during '\n                        'file splitting using grib_copy. '\n                        'This allows filtering GRIB messages based on '\n                        'key-value pairs, such as level, type of level, '\n                        'or date. '\n                        'This flag is only applicable to GRIB files and is '\n                        'specifically supported by the GribSplitterV2 '\n                        'implementation. '\n                        'Example: typeOfLevel=isobaricInhPa,level=1000')\n    parser.add_argument('--topic', type=str, default=None,\n                        help='Pub/Sub topic to read from for streaming mode.')\n    parser.add_argument('--subscription', type=str, default=None,\n                        help='Pub/Sub subscription to read from for streaming mode.')\n    parser.add_argument('--window-size', type=int, default=1,\n                        help='Window size in minutes for grouping Pub/Sub messages.')\n    parser.add_argument('--num-shards', type=int, default=5,\n                        help='Number of shards for partitioning windowed data.')\n\n    known_args, pipeline_args = parser.parse_known_args(argv[1:])\n\n    # If a Pub/Sub is used, then the pipeline must be a streaming pipeline.\n    if known_args.topic or 
known_args.subscription:\n        if known_args.topic and known_args.subscription:\n            raise ValueError('only one argument can be provided at a time: `topic` or `subscription`.')\n        pipeline_args.extend('--streaming true'.split())\n\n    configure_logger(known_args.log_level)  # 0 = error, 1 = warn, 2 = info, 3 = debug\n\n    pipeline_options = PipelineOptions(pipeline_args)\n    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session\n    input_pattern = known_args.input_pattern\n    input_base_dir = _get_base_input_directory(input_pattern)\n    output_template = known_args.output_template\n    output_dir = known_args.output_dir\n    formatting = known_args.formatting\n    dry_run = known_args.dry_run\n    grib_filter_expression = known_args.where\n\n    if not output_template and not output_dir:\n        raise ValueError('No output specified')\n\n    output_file_tmpl = os.path.basename(output_template)\n    if '[' in output_file_tmpl or ']' in output_file_tmpl or '[' in formatting or ']' in formatting:\n        raise ValueError('Tokens `[]` are disallowed in the file output.')\n\n    logger.debug('input_pattern: %s', input_pattern)\n    logger.debug('input_base_dir: %s', input_base_dir)\n    if output_template:\n        logger.debug('output_template: %s', output_template)\n    if output_dir:\n        logger.debug('output_dir: %s', output_dir)\n    logger.debug('dry_run: %s', known_args.dry_run)\n\n    with beam.Pipeline(options=pipeline_options) as p:\n        if known_args.topic or known_args.subscription:\n            paths = (\n                p\n                # Windowing is based on this code sample:\n                # https://cloud.google.com/pubsub/docs/pubsub-dataflow#code_sample\n                | 'ReadUploadEvent' >> ReadFromPubSub(\n                    known_args.topic, known_args.subscription\n                )\n                | 'WindowInto' >> GroupMessagesByFixedWindows(\n                    
known_args.window_size, known_args.num_shards\n                )\n                | 'ParsePaths' >> beam.ParDo(\n                    ParsePaths(known_args.input_pattern)\n                )\n            )\n        else:\n            paths = (\n                p\n                | 'MatchFiles' >> MatchFiles(input_pattern)\n                | 'ReadMatchedFiles' >> ReadMatches()\n                | 'Shuffle' >> beam.Reshuffle()\n                | 'GetPath' >> beam.Map(lambda x: x.metadata.path)\n            )\n        (\n            paths\n            | 'SplitFiles' >> beam.Map(\n                split_file,\n                input_base_dir,\n                output_template,\n                output_dir,\n                formatting,\n                dry_run,\n                known_args.force,\n                known_args.log_level,\n                grib_filter_expression,\n            )\n        )\n\n    logger.info('Pipeline is finished.')\n"
  },
  {
    "path": "weather_sp/splitter_pipeline/pipeline_test.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport unittest\nfrom unittest.mock import patch, ANY\n\nfrom .file_name_utils import OutFileInfo\nfrom .pipeline import _get_base_input_directory\nfrom .pipeline import get_output_base_name\nfrom .pipeline import split_file\n\n\nclass PipelineTest(unittest.TestCase):\n\n    def test_get_base_input_directory(self):\n        self.assertEqual(\n            _get_base_input_directory(\n                '/path/to/some/wild/*/card/??[0-1].nc'),\n            '/path/to/some')\n        self.assertEqual(\n            _get_base_input_directory(\n                '/path/to/some/wild/??/card/*[0-1].nc'),\n            '/path/to/some')\n        self.assertEqual(\n            _get_base_input_directory(\n                '/path/to/some/wild/201[8,9]/card/??.nc'),\n            '/path/to/some')\n\n    def test_get_output_base_name(self):\n        self.assertEqual(get_output_base_name(\n            input_path='somewhere/somefile',\n            input_base='somewhere',\n            output_template=None,\n            formatting='.{shortName}',\n            output_dir='out/there').file_name_template, 'out/there/somefile')\n\n    @patch('weather_sp.splitter_pipeline.file_splitters.get_splitter')\n    @unittest.skip('bad mocks')\n    def test_split_file(self, mock_get_splitter):\n        split_file(input_file='somewhere/somefile',\n                   input_base_dir='somewhere',\n                 
  output_dir='out/there',\n                   output_template=None,\n                   formatting='_{variable}',\n                   dry_run=True)\n        mock_get_splitter.assert_called_with('somewhere/somefile',\n                                             OutFileInfo('out/there/somefile',\n                                                         formatting='_{variable}',\n                                                         ending='',\n                                                         template_folders=[]),\n                                             True)\n\n    @patch('weather_sp.splitter_pipeline.pipeline.get_splitter')\n    def test_split_file_with_filter(self, mock_get_splitter):\n        split_file(\n            input_file='somewhere/somefile',\n            input_base_dir='somewhere',\n            output_dir='out/there',\n            output_template=None,\n            formatting='_{variable}',\n            dry_run=True,\n            grib_filter_expression='typeOfLevel=isobaricInhPa,level=200',\n        )\n        mock_get_splitter.assert_called_with(\n            'somewhere/somefile',\n            OutFileInfo('out/there/somefile',\n                        formatting='_{variable}',\n                        ending='',\n                        template_folders=[]),\n            True,\n            False,\n            ANY,\n            'typeOfLevel=isobaricInhPa,level=200')\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "weather_sp/splitter_pipeline/streaming.py",
    "content": "# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"Window and parse Pub/Sub streams of real-time weather data added to cloud storage.\n\nExample windowing code borrowed from:\n  https://cloud.google.com/pubsub/docs/pubsub-dataflow#code_sample\n\"\"\"\n\nimport datetime\nimport fnmatch\nimport json\nimport logging\nimport random\nimport typing as t\nfrom urllib.parse import urlparse\n\nimport apache_beam as beam\nfrom apache_beam.transforms.window import FixedWindows\n\nlogger = logging.getLogger(__name__)\n\n\nclass GroupMessagesByFixedWindows(beam.PTransform):\n    \"\"\"A composite transform that groups Pub/Sub messages based on publish time\n    and outputs a list of tuples, each containing a message and its publish time.\n    \"\"\"\n\n    def __init__(self, window_size: int, num_shards: int = 5):\n        # Convert the window size from minutes to seconds.\n        self.window_size = int(window_size * 60)\n        self.num_shards = num_shards\n\n    def expand(self, pcoll):\n        return (\n                pcoll\n                # Bind window info to each element using element timestamp (or publish time).\n                | \"Window into fixed intervals\" >> beam.WindowInto(FixedWindows(self.window_size))\n                | \"Add timestamp to windowed elements\" >> beam.ParDo(AddTimestamp())\n                # Assign a random key to each windowed element based on the number of shards.\n                | \"Add key\" >> 
beam.WithKeys(lambda _: random.randint(0, self.num_shards - 1))\n                # Group windowed elements by key. All the elements in the same window must fit\n                # memory for this. If not, you need to use `beam.util.BatchElements`.\n                | \"Group by key\" >> beam.GroupByKey()\n        )\n\n\nclass AddTimestamp(beam.DoFn):\n    \"\"\"Processes each windowed element by extracting the message body and its\n    publish time into a tuple.\n    \"\"\"\n\n    def process(self, element, publish_time=beam.DoFn.TimestampParam) -> t.Iterable[t.Tuple[str, str]]:\n        yield (\n            element.decode(\"utf-8\"),\n            datetime.datetime.utcfromtimestamp(float(publish_time)).strftime(\n                \"%Y-%m-%d %H:%M:%S.%f\"\n            ),\n        )\n\n\nclass ParsePaths(beam.DoFn):\n    \"\"\"Parse paths to real-time weather data from windowed-batches.\"\"\"\n\n    def __init__(self, uri_pattern: str):\n        self.uri_pattern = uri_pattern\n        self.protocol = f'{urlparse(uri_pattern).scheme}://'\n        super().__init__()\n\n    @classmethod\n    def try_parse_message(cls, message_body: t.Union[str, t.Dict]) -> t.Dict:\n        \"\"\"Robustly parse message body, which will be JSON in the vast majority of cases,\n         but might be a dictionary.\"\"\"\n        try:\n            return json.loads(message_body)\n        except (json.JSONDecodeError, TypeError):\n            if isinstance(message_body, dict):\n                return message_body\n            raise\n\n    def to_object_path(self, payload: t.Dict) -> str:\n        \"\"\"Parse cloud object from Pub/Sub topic payload.\"\"\"\n        return f'{self.protocol}{payload[\"bucket\"]}/{payload[\"name\"]}'\n\n    def should_skip(self, message_body: t.Dict) -> bool:\n        \"\"\"Returns true if Pub/Sub topic does *not* match the target file URI pattern.\"\"\"\n        try:\n            return not fnmatch.fnmatch(self.to_object_path(message_body), self.uri_pattern)\n        
except KeyError:\n            return True\n\n    def process(self, key_value, window=beam.DoFn.WindowParam) -> t.Iterable[str]:\n        \"\"\"Yield paths to real-time weather data in cloud storage.\"\"\"\n\n        shard_id, batch = key_value\n\n        logger.debug(f'Processing shard {shard_id!r}.')\n\n        for message_body, publish_time in batch:\n            logger.debug(message_body)\n\n            parsed_msg = self.try_parse_message(message_body)\n\n            target = self.to_object_path(parsed_msg)\n            logger.info(f'Parsed path {target!r}...')\n\n            if self.should_skip(parsed_msg):\n                logger.info(f'skipping {target!r}.')\n                continue\n\n            yield target\n"
  },
  {
    "path": "weather_sp/weather-sp",
    "content": "#!/usr/bin/env python3\n# Copyright 2021 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport glob\nimport logging\nimport os\nimport subprocess\nimport sys\nimport tarfile\nimport tempfile\n\nimport weather_sp\n\nSDK_CONTAINER_IMAGE='gcr.io/weather-tools-prod/weather-tools:0.0.0'\n\nif __name__ == '__main__':\n    logging.getLogger().setLevel(logging.INFO)\n\n    site_pkg = weather_sp.__path__[0]\n\n    try:\n        from splitter_pipeline import cli\n    except ImportError:\n        # Install the subpackage.\n        subprocess.check_call(\n            f'{sys.executable} -m pip -q install -e {site_pkg}'.split())\n\n        # Re-load sys.path\n        import site\n        from importlib import reload\n\n        reload(site)\n\n        # Re-attempt import. 
If this fails, the user probably has an older version of\n        # the package already installed on their machine that breaks this process.\n        # If that's the case, it's best to start from a clean virtual environment.\n        try:\n            from splitter_pipeline import cli\n        except ImportError as e:\n            raise ImportError(\n                'please re-install package in a clean python environment.') from e\n\n    args = []\n\n    if \"DataflowRunner\" in sys.argv and  \"--sdk_container_image\" not in sys.argv:\n        args.extend(['--sdk_container_image',\n             os.getenv('SDK_CONTAINER_IMAGE', SDK_CONTAINER_IMAGE),\n             '--experiments',\n             'use_runner_v2'])\n        \n    if \"--use-local-code\" in sys.argv:\n        with tempfile.TemporaryDirectory() as tmpdir:\n            original_dir = os.getcwd()\n\n            # Convert subpackage to a tarball\n            os.chdir(site_pkg)\n            subprocess.check_call(\n                f'{sys.executable} ./setup.py -q sdist --dist-dir {tmpdir}'.split(),\n            )\n\n            os.chdir(original_dir)\n\n            # Set tarball as extra packages for Beam.\n            pkg_archive = glob.glob(os.path.join(tmpdir, '*.tar.gz'))[0]\n\n            with tarfile.open(pkg_archive, 'r') as tar:\n                assert any([f.endswith('.py') for f in tar.getnames()]), 'extra_package must include python files!'\n\n            # cleanup memory to prevent pickling error.\n            tar = None\n            weather_sp = None\n            args.extend(['--extra_package', pkg_archive])\n            cli(args)\n    else:\n        cli(args)"
  },
  {
    "path": "xql/README.md",
    "content": "# `xql` - Querying Xarray Datasets with SQL\n\nRunning SQL like queries on Xarray Datasets. Consider dataset as a table and data variable as a column.\n> Note: For now, we support only zarr datasets and earth engine image collections.\n\n# Supported Features\n\n* **`Select` Variables** - From a large dataset having hundreds of variables select only needed variables.\n* **Apply `where` clause** - A general where condition like SQL. Applicable for queries which includes data for specific time range or only for specific regions. \n* **`group by` Functions** - This is supported on the coordinates only. e.g. time, latitude, longitude, etc.\n* **`aggregate` Functions** - Aggregate functions `AVG()`, `MIN()`, `MAX()`, etc. Only supported on data variables.\n* **`limit and offset` clause** - Apply limit and offset to filter out the required result.\n* For more checkout the [road-map](https://github.com/google/weather-tools/tree/xql-init/xql#roadmap).\n> Note: For now, we support `where` conditions and `groupby` on coordinates only. `orderby` can only be applied either on selected variables or on coordinates.\n\n# Quickstart\n\n## Prerequisites\n\nGet an access to the dataset you want to query. Here as an example we're going to use the analysis ready era5 public dataset. [full_37-1h-0p25deg-chunk-1.zarr-v3](https://pantheon.corp.google.com/storage/browser/gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3?project=gcp-public-data-signals).\n\nFor this `gcloud` must be configured in your local environment. 
Refer to [Initializing the gcloud CLI](https://cloud.google.com/sdk/docs/initializing) to configure `gcloud` locally.\n\n## Usage\n\n```\n# Install required packages\npip install xql\n\n# Jump into xql\npython xql/main.py xql\n```\n---\n### Supported meta commands\n`.help`: For usage info.\n\n`.exit`: To exit from the xql interpreter.\n\n`.set`: To set the dataset uri as a shortened key.\n```\n.set era5 gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3\n```\n\n`.show`: To list the dataset shortened keys. E.g. `.show` or `.show [key]`\n\n```\n.show era5\n```\n\n`[query]`  =>  Any valid sql like query.\n\n---\n### Example Queries\n\n1. Apply conditions. Query to get temperature of arctic region in January 2022:\n    ```\n    SELECT \n        temperature \n    FROM 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3' \n    WHERE\n        time >= '2022-01-01' AND \n        time < '2022-02-01' AND \n        latitude >= 66.5\n    ```\n    > Note: Multiline queries are not yet supported. Convert copied queries into a single line before execution.\n\n2. Aggregating results using Group By and Aggregate function. Daily average of temperature of arctic region in January 2022.\n    Setting the table name as shortened key.\n\n    ```\n    .set era5 gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3\n    ```\n    ```\n    SELECT \n        AVG(temperature), SUM(charnock), MIN('100m_v_component_of_wind') \n    FROM era5\n    WHERE \n        time >= '2022-01-01' AND \n        time < '2022-02-01' AND \n        latitude >= 66.5\n    GROUP BY time_date\n    ```\n    Replace `time_date` with `time_month` or `time_year` if monthly or yearly average is needed. Also use `MIN()` and `MAX()` functions the same way as `AVG()`.\n\n3. `caveat`: The above queries run on the client's local machine and generate a large two-dimensional array, so querying a very large amount of data will run into out-of-memory errors.\n\n    e.g. 
A query like the one below will give OOM errors if the client machine doesn't have enough RAM.\n\n    ```\n    SELECT \n        evaporation,\n        geopotential_at_surface,\n        temperature \n    FROM era5\n    ```\n\n# Dask Cluster Configuration\n\nSteps to deploy a Dask Cluster on GKE.\n1. Create a Kubernetes cluster if you don't have one. Follow [Creating a zonal cluster](https://cloud.google.com/kubernetes-engine/docs/how-to/creating-a-zonal-cluster).\n2. Get Cluster Credentials on Local Machine\n    ```\n    gcloud container clusters get-credentials {cluster_name} --region {cluster_region} --project {project}\n    ```\n3. Install `helm`. Follow [Helm | Installing Helm](https://helm.sh/docs/intro/install/). \n4. Deploy the Dask Cluster using the `helm` commands below.\n    ```\n    helm repo add dask https://helm.dask.org/\n    helm repo update\n    helm install xql-dask dask/dask\n    ```\n    > Replace `xql-dask` in the above command with the name you want your dask cluster to have. Then set the `DASK_CLUSTER={dask_cluster_name}` environment variable.\n5. Connect to a dask cluster\n    ```\n    from xql.utils import connect_dask_cluster\n    connect_dask_cluster()\n    ```\n\n# Roadmap\n\n_Updated on 2024-01-08_\n\n1. [x] **Select Variables**\n    1. [ ] On Coordinates\n    2. [x] On Variables \n2. [x] **Where Clause**: `=`, `>`, `>=`, `<`, `<=`, etc.\n    1. [x] On Coordinates\n    2. [ ] On Variables \n3. [x] **Aggregate Functions**: Only `AVG()`, `MIN()`, `MAX()`, `SUM()` are supported.\n   1. [x] With Group By\n   2. [x] Without Group By\n   3. [x] Multiple Aggregate functions in a single query\n4. [x] **Order By**: Apply sorting on the result.\n5. [x] **Limit**: Limiting the result to display.\n6. [ ] **Mathematical Operators** `(+, - , *, / )`: Add support to use mathematical operators in the query.\n7. [ ] **Aliases**: Add support to alias while querying.\n8. [ ] **Join Operations**: Support joining tables and apply query.\n9. 
[ ] **Nested Queries**: Add support to write nested queries.\n10. [ ] **Custom Aggregate Functions**: Support custom aggregate functions\n\n# `weather-lm` - Querying weather data using Natural Language prompts\n\nQuerying weather data using Natural Language prompts. This uses Gemini (a large language model from Google) to generate SQL-like queries and `xql` to execute them.\n\n# Quickstart\n\n## Prerequisites\n\nA Google API key is needed to initialize the language model. Refer to [Setup your API key](https://ai.google.dev/tutorials/python_quickstart#setup_your_api_key) to generate that key.\n\nSet that key as an environment variable. Run the command below.\n```\nexport GOOGLE_API_KEY=\"generate_key\"\n```\n\n## Usage\n```\n# Install required packages\npip install xql\n\n# Jump into language model\npython xql/main.py lm\n```\n---\n### Examples\n`Input Prompt`: Daily average temperature of New York for January 2015\n\nRelevant SQL Query:\n```\nSELECT \n    AVG('temperature') \nFROM \n    'gs://darshan-store/ar/2013-2022-full_37-1h-0p25deg-chunk-1.zarr-v3' \nWHERE \n    time >= '2015-01-01' AND \n    time < '2015-02-01' AND \n    city = 'New York' \nGROUP BY time_date\n```\nOutput Data:\n```\n     time_date  avg_temperature\n0   2015-01-01       240.978073\n1   2015-01-02       243.375031\n2   2015-01-03       244.584747\n3   2015-01-04       249.673065\n4   2015-01-05       245.650833\n...\nQuery took: 00:01:55\n```\n"
  },
  {
    "path": "xql/main.py",
    "content": "# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport argparse\n\nfrom src.weather_lm import nl_to_weather_data\nfrom src.xql import main\nfrom src.xql.utils import connect_dask_cluster\nfrom typing import List, Tuple\n\ndef parse_args() -> Tuple[argparse.Namespace, List[str]]:\n    parser = argparse.ArgumentParser()\n\n    parser.add_argument('mode', type=str, help='Select one from [xql, lm]')\n\n    return parser.parse_known_args()\n\n\nif __name__ == '__main__':\n    known_args, _ = parse_args()\n    if known_args.mode not in [\"xql\", \"lm\"]:\n        raise RuntimeError(\"Invalid mode type. Select one from [xql, lm]\")\n    prefix = \"xql\" if known_args.mode == \"xql\" else \"lm\"\n    try:\n        # Connect Dask Cluster\n        connect_dask_cluster()\n        while True:\n            query = input(f\"{prefix}> \")\n            if query == \".exit\":\n                break\n            if known_args.mode == \"xql\":\n                main(query)\n            else:\n                print(nl_to_weather_data(query))\n    except ImportError as e:\n        raise ImportError('main function is not imported please try again.') from e\n\n"
  },
  {
    "path": "xql/setup.py",
    "content": "# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\nfrom setuptools import find_packages, setup\n# Please maintain the dask_kubernetes version at 2024.4.2 (https://github.com/dask/dask-kubernetes/releases/tag/2024.4.2).\n# Subsequent versions have removed code which is essential for the current codebase to function properly.\nrequirements = [\n    \"dask==2024.3.0\",\n    \"distributed==2024.3.0\",\n    \"dask_kubernetes==2024.4.2\",\n    \"fsspec\",\n    \"gcsfs==2024.2.0\",\n    \"numpy==1.26.3\",\n    \"sqlglot\",\n    \"toolz==0.12.0\",\n    \"xarray==2024.01.0\",\n    \"xee\",\n    \"zarr==2.17.0\",\n    \"langchain\",\n    \"langchain-experimental\",\n    \"langchain-openai\",\n    \"langchain-google-genai\"\n]\n\nsetup(\n    name=\"xql\",\n    packages=find_packages(where='src'),\n    package_dir={'': 'src'},\n    install_requires=requirements,\n    version=\"0.0.2\",\n    author='anthromet',\n    author_email='anthromets-ecmwf@google.com',\n    description=(\"Running SQL queries on Xarray Datasets. Consider dataset as a table and data variable as a column.\"),\n    long_description=open('README.md', 'r', encoding='utf-8').read(),\n    long_description_content_type='text/markdown',\n    python_requires='>=3.9, <3.11',\n)\n"
  },
  {
    "path": "xql/src/__init__.py",
    "content": "# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n"
  },
  {
    "path": "xql/src/weather_lm/__init__.py",
    "content": "#!/usr/bin/env python3\n# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom .gemini import nl_to_sql_query, nl_to_weather_data #noqa\n"
  },
  {
    "path": "xql/src/weather_lm/constant.py",
    "content": "#!/usr/bin/env python3\n# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# ruff: noqa: E501\n\nMETADATA_URI = \"gs://darshan-store/xql/metadata.json\"\n\nGENERATE_SQL_TEMPLATE = \"\"\"You are a SQL expert. Given an input question, first create a syntactically correct SQL query to execute.\nNever query for all columns from a table. You must query only the columns that are needed to answer the question.\nPay attention to use only the column names you can see in the tables below.\nBe careful to not query for columns that do not exist. Also, pay attention to which column is in which table.\nEverytime wrap table name in single quotes ('').\nSpecify the time range, latitude, longitude as follows: (time >= '2012-12-01' AND time < '2013-04-01').\n\nWhile accessing the variable name from the table don't use \"\\\" this.\nEx. ( SELECT MAX(\"vertical_velocity\") FROM 'table' ) => True syntax\n    ( SELECT MAX(\\\"vertical_velocity\\\") FROM 'table' ) => False syntax\nAvoid using the 'time BETWEEN', 'latitude BETWEEN' syntax, opt for the former style instead.\n\nNote: At present, only data variables are supported in the SELECT Clause.  Coordinates (latitude, longitude, time) are not supported.  Therefore,\ncoordinates should not be used in the SELECT Clause.\n\nExample:\n\nSome important data details to consider:\n- Use latitude and longitude ranges for cities and countries.\n- Standard aggregations are applied to the data. 
A unique convention for aggregation for daily, monthly and yearly are time_date, time_month and time_year.\n- The WHERE clause and GROUP BY is specifically applies to coordinates variables. e.g. timestamp, latitude, longitude, and level coordinates.\n    For \"timestamp,\" use time_date for grouping by date and time_month for grouping by month. Standard SQL GROUP BY operations apply only to \"latitude\",\n    \"longitude\", and \"level\" column.\n- Write time always into 'YYYY-MM-DD' format. i.e. '2021-12-01'.\n\nPlease use the following format:\n\nQuestion: \"Question here\"\nSQLQuery: \"SQL Query to run\"\n\nUse the following information for the database:\n- Use {table} as table name.\n- The dataset includes columns like {columns}. Select appropriate columns from these which are most relevant to the Question.\n- Latitude range is {latitude_range}, and longitude range is {longitude_range}. Generate query accordingly.\n- {latitude_dim} and {longitude_dim} are my columns for latitude and longitude so use them everywhere in query.\n    Ex. 
If lat and lon are in the {dims} then instead of latitude > x AND longitude > y use lat > x AND lon > y.\n- The interpretation of the \"organic\" soil type is value of soil type is equal to 6.\n- \"Over all locations\", \"globally\" entails iterating through \"latitude\" & \"longitude.\"\n\nSome examples of SQL queries that correspond to questions are:\n\n{few_shot_examples}\n\nQuestion: {question}\"\"\"\n\nfew_shots = {\n    \"Aggregate precipitation over months over all locations?\" : \"SELECT SUM(precipitation) FROM 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3' GROUP BY latitude, longitude, time_month\",\n    \"Daily average temperature.\":\"\"\"SELECT AVG(temperature) FROM \"gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3\" GROUP BY time_date\"\"\",\n    \"Average temperature of the Antarctic Area during last monsoon over months.\":\"SELECT AVG(temperature) FROM 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3' WHERE time >= '2022-06-01' AND time < '2022-11-01' AND latitude < 66.5 GROUP BY time_month\",\n    \"Average temperature over years.\":\"SELECT AVG(temperature) FROM 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3' GROUP BY time_year\",\n    \"Aggregate precipitation globally?\" : \"SELECT SUM(precipitation) FROM 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3' GROUP BY latitude, longitude\",\n    \"For January 2000\" : \"SELECT * from TABLE where time >= '2000-01-01 00:00:00' AND time < '2000-02-01 00:00:00' \",\n    \"Daily average temperature of city x for January 2015?\": \"SELECT AVG(temperature) FROM 'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3' WHERE time >= '2015-01-01' AND time < '2015-02-01' AND latitude > 40 AND latitude < 41 AND longitude > 286 AND longitude < 287 GROUP BY time_date\",\n    \"Daily min reflectivity of city x for January 2015?\": \"SELECT AVG(reflectivity) FROM 
'ee://projects/anthromet-prod/assets/opera/instantaneous_maximum_reflectivity' WHERE time >= '2015-01-01' AND time < '2015-02-01' AND lat > 40.48 AND lat < 41.87 AND lon > -74.25 AND lon < -71.98 GROUP BY time_date\"\n}\n\nSELECT_DATASET_TEMPLATE = \"\"\"\nI have some description of tables that stores weather related data.\nAnalyze and give me an table that i need to query for provided question.\nSometimes the exact column not be there in the table so select table that contains most relevant columns.\n    Ex. Daily average of precipitation rate asked but exact precipitation column is not there then select the table that contains relevant column like total_precipitation, precipitation_rate, total_precipitation_rate, etc.\nBelow is the description and input qustion\n{table_map}\n\nQuestion: {question}\n\nPlease use the following format:\n\n{question}:appropriate table\n\"\"\"\n"
  },
  {
    "path": "xql/src/weather_lm/gemini.py",
    "content": "#!/usr/bin/env python3\n# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\n\nfrom langchain_google_genai import ChatGoogleGenerativeAI\n\nfrom .constant import few_shots\nfrom .template import DEFINED_PROMPTS\nfrom .utils import get_invocation_steps, get_table_map_prompt\nfrom xql import run_query\n\ndef nl_to_sql_query(input_statement: str) -> str:\n    \"\"\"\n    Convert a natural language query to SQL.\n\n    Parameters:\n    - input_statement (str): The natural language query.\n\n    Returns:\n    - str: The generated SQL query.\n    \"\"\"\n    # Check if API key is provided either directly or through environment variable\n    api_key = os.getenv(\"GOOGLE_API_KEY\")\n\n    if api_key is None:\n        raise RuntimeError(\"Environment variable GOOGLE_API_KEY is not set.\")\n\n    # Get table map\n    table_map_prompt, table_map = get_table_map_prompt()\n\n    # Initialize model for natural language processing\n    model = ChatGoogleGenerativeAI(model=\"gemini-pro\")\n\n    # Get invocation steps for selecting dataset\n    select_dataset_model = get_invocation_steps(DEFINED_PROMPTS['select_dataset'], model)\n\n    # Get invocation steps for generating SQL\n    generate_sql_model = get_invocation_steps(DEFINED_PROMPTS['generate_sql'], model)\n\n    # Invoke pipeline to select dataset based on input statement\n    select_dataset_res = select_dataset_model.invoke({ \"question\": input_statement, 
\"table_map\": table_map_prompt })\n\n    # Extract dataset key from result\n    dataset_key = select_dataset_res.split(\":\")[-1].strip()\n\n    # Retrieve dataset metadata using dataset key\n    dataset_metadata = table_map[dataset_key]\n\n    # Invoke pipeline to generate SQL query\n    generate_sql_res = generate_sql_model.invoke({\n        \"question\": input_statement,\n        \"table\": dataset_metadata['uri'],\n        \"columns\": dataset_metadata['columns'],\n        \"few_shot_examples\": few_shots,\n        \"dims\": dataset_metadata[\"dims\"],\n        'latitude_dim': dataset_metadata[\"latitude_dim\"],\n        'latitude_range': dataset_metadata[\"latitude_range\"],\n        'longitude_dim': dataset_metadata[\"longitude_dim\"],\n        'longitude_range': dataset_metadata[\"longitude_range\"]\n    })\n\n    # Extract SQL query from result.\n    # The response will look like [SQLQuery: SELECT * FROM {table} WHERE ...].\n    # So slice the sql query from string.\n    sql_query = generate_sql_res[11:-1]\n\n    return sql_query\n\n\ndef nl_to_weather_data(input_statement: str):\n    \"\"\"\n    Convert a natural language query to SQL and fetch weather data.\n\n    Parameters:\n    - input_statement (str): The natural language query.\n    \"\"\"\n    # Generate SQL query\n    sql_query = nl_to_sql_query(input_statement)\n\n    # Print generated SQL statement for debugging\n    print(\"Generated SQL Statement:\", sql_query)\n\n    # Execute SQL query to fetch weather data\n    return run_query(sql_query)\n"
  },
  {
    "path": "xql/src/weather_lm/template.py",
    "content": "#!/usr/bin/env python3\n# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom langchain.prompts import PromptTemplate\n\nfrom .constant import GENERATE_SQL_TEMPLATE, SELECT_DATASET_TEMPLATE\n\nDEFINED_PROMPTS = {\n    'select_dataset': PromptTemplate(\n        input_variables = [\"table_map\", \"question\"],\n        template = SELECT_DATASET_TEMPLATE,\n    ),\n    'generate_sql': PromptTemplate(\n        input_variables = [\n            \"question\",\n            \"few_shot_examples\",\n            \"table\",\n            \"columns\",\n            \"dims\",\n            \"latitude_dim\",\n            \"latitude_range\",\n            \"longitude_dim\",\n            \"longitude_range\"\n        ],\n        template = GENERATE_SQL_TEMPLATE,\n    )\n}\n"
  },
  {
    "path": "xql/src/weather_lm/utils.py",
    "content": "#!/usr/bin/env python3\n# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport json\nimport typing as t\n\nfrom gcsfs import GCSFileSystem\nfrom langchain.prompts import PromptTemplate\nfrom langchain.schema.output_parser import StrOutputParser\nfrom langchain_google_genai import ChatGoogleGenerativeAI\n\nfrom .constant import METADATA_URI\n\ndef get_table_map() -> t.Dict:\n    \"\"\"\n    Load and return the table map from dataset-meta.json file.\n\n    Returns:\n        dict: Dictionary containing table names as keys and their metadata as values.\n    \"\"\"\n    fs = GCSFileSystem()\n    table_map = {}\n    with fs.open(METADATA_URI) as f:\n        table_map = json.load(f)\n    return table_map\n\ndef get_table_map_prompt() -> t.Tuple:\n    \"\"\"\n    Generate a prompt containing information about each table in the dataset.\n\n    Returns:\n        tuple: A tuple containing the prompt string and the table map dictionary.\n    \"\"\"\n    table_prompts = []\n    table_map = get_table_map()\n    for k, v in table_map.items():\n        data_str = f\"\"\"Table name is {k}.\n        It's located at {v['uri']} and containing following columns: {', '.join(v['columns'])}\"\"\"\n\n        table_prompts.append(data_str)\n    return \"\\n\".join(table_prompts), table_map\n\n\ndef get_invocation_steps(prompt: PromptTemplate, model: ChatGoogleGenerativeAI):\n    \"\"\"\n    Get the invocation steps for a given prompt and 
model.\n\n    Parameters:\n    - prompt (PromptTemplate): The prompt template to use.\n    - model (ChatGoogleGenerativeAI): The generative model to use.\n\n    Returns:\n    - Pipeline: The invocation steps for the given prompt and model.\n    \"\"\"\n    chat = (\n        prompt\n        | model.bind()\n        | StrOutputParser()\n    )\n    return chat\n"
  },
  {
    "path": "xql/src/xql/__init__.py",
    "content": "# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\nfrom .apply import main, parse_query, run_query #noqa\n"
  },
  {
    "path": "xql/src/xql/apply.py",
    "content": "#!/usr/bin/env python3\n# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport readline # noqa\n\nimport numpy as np\nimport pandas as pd\nimport typing as t\nimport xarray as xr\n\nfrom sqlglot import parse_one, exp\nfrom xarray.core.groupby import DatasetGroupBy\n\nfrom .open import get_chunking, open_dataset\nfrom .utils import timing\nfrom .where import apply_where\n\ncommand_info = {\n    \".exit\": \"To exit from the current session.\",\n    \".set\": \"To set the dataset uri as a shortened key. e.g. .set era5 gs://{BUCKET}/dataset-uri\",\n    \".show\": \"To list down dataset shortened key. e.g. 
.show or .show [key]\",\n    \"[query]\": \"Any valid sql like query.\"\n}\n\ntable_dataset_map = {} # To store dataset shortened keys for a single session.\n\noperate = {\n    \"and\" : lambda a, b: a & b,\n    \"or\" : lambda a, b: a | b,\n    \"eq\" : lambda a, b: a == b,\n    \"gt\" : lambda a, b: a > b,\n    \"lt\" : lambda a, b: a < b,\n    \"gte\" : lambda a, b: a >= b,\n    \"lte\" : lambda a, b: a <= b,\n}\n\naggregate_function_map = {\n    'avg': lambda x, y: x.mean(dim=y),\n    'min': lambda x, y: x.min(dim=y),\n    'max': lambda x, y: x.max(dim=y),\n    'sum': lambda x, y: x.sum(dim=y),\n}\n\ntimestamp_formats = {\n    'time_date':\"%Y-%m-%d\",\n    'time_month':\"%Y-%m\",\n    'time_year': \"%Y\"\n}\n\ndef parse(a: t.Union[xr.DataArray, str], b: t.Union[xr.DataArray, str]) -> t.Tuple[t.Union[xr.DataArray, str],\n                                                                                   t.Union[xr.DataArray, str]]:\n    \"\"\"\n    Parse input values 'a' and 'b' into NumPy arrays with compatible types for evaluation.\n\n    Parameters:\n    - a (Union[xr.DataArray, str]): The first input value.\n    - b (Union[xr.DataArray, str]): The second input value.\n\n    Returns:\n    - Tuple[xr.DataArray, Union[np.float64, np.float32, np.datetime64]]: Parsed NumPy arrays 'a' and 'b'.\n    \"\"\"\n\n    if isinstance(a, str):\n        a, b = b, a\n    arr_type = a.dtype.name\n    if arr_type == 'float64':\n        b = np.float64(b)\n    elif arr_type == 'float32':\n        b = np.float32(b)\n    elif arr_type == 'datetime64[ns]':\n        b = np.datetime64(b)\n    return a, b\n\n\ndef apply_orderby(e: exp.Order, df: pd.DataFrame) -> pd.DataFrame:\n    \"\"\"\n    Apply ORDER BY clause to the DataFrame.\n\n    Args:\n    - e (exp.Order): Parsed ORDER BY expression.\n    - df (pd.DataFrame): DataFrame to be sorted.\n\n    Returns:\n    - pd.DataFrame: Sorted DataFrame based on the ORDER BY clause.\n    \"\"\"\n    orderby_columns = []\n    
orderby_columns_order = []\n\n    # Extract columns and sorting orders from the parsed ORDER BY expression\n    for el in e.expressions:\n        orderby_column = el.find(exp.Column).find(exp.Identifier).this\n        order = not bool(el.args['desc'])  # Descending if desc=False\n        orderby_columns.append(orderby_column)\n        orderby_columns_order.append(order)\n\n    # Sort the DataFrame based on the extracted columns and orders\n    df = df.sort_values(orderby_columns, ascending=orderby_columns_order)\n\n    return df\n\n\ndef aggregate_variables(agg_funcs: t.List[t.Dict[str, str]],\n                        ds: xr.Dataset,\n                        time_fields: t.List[str],\n                        coords_to_squeeze: t.List[str]) -> xr.Dataset:\n    \"\"\"\n    Aggregate variables in an xarray dataset based on aggregation functions.\n\n    Args:\n        agg_funcs (List[Dict[str, str]]): List of dictionaries specifying aggregation functions for variables.\n        ds (xr.Dataset): The input xarray dataset.\n        time_fields (List[str]): List of time fields to consider for time-based grouping.\n        coords_to_squeeze (List[str]): List of coordinates to be squeezed during aggregation.\n\n    Returns:\n        xr.Dataset: The aggregated xarray dataset.\n    \"\"\"\n    agg_dataset = xr.Dataset(coords=ds.coords, attrs=ds.attrs)\n\n    # Aggregate based on time fields\n    if len(time_fields):\n        agg_dataset = agg_dataset.groupby(ds['time'].dt.strftime(timestamp_formats[time_fields[0]]))\n        agg_dataset = apply_aggregation(agg_dataset, 'avg', None)\n        agg_dataset = agg_dataset.rename({\"strftime\": time_fields[0]})\n\n    # Aggregate based on other coordinates\n    agg_dataset = apply_aggregation(agg_dataset, 'avg', coords_to_squeeze)\n\n    # Loop through aggregation functions\n    for agg_func in agg_funcs:\n        variable, function = agg_func['var'], agg_func['func']\n        grouped_ds = ds[variable]\n        dims = [value for 
value in coords_to_squeeze if value in ds[variable].coords] if coords_to_squeeze else None\n\n        # If time fields are specified, group by time\n        if len(time_fields):\n            groups = grouped_ds.groupby(ds['time'].dt.strftime(timestamp_formats[time_fields[0]]))\n            grouped_ds = apply_aggregation(groups, function, None)\n            grouped_ds = grouped_ds.rename({\"strftime\": time_fields[0]})\n\n        # Apply aggregation on dimensions\n        agg_dim_ds = apply_aggregation(grouped_ds, function, dims)\n        agg_dataset = agg_dataset.assign({f\"{function}_{variable}\": agg_dim_ds})\n\n    return agg_dataset\n\n\ndef apply_group_by(time_fields: t.List[str], ds: xr.Dataset, agg_funcs: t.Dict[str, str],\n                   coords_to_squeeze: t.List[str] = []) -> xr.Dataset:\n    \"\"\"\n    Apply group-by and aggregation operations to the dataset based on specified fields and aggregation functions.\n\n    Parameters:\n    - time_fields (List[str]): List of time_fields(coordinates) to be used for grouping.\n    - ds (xarray.Dataset): The input dataset.\n    - agg_funcs (t.List[t.Dict[str, str]]): Dictionary mapping aggregation function names to their corresponding\n    xarray-compatible string representations.\n    - coords_to_squeeze (t.List[str]): The dimension along which to apply the aggregation.\n        If None, aggregation is applied to the entire dataset.\n\n    Returns:\n    - xarray.Dataset: The dataset after applying group-by and aggregation operations.\n    \"\"\"\n\n    grouped_ds = ds\n\n    if len(time_fields) > 1:\n        raise NotImplementedError(\"GroupBy using multiple time fields is not supported.\")\n\n    elif len(time_fields) == 1:\n        grouped_ds = aggregate_variables(agg_funcs, ds, time_fields, coords_to_squeeze)\n\n    return grouped_ds\n\n\ndef apply_aggregation(groups: t.Union[xr.Dataset, DatasetGroupBy], fun: str, dim: t.List[str] = []) -> xr.DataArray:\n    \"\"\"\n    Apply aggregation to the groups 
based on the specified aggregation function.\n\n    Parameters:\n    - groups (Union[xr.Dataset, xr.core.groupby.DatasetGroupBy]): The input dataset or dataset groupby object.\n    - fun (str): The aggregation function to be applied.\n    - dim (Optional[str]): The dimension along which to apply the aggregation. If None, aggregation is applied\n    to the entire dataset.\n\n    Returns:\n    - xr.Dataset: The dataset after applying the aggregation.\n    \"\"\"\n\n    return aggregate_function_map[fun](groups, dim)\n\n\ndef get_coords_to_squeeze(fields: t.List[str], ds: xr.Dataset) -> t.List[str]:\n    \"\"\"\n    Get the coordinates to squeeze from an xarray dataset.\n\n    The function identifies coordinates in the dataset that are not part of the specified fields\n    and are not the 'time' coordinate.\n\n    Args:\n        fields (List[str]): List of field names.\n        ds (xr.Dataset): The xarray dataset.\n\n    Returns:\n        List[str]: List of coordinates to squeeze.\n    \"\"\"\n    # Identify coordinates not in fields and not 'time'\n    coords_to_squeeze = [coord for coord in ds.coords if coord not in fields and (coord != \"time\")]\n\n    return coords_to_squeeze\n\n\ndef get_table(e: exp.Expression) -> str:\n    \"\"\"\n    Get the table name from an expression.\n\n    Args:\n        e (Expression): The expression containing table information.\n\n    Returns:\n        str: The table name.\n    \"\"\"\n    # Extract the table name from the expression\n    table = e.find(exp.Table).args['this'].args['this']\n\n    # Check if the table is mapped in table_dataset_map\n    if table in table_dataset_map:\n        table = table_dataset_map[table]\n\n    return table\n\n\ndef parse_query(query: str) -> xr.Dataset:\n\n    expr = parse_one(query)\n\n    if not isinstance(expr, exp.Select):\n        return \"ERROR: Only select queries are supported.\"\n\n    table = get_table(expr)\n\n    is_star = expr.find(exp.Star)\n\n    data_vars = []\n    if is_star is 
None:\n        data_vars = [var.args['this'].args['this'] if var.key == 'column' else var.args['this']\n             for var in expr.expressions\n             if (var.key == \"column\" or (var.key == \"literal\" and var.args.get(\"is_string\") is True))]\n\n    where_clause = expr.find(exp.Where)\n    group_by = expr.find(exp.Group)\n\n    agg_funcs = [\n        {\n        'var': var.args['this'].args['this'].args['this'] if  var.args['this'].key == 'column'\n                                                        else var.args['this'].args['this'],\n        'func': var.key\n        }\n    for var in expr.expressions if var.key in aggregate_function_map\n    ]\n\n    if len(agg_funcs):\n        data_vars = [ agg_var['var'] for agg_var in agg_funcs ]\n\n    ds, chunkable = open_dataset(table)\n\n    if is_star is None:\n        ds = ds[data_vars]\n\n    if where_clause is not None:\n        ds = apply_where(ds, where_clause.args['this'])\n\n    if chunkable:\n        ds = ds.chunk(chunks=get_chunking(table, list(ds.data_vars)))\n\n    coords_to_squeeze = None\n    time_fields = []\n    if group_by:\n        fields = [ e.args['this'].args['this'] for e in group_by.args['expressions'] ]\n        time_fields = list(filter(lambda field: \"time\" in field, fields))\n        coords_to_squeeze = get_coords_to_squeeze(fields, ds)\n        ds = apply_group_by(time_fields, ds, agg_funcs, coords_to_squeeze)\n\n    if len(time_fields) == 0 and len(agg_funcs):\n        if isinstance(coords_to_squeeze, t.List):\n            coords_to_squeeze.append('time')\n        ds = aggregate_variables(agg_funcs, ds, time_fields, coords_to_squeeze)\n\n    return ds\n\n\ndef convert_to_dataframe(ds: xr.Dataset) -> pd.DataFrame:\n    \"\"\"\n    Convert xarray Dataset to pandas DataFrame.\n\n    Args:\n        ds (xr.Dataset): xarray Dataset to be converted.\n\n    Returns:\n        pd.DataFrame: Pandas DataFrame containing the data from the xarray Dataset.\n    \"\"\"\n    if len(ds.coords):\n 
       # If the dataset has coordinates, convert it to DataFrame and reset index\n        df = ds.to_dataframe().reset_index()\n    else:\n        # If the dataset doesn't have coordinates, compute it and convert to dictionary\n        ds = ds.compute().to_dict(data=\"list\")\n        # Create DataFrame from dictionary\n        df = pd.DataFrame({k: [v['data']] for k, v in ds['data_vars'].items()})\n\n    return df\n\n\ndef filter_records(df: pd.DataFrame, query: str) -> pd.DataFrame:\n    \"\"\"\n    Filter records in an xarray Dataset based on a given query.\n\n    Args:\n        ds (xr.Dataset): The xarray Dataset to filter.\n        query (str): The query string for filtering the dataset.\n\n    Returns:\n        pd.DataFrame: A pandas DataFrame containing the filtered records.\n    \"\"\"\n\n    # Parse the query expression\n    expr = parse_one(query)\n\n    # Find Limit, Offset and OrderBy clauses in the query\n    orderby_clause = expr.find(exp.Order)\n    limit_clause = expr.find(exp.Limit)\n    offset_clause = expr.find(exp.Offset)\n\n    # Apply orderby clause if present\n    if orderby_clause:\n        df = apply_orderby(orderby_clause, df)\n\n    # Initialize start location for slicing\n    start_loc = 0\n\n    # Apply offset clause if present\n    if offset_clause:\n        start_loc = int(offset_clause.expression.args['this'])\n        df = df.iloc[start_loc:]\n\n    # Apply limit clause if present\n    if limit_clause:\n        limit = int(limit_clause.expression.args['this'])\n        df = df.iloc[:start_loc + limit]\n\n    # Compute and return the filtered DataFrame\n    return df\n\n\ndef set_dataset_table(cmd: str) -> None:\n    \"\"\"\n    Set the mapping between a key and a dataset.\n\n    Args:\n        cmd (str): The command string in the format \".set key val\"\n            where key is the identifier and val is the dataset table.\n    \"\"\"\n    # Split the command into parts\n    cmd_parts = cmd.split(\" \")\n\n    # Check if the command 
has the correct number of arguments\n    if len(cmd_parts) == 3:\n        # Extract key and val from the command\n        _, key, val = cmd_parts\n        # Update the dataset table mapping\n        table_dataset_map[key] = val\n    else:\n        # Print an error message for incorrect arguments\n        print(\"Incorrect args. Run .help .set for usage info.\")\n\n\ndef list_key_values(input: t.Dict[str, str]) -> None:\n    \"\"\"\n    Display key-value pairs from a dictionary.\n\n    Args:\n        input (Dict[str, str]): The dictionary containing key-value pairs.\n    \"\"\"\n    for cmd, desc in input.items():\n        print(f\"{cmd}  =>  {desc}\")\n\n\ndef display_help(cmd: str) -> None:\n    \"\"\"\n    Display help information for commands.\n\n    Args:\n        cmd (str): The command string.\n    \"\"\"\n    cmd_parts = cmd.split(\" \")\n\n    if len(cmd_parts) == 2:\n        if cmd_parts[1] in command_info:\n            print(f\"{cmd_parts[1]}  =>  {command_info[cmd_parts[1]]}\")\n        else:\n            list_key_values(command_info)\n    elif len(cmd_parts) == 1:\n        list_key_values(command_info)\n    else:\n        print(\"Incorrect usage. 
Run .help or .help [cmd] for usage info.\")\n\n\ndef display_table_dataset_map(cmd: str) -> None:\n    \"\"\"\n    Display information from the table_dataset_map.\n\n    Args:\n        cmd (str): The command string.\n    \"\"\"\n    cmd_parts = cmd.split(\" \")\n\n    if len(cmd_parts) == 2:\n        if cmd_parts[1] in table_dataset_map:\n            print(f\"{cmd_parts[1]}  =>  {table_dataset_map[cmd_parts[1]]}\")\n        else:\n            list_key_values(table_dataset_map)\n    else:\n        list_key_values(table_dataset_map)\n\n\n@timing\ndef run_query(query: str) -> None:\n    \"\"\"\n    Run a query and display the result.\n\n    Args:\n        query (str): The query to be executed.\n    \"\"\"\n    try:\n        result = parse_query(query)\n    except Exception as e:\n        result = f\"ERROR: {type(e).__name__}: {e.__str__()}.\"\n        return result\n    return filter_records(convert_to_dataframe(result), query)\n\n@timing\ndef main(query: str):\n    \"\"\"\n    Main function for running this file.\n    \"\"\"\n    if \".help\" in query:\n        display_help(query)\n\n    elif \".set\" in query:\n        set_dataset_table(query)\n\n    elif \".show\" in query:\n        display_table_dataset_map(query)\n\n    else:\n        result = run_query(query)\n        print(result)\n"
  },
  {
    "path": "xql/src/xql/constant.py",
    "content": "#!/usr/bin/env python3\n# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nSUPPORTED_CUSTOM_COORDS = ['city', 'country']\n\nCOUNTRIES_BOUNDING_BOXES = {\n    'india': (6.5546079, 35.4940095078, 68.1766451354, 97.4025614766),\n    'canada': (41.6751050889, 83.23324, -140.99778, -52.6480987209),\n    'japan': (31.0295791692, 45.5514834662, 129.408463169, 145.543137242),\n    'united kingdom': (49.959999905, 58.6350001085, -7.57216793459, 1.68153079591),\n    'south africa': (-34.8191663551, -22.0913127581, 16.3449768409, 32.830120477),\n    'australia': (-44, -10, 113, 154),\n    'united states': (24.396308, 49.384358, -125.0, -66.93457)\n}\n\nCITIES_BOUNDING_BOXES = {\n    'delhi': (28.404, 28.883, 76.838, 77.348),\n    'new york': (40.4774, 40.9176, -74.2591, -73.7002),\n    'san francisco': (37.6398, 37.9298, -122.5975, -122.3210),\n    'los angeles': (33.7036, 34.3373, -118.6682, -118.1553),\n    'london': (51.3849, 51.6724, -0.3515, 0.1482)\n}\n"
  },
  {
    "path": "xql/src/xql/open.py",
    "content": "#!/usr/bin/env python3\n# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport ee\nimport logging\nimport zarr\n\nimport typing as t\nimport xarray as xr\n\nlogger = logging.getLogger(__name__)\n\nOPENER_MAP = {\n    \"zarr\": \"zarr\",\n    \"ee\": \"ee\"\n}\n\ndef get_chunking(uri: str, variables: t.List[str]) -> t.Dict:\n    \"\"\"\n    Retrieve chunking information for the specified variables in a Zarr dataset.\n\n    Parameters:\n        uri (str): The URI of the Zarr dataset.\n        variables (List[str]): A list of variable names.\n\n    Returns:\n        t.Dict: A dictionary containing chunking information for each variable.\n    \"\"\"\n\n    # Initialize dictionary to store chunking information\n    chunks = {}\n\n    # Open the Zarr dataset\n    zf = zarr.open(uri)\n\n    # Iterate over each variable\n    for v in variables:\n        # Get the variable object\n        var = zf[v]\n\n        # Get chunking info for the variable\n        var_chunks = var.chunks\n\n        # Get variable dimensions\n        var_dims = var.attrs.get('_ARRAY_DIMENSIONS')\n\n        # Map dimensions to chunk sizes\n        chunk_dict = dict(zip(var_dims, var_chunks))\n\n        # Update chunks with array chunk dimensions\n        chunks.update(chunk_dict)\n\n    # Return chunking information dictionary\n    return chunks\n\n\ndef open_dataset(uri: str) -> t.Tuple[xr.Dataset, bool]:\n    \"\"\"\n    Open a dataset from the 
given URI using the appropriate engine.\n\n    Parameters:\n    - uri (str): The URI of the dataset to open.\n\n    Returns:\n    - Tuple[xr.Dataset, bool]: The opened dataset and a flag that is True only when it was opened with the zarr engine.\n\n    Raises:\n    - RuntimeError: If unable to open the dataset.\n    \"\"\"\n    chunkable = False\n    try:\n        # Check if the URI starts with \"ee://\"\n        if uri.startswith(\"ee://\"):\n            # If yes, initialize Earth Engine\n            ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n            # Open dataset using Earth Engine engine\n            ds = xr.open_dataset(uri, engine=OPENER_MAP[\"ee\"])\n        else:\n            # If not, open dataset using zarr engine\n            ds = xr.open_zarr(uri, chunks=None)\n            chunkable = True\n    except Exception:\n        # If opening fails, raise RuntimeError\n        raise RuntimeError(\"Unable to open dataset. [zarr, ee] are the only supported dataset types.\")\n\n    return ds, chunkable\n"
  },
  {
    "path": "xql/src/xql/utils.py",
    "content": "#!/usr/bin/env python3\n# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\n\nfrom dask_kubernetes import HelmCluster\nfrom dask.distributed import Client\nfrom functools import wraps\nfrom time import gmtime, strftime, time\n\ndef timing(f):\n    \"\"\"Measure a time for any function execution.\"\"\"\n    @wraps(f)\n    def wrap(*args, **kw):\n        ts = time()\n        result = f(*args, **kw)\n        te = time()\n        print(f\"Query took: { strftime('%H:%M:%S', gmtime(te - ts)) }\")\n        return result\n    return wrap\n\n\ndef connect_dask_cluster() -> None:\n    \"\"\"\n    Connects to a Dask cluster.\n    \"\"\"\n    # Fetch the cluster name from environment variable, default to \"xql-dask\" if not set\n    cluster_name = os.getenv('DASK_CLUSTER', \"xql-dask\")\n\n    try:\n        # Create a HelmCluster instance with the specified release name\n        cluster = HelmCluster(release_name=cluster_name)\n\n        # Connect a Dask client to the cluster\n        client = Client(cluster) # noqa: F841\n\n        # Print a message indicating successful connection\n        print(\"Dask cluster connected.\")\n\n    except Exception:\n        # Print a message indicating failure to connect\n        print(\"Dask cluster not connected.\")\n"
  },
  {
    "path": "xql/src/xql/where.py",
    "content": "#!/usr/bin/env python3\n# Copyright 2024 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     https://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport numpy as np\nimport typing as t\nimport xarray as xr\n\nfrom sqlglot import exp\n\nfrom .constant import CITIES_BOUNDING_BOXES, COUNTRIES_BOUNDING_BOXES, SUPPORTED_CUSTOM_COORDS\n\ndef parse(a: t.Union[xr.DataArray, str], b: t.Union[xr.DataArray, str]) -> t.Tuple[t.Union[xr.DataArray, str],\n                                                                                   t.Union[xr.DataArray, str]]:\n    \"\"\"\n    Parse input values 'a' and 'b' into NumPy arrays with compatible types for evaluation.\n\n    Parameters:\n    - a (Union[xr.DataArray, str]): The first input value.\n    - b (Union[xr.DataArray, str]): The second input value.\n\n    Returns:\n    - Tuple[xr.DataArray, Union[np.float64, np.float32, np.datetime64]]: Parsed NumPy arrays 'a' and 'b'.\n    \"\"\"\n\n    if isinstance(a, str):\n        a, b = b, a\n    arr_type = a.dtype.name\n    if arr_type == 'float64':\n        b = np.float64(b)\n    elif arr_type == 'float32':\n        b = np.float32(b)\n    elif arr_type == 'datetime64[ns]':\n        b = np.datetime64(b)\n    return a, b\n\ndef get_sop_terms(expression: exp.Expression):\n    def cross_product(exp1: t.List[exp.Expression], exp2: t.List[exp.Expression]):\n        pdt = []\n        if(len(exp1) == 0):\n            return exp2\n        if(len(exp2) == 0):\n            return exp1\n        for item1 in exp1:\n 
           for item2 in exp2:\n                term = item1.and_(item2)\n                pdt.append(term)\n        return pdt\n    if expression.key == \"and\":\n        children = list(expression.flatten())\n        product_terms = []\n        for child in children:\n            terms = get_sop_terms(child)\n            product_terms = cross_product(product_terms, terms)\n        return product_terms\n    elif expression.key == \"or\":\n        children = list(expression.flatten())\n        sum_terms = []\n        for child in children:\n            if child.key==\"and\" or child.key==\"or\":\n                sum_terms.extend(get_sop_terms(child))\n            else:\n                sum_terms.append(child)\n        return sum_terms\n    else:\n        return [expression]\n\ndef check_conditional(expression: exp.Expression) -> bool:\n    k = expression.key\n    if k == 'lt' or k == 'gt' or k == 'gte' or k == 'lte' or k == 'eq':\n        return True\n    return False\n\ndef parse_condition(expression: exp.Expression, condition_dict: dict) -> bool:\n    # TODO: This function right now assumnes that for a condition\n    # LHS is always Column name and RHS is always value.\n    op = expression.key\n    args = expression.args\n\n    left = args.get('this', None)\n    right = args.get('expression', None)\n\n    identifier = left.args.get('this')\n    coordinate = identifier.args.get('this')\n\n    # handing negative\n    if isinstance(right, exp.Neg):\n        right = right.args.get('this')\n        value = \"-\" + right.args.get('this')\n    else:\n        value = right.args.get('this')\n\n    if coordinate not in condition_dict:\n        condition_dict[coordinate] = {}\n\n    condition_dict[coordinate][op] = value\n\ndef is_ascending_order(da: xr.DataArray) -> bool:\n    \"\"\"Simple check if a dataarray is in increasing or descreasing order.\"\"\"\n    da = da.drop_duplicates(...)\n    if(da[0].values < da[1].values):\n        return True\n    return False\n\ndef 
select_coordinate(da: xr.DataArray, coordinate: str, operator: str, value: any) -> xr.DataArray:\n    \"\"\"Based on operator and sort order (if data is in ascending or descending order)\n    It will apply the greater or lesser condition.\n    Equal condition does not depend on that.\n    \"\"\"\n    op = operator\n    da, parsed_value = parse(da, value)\n    if op == 'eq':\n        return da.sel({coordinate: parsed_value})\n    if(is_ascending_order(da)):\n        if op == 'gt' or op == 'gte':\n            return da.sel({coordinate: slice(parsed_value, None)})\n        elif op == 'lt' or op == 'lte':\n            return da.sel({coordinate: slice(None, parsed_value)})\n        else:\n            raise ValueError(f\"Unkown operator in select_coordinate op: {op}.\")\n    else:\n        if op == 'gt' or op == 'gte':\n            return da.sel({coordinate: slice(None, parsed_value)})\n        elif op == 'lt' or op == 'lte':\n            return da.sel({coordinate: slice(parsed_value, None)})\n        else:\n            raise ValueError(f\"Unkown operator in select_coordinate op: {op}.\")\n\ndef get_coords_condition(coordinate: str, condition: t.Dict[str, str]):\n    \"\"\"Generate a bounding box from the defined lat long ranges for cities / countries.\"\"\"\n    value = condition['eq'].lower()\n    bounding_box = { 'latitude': { }, 'longitude': { } }\n    if value in COUNTRIES_BOUNDING_BOXES:\n        lat_min, lat_max, lon_min, lon_max  = COUNTRIES_BOUNDING_BOXES[value]\n    elif value in CITIES_BOUNDING_BOXES:\n        lat_min, lat_max, lon_min, lon_max  = CITIES_BOUNDING_BOXES[value]\n    else:\n        raise NotImplementedError(f\"Can not query for {coordinate}:{value}\")\n    bounding_box['latitude']['gte'] = lat_min\n    bounding_box['latitude']['lte'] = lat_max\n    bounding_box['longitude']['gte'] = lon_min + 360 if lon_min < 0 else lon_min\n    bounding_box['longitude']['lte'] = lon_max + 360 if lon_max < 0 else lon_max\n\n    return bounding_box\n\n\ndef 
filter_condition_dict(condition_dict: dict, ds: xr.Dataset):\n    \"\"\"Filter out custom fields and update them with actual dataset supported coord.\n    { 'country': 'new york' } => { 'latitude': {...}, 'longitude': {...} }\n    \"\"\"\n    result = {}\n    for coordinate, conditions in condition_dict.items():\n        if coordinate in ds.coords:\n            result[coordinate] = conditions\n        elif coordinate in SUPPORTED_CUSTOM_COORDS:\n            simple_coords_dict = get_coords_condition(coordinate, conditions)\n            result.update(simple_coords_dict)\n        else:\n            raise NotImplementedError(f\"Dataset can not be queried over {coordinate} field\")\n    return result\n\ndef apply_select_condition(ds: xr.Dataset, condition_dict: dict) -> xr.Dataset:\n    \"\"\"A condition dict will be in form\n    {   'coord1': {'lt': val1, 'gt': val2},\n        'coord2: {'eq': someval},\n    ...}\n    This function applies above conditions on the dataset.\n    \"\"\"\n    absolute_condition_dict = {}\n    condition_dict = filter_condition_dict(condition_dict, ds)\n    for coordinate, conditions in condition_dict.items():\n        coordinate_array = ds[coordinate]\n\n        # Iterate each operation and apply on a data array.\n        for operator, value in conditions.items():\n            coordinate_array = select_coordinate(coordinate_array, coordinate,  operator, value)\n\n        coordinate_values = coordinate_array.values\n\n        # If the condition mentions lt(less than) or gt(greater than),\n        # then remove the last or first element based on below conditions.\n        if 'gt' in conditions:\n            if is_ascending_order(coordinate_array):\n                coordinate_values = np.delete(coordinate_values, 0)\n            else:\n                coordinate_values = np.delete(coordinate_values, -1)\n        if 'lt' in conditions:\n            if is_ascending_order(coordinate_array):\n                coordinate_values = 
np.delete(coordinate_values, -1)\n            else:\n                coordinate_values = np.delete(coordinate_values, 0)\n\n        absolute_condition_dict[coordinate] =  coordinate_values\n\n    # Finally perform the select operation.\n    return ds.sel(absolute_condition_dict)\n\ndef postorder(expression: exp.Expression, condition_dict: dict):\n    \"\"\"Performs post order traversal on sqlglot expression and converts it into a dict.\n    The dict is updated in place.\n    \"\"\"\n    if expression is None:\n        return\n    if expression.key == \"literal\" or expression.key == \"identifier\" or expression.key == \"column\":\n        return\n\n    args = expression.args\n\n    left = args.get('this', None)\n    right = args.get('expression', None)\n\n    postorder(left, condition_dict)\n    postorder(right, condition_dict)\n\n    if(check_conditional(expression)):\n        parse_condition(expression, condition_dict)\n\ndef apply_where(ds: xr.Dataset, expression: exp.Expression) -> xr.Dataset:\n    terms = get_sop_terms(expression)\n    or_ds = []\n    for term in terms:\n        condition_dict = {}\n        postorder(term, condition_dict)\n        reduced_ds = apply_select_condition(ds, condition_dict)\n        or_ds.append(reduced_ds)\n    # TODO: Add support for OR operation.\n    return or_ds[0]\n"
  }
]