[
  {
    "path": ".coveragerc",
    "content": "[report]\nomit =\n    luigi/mrrunner.py\n    test/_test_time_generated_module*.py\n    */python?.?/*\n    */site-packages/nose/*\n    *__init__*\n    *test/*\n    */.tox/*\n    */setup.py\n    */bin/luigidc\n    hadoop_test.py\n    minicluster.py\n\n[run]\nparallel=True\nconcurrency=multiprocessing\n"
  },
  {
    "path": ".github/CODEOWNERS",
    "content": "# The following patterns are used to auto-assign review requests\n# to specific individuals. Order is important; the last matching\n# pattern takes the most precedence.\n\n# These owners will be the default owners for everything in\n# the repo. Unless a later match takes precedence,\n* @dlstadther @spotify/dataex\n\n# Specific files, directories, paths, or file types can be\n# assigned more specifically.\ncontrib/redshift*.py @dlstadther\n\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE.md",
    "content": "<!---\nWe use GitHub issues mainly for tracking bugs and feature requests.\nQuestions about how to use Luigi can be sent to the mailing list.\n\nCurrently, there are no strict procedures or guidelines for submitting issues.\nIn short, please just use common sense.\n\nAt a bare minimum, common sense includes:\n\n * Search for similar, previously posted issues before creating a new one.\n * Use markdown to format all code/logs. Issues which are hard to read\n   when rendered on GitHub might be closed with a friendly reminder of this.\n * If applicable, read the relevant parts of the documentation.\n\nAlso, add steps to reproduce the bug, if applicable. Sample code would be nice too :)\n\nFor more information on how to submit valuable contributions,\nsee https://opensource.guide/how-to-contribute/#how-to-submit-a-contribution\n-->\n"
  },
  {
    "path": ".github/PULL_REQUEST_TEMPLATE.md",
    "content": "<!--- This template is optional. Please use it as a starting point to help guide PRs -->\n\n<!--- Provide a general summary of your changes in the Title above -->\n\n## Description\n<!--- Describe your changes -->\n\n## Motivation and Context\n<!--- Why is this change required? What problem does it solve? -->\n<!--- If it fixes an open issue, please link to the issue here. -->\n\n## Have you tested this? If so, how?\n<!--- Valid responses are \"I have included unit tests.\" or --> \n<!--- \"I ran my jobs with this code and it works for me.\" -->\n\n<!---\nfor more information on how to submit valuable contributions,\nsee https://opensource.guide/how-to-contribute/#how-to-submit-a-contribution\n-->\n"
  },
  {
    "path": ".github/stale.yml",
    "content": "# Number of days of inactivity before an issue becomes stale\ndaysUntilStale: 120\n# Number of days of inactivity before a stale issue is closed\ndaysUntilClose: 14\n# Issues with these labels will never be considered stale\nexemptLabels:\n  - pinned\n  - security\n# Label to use when marking an issue as stale\nstaleLabel: wontfix\n# Comment to post when marking an issue as stale. Set to `false` to disable\nmarkComment: >\n  This issue has been automatically marked as stale because it has not had\n  recent activity. It will be closed if no further activity occurs.\n  If closed, you may revisit when your time allows and reopen!\n  Thank you for your contributions.\n# Comment to post when closing a stale issue. Set to `false` to disable\ncloseComment: false\n# Limit to only `issues` or `pulls`\n# only: issues\n"
  },
  {
    "path": ".github/workflows/codeql.yml",
    "content": "name: \"CodeQL\"\n\non:\n  push:\n    branches: [ 'master' ]\n  pull_request:\n    # The branches below must be a subset of the branches above\n    branches: [ 'master' ]\n  schedule:\n    - cron: '29 18 * * 0'\n\njobs:\n  analyze:\n    name: Analyze\n    runs-on: ubuntu-latest\n    permissions:\n      actions: read\n      contents: read\n      security-events: write\n\n    strategy:\n      fail-fast: false\n      matrix:\n        language: [ 'python', 'javascript' ]\n        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]\n        # Use only 'java' to analyze code written in Java, Kotlin or both\n        # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both\n        # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support\n\n    steps:\n    - name: Checkout repository\n      uses: actions/checkout@v4\n\n    # Initializes the CodeQL tools for scanning.\n    - name: Initialize CodeQL\n      uses: github/codeql-action/init@v2\n      with:\n        languages: ${{ matrix.language }}\n        # If you wish to specify custom queries, you can do so here or in a config file.\n        # By default, queries listed here will override any specified in a config file.\n        # Prefix the list here with \"+\" to use these queries and those in the config file.\n\n        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs\n        # queries: security-extended,security-and-quality\n\n\n    # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).\n    # If this step fails, then you should remove it and run the build manually (see below)\n    - name: Autobuild\n      uses: github/codeql-action/autobuild@v2\n\n    # ℹ️ Command-line programs to run using the OS shell.\n    # 
📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun\n\n    #   If the Autobuild fails above, remove it and uncomment the following three lines.\n    #   modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.\n\n    # - run: |\n    #     echo \"Run, Build Application using script\"\n    #     ./location_of_script_within_repo/buildscript.sh\n\n    - name: Perform CodeQL Analysis\n      uses: github/codeql-action/analyze@v2\n      with:\n        category: \"/language:${{matrix.language}}\"\n"
  },
  {
    "path": ".github/workflows/pythonbuild.yml",
    "content": "name: Build\n\non:\n  push:\n    branches:\n      - master\n  pull_request:\n\njobs:\n  core:\n    runs-on: ubuntu-22.04\n\n    strategy:\n      matrix:\n        include:\n          - tox-env: py310-core\n          - tox-env: py311-core\n          - tox-env: py312-core\n          - tox-env: py313-core\n\n    steps:\n      - uses: actions/checkout@v6\n      - name: Set up the latest version of uv\n        uses: astral-sh/setup-uv@v7\n        with:\n          enable-cache: true\n          cache-dependency-glob: \"pyproject.toml\"\n      - name: Install dependencies\n        run: |\n          uv tool install --python-preference only-managed --python 3.12 tox --with tox-uv\n      - name: Build\n        env:\n          TOXENV: ${{ matrix.tox-env }}\n        run: uvx --with tox-uv tox run\n      - name: Upload coverage to Codecov\n        uses: codecov/codecov-action@v5\n        with:\n          fail_ci_if_error: true\n          verbose: true\n        env:\n          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}\n\n  mysql:\n    runs-on: ubuntu-22.04\n\n    strategy:\n      matrix:\n        include:\n          - tox-env: py310-mysql\n          - tox-env: py311-mysql\n          - tox-env: py312-mysql\n          - tox-env: py313-mysql\n\n    steps:\n      - uses: actions/checkout@v6\n      - name: Set up the latest version of uv\n        uses: astral-sh/setup-uv@v7\n        with:\n          enable-cache: true\n          cache-dependency-glob: \"pyproject.toml\"\n      - name: Install dependencies\n        run: |\n          uv tool install --python-preference only-managed --python 3.12 tox --with tox-uv\n      - name: Setup MySQL DB\n        run: |\n          sudo /etc/init.d/mysql start\n          mysql -e 'create database IF NOT EXISTS luigi_test;' -uroot -proot || true\n          mysql -e 'create user 'travis'@'localhost';' -uroot -proot || true\n          mysql -e 'grant all privileges ON *.* TO 'travis'@'localhost';' -uroot -proot || true\n      - name: 
Build\n        env:\n          TOXENV: ${{ matrix.tox-env }}\n        run: uvx --with tox-uv tox run\n      - name: Upload coverage to Codecov\n        uses: codecov/codecov-action@v5\n        with:\n          fail_ci_if_error: true\n          verbose: true\n        env:\n          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}\n\n  postgres:\n    runs-on: ubuntu-22.04\n    services:\n      postgres:\n        image: postgres\n        env:\n          POSTGRES_USER: postgres\n          POSTGRES_PASSWORD: postgres\n          POSTGRES_DB: postgres\n        ports:\n        - 5432:5432\n        # Set health checks to wait until postgres has started\n        options: >-\n          --health-cmd pg_isready\n          --health-interval 10s\n          --health-timeout 5s\n          --health-retries 5\n\n    strategy:\n      matrix:\n        include:\n          - tox-env: py310-postgres\n          - tox-env: py311-postgres\n          - tox-env: py312-postgres\n          - tox-env: py313-postgres\n\n    steps:\n      - uses: actions/checkout@v6\n      - name: Set up the latest version of uv\n        uses: astral-sh/setup-uv@v7\n        with:\n          enable-cache: true\n          cache-dependency-glob: \"pyproject.toml\"\n      - name: Install dependencies\n        run: |\n          uv tool install --python-preference only-managed --python 3.12 tox --with tox-uv\n      - name: Create PSQL database\n        run: |\n          PGPASSWORD=postgres psql -h localhost -p 5432 -c 'create database spotify;' -U postgres\n      - name: Build\n        env:\n          TOXENV: ${{ matrix.tox-env }}\n        run: uvx --with tox-uv tox run\n      - name: Upload coverage to Codecov\n        uses: codecov/codecov-action@v5\n        with:\n          fail_ci_if_error: true\n          verbose: true\n        env:\n          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}\n\n  base:\n    runs-on: ubuntu-22.04\n    env:\n      AWS_DEFAULT_REGION: us-east-1\n      AWS_ACCESS_KEY_ID: accesskey\n      
AWS_SECRET_ACCESS_KEY: secretkey\n\n    strategy:\n      matrix:\n        include:\n          - tox-env: py310-aws\n          - tox-env: py311-aws\n          - tox-env: py312-aws\n          - tox-env: py313-aws\n\n          - tox-env: py310-unixsocket\n            OVERRIDE_SKIP_CI_TESTS: True\n          - tox-env: py311-unixsocket\n            OVERRIDE_SKIP_CI_TESTS: True\n          - tox-env: py312-unixsocket\n            OVERRIDE_SKIP_CI_TESTS: True\n          - tox-env: py313-unixsocket\n            OVERRIDE_SKIP_CI_TESTS: True\n\n          - tox-env: py310-apache\n          - tox-env: py311-apache\n          - tox-env: py312-apache\n          - tox-env: py313-apache\n\n          - tox-env: py310-azureblob\n          - tox-env: py311-azureblob\n          - tox-env: py312-azureblob\n          - tox-env: py313-azureblob\n\n          - tox-env: py310-contrib\n          - tox-env: py311-contrib\n          - tox-env: py312-contrib\n          - tox-env: py313-contrib\n\n    steps:\n      - uses: actions/checkout@v6\n      - name: Set up the latest version of uv\n        uses: astral-sh/setup-uv@v7\n        with:\n          enable-cache: true\n          cache-dependency-glob: \"pyproject.toml\"\n      - name: Install dependencies\n        run: |\n          uv tool install --python-preference only-managed --python 3.12 tox --with tox-uv\n      - name: Build\n        env:\n          TOXENV: ${{ matrix.tox-env }}\n          OVERRIDE_SKIP_CI_TESTS: ${{ matrix.OVERRIDE_SKIP_CI_TESTS }}\n        run: uvx --with tox-uv tox run\n      - name: Upload coverage to Codecov\n        uses: codecov/codecov-action@v5\n        with:\n          fail_ci_if_error: true\n          verbose: true\n        env:\n          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}\n\n  others:\n    runs-on: ubuntu-22.04\n\n    strategy:\n      matrix:\n        include:\n          - tox-env: lint\n          - tox-env: docs\n          - tox-env: typecheck\n    steps:\n      - uses: actions/checkout@v6\n      - 
name: Set up the latest version of uv\n        uses: astral-sh/setup-uv@v7\n        with:\n          enable-cache: true\n          cache-dependency-glob: \"pyproject.toml\"\n      - name: Install dependencies\n        run: |\n          uv tool install --python-preference only-managed --python 3.12 tox --with tox-uv\n      - name: Build\n        env:\n          TOXENV: ${{ matrix.tox-env }}\n          OVERRIDE_SKIP_CI_TESTS: ${{ matrix.OVERRIDE_SKIP_CI_TESTS }}\n        run: uvx --with tox-uv tox run\n"
  },
  {
    "path": ".gitignore",
    "content": ".coverage.*\ndoc/api/*.rst\ntest/gcloud-credentials.json\n.hypothesis/\n\n.nicesetup\n\nclient.cfg\nluigi.cfg\n\nhadoop_test.py\nminicluster.py\nmrrunner.py\npig_property_file\n\npackages.tar\n\n# Ignore the data files\ndata\ntest/data\nexamples/data\n\nVagrantfile\n\n*.pickle\n*.rej\n*.orig\n\n# Created by https://www.gitignore.io\n\n### Python ###\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nenv/\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n# NOTE : lib/ prevents inclusion of static/visualiser/lib\n#lib/\nlib64/\nparts/\nsdist/\nvar/\n*.egg-info/\n.installed.cfg\n*.egg\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\nmy_dir\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\n\n# Sphinx documentation\ndoc/_build/\n\n# PyBuilder\ntarget/\n\n\n### Vim ###\n[._]*.s[a-w][a-z]\n[._]s[a-w][a-z]\n*.un~\nSession.vim\n.netrwhist\n*~\n\n\n### PyCharm ###\n# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm\n\n*.iml\n\n## Directory-based project format:\n.idea/\n# if you remove the above rule, at least ignore the following:\n\n# User-specific stuff:\n# .idea/workspace.xml\n# .idea/tasks.xml\n# .idea/dictionaries\n\n# Sensitive or high-churn files:\n# .idea/dataSources.ids\n# .idea/dataSources.xml\n# .idea/sqlDataSources.xml\n# .idea/dynamic.xml\n# .idea/uiDesigner.xml\n\n# Gradle:\n# .idea/gradle.xml\n# .idea/libraries\n\n# Mongo Explorer plugin:\n# .idea/mongoSettings.xml\n\n## File-based project format:\n*.ipr\n*.iws\n\n## Plugin-specific files:\n\n# IntelliJ\nout/\n\n# mpeltonen/sbt-idea plugin\n.idea_modules/\n\n# JIRA 
plugin\natlassian-ide-plugin.xml\n\n# Crashlytics plugin (for Android Studio and IntelliJ)\ncom_crashlytics_export_strings.xml\ncrashlytics.properties\ncrashlytics-build.properties\n\n\n### Vagrant ###\n.vagrant/\n\n\n### OSX ###\n.DS_Store\n.AppleDouble\n.LSOverride\n\n# Icon must end with two \\r\nIcon\n\n\n# Thumbnails\n._*\n\n# Files that might appear on external disk\n.Spotlight-V100\n.Trashes\n\n# Directories potentially created on remote AFP share\n.AppleDB\n.AppleDesktop\nNetwork Trash Folder\nTemporary Items\n.apdisk\n\n.python-version\n"
  },
  {
    "path": ".readthedocs.yaml",
    "content": "version: 2\n\nbuild:\n  os: ubuntu-24.04\n  tools:\n    python: \"3.13\"\n  jobs:\n    pre_create_environment:\n      - asdf plugin add uv\n      - asdf install uv latest\n      - asdf global uv latest\n    create_environment:\n      - uv venv \"${READTHEDOCS_VIRTUALENV_PATH}\"\n    install:\n      - UV_PROJECT_ENVIRONMENT=\"${READTHEDOCS_VIRTUALENV_PATH}\" uv sync --frozen --group docs\n\nsphinx:\n  configuration: doc/conf.py\n\nformats:\n  - pdf\n  - epub\n"
  },
  {
    "path": "CONTRIBUTING.rst",
    "content": "Code of conduct\n---------------\n\nThis project adheres to the `Open Code of Conduct \n<https://github.com/spotify/code-of-conduct/blob/master/code-of-conduct.md>`_.  By \nparticipating, you are expected to honor this code.\n\nRunning the tests\n-----------------\n\n\nWe are always happy to receive Pull Requests. When you open a PR, it will\nautomatically build on GitHub Actions. So you're not strictly required to test the\npatch locally before submitting it.\n\nIf you do want to run the tests locally, you'll need to run the commands below:\n\n.. code:: bash\n\n   curl -LsSf https://astral.sh/uv/install.sh | sh\n   uv tool install tox --with tox-uv\n\nYou will need a ``tox --version`` of at least 4.22.\n\n.. code:: bash\n\n    # These commands are pretty fast and will tell if you've\n    # broken something major:\n    tox run -e flake8\n    tox run -e py38-core\n\n    # You can also test particular files for even faster iterations\n    tox run -e py38-core -- test/rpc_test.py\n\n    # The visualiser tests require phantomjs to be installed on your path\n    tox run -e visualiser\n\n    # And some of the others involve downloading and running Hadoop:\n    tox run -e py38-cdh\n    tox run -e py39-hdp\n\nWhere ``flake8`` is the lint checking, ``py38`` is obviously Python 3.8.\n``core`` are tests that do not require external components and ``cdh`` and\n``hdp`` are two different hadoop distributions. For most local development it's\nusually enough to run the lint checking and a python version for ``core``\nand let GitHub Actions run the whole matrix.\n\nFor ``cdh`` and ``hdp``, tox will download the hadoop distribution for you. You do,\nhowever, need to have Java installed and the ``JAVA_HOME`` environment variable\nset.\n\nFor more details, check out the ``.github/workflows/pythonbuild.yml`` and ``tox.ini`` files.\n\nWriting documentation\n=====================\n\nAll documentation for Luigi is written in `reStructuredText/Sphinx markup\n<http://sphinx-doc.org/domains.html#the-python-domain>`_ and lives both in the\ncode as docstrings and in ``.rst`` files. Pull requests should come with documentation\nwhen appropriate.\n\nYou can verify that your documentation code compiles by running\n\n.. code:: bash\n\n    tox run -e docs\n\nAfter that, you can check how it renders locally with your browser\n\n.. code:: bash\n\n    firefox doc/_build/html/index.html\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright 2012-2021 Spotify AB\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "README.rst",
    "content": ".. figure:: https://raw.githubusercontent.com/spotify/luigi/master/doc/luigi.png\n   :alt: Luigi Logo\n   :align: center\n\n.. image:: https://img.shields.io/endpoint.svg?url=https%3A%2F%2Factions-badge.atrox.dev%2Fspotify%2Fluigi%2Fbadge&label=build&logo=none&%3Fref%3Dmaster&style=flat\n    :target: https://actions-badge.atrox.dev/spotify/luigi/goto?ref=master\n\n.. image:: https://img.shields.io/codecov/c/github/spotify/luigi/master.svg?style=flat\n    :target: https://codecov.io/gh/spotify/luigi?branch=master\n\n.. image:: https://img.shields.io/pypi/v/luigi.svg?style=flat\n   :target: https://pypi.python.org/pypi/luigi\n\n.. image:: https://img.shields.io/pypi/l/luigi.svg?style=flat\n   :target: https://pypi.python.org/pypi/luigi\n\n.. image:: https://readthedocs.org/projects/luigi/badge/?version=stable\n    :target: https://luigi.readthedocs.io/en/stable/?badge=stable\n    :alt: Documentation Status\n\nLuigi is a Python (3.10, 3.11, 3.12, 3.13 tested) package that helps you build complex\npipelines of batch jobs. It handles dependency resolution, workflow management,\nvisualization, handling failures, command line integration, and much more.\n\nGetting Started\n---------------\n\nRun ``pip install luigi`` to install the latest stable version from `PyPI\n<https://pypi.python.org/pypi/luigi>`_. `Documentation for the latest release\n<https://luigi.readthedocs.io/en/stable/>`__ is hosted on readthedocs.\n\nRun ``pip install luigi[toml]`` to install Luigi with `TOML-based configs\n<https://luigi.readthedocs.io/en/stable/configuration.html>`__ support.\n\nFor the bleeding edge code, ``pip install\ngit+https://github.com/spotify/luigi.git``. `Bleeding edge documentation\n<https://luigi.readthedocs.io/en/latest/>`__ is also available.\n\nBackground\n----------\n\nThe purpose of Luigi is to address all the plumbing typically associated\nwith long-running batch processes. You want to chain many tasks,\nautomate them, and failures *will* happen. 
These tasks can be anything,\nbut are typically long running things like\n`Hadoop <http://hadoop.apache.org/>`_ jobs, dumping data to/from\ndatabases, running machine learning algorithms, or anything else.\n\nThere are other software packages that focus on lower level aspects of\ndata processing, like `Hive <http://hive.apache.org/>`__,\n`Pig <http://pig.apache.org/>`_, or\n`Cascading <http://www.cascading.org/>`_. Luigi is not a framework to\nreplace these. Instead it helps you stitch many tasks together, where\neach task can be a `Hive query <https://luigi.readthedocs.io/en/latest/api/luigi.contrib.hive.html>`__,\na `Hadoop job in Java <https://luigi.readthedocs.io/en/latest/api/luigi.contrib.hadoop_jar.html>`_,\na  `Spark job in Scala or Python <https://luigi.readthedocs.io/en/latest/api/luigi.contrib.spark.html>`_,\na Python snippet,\n`dumping a table <https://luigi.readthedocs.io/en/latest/api/luigi.contrib.sqla.html>`_\nfrom a database, or anything else. It's easy to build up\nlong-running pipelines that comprise thousands of tasks and take days or\nweeks to complete. Luigi takes care of a lot of the workflow management\nso that you can focus on the tasks themselves and their dependencies.\n\nYou can build pretty much any task you want, but Luigi also comes with a\n*toolbox* of several common task templates that you use. It includes\nsupport for running\n`Python mapreduce jobs <https://luigi.readthedocs.io/en/latest/api/luigi.contrib.hadoop.html>`_\nin Hadoop, as well as\n`Hive <https://luigi.readthedocs.io/en/latest/api/luigi.contrib.hive.html>`__,\nand `Pig <https://luigi.readthedocs.io/en/latest/api/luigi.contrib.pig.html>`__,\njobs. It also comes with\n`file system abstractions for HDFS <https://luigi.readthedocs.io/en/latest/api/luigi.contrib.hdfs.html>`_,\nand local files that ensures all file system operations are atomic. 
This\nis important because it means your data pipeline will not crash in a\nstate containing partial data.\n\nVisualiser page\n---------------\n\nThe Luigi server comes with a web interface too, so you can search and filter\namong all your tasks.\n\n.. figure:: https://raw.githubusercontent.com/spotify/luigi/master/doc/visualiser_front_page.png\n   :alt: Visualiser page\n\nDependency graph example\n------------------------\n\nJust to give you an idea of what Luigi does, this is a screen shot from\nsomething we are running in production. Using Luigi's visualiser, we get\na nice visual overview of the dependency graph of the workflow. Each\nnode represents a task which has to be run. Green tasks are already\ncompleted whereas yellow tasks are yet to be run. Most of these tasks\nare Hadoop jobs, but there are also some things that run locally and\nbuild up data files.\n\n.. figure:: https://raw.githubusercontent.com/spotify/luigi/master/doc/user_recs.png\n   :alt: Dependency graph\n\nPhilosophy\n----------\n\nConceptually, Luigi is similar to `GNU\nMake <http://www.gnu.org/software/make/>`_ where you have certain tasks\nand these tasks in turn may have dependencies on other tasks. There are\nalso some similarities to `Oozie <http://oozie.apache.org/>`_\nand `Azkaban <https://azkaban.github.io/>`_. One major\ndifference is that Luigi is not just built specifically for Hadoop, and\nit's easy to extend it with other kinds of tasks.\n\nEverything in Luigi is in Python. Instead of XML configuration or\nsimilar external data files, the dependency graph is specified *within\nPython*. This makes it easy to build up complex dependency graphs of\ntasks, where the dependencies can involve date algebra or recursive\nreferences to other versions of the same task. 
However, the workflow can\ntrigger things not in Python, such as running\n`Pig scripts <https://luigi.readthedocs.io/en/latest/api/luigi.contrib.pig.html>`_\nor `scp'ing files <https://luigi.readthedocs.io/en/latest/api/luigi.contrib.ssh.html>`_.\n\nWho uses Luigi?\n---------------\n\nWe use Luigi internally at `Spotify <https://www.spotify.com>`_ to run\nthousands of tasks every day, organized in complex dependency graphs.\nMost of these tasks are Hadoop jobs. Luigi provides an infrastructure\nthat powers all kinds of stuff including recommendations, toplists, A/B\ntest analysis, external reports, internal dashboards, etc.\n\nSince Luigi is open source and without any registration walls, the exact number\nof Luigi users is unknown. But based on the number of unique contributors, we\nexpect hundreds of enterprises to use it. Some users have written blog posts\nor held presentations about Luigi:\n\n* `Spotify <https://www.spotify.com>`_ `(presentation, 2014) <http://www.slideshare.net/erikbern/luigi-presentation-nyc-data-science>`__\n* `Foursquare <https://foursquare.com/>`_ `(presentation, 2013) <http://www.slideshare.net/OpenAnayticsMeetup/luigi-presentation-17-23199897>`__\n* `Mortar Data (Datadog) <https://www.datadoghq.com/>`_ `(documentation / tutorial) <http://help.mortardata.com/technologies/luigi>`__\n* `Stripe <https://stripe.com/>`_ `(presentation, 2014) <http://www.slideshare.net/PyData/python-as-part-of-a-production-machine-learning-stack-by-michael-manapat-pydata-sv-2014>`__\n* `Buffer <https://buffer.com/>`_ `(blog, 2014) <https://buffer.com/resources/buffers-new-data-architecture/>`__\n* `SeatGeek <https://seatgeek.com/>`_ `(blog, 2015) <http://chairnerd.seatgeek.com/building-out-the-seatgeek-data-pipeline/>`__\n* `Treasure Data <https://www.treasuredata.com/>`_ `(blog, 2015) <http://blog.treasuredata.com/blog/2015/02/25/managing-the-data-pipeline-with-git-luigi/>`__\n* `Growth Intelligence <http://growthintel.com/>`_ `(presentation, 2015) 
<http://www.slideshare.net/growthintel/a-beginners-guide-to-building-data-pipelines-with-luigi>`__\n* `AdRoll <https://www.adroll.com/>`_ `(blog, 2015) <http://tech.adroll.com/blog/data/2015/09/22/data-pipelines-docker.html>`__\n* 17zuoye `(presentation, 2015) <https://speakerdeck.com/mvj3/luiti-an-offline-task-management-framework>`__\n* `Custobar <https://www.custobar.com/>`_ `(presentation, 2016) <http://www.slideshare.net/teemukurppa/managing-data-workflows-with-luigi>`__\n* `Blendle <https://launch.blendle.com/>`_ `(presentation) <http://www.anneschuth.nl/wp-content/uploads/sea-anneschuth-streamingblendle.pdf#page=126>`__\n* `TrustYou <http://www.trustyou.com/>`_ `(presentation, 2015) <https://speakerdeck.com/mfcabrera/pydata-berlin-2015-processing-hotel-reviews-with-python>`__\n* `Groupon <https://www.groupon.com/>`_ / `OrderUp <https://orderup.com>`_ `(alternative implementation) <https://github.com/groupon/luigi-warehouse>`__\n* `Red Hat - Marketing Operations <https://www.redhat.com>`_ `(blog, 2017) <https://github.com/rh-marketingops/rh-mo-scc-luigi>`__\n* `GetNinjas <https://www.getninjas.com.br/>`_ `(blog, 2017) <https://labs.getninjas.com.br/using-luigi-to-create-and-monitor-pipelines-of-batch-jobs-eb8b3cd2a574>`__\n* `voyages-sncf.com <https://www.voyages-sncf.com/>`_ `(presentation, 2017) <https://github.com/voyages-sncf-technologies/meetup-afpy-nantes-luigi>`__\n* `Open Targets <https://www.opentargets.org/>`_ `(blog, 2017) <https://blog.opentargets.org/using-containers-with-luigi>`__\n* `Leipzig University Library <https://ub.uni-leipzig.de>`_ `(presentation, 2016) <https://de.slideshare.net/MartinCzygan/build-your-own-discovery-index-of-scholary-eresources>`__ / `(project) <https://finc.info/de/datenquellen>`__\n* `Synetiq <https://synetiq.net/>`_ `(presentation, 2017) <https://www.youtube.com/watch?v=M4xUQXogSfo>`__\n* `Glossier <https://www.glossier.com/>`_ `(blog, 2018) 
<https://medium.com/glossier/how-to-build-a-data-warehouse-what-weve-learned-so-far-at-glossier-6ff1e1783e31>`__\n* `Data Revenue <https://www.datarevenue.com/>`_ `(blog, 2018) <https://www.datarevenue.com/en/blog/how-to-scale-your-machine-learning-pipeline>`_\n* `Uppsala University <http://pharmb.io>`_ `(tutorial) <http://uppnex.se/twiki/do/view/Courses/EinfraMPS2015/Luigi.html>`_   / `(presentation, 2015) <https://www.youtube.com/watch?v=f26PqSXZdWM>`_ / `(slides, 2015) <https://www.slideshare.net/SamuelLampa/building-workflows-with-spotifys-luigi>`_ / `(poster, 2015) <https://pharmb.io/poster/2015-sciluigi/>`_ / `(paper, 2016) <https://doi.org/10.1186/s13321-016-0179-6>`_ / `(project) <https://github.com/pharmbio/sciluigi>`_\n* `GIPHY <https://giphy.com/>`_ `(blog, 2019) <https://engineering.giphy.com/luigi-the-10x-plumber-containerizing-scaling-luigi-in-kubernetes/>`__\n* `xtream <https://xtreamers.io/>`__ `(blog, 2019) <https://towardsdatascience.com/lessons-from-a-real-machine-learning-project-part-1-from-jupyter-to-luigi-bdfd0b050ca5>`__\n* `CIAN <https://cian.ru/>`__ `(presentation, 2019) <https://www.highload.ru/moscow/2019/abstracts/6030>`__\n\nSome more companies are using Luigi but haven't had a chance yet to write about it:\n\n* `Schibsted <http://www.schibsted.com/>`_\n* `enbrite.ly <http://enbrite.ly/>`_\n* `Dow Jones / The Wall Street Journal <http://wsj.com>`_\n* `Hotels.com <https://hotels.com>`_\n* `Newsela <https://newsela.com>`_\n* `Squarespace <https://www.squarespace.com/>`_\n* `OAO <https://adops.com/>`_\n* `Grovo <https://grovo.com/>`_\n* `Weebly <https://www.weebly.com/>`_\n* `Deloitte <https://www.Deloitte.co.uk/>`_\n* `Stacktome <https://stacktome.com/>`_\n* `LINX+Neemu+Chaordic <https://www.chaordic.com.br/>`_\n* `Foxberry <https://www.foxberry.com/>`_\n* `Okko <https://okko.tv/>`_\n* `ISVWorld <http://isvworld.com/>`_\n* `Big Data <https://bigdata.com.br/>`_\n* `Movio <https://movio.co.nz/>`_\n* `Bonnier News 
<https://www.bonniernews.se/>`_\n* `Starsky Robotics <https://www.starsky.io/>`_\n* `BaseTIS <https://www.basetis.com/>`_\n* `Hopper <https://www.hopper.com/>`_\n* `VOYAGE GROUP/Zucks <https://zucks.co.jp/en/>`_\n* `Textpert <https://www.textpert.ai/>`_\n* `Tracktics <https://www.tracktics.com/>`_\n* `Whizar <https://www.whizar.com/>`_\n* `xtream <https://www.xtreamers.io/>`__\n* `Skyscanner <https://www.skyscanner.net/>`_\n* `Jodel <https://www.jodel.com/>`_\n* `Mekar <https://mekar.id/en/>`_\n* `M3 <https://corporate.m3.com/en/>`_\n* `Assist Digital <https://www.assistdigital.com/>`_\n* `Meltwater <https://www.meltwater.com/>`_\n* `DevSamurai <https://www.devsamurai.com/>`_\n* `Veridas <https://veridas.com/>`_\n* `Aidentified <https://www.aidentified.com/>`_\n\nWe're more than happy to have your company added here. Just send a PR on GitHub.\n\nExternal links\n--------------\n\n* `Mailing List <https://groups.google.com/d/forum/luigi-user/>`_ for discussions and asking questions. (Google Groups)\n* `Releases <https://pypi.python.org/pypi/luigi>`_ (PyPI)\n* `Source code <https://github.com/spotify/luigi>`_ (GitHub)\n* `Hubot Integration <https://github.com/houzz/hubot-luigi>`_ plugin for Slack, Hipchat, etc (GitHub)\n\nAuthors\n-------\n\nLuigi was built at `Spotify <https://www.spotify.com>`_, mainly by\n`Erik Bernhardsson <https://github.com/erikbern>`_ and\n`Elias Freider <https://github.com/freider>`_.\n`Many other people <https://github.com/spotify/luigi/graphs/contributors>`_\nhave contributed since open sourcing in late 2012.\n`Arash Rouhani <https://github.com/tarrasch>`_ was the chief maintainer from 2015 to 2019, and now\nSpotify's Data Team maintains Luigi.\n"
  },
  {
    "path": "RELEASE-PROCESS.rst",
    "content": "For maintainers of Luigi, who have push access to pypi. Here's how you upload\nLuigi to pypi.\n\n#. Make sure [uv](https://github.com/astral-sh/uv) is installed ``curl -LsSf https://astral.sh/uv/install.sh | sh``.\n#. Update version number in `luigi/__version__.py`.\n#. Commit, perhaps simply with a commit message like ``Version x.y.z``.\n#. Push to GitHub at [spotify/luigi](https://github.com/spotify/luigi).\n#. Clean up previous distributions by executing ``rm -rf dist``.\n#. Build a source distribution by executing ``uv build``.\n#. Set pypi token on environment variable ``export UV_PUBLISH_TOKEN=\"LUIGI_PYPI_TOKEN_HERE\"``.\n#. Upload to pypi by executing ``uv publish``.\n#. Add a tag on github (https://github.com/spotify/luigi/releases),\n   including a handwritten changelog, possibly inspired from previous notes.\n\nCurrently, Luigi is not released on any particular schedule and it is not\nstrictly abiding semantic versioning. Whenever possible, bump major version when you make incompatible API changes, minor version when you add functionality in a backwards compatible manner, and patch version when you make backwards compatible bug fixes.\n"
  },
  {
    "path": "SECURITY.md",
    "content": "# Security Policy\n\n## Reporting a Vulnerability\n\nPlease report sensitive security issues via Spotify's [bug-bounty program](https://hackerone.com/spotify) by following this [instruction](https://docs.hackerone.com/programs/security-page.html), rather than GitHub. \n"
  },
  {
    "path": "bin/luigi",
    "content": "#!/usr/bin/env python\n\nimport sys\nimport warnings\nimport luigi.cmdline\n\n\ndef main(argv):\n    warnings.warn(\"'bin/luigi' has moved to console script 'luigi'\", DeprecationWarning)\n    luigi.cmdline.luigi_run(argv)\n\n\nif __name__ == '__main__':\n    main(sys.argv[1:])\n"
  },
  {
    "path": "bin/luigid",
    "content": "#!/usr/bin/env python\n\nimport sys\nimport warnings\nimport luigi.cmdline\n\n\ndef main(argv):\n    warnings.warn(\"'bin/luigid' has moved to console script 'luigid'\", DeprecationWarning)\n    luigi.cmdline.luigid(argv)\n\n\nif __name__ == '__main__':\n    main(sys.argv[1:])\n"
  },
  {
    "path": "catalog-info.yaml",
    "content": "apiVersion: backstage.io/v1alpha1\nkind: Component\nmetadata:\n  name: luigi\nspec:\n  type: library\n  owner: dataex\n"
  },
  {
    "path": "codecov.yml",
    "content": "codecov:\n  require_ci_to_pass: true\n  notify:\n    wait_for_ci: true\n\ncoverage:\n  precision: 2\n  round: down\n  range: \"50...70\"\n\n  status:\n    project:\n      default: false  # disable the default status that measures entire project\n      core:\n        target: 90%\n        paths:\n          - \"luigi/*.py\"\n    patch:\n      default:\n        target: 50%\n        if_no_uploads: error\n\n    changes:\n      default:\n        informational: true\n\n  ignore:\n    - \"examples/\"\n    - \"luigi/tools\"  # These are tested as actual run commands without coverage\n    # List modules who's tests are not run by CI or are run in a subprocesses (like on cluster).\n    - \"luigi/contrib/beam_dataflow.py\"\n    - \"luigi/contrib/bigquery.py\"\n    - \"luigi/contrib/bigquery_avro.py\"\n    - \"luigi/contrib/dataproc.py\"\n    - \"luigi/contrib/dropbox.py\"\n    - \"luigi/contrib/ftp.py\"\n    - \"luigi/contrib/gcs.py\"\n    - \"luigi/contrib/hadoop.py\"\n    - \"luigi/contrib/hdfs/\"\n    - \"luigi/contrib/kubernetes.py\"\n    - \"luigi/contrib/mrrunner.py\"\n    - \"luigi/contrib/sparkey.py\"\n    - \"luigi/contrib/webhdfs.py\"\n\n# For luigi we do not want any comments\ncomment: false\n"
  },
  {
    "path": "doc/.gitignore",
    "content": "_static\n_build\n_templates\n"
  },
  {
    "path": "doc/Makefile",
    "content": "# Makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line.\nSPHINXOPTS    =\nSPHINXBUILD   = sphinx-build\nPAPER         =\nBUILDDIR      = _build\n\n# User-friendly check for sphinx-build\nifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)\n$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)\nendif\n\n# Internal variables.\nPAPEROPT_a4     = -D latex_paper_size=a4\nPAPEROPT_letter = -D latex_paper_size=letter\nALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .\n# the i18n builder cannot share the environment and doctrees with the others\nI18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .\n\n.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext\n\nhelp:\n\t@echo \"Please use \\`make <target>' where <target> is one of\"\n\t@echo \"  html       to make standalone HTML files\"\n\t@echo \"  dirhtml    to make HTML files named index.html in directories\"\n\t@echo \"  singlehtml to make a single large HTML file\"\n\t@echo \"  pickle     to make pickle files\"\n\t@echo \"  json       to make JSON files\"\n\t@echo \"  htmlhelp   to make HTML files and a HTML help project\"\n\t@echo \"  qthelp     to make HTML files and a qthelp project\"\n\t@echo \"  devhelp    to make HTML files and a Devhelp project\"\n\t@echo \"  epub       to make an epub\"\n\t@echo \"  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter\"\n\t@echo \"  latexpdf   to make LaTeX files and run them through pdflatex\"\n\t@echo \"  latexpdfja to make LaTeX files and run them through platex/dvipdfmx\"\n\t@echo \"  text    
   to make text files\"\n\t@echo \"  man        to make manual pages\"\n\t@echo \"  texinfo    to make Texinfo files\"\n\t@echo \"  info       to make Texinfo files and run them through makeinfo\"\n\t@echo \"  gettext    to make PO message catalogs\"\n\t@echo \"  changes    to make an overview of all changed/added/deprecated items\"\n\t@echo \"  xml        to make Docutils-native XML files\"\n\t@echo \"  pseudoxml  to make pseudoxml-XML files for display purposes\"\n\t@echo \"  linkcheck  to check all external links for integrity\"\n\t@echo \"  doctest    to run all doctests embedded in the documentation (if enabled)\"\n\nclean:\n\trm -rf $(BUILDDIR)/*\n\nhtml:\n\t$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html\n\t@echo\n\t@echo \"Build finished. The HTML pages are in $(BUILDDIR)/html.\"\n\ndirhtml:\n\t$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml\n\t@echo\n\t@echo \"Build finished. The HTML pages are in $(BUILDDIR)/dirhtml.\"\n\nsinglehtml:\n\t$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml\n\t@echo\n\t@echo \"Build finished. 
The HTML page is in $(BUILDDIR)/singlehtml.\"\n\npickle:\n\t$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle\n\t@echo\n\t@echo \"Build finished; now you can process the pickle files.\"\n\njson:\n\t$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json\n\t@echo\n\t@echo \"Build finished; now you can process the JSON files.\"\n\nhtmlhelp:\n\t$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp\n\t@echo\n\t@echo \"Build finished; now you can run HTML Help Workshop with the\" \\\n\t      \".hhp project file in $(BUILDDIR)/htmlhelp.\"\n\nqthelp:\n\t$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp\n\t@echo\n\t@echo \"Build finished; now you can run \"qcollectiongenerator\" with the\" \\\n\t      \".qhcp project file in $(BUILDDIR)/qthelp, like this:\"\n\t@echo \"# qcollectiongenerator $(BUILDDIR)/qthelp/Luigi.qhcp\"\n\t@echo \"To view the help file:\"\n\t@echo \"# assistant -collectionFile $(BUILDDIR)/qthelp/Luigi.qhc\"\n\ndevhelp:\n\t$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp\n\t@echo\n\t@echo \"Build finished.\"\n\t@echo \"To view the help file:\"\n\t@echo \"# mkdir -p $$HOME/.local/share/devhelp/Luigi\"\n\t@echo \"# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Luigi\"\n\t@echo \"# devhelp\"\n\nepub:\n\t$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub\n\t@echo\n\t@echo \"Build finished. 
The epub file is in $(BUILDDIR)/epub.\"\n\nlatex:\n\t$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex\n\t@echo\n\t@echo \"Build finished; the LaTeX files are in $(BUILDDIR)/latex.\"\n\t@echo \"Run \\`make' in that directory to run these through (pdf)latex\" \\\n\t      \"(use \\`make latexpdf' here to do that automatically).\"\n\nlatexpdf:\n\t$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex\n\t@echo \"Running LaTeX files through pdflatex...\"\n\t$(MAKE) -C $(BUILDDIR)/latex all-pdf\n\t@echo \"pdflatex finished; the PDF files are in $(BUILDDIR)/latex.\"\n\nlatexpdfja:\n\t$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex\n\t@echo \"Running LaTeX files through platex and dvipdfmx...\"\n\t$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja\n\t@echo \"pdflatex finished; the PDF files are in $(BUILDDIR)/latex.\"\n\ntext:\n\t$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text\n\t@echo\n\t@echo \"Build finished. The text files are in $(BUILDDIR)/text.\"\n\nman:\n\t$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man\n\t@echo\n\t@echo \"Build finished. The manual pages are in $(BUILDDIR)/man.\"\n\ntexinfo:\n\t$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo\n\t@echo\n\t@echo \"Build finished. The Texinfo files are in $(BUILDDIR)/texinfo.\"\n\t@echo \"Run \\`make' in that directory to run these through makeinfo\" \\\n\t      \"(use \\`make info' here to do that automatically).\"\n\ninfo:\n\t$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo\n\t@echo \"Running Texinfo files through makeinfo...\"\n\tmake -C $(BUILDDIR)/texinfo info\n\t@echo \"makeinfo finished; the Info files are in $(BUILDDIR)/texinfo.\"\n\ngettext:\n\t$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale\n\t@echo\n\t@echo \"Build finished. 
The message catalogs are in $(BUILDDIR)/locale.\"\n\nchanges:\n\t$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes\n\t@echo\n\t@echo \"The overview file is in $(BUILDDIR)/changes.\"\n\nlinkcheck:\n\t$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck\n\t@echo\n\t@echo \"Link check complete; look for any errors in the above output \" \\\n\t      \"or in $(BUILDDIR)/linkcheck/output.txt.\"\n\ndoctest:\n\t$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest\n\t@echo \"Testing of doctests in the sources finished, look at the \" \\\n\t      \"results in $(BUILDDIR)/doctest/output.txt.\"\n\nxml:\n\t$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml\n\t@echo\n\t@echo \"Build finished. The XML files are in $(BUILDDIR)/xml.\"\n\npseudoxml:\n\t$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml\n\t@echo\n\t@echo \"Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml.\"\n"
  },
  {
    "path": "doc/central_scheduler.rst",
    "content": "Using the Central Scheduler\n---------------------------\n\nWhile the ``--local-scheduler`` flag is useful for development purposes,\nit's not recommended for production usage.\nThe centralized scheduler serves two purposes:\n\n-  Make sure two instances of the same task are not running simultaneously\n-  Provide visualization of everything that's going on.\n\nNote that the central scheduler does not execute anything for you or\nhelp you with job parallelization.\nFor running tasks periodically,\nthe easiest thing to do is to trigger a Python script from cron or\nfrom a continuously running process.\nThere is no central process that automatically triggers jobs.\nThis model may seem limited, but\nwe believe that it makes things far more intuitive and easy to understand.\n\n.. figure:: dependency_graph.png\n   :alt: Dependency graph in the visualiser\n\nThe luigid server\n~~~~~~~~~~~~~~~~~\n\nTo run the server as a daemon run:\n\n.. code-block:: console\n\n    $ luigid --background --pidfile <PATH_TO_PIDFILE> --logdir <PATH_TO_LOGDIR> --state-path <PATH_TO_STATEFILE>\n\nNote that this requires ``python-daemon``.\nBy default, the server starts on AF_INET and AF_INET6 port ``8082``\n(which can be changed with the ``--port`` flag) and listens on all IPs. To change the default behavior of listening on all IPs, pass the ``--address`` flag and the IP address to listen on.\nTo use an AF_UNIX socket use the ``--unix-socket`` flag.\n\nFor a full list of configuration options and defaults,\nsee the :ref:`scheduler configuration section <scheduler-config>`.\nNote that ``luigid`` uses the same configuration files as the Luigi client\n(i.e. ``luigi.cfg`` or ``/etc/luigi/client.cfg`` by default).\n\n.. 
_TaskHistory:\n\nEnabling Task History\n~~~~~~~~~~~~~~~~~~~~~\n\nTask History is an experimental feature in which\nadditional information about tasks that have been executed are recorded in a relational database\nfor historical analysis.\nThis information is exposed via the Central Scheduler at ``/history``.\n\nTo enable the task history,\nspecify ``record_task_history = True`` in the\n``[scheduler]`` section of ``luigi.cfg`` and\nspecify ``db_connection`` under ``[task_history]``.\nThe ``db_connection`` string is used to configure the `SQLAlchemy engine\n<http://docs.sqlalchemy.org/en/rel_0_9/core/engines.html>`_.\nWhen starting up,\n``luigid`` will create all the necessary tables using `create_all\n<http://docs.sqlalchemy.org/en/rel_0_9/core/metadata.html#sqlalchemy.schema.MetaData.create_all>`_.\n\nExample configuration\n\n.. code:: ini\n\n    [scheduler]\n    record_task_history = True\n    state_path = /usr/local/var/luigi-state.pickle\n\n    [task_history]\n    db_connection = sqlite:////usr/local/var/luigi-task-hist.db\n\nThe task history has the following pages:\n\n* ``/history``\n  a reverse-cronological listing of runs from the past 24 hours.\n  Example screenshot:\n\n    .. figure:: history.png\n       :alt: Recent history screenshot\n* ``/history/by_id/{id}``\n  detailed information about a run, including:\n  parameter values, the host on which it ran, and timing information.\n  Example screenshot:\n\n    .. figure:: history_by_id.png\n       :alt: By id screenshot\n* ``/history/by_name/{name}``\n  a listing of all runs of a task with the given task ``{name}``.\n  Example screenshot:\n\n    .. figure:: history_by_name.png\n       :alt: By name screenshot\n* ``/history/by_params/{name}?data=params``\n  a listing of all runs of the task ``{name}`` restricted to runs with ``params`` matching the given history.\n  The ``params`` is a json blob describing the parameters,\n  e.g. 
``data={\"foo\": \"bar\"}`` looks for a task with ``foo=bar``.\n* ``/history/by_task_id/{task_id}``\n  the latest run of a task given the ``{task_id}``. It is different from just ``{id}``\n  and is a derivative of ``params``. It is available via ``{task_id}`` property of a \n  ``luigi.Task`` instance or via `luigi.task.task_id_str\n  <https://luigi.readthedocs.io/en/stable/api/luigi.task.html#luigi.task.task_id_str>`_.\n  This kind of representation is useful for concisely recording URLs in a history tree.\n  Example screenshot:\n\n    .. figure:: history_by_task_id.png\n       :alt: By task_id screenshot\n  \n"
  },
  {
    "path": "doc/conf.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Luigi documentation build configuration file, created by\n# sphinx-quickstart on Sat Feb  8 00:56:43 2014.\n#\n# This file is execfile()d with the current directory set to its\n# containing dir.\n#\n# Note that not all possible configuration values are present in this\n# autogenerated file.\n#\n# All configuration values have a default; values that are commented out\n# serve to show the default.\n\nimport sys\nimport os\nimport datetime\nfrom importlib.metadata import Distribution\n\n\ntry:\n    import luigi\n    import luigi.parameter\n\n    def parameter_repr(self):\n        \"\"\"\n        When building documentation, we want Parameter objects to show their\n        description in a nice way\n        \"\"\"\n        significance = 'Insignificant ' if not self.significant else ''\n        class_name = self.__class__.__name__\n        has_default = self._default != luigi.parameter._no_value\n        default = ' (defaults to {})'.format(self._default) if has_default else ''\n        description = (': ' + self.description if self.description else '')\n        return significance + class_name + default + description\n\n    luigi.parameter.Parameter.__repr__ = parameter_repr\n\n    def assertIn(needle, haystack):\n        \"\"\"\n        We test repr of Parameter objects, since it'll be used for readthedocs\n        \"\"\"\n        assert needle in haystack\n\n    # TODO: find a better place to put this!\n    assertIn('IntParameter', repr(luigi.IntParameter()))\n    assertIn('defaults to 37', repr(luigi.IntParameter(default=37)))\n    assertIn('hi mom', repr(luigi.IntParameter(description='hi mom')))\n    assertIn('Insignificant BoolParameter', repr(luigi.BoolParameter(significant=False)))\nexcept ImportError:\n    pass\n\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. 
If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\nsys.path.insert(0, os.path.abspath(os.path.pardir))\n\n# append the __init__ to class definitions\nautoclass_content = 'both'\n\n# -- General configuration ------------------------------------------------\n\n# If your documentation needs a minimal Sphinx version, state it here.\nneeds_sphinx = '9.0'\n\n# Add any Sphinx extension module names here, as strings. They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = [\n    'sphinx.ext.autodoc',\n    'sphinx.ext.viewcode',\n    'sphinx.ext.autosummary',\n]\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = ['_templates']\n\n# The suffix of source filenames.\nsource_suffix = '.rst'\n\n# The encoding of source files.\n#source_encoding = 'utf-8-sig'\n\n# The master toctree document.\nmaster_doc = 'index'\n\n# General information about the project.\nproject = u'Luigi'\nauthors = u\"The Luigi Authors\"\ncopyright = u\"2011-{}, {}\".format(datetime.datetime.now().year, authors)\n\n# The version info for the project you're documenting, acts as replacement for\n# |version| and |release|, also used in various other places throughout the\n# built documents.\n#\n__version__ = Distribution.from_name('luigi').version  # assume luigi is already installed\n# The short X.Y version.\nversion = \".\".join(__version__.split(\".\")[0:2])\n# The full version, including alpha/beta/rc tags.\nrelease = __version__\n\n# The language for content autogenerated by Sphinx. 
Refer to documentation\n# for a list of supported languages.\n#language = None\n\n# There are two options for replacing |today|: either, you set today to some\n# non-false value, then it is used:\n#today = ''\n# Else, today_fmt is used as the format for a strftime call.\n#today_fmt = '%B %d, %Y'\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\nexclude_patterns = ['_build', 'README.rst']\n\n# The reST default role (used for this markup: `text`) to use for all\n# documents.\n#default_role = None\n\n# If true, '()' will be appended to :func: etc. cross-reference text.\n#add_function_parentheses = True\n\n# If true, the current module name will be prepended to all description\n# unit titles (such as .. function::).\n#add_module_names = True\n\n# If true, sectionauthor and moduleauthor directives will be shown in the\n# output. They are ignored by default.\n#show_authors = False\n\n# The name of the Pygments (syntax highlighting) style to use.\npygments_style = 'sphinx'\n\n# A list of ignored prefixes for module index sorting.\n#modindex_common_prefix = []\n\n# If true, keep warnings as \"system message\" paragraphs in the built documents.\n#keep_warnings = False\n\nautodoc_default_options = {'members': True, 'undoc-members': True}\nautosummary_generate = True\nautodoc_member_order = 'bysource'\n\n# -- Options for HTML output ----------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  See the documentation for\n# a list of builtin themes.\n\nhtml_theme = 'sphinx_rtd_theme'\n\n# Theme options are theme-specific and customize the look and feel of a theme\n# further.  For a list of options available for each theme, see the\n# documentation.\n#html_theme_options = {}\n\n# Add any paths that contain custom themes here, relative to this directory.\n#html_theme_path = []\n\n# The name for this set of Sphinx documents.  
If None, it defaults to\n# \"<project> v<release> documentation\".\n#html_title = None\n\n# A shorter title for the navigation bar.  Default is the same as html_title.\n#html_short_title = None\n\n# The name of an image file (relative to this directory) to place at the top\n# of the sidebar.\nhtml_logo = 'luigi.png'\n\n# The name of an image file (within the static path) to use as favicon of the\n# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32\n# pixels large.\n#html_favicon = None\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\n#html_static_path = ['_static']\n\n# Add any extra paths that contain custom files (such as robots.txt or\n# .htaccess) here, relative to this directory. These files are copied\n# directly to the root of the documentation.\n#html_extra_path = []\n\n# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,\n# using the given strftime format.\n#html_last_updated_fmt = '%b %d, %Y'\n\n# If true, SmartyPants will be used to convert quotes and dashes to\n# typographically correct entities.\n#html_use_smartypants = True\n\n# Custom sidebar templates, maps document names to template names.\n#html_sidebars = {}\n\n# Additional templates that should be rendered to pages, maps page names to\n# template names.\n#html_additional_pages = {}\n\n# If false, no module index is generated.\n#html_domain_indices = True\n\n# If false, no index is generated.\n#html_use_index = True\n\n# If true, the index is split into individual pages for each letter.\n#html_split_index = False\n\n# If true, links to the reST sources are added to the pages.\n#html_show_sourcelink = True\n\n# If true, \"Created using Sphinx\" is shown in the HTML footer. 
Default is True.\n#html_show_sphinx = True\n\n# If true, \"(C) Copyright ...\" is shown in the HTML footer. Default is True.\n#html_show_copyright = True\n\n# If true, an OpenSearch description file will be output, and all pages will\n# contain a <link> tag referring to it.  The value of this option must be the\n# base URL from which the finished HTML is served.\n#html_use_opensearch = ''\n\n# This is the file name suffix for HTML files (e.g. \".xhtml\").\n#html_file_suffix = None\n\n# Output file base name for HTML help builder.\nhtmlhelp_basename = 'Luigidoc'\n\n\n# -- Options for LaTeX output ---------------------------------------------\n\nlatex_elements = {\n    # The paper size ('letterpaper' or 'a4paper').\n    #'papersize': 'letterpaper',\n\n    # The font size ('10pt', '11pt' or '12pt').\n    #'pointsize': '10pt',\n\n    # Additional stuff for the LaTeX preamble.\n    #'preamble': '',\n}\n\n# Grouping the document tree into LaTeX files. List of tuples\n# (source start file, target name, title,\n#  author, documentclass [howto, manual, or own class]).\nlatex_documents = [\n    ('index', 'Luigi.tex', u'Luigi Documentation',\n     authors, 'manual'),\n]\n\n# The name of an image file (relative to this directory) to place at the top of\n# the title page.\n#latex_logo = None\n\n# For \"manual\" documents, if this is true, then toplevel headings are parts,\n# not chapters.\n#latex_use_parts = False\n\n# If true, show page references after internal links.\n#latex_show_pagerefs = False\n\n# If true, show URL addresses after external links.\n#latex_show_urls = False\n\n# Documents to append as an appendix to all manuals.\n#latex_appendices = []\n\n# If false, no module index is generated.\n#latex_domain_indices = True\n\n\n# -- Options for manual page output ---------------------------------------\n\n# One entry per manual page. 
List of tuples\n# (source start file, name, description, authors, manual section).\nman_pages = [\n    ('index', 'luigi', u'Luigi Documentation',\n     [authors], 1)\n]\n\n# If true, show URL addresses after external links.\n#man_show_urls = False\n\n\n# -- Options for Texinfo output -------------------------------------------\n\n# Grouping the document tree into Texinfo files. List of tuples\n# (source start file, target name, title, author,\n#  dir menu entry, description, category)\ntexinfo_documents = [\n    ('index', 'Luigi', u'Luigi Documentation',\n     authors, 'Luigi', 'One line description of project.',\n     'Miscellaneous'),\n]\n\n# Documents to append as an appendix to all manuals.\n#texinfo_appendices = []\n\n# If false, no module index is generated.\n#texinfo_domain_indices = True\n\n# How to display URL addresses: 'footnote', 'no', or 'inline'.\n#texinfo_show_urls = 'footnote'\n\n# If true, do not generate a @detailmenu in the \"Top\" node's menu.\n#texinfo_no_detailmenu = False\n\nautodoc_mock_imports = [\"mypy\"]\n\n# sphinx-apidoc --separate generates individual RST files not referenced by any toctree;\n# suppress the resulting warnings since this is expected behaviour.\nsuppress_warnings = ['toc.not_included']\n\n# Some regression introduced\n# https://github.com/sphinx-doc/sphinx/issues/2330\n# https://github.com/spotify/luigi/pull/1555\nhighlight_language = \"python\"\n"
  },
  {
    "path": "doc/configuration.rst",
    "content": "Configuration\n=============\n\nAll configuration can be done by adding configuration files.\n\nSupported config parsers:\n\n* ``cfg`` (default), based on Python's standard ConfigParser_. Values may refer to environment variables using ``${ENVVAR}`` syntax.\n* ``toml``\n\n.. _ConfigParser: https://docs.python.org/3/library/configparser.html\n\nYou can choose the parser via the ``LUIGI_CONFIG_PARSER`` environment variable. For example, ``LUIGI_CONFIG_PARSER=toml``.\n\nConfig files for the default (cfg) parser are looked for in:\n\n* ``/etc/luigi/client.cfg`` (deprecated)\n* ``/etc/luigi/luigi.cfg``\n* ``client.cfg`` (deprecated)\n* ``luigi.cfg``\n* ``LUIGI_CONFIG_PATH`` environment variable\n\nConfig files for the `TOML <https://github.com/toml-lang/toml>`_ parser are looked for in:\n\n* ``/etc/luigi/luigi.toml``\n* ``luigi.toml``\n* ``LUIGI_CONFIG_PATH`` environment variable\n\nBoth config lists increase in priority (from low to high). The order only\nmatters in case of key conflicts (see docs for ConfigParser.read_).\nThese files are meant for both the client and ``luigid``.\nIf you decide to specify your own configuration you should make sure\nthat both the client and ``luigid`` load it properly.\n\n.. _ConfigParser.read: https://docs.python.org/3/library/configparser.html#configparser.ConfigParser.read\n\nThe config file is broken into sections, each controlling a different part of the config.\n\nExample cfg config:\n\n.. code:: ini\n\n    [hadoop]\n    version=cdh4\n    streaming_jar=/usr/lib/hadoop-xyz/hadoop-streaming-xyz-123.jar\n\n    [core]\n    scheduler_host=luigi-host.mycompany.foo\n\nExample toml config:\n\n.. code:: python\n\n    [hadoop]\n    version = \"cdh4\"\n    streaming_jar = \"/usr/lib/hadoop-xyz/hadoop-streaming-xyz-123.jar\"\n\n    [core]\n    scheduler_host = \"luigi-host.mycompany.foo\"\n\nAlso see `examples/config.toml\n<https://github.com/spotify/luigi/blob/master/examples/config.toml>`_\nfor a more complex example.\n\n.. 
_ParamConfigIngestion:\n\nParameters from config Ingestion\n--------------------------------\n\nAll parameters can be overridden from configuration files. For instance if you\nhave a Task definition:\n\n.. code:: python\n\n    class DailyReport(luigi.contrib.hadoop.JobTask):\n        date = luigi.DateParameter(default=datetime.date.today())\n        # ...\n\nThen you can override the default value for ``DailyReport().date`` by providing\nit in the configuration:\n\n.. code:: ini\n\n    [DailyReport]\n    date=2012-01-01\n\n.. _ConfigClasses:\n\nConfiguration classes\n*********************\n\nUsing the :ref:`ParamConfigIngestion` method, we derive the\nconventional way to do global configuration. Imagine this configuration.\n\n.. code:: ini\n\n    [mysection]\n    option=hello\n    intoption=123\n\n\nWe can create a :py:class:`~luigi.Config` class:\n\n.. code:: python\n\n    import luigi\n\n    # Config classes should be camel cased\n    class mysection(luigi.Config):\n        option = luigi.Parameter(default='world')\n        intoption = luigi.IntParameter(default=555)\n\n    mysection().option\n    mysection().intoption\n\n\nConfigurable options\n--------------------\n\nLuigi comes with a lot of configurable options. Below, we describe each\nsection and the parameters available within it.\n\n\n[core]\n------\n\nThese parameters control core Luigi behavior, such as error e-mails and\ninteractions between the worker and scheduler.\n\nautoload_range\n  .. versionadded:: 2.8.11\n\n  If false, prevents range tasks from autoloading. They can still be loaded\n  using ``--module luigi.tools.range``. Defaults to true. Setting this to true\n  explicitly disables the deprecation warning.\n\ndefault_scheduler_host\n  Hostname of the machine running the scheduler. Defaults to localhost.\n\ndefault_scheduler_port\n  Port of the remote scheduler api process. Defaults to 8082.\n\ndefault_scheduler_url\n  Full path to remote scheduler. 
Defaults to ``http://localhost:8082/``.\n  For TLS support use the URL scheme: ``https``,\n  example: ``https://luigi.example.com:443/``\n  (Note: you will have to terminate TLS using an HTTP proxy)\n  You can also use this to connect to a local Unix socket using the\n  non-standard URI scheme: ``http+unix``\n  example: ``http+unix://%2Fvar%2Frun%2Fluigid%2Fluigid.sock/``\n\nhdfs_tmp_dir\n  Base directory in which to store temporary files on hdfs. Defaults to\n  tempfile.gettempdir()\n\nhistory_filename\n  If set, specifies a filename for Luigi to write stuff (currently just\n  job id) to in mapreduce job's output directory. Useful in a\n  configuration where no history is stored in the output directory by\n  Hadoop.\n\nlog_level\n  The default log level to use when no logging_conf_file is set. Must be\n  a valid name of a `Python log level\n  <https://docs.python.org/2/library/logging.html#logging-levels>`_.\n  Default is ``DEBUG``.\n\nlogging_conf_file\n  Location of the logging configuration file.\n\nno_configure_logging\n  If true, logging is not configured. Defaults to false.\n\nparallel_scheduling\n  If true, the scheduler will compute complete functions of tasks in\n  parallel using multiprocessing. This can significantly speed up\n  scheduling, but requires that all tasks can be pickled.\n  Defaults to false.\n\nparallel_scheduling_processes\n  The number of processes to use for parallel scheduling. If not specified\n  the default number of processes will be the total number of CPUs available.\n\nrpc_connect_timeout\n  Number of seconds to wait before timing out when making an API call.\n  Defaults to 10.0\n\nrpc_retry_attempts\n  The maximum number of retries to connect the central scheduler before giving up.\n  Defaults to 3\n\nrpc_retry_wait\n  Number of seconds to wait before the next attempt will be started to\n  connect to the central scheduler between two retry attempts.\n  Defaults to 30\n\n\n[cors]\n------\n\n.. 
versionadded:: 2.8.0\n\nThese parameters control ``/api/<method>`` ``CORS`` behaviour (see: `W3C Cross-Origin Resource Sharing\n<http://www.w3.org/TR/cors/>`_).\n\nenabled\n  Enables CORS support.\n  Defaults to false.\n\nallowed_origins\n  A list of allowed origins. Used only if ``allow_any_origin`` is false.\n  Configure in JSON array format, e.g. [\"foo\", \"bar\"].\n  Defaults to empty.\n\nallow_any_origin\n  Accepts requests from any origin.\n  Defaults to false.\n\nallow_null_origin\n  Allows the request to set ``null`` value of the ``Origin`` header.\n  Defaults to false.\n\nmax_age\n  Content of ``Access-Control-Max-Age``.\n  Defaults to 86400 (24 hours).\n\nallowed_methods\n  Content of ``Access-Control-Allow-Methods``.\n  Defaults to ``GET, OPTIONS``.\n\nallowed_headers\n  Content of ``Access-Control-Allow-Headers``.\n  Defaults to ``Accept, Content-Type, Origin``.\n\nexposed_headers\n  Content of ``Access-Control-Expose-Headers``.\n  Defaults to empty string (will NOT be sent as a response header).\n\nallow_credentials\n  Indicates that the actual request can include user credentials.\n  Defaults to false.\n\n.. _worker-config:\n\n[worker]\n--------\n\nThese parameters control Luigi worker behavior.\n\ncount_uniques\n  If true, workers will only count unique pending jobs when deciding\n  whether to stay alive. So if a worker can't get a job to run and other\n  workers are waiting on all of its pending jobs, the worker will die.\n  ``worker_keep_alive`` must be ``true`` for this to have any effect. Defaults\n  to false.\n\nkeep_alive\n  If true, workers will stay alive when they run out of jobs to run, as\n  long as they have some pending job waiting to be run. Defaults to\n  false.\n\nping_interval\n  Number of seconds to wait between pinging scheduler to let it know\n  that the worker is still alive. Defaults to 1.0.\n\ntask_limit\n  .. versionadded:: 1.0.25\n\n  Maximum number of tasks to schedule per invocation. 
Upon exceeding it,\n  the worker will issue a warning and proceed with the workflow obtained\n  thus far. Prevents incidents due to spamming of the scheduler, usually\n  accidental. Default: no limit.\n\ntask_process_context\n  An optional setting allowing Luigi to import a custom context manager\n  used to wrap the execution of tasks' run methods. Default: no context manager.\n\ntimeout\n  .. versionadded:: 1.0.20\n\n  Number of seconds after which to kill a task which has been running\n  for too long. This provides a default value for all tasks, which can\n  be overridden by setting the ``worker_timeout`` property in any task.\n  Default value is 0, meaning no timeout.\n\nwait_interval\n  Number of seconds for the worker to wait before asking the scheduler\n  for another job after the scheduler has said that it does not have any\n  available jobs.\n\nwait_jitter\n  Duration of jitter to add to the worker wait interval such that the multiple\n  workers do not ask the scheduler for another job at the same time, in seconds.\n  Default: 5.0\n\nmax_keep_alive_idle_duration\n  .. versionadded:: 2.8.4\n\n  Maximum duration in seconds to keep worker alive while in idle state.\n  Default: 0 (Indefinitely)\n\nmax_reschedules\n  The maximum number of times that a job can be automatically\n  rescheduled by a worker before it will stop trying. Workers will\n  reschedule a job if it is found to not be done when attempting to run\n  a dependent job. This defaults to 1.\n\nretry_external_tasks\n  If true, incomplete external tasks (i.e. 
tasks where the ``run()`` method is\n  NotImplemented) will be retested for completion while Luigi is running.\n  This means that if external dependencies are satisfied after a workflow has\n  started, any tasks dependent on that resource will be eligible for running.\n  Note: Every time the task remains incomplete, it will count as FAILED, so\n  normal retry logic applies (see: ``retry_count`` and ``retry_delay``).\n  This setting works best with ``worker_keep_alive: true``.\n  If false, external tasks will only be evaluated when Luigi is first invoked.\n  In this case, Luigi will not check whether external dependencies are\n  satisfied  while a workflow is in progress, so dependent tasks will remain\n  PENDING until the workflow is reinvoked.\n  Defaults to false for backwards compatibility.\n\nno_install_shutdown_handler\n  By default, workers will stop requesting new work and finish running\n  pending tasks after receiving a ``SIGUSR1`` signal. This provides a hook\n  for gracefully shutting down workers that are in the process of running\n  (potentially expensive) tasks. If set to true, Luigi will NOT install\n  this shutdown hook on workers. Note this hook does not work on Windows\n  operating systems, or when jobs are launched outside the main execution\n  thread.\n  Defaults to false.\n\nsend_failure_email\n  Controls whether the worker will send e-mails on task and scheduling\n  failures. If set to false, workers will only send e-mails on\n  framework errors during scheduling and all other e-mail must be\n  handled by the scheduler.\n  Defaults to true.\n\ncheck_unfulfilled_deps\n  If true, the worker checks for completeness of dependencies before running a\n  task. In case unfulfilled dependencies are detected, an exception is raised\n  and the task will not run. This mechanism is useful to detect situations\n  where tasks do not create their outputs properly, or when targets were\n  removed after the dependency tree was built. 
It is recommended to disable\n  this feature only when the completeness checks are known to be bottlenecks,\n  e.g. when the ``exists()`` calls of the dependencies' outputs are\n  resource-intensive.\n  Defaults to true.\n\nforce_multiprocessing\n  By default, luigi uses multiprocessing when *more than one* worker process is\n  requested. When set to true, multiprocessing is used independent of the\n  number of workers.\n  Defaults to false.\n\ncheck_complete_on_run\n  By default, luigi tasks are marked as 'done' when they finish running without\n  raising an error. When set to true, tasks will also verify that their outputs\n  exist when they finish running, and will fail immediately if the outputs are\n  missing.\n  Defaults to false.\n\ncache_task_completion\n  By default, luigi task processes might check the completion status multiple\n  times per task which is a safe way to avoid potential inconsistencies. For\n  tasks with many dynamic dependencies, yielded in multiple stages, this might\n  become expensive, e.g. in case the per-task completion check entails remote\n  resources. When set to true, completion checks are cached so that tasks\n  declared as complete once are not checked again.\n  Defaults to false.\n\n\n[elasticsearch]\n---------------\n\nThese parameters control use of elasticsearch\n\nmarker_index\n  Defaults to \"update_log\".\n\nmarker_doc_type\n  Defaults to \"entry\".\n\n\n[email]\n-------\n\nGeneral parameters\n\nforce_send\n  If true, e-mails are sent in all run configurations (even if stdout is\n  connected to a tty device).  Defaults to False.\n\nformat\n  Type of e-mail to send. Valid values are \"plain\", \"html\" and \"none\".\n  When set to html, tracebacks are wrapped in <pre> tags to get fixed-\n  width font. When set to none, no e-mails will be sent.\n\n  Default value is plain.\n\nmethod\n  Valid values are \"smtp\", \"sendgrid\", \"ses\" and \"sns\". SES and SNS are\n  services of Amazon web services. 
SendGrid is an email delivery service.\n  The default value is \"smtp\".\n\n  In order to send messages through Amazon SNS or SES set up your AWS\n  config files or run Luigi on an EC2 instance with proper instance\n  profile.\n\n  In order to use sendgrid, fill in your sendgrid API key in the\n  `[sendgrid]`_ section.\n\n  In order to use smtp, fill in the appropriate fields in the `[smtp]`_\n  section.\n\nprefix\n  Optional prefix to add to the subject line of all e-mails. For\n  example, setting this to \"[LUIGI]\" would change the subject line of an\n  e-mail from \"Luigi: Framework error\" to \"[LUIGI] Luigi: Framework\n  error\"\n\nreceiver\n  Recipient of all error e-mails. If this is not set, no error e-mails\n  are sent when Luigi crashes unless the crashed job has owners set. If\n  Luigi is run from the command line, no e-mails will be sent unless\n  output is redirected to a file.\n\n  Set it to SNS Topic ARN if you want to receive notifications through\n  Amazon SNS. Make sure to set method to sns in this case too.\n\nsender\n  User name in from field of error e-mails.\n  Default value: luigi-client@<server_name>\n\ntraceback_max_length\n  Maximum length for traceback included in error email. Default is 5000.\n\n\n[batch_email]\n----------------\n\nParameters controlling the contents of batch notifications sent from the\nscheduler\n\nemail_interval\n  Number of minutes between e-mail sends. Making this larger results in\n  fewer, bigger e-mails.\n  Defaults to 60.\n\nbatch_mode\n  Controls how tasks are grouped together in the e-mail. Suppose we have\n  the following sequence of failures:\n\n  1. TaskA(a=1, b=1)\n  2. TaskA(a=1, b=1)\n  3. TaskA(a=2, b=1)\n  4. TaskA(a=1, b=2)\n  5. TaskB(a=1, b=1)\n\n  For any setting of batch_mode, the batch e-mail will record 5 failures\n  and mention them in the subject. The difference is in how they will\n  be displayed in the body. 
Here are example bodies with error_messages\n  set to 0.\n\n  \"all\" only groups together failures for the exact same task:\n\n  - TaskA(a=1, b=1) (2 failures)\n  - TaskA(a=1, b=2) (1 failure)\n  - TaskA(a=2, b=1) (1 failure)\n  - TaskB(a=1, b=1) (1 failure)\n\n  \"family\" groups together failures for tasks of the same family:\n\n  - TaskA (4 failures)\n  - TaskB (1 failure)\n\n  \"unbatched_params\" groups together tasks that look the same after\n  removing batched parameters. So if TaskA has a batch_method set for\n  parameter a, we get the following:\n\n  - TaskA(b=1) (3 failures)\n  - TaskA(b=2) (1 failure)\n  - TaskB(a=1, b=1) (1 failure)\n\n  Defaults to \"unbatched_params\", which is identical to \"all\" if you are\n  not using batched parameters.\n\nerror_lines\n  Number of lines to include from each error message in the batch\n  e-mail. This can be used to keep e-mails shorter while preserving the\n  more useful information usually found near the bottom of stack traces.\n  This can be set to 0 to include all lines. If you don't wish to see\n  error messages, instead set ``error_messages`` to 0.\n  Defaults to 20.\n\nerror_messages\n  Number of messages to preserve for each task group. As most tasks that\n  fail repeatedly do so for similar reasons each time, it's not usually\n  necessary to keep every message. This controls how many messages are\n  kept for each task or task group. The most recent error messages are\n  kept. Set to 0 to not include error messages in the e-mails.\n  Defaults to 1.\n\ngroup_by_error_messages\n  Quite often, a system or cluster failure will cause many disparate\n  task types to fail for the same reason. This can cause a lot of noise\n  in the batch e-mails. This cuts down on the noise by listing items\n  with identical error messages together. 
Error messages are compared\n  after limiting by ``error_lines``.\n  Defaults to true.\n\n\n[hadoop]\n--------\n\nParameters controlling basic hadoop tasks\n\ncommand\n  Name of command for running hadoop from the command line. Defaults to\n  \"hadoop\"\n\npython_executable\n  Name of command for running python from the command line. Defaults to\n  \"python\"\n\nscheduler\n  Type of scheduler to use when scheduling hadoop jobs. Can be \"fair\" or\n  \"capacity\". Defaults to \"fair\".\n\nstreaming_jar\n  Path to your streaming jar. Must be specified to run streaming jobs.\n\nversion\n  Version of hadoop used in your cluster. Can be \"cdh3\", \"cdh4\", or\n  \"apache1\". Defaults to \"cdh4\".\n\n\n[hdfs]\n------\n\nParameters controlling the use of snakebite to speed up hdfs queries.\n\nclient\n  Client to use for most hadoop commands. Options are \"snakebite\",\n  \"snakebite_with_hadoopcli_fallback\", \"webhdfs\" and \"hadoopcli\". Snakebite is\n  much faster, so use of it is encouraged. webhdfs is fast and works with\n  Python 3 as well, but has not been used that much in the wild.\n  Both snakebite and webhdfs require you to install them separately on\n  the machine. Defaults to \"hadoopcli\".\n\nclient_version\n  Optionally specifies hadoop client version for snakebite.\n\neffective_user\n  Optionally specifies the effective user for snakebite.\n\nnamenode_host\n  The hostname of the namenode. Needed for snakebite if\n  snakebite_autoconfig is not set.\n\nnamenode_port\n  The port used by snakebite on the namenode. Needed for snakebite if\n  snakebite_autoconfig is not set.\n\nsnakebite_autoconfig\n  If true, attempts to automatically detect the host and port of the\n  namenode for snakebite queries. Defaults to false.\n\ntmp_dir\n  Path to where Luigi will put temporary files on hdfs\n\n\n[hive]\n------\n\nParameters controlling hive tasks\n\ncommand\n  Name of the command used to run hive on the command line. 
Defaults to\n  \"hive\".\n\nhiverc_location\n  Optional path to hive rc file.\n\nmetastore_host\n  Hostname for metastore.\n\nmetastore_port\n  Port for hive to connect to metastore host.\n\nrelease\n  If set to \"apache\", uses a hive client that better handles apache\n  hive output. All other values use the standard client. Defaults to\n  \"cdh4\".\n\n\n[kubernetes]\n------------\n\nParameters controlling Kubernetes Job Tasks\n\nauth_method\n  Authorization method to access the cluster.\n  Options are \"kubeconfig_\" or \"service-account_\"\n\nkubeconfig_path\n  Path to kubeconfig file, for cluster authentication.\n  It defaults to ``~/.kube/config``, which is the default location when\n  using minikube_.\n  When auth_method is \"service-account\" this property is ignored.\n\nmax_retrials\n  Maximum number of retrials in case of job failure.\n\n.. _service-account: http://kubernetes.io/docs/user-guide/service-accounts\n.. _kubeconfig: http://kubernetes.io/docs/user-guide/kubeconfig-file\n.. _minikube: http://kubernetes.io/docs/getting-started-guides/minikube\n\n\n[mysql]\n-------\n\nParameters controlling use of MySQL targets\n\nmarker_table\n  Table in which to store status of table updates. This table will be\n  created if it doesn't already exist. Defaults to \"table_updates\".\n\n\n[postgres]\n----------\n\nParameters controlling the use of Postgres targets\n\nlocal_tmp_dir\n  Directory in which to temporarily store data before writing to\n  postgres. Uses system default if not specified.\n\nmarker_table\n  Table in which to store status of table updates. This table will be\n  created if it doesn't already exist. 
Defaults to \"table_updates\".\n\n\n[prometheus]\n------------\n\nuse_task_family_in_labels\n  Should task family be used as a prometheus bucket label.\n  Default value is true.\n\ntask_parameters_to_use_in_labels\n  List of task arguments' names used as additional prometheus bucket labels.\n  Passed in a form of a json list.\n\n\n[redshift]\n----------\n\nParameters controlling the use of Redshift targets\n\nmarker_table\n  Table in which to store status of table updates. This table will be\n  created if it doesn't already exist. Defaults to \"table_updates\".\n\n.. _resources-config:\n\n[resources]\n-----------\n\nThis section can contain arbitrary keys. Each of these specifies the\namount of a global resource that the scheduler can allow workers to use.\nThe scheduler will prevent running jobs with resources specified from\nexceeding the counts in this section. Unspecified resources are assumed\nto have limit 1. Example resources section for a configuration with 2\nhive resources and 1 mysql resource:\n\n.. code:: ini\n\n  [resources]\n  hive=2\n  mysql=1\n\nNote that it was not necessary to specify the 1 for mysql here, but it\nis good practice to do so when you have a fixed set of resources.\n\n.. _retcode-config:\n\n[retcode]\n---------\n\nConfigure return codes for the Luigi binary. In the case of multiple return\ncodes that could apply, for example a failing task and missing data, the\n*numerically greatest* return code is returned.\n\nWe recommend that you copy this set of exit codes to your ``luigi.cfg`` file:\n\n.. code:: ini\n\n  [retcode]\n  # The following return codes are the recommended exit codes for Luigi\n  # They are in increasing level of severity (for most applications)\n  already_running=10\n  missing_data=20\n  not_run=25\n  task_failed=30\n  scheduling_error=35\n  unhandled_exception=40\n\nalready_running\n  This can happen in two different cases. Either the local lock file was taken\n  at the time the invocation starts up. 
Or, the central scheduler has reported\n  that some tasks could not have been run, because other workers are already\n  running the tasks.\nmissing_data\n  For when an :py:class:`~luigi.task.ExternalTask` is not complete, and this\n  caused the worker to give up.  As an alternative to fiddling with this, see\n  the [worker] keep_alive option.\nnot_run\n  For when a task is not granted run permission by the scheduler. Typically\n  because of lack of resources, because the task has been already run by\n  another worker or because the attempted task is in DISABLED state.\n  Connectivity issues with the central scheduler might also cause this.\n  This does not include the cases for which a run is not allowed due to missing\n  dependencies (missing_data) or due to the fact that another worker is currently\n  running the task (already_running).\ntask_failed\n  For signaling that there were tasks last known to have failed. Typically because\n  some exception has been raised.\nscheduling_error\n  For when a task's ``complete()`` or ``requires()`` method fails with an\n  exception, or when the limit number of tasks is reached.\nunhandled_exception\n  For internal Luigi errors.  Defaults to 4, since this type of error\n  probably will not recover over time.\n\nIf you customize return codes, prefer to set them in range 128 to 255 to avoid\nconflicts. Return codes in range 0 to 127 are reserved for possible future use\nby Luigi contributors.\n\n[scalding]\n----------\n\nParameters controlling running of scalding jobs\n\nscala_home\n  Home directory for scala on your machine. Defaults to either\n  SCALA_HOME or /usr/share/scala if SCALA_HOME is unset.\n\nscalding_home\n  Home directory for scalding on your machine. Defaults to either\n  SCALDING_HOME or /usr/share/scalding if SCALDING_HOME is unset.\n\nscalding_provided\n  Provided directory for scalding on your machine. 
Defaults to either\n  SCALDING_HOME/provided or /usr/share/scalding/provided\n\nscalding_libjars\n  Libjars directory for scalding on your machine. Defaults to either\n  SCALDING_HOME/libjars or /usr/share/scalding/libjars\n\n\n.. _scheduler-config:\n\n[scheduler]\n-----------\n\nParameters controlling scheduler behavior\n\nbatch_emails\n  Whether to send batch e-mails for failures and disables rather than\n  sending immediate disable e-mails and just relying on workers to send\n  immediate batch e-mails.\n  Defaults to false.\n\ndisable_hard_timeout\n  Hard time limit after which tasks will be disabled by the server if\n  they fail again, in seconds. It will disable the task if it fails\n  **again** after this amount of time. E.g. if this was set to 600\n  (i.e. 10 minutes), and the task first failed at 10:00am, the task would\n  be disabled if it failed again any time after 10:10am. Note: This setting\n  does not consider the values of the ``retry_count`` or\n  ``disable_window`` settings.\n\nretry_count\n  Number of times a task can fail within ``disable_window`` before\n  the scheduler will automatically disable it. If not set, the scheduler\n  will not automatically disable jobs.\n\ndisable_persist\n  Number of seconds for which an automatic scheduler disable lasts.\n  Defaults to 86400 (1 day).\n\ndisable_window\n  Number of seconds during which ``retry_count`` failures must\n  occur in order for an automatic disable by the scheduler. The\n  scheduler forgets about disables that have occurred longer ago than\n  this amount of time. Defaults to 3600 (1 hour).\n\nmax_shown_tasks\n  .. versionadded:: 1.0.20\n\n  The maximum number of tasks returned in a task_list api call. This\n  will restrict the number of tasks shown in task lists in the\n  visualiser. Small values can alleviate frozen browsers when there are\n  too many done tasks. This defaults to 100000 (one hundred thousand).\n\nmax_graph_nodes\n  .. 
versionadded:: 2.0.0\n\n  The maximum number of nodes returned by a dep_graph or\n  inverse_dep_graph api call. Small values can greatly speed up graph\n  display in the visualiser by limiting the number of nodes shown. Some\n  of the nodes that are not sent to the visualiser will still show up as\n  dependencies of nodes that were sent. These nodes are given TRUNCATED\n  status.\n\nrecord_task_history\n  If true, stores task history in a database. Defaults to false.\n\nremove_delay\n  Number of seconds to wait before removing a task that has no\n  stakeholders. Defaults to 600 (10 minutes).\n\nretry_delay\n  Number of seconds to wait after a task failure to mark it pending\n  again. Defaults to 900 (15 minutes).\n\nstate_path\n  Path in which to store the Luigi scheduler's state. When the scheduler\n  is shut down, its state is stored in this path. The scheduler must be\n  shut down cleanly for this to work, usually with a kill command. If\n  the kill command includes the -9 flag, the scheduler will not be able\n  to save its state. When the scheduler is started, it will load the\n  state from this path if it exists. This will restore all scheduled\n  jobs and other state from when the scheduler last shut down.\n\n  Sometimes this path must be deleted when restarting the scheduler\n  after upgrading Luigi, as old state files can become incompatible\n  with the new scheduler. When this happens, all workers should be\n  restarted after the scheduler both to become compatible with the\n  updated code and to reschedule the jobs that the scheduler has now\n  forgotten about.\n\n  This defaults to /var/lib/luigi-server/state.pickle\n\nworker_disconnect_delay\n  Number of seconds to wait after a worker has stopped pinging the\n  scheduler before removing it and marking all of its running tasks as\n  failed. 
Defaults to 60.\n\npause_enabled\n  If false, disables pause/unpause operations and hides the pause toggle from\n  the visualiser.\n\nsend_messages\n  When true, the scheduler is allowed to send messages to running tasks and\n  the central scheduler provides a simple prompt per task to send messages.\n  Defaults to true.\n\nmetrics_collector\n  Optional setting allowing Luigi to use a contribution to collect metrics\n  about the pipeline to a third-party. By default this uses the default metric\n  collector that acts as a shell and does nothing. The currently available\n  options are \"datadog\", \"prometheus\" and \"custom\". If it's custom the\n  'metrics_custom_import' needs to be set.\n\nmetrics_custom_import\n  Optional setting allowing Luigi to import a custom subclass of MetricsCollector\n  at runtime. The string should be formatted like \"module.sub_module.ClassName\".\n\n\n[sendgrid]\n----------\n\nThese parameters control sending error e-mails through SendGrid.\n\napikey\n  API key of the SendGrid account.\n\n\n[smtp]\n------\n\nThese parameters control the smtp server setup.\n\nhost\n  Hostname for sending mail through smtp. Defaults to localhost.\n\nlocal_hostname\n  If specified, overrides the FQDN of localhost in the HELO/EHLO\n  command.\n\nno_tls\n  If true, connects to smtp without TLS. Defaults to false.\n\npassword\n  Password to log in to your smtp server. Must be specified for\n  username to have an effect.\n\nport\n  Port number for smtp on smtp_host. Defaults to 0.\n\nssl\n  If true, connects to smtp through SSL. Defaults to false.\n\ntimeout\n  Sets the number of seconds after which smtp attempts should time out.\n  Defaults to 10.\n\nusername\n  Username to log in to your smtp server, if necessary.\n\n\n[spark]\n-------\n\nParameters controlling the default execution of :py:class:`~luigi.contrib.spark.SparkSubmitTask` and :py:class:`~luigi.contrib.spark.PySparkTask`:\n\n.. 
deprecated:: 1.1.1\n   :py:class:`~luigi.contrib.spark.SparkJob`, :py:class:`~luigi.contrib.spark.Spark1xJob` and :py:class:`~luigi.contrib.spark.PySpark1xJob`\n    are deprecated. Please use :py:class:`~luigi.contrib.spark.SparkSubmitTask` or :py:class:`~luigi.contrib.spark.PySparkTask`.\n\nspark_submit\n  Command to run in order to submit spark jobs. Default: ``\"spark-submit\"``\n\nmaster\n  Master url to use for ``spark_submit``. Example: local[*], spark://masterhost:7077. Default: Spark default (Prior to 1.1.1: yarn-client)\n\ndeploy_mode\n    Whether to launch the driver programs locally (\"client\") or on one of the worker machines inside the cluster (\"cluster\"). Default: Spark default\n\njars\n    Comma-separated list of local jars to include on the driver and executor classpaths. Default: Spark default\n\npackages\n    Comma-separated list of packages to link to on the driver and executors\n\npy_files\n    Comma-separated list of .zip, .egg, or .py files to place on the ``PYTHONPATH`` for Python apps. Default: Spark default\n\nfiles\n    Comma-separated list of files to be placed in the working directory of each executor. Default: Spark default\n\nconf:\n    Arbitrary Spark configuration property in the form Prop=Value|Prop2=Value2. Default: Spark default\n\nproperties_file\n    Path to a file from which to load extra properties. Default: Spark default\n\ndriver_memory\n    Memory for driver (e.g. 1000M, 2G). Default: Spark default\n\ndriver_java_options\n    Extra Java options to pass to the driver. Default: Spark default\n\ndriver_library_path\n    Extra library path entries to pass to the driver. Default: Spark default\n\ndriver_class_path\n    Extra class path entries to pass to the driver. Default: Spark default\n\nexecutor_memory\n    Memory per executor (e.g. 1000M, 2G). Default: Spark default\n\n*Configuration for Spark submit jobs on Spark standalone with cluster deploy mode only:*\n\ndriver_cores\n    Cores for driver. 
Default: Spark default\n\nsupervise\n    If given, restarts the driver on failure. Default: Spark default\n\n*Configuration for Spark submit jobs on Spark standalone and Mesos only:*\n\ntotal_executor_cores\n    Total cores for all executors. Default: Spark default\n\n*Configuration for Spark submit jobs on YARN only:*\n\nexecutor_cores\n    Number of cores per executor. Default: Spark default\n\nqueue\n    The YARN queue to submit to. Default: Spark default\n\nnum_executors\n    Number of executors to launch. Default: Spark default\n\narchives\n    Comma separated list of archives to be extracted into the working directory of each executor. Default: Spark default\n\nhadoop_conf_dir\n  Location of the hadoop conf dir. Sets HADOOP_CONF_DIR environment variable\n  when running spark. Example: /etc/hadoop/conf\n\n*Extra configuration for PySparkTask jobs:*\n\npy_packages\n    Comma-separated list of local packages (in your python path) to be distributed to the cluster.\n\n*Parameters controlling the execution of SparkJob jobs (deprecated):*\n\n\n[task_history]\n--------------\n\nParameters controlling storage of task history in a database\n\ndb_connection\n  Connection string for connecting to the task history db using\n  sqlalchemy.\n\n\n[execution_summary]\n-------------------\n\nParameters controlling execution summary of a worker\n\nsummary_length\n  Maximum number of tasks to show in an execution summary.  If the value is 0,\n  then all tasks will be displayed.  Default value is 5.\n\n\n[webhdfs]\n---------\n\nport\n  The port to use for webhdfs. The normal namenode port is probably on a\n  different port from this one.\n\nuser\n  Perform file system operations as the specified user instead of $USER.  Since\n  this parameter is not honored by any of the other hdfs clients, you should\n  think twice before setting this parameter.\n\nclient_type\n  The type of client to use. Default is the \"insecure\" client that requires no\n  authentication. 
The other option is the \"kerberos\" client that uses kerberos\n  authentication.\n\n[datadog]\n---------\n\napi_key\n  The api key found in the account settings of Datadog under the API\n  sections.\napp_key\n  The application key found in the account settings of Datadog under the API\n  sections.\ndefault_tags\n  Optional setting that adds the tag to all the metrics and events sent to\n  Datadog. Default value is \"application:luigi\".\nenvironment\n  Allows you to tweak multiple environments to differentiate between production,\n  staging or development metrics within Datadog. Default value is \"development\".\nstatsd_host\n  The host that has the statsd instance to allow Datadog to send statsd metrics. Default value is \"localhost\".\nstatsd_port\n  The port on the host that allows connection to the statsd host. Default value is 8125.\nmetric_namespace\n  Optional prefix to add to the beginning of every metric sent to Datadog.\n  Default value is \"luigi\".\n\n\nPer Task Retry-Policy\n---------------------\n\nLuigi also supports defining ``retry_policy`` per task.\n\n.. code-block:: python\n\n    class GenerateWordsFromHdfs(luigi.Task):\n\n       retry_count = 2\n\n        ...\n\n    class GenerateWordsFromRDBM(luigi.Task):\n\n       retry_count = 5\n\n        ...\n\n    class CountLetters(luigi.Task):\n\n        def requires(self):\n            return [GenerateWordsFromHdfs()]\n\n        def run():\n            yield GenerateWordsFromRDBM()\n\n        ...\n\nIf none of the retry-policy fields is defined per task, the field values will be the **default** values defined in the luigi config file.\n\nTo make luigi stick to the given retry-policy, be sure you run the luigi worker with the ``keep_alive`` config. Please check the ``keep_alive`` config in the :ref:`worker-config` section.\n\nRetry-Policy Fields\n-------------------\n\nThe fields below are in retry-policy and they can be defined per task.\n\n* ``retry_count``\n* ``disable_hard_timeout``\n* ``disable_window``\n"
  },
  {
    "path": "doc/design_and_limitations.rst",
    "content": "Design and limitations\n----------------------\n\nLuigi is the successor to a couple of attempts that we weren't fully happy with.\nWe learned a lot from our mistakes and some design decisions include:\n\n-  Straightforward command-line integration.\n-  As little boilerplate as possible.\n-  Focus on job scheduling and dependency resolution, not a particular platform.\n   In particular, this means no limitation to Hadoop.\n   Though Hadoop/HDFS support is built-in and is easy to use,\n   this is just one of many types of things you can run.\n-  A file system abstraction where code doesn't have to care about where files are located.\n-  Atomic file system operations through this abstraction.\n   If a task crashes it won't lead to a broken state.\n-  The dependencies are decentralized.\n   No big config file in XML.\n   Each task just specifies which inputs it needs and cross-module dependencies are trivial.\n-  A web server that renders the dependency graph and does locking, etc for free.\n-  Trivial to extend with new file systems, file formats, and job types.\n   You can easily write jobs that inserts a Tokyo Cabinet into Cassandra.\n   Adding support for new systems is generally not very hard.\n   (Feel free to send us a patch when you're done!)\n-  Date algebra included.\n-  Lots of unit tests of the most basic stuff.\n\nIt wouldn't be fair not to mention some limitations with the current design:\n\n-  Its focus is on batch processing so\n   it's probably less useful for near real-time pipelines or continuously running processes.\n-  The assumption is that each task is a sizable chunk of work.\n   While you can probably schedule a few thousand jobs,\n   it's not meant to scale beyond tens of thousands.\n-  Luigi does not support distribution of execution.\n   When you have workers running thousands of jobs daily, this starts to matter,\n   because the worker nodes get overloaded.\n   There are some ways to mitigate this (trigger from many nodes, 
use resources),\n   but none of them are ideal.\n-  Luigi does not come with built-in triggering, and you still need to rely on something like\n   crontab to trigger workflows periodically.\n\nAlso, it should be mentioned that Luigi is named after the world's second most famous plumber.\n"
  },
  {
    "path": "doc/example_top_artists.rst",
    "content": "Example – Top Artists\n---------------------\n\nThis is a very simplified case of something we do at Spotify a lot.\nAll user actions are logged to Google Cloud Storage (previously HDFS) where\nwe run a bunch of processing jobs to transform the data. The processing code itself is implemented\nin a scalable data processing framework, such as Scio, Scalding, or Spark, but the jobs\nare orchestrated with Luigi.\nAt some point we might end up with\na smaller data set that we can bulk ingest into Cassandra, Postgres, or\nother storage suitable for serving or exploration.\n\nFor the purpose of this exercise, we want to aggregate all streams,\nfind the top 10 artists and then put the results into Postgres.\n\nThis example is also available in\n`examples/top_artists.py <https://github.com/spotify/luigi/blob/master/examples/top_artists.py>`_.\n\nStep 1 - Aggregate Artist Streams\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n.. code:: python\n\n    class AggregateArtists(luigi.Task):\n        date_interval = luigi.DateIntervalParameter()\n\n        def output(self):\n            return luigi.LocalTarget(\"data/artist_streams_%s.tsv\" % self.date_interval)\n\n        def requires(self):\n            return [Streams(date) for date in self.date_interval]\n\n        def run(self):\n            artist_count = defaultdict(int)\n\n            for input in self.input():\n                with input.open('r') as in_file:\n                    for line in in_file:\n                        timestamp, artist, track = line.strip().split()\n                        artist_count[artist] += 1\n\n            with self.output().open('w') as out_file:\n                for artist, count in artist_count.iteritems():\n                    print(artist, count, file=out_file)\n\nNote that this is just a portion of the file ``examples/top_artists.py``.\nIn particular, ``Streams`` is defined as a :class:`~luigi.task.Task`,\nacting as a dependency for ``AggregateArtists``.\nIn addition, 
``luigi.run()`` is called if the script is executed directly,\nallowing it to be run from the command line.\n\nThere are several pieces of this snippet that deserve more explanation.\n\n-  Any :class:`~luigi.task.Task` may be customized by instantiating one\n   or more :class:`~luigi.parameter.Parameter` objects on the class level.\n-  The :func:`~luigi.task.Task.output` method tells Luigi where the result\n   of running the task will end up. The path can be some function of the\n   parameters.\n-  The :func:`~luigi.task.Task.requires` tasks specifies other tasks that\n   we need to perform this task. In this case it's an external dump named\n   *Streams* which takes the date as the argument.\n-  For plain Tasks, the :func:`~luigi.task.Task.run` method implements the\n   task. This could be anything, including calling subprocesses, performing\n   long running number crunching, etc. For some subclasses of\n   :class:`~luigi.task.Task` you don't have to implement the ``run``\n   method. For instance, for the :class:`~luigi.contrib.hadoop.JobTask`\n   subclass you implement a *mapper* and *reducer* instead.\n-  :class:`~luigi.LocalTarget` is a built in class that makes it\n   easy to read/write from/to the local filesystem. It also makes all file operations\n   atomic, which is nice in case your script crashes for any reason.\n\nRunning this Locally\n~~~~~~~~~~~~~~~~~~~~\n\nTry running this using eg.\n\n.. code-block:: console\n\n    $ cd examples\n    $ luigi --module top_artists AggregateArtists --local-scheduler --date-interval 2012-06\n\nNote that  *top_artists* needs to be in your PYTHONPATH, or else this can produce an error (*ImportError: No module named top_artists*). Add the current working directory to the command PYTHONPATH with:\n\n.. code-block:: console\n\n    $ PYTHONPATH='.' 
luigi --module top_artists AggregateArtists --local-scheduler --date-interval 2012-06\n\nYou can also try to view the manual using ``--help`` which will give you an\noverview of the options.\n\nRunning the command again will do nothing because the output file is\nalready created.\nIn that sense, any task in Luigi is *idempotent*\nbecause running it many times gives the same outcome as running it once.\nNote that unlike Makefile, the output will not be recreated when any of\nthe input files is modified.\nYou need to delete the output file\nmanually.\n\nThe ``--local-scheduler`` flag tells Luigi not to connect to a scheduler\nserver. This is not recommended for other purpose than just testing\nthings.\n\nStep 1b - Aggregate artists with Spark\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nWhile Luigi can process data inline, it is normally used to orchestrate external programs that\nperform the actual processing. In this example, we will demonstrate how top artists instead can be\nread from HDFS and calculated with Spark, orchestrated by Luigi.\n\n.. code:: python\n\n    class AggregateArtistsSpark(luigi.contrib.spark.SparkSubmitTask):\n        date_interval = luigi.DateIntervalParameter()\n\n        app = 'top_artists_spark.py'\n        master = 'local[*]'\n\n        def output(self):\n            return luigi.contrib.hdfs.HdfsTarget(\"data/artist_streams_%s.tsv\" % self.date_interval)\n\n        def requires(self):\n            return [StreamsHdfs(date) for date in self.date_interval]\n\n        def app_options(self):\n            # :func:`~luigi.task.Task.input` returns the targets produced by the tasks in\n            # `~luigi.task.Task.requires`.\n            return [','.join([p.path for p in self.input()]),\n                    self.output().path]\n\n\n:class:`luigi.contrib.hadoop.SparkSubmitTask` doesn't require you to implement a\n:func:`~luigi.task.Task.run` method. 
Instead, you specify the command line parameters to send\nto ``spark-submit``, as well as any other configuration specific to Spark.\n\nPython code for the Spark job is found below.\n\n.. code:: python\n\n    import operator\n    import sys\n    from pyspark.sql import SparkSession\n\n\n    def main(argv):\n        input_paths = argv[1].split(',')\n        output_path = argv[2]\n\n        spark = SparkSession.builder.getOrCreate()\n\n        streams = spark.read.option('sep', '\\t').csv(input_paths[0])\n        for stream_path in input_paths[1:]:\n            streams.union(spark.read.option('sep', '\\t').csv(stream_path))\n\n        # The second field is the artist\n        counts = streams \\\n            .map(lambda row: (row[1], 1)) \\\n            .reduceByKey(operator.add)\n\n        counts.write.option('sep', '\\t').csv(output_path)\n\n\n    if __name__ == '__main__':\n        sys.exit(main(sys.argv))\n\n\nIn a typical deployment scenario, the Luigi orchestration definition above as well as the\nPyspark processing code would be packaged into a deployment package, such as a container image. The\nprocessing code does not have to be implemented in Python, any program can be packaged in the\nimage and run from Luigi.\n\n\nStep 2 – Find the Top Artists\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nAt this point, we've counted the number of streams for each artists,\nfor the full time period.\nWe are left with a large file that contains\nmappings of artist -> count data, and we want to find the top 10 artists.\nSince we only have a few hundred thousand artists, and\ncalculating artists is nontrivial to parallelize,\nwe choose to do this not as a Hadoop job, but just as a plain old for-loop in Python.\n\n.. 
code:: python\n\n    class Top10Artists(luigi.Task):\n        date_interval = luigi.DateIntervalParameter()\n        use_hadoop = luigi.BoolParameter()\n\n        def requires(self):\n            if self.use_hadoop:\n                return AggregateArtistsSpark(self.date_interval)\n            else:\n                return AggregateArtists(self.date_interval)\n\n        def output(self):\n            return luigi.LocalTarget(\"data/top_artists_%s.tsv\" % self.date_interval)\n\n        def run(self):\n            top_10 = nlargest(10, self._input_iterator())\n            with self.output().open('w') as out_file:\n                for streams, artist in top_10:\n                    print(self.date_interval.date_a, self.date_interval.date_b, artist, streams, file=out_file)\n\n        def _input_iterator(self):\n            with self.input().open('r') as in_file:\n                for line in in_file:\n                    artist, streams = line.strip().split()\n                    yield int(streams), int(artist)\n\nThe most interesting thing here is that this task (*Top10Artists*)\ndefines a dependency on the previous task (*AggregateArtists*).\nThis means that if the output of *AggregateArtists* does not exist,\nthe task will run before *Top10Artists*.\n\n.. code-block:: console\n\n    $ luigi --module examples.top_artists Top10Artists --local-scheduler --date-interval 2012-07\n\nThis will run both tasks.\n\nStep 3 - Insert into Postgres\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nThis mainly serves as an example of a specific subclass *Task* that\ndoesn't require any code to be written.\nIt's also an example of how you can define task templates that\nyou can reuse for a lot of different tasks.\n\n.. 
code:: python\n\n    class ArtistToplistToDatabase(luigi.contrib.postgres.CopyToTable):\n        date_interval = luigi.DateIntervalParameter()\n        use_hadoop = luigi.BoolParameter()\n\n        host = \"localhost\"\n        database = \"toplists\"\n        user = \"luigi\"\n        password = \"abc123\"  # ;)\n        table = \"top10\"\n\n        columns = [(\"date_from\", \"DATE\"),\n                   (\"date_to\", \"DATE\"),\n                   (\"artist\", \"TEXT\"),\n                   (\"streams\", \"INT\")]\n\n        def requires(self):\n            return Top10Artists(self.date_interval, self.use_hadoop)\n\nJust like previously, this defines a recursive dependency on the\nprevious task. If you try to build the task, that will also trigger\nbuilding all its upstream dependencies.\n\nUsing the Central Planner\n~~~~~~~~~~~~~~~~~~~~~~~~~\n\nThe ``--local-scheduler`` flag tells Luigi not to connect to a central scheduler.\nThis is recommended in order to get started and/or for development purposes.\nAt the point where you start putting things in production\nwe strongly recommend running the central scheduler server.\nIn addition to providing locking\nso that the same task is not run by multiple processes at the same time,\nthis server also provides a pretty nice visualization of your current work flow.\n\nIf you drop the ``--local-scheduler`` flag,\nyour script will try to connect to the central planner,\nby default at localhost port 8082.\nIf you run\n\n.. code-block:: console\n\n    $ luigid\n\nin the background and then run your task without the ``--local-scheduler`` flag,\nthen your script will now schedule through a centralized server.\nYou need `Tornado <http://www.tornadoweb.org/>`__ for this to work.\n\nLaunching http://localhost:8082 should show something like this:\n\n.. figure:: web_server.png\n   :alt: Web server screenshot\n\nWeb server screenshot\n\nLooking at the dependency graph\nfor any of the tasks yields something like this:\n\n.. figure:: aggregate_artists.png\n   :alt: Aggregate artists screenshot\n\nAggregate artists screenshot\n\nIn production, you'll want to run the centralized scheduler.\nSee: :doc:`central_scheduler` for more information.\n"
  },
  {
    "path": "doc/execution_model.rst",
    "content": "Execution Model\n---------------\n\nLuigi has a quite simple model for execution and triggering.\n\nWorkers and task execution\n~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nThe most important aspect is that *no execution is transferred*.\nWhen you run a Luigi workflow,\nthe worker schedules all tasks, and\nalso executes the tasks within the process.\n\n    .. figure:: execution_model.png\n       :alt: Execution model\n\nThe benefit of this scheme is that\nit's super easy to debug since all execution takes place in the process.\nIt also makes deployment a non-event.\nDuring development,\nyou typically run the Luigi workflow from the command line,\nwhereas when you deploy it,\nyou can trigger it using crontab or any other scheduler.\n\nThe downside is that Luigi doesn't give you scalability for free.\nIn practice this is not a problem until you start running thousands of tasks.\n\nIsn't the point of Luigi to automate and schedule these workflows?\nTo some extent.\nLuigi helps you *encode the dependencies* of tasks and build up chains.\nFurthermore, Luigi's scheduler makes sure that there's a centralized view of the dependency graph and\nthat the same job will not be executed by multiple workers simultaneously.\n\nScheduler\n~~~~~~~~~\n\nA client only starts the ``run()`` method of a task when the single-threaded\ncentral scheduler has permitted it. Since the number of tasks is usually very\nsmall (in comparison with the petabytes of data one task is processing), we\ncan afford the convenience of a simple centralised server.\n\n.. 
figure:: https://tarrasch.github.io/luigid-basics-jun-2015/img/50.gif\n   :alt: Scheduling gif\n\nThe gif is from `this presentation\n<https://tarrasch.github.io/luigid-basics-jun-2015/>`__, which is about the\nclient and server interaction.\n\nTriggering tasks\n~~~~~~~~~~~~~~~~\n\nLuigi does not include its own triggering, so you have to rely on an external scheduler\nsuch as crontab to actually trigger the workflows.\n\nIn practice, it's not a big hurdle because Luigi avoids all the mess typically caused by it.\nScheduling a complex workflow is fairly trivial using eg. crontab.\n\nIn the future, Luigi might implement its own triggering.\nThe dependency on crontab (or any external triggering mechanism) is a bit awkward and it would be nice to avoid.\n\nTrigger example\n^^^^^^^^^^^^^^^\n\nFor instance, if you have an external data dump that arrives every day and that your workflow depends on it,\nyou write a workflow that depends on this data dump.\nCrontab can then trigger this workflow *every minute* to check if the data has arrived.\nIf it has, it will run the full dependency graph.\n\n.. 
code:: python\n\n    # my_tasks.py\n\n    class DataDump(luigi.ExternalTask):\n        date = luigi.DateParameter()\n        def output(self): return luigi.contrib.hdfs.HdfsTarget(self.date.strftime('/var/log/dump/%Y-%m-%d.txt'))\n\n    class AggregationTask(luigi.Task):\n        date = luigi.DateParameter()\n        window = luigi.IntParameter()\n        def requires(self): return [DataDump(self.date - datetime.timedelta(i)) for i in xrange(self.window)]\n        def run(self): run_some_cool_stuff(self.input())\n        def output(self): return luigi.contrib.hdfs.HdfsTarget('/aggregated-%s-%d' % (self.date, self.window))\n\n    class RunAll(luigi.Task):\n        ''' Dummy task that triggers execution of other tasks'''\n        def requires(self):\n            for window in [3, 7, 14]:\n                for d in xrange(10): # guarantee that aggregations were run for the past 10 days\n                   yield AggregationTask(datetime.date.today() - datetime.timedelta(d), window)\n\nIn your cronline you would then have something like\n\n.. code:: console\n\n    30 0 * * * my-user luigi RunAll --module my_tasks\n\n\nYou can trigger this as much as you want from crontab, and\neven across multiple machines, because\nthe central scheduler will make sure at most one of each ``AggregationTask`` task is run simultaneously.\nNote that this might actually mean multiple tasks can be run because\nthere are instances with different parameters, and\nthis can give you some form of parallelization\n(e.g. ``AggregationTask(2013-01-09)`` might run in parallel with ``AggregationTask(2013-01-08)``).\n\nOf course,\nsome Task types (e.g. ``HadoopJobTask``) can transfer execution to other places, but\nthis is up to each Task to define.\n"
  },
  {
    "path": "doc/index.rst",
    "content": ".. Luigi documentation master file, created by\n   sphinx-quickstart on Sat Feb  8 00:56:43 2014.\n   You can adapt this file completely to your liking, but it should at least\n   contain the root `toctree` directive.\n\n.. include:: ../README.rst\n\nTable of Contents\n-----------------\n\n.. toctree::\n   :maxdepth: 2\n\n   example_top_artists.rst\n   workflows.rst\n   tasks.rst\n   parameters.rst\n   running_luigi.rst\n   central_scheduler.rst\n   execution_model.rst\n   luigi_patterns.rst\n   configuration.rst\n   logging.rst\n   design_and_limitations.rst\n   mypy.rst\n\nAPI Reference\n-------------\n\n.. autosummary::\n   :toctree: api\n   :recursive:\n\n   luigi\n\n\nIndices and tables\n==================\n\n* :ref:`genindex`\n* :ref:`modindex`\n* :ref:`search`\n"
  },
  {
    "path": "doc/logging.rst",
    "content": "Configure logging\n-----------------\n\n\nConfig options:\n~~~~~~~~~~~~~~~\n\nSome config options for config [core] section\n\nlog_level\n    The default log level to use when no logging_conf_file is set. Must be\n    a valid name of a `Python log level\n    <https://docs.python.org/3/library/logging.html#logging-levels>`_.\n    Default is ``DEBUG``.\nlogging_conf_file\n      Location of the logging configuration file.\nno_configure_logging\n    If true, logging is not configured. Defaults to false.\n\n\nConfig section\n~~~~~~~~~~~~~~\n\nIf you're use TOML for configuration file, you can configure logging\nvia ``logging`` section in this file. See `example\n<https://github.com/spotify/luigi/blob/master/examples/config.toml>`_\nfor more details.\n\nLuigid CLI options:\n~~~~~~~~~~~~~~~~~~~\n\n``--background``\n    Run daemon in background mode. Disable logging setup\n    and set up log level to INFO for root logger.\n``--logdir``\n    set logging with INFO level and output in ``$logdir/luigi-server.log`` file\n\n\nWorker CLI options:\n~~~~~~~~~~~~~~~~~~~\n\n``--logging-conf-file``\n    Configuration file for logging.\n``--log-level``\n    Default log level.\n    Available values: NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL.\n    Default DEBUG. See `Python documentation\n    <https://docs.python.org/3/library/logging.html#logging-levels>`_\n    For information about levels difference.\n\n\nConfiguration options resolution order:\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n1. no_configure_logging option\n2. ``--background``\n3. ``--logdir``\n4. ``--logging-conf-file``\n5. logging_conf_file option\n6. ``logging`` section\n7. ``--log-level``\n8. log_level option\n"
  },
  {
    "path": "doc/luigi_patterns.rst",
    "content": "Luigi Patterns\n--------------\n\nCode Reuse\n~~~~~~~~~~\n\nOne nice thing about Luigi is that it's super easy to depend on tasks defined in other repos.\nIt's also trivial to have \"forks\" in the execution path,\nwhere the output of one task may become the input of many other tasks.\n\nCurrently, no semantics for \"intermediate\" output is supported,\nmeaning that all output will be persisted indefinitely.\nThe upside of that is that if you try to run X -> Y, and Y crashes,\nyou can resume with the previously built X.\nThe downside is that you will have a lot of intermediate results on your file system.\nA useful pattern is to put these files in a special directory and\nhave some kind of periodical garbage collection clean it up.\n\nTriggering Many Tasks\n~~~~~~~~~~~~~~~~~~~~~\n\nA convenient pattern is to have a dummy Task at the end of several\ndependency chains, so you can trigger a multitude of pipelines by\nspecifying just one task in command line, similarly to how e.g. `make <http://www.gnu.org/software/make/>`_\nworks.\n\n.. code:: python\n\n    class AllReports(luigi.WrapperTask):\n        date = luigi.DateParameter(default=datetime.date.today())\n        def requires(self):\n            yield SomeReport(self.date)\n            yield SomeOtherReport(self.date)\n            yield CropReport(self.date)\n            yield TPSReport(self.date)\n            yield FooBarBazReport(self.date)\n\nThis simple task will not do anything itself, but will invoke a bunch of\nother tasks. Per each invocation, Luigi will perform as many of the pending\njobs as possible (those which have all their dependencies present).\n\nYou'll need to use :class:`~luigi.task.WrapperTask` for this instead of the usual Task class, because this job will not produce any output of its own, and as such needs a way to indicate when it's complete. 
This class is used for tasks that only wrap other tasks and that by definition are done if all their requirements exist.\n\nTriggering recurring tasks\n~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nA common requirement is to have a daily report (or something else)\nproduced every night. Sometimes for various reasons tasks will keep\ncrashing or lacking their required dependencies for more than a day\nthough, which would lead to a missing deliverable for some date. Oops.\n\nTo ensure that the above AllReports task is eventually completed for\nevery day (value of date parameter), one could e.g. add a loop in\nrequires method to yield dependencies on the past few days preceding\nself.date. Then, so long as Luigi keeps being invoked, the backlog of\njobs would catch up nicely after fixing intermittent problems.\n\nLuigi actually comes with a reusable tool for achieving this, called\n:class:`~luigi.tools.range.RangeDailyBase` (resp. :class:`~luigi.tools.range.RangeHourlyBase`). Simply putting\n\n.. code-block:: console\n\n\tluigi --module all_reports RangeDailyBase --of AllReports --start 2015-01-01\n\nin your crontab will easily keep gaps from occurring from 2015-01-01\nonwards. NB - it will not always loop over everything from 2015-01-01\ntill current time though, but rather a maximum of 3 months ago by\ndefault - see :class:`~luigi.tools.range.RangeDailyBase` documentation for this and more knobs\nfor tweaking behavior. See also Monitoring below.\n\nEfficiently triggering recurring tasks\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nRangeDailyBase, described above, is named like that because a more\nefficient subclass exists, :class:`~luigi.tools.range.RangeDaily` (resp. :class:`~luigi.tools.range.RangeHourly`), tailored for\nhundreds of task classes scheduled concurrently with contiguousness\nrequirements spanning years (which would incur redundant completeness\nchecks and scheduler overload using the naive looping approach.) Usage:\n\n.. 
code-block:: console\n\n\tluigi --module all_reports RangeDaily --of AllReports --start 2015-01-01\n\nIt has the same knobs as RangeDailyBase, with some added requirements.\nNamely the task must implement an efficient bulk_complete method, or\nmust be writing output to file system Target with date parameter value\nconsistently represented in the file path.\n\nBackfilling tasks\n~~~~~~~~~~~~~~~~~\n\nAlso a common use case, sometimes you have tweaked existing recurring\ntask code and you want to schedule recomputation of it over an interval\nof dates for that or another reason. Most conveniently it is achieved\nwith the above described range tools, just with both start (inclusive)\nand stop (exclusive) parameters specified:\n\n.. code-block:: console\n\n\tluigi --module all_reports RangeDaily --of AllReportsV2 --start 2014-10-31 --stop 2014-12-25\n\nPropagating parameters with Range\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nSome tasks you want to recur may include additional parameters which need to be configured.\nThe Range classes provide a parameter which accepts a :class:`~luigi.parameter.DictParameter`\nand passes any parameters onwards for this purpose.\n\n.. code-block:: console\n\n\tluigi RangeDaily --of MyTask --start 2014-10-31 --of-params '{\"my_string_param\": \"123\", \"my_int_param\": 123}'\n\nAlternatively, you can specify parameters at the task family level (as described :ref:`here <Parameter-class-level-parameters>`),\nhowever these will not appear in the task name for the upstream Range task which\ncan have implications in how the scheduler and visualizer handle task instances.\n\n.. code-block:: console\n\n\tluigi RangeDaily --of MyTask --start 2014-10-31 --MyTask-my-param 123\n\n.. _batch_method:\n\nBatching multiple parameter values into a single run\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nSometimes it'll be faster to run multiple jobs together as a single\nbatch rather than running them each individually. 
When this is the case,\nyou can mark some parameters with a batch_method in their constructor\nto tell the worker how to combine multiple values. One common way to do\nthis is by simply running the maximum value. This is good for tasks that\noverwrite older data when a newer one runs. You accomplish this by\nsetting the batch_method to max, like so:\n\n.. code-block:: python\n\n    class A(luigi.Task):\n        date = luigi.DateParameter(batch_method=max)\n\nWhat's exciting about this is that if you send multiple As to the\nscheduler, it can combine them and return one. So if\n``A(date=2016-07-28)``, ``A(date=2016-07-29)`` and\n``A(date=2016-07-30)`` are all ready to run, you will start running\n``A(date=2016-07-30)``. While this is running, the scheduler will show\n``A(date=2016-07-28)``, ``A(date=2016-07-29)`` as batch running while\n``A(date=2016-07-30)`` is running. When ``A(date=2016-07-30)`` is done\nrunning and becomes FAILED or DONE, the other two tasks will be updated\nto the same status.\n\nIf you want to limit how big a batch can get, simply set max_batch_size.\nSo if you have\n\n.. code-block:: python\n\n    class A(luigi.Task):\n        date = luigi.DateParameter(batch_method=max)\n\n        max_batch_size = 10\n\nthen the scheduler will batch at most 10 jobs together. You probably do\nnot want to do this with the max batch method, but it can be helpful if\nyou use other methods. You can use any method that takes a list of\nparameter values and returns a single parameter value.\n\nIf you have two max batch parameters, you'll get the max values for both\nof them. If you have parameters that don't have a batch method, they'll\nbe aggregated separately. So if you have a class like\n\n.. 
code-block:: python\n\n    class A(luigi.Task):\n        p1 = luigi.IntParameter(batch_method=max)\n        p2 = luigi.IntParameter(batch_method=max)\n        p3 = luigi.IntParameter()\n\nand you create tasks ``A(p1=1, p2=2, p3=0)``, ``A(p1=2, p2=3, p3=0)``,\n``A(p1=3, p2=4, p3=1)``, you'll get them batched as\n``A(p1=2, p2=3, p3=0)`` and ``A(p1=3, p2=4, p3=1)``.\n\nNote that batched tasks do not take up :ref:`resources-config`, only the\ntask that ends up running will use resources. The scheduler only checks\nthat there are sufficient resources for each task individually before\nbatching them all together.\n\nTasks that regularly overwrite the same data source\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nIf you are overwriting the same data source with every run, you'll\nneed to ensure that two batches can't run at the same time. You can do\nthis pretty easily by setting batch_method to max and setting a unique\nresource:\n\n.. code-block:: python\n\n    class A(luigi.Task):\n        date = luigi.DateParameter(batch_method=max)\n\n        resources = {'overwrite_resource': 1}\n\nNow if you have multiple tasks such as ``A(date=2016-06-01)``,\n``A(date=2016-06-02)``, ``A(date=2016-06-03)``, the scheduler will just\ntell you to run the highest available one and mark the lower ones as\nbatch_running. 
Using a unique resource will prevent multiple tasks from\nwriting to the same location at the same time if a new one becomes\navailable while others are running.\n\nAvoiding concurrent writes to a single file\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nUpdating a single file from several tasks is almost always a bad idea, and you\nneed to be very confident that no other good solution exists before doing this.\nIf, however, you have no other option, then you will probably at least need to ensure that\nno two tasks try to write to the file _simultaneously_.\n\nBy turning 'resources' into a Python property, it can return a value dependent on\nthe task parameters or other dynamic attributes:\n\n.. code-block:: python\n\n    class A(luigi.Task):\n        ...\n\n        @property\n        def resources(self):\n            return { self.important_file_name: 1 }\n\nSince, by default, resources have a usage limit of 1, no two instances of Task A\nwill now run if they have the same `important_file_name` property.\n\nDecreasing resources of running tasks\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nAt scheduling time, the luigi scheduler needs to be aware of the maximum\nresource consumption a task might have once it runs. For some tasks, however,\nit can be beneficial to decrease the amount of consumed resources between two\nsteps within their run method (e.g. after some heavy computation). In this\ncase, a different task waiting for that particular resource can already be\nscheduled.\n\n.. 
code-block:: python\n\n    class A(luigi.Task):\n\n        # set maximum resources a priori\n        resources = {\"some_resource\": 3}\n\n        def run(self):\n            # do something\n            ...\n\n            # decrease consumption of \"some_resource\" by one\n            self.decrease_running_resources({\"some_resource\": 1})\n\n            # continue with reduced resources\n            ...\n\nMonitoring task pipelines\n~~~~~~~~~~~~~~~~~~~~~~~~~\n\nLuigi comes with some existing ways in :py:mod:`luigi.notifications` to receive\nnotifications whenever tasks crash. Email is the most common way.\n\nThe above mentioned range tools for recurring tasks not only implement\nreliable scheduling for you, but also emit events which you can use to\nset up delay monitoring. That way you can implement alerts for when\njobs are stuck for prolonged periods lacking input data or otherwise\nrequiring attention.\n\n.. _AtomicWrites:\n\nAtomic Writes Problem\n~~~~~~~~~~~~~~~~~~~~~\n\nA very common mistake made by luigi plumbers is to write data partially to the\nfinal destination, that is, not atomically. The problem arises because\ncompletion checks in luigi are exactly as naive as running\n:meth:`luigi.target.Target.exists`. And in many cases it just means to check if\na folder exists on disk. During the time we have partially written data, a task\ndepending on that output would think its input is complete. This can have\ndevastating effects, as in `the thanksgiving bug\n<http://tarrasch.github.io/luigi-budapest-bi-oct-2015/#/21>`__.\n\nThe concept can be illustrated by imagining that we deal with data stored on\nlocal disk and by running commands:\n\n.. 
code-block:: console\n\n    # This is the BAD way\n    $ mkdir /outputs/final_output\n    $ big-slow-calculation > /outputs/final_output/foo.data\n\nAs stated earlier, the problem is that only partial data exists for a duration,\nyet we consider the data to be :meth:`~luigi.task.Task.complete` because the\noutput folder already exists. Here is a robust version of this:\n\n.. code-block:: console\n\n    # This is the good way\n    $ mkdir /outputs/final_output-tmp-123456\n    $ big-slow-calculation > /outputs/final_output-tmp-123456/foo.data\n    $ mv --no-target-directory --no-clobber /outputs/final_output{-tmp-123456,}\n    $ [[ -d /outputs/final_output-tmp-123456 ]] && rm -r /outputs/final_output-tmp-123456\n\nIndeed, the good way is not as trivial. It involves coming up with a unique\ndirectory name and a pretty complex ``mv`` line. The reason ``mv`` needs all\nthose flags is that we don't want ``mv`` to move a directory into a potentially\nexisting directory. A directory could already exist in exceptional cases, for\nexample when central locking fails and the same task would somehow run twice at\nthe same time. Lastly, in the exceptional case where the file was never moved,\none might want to remove the temporary directory that never got used.\n\nNote that this was an example where the storage was on local disk. But for\nevery storage (hard disk file, hdfs file, database table, etc.) this procedure\nwill look different. But does every luigi user need to implement that complexity?\nNope, thankfully luigi developers are aware of this and luigi comes with many\nbuilt-in solutions. In case you're dealing with a file system\n(:class:`~luigi.target.FileSystemTarget`), you should consider using\n:meth:`~luigi.target.FileSystemTarget.temporary_path`. 
For other targets, you\nshould ensure that the way you're writing your final output directory is\natomic.\n\nSending messages to tasks\n~~~~~~~~~~~~~~~~~~~~~~~~~\n\nThe central scheduler is able to send messages to particular tasks. When a running task accepts\nmessages, it can access a `multiprocessing.Queue <https://docs.python.org/3/library/multiprocessing.html#pipes-and-queues>`__\nobject storing incoming messages. You can implement custom behavior to react and respond to\nmessages:\n\n.. code-block:: python\n\n    class Example(luigi.Task):\n\n        # common task setup\n        ...\n\n        # configure the task to accept all incoming messages\n        accepts_messages = True\n\n        def run(self):\n            # this example runs some loop and listens for the\n            # \"terminate\" message, and responds to all other messages\n            for _ in some_loop():\n                # check incoming messages\n                if not self.scheduler_messages.empty():\n                    msg = self.scheduler_messages.get()\n                    if msg.content == \"terminate\":\n                        break\n                    else:\n                        msg.respond(\"unknown message\")\n\n            # finalize\n            ...\n\nMessages can be sent right from the scheduler UI which also displays responses (if any). Note that\nthis feature is only available when the scheduler is configured to send messages (see the :ref:`scheduler-config` config), and the task is configured to accept them.\n\nGathering custom metrics from tasks' executions\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nThe central scheduler is able to gather custom metrics from tasks' executions with the help of\na custom metrics collector (see the :ref:`scheduler-config` config). To obtain custom metrics,\nyou need to implement:\n\n#. 
Custom metrics collector class inheriting from\n   :class:`~luigi.metrics.MetricsCollector` (or derived) and implementing the\n   :meth:`~luigi.metrics.MetricsCollector.handle_task_statistics`\n   method (the default one does nothing). This method will be called for each task\n   that has been executed, every time\n   :meth:`~luigi.worker.TaskStatusReporter.report_task_statistics` is called.\n   For instance, the following metrics collector adds monitoring of tasks' execution\n   time and memory usage:\n\n   .. code-block:: python\n\n       class MetricsCollector(PrometheusMetricsCollector):\n           def __init__(self, *args, **kwargs):\n               super().__init__(*args, **kwargs)\n               self.task_run_execution_time = Gauge(\n                   'luigi_task_run_execution_time_seconds',\n                   'luigi task run method execution time in seconds',\n                   self.labels,\n                   registry=self.registry\n               )\n               self.task_execution_memory = Gauge(\n                   'luigi_task_max_memory_megabytes',\n                   'luigi task run method max memory usage in megabytes',\n                   self.labels,\n                   registry=self.registry\n               )\n\n           def handle_task_statistics(self, task, statistics):\n               if \"elapsed\" in statistics:\n                   self.task_run_execution_time.labels(**self._generate_task_labels(task)).set(statistics[\"elapsed\"])\n               if \"memory\" in statistics:\n                   self.task_execution_memory.labels(**self._generate_task_labels(task)).set(statistics[\"memory\"])\n\n#. Custom task context manager (see the :ref:`worker-config` config),\n   which in its `__exit__` method would call the\n   :meth:`~luigi.worker.TaskStatusReporter.report_task_statistics` method with\n   the statistics dictionary. For instance, the following task context manager collects\n   task execution time and memory usage:\n\n   .. 
code-block:: python\n\n       class TaskContext:\n           def __init__(self, task_process):\n               self._task_process = task_process\n               self._start = None\n\n           def __enter__(self):\n               self._start = time.perf_counter()\n               return self\n\n           def __exit__(self, exc_type, exc_val, exc_tb):\n               assert self._start is not None\n               elapsed = time.perf_counter() - self._start\n               used_memory = max(\n                   resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss\n               )\n               logging.getLogger(\"luigi-interface\").info(\n                   f'Task {self._task_process.task}: time: {elapsed:.2f}s, memory: {used_memory / 1024:.2f}MB '\n               )\n               self._task_process.status_reporter.report_task_statistics({\"memory\": used_memory / 1024, \"elapsed\": elapsed})\n"
  },
  {
    "path": "doc/mypy.rst",
    "content": "Mypy plugin\n--------------\n\nMypy plugin provides type checking for ``luigi.Task`` using Mypy.\n\nRequires Python 3.8 or later.\n\nHow to use\n~~~~~~~~~~\n\nConfigure Mypy to use this plugin by adding the following to your ``mypy.ini`` file:\n\n.. code:: ini\n\n    [mypy]\n    plugins = luigi.mypy\n\nor by adding the following to your ``pyproject.toml`` file:\n\n.. code:: toml\n\n    [tool.mypy]\n    plugins = [\"luigi.mypy\"]\n\nThen, run Mypy as usual.\n\nExamples\n~~~~~~~~\n\nFor example, the following code is linted by Mypy:\n\n.. code:: python\n\n    import luigi\n\n\n    class MyTask(luigi.Task):\n        foo: int = luigi.IntParameter()\n        bar: str = luigi.Parameter()\n\n    MyTask(foo=1, bar='2')   # OK\n    MyTask(foo='1', bar='2')  # Error: Argument 1 to \"MyTask\" has incompatible type \"str\"; expected \"int\"\n"
  },
  {
    "path": "doc/parameters.rst",
    "content": "Parameters\n----------\n\nParameters is the Luigi equivalent of creating a constructor for each Task.\nLuigi requires you to declare these parameters by instantiating\n:class:`~luigi.parameter.Parameter` objects on the class scope:\n\n.. code:: python\n\n    class DailyReport(luigi.contrib.hadoop.JobTask):\n        date = luigi.DateParameter(default=datetime.date.today())\n        # ...\n\nBy doing this, Luigi can take care of all the boilerplate code that\nwould normally be needed in the constructor.\nInternally, the DailyReport object can now be constructed by running\n``DailyReport(datetime.date(2012, 5, 10))`` or just ``DailyReport()``.\nLuigi also creates a command line parser that automatically handles the\nconversion from strings to Python types.\nThis way you can invoke the job on the command line eg. by passing ``--date 2012-05-10``.\n\nThe parameters are all set to their values on the Task object instance,\ni.e.\n\n.. code:: python\n\n    d = DailyReport(datetime.date(2012, 5, 10))\n    print(d.date)\n\nwill return the same date that the object was constructed with.\nSame goes if you invoke Luigi on the command line.\n\n.. _Parameter-instance-caching:\n\nInstance caching\n^^^^^^^^^^^^^^^^\n\nTasks are uniquely identified by their class name and values of their\nparameters.\nIn fact, within the same worker, two tasks of the same class with\nparameters of the same values are not just equal, but the same instance:\n\n.. code:: python\n\n    >>> import luigi\n    >>> import datetime\n    >>> class DateTask(luigi.Task):\n    ...   
date = luigi.DateParameter()\n    ...\n    >>> a = datetime.date(2014, 1, 21)\n    >>> b = datetime.date(2014, 1, 21)\n    >>> a is b\n    False\n    >>> c = DateTask(date=a)\n    >>> d = DateTask(date=b)\n    >>> c\n    DateTask(date=2014-01-21)\n    >>> d\n    DateTask(date=2014-01-21)\n    >>> c is d\n    True\n\nInsignificant parameters\n^^^^^^^^^^^^^^^^^^^^^^^^\n\nIf a parameter is created with ``significant=False``,\nit is ignored as far as the Task signature is concerned.\nTasks created with only insignificant parameters differing have the same signature but\nare not the same instance:\n\n.. code:: python\n\n    >>> class DateTask2(DateTask):\n    ...   other = luigi.Parameter(significant=False)\n    ...\n    >>> c = DateTask2(date=a, other=\"foo\")\n    >>> d = DateTask2(date=b, other=\"bar\")\n    >>> c\n    DateTask2(date=2014-01-21)\n    >>> d\n    DateTask2(date=2014-01-21)\n    >>> c.other\n    'foo'\n    >>> d.other\n    'bar'\n    >>> c is d\n    False\n    >>> hash(c) == hash(d)\n    True\n\nParameter visibility\n^^^^^^^^^^^^^^^^^^^^\n\nUsing :class:`~luigi.parameter.ParameterVisibility` you can configure parameter visibility. By default, all\nparameters are public, but you can also set them hidden or private.\n\n.. code:: python\n\n    >>> import luigi\n    >>> from luigi.parameter import ParameterVisibility\n    \n    >>> luigi.Parameter(visibility=ParameterVisibility.PRIVATE)\n\n``ParameterVisibility.PUBLIC`` (default) - visible everywhere\n\n``ParameterVisibility.HIDDEN`` - ignored in WEB-view, but saved into database if save db_history is true\n\n``ParameterVisibility.PRIVATE`` - visible only inside task.\n\nParameter types\n^^^^^^^^^^^^^^^\n\nIn the examples above, the *type* of the parameter is determined by using different\nsubclasses of :class:`~luigi.parameter.Parameter`. 
There are a few of them, like\n:class:`~luigi.parameter.DateParameter`,\n:class:`~luigi.parameter.DateIntervalParameter`,\n:class:`~luigi.parameter.IntParameter`,\n:class:`~luigi.parameter.FloatParameter`, etc.\n\nPython is not a statically typed language and you don't have to specify the types\nof any of your parameters.\nYou can simply use the base class :class:`~luigi.parameter.Parameter` if you don't care.\n\nThe reason you would use a subclass like :class:`~luigi.parameter.DateParameter`\nis that Luigi needs to know its type for the command line interaction.\nThat's how it knows how to convert a string provided on the command line to\nthe corresponding type (i.e. datetime.date instead of a string).\n\n.. _Parameter-class-level-parameters:\n\nSetting parameter value for other classes\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nAll parameters are also exposed on a class level on the command line interface.\nFor instance, say you have classes TaskA and TaskB:\n\n.. code:: python\n\n    class TaskA(luigi.Task):\n        x = luigi.Parameter()\n\n    class TaskB(luigi.Task):\n        y = luigi.Parameter()\n\n\nYou can run ``TaskB`` on the command line: ``luigi TaskB --y 42``.\nBut you can also set the class value of ``TaskA`` by running\n``luigi TaskB --y 42 --TaskA-x 43``.\nThis sets the value of ``TaskA.x`` to 43 on a *class* level.\nIt is still possible to override it inside Python if you instantiate ``TaskA(x=44)``.\n\nAll parameters can also be set from the configuration file.\nFor instance, you can put this in the config:\n\n.. code:: ini\n\n    [TaskA]\n    x: 45\n\n\nJust as in the previous case, this will set the value of ``TaskA.x`` to 45 on the *class* level.\nAnd likewise, it is still possible to override it inside Python if you instantiate ``TaskA(x=44)``.\n\nParameter resolution order\n^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nParameters are resolved in the following order of decreasing priority:\n\n1. 
Any value passed to the constructor, or task level value set on the command line (applies on an instance level)\n2. Any value set on the command line (applies on a class level)\n3. Any configuration option (applies on a class level)\n4. Any default value provided to the parameter (applies on a class level)\n\nSee the :class:`~luigi.parameter.Parameter` class for more information.\n"
  },
  {
    "path": "doc/running_luigi.rst",
    "content": "Running Luigi\n-------------\n\nRunning from the Command Line\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nThe preferred way to run Luigi tasks is through the ``luigi`` command line tool\nthat will be installed with the pip package.\n\n.. code-block:: python\n\n    # my_module.py, available in your sys.path\n    import luigi\n\n    class MyTask(luigi.Task):\n        x = luigi.IntParameter()\n        y = luigi.IntParameter(default=45)\n\n        def run(self):\n            print(self.x + self.y)\n\nShould be run like this\n\n.. code-block:: console\n\n        $ luigi --module my_module MyTask --x 123 --y 456 --local-scheduler\n\nOr alternatively like this:\n\n.. code-block:: console\n\n        $ python -m luigi --module my_module MyTask --x 100 --local-scheduler\n\nNote that if a parameter name contains '_', it should be replaced by '-'.\nFor example, if MyTask had a parameter called 'my_parameter':\n\n.. code-block:: console\n\n        $ luigi --module my_module MyTask --my-parameter 100 --local-scheduler\n\n.. note:: Please make sure to always place task parameters behind the task family!\n\n\nRunning from Python code\n^^^^^^^^^^^^^^^^^^^^^^^^\n\nAnother way to start tasks from Python code is using ``luigi.build(tasks, worker_scheduler_factory=None, **env_params)``\nfrom ``luigi.interface`` module.\n\nThis way of running luigi tasks is useful if you want to get some dynamic parameters from another\nsource, such as database, or provide additional logic before you start tasks.\n\nOne notable difference is that ``build`` defaults to not using the identical process lock.\nIf you want to change this behaviour, just pass ``no_lock=False``.\n\n\n.. 
code-block:: python\n\n    class MyTask1(luigi.Task):\n        x = luigi.IntParameter()\n        y = luigi.IntParameter(default=0)\n\n        def run(self):\n            print(self.x + self.y)\n\n\n    class MyTask2(luigi.Task):\n        x = luigi.IntParameter()\n        y = luigi.IntParameter(default=1)\n        z = luigi.IntParameter(default=2)\n\n        def run(self):\n            print(self.x * self.y * self.z)\n\n\n    if __name__ == '__main__':\n        luigi.build([MyTask1(x=10), MyTask2(x=15, z=3)])\n\n\nAlso, it is possible to pass additional parameters to ``build`` such as host, port, workers and local_scheduler:\n\n.. code-block:: python\n\n    if __name__ == '__main__':\n         luigi.build([MyTask1(x=1)], workers=5, local_scheduler=True)\n\nTo achieve some special requirements you can pass to ``build`` your  ``worker_scheduler_factory``\nwhich will return your worker and/or scheduler implementations:\n\n.. code-block:: python\n\n    class MyWorker(Worker):\n        # some custom logic\n\n\n    class MyFactory:\n      def create_local_scheduler(self):\n          return scheduler.Scheduler(prune_on_get_work=True, record_task_history=False)\n\n      def create_remote_scheduler(self, url):\n          return rpc.RemoteScheduler(url)\n\n      def create_worker(self, scheduler, worker_processes, assistant=False):\n          # return your worker instance\n          return MyWorker(\n              scheduler=scheduler, worker_processes=worker_processes, assistant=assistant)\n\n\n    if __name__ == '__main__':\n        luigi.build([MyTask1(x=1)], worker_scheduler_factory=MyFactory())\n\nIn some cases (like task queue) it may be useful.\n\n\n\nResponse of luigi.build()/luigi.run()\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n- **Default response** By default *luigi.build()/luigi.run()* returns True if there were no scheduling errors. 
This is the same as the attribute ``LuigiRunResult.scheduling_succeeded``.\n\n- **Detailed response** This is a response of type :class:`~luigi.execution_summary.LuigiRunResult`. This is obtained by passing a keyword argument ``detailed_summary=True`` to *build/run*. This response contains detailed information about the jobs.\n\n  .. code-block:: python\n\n    if __name__ == '__main__':\n         luigi_run_result = luigi.build(..., detailed_summary=True)\n         print(luigi_run_result.summary_text)\n\n\nLuigi on Windows\n^^^^^^^^^^^^^^^^\n\nMost Luigi functionality works on Windows. Exceptions:\n\n- Specifying multiple worker processes using the ``workers`` argument for\n  ``luigi.build``, or using the ``--workers`` command line argument. (Similarly,\n  specifying ``--worker-force-multiprocessing``). For most programs, this will\n  result in failure (a common sight is ``BrokenPipeError``). The reason is that\n  worker processes are assumed to be forked from the main process. Forking is\n  `not possible <https://docs.python.org/dev/library/multiprocessing.html#contexts-and-start-methods>`_\n  on Windows.\n- Running the Luigi central scheduling server as a daemon (i.e. with ``--background``).\n  Again, a Unix-only concept.\n"
  },
  {
    "path": "doc/tasks.rst",
    "content": "Tasks\n-----\n\nTasks are where the execution takes place.\nTasks depend on each other and output targets.\n\nAn outline of what a task can look like:\n\n    .. figure:: task_breakdown.png\n       :alt: Task breakdown\n\n.. _Task.requires:\n\nTask.requires\n~~~~~~~~~~~~~\n\nThe :func:`~luigi.task.Task.requires` method is used to specify dependencies on other Task objects,\nwhich might even be of the same class.\nFor instance, an example implementation could be\n\n.. code:: python\n\n    def requires(self):\n        return OtherTask(self.date), DailyReport(self.date - datetime.timedelta(1))\n\nIn this case, the DailyReport task depends on two inputs created earlier,\none of which is of the same class.\nrequires can return other Tasks in any way wrapped up within dicts/lists/tuples/etc.\n\nRequiring another Task\n~~~~~~~~~~~~~~~~~~~~~~\n\nNote that :func:`~luigi.task.Task.requires` can *not* return a :class:`~luigi.target.Target` object.\nIf you have a simple Target object that is created externally\nyou can wrap it in a Task class like this:\n\n.. code:: python\n\n    class LogFiles(luigi.ExternalTask):\n        def output(self):\n            return luigi.contrib.hdfs.HdfsTarget('/log')\n\nThis also makes it easier to add parameters:\n\n.. code:: python\n\n    class LogFiles(luigi.ExternalTask):\n        date = luigi.DateParameter()\n        def output(self):\n            return luigi.contrib.hdfs.HdfsTarget(self.date.strftime('/log/%Y-%m-%d'))\n\n.. 
_Task.output:\n\nTask.output\n~~~~~~~~~~~\n\nThe :func:`~luigi.task.Task.output` method returns one or more :class:`~luigi.target.Target` objects.\nSimilarly to requires, you can return them wrapped up in any way that's convenient for you.\nHowever we recommend that any :class:`~luigi.task.Task` only return one single :class:`~luigi.target.Target` in output.\nIf multiple outputs are returned,\natomicity will be lost unless the :class:`~luigi.task.Task` itself can ensure that each :class:`~luigi.target.Target` is atomically created.\n(If atomicity is not of concern, then it is safe to return multiple :class:`~luigi.target.Target` objects.)\n\n.. code:: python\n\n    class DailyReport(luigi.Task):\n        date = luigi.DateParameter()\n        def output(self):\n            return luigi.contrib.hdfs.HdfsTarget(self.date.strftime('/reports/%Y-%m-%d'))\n        # ...\n\n.. _Task.run:\n\nTask.run\n~~~~~~~~\n\nThe :func:`~luigi.task.Task.run` method now contains the actual code that is run.\nWhen you are using Task.requires_ and Task.run_ Luigi breaks down everything into two stages.\nFirst it figures out all dependencies between tasks,\nthen it runs everything.\nThe :func:`~luigi.task.Task.input` method is an internal helper method that just replaces all Task objects in requires\nwith their corresponding output.\nAn example:\n\n.. 
code:: python\n\n    class GenerateWords(luigi.Task):\n\n        def output(self):\n            return luigi.LocalTarget('words.txt')\n\n        def run(self):\n\n            # write a dummy list of words to output file\n            words = [\n                    'apple',\n                    'banana',\n                    'grapefruit'\n                    ]\n\n            with self.output().open('w') as f:\n                for word in words:\n                    f.write('{word}\\n'.format(word=word))\n\n\n    class CountLetters(luigi.Task):\n\n        def requires(self):\n            return GenerateWords()\n\n        def output(self):\n            return luigi.LocalTarget('letter_counts.txt')\n\n        def run(self):\n\n            # read in file as list\n            with self.input().open('r') as infile:\n                words = infile.read().splitlines()\n\n            # write each word to output file with its corresponding letter count\n            with self.output().open('w') as outfile:\n                for word in words:\n                    outfile.write(\n                            '{word} | {letter_count}\\n'.format(\n                                word=word,\n                                letter_count=len(word)\n                                )\n                            )\n\nIt's useful to note that if you're writing to a binary file, Luigi automatically\nstrips the ``'b'`` flag due to how atomic writes/reads work. In order to write a binary\nfile, such as a pickle file, you should instead use ``format=Nop`` when calling\nLocalTarget. Following the above example:\n\n.. 
code:: python\n\n    from luigi.format import Nop\n\n    class GenerateWords(luigi.Task):\n\n        def output(self):\n            return luigi.LocalTarget('words.pckl', format=Nop)\n\n        def run(self):\n            import pickle\n\n            # write a dummy list of words to output file\n            words = [\n                    'apple',\n                    'banana',\n                    'grapefruit'\n                    ]\n\n            with self.output().open('w') as f:\n                pickle.dump(words, f)\n\n\nIt is your responsibility to ensure that after running :func:`~luigi.task.Task.run`, the task is\ncomplete, i.e. :func:`~luigi.task.Task.complete` returns ``True``. Unless you have overridden\n:func:`~luigi.task.Task.complete`, :func:`~luigi.task.Task.run` should generate all the targets\ndefined as outputs. Luigi verifies that you adhere to the contract before running downstream\ndependencies, and reports ``Unfulfilled dependencies at run time`` if a violation is detected.\n\n.. _Task.input:\n\nTask.input\n~~~~~~~~~~\n\nAs seen in the example above, :func:`~luigi.task.Task.input` is a wrapper around Task.requires_ that\nreturns the corresponding Target objects instead of Task objects.\nAnything returned by Task.requires_ will be transformed, including lists,\nnested dicts, etc.\nThis can be useful if you have many dependencies:\n\n.. 
code:: python\n\n    class TaskWithManyInputs(luigi.Task):\n        def requires(self):\n            return {'a': TaskA(), 'b': [TaskB(i) for i in range(100)]}\n\n        def run(self):\n            f = self.input()['a'].open('r')\n            g = [y.open('r') for y in self.input()['b']]\n\n\nDynamic dependencies\n~~~~~~~~~~~~~~~~~~~~\n\nSometimes you might not know exactly what other tasks to depend on until runtime.\nIn that case, Luigi provides a mechanism to specify dynamic dependencies.\nIf you yield another :class:`~luigi.task.Task` in the Task.run_ method,\nthe current task will be suspended and the other task will be run.\nYou can also yield a list of tasks.\n\n.. code:: python\n\n    class MyTask(luigi.Task):\n        def run(self):\n            other_target = yield OtherTask()\n\n            # dynamic dependencies resolve into targets\n            f = other_target.open('r')\n\n\nThis mechanism is an alternative to Task.requires_ in case\nyou are not able to build up the full dependency graph before running the task.\nIt does come with some constraints:\nthe Task.run_ method will resume from scratch each time a new task is yielded.\nIn other words, you should make sure your Task.run_ method is idempotent.\n(This is good practice for all Tasks in Luigi, but especially so for tasks with dynamic dependencies).\nAs this might entail redundant calls to tasks' :func:`~luigi.task.Task.complete` methods,\nyou should consider setting the \"cache_task_completion\" option in the :ref:`worker-config`.\nTo further control how dynamic task requirements are handled internally by worker nodes,\nthere is also the option to wrap dependent tasks in :class:`~luigi.task.DynamicRequirements`.\n\nFor an example of a workflow using dynamic dependencies, see\n`examples/dynamic_requirements.py <https://github.com/spotify/luigi/blob/master/examples/dynamic_requirements.py>`_.\n\n\nTask status tracking\n~~~~~~~~~~~~~~~~~~~~\n\nFor long-running or remote tasks it is convenient to see 
extended status information not only on\nthe command line or in your logs but also in the GUI of the central scheduler. Luigi implements\ndynamic status messages, progress bar and tracking urls which may point to an external monitoring system.\nYou can set this information using callbacks within Task.run_:\n\n.. code:: python\n\n    class MyTask(luigi.Task):\n        def run(self):\n            # set a tracking url\n            self.set_tracking_url(\"http://...\")\n\n            # set status messages during the workload\n            for i in range(100):\n                # do some hard work here\n                if i % 10 == 0:\n                    self.set_status_message(\"Progress: %d / 100\" % i)\n                    # displays a progress bar in the scheduler UI\n                    self.set_progress_percentage(i)\n\n\n.. _Events:\n\nEvents and callbacks\n~~~~~~~~~~~~~~~~~~~~\n\nLuigi has a built-in event system that\nallows you to register callbacks to events and trigger them from your own tasks.\nYou can both hook into some pre-defined events and create your own.\nEach event handle is tied to a Task class and\nwill be triggered only from that class or\na subclass of it.\nThis allows you to effortlessly subscribe to events only from a specific class (e.g. for hadoop jobs).\n\n.. code:: python\n\n    @luigi.Task.event_handler(luigi.Event.SUCCESS)\n    def celebrate_success(task):\n        \"\"\"Will be called directly after a successful execution\n           of `run` on any Task subclass (i.e. 
all luigi Tasks)\n        \"\"\"\n        ...\n\n    @luigi.contrib.hadoop.JobTask.event_handler(luigi.Event.FAILURE)\n    def mourn_failure(task, exception):\n        \"\"\"Will be called directly after a failed execution\n           of `run` on any JobTask subclass\n        \"\"\"\n        ...\n\n    luigi.run()\n\n\nBut I just want to run a Hadoop job?\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nThe Hadoop code is integrated in the rest of the Luigi code because\nwe really believe almost all Hadoop jobs benefit from being part of some sort of workflow.\nHowever, in theory, nothing stops you from using the :class:`~luigi.contrib.hadoop.JobTask` class (and also :class:`~luigi.contrib.hdfs.target.HdfsTarget`)\nwithout using the rest of Luigi.\nYou can simply run it manually using\n\n.. code:: python\n\n    MyJobTask('abc', 123).run()\n\nYou can use the hdfs.target.HdfsTarget class anywhere by just instantiating it:\n\n.. code:: python\n\n    t = luigi.contrib.hdfs.target.HdfsTarget('/tmp/test.gz', format=format.Gzip)\n    f = t.open('w')\n    # ...\n    f.close() # needed\n\n.. _Task.priority:\n\nTask priority\n~~~~~~~~~~~~~\n\nThe scheduler decides which task to run next from\nthe set of all tasks that have all their dependencies met.\nBy default, this choice is pretty arbitrary,\nwhich is fine for most workflows and situations.\n\nIf you want to have some control on the order of execution of available tasks,\nyou can set the ``priority`` property of a task,\nfor example as follows:\n\n.. 
code:: python\n\n    # A static priority value as a class constant:\n    class MyTask(luigi.Task):\n        priority = 100\n        # ...\n\n    # A dynamic priority value with a \"@property\" decorated method:\n    class OtherTask(luigi.Task):\n        @property\n        def priority(self):\n            if self.date > some_threshold:\n                return 80\n            else:\n                return 40\n        # ...\n\nTasks with a higher priority value will be picked before tasks with a lower priority value.\nThere is no predefined range of priorities,\nyou can choose whatever (int or float) values you want to use.\nThe default value is 0.\n\nWarning: task execution order in Luigi is influenced by both dependencies and priorities, but\nin Luigi dependencies come first.\nFor example:\nif there is a task A with priority 1000 but still with unmet dependencies and\na task B with priority 1 without any pending dependencies,\ntask B will be picked first.\n\n.. _Task.namespaces_famlies_and_ids:\n\nNamespaces, families and ids\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nIn order to avoid name clashes and to be able to have an identifier for tasks,\nLuigi introduces the concepts *task_namespace*, *task_family* and\n*task_id*. The namespace and family operate on class level meanwhile the task\nid only exists on instance level. The concepts are best illustrated using code.\n\n.. 
code:: python\n\n    import luigi\n    class MyTask(luigi.Task):\n        my_param = luigi.Parameter()\n        task_namespace = 'my_namespace'\n\n    my_task = MyTask(my_param='hello')\n    print(my_task)                      # --> my_namespace.MyTask(my_param=hello)\n\n    print(my_task.get_task_namespace()) # --> my_namespace\n    print(my_task.get_task_family())    # --> my_namespace.MyTask\n    print(my_task.task_id)              # --> my_namespace.MyTask_hello_890907e7ce\n\n    print(MyTask.get_task_namespace())  # --> my_namespace\n    print(MyTask.get_task_family())     # --> my_namespace.MyTask\n    print(MyTask.task_id)               # --> Error!\n\nThe full documentation for this machinery exists in the :py:mod:`~luigi.task` module.\n\nInstance caching\n~~~~~~~~~~~~~~~~\n\nIn addition to the stuff mentioned above,\nLuigi also does some metaclass logic so that\nif e.g. ``DailyReport(datetime.date(2012, 5, 10))`` is instantiated twice in the code,\nit will in fact result in the same object.\nSee :ref:`Parameter-instance-caching` for more info\n"
  },
  {
    "path": "doc/workflows.rst",
    "content": "Building workflows\n------------------\n\nThere are two fundamental building blocks of Luigi -\nthe :class:`~luigi.task.Task` class and the :class:`~luigi.target.Target` class.\nBoth are abstract classes and expect a few methods to be implemented.\nIn addition to those two concepts,\nthe :class:`~luigi.parameter.Parameter` class is an important concept that governs how a Task is run.\n\nTarget\n~~~~~~\n\nThe :py:class:`~luigi.target.Target` class corresponds to a file on a disk,\na file on HDFS or some kind of a checkpoint, like an entry in a database.\nActually, the only method that Targets have to implement is the *exists*\nmethod which returns True if and only if the Target exists.\n\nIn practice, implementing Target subclasses is rarely needed.\nLuigi comes with a toolbox of several useful Targets.\nIn particular, :class:`~luigi.file.LocalTarget` and :class:`~luigi.contrib.hdfs.target.HdfsTarget`,\nbut there is also support for other file systems:\n:class:`luigi.contrib.s3.S3Target`,\n:class:`luigi.contrib.ssh.RemoteTarget`,\n:class:`luigi.contrib.ftp.RemoteTarget`,\n:class:`luigi.contrib.mysqldb.MySqlTarget`,\n:class:`luigi.contrib.redshift.RedshiftTarget`, and several more.\n\nMost of these targets, are file system-like.\nFor instance, :class:`~luigi.file.LocalTarget` and :class:`~luigi.contrib.hdfs.target.HdfsTarget` map to a file on the local drive or a file in HDFS.\nIn addition these also wrap the underlying operations to make them atomic.\nThey both implement the :func:`~luigi.file.LocalTarget.open` method which returns a stream object that\ncould be read (``mode='r'``) from or written to (``mode='w'``).\n\nLuigi comes with Gzip support by providing ``format=format.Gzip``.\nAdding support for other formats is pretty simple.\n\nTask\n~~~~\n\nThe :class:`~luigi.task.Task` class is a bit more conceptually interesting because this is\nwhere computation is done.\nThere are a few methods that can be implemented to alter its behavior,\nmost 
notably :func:`~luigi.task.Task.run`, :func:`~luigi.task.Task.output` and :func:`~luigi.task.Task.requires`.\n\nTasks consume Targets that were created by some other task. They usually also output targets:\n\n    .. figure:: task_with_targets.png\n       :alt: Task and targets\n\nYou can define dependencies between *Tasks* using the :py:meth:`~luigi.task.Task.requires` method. See :doc:`/tasks` for more info.\n\n    .. figure:: tasks_with_dependencies.png\n       :alt: Tasks and dependencies\n\nEach task defines its outputs using the :py:meth:`~luigi.task.Task.output` method.\nAdditionally, there is a helper method :py:meth:`~luigi.task.Task.input` that returns the corresponding Target classes for each Task dependency.\n\n    .. figure:: tasks_input_output_requires.png\n       :alt: Tasks and methods\n\n.. _Parameter:\n\nParameter\n~~~~~~~~~\n\nThe Task class corresponds to some type of job that is run, but in\ngeneral you want to allow some form of parameterization of it.\nFor instance, if your Task class runs a Hadoop job to create a report every night,\nyou probably want to make the date a parameter of the class.\nSee :doc:`/parameters` for more info.\n\n    .. figure:: task_parameters.png\n       :alt: Tasks with parameters\n\nDependencies\n~~~~~~~~~~~~\n\nUsing tasks, targets, and parameters, Luigi lets you express arbitrary dependencies in *code*, rather than using some kind of awkward config DSL.\nThis is really useful because in the real world, dependencies are often very messy.\nFor instance, some examples of the dependencies you might encounter:\n\n    .. figure:: parameters_date_algebra.png\n       :alt: Dependencies with date algebra\n\n    .. figure:: parameters_recursion.png\n       :alt: Dependencies with recursion\n\n    .. 
figure:: parameters_enum.png\n       :alt: Dependencies with enums\n\n(These diagrams are from a `Luigi presentation in late 2014 at NYC Data Science meetup <http://www.slideshare.net/erikbern/luigi-presentation-nyc-data-science>`_)\n"
  },
  {
    "path": "examples/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n"
  },
  {
    "path": "examples/config.toml",
    "content": "\n[hdfs]\nclient = \"hadoopcli\"\nnamenode_host = \"localhost\"\nnamenode_port = 50030\n\n# LOGGING\n\n[logging]\nversion = 1\ndisable_existing_loggers = false\n\n# logs format\n[logging.formatters.simple]\nformat = \"{levelname:8} {asctime} {module}:{lineno} {message}\"\nstyle = \"{\"\ndatefmt = \"%Y-%m-%d %H:%M:%S\"\n\n# write logs to console\n[logging.handlers.console]\nlevel = \"DEBUG\"\nclass = \"logging.StreamHandler\"\nformatter = \"simple\"\n\n# luigi worker logging\n[logging.loggers.luigi-interface]\nhandlers = [\"console\"]\nlevel = \"INFO\"\ndisabled = false\npropagate = false\n\n# luigid logging\n[logging.loggers.luigi]\nhandlers = [\"console\"]\nlevel = \"INFO\"\ndisabled = false\npropagate = false\n\n# luigid is built on tornado\n[logging.loggers.tornado]\nhandlers = [\"console\"]\nlevel = \"INFO\"\ndisabled = false\npropagate = false\n\n# custom logger for \"project\"\n[logging.loggers.project]\nhandlers = [\"console\"]\nlevel = \"DEBUG\"\ndisabled = false\npropagate = false\n"
  },
  {
    "path": "examples/dynamic_requirements.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\nimport random as rnd\nimport time\n\nimport luigi\n\n\nclass Configuration(luigi.Task):\n    seed = luigi.IntParameter()\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file on the local filesystem.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.LocalTarget(\"/tmp/Config_%d.txt\" % self.seed)\n\n    def run(self):\n        time.sleep(5)\n        rnd.seed(self.seed)\n\n        result = \",\".join([str(x) for x in rnd.sample(list(range(300)), rnd.randint(7, 25))])\n        with self.output().open(\"w\") as f:\n            f.write(result)\n\n\nclass Data(luigi.Task):\n    magic_number = luigi.IntParameter()\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file on the local filesystem.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.LocalTarget(\"/tmp/Data_%d.txt\" % self.magic_number)\n\n    def run(self):\n        time.sleep(1)\n        with self.output().open(\"w\") as f:\n            f.write(\"%s\" % 
self.magic_number)\n\n\nclass Dynamic(luigi.Task):\n    seed = luigi.IntParameter(default=1)\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file on the local filesystem.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.LocalTarget(\"/tmp/Dynamic_%d.txt\" % self.seed)\n\n    def run(self):\n        # This could be done using regular requires method\n        config = self.clone(Configuration)\n        yield config\n\n        with config.output().open() as f:\n            data = [int(x) for x in f.read().split(\",\")]\n\n        # ... but not this\n        data_dependent_deps = [Data(magic_number=x) for x in data]\n        yield data_dependent_deps\n\n        with self.output().open(\"w\") as f:\n            f.write(\"Tada!\")\n\n        # and in case data is rather long, consider wrapping the requirements\n        # in DynamicRequirements and optionally define a custom complete method\n        def custom_complete(complete_fn):\n            # example: Data() stores all outputs in the same directory, so avoid doing len(data) fs\n            # calls but rather check only the first, and compare basenames for the rest\n            # (complete_fn defaults to \"lambda task: task.complete()\" but can also include caching)\n            if not complete_fn(data_dependent_deps[0]):\n                return False\n            paths = [task.output().path for task in data_dependent_deps]\n            basenames = os.listdir(os.path.dirname(paths[0]))  # a single fs call\n            return all(os.path.basename(path) in basenames for path in paths)\n\n        yield luigi.DynamicRequirements(data_dependent_deps, custom_complete)\n\n\nif __name__ == \"__main__\":\n    luigi.run()\n"
  },
  {
    "path": "examples/elasticsearch_index.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport datetime\nimport json\n\nimport luigi\nfrom luigi.contrib.esindex import CopyToIndex\n\n\nclass FakeDocuments(luigi.Task):\n    \"\"\"\n    Generates a local file containing 5 elements of data in JSON format.\n    \"\"\"\n\n    #: the date parameter.\n    date = luigi.DateParameter(default=datetime.date.today())\n\n    def run(self):\n        \"\"\"\n        Writes data in JSON format into the task's output target.\n\n        The data objects have the following attributes:\n\n        * `_id` is the default Elasticsearch id field,\n        * `text`: the text,\n        * `date`: the day when the data was created.\n\n        \"\"\"\n        today = datetime.date.today()\n        with self.output().open(\"w\") as output:\n            for i in range(5):\n                output.write(json.dumps({\"_id\": i, \"text\": \"Hi %s\" % i, \"date\": str(today)}))\n                output.write(\"\\n\")\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file on the local filesystem.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.LocalTarget(path=\"/tmp/_docs-%s.ldj\" % self.date)\n\n\nclass IndexDocuments(CopyToIndex):\n    \"\"\"\n    
This task loads JSON data contained in a :py:class:`luigi.target.Target` into an ElasticSearch index.\n\n    This task's input will be the target returned by :py:meth:`~.FakeDocuments.output`.\n\n    This class uses :py:meth:`luigi.contrib.esindex.CopyToIndex.run`.\n\n    After running this task you can run:\n\n    .. code-block:: console\n\n        $ curl \"localhost:9200/example_index/_search?pretty\"\n\n    to see the indexed documents.\n\n    To see the update log, run\n\n    .. code-block:: console\n\n        $ curl \"localhost:9200/update_log/_search?q=target_index:example_index&pretty\"\n\n    To clean up both indexes, run:\n\n    .. code-block:: console\n\n        $ curl -XDELETE \"localhost:9200/example_index\"\n        $ curl -XDELETE \"localhost:9200/update_log/_query?q=target_index:example_index\"\n\n    \"\"\"\n\n    #: date task parameter (default = today)\n    date = luigi.DateParameter(default=datetime.date.today())\n\n    #: the name of the index in ElasticSearch to be updated.\n    index = \"example_index\"\n    #: the name of the document type.\n    doc_type = \"greetings\"\n    #: the host running the ElasticSearch service.\n    host = \"localhost\"\n    #: the port used by the ElasticSearch service.\n    port = 9200\n\n    def requires(self):\n        \"\"\"\n        This task's dependencies:\n\n        * :py:class:`~.FakeDocuments`\n\n        :return: object (:py:class:`luigi.task.Task`)\n        \"\"\"\n        return FakeDocuments()\n\n\nif __name__ == \"__main__\":\n    luigi.run([\"IndexDocuments\", \"--local-scheduler\"])\n"
  },
  {
    "path": "examples/execution_summary_example.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nYou can run this example like this:\n\n    .. code:: console\n\n            $ luigi --module examples.execution_summary_example examples.EntryPoint --local-scheduler\n            ...\n            ... lots of spammy output\n            ...\n            INFO: There are 11 pending tasks unique to this worker\n            INFO: Worker Worker(salt=843361665, workers=1, host=arash-spotify-T440s, username=arash, pid=18534) was stopped. 
Shutting down Keep-Alive thread\n            INFO:\n            ===== Luigi Execution Summary =====\n\n            Scheduled 218 tasks of which:\n            * 195 complete ones were encountered:\n                - 195 examples.Bar(num=5...199)\n            * 1 ran successfully:\n                - 1 examples.Boom(...)\n            * 22 were left pending, among these:\n                * 1 were missing external dependencies:\n                    - 1 MyExternal()\n                * 21 had missing dependencies:\n                    - 1 examples.EntryPoint()\n                    - examples.Foo(num=100, num2=16) and 9 other examples.Foo\n                    - 10 examples.DateTask(date=1998-03-23...1998-04-01, num=5)\n\n            This progress looks :| because there were missing external dependencies\n\n            ===== Luigi Execution Summary =====\n\"\"\"\n\nimport datetime\n\nimport luigi\n\n\nclass MyExternal(luigi.ExternalTask):\n    def complete(self):\n        return False\n\n\nclass Boom(luigi.Task):\n    task_namespace = \"examples\"\n    this_is_a_really_long_I_mean_way_too_long_and_annoying_parameter = luigi.IntParameter()\n\n    def run(self):\n        print(\"Running Boom\")\n\n    def requires(self):\n        for i in range(5, 200):\n            yield Bar(i)\n\n\nclass Foo(luigi.Task):\n    task_namespace = \"examples\"\n    num = luigi.IntParameter()\n    num2 = luigi.IntParameter()\n\n    def run(self):\n        print(\"Running Foo\")\n\n    def requires(self):\n        yield MyExternal()\n        yield Boom(0)\n\n\nclass Bar(luigi.Task):\n    task_namespace = \"examples\"\n    num = luigi.IntParameter()\n\n    def run(self):\n        self.output().open(\"w\").close()\n\n    def output(self):\n        return luigi.LocalTarget(\"/tmp/bar/%d\" % self.num)\n\n\nclass DateTask(luigi.Task):\n    task_namespace = \"examples\"\n    date = luigi.DateParameter()\n    num = luigi.IntParameter()\n\n    def run(self):\n        print(\"Running DateTask\")\n\n    def 
requires(self):\n        yield MyExternal()\n        yield Boom(0)\n\n\nclass EntryPoint(luigi.Task):\n    task_namespace = \"examples\"\n\n    def run(self):\n        print(\"Running EntryPoint\")\n\n    def requires(self):\n        for i in range(10):\n            yield Foo(100, 2 * i)\n        for i in range(10):\n            yield DateTask(datetime.date(1998, 3, 23) + datetime.timedelta(days=i), 5)\n"
  },
  {
    "path": "examples/foo.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nYou can run this example like this:\n\n    .. code:: console\n\n            $ rm -rf '/tmp/bar'\n            $ luigi --module examples.foo examples.Foo --workers 2 --local-scheduler\n\n\"\"\"\n\nimport time\n\nimport luigi\n\n\nclass Foo(luigi.WrapperTask):\n    task_namespace = \"examples\"\n\n    def run(self):\n        print(\"Running Foo\")\n\n    def requires(self):\n        for i in range(10):\n            yield Bar(i)\n\n\nclass Bar(luigi.Task):\n    task_namespace = \"examples\"\n    num = luigi.IntParameter()\n\n    def run(self):\n        time.sleep(1)\n        self.output().open(\"w\").close()\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`~luigi.target.Target`)\n        \"\"\"\n        time.sleep(1)\n        return luigi.LocalTarget(\"/tmp/bar/%d\" % self.num)\n"
  },
  {
    "path": "examples/foo_complex.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nYou can run this example like this:\n\n    .. code:: console\n\n            $ rm -rf '/tmp/bar'\n            $ luigi --module examples.foo_complex examples.Foo --workers 2 --local-scheduler\n\n\"\"\"\n\nimport random\nimport time\n\nimport luigi\n\nmax_depth = 10\nmax_total_nodes = 50\ncurrent_nodes = 0\n\n\nclass Foo(luigi.Task):\n    task_namespace = \"examples\"\n\n    def run(self):\n        print(\"Running Foo\")\n\n    def requires(self):\n        global current_nodes\n        for i in range(30 // max_depth):\n            current_nodes += 1\n            yield Bar(i)\n\n\nclass Bar(luigi.Task):\n    task_namespace = \"examples\"\n\n    num = luigi.IntParameter()\n\n    def run(self):\n        time.sleep(1)\n        self.output().open(\"w\").close()\n\n    def requires(self):\n        global current_nodes\n\n        if max_total_nodes > current_nodes:\n            valor = int(random.uniform(1, 30))\n            for i in range(valor // max_depth):\n                current_nodes += 1\n                yield Bar(current_nodes)\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`~luigi.target.Target`)\n        \"\"\"\n        time.sleep(1)\n        return luigi.LocalTarget(\"/tmp/bar/%d\" % self.num)\n"
  },
  {
    "path": "examples/ftp_experiment_outputs.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport luigi\nfrom luigi.contrib.ftp import RemoteTarget\n\n#: the FTP server\nHOST = \"some_host\"\n#: the username\nUSER = \"user\"\n#: the password\nPWD = \"some_password\"\n\n\nclass ExperimentTask(luigi.ExternalTask):\n    \"\"\"\n    This class represents something that was created elsewhere by an external process,\n    so all we want to do is to implement the output method.\n    \"\"\"\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file that will be created in a FTP server.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`~luigi.target.Target`)\n        \"\"\"\n        return RemoteTarget(\"/experiment/output1.txt\", HOST, username=USER, password=PWD)\n\n    def run(self):\n        \"\"\"\n        The execution of this task will write 4 lines of data on this task's target output.\n        \"\"\"\n        with self.output().open(\"w\") as outfile:\n            print(\"data 0 200 10 50 60\", file=outfile)\n            print(\"data 1 190 9 52 60\", file=outfile)\n            print(\"data 2 200 10 52 60\", file=outfile)\n            print(\"data 3 195 1 52 60\", file=outfile)\n\n\nclass ProcessingTask(luigi.Task):\n    \"\"\"\n    This class represents something that was created 
elsewhere by an external process,\n    so all we want to do is to implement the output method.\n    \"\"\"\n\n    def requires(self):\n        \"\"\"\n        This task's dependencies:\n\n        * :py:class:`~.ExperimentTask`\n\n        :return: object (:py:class:`luigi.task.Task`)\n        \"\"\"\n        return ExperimentTask()\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file on the local filesystem.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`~luigi.target.Target`)\n        \"\"\"\n        return luigi.LocalTarget(\"/tmp/processeddata.txt\")\n\n    def run(self):\n        avg = 0.0\n        elements = 0\n        sumval = 0.0\n\n        # Target objects are a file system/format abstraction and this will return a file stream object\n        # NOTE: self.input() actually returns the ExperimentTask.output() target\n        for line in self.input().open(\"r\"):\n            values = line.split(\" \")\n            avg += float(values[2])\n            sumval += float(values[3])\n            elements = elements + 1\n\n        # average\n        avg = avg / elements\n\n        # save calculated values\n        with self.output().open(\"w\") as outfile:\n            print(avg, sumval, file=outfile)\n\n\nif __name__ == \"__main__\":\n    luigi.run()\n"
  },
  {
    "path": "examples/hello_world.py",
    "content": "\"\"\"\nYou can run this example like this:\n\n    .. code:: console\n\n            $ luigi --module examples.hello_world examples.HelloWorldTask --local-scheduler\n\nIf that does not work, see :ref:`CommandLine`.\n\"\"\"\n\nimport luigi\n\n\nclass HelloWorldTask(luigi.Task):\n    task_namespace = \"examples\"\n\n    def run(self):\n        print(\"{task} says: Hello world!\".format(task=self.__class__.__name__))\n\n\nif __name__ == \"__main__\":\n    luigi.run([\"examples.HelloWorldTask\", \"--workers\", \"1\", \"--local-scheduler\"])\n"
  },
  {
    "path": "examples/kubernetes.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015 Outlier Bio, LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nExample Kubernetes Job Task.\n\nRequires:\n\n- pykube: ``pip install pykube-ng``\n- A local minikube cluster up and running: http://kubernetes.io/docs/getting-started-guides/minikube/\n\n**WARNING**: For Python versions < 3.5 the kubeconfig file must point to a Kubernetes API\nhostname, and NOT to an IP address.\n\nYou can run this code example like this:\n\n    .. 
code:: console\n\n        $ luigi --module examples.kubernetes PerlPi --local-scheduler\n\nRunning this code will create a pi-luigi-uuid kubernetes job within the cluster\npointed to by the default context in \"~/.kube/config\".\n\nIf running within a kubernetes cluster, set auth_method = \"service-account\" to\naccess the local cluster.\n\"\"\"\n\n# import os\n# import luigi\nfrom luigi.contrib.kubernetes import KubernetesJobTask\n\n\nclass PerlPi(KubernetesJobTask):\n    name = \"pi\"\n    max_retrials = 3\n    spec_schema = {\"containers\": [{\"name\": \"pi\", \"image\": \"perl\", \"command\": [\"perl\", \"-Mbignum=bpi\", \"-wle\", \"print bpi(2000)\"]}]}\n\n    # defining the two functions below allows for dependency checking,\n    # but isn't a requirement\n    # def signal_complete(self):\n    #     with self.output().open('w') as output:\n    #         output.write('')\n    #\n    # def output(self):\n    #     target = os.path.join(\"/tmp\", \"PerlPi\")\n    #     return luigi.LocalTarget(target)\n"
  },
  {
    "path": "examples/per_task_retry_policy.py",
    "content": "# -*- coding: utf-8 -*-\n\n\"\"\"\nYou can run this example like this:\n\n    .. code:: console\n\n            $ luigi --module examples.per_task_retry_policy examples.PerTaskRetryPolicy --worker-keep-alive \\\n            --local-scheduler --scheduler-retry-delay 5  --logging-conf-file test/testconfig/logging.cfg\n\n            ...\n            ... lots of spammy output\n            ...\n            DEBUG: ErrorTask1__99914b932b task num failures is 1 and limit is 5\n            DEBUG: ErrorTask2__99914b932b task num failures is 1 and limit is 2\n            DEBUG: DynamicErrorTask1__99914b932b task num failures is 1 and limit is 3\n            DEBUG: ErrorTask1__99914b932b task num failures is 2 and limit is 5\n            DEBUG: ErrorTask2__99914b932b task num failures is 2 and limit is 2\n            DEBUG: ErrorTask2__99914b932b task num failures limit(2) is exceeded\n            DEBUG: DynamicErrorTask1__99914b932b task num failures is 2 and limit is 3\n            DEBUG: ErrorTask1__99914b932b task num failures is 3 and limit is 5\n            DEBUG: DynamicErrorTask1__99914b932b task num failures is 3 and limit is 3\n            DEBUG: DynamicErrorTask1__99914b932b task num failures limit(3) is exceeded\n            DEBUG: ErrorTask1__99914b932b task num failures is 4 and limit is 5\n            DEBUG: ErrorTask1__99914b932b task num failures is 5 and limit is 5\n            DEBUG: ErrorTask1__99914b932b task num failures limit(5) is exceeded\n            INFO:\n            ===== Luigi Execution Summary =====\n\n            Scheduled 8 tasks of which:\n            * 2 ran successfully:\n                - 1 SuccessSubTask1()\n                - 1 SuccessTask1()\n            * 3 failed:\n                - 1 DynamicErrorTask1()\n                - 1 ErrorTask1()\n                - 1 ErrorTask2()\n            * 3 were left pending, among these:\n                * 1 were missing external dependencies:\n                    - 1 
DynamicErrorTaskSubmitter()\n                * 1 had failed dependencies:\n                    - 1 examples.PerTaskRetryPolicy()\n                * 1 had missing dependencies:\n                    - 1 examples.PerTaskRetryPolicy()\n                * 1 was not granted run permission by the scheduler:\n                    - 1 DynamicErrorTaskSubmitter()\n\n            This progress looks :( because there were failed tasks\n\n            ===== Luigi Execution Summary =====\n\"\"\"\n\nimport luigi\n\n\nclass PerTaskRetryPolicy(luigi.Task):\n    \"\"\"\n    Wrapper class for some error and success tasks. Worker won't be shutdown unless there is\n    pending tasks or failed tasks which will be retried. While keep-alive is active, workers\n    are not shutdown while there is/are some pending task(s).\n\n    \"\"\"\n\n    task_namespace = \"examples\"\n\n    def requires(self):\n        return [ErrorTask1(), ErrorTask2(), SuccessTask1(), DynamicErrorTaskSubmitter()]\n\n    def output(self):\n        return luigi.LocalTarget(path=\"/tmp/_docs-%s.ldj\" % self.task_id)\n\n\nclass ErrorTask1(luigi.Task):\n    \"\"\"\n    This error class raises error to retry the task. retry-count for this task is 5. It can be seen on\n    \"\"\"\n\n    retry = 0\n\n    retry_count = 5\n\n    def run(self):\n        self.retry += 1\n        raise Exception(\"Test Exception. Retry Index %s for %s\" % (self.retry, self.task_family))\n\n    def output(self):\n        return luigi.LocalTarget(path=\"/tmp/_docs-%s.ldj\" % self.task_id)\n\n\nclass ErrorTask2(luigi.Task):\n    \"\"\"\n    This error class raises error to retry the task. retry-count for this task is 2\n    \"\"\"\n\n    retry = 0\n\n    retry_count = 2\n\n    def run(self):\n        self.retry += 1\n        raise Exception(\"Test Exception. 
Retry Index %s for %s\" % (self.retry, self.task_family))\n\n    def output(self):\n        return luigi.LocalTarget(path=\"/tmp/_docs-%s.ldj\" % self.task_id)\n\n\nclass DynamicErrorTaskSubmitter(luigi.Task):\n    target = None\n\n    def run(self):\n        target = yield DynamicErrorTask1()\n\n        if target.exists():\n            with self.output().open(\"w\") as output:\n                output.write(\"SUCCESS DynamicErrorTaskSubmitter\\n\")\n\n    def output(self):\n        return luigi.LocalTarget(path=\"/tmp/_docs-%s.ldj\" % self.task_id)\n\n\nclass DynamicErrorTask1(luigi.Task):\n    \"\"\"\n    This dynamic error task raises an error so that the task is retried. The retry count for this task is 3.\n    \"\"\"\n\n    retry = 0\n\n    retry_count = 3\n\n    def run(self):\n        self.retry += 1\n        raise Exception(\"Test Exception. Retry Index %s for %s\" % (self.retry, self.task_family))\n\n    def output(self):\n        return luigi.LocalTarget(path=\"/tmp/_docs-%s.ldj\" % self.task_id)\n\n\nclass SuccessTask1(luigi.Task):\n    def requires(self):\n        return [SuccessSubTask1()]\n\n    def run(self):\n        with self.output().open(\"w\") as output:\n            output.write(\"SUCCESS Test Task 4\\n\")\n\n    def output(self):\n        return luigi.LocalTarget(path=\"/tmp/_docs-%s.ldj\" % self.task_id)\n\n\nclass SuccessSubTask1(luigi.Task):\n    \"\"\"\n    This success task writes its output file and then completes successfully.\n    \"\"\"\n\n    def run(self):\n        with self.output().open(\"w\") as output:\n            output.write(\"SUCCESS Test Task 4.1\\n\")\n\n    def output(self):\n        return luigi.LocalTarget(path=\"/tmp/_docs-%s.ldj\" % self.task_id)\n"
  },
  {
    "path": "examples/pyspark_wc.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport luigi\nfrom luigi.contrib.s3 import S3Target\nfrom luigi.contrib.spark import PySparkTask, SparkSubmitTask\n\n\nclass InlinePySparkWordCount(PySparkTask):\n    \"\"\"\n    This task runs a :py:class:`luigi.contrib.spark.PySparkTask` task\n    over the target data in :py:meth:`wordcount.input` (a file in S3) and\n    writes the result into its :py:meth:`wordcount.output` target (a file in S3).\n\n    This class uses :py:meth:`luigi.contrib.spark.PySparkTask.main`.\n\n    Example luigi configuration::\n\n        [spark]\n        spark-submit: /usr/local/spark/bin/spark-submit\n        master: spark://spark.example.org:7077\n        # py-packages: numpy, pandas\n\n    \"\"\"\n\n    driver_memory = \"2g\"\n    executor_memory = \"3g\"\n\n    def input(self):\n        return S3Target(\"s3n://bucket.example.org/wordcount.input\")\n\n    def output(self):\n        return S3Target(\"s3n://bucket.example.org/wordcount.output\")\n\n    def main(self, sc, *args):\n        sc.textFile(self.input().path).flatMap(lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b).saveAsTextFile(\n            self.output().path\n        )\n\n\nclass PySparkWordCount(SparkSubmitTask):\n    \"\"\"\n    This task is the same as :py:class:`InlinePySparkWordCount` above but uses\n    an external python driver file specified in 
:py:meth:`app`\n\n    It runs a :py:class:`luigi.contrib.spark.SparkSubmitTask` task\n    over the target data in :py:meth:`wordcount.input` (a file in S3) and\n    writes the result into its :py:meth:`wordcount.output` target (a file in S3).\n\n    This class uses :py:meth:`luigi.contrib.spark.SparkSubmitTask.run`.\n\n    Example luigi configuration::\n\n        [spark]\n        spark-submit: /usr/local/spark/bin/spark-submit\n        master: spark://spark.example.org:7077\n        deploy-mode: client\n\n    \"\"\"\n\n    driver_memory = \"2g\"\n    executor_memory = \"3g\"\n    total_executor_cores = luigi.IntParameter(default=100, significant=False)\n\n    name = \"PySpark Word Count\"\n    app = \"wordcount.py\"\n\n    def app_options(self):\n        # These are passed to the Spark main args in the defined order.\n        return [self.input().path, self.output().path]\n\n    def input(self):\n        return S3Target(\"s3n://bucket.example.org/wordcount.input\")\n\n    def output(self):\n        return S3Target(\"s3n://bucket.example.org/wordcount.output\")\n\n\n\"\"\"\n// Corresponding example Spark Job, running Word count with Spark's Python API\n// This file would have to be saved into wordcount.py\n\nimport sys\nfrom pyspark import SparkContext\n\nif __name__ == \"__main__\":\n\n    sc = SparkContext()\n    sc.textFile(sys.argv[1]) \\\n      .flatMap(lambda line: line.split()) \\\n      .map(lambda word: (word, 1)) \\\n      .reduceByKey(lambda a, b: a + b) \\\n      .saveAsTextFile(sys.argv[2])\n\"\"\"\n"
  },
  {
    "path": "examples/spark_als.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport random\n\nimport luigi\nimport luigi.contrib.hdfs\nimport luigi.format\nfrom luigi.contrib.spark import SparkSubmitTask\n\n\nclass UserItemMatrix(luigi.Task):\n    #: the size of the data being generated\n    data_size = luigi.IntParameter()\n\n    def run(self):\n        \"\"\"\n        Generates :py:attr:`~.UserItemMatrix.data_size` elements.\n        Writes this data in \\\\ separated value format into the target :py:func:`~/.UserItemMatrix.output`.\n\n        The data has the following elements:\n\n        * `user` is the default Elasticsearch id field,\n        * `track`: the text,\n        * `rating`: the day when the data was created.\n\n        \"\"\"\n        w = self.output().open(\"w\")\n        for user in range(self.data_size):\n            track = int(random.random() * self.data_size)\n            w.write(\"%d\\\\%d\\\\%f\" % (user, track, 1.0))\n        w.close()\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file in HDFS.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`~luigi.target.Target`)\n        \"\"\"\n        return luigi.contrib.hdfs.HdfsTarget(\"data-matrix\", format=luigi.format.Gzip)\n\n\nclass SparkALS(SparkSubmitTask):\n    \"\"\"\n    This task 
runs a :py:class:`luigi.contrib.spark.SparkSubmitTask` task\n    over the target data returned by :py:meth:`~/.UserItemMatrix.output` and\n    writes the result into its :py:meth:`~.SparkALS.output` target (a file in HDFS).\n\n    This class uses :py:meth:`luigi.contrib.spark.SparkSubmitTask.run`.\n\n    Example luigi configuration::\n\n        [spark]\n        spark-submit: /usr/local/spark/bin/spark-submit\n        master: yarn-client\n\n    \"\"\"\n\n    data_size = luigi.IntParameter(default=1000)\n\n    driver_memory = \"2g\"\n    executor_memory = \"3g\"\n    num_executors = luigi.IntParameter(default=100)\n\n    app = \"my-spark-assembly.jar\"\n    entry_class = \"com.spotify.spark.ImplicitALS\"\n\n    def app_options(self):\n        # These are passed to the Spark main args in the defined order.\n        return [self.input().path, self.output().path]\n\n    def requires(self):\n        \"\"\"\n        This task's dependencies:\n\n        * :py:class:`~.UserItemMatrix`\n\n        :return: object (:py:class:`luigi.task.Task`)\n        \"\"\"\n        return UserItemMatrix(self.data_size)\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file in HDFS.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`~luigi.target.Target`)\n        \"\"\"\n        # The corresponding Spark job outputs as GZip format.\n        return luigi.contrib.hdfs.HdfsTarget(\"als-output/\", format=luigi.format.Gzip)\n\n\n\"\"\"\n// Corresponding example Spark Job, a wrapper around the MLLib ALS job.\n// This class would have to be jarred into my-spark-assembly.jar\n// using sbt assembly (or package) and made available to the Luigi job\n// above.\n\npackage com.spotify.spark\n\nimport org.apache.spark._\nimport org.apache.spark.mllib.recommendation.{Rating, ALS}\nimport org.apache.hadoop.io.compress.GzipCodec\n\nobject ImplicitALS {\n\n  
def main(args: Array[String]) {\n    val sc = new SparkContext(args(0), \"ImplicitALS\")\n    val input = args(1)\n    val output = args(2)\n\n    val ratings = sc.textFile(input)\n      .map { l: String =>\n        val t = l.split('\\t')\n        Rating(t(0).toInt, t(1).toInt, t(2).toFloat)\n      }\n\n    val model = ALS.trainImplicit(ratings, 40, 20, 0.8, 150)\n    model\n      .productFeatures\n      .map { case (id, vec) =>\n        id + \"\\t\" + vec.map(d => \"%.6f\".format(d)).mkString(\" \")\n      }\n      .saveAsTextFile(output, classOf[GzipCodec])\n\n    sc.stop()\n  }\n}\n\"\"\"\n"
  },
  {
    "path": "examples/ssh_remote_execution.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom collections import defaultdict\n\nimport luigi\nfrom luigi.contrib.ssh import RemoteContext, RemoteTarget\nfrom luigi.mock import MockTarget\n\nSSH_HOST = \"some.accessible.host\"\n\n\nclass CreateRemoteData(luigi.Task):\n    \"\"\"\n    Dump info on running processes on remote host.\n    Data is still stored on the remote host\n    \"\"\"\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file on a remote server using SSH.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`~luigi.target.Target`)\n        \"\"\"\n        return RemoteTarget(\"/tmp/stuff\", SSH_HOST)\n\n    def run(self):\n        remote = RemoteContext(SSH_HOST)\n        print(remote.check_output([\"ps aux > {0}\".format(self.output().path)]))\n\n\nclass ProcessRemoteData(luigi.Task):\n    \"\"\"\n    Create a toplist of users based on how many running processes they have on a remote machine.\n\n    In this example the processed data is stored in a MockTarget.\n    \"\"\"\n\n    def requires(self):\n        \"\"\"\n        This task's dependencies:\n\n        * :py:class:`~.CreateRemoteData`\n\n        :return: object (:py:class:`luigi.task.Task`)\n        \"\"\"\n        return CreateRemoteData()\n\n    def 
run(self):\n        processes_per_user = defaultdict(int)\n        with self.input().open(\"r\") as infile:\n            for line in infile:\n                username = line.split()[0]\n                processes_per_user[username] += 1\n\n        toplist = sorted(processes_per_user.items(), key=lambda x: x[1], reverse=True)\n\n        with self.output().open(\"w\") as outfile:\n            for user, n_processes in toplist:\n                print(n_processes, user, file=outfile)\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will simulate the creation of a file in a filesystem.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`~luigi.target.Target`)\n        \"\"\"\n        return MockTarget(\"output\", mirror_on_stderr=True)\n"
  },
  {
    "path": "examples/terasort.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport logging\nimport os\n\nimport luigi\nimport luigi.contrib.hadoop_jar\nimport luigi.contrib.hdfs\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\ndef hadoop_examples_jar():\n    config = luigi.configuration.get_config()\n    examples_jar = config.get(\"hadoop\", \"examples-jar\")\n    if not examples_jar:\n        logger.error(\"You must specify hadoop:examples-jar in luigi.cfg\")\n        raise\n    if not os.path.exists(examples_jar):\n        logger.error(\"Can't find example jar: \" + examples_jar)\n        raise\n    return examples_jar\n\n\nDEFAULT_TERASORT_IN = \"/tmp/terasort-in\"\nDEFAULT_TERASORT_OUT = \"/tmp/terasort-out\"\n\n\nclass TeraGen(luigi.contrib.hadoop_jar.HadoopJarJobTask):\n    \"\"\"\n    Runs TeraGen, by default with 1TB of data (10B records)\n    \"\"\"\n\n    records = luigi.Parameter(default=\"10000000000\", description=\"Number of records, each record is 100 Bytes\")\n    terasort_in = luigi.Parameter(default=DEFAULT_TERASORT_IN, description=\"directory to store terasort input into.\")\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file in HDFS.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`~luigi.target.Target`)\n        \"\"\"\n        
return luigi.contrib.hdfs.HdfsTarget(self.terasort_in)\n\n    def jar(self):\n        return hadoop_examples_jar()\n\n    def main(self):\n        return \"teragen\"\n\n    def args(self):\n        # First arg is 10B -- each record is 100bytes\n        return [self.records, self.output()]\n\n\nclass TeraSort(luigi.contrib.hadoop_jar.HadoopJarJobTask):\n    \"\"\"\n    Runs TeraSort, using the output of TeraGen as its input.\n    \"\"\"\n\n    terasort_in = luigi.Parameter(default=DEFAULT_TERASORT_IN, description=\"directory to store terasort input into.\")\n    terasort_out = luigi.Parameter(default=DEFAULT_TERASORT_OUT, description=\"directory to store terasort output into.\")\n\n    def requires(self):\n        \"\"\"\n        This task's dependencies:\n\n        * :py:class:`~.TeraGen`\n\n        :return: object (:py:class:`luigi.task.Task`)\n        \"\"\"\n        return TeraGen(terasort_in=self.terasort_in)\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file in HDFS.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`~luigi.target.Target`)\n        \"\"\"\n        return luigi.contrib.hdfs.HdfsTarget(self.terasort_out)\n\n    def jar(self):\n        return hadoop_examples_jar()\n\n    def main(self):\n        return \"terasort\"\n\n    def args(self):\n        return [self.input(), self.output()]\n"
  },
  {
    "path": "examples/top_artists.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport random\nfrom collections import defaultdict\nfrom heapq import nlargest\n\nimport luigi\nimport luigi.contrib.hdfs\nimport luigi.contrib.postgres\nimport luigi.contrib.spark\n\n\nclass ExternalStreams(luigi.ExternalTask):\n    \"\"\"\n    Example of a possible external data dump\n\n    To depend on external targets (typically at the top of your dependency graph), you can define\n    an ExternalTask like this.\n    \"\"\"\n\n    date = luigi.DateParameter()\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, it expects a file to be present in HDFS.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.contrib.hdfs.HdfsTarget(self.date.strftime(\"data/streams_%Y-%m-%d.tsv\"))\n\n\nclass Streams(luigi.Task):\n    \"\"\"\n    Faked version right now, just generates bogus data.\n    \"\"\"\n\n    date = luigi.DateParameter()\n\n    def run(self):\n        \"\"\"\n        Generates bogus data and writes it into the :py:meth:`~.Streams.output` target.\n        \"\"\"\n        with self.output().open(\"w\") as output:\n            for _ in range(1000):\n                output.write(\"{} {} {}\\n\".format(random.randint(0, 999), random.randint(0, 999), random.randint(0, 999)))\n\n    
def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file in the local file system.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.LocalTarget(self.date.strftime(\"data/streams_%Y_%m_%d_faked.tsv\"))\n\n\nclass StreamsHdfs(Streams):\n    \"\"\"\n    This task performs the same work as :py:class:`~.Streams` but its output is written to HDFS.\n\n    This class uses :py:meth:`~.Streams.run` and\n    overrides :py:meth:`~.Streams.output` so redefine HDFS as its target.\n    \"\"\"\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file in HDFS.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.contrib.hdfs.HdfsTarget(self.date.strftime(\"data/streams_%Y_%m_%d_faked.tsv\"))\n\n\nclass AggregateArtists(luigi.Task):\n    \"\"\"\n    This task runs over the target data returned by :py:meth:`~/.Streams.output` and\n    writes the result into its :py:meth:`~.AggregateArtists.output` target (local file).\n    \"\"\"\n\n    date_interval = luigi.DateIntervalParameter()\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file on the local filesystem.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.LocalTarget(\"data/artist_streams_{}.tsv\".format(self.date_interval))\n\n    def requires(self):\n        \"\"\"\n        This task's dependencies:\n\n        * :py:class:`~.Streams`\n\n        :return: list of object (:py:class:`luigi.task.Task`)\n        \"\"\"\n 
       return [Streams(date) for date in self.date_interval]\n\n    def run(self):\n        artist_count = defaultdict(int)\n\n        for t in self.input():\n            with t.open(\"r\") as in_file:\n                for line in in_file:\n                    _, artist, track = line.strip().split()\n                    artist_count[artist] += 1\n\n        with self.output().open(\"w\") as out_file:\n            for artist, count in artist_count.items():\n                out_file.write(\"{}\\t{}\\n\".format(artist, count))\n\n\nclass AggregateArtistsSpark(luigi.contrib.spark.SparkSubmitTask):\n    \"\"\"\n    This task runs a :py:class:`luigi.contrib.spark.SparkSubmitTask` task\n    over each target data returned by :py:meth:`~/.StreamsHdfs.output` and\n    writes the result into its :py:meth:`~.AggregateArtistsSpark.output` target (a file in HDFS).\n    \"\"\"\n\n    date_interval = luigi.DateIntervalParameter()\n\n    \"\"\"\n    The Pyspark script to run.\n\n    For Spark applications written in Java or Scala, the name of a jar file should be supplied instead.\n    \"\"\"\n    app = \"top_artists_spark.py\"\n\n    \"\"\"\n    Address of the Spark cluster master. 
In this case, we are not using a cluster, but running\n    Spark in local mode.\n    \"\"\"\n    master = \"local[*]\"\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file in HDFS.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.contrib.hdfs.HdfsTarget(\"data/artist_streams_%s.tsv\" % self.date_interval)\n\n    def requires(self):\n        \"\"\"\n        This task's dependencies:\n\n        * :py:class:`~.StreamsHdfs`\n\n        :return: list of object (:py:class:`luigi.task.Task`)\n        \"\"\"\n        return [StreamsHdfs(date) for date in self.date_interval]\n\n    def app_options(self):\n        # :func:`~luigi.task.Task.input` returns the targets produced by the tasks in\n        # `~luigi.task.Task.requires`.\n        return [\",\".join([p.path for p in self.input()]), self.output().path]\n\n\nclass Top10Artists(luigi.Task):\n    \"\"\"\n    This task runs over the target data returned by :py:meth:`~/.AggregateArtists.output` or\n    :py:meth:`~/.AggregateArtistsSpark.output` in case :py:attr:`~/.Top10Artists.use_spark` is set and\n    writes the result into its :py:meth:`~.Top10Artists.output` target (a file in local filesystem).\n    \"\"\"\n\n    date_interval = luigi.DateIntervalParameter()\n    use_spark = luigi.BoolParameter()\n\n    def requires(self):\n        \"\"\"\n        This task's dependencies:\n\n        * :py:class:`~.AggregateArtists` or\n        * :py:class:`~.AggregateArtistsSpark` if :py:attr:`~/.Top10Artists.use_spark` is set.\n\n        :return: object (:py:class:`luigi.task.Task`)\n        \"\"\"\n        if self.use_spark:\n            return AggregateArtistsSpark(self.date_interval)\n        else:\n            return AggregateArtists(self.date_interval)\n\n    def output(self):\n        \"\"\"\n        Returns the 
target output for this task.\n        In this case, a successful execution of this task will create a file on the local filesystem.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.LocalTarget(\"data/top_artists_%s.tsv\" % self.date_interval)\n\n    def run(self):\n        top_10 = nlargest(10, self._input_iterator())\n        with self.output().open(\"w\") as out_file:\n            for streams, artist in top_10:\n                out_line = \"\\t\".join([str(self.date_interval.date_a), str(self.date_interval.date_b), artist, str(streams)])\n                out_file.write((out_line + \"\\n\"))\n\n    def _input_iterator(self):\n        with self.input().open(\"r\") as in_file:\n            for line in in_file:\n                artist, streams = line.strip().split()\n                yield int(streams), artist\n\n\nclass ArtistToplistToDatabase(luigi.contrib.postgres.CopyToTable):\n    \"\"\"\n    This task runs a :py:class:`luigi.contrib.postgres.CopyToTable` task\n    over the target data returned by :py:meth:`~/.Top10Artists.output` and\n    writes the result into its :py:meth:`~.ArtistToplistToDatabase.output` target which,\n    by default, is :py:class:`luigi.contrib.postgres.PostgresTarget` (a table in PostgreSQL).\n\n    This class uses :py:meth:`luigi.contrib.postgres.CopyToTable.run`\n    and :py:meth:`luigi.contrib.postgres.CopyToTable.output`.\n    \"\"\"\n\n    date_interval = luigi.DateIntervalParameter()\n    use_spark = luigi.BoolParameter()\n\n    host = \"localhost\"\n    database = \"toplists\"\n    user = \"luigi\"\n    password = \"abc123\"  # ;)\n    table = \"top10\"\n\n    columns = [(\"date_from\", \"DATE\"), (\"date_to\", \"DATE\"), (\"artist\", \"TEXT\"), (\"streams\", \"INT\")]\n\n    def requires(self):\n        \"\"\"\n        This task's dependencies:\n\n        * :py:class:`~.Top10Artists`\n\n        :return: list of object 
(:py:class:`luigi.task.Task`)\n        \"\"\"\n        return Top10Artists(self.date_interval, self.use_spark)\n\n\nif __name__ == \"__main__\":\n    luigi.run()\n"
  },
  {
    "path": "examples/top_artists_spark.py",
    "content": "# -*- coding: utf-8 -*-\n\nimport operator\nimport sys\n\nfrom pyspark.sql import SparkSession\n\n\ndef main(argv):\n    input_paths = argv[1].split(\",\")\n    output_path = argv[2]\n\n    spark = SparkSession.builder.getOrCreate()\n\n    streams = spark.read.option(\"sep\", \"\\t\").csv(input_paths[0])\n    for stream_path in input_paths[1:]:\n        streams.union(spark.read.option(\"sep\", \"\\t\").csv(stream_path))\n\n    # The second field is the artist\n    counts = streams.map(lambda row: (row[1], 1)).reduceByKey(operator.add)\n\n    counts.write.option(\"sep\", \"\\t\").csv(output_path)\n\n\nif __name__ == \"__main__\":\n    sys.exit(main(sys.argv))\n"
  },
  {
    "path": "examples/wordcount.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport luigi\n\n\nclass InputText(luigi.ExternalTask):\n    \"\"\"\n    This class represents something that was created elsewhere by an external process,\n    so all we want to do is to implement the output method.\n    \"\"\"\n\n    date = luigi.DateParameter()\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, it expects a file to be present in the local file system.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.LocalTarget(self.date.strftime(\"/var/tmp/text/%Y-%m-%d.txt\"))\n\n\nclass WordCount(luigi.Task):\n    date_interval = luigi.DateIntervalParameter()\n\n    def requires(self):\n        \"\"\"\n        This task's dependencies:\n\n        * :py:class:`~.InputText`\n\n        :return: list of object (:py:class:`luigi.task.Task`)\n        \"\"\"\n        return [InputText(date) for date in self.date_interval.dates()]\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file on the local filesystem.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return 
luigi.LocalTarget(\"/var/tmp/text-count/%s\" % self.date_interval)\n\n    def run(self):\n        \"\"\"\n        1. count the words for each of the :py:meth:`~.InputText.output` targets created by :py:class:`~.InputText`\n        2. write the count into the :py:meth:`~.WordCount.output` target\n        \"\"\"\n        count = {}\n\n        # NOTE: self.input() actually returns an element for the InputText.output() target\n        for f in self.input():  # The input() method is a wrapper around requires() that returns Target objects\n            for line in f.open(\"r\"):  # Target objects are a file system/format abstraction and this will return a file stream object\n                for word in line.strip().split():\n                    count[word] = count.get(word, 0) + 1\n\n        # output data\n        f = self.output().open(\"w\")\n        for word, count in count.items():\n            f.write(\"%s\\t%d\\n\" % (word, count))\n        f.close()  # WARNING: file system operations are atomic therefore if you don't close the file you lose all data\n"
  },
  {
    "path": "examples/wordcount_hadoop.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport luigi\nimport luigi.contrib.hadoop\nimport luigi.contrib.hdfs\n\n# To make this run, you probably want to edit /etc/luigi/client.cfg and add something like:\n#\n# [hadoop]\n# jar: /usr/lib/hadoop-xyz/hadoop-streaming-xyz-123.jar\n\n\nclass InputText(luigi.ExternalTask):\n    \"\"\"\n    This task is a :py:class:`luigi.task.ExternalTask` which means it doesn't generate the\n    :py:meth:`~.InputText.output` target on its own instead relying on the execution something outside of Luigi\n    to produce it.\n    \"\"\"\n\n    date = luigi.DateParameter()\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, it expects a file to be present in HDFS.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.contrib.hdfs.HdfsTarget(self.date.strftime(\"/tmp/text/%Y-%m-%d.txt\"))\n\n\nclass WordCount(luigi.contrib.hadoop.JobTask):\n    \"\"\"\n    This task runs a :py:class:`luigi.contrib.hadoop.JobTask`\n    over the target data returned by :py:meth:`~/.InputText.output` and\n    writes the result into its :py:meth:`~.WordCount.output` target.\n\n    This class uses :py:meth:`luigi.contrib.hadoop.JobTask.run`.\n    \"\"\"\n\n    date_interval = luigi.DateIntervalParameter()\n\n    def 
requires(self):\n        \"\"\"\n        This task's dependencies:\n\n        * :py:class:`~.InputText`\n\n        :return: list of object (:py:class:`luigi.task.Task`)\n        \"\"\"\n        return [InputText(date) for date in self.date_interval.dates()]\n\n    def output(self):\n        \"\"\"\n        Returns the target output for this task.\n        In this case, a successful execution of this task will create a file in HDFS.\n\n        :return: the target output for this task.\n        :rtype: object (:py:class:`luigi.target.Target`)\n        \"\"\"\n        return luigi.contrib.hdfs.HdfsTarget(\"/tmp/text-count/%s\" % self.date_interval)\n\n    def mapper(self, line):\n        for word in line.strip().split():\n            yield word, 1\n\n    def reducer(self, key, values):\n        yield key, sum(values)\n\n\nif __name__ == \"__main__\":\n    luigi.run()\n"
  },
  {
    "path": "luigi/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nPackage containing core luigi functionality.\n\"\"\"\n\nfrom luigi import configuration, event, interface, local_target, parameter, rpc, target, task\nfrom luigi.__version__ import VERSION\nfrom luigi.event import Event\nfrom luigi.execution_summary import LuigiStatusCode\nfrom luigi.interface import build, run\nfrom luigi.local_target import LocalTarget\nfrom luigi.parameter import (\n    BoolParameter,\n    ChoiceListParameter,\n    ChoiceParameter,\n    DateHourParameter,\n    DateIntervalParameter,\n    DateMinuteParameter,\n    DateParameter,\n    DateSecondParameter,\n    DictParameter,\n    EnumListParameter,\n    EnumParameter,\n    FloatParameter,\n    IntParameter,\n    ListParameter,\n    MonthParameter,\n    NumericalParameter,\n    OptionalBoolParameter,\n    OptionalChoiceParameter,\n    OptionalDictParameter,\n    OptionalFloatParameter,\n    OptionalIntParameter,\n    OptionalListParameter,\n    OptionalNumericalParameter,\n    OptionalParameter,\n    OptionalPathParameter,\n    OptionalStrParameter,\n    OptionalTupleParameter,\n    Parameter,\n    PathParameter,\n    StrParameter,\n    TaskParameter,\n    TimeDeltaParameter,\n    TupleParameter,\n    YearParameter,\n)\nfrom luigi.rpc import RemoteScheduler, RPCError\nfrom luigi.target import Target\nfrom luigi.task import (\n    Config,\n    
DynamicRequirements,\n    ExternalTask,\n    Task,\n    WrapperTask,\n    auto_namespace,\n    namespace,\n)\n\n__version__ = VERSION\n__all__ = [\n    \"task\",\n    \"Task\",\n    \"Config\",\n    \"ExternalTask\",\n    \"WrapperTask\",\n    \"namespace\",\n    \"auto_namespace\",\n    \"DynamicRequirements\",\n    \"target\",\n    \"Target\",\n    \"LocalTarget\",\n    \"rpc\",\n    \"RemoteScheduler\",\n    \"RPCError\",\n    \"parameter\",\n    \"Parameter\",\n    \"DateParameter\",\n    \"MonthParameter\",\n    \"YearParameter\",\n    \"DateHourParameter\",\n    \"DateMinuteParameter\",\n    \"DateSecondParameter\",\n    \"DateIntervalParameter\",\n    \"TimeDeltaParameter\",\n    \"StrParameter\",\n    \"IntParameter\",\n    \"FloatParameter\",\n    \"BoolParameter\",\n    \"PathParameter\",\n    \"TaskParameter\",\n    \"ListParameter\",\n    \"TupleParameter\",\n    \"EnumParameter\",\n    \"DictParameter\",\n    \"EnumListParameter\",\n    \"configuration\",\n    \"interface\",\n    \"local_target\",\n    \"run\",\n    \"build\",\n    \"event\",\n    \"Event\",\n    \"NumericalParameter\",\n    \"ChoiceParameter\",\n    \"ChoiceListParameter\",\n    \"OptionalParameter\",\n    \"OptionalStrParameter\",\n    \"OptionalIntParameter\",\n    \"OptionalFloatParameter\",\n    \"OptionalBoolParameter\",\n    \"OptionalPathParameter\",\n    \"OptionalDictParameter\",\n    \"OptionalListParameter\",\n    \"OptionalTupleParameter\",\n    \"OptionalChoiceParameter\",\n    \"OptionalNumericalParameter\",\n    \"LuigiStatusCode\",\n    \"__version__\",\n]\n\nif not configuration.get_config().has_option(\"core\", \"autoload_range\"):\n    import warnings\n\n    warning_message = \"\"\"\n        Autoloading range tasks by default has been deprecated and will be removed in a future version.\n        To get the behavior now add an option to luigi.cfg:\n\n          [core]\n            autoload_range: false\n\n        Alternately set the option to true to continue with 
existing behaviour and suppress this warning.\n    \"\"\"\n    warnings.warn(warning_message, DeprecationWarning)\n\nif configuration.get_config().getboolean(\"core\", \"autoload_range\", True):\n    from .tools import range  # noqa: F401    just makes the tool classes available from command line\n\n    __all__.append(\"range\")\n"
  },
  {
    "path": "luigi/__main__.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2016 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nfrom luigi.cmdline import luigi_run\n\nif __name__ == \"__main__\":\n    luigi_run()\n"
  },
  {
    "path": "luigi/__version__.py",
    "content": "# coding: utf-8\n\nVERSION = \"3.8.0\"\n"
  },
  {
    "path": "luigi/batch_notifier.py",
    "content": "\"\"\"\nLibrary for sending batch notifications from the Luigi scheduler. This module\nis internal to Luigi and not designed for use in other contexts.\n\"\"\"\n\nimport collections\nimport time\nfrom datetime import datetime\n\nimport luigi\nimport luigi.parameter\nimport luigi.task\nfrom luigi.notifications import email, send_email\n\n\nclass batch_email(luigi.task.Config):\n    email_interval = luigi.parameter.IntParameter(\n        default=60,\n        config_path=dict(section=\"batch-notifier\", name=\"email-interval-minutes\"),\n        description=\"Number of minutes between e-mail sends (default: 60)\",\n    )\n    batch_mode = luigi.parameter.ChoiceParameter(\n        default=\"unbatched_params\",\n        choices=(\"family\", \"all\", \"unbatched_params\"),\n        description='Method used for batching failures in e-mail. If \"family\" all failures for '\n        'tasks with the same family will be batched. If \"unbatched_params\", all '\n        \"failures for tasks with the same family and non-batched parameters will be \"\n        'batched. If \"all\", tasks will only be batched if they have identical names.',\n    )\n    error_lines = luigi.parameter.IntParameter(default=20, description=\"Number of lines to show from each error message. 
0 means show all\")\n    error_messages = luigi.parameter.IntParameter(default=1, description=\"Number of error messages to show for each group\")\n    group_by_error_messages = luigi.parameter.BoolParameter(default=True, description=\"Group items with the same error messages together\")\n\n\nclass ExplQueue(collections.OrderedDict):\n    def __init__(self, num_items):\n        self.num_items = num_items\n        super(ExplQueue, self).__init__()\n\n    def enqueue(self, item):\n        self.pop(item, None)\n        self[item] = datetime.now()\n        if len(self) > self.num_items:\n            self.popitem(last=False)  # pop first item if past length\n\n\ndef _fail_queue(num_messages):\n    return lambda: collections.defaultdict(lambda: ExplQueue(num_messages))\n\n\ndef _plural_format(template, number, plural=\"s\"):\n    if number == 0:\n        return \"\"\n    return template.format(number, \"\" if number == 1 else plural)\n\n\nclass BatchNotifier:\n    def __init__(self, **kwargs):\n        self._config = batch_email(**kwargs)\n        self._fail_counts = collections.defaultdict(collections.Counter)\n        self._disabled_counts = collections.defaultdict(collections.Counter)\n        self._scheduling_fail_counts = collections.defaultdict(collections.Counter)\n        self._fail_expls = collections.defaultdict(_fail_queue(self._config.error_messages))\n        self._update_next_send()\n\n        self._email_format = email().format\n        if email().receiver:\n            self._default_owner = set(filter(None, email().receiver.split(\",\")))\n        else:\n            self._default_owner = set()\n\n    def _update_next_send(self):\n        self._next_send = time.time() + 60 * self._config.email_interval\n\n    def _key(self, task_name, family, unbatched_args):\n        if self._config.batch_mode == \"all\":\n            return task_name\n        elif self._config.batch_mode == \"family\":\n            return family\n        elif self._config.batch_mode == 
\"unbatched_params\":\n            param_str = \", \".join(\"{}={}\".format(k, v) for k, v in unbatched_args.items())\n            return \"{}({})\".format(family, param_str)\n        else:\n            raise ValueError(\"Unknown batch mode for batch notifier: {}\".format(self._config.batch_mode))\n\n    def _format_expl(self, expl):\n        lines = expl.rstrip().split(\"\\n\")[-self._config.error_lines :]\n        if self._email_format == \"html\":\n            return \"<pre>{}</pre>\".format(\"\\n\".join(lines))\n        else:\n            return \"\\n{}\".format(\"\\n\".join(map(\"      {}\".format, lines)))\n\n    def _expl_body(self, expls):\n        lines = [self._format_expl(expl) for expl in expls]\n        if lines and self._email_format != \"html\":\n            lines.append(\"\")\n        return \"\\n\".join(lines)\n\n    def _format_task(self, task_tuple):\n        task, failure_count, disable_count, scheduling_count = task_tuple\n        counts = [\n            _plural_format(\"{} failure{}\", failure_count),\n            _plural_format(\"{} disable{}\", disable_count),\n            _plural_format(\"{} scheduling failure{}\", scheduling_count),\n        ]\n        count_str = \", \".join(filter(None, counts))\n        return \"{} ({})\".format(task, count_str)\n\n    def _format_tasks(self, tasks):\n        lines = map(self._format_task, sorted(tasks, key=self._expl_key))\n        if self._email_format == \"html\":\n            return \"<li>{}\".format(\"\\n<br>\".join(lines))\n        else:\n            return \"- {}\".format(\"\\n  \".join(lines))\n\n    def _owners(self, owners):\n        return self._default_owner | set(owners)\n\n    def add_failure(self, task_name, family, unbatched_args, expl, owners):\n        key = self._key(task_name, family, unbatched_args)\n        for owner in self._owners(owners):\n            self._fail_counts[owner][key] += 1\n            self._fail_expls[owner][key].enqueue(expl)\n\n    def add_disable(self, 
task_name, family, unbatched_args, owners):\n        key = self._key(task_name, family, unbatched_args)\n        for owner in self._owners(owners):\n            self._disabled_counts[owner][key] += 1\n            self._fail_counts[owner].setdefault(key, 0)\n\n    def add_scheduling_fail(self, task_name, family, unbatched_args, expl, owners):\n        key = self._key(task_name, family, unbatched_args)\n        for owner in self._owners(owners):\n            self._scheduling_fail_counts[owner][key] += 1\n            self._fail_expls[owner][key].enqueue(expl)\n            self._fail_counts[owner].setdefault(key, 0)\n\n    def _task_expl_groups(self, expls):\n        if not self._config.group_by_error_messages:\n            return [((task,), msg) for task, msg in expls.items()]\n\n        groups = collections.defaultdict(list)\n        for task, msg in expls.items():\n            groups[msg].append(task)\n        return [(tasks, msg) for msg, tasks in groups.items()]\n\n    def _expls_key(self, expls_tuple):\n        expls = expls_tuple[0]\n        num_failures = sum(failures + scheduling_fails for (_1, failures, _2, scheduling_fails) in expls)\n        num_disables = sum(disables for (_1, _2, disables, _3) in expls)\n        min_name = min(expls)[0]\n        return -num_failures, -num_disables, min_name\n\n    def _expl_key(self, expl):\n        return self._expls_key(((expl,), None))\n\n    def _email_body(self, fail_counts, disable_counts, scheduling_counts, fail_expls):\n        expls = {\n            (name, fail_count, disable_counts[name], scheduling_counts[name]): self._expl_body(fail_expls[name]) for name, fail_count in fail_counts.items()\n        }\n        expl_groups = sorted(self._task_expl_groups(expls), key=self._expls_key)\n        body_lines = []\n        for tasks, msg in expl_groups:\n            body_lines.append(self._format_tasks(tasks))\n            body_lines.append(msg)\n        body = \"\\n\".join(filter(None, body_lines)).rstrip()\n        if 
self._email_format == \"html\":\n            return \"<ul>\\n{}\\n</ul>\".format(body)\n        else:\n            return body\n\n    def _send_email(self, fail_counts, disable_counts, scheduling_counts, fail_expls, owner):\n        num_failures = sum(fail_counts.values())\n        num_disables = sum(disable_counts.values())\n        num_scheduling_failures = sum(scheduling_counts.values())\n        subject_parts = [\n            _plural_format(\"{} failure{}\", num_failures),\n            _plural_format(\"{} disable{}\", num_disables),\n            _plural_format(\"{} scheduling failure{}\", num_scheduling_failures),\n        ]\n        subject_base = \", \".join(filter(None, subject_parts))\n        if subject_base:\n            prefix = \"\" if owner in self._default_owner else \"Your tasks have \"\n            subject = \"Luigi: {}{} in the last {} minutes\".format(prefix, subject_base, self._config.email_interval)\n            email_body = self._email_body(fail_counts, disable_counts, scheduling_counts, fail_expls)\n            send_email(subject, email_body, email().sender, (owner,))\n\n    def send_email(self):\n        try:\n            for owner, failures in self._fail_counts.items():\n                self._send_email(\n                    fail_counts=failures,\n                    disable_counts=self._disabled_counts[owner],\n                    scheduling_counts=self._scheduling_fail_counts[owner],\n                    fail_expls=self._fail_expls[owner],\n                    owner=owner,\n                )\n        finally:\n            self._update_next_send()\n            self._fail_counts.clear()\n            self._disabled_counts.clear()\n            self._scheduling_fail_counts.clear()\n            self._fail_expls.clear()\n\n    def update(self):\n        if time.time() >= self._next_send:\n            self.send_email()\n"
  },
  {
    "path": "luigi/cmdline.py",
    "content": "import argparse\nimport sys\n\nfrom luigi.retcodes import run_with_retcodes\nfrom luigi.setup_logging import DaemonLogging\n\n\ndef luigi_run(argv=sys.argv[1:]):\n    run_with_retcodes(argv)\n\n\ndef luigid(argv=sys.argv[1:]):\n    import luigi.configuration\n    import luigi.process\n    import luigi.server\n\n    parser = argparse.ArgumentParser(description=\"Central luigi server\")\n    parser.add_argument(\"--background\", help=\"Run in background mode\", action=\"store_true\")\n    parser.add_argument(\"--pidfile\", help=\"Write pidfile\")\n    parser.add_argument(\"--logdir\", help=\"log directory\")\n    parser.add_argument(\"--state-path\", help=\"Pickled state file\")\n    parser.add_argument(\"--address\", help=\"Listening interface\")\n    parser.add_argument(\"--unix-socket\", help=\"Unix socket path\")\n    parser.add_argument(\"--port\", default=8082, help=\"Listening port\")\n\n    opts = parser.parse_args(argv)\n\n    if opts.state_path:\n        config = luigi.configuration.get_config()\n        config.set(\"scheduler\", \"state_path\", opts.state_path)\n\n    DaemonLogging.setup(opts)\n    if opts.background:\n        luigi.process.daemonize(\n            luigi.server.run, api_port=opts.port, address=opts.address, pidfile=opts.pidfile, logdir=opts.logdir, unix_socket=opts.unix_socket\n        )\n    else:\n        luigi.server.run(api_port=opts.port, address=opts.address, unix_socket=opts.unix_socket)\n"
  },
  {
    "path": "luigi/cmdline_parser.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nThis module contains luigi internal parsing logic. Things exposed here should\nbe considered internal to luigi.\n\"\"\"\n\nimport argparse\nimport sys\nfrom contextlib import contextmanager\n\nfrom luigi.task_register import Register\n\n\nclass CmdlineParser:\n    \"\"\"\n    Helper for parsing command line arguments and used as part of the\n    context when instantiating task objects.\n\n    Normal luigi users should just use :py:func:`luigi.run`.\n    \"\"\"\n\n    _instance = None\n\n    @classmethod\n    def get_instance(cls):\n        \"\"\"Singleton getter\"\"\"\n        return cls._instance\n\n    @classmethod\n    @contextmanager\n    def global_instance(cls, cmdline_args, allow_override=False):\n        \"\"\"\n        Meant to be used as a context manager.\n        \"\"\"\n        orig_value = cls._instance\n        assert (orig_value is None) or allow_override\n        new_value = None\n        try:\n            new_value = CmdlineParser(cmdline_args)\n            cls._instance = new_value\n            yield new_value\n        finally:\n            assert cls._instance is new_value\n            cls._instance = orig_value\n\n    def __init__(self, cmdline_args):\n        \"\"\"\n        Initialize cmd line args\n        \"\"\"\n        known_args, _ = self._build_parser().parse_known_args(args=cmdline_args)\n        
self._attempt_load_module(known_args)\n        # We have to parse again now. As the positionally first unrecognized\n        # argument (the task) could be different.\n        known_args, _ = self._build_parser().parse_known_args(args=cmdline_args)\n        root_task = known_args.root_task\n        parser = self._build_parser(root_task=root_task, help_all=known_args.core_help_all)\n        self._possibly_exit_with_help(parser, known_args)\n        if not root_task:\n            raise SystemExit(\"No task specified\")\n        else:\n            # Check that what we believe to be the task is correctly spelled\n            Register.get_task_cls(root_task)\n        known_args = parser.parse_args(args=cmdline_args)\n        self.known_args = known_args  # Also publicly expose parsed arguments\n\n    @staticmethod\n    def _build_parser(root_task=None, help_all=False):\n        parser = argparse.ArgumentParser(add_help=False)\n\n        # Unfortunately, we have to set it as optional to argparse, so we can\n        # parse out stuff like `--module` before we call for `--help`.\n        parser.add_argument(\n            \"root_task\",\n            nargs=\"?\",\n            help=\"Task family to run. 
Is not optional.\",\n            metavar=\"Required root task\",\n        )\n\n        for task_name, is_without_section, param_name, param_obj in Register.get_all_params():\n            is_the_root_task = task_name == root_task\n            help = param_obj.description if any((is_the_root_task, help_all, param_obj.always_in_help)) else argparse.SUPPRESS\n            flag_name_underscores = param_name if is_without_section else task_name + \"_\" + param_name\n            global_flag_name = \"--\" + flag_name_underscores.replace(\"_\", \"-\")\n            parser.add_argument(global_flag_name, help=help, **param_obj._parser_kwargs(param_name, task_name))\n            if is_the_root_task:\n                local_flag_name = \"--\" + param_name.replace(\"_\", \"-\")\n                parser.add_argument(local_flag_name, help=help, **param_obj._parser_kwargs(param_name))\n\n        return parser\n\n    def get_task_obj(self):\n        \"\"\"\n        Get the task object\n        \"\"\"\n        return self._get_task_cls()(**self._get_task_kwargs())\n\n    def _get_task_cls(self):\n        \"\"\"\n        Get the task class\n        \"\"\"\n        return Register.get_task_cls(self.known_args.root_task)\n\n    def _get_task_kwargs(self):\n        \"\"\"\n        Get the local task arguments as a dictionary. 
The return value is in\n        the form ``dict(my_param='my_value', ...)``\n        \"\"\"\n        res = {}\n        for param_name, param_obj in self._get_task_cls().get_params():\n            attr = getattr(self.known_args, param_name)\n            if attr:\n                res.update(((param_name, param_obj.parse(attr)),))\n\n        return res\n\n    @staticmethod\n    def _attempt_load_module(known_args):\n        \"\"\"\n        Load the --module parameter\n        \"\"\"\n        module = known_args.core_module\n        if module:\n            __import__(module)\n\n    @staticmethod\n    def _possibly_exit_with_help(parser, known_args):\n        \"\"\"\n        Check if the user passed --help[-all], if so, print a message and exit.\n        \"\"\"\n        if known_args.core_help or known_args.core_help_all:\n            parser.print_help()\n            sys.exit()\n"
  },
  {
    "path": "luigi/configuration/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nfrom .cfg_parser import LuigiConfigParser\nfrom .core import add_config_path, get_config\nfrom .toml_parser import LuigiTomlParser\n\n__all__ = [\n    \"add_config_path\",\n    \"get_config\",\n    \"LuigiConfigParser\",\n    \"LuigiTomlParser\",\n]\n"
  },
  {
    "path": "luigi/configuration/base_parser.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport logging\n\n\n# IMPORTANT: don't inherit from `object`!\n# ConfigParser has some trouble in this case.\n# More info: https://stackoverflow.com/a/19323238\nclass BaseParser:\n    @classmethod\n    def instance(cls, *args, **kwargs):\n        \"\"\"Singleton getter\"\"\"\n        if cls._instance is None:\n            cls._instance = cls(*args, **kwargs)\n            loaded = cls._instance.reload()\n            logging.getLogger(\"luigi-interface\").info(\"Loaded %r\", loaded)\n\n        return cls._instance\n\n    @classmethod\n    def add_config_path(cls, path):\n        cls._config_paths.append(path)\n        cls.reload()\n\n    @classmethod\n    def reload(cls):\n        return cls.instance().read(cls._config_paths)\n"
  },
  {
    "path": "luigi/configuration/cfg_parser.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nluigi.configuration provides some convenience wrappers around Python's\nConfigParser to get configuration options from config files.\n\nThe default location for configuration files is luigi.cfg (or client.cfg) in the current\nworking directory, then /etc/luigi/client.cfg.\n\nConfiguration has largely been superseded by parameters since they can\ndo essentially everything configuration can do, plus a tighter integration\nwith the rest of Luigi.\n\nSee :doc:`/configuration` for more info.\n\"\"\"\n\nimport os\nimport re\nimport warnings\nfrom configparser import BasicInterpolation, ConfigParser, Interpolation, InterpolationError, NoOptionError, NoSectionError\n\nfrom .base_parser import BaseParser\n\n\nclass InterpolationMissingEnvvarError(InterpolationError):\n    \"\"\"\n    Raised when option value refers to a nonexisting environment variable.\n    \"\"\"\n\n    def __init__(self, option, section, value, envvar):\n        msg = (\"Config refers to a nonexisting environment variable {}. 
Section [{}], option {}={}\").format(envvar, section, option, value)\n        InterpolationError.__init__(self, option, section, msg)\n\n\nclass EnvironmentInterpolation(Interpolation):\n    \"\"\"\n    Custom interpolation which allows values to refer to environment variables\n    using the ``${ENVVAR}`` syntax.\n    \"\"\"\n\n    _ENVRE = re.compile(r\"\\$\\{([^}]+)\\}\")  # matches \"${envvar}\"\n\n    def before_get(self, parser, section, option, value, defaults):\n        return self._interpolate_env(option, section, value)\n\n    def _interpolate_env(self, option, section, value):\n        rawval = value\n        parts = []\n        while value:\n            match = self._ENVRE.search(value)\n            if match is None:\n                parts.append(value)\n                break\n            envvar = match.groups()[0]\n            try:\n                envval = os.environ[envvar]\n            except KeyError:\n                raise InterpolationMissingEnvvarError(option, section, rawval, envvar)\n            start, end = match.span()\n            parts.append(value[:start])\n            parts.append(envval)\n            value = value[end:]\n        return \"\".join(parts)\n\n\nclass CombinedInterpolation(Interpolation):\n    \"\"\"\n    Custom interpolation which applies multiple interpolations in series.\n\n    :param interpolations: a sequence of configparser.Interpolation objects.\n    \"\"\"\n\n    def __init__(self, interpolations):\n        self._interpolations = interpolations\n\n    def before_get(self, parser, section, option, value, defaults):\n        for interp in self._interpolations:\n            value = interp.before_get(parser, section, option, value, defaults)\n        return value\n\n    def before_read(self, parser, section, option, value):\n        for interp in self._interpolations:\n            value = interp.before_read(parser, section, option, value)\n        return value\n\n    def before_set(self, parser, section, option, value):\n 
       for interp in self._interpolations:\n            value = interp.before_set(parser, section, option, value)\n        return value\n\n    def before_write(self, parser, section, option, value):\n        for interp in self._interpolations:\n            value = interp.before_write(parser, section, option, value)\n        return value\n\n\nclass LuigiConfigParser(BaseParser, ConfigParser):\n    NO_DEFAULT = object()\n    enabled = True\n    optionxform = str  # type: ignore\n    _instance = None\n    _config_paths = [\n        \"/etc/luigi/client.cfg\",  # Deprecated old-style global luigi config\n        \"/etc/luigi/luigi.cfg\",\n        \"client.cfg\",  # Deprecated old-style local luigi config\n        \"luigi.cfg\",\n    ]\n    _DEFAULT_INTERPOLATION = CombinedInterpolation([BasicInterpolation(), EnvironmentInterpolation()])\n\n    @classmethod\n    def reload(cls):\n        # Warn about deprecated old-style config paths.\n        deprecated_paths = [p for p in cls._config_paths if os.path.basename(p) == \"client.cfg\" and os.path.exists(p)]\n        if deprecated_paths:\n            warnings.warn(\n                \"Luigi configuration files named 'client.cfg' are deprecated if favor of 'luigi.cfg'. 
\" + \"Found: {paths!r}\".format(paths=deprecated_paths),\n                DeprecationWarning,\n            )\n\n        return cls.instance().read(cls._config_paths)\n\n    def _get_with_default(self, method, section, option, default, expected_type=None, **kwargs):\n        \"\"\"\n        Gets the value of the section/option using method.\n\n        Returns default if value is not found.\n\n        Raises an exception if the default value is not None and doesn't match the expected_type.\n        \"\"\"\n        try:\n            try:\n                # Underscore-style is the recommended configuration style\n                option = option.replace(\"-\", \"_\")\n                return method(self, section, option, **kwargs)\n            except (NoOptionError, NoSectionError):\n                # Support dash-style option names (with deprecation warning).\n                option_alias = option.replace(\"_\", \"-\")\n                value = method(self, section, option_alias, **kwargs)\n                warn = \"Configuration [{s}] {o} (with dashes) should be avoided. Please use underscores: {u}.\".format(s=section, o=option_alias, u=option)\n                warnings.warn(warn, DeprecationWarning)\n                return value\n        except (NoOptionError, NoSectionError):\n            if default is LuigiConfigParser.NO_DEFAULT:\n                raise\n            if expected_type is not None and default is not None and not isinstance(default, expected_type):\n                raise\n            return default\n\n    def has_option(self, section, option):\n        \"\"\"modified has_option\n        Check for the existence of a given option in a given section. If the\n        specified 'section' is None or an empty string, DEFAULT is assumed. 
If\n        the specified 'section' does not exist, returns False.\n        \"\"\"\n\n        # Underscore-style is the recommended configuration style\n        option = option.replace(\"-\", \"_\")\n        if ConfigParser.has_option(self, section, option):\n            return True\n\n        # Support dash-style option names (with deprecation warning).\n        option_alias = option.replace(\"_\", \"-\")\n        if ConfigParser.has_option(self, section, option_alias):\n            warn = \"Configuration [{s}] {o} (with dashes) should be avoided. Please use underscores: {u}.\".format(s=section, o=option_alias, u=option)\n            warnings.warn(warn, DeprecationWarning)\n            return True\n\n        return False\n\n    def get(self, section, option, default=NO_DEFAULT, **kwargs):\n        return self._get_with_default(ConfigParser.get, section, option, default, **kwargs)\n\n    def getboolean(self, section, option, default=NO_DEFAULT):\n        return self._get_with_default(ConfigParser.getboolean, section, option, default, bool)\n\n    def getint(self, section, option, default=NO_DEFAULT):\n        return self._get_with_default(ConfigParser.getint, section, option, default, int)\n\n    def getfloat(self, section, option, default=NO_DEFAULT):\n        return self._get_with_default(ConfigParser.getfloat, section, option, default, float)\n\n    def getintdict(self, section):\n        try:\n            # Exclude keys from [DEFAULT] section because in general they do not hold int values\n            return dict((key, int(value)) for key, value in self.items(section) if key not in {k for k, _ in self.items(\"DEFAULT\")})\n        except NoSectionError:\n            return {}\n\n    def set(self, section, option, value=None):\n        if not ConfigParser.has_section(self, section):\n            ConfigParser.add_section(self, section)\n\n        return ConfigParser.set(self, section, option, value)\n"
  },
  {
    "path": "luigi/configuration/core.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport logging\nimport os\nimport warnings\n\nfrom .cfg_parser import LuigiConfigParser\nfrom .toml_parser import LuigiTomlParser\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nPARSERS = {\n    \"cfg\": LuigiConfigParser,\n    \"conf\": LuigiConfigParser,\n    \"ini\": LuigiConfigParser,\n    \"toml\": LuigiTomlParser,\n}\n\nDEFAULT_PARSER = \"cfg\"\n\n\ndef _get_default_parser():\n    parser = os.environ.get(\"LUIGI_CONFIG_PARSER\", DEFAULT_PARSER)\n    if parser not in PARSERS:\n        warnings.warn(\"Invalid parser: {parser}\".format(parser=DEFAULT_PARSER))\n        parser = DEFAULT_PARSER\n    return parser\n\n\ndef _check_parser(parser_class, parser):\n    if not parser_class.enabled:\n        msg = \"Parser not installed yet. 
Please, install luigi with required parser:\\npip install luigi[{parser}]\"\n        raise ImportError(msg.format(parser=parser))\n\n\ndef get_config(parser=None):\n    \"\"\"Get configs singleton for parser\"\"\"\n    if parser is None:\n        parser = _get_default_parser()\n    parser_class = PARSERS[parser]\n    _check_parser(parser_class, parser)\n    return parser_class.instance()\n\n\ndef add_config_path(path):\n    \"\"\"Select config parser by file extension and add path into parser.\"\"\"\n    if not os.path.isfile(path):\n        warnings.warn(\"Config file does not exist: {path}\".format(path=path))\n        return False\n\n    # select parser by file extension\n    default_parser = _get_default_parser()\n    _base, ext = os.path.splitext(path)\n    if ext and ext[1:] in PARSERS:\n        parser = ext[1:]\n    else:\n        parser = default_parser\n    parser_class = PARSERS[parser]\n\n    _check_parser(parser_class, parser)\n    if parser != default_parser:\n        msg = \"Config for {added} parser added, but used {used} parser. Set up right parser via env var: export LUIGI_CONFIG_PARSER={added}\"\n        warnings.warn(msg.format(added=parser, used=default_parser))\n\n    # add config path to parser\n    parser_class.add_config_path(path)\n    return True\n\n\nif \"LUIGI_CONFIG_PATH\" in os.environ:\n    add_config_path(os.environ[\"LUIGI_CONFIG_PATH\"])\n"
  },
  {
    "path": "luigi/configuration/toml_parser.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2018 Vote Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport os.path\nfrom configparser import ConfigParser\nfrom typing import Any, Dict\n\ntry:\n    import toml\n\n    toml_enabled = True\nexcept ImportError:\n    toml_enabled = False\n\nfrom ..freezing import recursively_freeze\nfrom .base_parser import BaseParser\n\n\nclass LuigiTomlParser(BaseParser, ConfigParser):\n    NO_DEFAULT = object()\n    enabled = bool(toml_enabled)\n    data: Dict[str, Any] = dict()\n    _instance = None\n    _config_paths = [\n        \"/etc/luigi/luigi.toml\",\n        \"luigi.toml\",\n    ]\n\n    @staticmethod\n    def _update_data(data, new_data):\n        if not new_data:\n            return data\n        if not data:\n            return new_data\n        for section, content in new_data.items():\n            if section not in data:\n                data[section] = dict()\n            data[section].update(content)\n        return data\n\n    def read(self, config_paths):\n        self.data = dict()\n        for path in config_paths:\n            if os.path.isfile(path):\n                self.data = self._update_data(self.data, toml.load(path))\n\n        # freeze dict params\n        for section, content in self.data.items():\n            for key, value in content.items():\n                if isinstance(value, dict):\n                    self.data[section][key] = recursively_freeze(value)\n\n        return 
self.data\n\n    def get(self, section, option, default=NO_DEFAULT, **kwargs):\n        try:\n            return self.data[section][option]\n        except KeyError:\n            if default is self.NO_DEFAULT:\n                raise\n            return default\n\n    def getboolean(self, section, option, default=NO_DEFAULT):\n        return self.get(section, option, default)\n\n    def getint(self, section, option, default=NO_DEFAULT):\n        return self.get(section, option, default)\n\n    def getfloat(self, section, option, default=NO_DEFAULT):\n        return self.get(section, option, default)\n\n    def getintdict(self, section):\n        return self.data.get(section, {})\n\n    def set(self, section, option, value=None):\n        if section not in self.data:\n            self.data[section] = {}\n        self.data[section][option] = value\n\n    def has_option(self, section, option):\n        return section in self.data and option in self.data[section]\n\n    def __getitem__(self, name):\n        return self.data[name]\n"
  },
  {
    "path": "luigi/contrib/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nPackage containing optional and-on functionality.\n\"\"\"\n"
  },
  {
    "path": "luigi/contrib/azureblob.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2018 Microsoft Corporation\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n#\n\nimport datetime\nimport logging\nimport os\nimport tempfile\n\nfrom azure.storage.blob import BlobServiceClient\n\nfrom luigi.format import get_default_format\nfrom luigi.target import AtomicLocalFile, FileAlreadyExists, FileSystem, FileSystemTarget\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass AzureBlobClient(FileSystem):\n    \"\"\"\n    Create an Azure Blob Storage client for authentication.\n    Users can create multiple storage account, each of which acts like a silo. Under each storage account, we can\n    create a container. Inside each container, the user can create multiple blobs.\n\n    For each account, there should be an account key. This account key cannot be changed and one can access all the\n    containers and blobs under this account using the account key.\n\n    Usually using an account key might not always be the best idea as the key can be leaked and cannot be revoked. The\n    solution to this issue is to create Shared `Access Signatures` aka `sas`. A SAS can be created for an entire\n    container or just a single blob. SAS can be revoked.\n    \"\"\"\n\n    def __init__(self, account_name=None, account_key=None, sas_token=None, **kwargs):\n        \"\"\"\n        :param str account_name:\n            The storage account name. 
This is used to authenticate requests signed with an account key\\\n            and to construct the storage endpoint. It is required unless a connection string is given,\\\n            or if a custom domain is used with anonymous authentication.\n        :param str account_key:\n            The storage account key. This is used for shared key authentication.\n        :param str sas_token:\n            A shared access signature token to use to authenticate requests instead of the account key.\n        :param dict kwargs:\n            A key-value pair to provide additional connection options.\n\n            * `protocol` - The protocol to use for requests. Defaults to https.\n            * `connection_string` - If specified, this will override all other parameters besides request session.\\\n                See http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ for the connection string format\n            * `endpoint_suffix` - The host base component of the url, minus the account name. Defaults to Azure\\\n                (core.windows.net). Override this to use the China cloud (core.chinacloudapi.cn).\n            * `custom_domain` - The custom domain to use. This can be set in the Azure Portal. For example, ‘www.mydomain.com’.\n            * `token_credential` - A token credential used to authenticate HTTPS requests. 
The token value should be updated before its expiration.\n        \"\"\"\n        if kwargs.get(\"custom_domain\"):\n            account_url = \"{protocol}://{custom_domain}/{account_name}\".format(\n                protocol=kwargs.get(\"protocol\", \"https\"), custom_domain=kwargs.get(\"custom_domain\"), account_name=account_name\n            )\n        else:\n            account_url = \"{protocol}://{account_name}.blob.{endpoint_suffix}\".format(\n                protocol=kwargs.get(\"protocol\", \"https\"), account_name=account_name, endpoint_suffix=kwargs.get(\"endpoint_suffix\", \"core.windows.net\")\n            )\n\n        self.options = {\"account_name\": account_name, \"account_key\": account_key, \"account_url\": account_url, \"sas_token\": sas_token}\n        self.kwargs = kwargs\n\n    @property\n    def connection(self):\n        if self.kwargs.get(\"connection_string\"):\n            return BlobServiceClient.from_connection_string(conn_str=self.kwargs.get(\"connection_string\"), **self.kwargs)\n        else:\n            return BlobServiceClient(\n                account_url=self.options.get(\"account_url\"), credential=self.options.get(\"account_key\") or self.options.get(\"sas_token\"), **self.kwargs\n            )\n\n    def container_client(self, container_name):\n        return self.connection.get_container_client(container_name)\n\n    def blob_client(self, container_name, blob_name):\n        container_client = self.container_client(container_name)\n        return container_client.get_blob_client(blob_name)\n\n    def upload(self, tmp_path, container, blob, **kwargs):\n        logging.debug(\"Uploading file '{tmp_path}' to container '{container}' and blob '{blob}'\".format(tmp_path=tmp_path, container=container, blob=blob))\n        self.create_container(container)\n        lease = None\n        blob_client = self.blob_client(container, blob)\n        if blob_client.exists():\n            lease = blob_client.acquire_lease()\n        try:\n     
       with open(tmp_path, \"rb\") as data:\n                blob_client.upload_blob(data, overwrite=True, lease=lease, progress_hook=kwargs.get(\"progress_callback\"))\n        finally:\n            if lease is not None:\n                lease.release()\n\n    def download_as_bytes(self, container, blob, bytes_to_read=None):\n        logging.debug(\"Downloading from container '{container}' and blob '{blob}' as bytes\".format(container=container, blob=blob))\n        blob_client = self.blob_client(container, blob)\n        download_stream = blob_client.download_blob(offset=0, length=bytes_to_read) if bytes_to_read else blob_client.download_blob()\n        return download_stream.readall()\n\n    def download_as_file(self, container, blob, location):\n        logging.debug(\"Downloading from container '{container}' and blob '{blob}' to {location}\".format(container=container, blob=blob, location=location))\n        blob_client = self.blob_client(container, blob)\n        with open(location, \"wb\") as file:\n            download_stream = blob_client.download_blob()\n            file.write(download_stream.readall())\n        return blob_client.get_blob_properties()\n\n    def create_container(self, container_name):\n        if not self.exists(container_name):\n            return self.connection.create_container(container_name)\n\n    def delete_container(self, container_name):\n        container_client = self.container_client(container_name)\n        lease = container_client.acquire_lease()\n        container_client.delete_container(lease=lease)\n\n    def exists(self, path):\n        container, blob = self.splitfilepath(path)\n        if blob is None:\n            return self.container_client(container).exists()\n        else:\n            return self.blob_client(container, blob).exists()\n\n    def remove(self, path, recursive=True, skip_trash=True):\n        if not self.exists(path):\n            return False\n\n        container, blob = self.splitfilepath(path)\n  
      blob_client = self.blob_client(container, blob)\n        lease = blob_client.acquire_lease()\n        blob_client.delete_blob(lease=lease)\n        return True\n\n    def mkdir(self, path, parents=True, raise_if_exists=False):\n        container, blob = self.splitfilepath(path)\n        if raise_if_exists and self.exists(path):\n            raise FileAlreadyExists(\"The Azure blob path '{blob}' already exists under container '{container}'\".format(blob=blob, container=container))\n\n    def isdir(self, path):\n        \"\"\"\n        Azure Blob Storage has no concept of directories. It always returns False\n        :param str path: Path of the Azure blob storage\n        :return: False\n        \"\"\"\n        return False\n\n    def move(self, path, dest):\n        try:\n            return self.copy(path, dest) and self.remove(path)\n        except IOError:\n            self.remove(dest)\n            return False\n\n    def copy(self, path, dest):\n        source_container, source_blob = self.splitfilepath(path)\n        dest_container, dest_blob = self.splitfilepath(dest)\n        if source_container != dest_container:\n            raise Exception(\n                \"Can't copy blob from '{source_container}' to '{dest_container}'. 
File can be moved within container\".format(\n                    source_container=source_container, dest_container=dest_container\n                )\n            )\n\n        source_blob_client = self.blob_client(source_container, source_blob)\n        dest_blob_client = self.blob_client(dest_container, dest_blob)\n        source_lease = source_blob_client.acquire_lease()\n        destination_lease = dest_blob_client.acquire_lease() if self.exists(dest) else None\n        try:\n            return dest_blob_client.start_copy_from_url(source_url=source_blob_client.url, source_lease=source_lease, destination_lease=destination_lease)\n        finally:\n            source_lease.release()\n            if destination_lease is not None:\n                destination_lease.release()\n\n    def rename_dont_move(self, path, dest):\n        self.move(path, dest)\n\n    @staticmethod\n    def splitfilepath(filepath):\n        splitpath = filepath.split(\"/\")\n        container = splitpath[0]\n        blobsplit = splitpath[1:]\n        blob = None if not blobsplit else \"/\".join(blobsplit)\n        return container, blob\n\n\nclass ReadableAzureBlobFile:\n    def __init__(self, container, blob, client, download_when_reading, **kwargs):\n        self.container = container\n        self.blob = blob\n        self.client = client\n        self.closed = False\n        self.download_when_reading = download_when_reading\n        self.azure_blob_options = kwargs\n        self.download_file_location = os.path.join(tempfile.mkdtemp(prefix=str(datetime.datetime.utcnow())), blob)\n        self.fid = None\n\n    def read(self, n=None):\n        return self.client.download_as_bytes(self.container, self.blob, n)\n\n    def __enter__(self):\n        if self.download_when_reading:\n            self.client.download_as_file(self.container, self.blob, self.download_file_location)\n            self.fid = open(self.download_file_location)\n            return self.fid\n        else:\n            
return self\n\n    def __exit__(self, exc_type, exc, traceback):\n        self.close()\n\n    def __del__(self):\n        self.close()\n        if os._exists(self.download_file_location):\n            os.remove(self.download_file_location)\n\n    def close(self):\n        if self.download_when_reading:\n            if self.fid is not None and not self.fid.closed:\n                self.fid.close()\n                self.fid = None\n\n    def readable(self):\n        return True\n\n    def writable(self):\n        return False\n\n    def seekable(self):\n        return False\n\n    def seek(self, offset, whence=None):\n        pass\n\n\nclass AtomicAzureBlobFile(AtomicLocalFile):\n    def __init__(self, container, blob, client, **kwargs):\n        super(AtomicAzureBlobFile, self).__init__(os.path.join(container, blob))\n        self.container = container\n        self.blob = blob\n        self.client = client\n        self.azure_blob_options = kwargs\n\n    def move_to_final_destination(self):\n        self.client.upload(self.tmp_path, self.container, self.blob, **self.azure_blob_options)\n\n\nclass AzureBlobTarget(FileSystemTarget):\n    \"\"\"\n    Create an Azure Blob Target for storing data on Azure Blob Storage\n    \"\"\"\n\n    def __init__(self, container, blob, client=None, format=None, download_when_reading=True, **kwargs):\n        \"\"\"\n        :param str account_name:\n            The storage account name. This is used to authenticate requests signed with an account key and to construct\n            the storage endpoint. It is required unless a connection string is given, or if a custom domain is\n            used with anonymous authentication.\n        :param str container:\n            The azure container in which the blob needs to be stored\n        :param str blob:\n            The name of the blob under container specified\n        :param str client:\n            An instance of :class:`.AzureBlobClient`. 
If none is specified, anonymous access would be used\n        :param str format:\n            An instance of :class:`luigi.format`.\n        :param bool download_when_reading:\n            Determines whether the file has to be downloaded to temporary location on disk. Defaults to `True`.\n\n        Pass the argument **progress_callback** with signature *(func(current, total))* to get real time progress of upload\n        \"\"\"\n        super(AzureBlobTarget, self).__init__(os.path.join(container, blob))\n        if format is None:\n            format = get_default_format()\n        self.container = container\n        self.blob = blob\n        self.client = client or AzureBlobClient()\n        self.format = format\n        self.download_when_reading = download_when_reading\n        self.azure_blob_options = kwargs\n\n    @property\n    def fs(self):\n        \"\"\"\n        The :py:class:`FileSystem` associated with :class:`.AzureBlobTarget`\n        \"\"\"\n        return self.client\n\n    def open(self, mode):\n        \"\"\"\n        Open the target for reading or writing\n\n        :param char mode:\n            'r' for reading and 'w' for writing.\n\n            'b' is not supported and will be stripped if used. For binary mode, use `format`\n        :return:\n            * :class:`.ReadableAzureBlobFile` if 'r'\n            * :class:`.AtomicAzureBlobFile` if 'w'\n        \"\"\"\n        if mode not in (\"r\", \"w\"):\n            raise ValueError(\"Unsupported open mode '%s'\" % mode)\n        if mode == \"r\":\n            return self.format.pipe_reader(ReadableAzureBlobFile(self.container, self.blob, self.client, self.download_when_reading, **self.azure_blob_options))\n        else:\n            return self.format.pipe_writer(AtomicAzureBlobFile(self.container, self.blob, self.client, **self.azure_blob_options))\n"
  },
  {
    "path": "luigi/contrib/batch.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2018 Outlier Bio, LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nAWS Batch wrapper for Luigi\n\nFrom the AWS website:\n\n    AWS Batch enables you to run batch computing workloads on the AWS Cloud.\n\n    Batch computing is a common way for developers, scientists, and engineers\n    to access large amounts of compute resources, and AWS Batch removes the\n    undifferentiated heavy lifting of configuring and managing the required\n    infrastructure. AWS Batch is similar to traditional batch computing\n    software. This service can efficiently provision resources in response to\n    jobs submitted in order to eliminate capacity constraints, reduce compute\n    costs, and deliver results quickly.\n\nSee `AWS Batch User Guide`_ for more details.\n\nTo use AWS Batch, you create a jobDefinition JSON that defines a `docker run`_\ncommand, and then submit this JSON to the API to queue up the task. Behind the\nscenes, AWS Batch auto-scales a fleet of EC2 Container Service instances,\nmonitors the load on these instances, and schedules the jobs.\n\nThis `boto3-powered`_ wrapper allows you to create Luigi Tasks to submit Batch\n``jobDefinition``s. 
You can either pass a dict (mapping directly to the\n``jobDefinition`` JSON) OR an Amazon Resource Name (arn) for a previously\nregistered ``jobDefinition``.\n\nRequires:\n\n- boto3 package\n- Amazon AWS credentials discoverable by boto3 (e.g., by using ``aws configure``\n  from awscli_)\n- An enabled AWS Batch job queue configured to run on a compute environment.\n\nWritten and maintained by Jake Feala (@jfeala) for Outlier Bio (@outlierbio)\n\n.. _`docker run`: https://docs.docker.com/reference/commandline/run\n.. _jobDefinition: http://docs.aws.amazon.com/batch/latest/userguide/job_definitions.html\n.. _`boto3-powered`: https://boto3.readthedocs.io\n.. _awscli: https://aws.amazon.com/cli\n.. _`AWS Batch User Guide`: http://docs.aws.amazon.com/AmazonECS/latest/developerguide/ECS_GetStarted.html\n\n\"\"\"\n\nimport json\nimport logging\nimport random\nimport string\nimport time\n\nimport luigi\n\nlogger = logging.getLogger(__name__)\n\ntry:\n    import boto3\nexcept ImportError:\n    logger.warning(\"boto3 is not installed. 
BatchTasks require boto3\")\n\n\nclass BatchJobException(Exception):\n    pass\n\n\nPOLL_TIME = 10\n\n\ndef _random_id():\n    return \"batch-job-\" + \"\".join(random.sample(string.ascii_lowercase, 8))\n\n\nclass BatchClient:\n    def __init__(self, poll_time=POLL_TIME):\n        self.poll_time = poll_time\n        self._client = boto3.client(\"batch\")\n        self._log_client = boto3.client(\"logs\")\n        self._queue = self.get_active_queue()\n\n    def get_active_queue(self):\n        \"\"\"Get name of first active job queue\"\"\"\n\n        # Get dict of active queues keyed by name\n        queues = {q[\"jobQueueName\"]: q for q in self._client.describe_job_queues()[\"jobQueues\"] if q[\"state\"] == \"ENABLED\" and q[\"status\"] == \"VALID\"}\n        if not queues:\n            raise Exception(\"No job queues with state=ENABLED and status=VALID\")\n\n        # Pick the first queue as default\n        return list(queues.keys())[0]\n\n    def get_job_id_from_name(self, job_name):\n        \"\"\"Retrieve the first job ID matching the given name\"\"\"\n        jobs = self._client.list_jobs(jobQueue=self._queue, jobStatus=\"RUNNING\")[\"jobSummaryList\"]\n        matching_jobs = [job for job in jobs if job[\"jobName\"] == job_name]\n        if matching_jobs:\n            return matching_jobs[0][\"jobId\"]\n\n    def get_job_status(self, job_id):\n        \"\"\"Retrieve task statuses from ECS API\n\n        :param job_id (str): AWS Batch job uuid\n\n        Returns one of {SUBMITTED|PENDING|RUNNABLE|STARTING|RUNNING|SUCCEEDED|FAILED}\n        \"\"\"\n        response = self._client.describe_jobs(jobs=[job_id])\n\n        # Error checking\n        status_code = response[\"ResponseMetadata\"][\"HTTPStatusCode\"]\n        if status_code != 200:\n            msg = \"Job status request received status code {0}:\\n{1}\"\n            raise Exception(msg.format(status_code, response))\n\n        return response[\"jobs\"][0][\"status\"]\n\n    def get_logs(self, 
log_stream_name, get_last=50):\n        \"\"\"Retrieve log stream from CloudWatch\"\"\"\n        response = self._log_client.get_log_events(logGroupName=\"/aws/batch/job\", logStreamName=log_stream_name, startFromHead=False)\n        events = response[\"events\"]\n        return \"\\n\".join(e[\"message\"] for e in events[-get_last:])\n\n    def submit_job(self, job_definition, parameters, job_name=None, queue=None):\n        \"\"\"Wrap submit_job with useful defaults\"\"\"\n        if job_name is None:\n            job_name = _random_id()\n        response = self._client.submit_job(jobName=job_name, jobQueue=queue or self.get_active_queue(), jobDefinition=job_definition, parameters=parameters)\n        return response[\"jobId\"]\n\n    def wait_on_job(self, job_id):\n        \"\"\"Poll task status until STOPPED\"\"\"\n\n        while True:\n            status = self.get_job_status(job_id)\n            if status == \"SUCCEEDED\":\n                logger.info(\"Batch job {} SUCCEEDED\".format(job_id))\n                return True\n            elif status == \"FAILED\":\n                # Raise and notify if job failed\n                jobs = self._client.describe_jobs(jobs=[job_id])[\"jobs\"]\n                job_str = json.dumps(jobs, indent=4)\n                logger.debug(\"Job details:\\n\" + job_str)\n\n                log_stream_name = jobs[0][\"attempts\"][0][\"container\"][\"logStreamName\"]\n                logs = self.get_logs(log_stream_name)\n                raise BatchJobException(\"Job {} failed: {}\".format(job_id, logs))\n\n            time.sleep(self.poll_time)\n            logger.debug(\"Batch job status for job {0}: {1}\".format(job_id, status))\n\n    def register_job_definition(self, json_fpath):\n        \"\"\"Register a job definition with AWS Batch, using a JSON\"\"\"\n        with open(json_fpath) as f:\n            job_def = json.load(f)\n        response = self._client.register_job_definition(**job_def)\n        status_code = 
response[\"ResponseMetadata\"][\"HTTPStatusCode\"]\n        if status_code != 200:\n            msg = \"Register job definition request received status code {0}:\\n{1}\"\n            raise Exception(msg.format(status_code, response))\n        return response\n\n\nclass BatchTask(luigi.Task):\n    \"\"\"\n    Base class for an Amazon Batch job\n\n    Amazon Batch requires you to register \"job definitions\", which are JSON\n    descriptions for how to issue the ``docker run`` command. This Luigi Task\n    requires a pre-registered Batch jobDefinition name passed as a Parameter\n\n    :param job_definition (str): name of pre-registered jobDefinition\n    :param job_name: name of specific job, for tracking in the queue and logs.\n    :param job_queue: name of job queue where job is going to be submitted.\n\n    \"\"\"\n\n    job_definition = luigi.Parameter()\n    job_name = luigi.OptionalParameter(default=None)\n    job_queue = luigi.OptionalParameter(default=None)\n    poll_time = luigi.IntParameter(default=POLL_TIME)\n\n    def run(self):\n        bc = BatchClient(self.poll_time)\n        job_id = bc.submit_job(self.job_definition, self.parameters, job_name=self.job_name, queue=self.job_queue)\n        bc.wait_on_job(job_id)\n\n    @property\n    def parameters(self):\n        \"\"\"Override to return a dict of parameters for the Batch Task\"\"\"\n        return {}\n"
  },
  {
    "path": "luigi/contrib/beam_dataflow.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2019 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom __future__ import annotations\n\nimport abc\nimport json\nimport logging\nimport os\nimport subprocess\n\nimport luigi\nfrom luigi.contrib import bigquery, gcs\nfrom luigi.task import MixinNaiveBulkComplete\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass DataflowParamKeys(metaclass=abc.ABCMeta):\n    \"\"\"\n    Defines the naming conventions for Dataflow execution params.\n    For example, the Java API expects param names in lower camel case, whereas\n    the Python implementation expects snake case.\n\n    \"\"\"\n\n    @property\n    @abc.abstractmethod\n    def runner(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def project(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def zone(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def region(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def staging_location(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def temp_location(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def gcp_temp_location(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def num_workers(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def autoscaling_algorithm(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def 
max_num_workers(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def disk_size_gb(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def worker_machine_type(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def worker_disk_type(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def job_name(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def service_account(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def network(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def subnetwork(self):\n        pass\n\n    @property\n    @abc.abstractmethod\n    def labels(self):\n        pass\n\n\nclass _CmdLineRunner:\n    \"\"\"\n    Executes a given command line class in a subprocess, logging its output.\n    If more complex monitoring/logging is desired, user can implement their\n    own launcher class and set it in BeamDataflowJobTask.cmd_line_runner.\n\n    \"\"\"\n\n    @staticmethod\n    def run(cmd, task=None):\n        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)\n        output_lines = []\n        while True:\n            line = process.stdout.readline()\n            if not line:\n                break\n            line = line.decode(\"utf-8\")\n            output_lines += [line]\n            logger.info(line.rstrip(\"\\n\"))\n        process.stdout.close()\n        exit_code = process.wait()\n        if exit_code:\n            output = \"\".join(output_lines)\n            raise subprocess.CalledProcessError(exit_code, cmd, output=output)\n\n\nclass BeamDataflowJobTask(MixinNaiveBulkComplete, luigi.Task, metaclass=abc.ABCMeta):\n    \"\"\"\n    Luigi wrapper for a Dataflow job. 
Must be overridden for each Beam SDK\n    with that SDK's dataflow_executable().\n\n    For more documentation, see:\n        https://cloud.google.com/dataflow/docs/guides/specifying-exec-params\n\n    The following required Dataflow properties must be set:\n\n    project                 # GCP project ID\n    temp_location           # Cloud storage path for temporary files\n\n    The following optional Dataflow properties can be set:\n\n    runner                  # PipelineRunner implementation for your Beam job.\n                              Default: DirectRunner\n    num_workers             # The number of workers to start the task with\n                              Default: Determined by Dataflow service\n    autoscaling_algorithm   # The Autoscaling mode for the Dataflow job\n                              Default: `THROUGHPUT_BASED`\n    max_num_workers         # Used if the autoscaling is enabled\n                              Default: Determined by Dataflow service\n    network                 # Network in GCE to be used for launching workers\n                              Default: a network named \"default\"\n    subnetwork              # Subnetwork in GCE to be used for launching workers\n                              Default: Determined by Dataflow service\n    disk_size_gb            # Remote worker disk size. 
Minimum value is 30GB\n                              Default: set to 0 to use GCP project default\n    worker_machine_type     # Machine type to create Dataflow worker VMs\n                              Default: Determined by Dataflow service\n    job_name                # Custom job name, must be unique across project's\n                              active jobs\n    worker_disk_type        # Specify SSD for local disk or defaults to hard\n                              disk as a full URL of disk type resource\n                              Default: Determined by Dataflow service.\n    service_account         # Service account of Dataflow VMs/workers\n                              Default: active GCE service account\n    region                  # Region to deploy Dataflow job to\n                              Default: us-central1\n    zone                    # Availability zone for launching workers instances\n                              Default: an available zone in the specified region\n    staging_location        # Cloud Storage bucket for Dataflow to stage binary\n                              files\n                              Default: the value of temp_location\n    gcp_temp_location       # Cloud Storage path for Dataflow to stage temporary\n                              files\n                              Default: the value of temp_location\n    labels                  # Custom GCP labels attached to the Dataflow job\n                              Default: nothing\n    \"\"\"\n\n    project = None\n    runner = None\n    temp_location = None\n    staging_location = None\n    gcp_temp_location = None\n    num_workers = None\n    autoscaling_algorithm = None\n    max_num_workers = None\n    network = None\n    subnetwork = None\n    disk_size_gb = None\n    worker_machine_type = None\n    job_name = None\n    worker_disk_type = None\n    service_account = None\n    zone = None\n    region = None\n    labels: dict[str, str] = {}\n\n    cmd_line_runner = 
_CmdLineRunner\n    dataflow_params = None\n\n    def __init__(self):\n        if not isinstance(self.dataflow_params, DataflowParamKeys):\n            raise ValueError(\"dataflow_params must be of type DataflowParamKeys\")\n        super(BeamDataflowJobTask, self).__init__()\n\n    @abc.abstractmethod\n    def dataflow_executable(self):\n        \"\"\"\n        Command representing the Dataflow executable to be run.\n        For example:\n\n        return ['java', 'com.spotify.luigi.MyClass', '-Xmx256m']\n        \"\"\"\n        pass\n\n    def args(self):\n        \"\"\"\n        Extra String arguments that will be passed to your Dataflow job.\n        For example:\n\n        return ['--setup_file=setup.py']\n        \"\"\"\n        return []\n\n    def before_run(self):\n        \"\"\"\n        Hook that gets called right before the Dataflow job is launched.\n        Can be used to setup any temporary files/tables, validate input, etc.\n        \"\"\"\n        pass\n\n    def on_successful_run(self):\n        \"\"\"\n        Callback that gets called right after the Dataflow job has finished\n        successfully but before validate_output is run.\n        \"\"\"\n        pass\n\n    def validate_output(self):\n        \"\"\"\n        Callback that can be used to validate your output before it is moved to\n        its final location. Returning false here will cause the job to fail, and\n        output to be removed instead of published.\n        \"\"\"\n        return True\n\n    def file_pattern(self):\n        \"\"\"\n        If one/some of the input target files are not in the pattern of part-*,\n        we can add the key of the required target and the correct file pattern\n        that should be appended in the command line here. 
If the input target key is not found\n        in this dict, the file pattern will be assumed to be part-* for that target.\n\n        :return A dictionary of overridden file pattern that is not part-* for the inputs\n        \"\"\"\n        return {}\n\n    def on_successful_output_validation(self):\n        \"\"\"\n        Callback that gets called after the Dataflow job has finished\n        successfully if validate_output returns True.\n        \"\"\"\n        pass\n\n    def cleanup_on_error(self, error):\n        \"\"\"\n        Callback that gets called after the Dataflow job has finished\n        unsuccessfully, or validate_output returns False.\n        \"\"\"\n        pass\n\n    def run(self):\n        cmd_line = self._mk_cmd_line()\n        logger.info(\" \".join(cmd_line))\n\n        self.before_run()\n\n        try:\n            self.cmd_line_runner.run(cmd_line, self)\n        except subprocess.CalledProcessError as e:\n            logger.error(e, exc_info=True)\n            self.cleanup_on_error(e)\n            os._exit(e.returncode)\n\n        self.on_successful_run()\n\n        if self.validate_output():\n            self.on_successful_output_validation()\n        else:\n            error = ValueError(\"Output validation failed\")\n            self.cleanup_on_error(error)\n            raise error\n\n    def _mk_cmd_line(self):\n        cmd_line = self.dataflow_executable()\n\n        cmd_line.extend(self._get_dataflow_args())\n        cmd_line.extend(self.args())\n        cmd_line.extend(self._format_input_args())\n        cmd_line.extend(self._format_output_args())\n        return cmd_line\n\n    def _get_runner(self):\n        if not self.runner:\n            logger.warning(\"Runner not supplied to BeamDataflowJobTask. 
\" + \"Defaulting to DirectRunner.\")\n            return \"DirectRunner\"\n        elif self.runner in [\"DataflowRunner\", \"DirectRunner\"]:\n            return self.runner\n        else:\n            raise ValueError(\"Runner %s is unsupported.\" % self.runner)\n\n    def _get_dataflow_args(self):\n        def f(key, value):\n            return \"--{}={}\".format(key, value)\n\n        output = []\n\n        output.append(f(self.dataflow_params.runner, self._get_runner()))\n\n        if self.project:\n            output.append(f(self.dataflow_params.project, self.project))\n        if self.zone:\n            output.append(f(self.dataflow_params.zone, self.zone))\n        if self.region:\n            output.append(f(self.dataflow_params.region, self.region))\n        if self.staging_location:\n            output.append(f(self.dataflow_params.staging_location, self.staging_location))\n        if self.temp_location:\n            output.append(f(self.dataflow_params.temp_location, self.temp_location))\n        if self.gcp_temp_location:\n            output.append(f(self.dataflow_params.gcp_temp_location, self.gcp_temp_location))\n        if self.num_workers:\n            output.append(f(self.dataflow_params.num_workers, self.num_workers))\n        if self.autoscaling_algorithm:\n            output.append(f(self.dataflow_params.autoscaling_algorithm, self.autoscaling_algorithm))\n        if self.max_num_workers:\n            output.append(f(self.dataflow_params.max_num_workers, self.max_num_workers))\n        if self.disk_size_gb:\n            output.append(f(self.dataflow_params.disk_size_gb, self.disk_size_gb))\n        if self.worker_machine_type:\n            output.append(f(self.dataflow_params.worker_machine_type, self.worker_machine_type))\n        if self.worker_disk_type:\n            output.append(f(self.dataflow_params.worker_disk_type, self.worker_disk_type))\n        if self.network:\n            output.append(f(self.dataflow_params.network, 
self.network))\n        if self.subnetwork:\n            output.append(f(self.dataflow_params.subnetwork, self.subnetwork))\n        if self.job_name:\n            output.append(f(self.dataflow_params.job_name, self.job_name))\n        if self.service_account:\n            output.append(f(self.dataflow_params.service_account, self.service_account))\n        if self.labels:\n            output.append(f(self.dataflow_params.labels, json.dumps(self.labels)))\n\n        return output\n\n    def _format_input_args(self):\n        \"\"\"\n        Parses the result(s) of self.input() into a string-serialized\n        key-value list passed to the Dataflow job. Valid inputs include:\n\n        return FooTarget()\n\n        return {\"input1\": FooTarget(), \"input2\": FooTarget2()}\n\n        return (\"input\", FooTarget())\n\n        return [(\"input1\", FooTarget()), (\"input2\", FooTarget2())]\n\n        return [FooTarget(), FooTarget2()]\n\n        Unlabeled inputs are passed in under the default key \"input\".\n        \"\"\"\n        job_input = self.input()\n\n        if isinstance(job_input, luigi.Target):\n            job_input = {\"input\": job_input}\n\n        elif isinstance(job_input, tuple):\n            job_input = {job_input[0]: job_input[1]}\n\n        elif isinstance(job_input, list):\n            if all(isinstance(item, tuple) for item in job_input):\n                job_input = dict(job_input)\n            else:\n                job_input = {\"input\": job_input}\n\n        elif not isinstance(job_input, dict):\n            raise ValueError(\"Invalid job input requires(). 
Supported types: [Target, tuple of (name, Target), dict of (name: Target), list of Targets]\")\n\n        if not isinstance(self.file_pattern(), dict):\n            raise ValueError(\"file_pattern() must return a dict type\")\n\n        input_args = []\n\n        for name, targets in job_input.items():\n            uris = [self.get_target_path(uri_target) for uri_target in luigi.task.flatten(targets)]\n            if isinstance(targets, dict):\n                \"\"\"\n                If targets is a dict that means it had multiple outputs.\n                Make the input args in that case \"<input key>-<task output key>\"\n                \"\"\"\n                names = [\"%s-%s\" % (name, key) for key in targets.keys()]\n\n            else:\n                names = [name] * len(uris)\n\n            input_dict = {}\n\n            for arg_name, uri in zip(names, uris):\n                pattern = self.file_pattern().get(name, \"part-*\")\n                input_value = input_dict.get(arg_name, [])\n                input_value.append(uri.rstrip(\"/\") + \"/\" + pattern)\n                input_dict[arg_name] = input_value\n\n            for key, paths in input_dict.items():\n                input_args.append(\"--%s=%s\" % (key, \",\".join(paths)))\n\n        return input_args\n\n    def _format_output_args(self):\n        \"\"\"\n        Parses the result(s) of self.output() into a string-serialized\n        key-value list passed to the Dataflow job. 
Valid outputs include:\n\n        return FooTarget()\n\n        return {\"output1\": FooTarget(), \"output2\": FooTarget2()}\n\n        Unlabeled outputs are passed in under the default key \"output\".\n        \"\"\"\n        job_output = self.output()\n        if isinstance(job_output, luigi.Target):\n            job_output = {\"output\": job_output}\n        elif not isinstance(job_output, dict):\n            raise ValueError(\"Task output must be a Target or a dict from String to Target\")\n\n        output_args = []\n\n        for name, target in job_output.items():\n            uri = self.get_target_path(target)\n            output_args.append(\"--%s=%s\" % (name, uri))\n\n        return output_args\n\n    @staticmethod\n    def get_target_path(target):\n        \"\"\"\n        Given a luigi Target, determine a stringly typed path to pass as a\n        Dataflow job argument.\n        \"\"\"\n        if isinstance(target, luigi.LocalTarget) or isinstance(target, gcs.GCSTarget):\n            return target.path\n        elif isinstance(target, bigquery.BigQueryTarget):\n            return \"{}:{}.{}\".format(target.table.project_id, target.table.dataset_id, target.table.table_id)\n        else:\n            raise ValueError(\"Target %s not supported\" % target)\n"
  },
  {
    "path": "luigi/contrib/bigquery.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015 Twitter Inc\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom __future__ import annotations\n\nimport collections\nimport logging\nimport time\n\nfrom tenacity import retry, retry_if_exception, retry_if_exception_type, stop_after_attempt, wait_exponential\n\nimport luigi.target\nfrom luigi.contrib import gcp\n\nlogger = logging.getLogger(\"luigi-interface\")\n\nRETRYABLE_ERRORS: tuple[type[BaseException], ...] = ()\ntry:\n    import httplib2\n    from googleapiclient import discovery, errors, http\nexcept ImportError:\n    logger.warning(\"BigQuery module imported, but google-api-python-client is not installed. Any BigQuery task will fail\")\nelse:\n    RETRYABLE_ERRORS = (httplib2.HttpLib2Error, IOError, TimeoutError, BrokenPipeError)\n\n\n# Retry configurations. 
For more details, see https://tenacity.readthedocs.io/en/latest/\ndef is_error_5xx(err):\n    return isinstance(err, errors.HttpError) and err.resp.status >= 500\n\n\nbq_retry = retry(\n    retry=(retry_if_exception(is_error_5xx) | retry_if_exception_type(RETRYABLE_ERRORS)),\n    wait=wait_exponential(multiplier=1, min=1, max=10),\n    stop=stop_after_attempt(3),\n    reraise=True,\n    after=lambda x: x.args[0]._initialise_client(),\n)\n\n\nclass CreateDisposition:\n    CREATE_IF_NEEDED = \"CREATE_IF_NEEDED\"\n    CREATE_NEVER = \"CREATE_NEVER\"\n\n\nclass WriteDisposition:\n    WRITE_TRUNCATE = \"WRITE_TRUNCATE\"\n    WRITE_APPEND = \"WRITE_APPEND\"\n    WRITE_EMPTY = \"WRITE_EMPTY\"\n\n\nclass QueryMode:\n    INTERACTIVE = \"INTERACTIVE\"\n    BATCH = \"BATCH\"\n\n\nclass SourceFormat:\n    AVRO = \"AVRO\"\n    CSV = \"CSV\"\n    DATASTORE_BACKUP = \"DATASTORE_BACKUP\"\n    NEWLINE_DELIMITED_JSON = \"NEWLINE_DELIMITED_JSON\"\n    PARQUET = \"PARQUET\"\n\n\nclass FieldDelimiter:\n    \"\"\"\n    The separator for fields in a CSV file. The separator can be any ISO-8859-1 single-byte character.\n    To use a character in the range 128-255, you must encode the character as UTF8.\n    BigQuery converts the string to ISO-8859-1 encoding, and then uses the\n    first byte of the encoded string to split the data in its raw, binary state.\n    BigQuery also supports the escape sequence \"\\t\" to specify a tab separator.\n    The default value is a comma (',').\n\n    https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load\n    \"\"\"\n\n    COMMA = \",\"  # Default\n    TAB = \"\\t\"\n    PIPE = \"|\"\n\n\nclass PrintHeader:\n    TRUE = True\n    FALSE = False\n\n\nclass DestinationFormat:\n    AVRO = \"AVRO\"\n    CSV = \"CSV\"\n    NEWLINE_DELIMITED_JSON = \"NEWLINE_DELIMITED_JSON\"\n\n\nclass Compression:\n    GZIP = \"GZIP\"\n    NONE = \"NONE\"\n\n\nclass Encoding:\n    \"\"\"\n    [Optional] The character encoding of the data. 
The supported values are UTF-8 or ISO-8859-1. The default value is UTF-8.\n\n    BigQuery decodes the data after the raw, binary data has been split using the values of the quote and fieldDelimiter properties.\n    \"\"\"\n\n    UTF_8 = \"UTF-8\"\n    ISO_8859_1 = \"ISO-8859-1\"\n\n\nBQDataset = collections.namedtuple(\"BQDataset\", \"project_id dataset_id location\")\n\n\nclass BQTable(collections.namedtuple(\"BQTable\", \"project_id dataset_id table_id location\")):\n    @property\n    def dataset(self):\n        return BQDataset(project_id=self.project_id, dataset_id=self.dataset_id, location=self.location)\n\n    @property\n    def uri(self):\n        return \"bq://\" + self.project_id + \"/\" + self.dataset.dataset_id + \"/\" + self.table_id\n\n\nclass BigQueryClient:\n    \"\"\"A client for Google BigQuery.\n\n    For details of how authentication and the descriptor work, see the\n    documentation for the GCS client. The descriptor URL for BigQuery is\n    https://www.googleapis.com/discovery/v1/apis/bigquery/v2/rest\n    \"\"\"\n\n    def __init__(self, oauth_credentials=None, descriptor=\"\", http_=None):\n        # Save initialisation arguments in case we need to re-create client\n        # due to connection timeout\n        self.oauth_credentials = oauth_credentials\n        self.descriptor = descriptor\n        self.http_ = http_\n\n        self._initialise_client()\n\n    def _initialise_client(self):\n        authenticate_kwargs = gcp.get_authenticate_kwargs(self.oauth_credentials, self.http_)\n\n        if self.descriptor:\n            self.client = discovery.build_from_document(self.descriptor, **authenticate_kwargs)\n        else:\n            self.client = discovery.build(\"bigquery\", \"v2\", cache_discovery=False, **authenticate_kwargs)\n\n    @bq_retry\n    def dataset_exists(self, dataset):\n        \"\"\"Returns whether the given dataset exists.\n        If regional location is specified for the dataset, that is also checked\n        to be 
compatible with the remote dataset, otherwise an exception is thrown.\n\n           :param dataset:\n           :type dataset: BQDataset\n        \"\"\"\n\n        try:\n            response = self.client.datasets().get(projectId=dataset.project_id, datasetId=dataset.dataset_id).execute()\n            if dataset.location is not None:\n                fetched_location = response.get(\"location\")\n                if dataset.location != fetched_location:\n                    raise Exception(\n                        \"\"\"Dataset already exists with regional location {}. Can't use {}.\"\"\".format(\n                            fetched_location if fetched_location is not None else \"unspecified\", dataset.location\n                        )\n                    )\n        except http.HttpError as ex:\n            if ex.resp.status == 404:\n                return False\n            raise\n\n        return True\n\n    @bq_retry\n    def table_exists(self, table):\n        \"\"\"Returns whether the given table exists.\n\n        :param table:\n        :type table: BQTable\n        \"\"\"\n        if not self.dataset_exists(table.dataset):\n            return False\n\n        try:\n            self.client.tables().get(projectId=table.project_id, datasetId=table.dataset_id, tableId=table.table_id).execute()\n        except http.HttpError as ex:\n            if ex.resp.status == 404:\n                return False\n            raise\n\n        return True\n\n    def make_dataset(self, dataset, raise_if_exists=False, body=None):\n        \"\"\"Creates a new dataset with the default permissions.\n\n        :param dataset:\n        :type dataset: BQDataset\n        :param raise_if_exists: whether to raise an exception if the dataset already exists.\n        :raises luigi.target.FileAlreadyExists: if raise_if_exists=True and the dataset exists\n        \"\"\"\n\n        if body is None:\n            body = {}\n\n        try:\n            # Construct a message body in the format 
required by\n            # https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/python/latest/bigquery_v2.datasets.html#insert\n            body[\"datasetReference\"] = {\"projectId\": dataset.project_id, \"datasetId\": dataset.dataset_id}\n            if dataset.location is not None:\n                body[\"location\"] = dataset.location\n            self.client.datasets().insert(projectId=dataset.project_id, body=body).execute()\n        except http.HttpError as ex:\n            if ex.resp.status == 409:\n                if raise_if_exists:\n                    raise luigi.target.FileAlreadyExists()\n            else:\n                raise\n\n    def delete_dataset(self, dataset, delete_nonempty=True):\n        \"\"\"Deletes a dataset (and optionally any tables in it), if it exists.\n\n        :param dataset:\n        :type dataset: BQDataset\n        :param delete_nonempty: if true, will delete any tables before deleting the dataset\n        \"\"\"\n\n        if not self.dataset_exists(dataset):\n            return\n\n        self.client.datasets().delete(projectId=dataset.project_id, datasetId=dataset.dataset_id, deleteContents=delete_nonempty).execute()\n\n    def delete_table(self, table):\n        \"\"\"Deletes a table, if it exists.\n\n        :param table:\n        :type table: BQTable\n        \"\"\"\n\n        if not self.table_exists(table):\n            return\n\n        self.client.tables().delete(projectId=table.project_id, datasetId=table.dataset_id, tableId=table.table_id).execute()\n\n    def list_datasets(self, project_id):\n        \"\"\"Returns the list of datasets in a given project.\n\n        :param project_id:\n        :type project_id: str\n        \"\"\"\n\n        request = self.client.datasets().list(projectId=project_id, maxResults=1000)\n        response = request.execute()\n\n        while response is not None:\n            for ds in response.get(\"datasets\", []):\n                yield 
ds[\"datasetReference\"][\"datasetId\"]\n\n            request = self.client.datasets().list_next(request, response)\n            if request is None:\n                break\n\n            response = request.execute()\n\n    def list_tables(self, dataset):\n        \"\"\"Returns the list of tables in a given dataset.\n\n        :param dataset:\n        :type dataset: BQDataset\n        \"\"\"\n\n        request = self.client.tables().list(projectId=dataset.project_id, datasetId=dataset.dataset_id, maxResults=1000)\n        response = request.execute()\n\n        while response is not None:\n            for t in response.get(\"tables\", []):\n                yield t[\"tableReference\"][\"tableId\"]\n\n            request = self.client.tables().list_next(request, response)\n            if request is None:\n                break\n\n            response = request.execute()\n\n    def get_view(self, table):\n        \"\"\"Returns the SQL query for a view, or None if it doesn't exist or is not a view.\n\n        :param table: The table containing the view.\n        :type table: BQTable\n        \"\"\"\n\n        request = self.client.tables().get(projectId=table.project_id, datasetId=table.dataset_id, tableId=table.table_id)\n\n        try:\n            response = request.execute()\n        except http.HttpError as ex:\n            if ex.resp.status == 404:\n                return None\n            raise\n\n        return response[\"view\"][\"query\"] if \"view\" in response else None\n\n    def update_view(self, table, view):\n        \"\"\"Updates the SQL query for a view.\n\n        If the output table exists, it is replaced with the supplied view query. 
Otherwise a new\n        table is created with this view.\n\n        :param table: The table to contain the view.\n        :type table: BQTable\n        :param view: The SQL query for the view.\n        :type view: str\n        \"\"\"\n\n        body = {\"tableReference\": {\"projectId\": table.project_id, \"datasetId\": table.dataset_id, \"tableId\": table.table_id}, \"view\": {\"query\": view}}\n\n        if self.table_exists(table):\n            self.client.tables().update(projectId=table.project_id, datasetId=table.dataset_id, tableId=table.table_id, body=body).execute()\n        else:\n            self.client.tables().insert(projectId=table.project_id, datasetId=table.dataset_id, body=body).execute()\n\n    def run_job(self, project_id, body, dataset=None):\n        \"\"\"Runs a BigQuery \"job\". See the documentation for the format of body.\n\n        .. note::\n            You probably don't need to use this directly. Use the tasks defined below.\n\n        :param dataset:\n        :type dataset: BQDataset\n        :return: the job id of the job.\n        :rtype: str\n        :raises luigi.contrib.BigQueryExecutionError: if the job fails.\n        \"\"\"\n\n        if dataset and not self.dataset_exists(dataset):\n            self.make_dataset(dataset)\n\n        new_job = self.client.jobs().insert(projectId=project_id, body=body).execute()\n        job_id = new_job[\"jobReference\"][\"jobId\"]\n        logger.info(\"Started import job %s:%s\", project_id, job_id)\n        while True:\n            status = self.client.jobs().get(projectId=project_id, jobId=job_id).execute(num_retries=10)\n            if status[\"status\"][\"state\"] == \"DONE\":\n                if status[\"status\"].get(\"errorResult\"):\n                    raise BigQueryExecutionError(job_id, status[\"status\"][\"errorResult\"])\n                return job_id\n\n            logger.info(\"Waiting for job %s:%s to complete...\", project_id, job_id)\n            time.sleep(5)\n\n    def 
copy(self, source_table, dest_table, create_disposition=CreateDisposition.CREATE_IF_NEEDED, write_disposition=WriteDisposition.WRITE_TRUNCATE):\n        \"\"\"Copies (or appends) a table to another table.\n\n        :param source_table:\n        :type source_table: BQTable\n        :param dest_table:\n        :type dest_table: BQTable\n        :param create_disposition: whether to create the table if needed\n        :type create_disposition: CreateDisposition\n        :param write_disposition: whether to append/truncate/fail if the table exists\n        :type write_disposition: WriteDisposition\n        \"\"\"\n\n        job = {\n            \"configuration\": {\n                \"copy\": {\n                    \"sourceTable\": {\n                        \"projectId\": source_table.project_id,\n                        \"datasetId\": source_table.dataset_id,\n                        \"tableId\": source_table.table_id,\n                    },\n                    \"destinationTable\": {\n                        \"projectId\": dest_table.project_id,\n                        \"datasetId\": dest_table.dataset_id,\n                        \"tableId\": dest_table.table_id,\n                    },\n                    \"createDisposition\": create_disposition,\n                    \"writeDisposition\": write_disposition,\n                }\n            }\n        }\n\n        self.run_job(dest_table.project_id, job, dataset=dest_table.dataset)\n\n\nclass BigQueryTarget(luigi.target.Target):\n    def __init__(self, project_id, dataset_id, table_id, client=None, location=None):\n        self.table = BQTable(project_id=project_id, dataset_id=dataset_id, table_id=table_id, location=location)\n        self.client = client or BigQueryClient()\n\n    @classmethod\n    def from_bqtable(cls, table, client=None):\n        \"\"\"A constructor that takes a :py:class:`BQTable`.\n\n        :param table:\n        :type table: BQTable\n        \"\"\"\n        return cls(table.project_id, 
table.dataset_id, table.table_id, client=client)\n\n    def exists(self):\n        return self.client.table_exists(self.table)\n\n    def __str__(self):\n        return str(self.table)\n\n\nclass MixinBigQueryBulkComplete:\n    \"\"\"\n    Allows to efficiently check if a range of BigQueryTargets are complete.\n    This enables scheduling tasks with luigi range tools.\n\n    If you implement a custom Luigi task with a BigQueryTarget output, make sure to also inherit\n    from this mixin to enable range support.\n    \"\"\"\n\n    @classmethod\n    def bulk_complete(cls, parameter_tuples):\n        # Instantiate the tasks to inspect them\n        tasks_with_params = [(cls(p), p) for p in parameter_tuples]\n        if not tasks_with_params:\n            return\n\n        # Grab the set of BigQuery datasets we are interested in\n        datasets = {t.output().table.dataset for t, p in tasks_with_params}\n        logger.info(\"Checking datasets %s for available tables\", datasets)\n\n        # Query the available tables for all datasets\n        client = tasks_with_params[0][0].output().client\n        available_datasets = filter(client.dataset_exists, datasets)\n        available_tables = {d: set(client.list_tables(d)) for d in available_datasets}\n\n        # Return parameter_tuples belonging to available tables\n        for t, p in tasks_with_params:\n            table = t.output().table\n            if table.table_id in available_tables.get(table.dataset, []):\n                yield p\n\n\nclass BigQueryLoadTask(MixinBigQueryBulkComplete, luigi.Task):\n    \"\"\"Load data into BigQuery from GCS.\"\"\"\n\n    @property\n    def source_format(self):\n        \"\"\"The source format to use (see :py:class:`SourceFormat`).\"\"\"\n        return SourceFormat.NEWLINE_DELIMITED_JSON\n\n    @property\n    def encoding(self):\n        \"\"\"The encoding of the data that is going to be loaded (see :py:class:`Encoding`).\"\"\"\n        return Encoding.UTF_8\n\n    @property\n  
  def write_disposition(self):\n        \"\"\"What to do if the table already exists. By default this will fail the job.\n\n        See :py:class:`WriteDisposition`\"\"\"\n        return WriteDisposition.WRITE_EMPTY\n\n    @property\n    def schema(self):\n        \"\"\"Schema in the format defined at https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schema.\n\n        If the value is falsy, it is omitted and inferred by BigQuery.\"\"\"\n        return []\n\n    @property\n    def max_bad_records(self):\n        \"\"\"The maximum number of bad records that BigQuery can ignore when reading data.\n\n        If the number of bad records exceeds this value, an invalid error is returned in the job result.\"\"\"\n        return 0\n\n    @property\n    def field_delimiter(self):\n        \"\"\"The separator for fields in a CSV file. The separator can be any ISO-8859-1 single-byte character.\"\"\"\n        return FieldDelimiter.COMMA\n\n    def source_uris(self):\n        \"\"\"The fully-qualified URIs that point to your data in Google Cloud Storage.\n\n        Each URI can contain one '*' wildcard character and it must come after the 'bucket' name.\"\"\"\n        return [x.path for x in luigi.task.flatten(self.input())]\n\n    @property\n    def skip_leading_rows(self):\n        \"\"\"The number of rows at the top of a CSV file that BigQuery will skip when loading the data.\n\n        The default value is 0. This property is useful if you have header rows in the file that should be skipped.\"\"\"\n        return 0\n\n    @property\n    def allow_jagged_rows(self):\n        \"\"\"Accept rows that are missing trailing optional columns. The missing values are treated as nulls.\n\n        If false, records with missing trailing columns are treated as bad records, and if there are too many bad records,\n\n        an invalid error is returned in the job result. The default value is false. 
Only applicable to CSV, ignored for other formats.\"\"\"\n        return False\n\n    @property\n    def ignore_unknown_values(self):\n        \"\"\"Indicates if BigQuery should allow extra values that are not represented in the table schema.\n\n        If true, the extra values are ignored. If false, records with extra columns are treated as bad records,\n\n        and if there are too many bad records, an invalid error is returned in the job result. The default value is false.\n\n        The sourceFormat property determines what BigQuery treats as an extra value:\n\n        CSV: Trailing columns JSON: Named values that don't match any column names\"\"\"\n        return False\n\n    @property\n    def allow_quoted_new_lines(self):\n        \"\"\"Indicates if BigQuery should allow quoted data sections that contain newline characters in a CSV file. The default value is false.\"\"\"\n        return False\n\n    def configure_job(self, configuration):\n        \"\"\"Set additional job configuration.\n\n        This allows to specify job configuration parameters that are not exposed via Task properties.\n\n        :param configuration: Current configuration.\n        :return: New or updated configuration.\n        \"\"\"\n        return configuration\n\n    def run(self):\n        output = self.output()\n        assert isinstance(output, BigQueryTarget), \"Output must be a BigQueryTarget, not %s\" % (output)\n\n        bq_client = output.client\n\n        source_uris = self.source_uris()\n        assert all(x.startswith(\"gs://\") for x in source_uris)\n\n        job = {\n            \"configuration\": {\n                \"load\": {\n                    \"destinationTable\": {\n                        \"projectId\": output.table.project_id,\n                        \"datasetId\": output.table.dataset_id,\n                        \"tableId\": output.table.table_id,\n                    },\n                    \"encoding\": self.encoding,\n                    
\"sourceFormat\": self.source_format,\n                    \"writeDisposition\": self.write_disposition,\n                    \"sourceUris\": source_uris,\n                    \"maxBadRecords\": self.max_bad_records,\n                    \"ignoreUnknownValues\": self.ignore_unknown_values,\n                }\n            }\n        }\n\n        if self.source_format == SourceFormat.CSV:\n            job[\"configuration\"][\"load\"][\"fieldDelimiter\"] = self.field_delimiter\n            job[\"configuration\"][\"load\"][\"skipLeadingRows\"] = self.skip_leading_rows\n            job[\"configuration\"][\"load\"][\"allowJaggedRows\"] = self.allow_jagged_rows\n            job[\"configuration\"][\"load\"][\"allowQuotedNewlines\"] = self.allow_quoted_new_lines\n\n        if self.schema:\n            job[\"configuration\"][\"load\"][\"schema\"] = {\"fields\": self.schema}\n        else:\n            job[\"configuration\"][\"load\"][\"autodetect\"] = True\n\n        job[\"configuration\"] = self.configure_job(job[\"configuration\"])\n\n        bq_client.run_job(output.table.project_id, job, dataset=output.table.dataset)\n\n\nclass BigQueryRunQueryTask(MixinBigQueryBulkComplete, luigi.Task):\n    @property\n    def write_disposition(self):\n        \"\"\"What to do if the table already exists. By default this will fail the job.\n\n        See :py:class:`WriteDisposition`\"\"\"\n        return WriteDisposition.WRITE_TRUNCATE\n\n    @property\n    def create_disposition(self):\n        \"\"\"Whether to create the table or not. 
See :py:class:`CreateDisposition`\"\"\"\n        return CreateDisposition.CREATE_IF_NEEDED\n\n    @property\n    def flatten_results(self):\n        \"\"\"Flattens all nested and repeated fields in the query results.\n        allowLargeResults must be true if this is set to False.\"\"\"\n        return True\n\n    @property\n    def query(self):\n        \"\"\"The query, in text form.\"\"\"\n        raise NotImplementedError()\n\n    @property\n    def query_mode(self):\n        \"\"\"The query mode. See :py:class:`QueryMode`.\"\"\"\n        return QueryMode.INTERACTIVE\n\n    @property\n    def udf_resource_uris(self):\n        \"\"\"Iterator of code resource to load from a Google Cloud Storage URI (gs://bucket/path).\"\"\"\n        return []\n\n    @property\n    def use_legacy_sql(self):\n        \"\"\"Whether to use legacy SQL\"\"\"\n        return True\n\n    def configure_job(self, configuration):\n        \"\"\"Set additional job configuration.\n\n        This allows to specify job configuration parameters that are not exposed via Task properties.\n\n        :param configuration: Current configuration.\n        :return: New or updated configuration.\n        \"\"\"\n        return configuration\n\n    def run(self):\n        output = self.output()\n        assert isinstance(output, BigQueryTarget), \"Output must be a BigQueryTarget, not %s\" % (output)\n\n        query = self.query\n        assert query, \"No query was provided\"\n\n        bq_client = output.client\n\n        logger.info(\"Launching Query\")\n        logger.info(\"Query destination: %s (%s)\", output, self.write_disposition)\n        logger.info(\"Query SQL: %s\", query)\n\n        job = {\n            \"configuration\": {\n                \"query\": {\n                    \"query\": query,\n                    \"priority\": self.query_mode,\n                    \"destinationTable\": {\n                        \"projectId\": output.table.project_id,\n                        \"datasetId\": 
output.table.dataset_id,\n                        \"tableId\": output.table.table_id,\n                    },\n                    \"allowLargeResults\": True,\n                    \"createDisposition\": self.create_disposition,\n                    \"writeDisposition\": self.write_disposition,\n                    \"flattenResults\": self.flatten_results,\n                    \"userDefinedFunctionResources\": [{\"resourceUri\": v} for v in self.udf_resource_uris],\n                    \"useLegacySql\": self.use_legacy_sql,\n                }\n            }\n        }\n\n        job[\"configuration\"] = self.configure_job(job[\"configuration\"])\n\n        bq_client.run_job(output.table.project_id, job, dataset=output.table.dataset)\n\n\nclass BigQueryCreateViewTask(luigi.Task):\n    \"\"\"\n    Creates (or updates) a view in BigQuery.\n\n    The output of this task needs to be a BigQueryTarget.\n    Instances of this class should specify the view SQL in the view property.\n\n    If a view already exist in BigQuery at output(), it will be updated.\n    \"\"\"\n\n    @property\n    def view(self):\n        \"\"\"The SQL query for the view, in text form.\"\"\"\n        raise NotImplementedError()\n\n    def complete(self):\n        output = self.output()\n        assert isinstance(output, BigQueryTarget), \"Output must be a BigQueryTarget, not %s\" % (output)\n\n        if not output.exists():\n            return False\n\n        existing_view = output.client.get_view(output.table)\n        return existing_view == self.view\n\n    def run(self):\n        output = self.output()\n        assert isinstance(output, BigQueryTarget), \"Output must be a BigQueryTarget, not %s\" % (output)\n\n        view = self.view\n        assert view, \"No view was provided\"\n\n        logger.info(\"Create view\")\n        logger.info(\"Destination: %s\", output)\n        logger.info(\"View SQL: %s\", view)\n\n        output.client.update_view(output.table, view)\n\n\nclass 
ExternalBigQueryTask(MixinBigQueryBulkComplete, luigi.ExternalTask):\n    \"\"\"\n    An external task for a BigQuery target.\n    \"\"\"\n\n    pass\n\n\nclass BigQueryExtractTask(luigi.Task):\n    \"\"\"\n    Extracts (unloads) a table from BigQuery to GCS.\n\n    This tasks requires the input to be exactly one BigQueryTarget while the\n    output should be one or more GCSTargets from luigi.contrib.gcs depending on\n    the use of destinationUris property.\n    \"\"\"\n\n    @property\n    def destination_uris(self):\n        \"\"\"\n        The fully-qualified URIs that point to your data in Google Cloud\n        Storage. Each URI can contain one '*' wildcard character and it must\n        come after the 'bucket' name.\n\n        Wildcarded destinationUris in GCSQueryTarget might not be resolved\n        correctly and result in incomplete data. If a GCSQueryTarget is used to\n        pass wildcarded destinationUris be sure to overwrite this property to\n        suppress the warning.\n        \"\"\"\n        return [x.path for x in luigi.task.flatten(self.output())]\n\n    @property\n    def print_header(self):\n        \"\"\"Whether to print the header or not.\"\"\"\n        return PrintHeader.TRUE\n\n    @property\n    def field_delimiter(self):\n        \"\"\"\n        The separator for fields in a CSV file. 
The separator can be any\n        ISO-8859-1 single-byte character.\n        \"\"\"\n        return FieldDelimiter.COMMA\n\n    @property\n    def destination_format(self):\n        \"\"\"\n        The destination format to use (see :py:class:`DestinationFormat`).\n        \"\"\"\n        return DestinationFormat.CSV\n\n    @property\n    def compression(self):\n        \"\"\"Whether to use compression.\"\"\"\n        return Compression.NONE\n\n    def configure_job(self, configuration):\n        \"\"\"Set additional job configuration.\n\n        This allows to specify job configuration parameters that are not exposed via Task properties.\n\n        :param configuration: Current configuration.\n        :return: New or updated configuration.\n        \"\"\"\n        return configuration\n\n    def run(self):\n        input = luigi.task.flatten(self.input())[0]\n        assert isinstance(input, BigQueryTarget) or (len(input) == 1 and isinstance(input[0], BigQueryTarget)), (\n            \"Input must be exactly one BigQueryTarget, not %s\" % (input)\n        )\n        bq_client = input.client\n\n        destination_uris = self.destination_uris\n        assert all(x.startswith(\"gs://\") for x in destination_uris)\n\n        logger.info(\"Launching Extract Job\")\n        logger.info(\"Extract source: %s\", input)\n        logger.info(\"Extract destination: %s\", destination_uris)\n\n        job = {\n            \"configuration\": {\n                \"extract\": {\n                    \"sourceTable\": {\"projectId\": input.table.project_id, \"datasetId\": input.table.dataset_id, \"tableId\": input.table.table_id},\n                    \"destinationUris\": destination_uris,\n                    \"destinationFormat\": self.destination_format,\n                    \"compression\": self.compression,\n                }\n            }\n        }\n\n        if self.destination_format == \"CSV\":\n            # \"Only exports to CSV may specify a field delimiter.\"\n          
  job[\"configuration\"][\"extract\"][\"printHeader\"] = self.print_header\n            job[\"configuration\"][\"extract\"][\"fieldDelimiter\"] = self.field_delimiter\n\n        job[\"configuration\"] = self.configure_job(job[\"configuration\"])\n\n        bq_client.run_job(input.table.project_id, job, dataset=input.table.dataset)\n\n\n# the original inconsistently capitalized aliases, for backwards compatibility\nBigqueryClient = BigQueryClient\nBigqueryTarget = BigQueryTarget\nMixinBigqueryBulkComplete = MixinBigQueryBulkComplete\nBigqueryLoadTask = BigQueryLoadTask\nBigqueryRunQueryTask = BigQueryRunQueryTask\nBigqueryCreateViewTask = BigQueryCreateViewTask\nExternalBigqueryTask = ExternalBigQueryTask\n\n\nclass BigQueryExecutionError(Exception):\n    def __init__(self, job_id, error_message) -> None:\n        \"\"\"\n        :param job_id: BigQuery Job ID\n        :type job_id: str\n        :param error_message: status['status']['errorResult'] for the failed job\n        :type error_message: str\n        \"\"\"\n        super().__init__(\"BigQuery job {} failed: {}\".format(job_id, error_message))\n        self.error_message = error_message\n        self.job_id = job_id\n"
  },
  {
    "path": "luigi/contrib/bigquery_avro.py",
    "content": "\"\"\"Specialized tasks for handling Avro data in BigQuery from GCS.\"\"\"\n\nimport logging\n\nfrom luigi.contrib.bigquery import BigQueryLoadTask, SourceFormat\nfrom luigi.contrib.gcs import GCSClient\nfrom luigi.task import flatten\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    import avro\n    import avro.datafile\nexcept ImportError:\n    logger.warning(\"bigquery_avro module imported, but avro is not installed. Any BigQueryLoadAvro task will fail to propagate schema documentation\")\n\n\nclass BigQueryLoadAvro(BigQueryLoadTask):\n    \"\"\"A helper for loading specifically Avro data into BigQuery from GCS.\n\n    Copies table level description from Avro schema doc,\n    BigQuery internally will copy field-level descriptions to the table.\n\n    Suitable for use via subclassing: override requires() to return Task(s) that output\n    to GCS Targets; their paths are expected to be URIs of .avro files or URI prefixes\n    (GCS \"directories\") containing one or many .avro files.\n\n    Override output() to return a BigQueryTarget representing the destination table.\n    \"\"\"\n\n    source_format = SourceFormat.AVRO\n\n    def _avro_uri(self, target):\n        path_or_uri = target.uri if hasattr(target, \"uri\") else target.path\n        return path_or_uri if path_or_uri.endswith(\".avro\") else path_or_uri.rstrip(\"/\") + \"/*.avro\"\n\n    def source_uris(self):\n        return [self._avro_uri(x) for x in flatten(self.input())]\n\n    def _get_input_schema(self):\n        \"\"\"Arbitrarily picks an object in input and reads the Avro schema from it.\"\"\"\n        assert avro, \"avro module required\"\n\n        input_target = flatten(self.input())[0]\n        input_fs = input_target.fs if hasattr(input_target, \"fs\") else GCSClient()\n        input_uri = self.source_uris()[0]\n        if \"*\" in input_uri:\n            file_uris = list(input_fs.list_wildcard(input_uri))\n            if file_uris:\n                input_uri = 
file_uris[0]\n            else:\n                raise RuntimeError(\"No match for \" + input_uri)\n\n        schema = []\n        exception_reading_schema = []\n\n        def read_schema(fp):\n            # fp contains the file part downloaded thus far. We rely on that the DataFileReader\n            # initializes itself fine as soon as the file header with schema is downloaded, without\n            # requiring the remainder of the file...\n            try:\n                reader = avro.datafile.DataFileReader(fp, avro.io.DatumReader())\n                schema[:] = [BigQueryLoadAvro._get_writer_schema(reader.datum_reader)]\n            except Exception as e:\n                # Save but assume benign unless schema reading ultimately fails. The benign\n                # exception in case of insufficiently big downloaded file part seems to be:\n                # TypeError('ord() expected a character, but string of length 0 found',).\n                exception_reading_schema[:] = [e]\n                return False\n            return True\n\n        input_fs.download(input_uri, 64 * 1024, read_schema).close()\n        if not schema:\n            raise exception_reading_schema[0]\n        return schema[0]\n\n    @staticmethod\n    def _get_writer_schema(datum_reader):\n        \"\"\"Python-version agnostic getter for datum_reader writer(s)_schema attribute\n\n        Parameters:\n        datum_reader (avro.io.DatumReader): DatumReader\n\n        Returns:\n        Returning correct attribute name depending on Python version.\n        \"\"\"\n        return datum_reader.writer_schema\n\n    def _set_output_doc(self, avro_schema):\n        bq_client = self.output().client.client\n        table = self.output().table\n\n        patch = {\n            \"description\": avro_schema.doc,\n        }\n\n        bq_client.tables().patch(projectId=table.project_id, datasetId=table.dataset_id, tableId=table.table_id, body=patch).execute()\n\n    def run(self):\n        
super(BigQueryLoadAvro, self).run()\n\n        # We propagate documentation in one fire-and-forget attempt; the output table is\n        # left to exist without documentation if this step raises an exception.\n        try:\n            self._set_output_doc(self._get_input_schema())\n        except Exception as e:\n            logger.warning(\"Could not propagate Avro doc to BigQuery table description: %r\", e)\n"
  },
  {
    "path": "luigi/contrib/datadog_metric.py",
    "content": "import logging\n\nfrom luigi import parameter\nfrom luigi.metrics import MetricsCollector\nfrom luigi.task import Config\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    from datadog import api, initialize, statsd\nexcept ImportError:\n    logger.warning(\"Loading datadog module without datadog installed. Will crash at runtime if datadog functionality is used.\")\n\n\nclass datadog(Config):\n    api_key = parameter.Parameter(default=\"dummy_api_key\", description=\"API key provided by Datadog\")\n    app_key = parameter.Parameter(default=\"dummy_app_key\", description=\"APP key provided by Datadog\")\n    default_tags = parameter.Parameter(default=\"application:luigi\", description=\"Default tags for every events and metrics sent to Datadog\")\n    environment = parameter.Parameter(default=\"development\", description=\"Environment of which the pipeline is ran from (eg: 'production', 'staging', ...\")\n    metric_namespace = parameter.Parameter(default=\"luigi\", description=\"Default namespace for events and metrics (eg: 'luigi' for 'luigi.task.started')\")\n    statsd_host = parameter.Parameter(default=\"localhost\", description=\"StatsD host implementing the Datadog service\")\n    statsd_port = parameter.IntParameter(default=8125, description=\"StatsD port implementing the Datadog service\")\n\n\nclass DatadogMetricsCollector(MetricsCollector):\n    def __init__(self, *args, **kwargs):\n        self._config = datadog(**kwargs)\n\n        initialize(api_key=self._config.api_key, app_key=self._config.app_key, statsd_host=self._config.statsd_host, statsd_port=self._config.statsd_port)\n\n    def handle_task_started(self, task):\n        title = \"Luigi: A task has been started!\"\n        text = \"A task has been started in the pipeline named: {name}\".format(name=task.family)\n        tags = [\"task_name:{name}\".format(name=task.family)] + self._format_task_params_to_tags(task)\n\n        self._send_increment(\"task.started\", 
tags=tags)\n\n        event_tags = tags + [\"task_state:STARTED\"]\n        self._send_event(title=title, text=text, tags=event_tags, alert_type=\"info\", priority=\"low\")\n\n    def handle_task_failed(self, task):\n        title = \"Luigi: A task has failed!\"\n        text = \"A task has failed in the pipeline named: {name}\".format(name=task.family)\n        tags = [\"task_name:{name}\".format(name=task.family)] + self._format_task_params_to_tags(task)\n\n        self._send_increment(\"task.failed\", tags=tags)\n\n        event_tags = tags + [\"task_state:FAILED\"]\n        self._send_event(title=title, text=text, tags=event_tags, alert_type=\"error\", priority=\"normal\")\n\n    def handle_task_disabled(self, task, config):\n        title = \"Luigi: A task has been disabled!\"\n        lines = [\"A task has been disabled in the pipeline named: {name}.\"]\n        lines.append(\"The task has failed {failures} times in the last {window}\")\n        lines.append(\"seconds, so it is being disabled for {persist} seconds.\")\n\n        preformated_text = \" \".join(lines)\n\n        text = preformated_text.format(name=task.family, persist=config.disable_persist, failures=config.retry_count, window=config.disable_window)\n\n        tags = [\"task_name:{name}\".format(name=task.family)] + self._format_task_params_to_tags(task)\n\n        self._send_increment(\"task.disabled\", tags=tags)\n\n        event_tags = tags + [\"task_state:DISABLED\"]\n        self._send_event(title=title, text=text, tags=event_tags, alert_type=\"error\", priority=\"normal\")\n\n    def handle_task_done(self, task):\n        # The task is already done -- Let's not re-create an event\n        if task.time_running is None:\n            return\n\n        title = \"Luigi: A task has been completed!\"\n        text = \"A task has completed in the pipeline named: {name}\".format(name=task.family)\n        tags = [\"task_name:{name}\".format(name=task.family)] + 
self._format_task_params_to_tags(task)\n\n        time_elapse = task.updated - task.time_running\n\n        self._send_increment(\"task.done\", tags=tags)\n        self._send_gauge(\"task.execution_time\", time_elapse, tags=tags)\n\n        event_tags = tags + [\"task_state:DONE\"]\n        self._send_event(title=title, text=text, tags=event_tags, alert_type=\"info\", priority=\"low\")\n\n    def _send_event(self, **params):\n        params[\"tags\"] += self.default_tags\n\n        api.Event.create(**params)\n\n    def _send_gauge(self, metric_name, value, tags=[]):\n        all_tags = tags + self.default_tags\n\n        namespaced_metric = \"{namespace}.{metric_name}\".format(namespace=self._config.metric_namespace, metric_name=metric_name)\n        statsd.gauge(namespaced_metric, value, tags=all_tags)\n\n    def _send_increment(self, metric_name, value=1, tags=[]):\n        all_tags = tags + self.default_tags\n\n        namespaced_metric = \"{namespace}.{metric_name}\".format(namespace=self._config.metric_namespace, metric_name=metric_name)\n        statsd.increment(namespaced_metric, value, tags=all_tags)\n\n    def _format_task_params_to_tags(self, task):\n        params = []\n        for key, value in task.params.items():\n            params.append(\"{key}:{value}\".format(key=key, value=value))\n\n        return params\n\n    @property\n    def default_tags(self):\n        default_tags = []\n\n        env_tag = \"environment:{environment}\".format(environment=self._config.environment)\n        default_tags.append(env_tag)\n\n        if self._config.default_tags:\n            default_tags = default_tags + str.split(self._config.default_tags, \",\")\n\n        return default_tags\n"
  },
  {
    "path": "luigi/contrib/dataproc.py",
    "content": "\"\"\"luigi bindings for Google Dataproc on Google Cloud\"\"\"\n\nimport logging\nimport os\nimport time\n\nimport luigi\nfrom luigi.contrib import gcp\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n_dataproc_client = None\n\ntry:\n    import google.auth\n    from googleapiclient import discovery\n    from googleapiclient.errors import HttpError\n\n    DEFAULT_CREDENTIALS, _ = google.auth.default()\n    authenticate_kwargs = gcp.get_authenticate_kwargs(DEFAULT_CREDENTIALS)\n    _dataproc_client = discovery.build(\"dataproc\", \"v1\", cache_discovery=False, **authenticate_kwargs)\nexcept ImportError:\n    logger.warning(\n        \"Loading Dataproc module without the python packages googleapiclient & google-auth. \\\n        This will crash at runtime if Dataproc functionality is used.\"\n    )\n\n\ndef get_dataproc_client():\n    return _dataproc_client\n\n\ndef set_dataproc_client(client):\n    global _dataproc_client\n    _dataproc_client = client\n\n\nclass _DataprocBaseTask(luigi.Task):\n    gcloud_project_id = luigi.Parameter(significant=False, positional=False)\n    dataproc_cluster_name = luigi.Parameter(significant=False, positional=False)\n    dataproc_region = luigi.Parameter(default=\"global\", significant=False, positional=False)\n\n    dataproc_client = get_dataproc_client()\n\n\nclass DataprocBaseTask(_DataprocBaseTask):\n    \"\"\"\n    Base task for running jobs in Dataproc. 
It is recommended to use one of the tasks specific to your job type.\n    Extend this class if you need fine grained control over what kind of job gets submitted to your Dataproc cluster.\n    \"\"\"\n\n    _job = None\n    _job_name = None\n    _job_id = None\n\n    def submit_job(self, job_config):\n        self._job = (\n            self.dataproc_client.projects().regions().jobs().submit(projectId=self.gcloud_project_id, region=self.dataproc_region, body=job_config).execute()\n        )\n        self._job_id = self._job[\"reference\"][\"jobId\"]\n        return self._job\n\n    def submit_spark_job(self, jars, main_class, job_args=None):\n\n        if job_args is None:\n            job_args = []\n\n        job_config = {\n            \"job\": {\"placement\": {\"clusterName\": self.dataproc_cluster_name}, \"sparkJob\": {\"args\": job_args, \"mainClass\": main_class, \"jarFileUris\": jars}}\n        }\n        self.submit_job(job_config)\n        self._job_name = os.path.basename(self._job[\"sparkJob\"][\"mainClass\"])\n        logger.info(\"Submitted new dataproc job:{} id:{}\".format(self._job_name, self._job_id))\n        return self._job\n\n    def submit_pyspark_job(self, job_file, extra_files=list(), job_args=None):\n\n        if job_args is None:\n            job_args = []\n\n        job_config = {\n            \"job\": {\n                \"placement\": {\"clusterName\": self.dataproc_cluster_name},\n                \"pysparkJob\": {\"mainPythonFileUri\": job_file, \"pythonFileUris\": extra_files, \"args\": job_args},\n            }\n        }\n        self.submit_job(job_config)\n        self._job_name = os.path.basename(self._job[\"pysparkJob\"][\"mainPythonFileUri\"])\n        logger.info(\"Submitted new dataproc job:{} id:{}\".format(self._job_name, self._job_id))\n        return self._job\n\n    def wait_for_job(self):\n        if self._job is None:\n            raise Exception(\"You must submit a job before you can wait for it\")\n        while 
True:\n            job_result = (\n                self.dataproc_client.projects()\n                .regions()\n                .jobs()\n                .get(projectId=self.gcloud_project_id, region=self.dataproc_region, jobId=self._job_id)\n                .execute()\n            )\n            status = job_result[\"status\"][\"state\"]\n            logger.info(\"Current dataproc status: {} job:{} id:{}\".format(status, self._job_name, self._job_id))\n            if status == \"DONE\":\n                break\n            if status == \"ERROR\":\n                raise Exception(job_result[\"status\"][\"details\"])\n            time.sleep(5)\n\n\nclass DataprocSparkTask(DataprocBaseTask):\n    \"\"\"\n    Runs a spark jobs on your Dataproc cluster\n    \"\"\"\n\n    main_class = luigi.Parameter()\n    jars = luigi.Parameter(default=\"\")\n    job_args = luigi.Parameter(default=\"\")\n\n    def run(self):\n        self.submit_spark_job(\n            main_class=self.main_class, jars=self.jars.split(\",\") if self.jars else [], job_args=self.job_args.split(\",\") if self.job_args else []\n        )\n        self.wait_for_job()\n\n\nclass DataprocPysparkTask(DataprocBaseTask):\n    \"\"\"\n    Runs a pyspark jobs on your Dataproc cluster\n    \"\"\"\n\n    job_file = luigi.Parameter()\n    extra_files = luigi.Parameter(default=\"\")\n    job_args = luigi.Parameter(default=\"\")\n\n    def run(self):\n        self.submit_pyspark_job(\n            job_file=self.job_file,\n            extra_files=self.extra_files.split(\",\") if self.extra_files else [],\n            job_args=self.job_args.split(\",\") if self.job_args else [],\n        )\n        self.wait_for_job()\n\n\nclass CreateDataprocClusterTask(_DataprocBaseTask):\n    \"\"\"Task for creating a Dataproc cluster.\"\"\"\n\n    gcloud_zone = luigi.Parameter(default=\"europe-west1-c\")\n    gcloud_network = luigi.Parameter(default=\"default\")\n\n    master_node_type = luigi.Parameter(default=\"n1-standard-2\")\n    
master_disk_size = luigi.Parameter(default=\"100\")\n    worker_node_type = luigi.Parameter(default=\"n1-standard-2\")\n    worker_disk_size = luigi.Parameter(default=\"100\")\n    worker_normal_count = luigi.Parameter(default=\"2\")\n    worker_preemptible_count = luigi.Parameter(default=\"0\")\n    image_version = luigi.Parameter(default=\"\")\n\n    def _get_cluster_status(self):\n        return (\n            self.dataproc_client.projects()\n            .regions()\n            .clusters()\n            .get(projectId=self.gcloud_project_id, region=self.dataproc_region, clusterName=self.dataproc_cluster_name)\n            .execute()\n        )\n\n    def complete(self):\n        try:\n            self._get_cluster_status()\n            return True  # No (404) error so the cluster already exists\n        except HttpError as e:\n            if e.resp.status == 404:\n                return False  # We got a 404 so the cluster doesn't exist yet\n            else:\n                raise e  # Something's wrong ...\n\n    def run(self):\n        base_uri = \"https://www.googleapis.com/compute/v1/projects/{}\".format(self.gcloud_project_id)\n        software_config = {\"imageVersion\": self.image_version} if self.image_version else {}\n\n        cluster_conf = {\n            \"clusterName\": self.dataproc_cluster_name,\n            \"projectId\": self.gcloud_project_id,\n            \"config\": {\n                \"configBucket\": \"\",\n                \"gceClusterConfig\": {\n                    \"networkUri\": base_uri + \"/global/networks/\" + self.gcloud_network,\n                    \"zoneUri\": base_uri + \"/zones/\" + self.gcloud_zone,\n                    \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\"],\n                },\n                \"masterConfig\": {\n                    \"numInstances\": 1,\n                    \"machineTypeUri\": base_uri + \"/zones/\" + self.gcloud_zone + \"/machineTypes/\" + self.master_node_type,\n    
                \"diskConfig\": {\"bootDiskSizeGb\": self.master_disk_size, \"numLocalSsds\": 0},\n                },\n                \"workerConfig\": {\n                    \"numInstances\": self.worker_normal_count,\n                    \"machineTypeUri\": base_uri + \"/zones/\" + self.gcloud_zone + \"/machineTypes/\" + self.worker_node_type,\n                    \"diskConfig\": {\"bootDiskSizeGb\": self.worker_disk_size, \"numLocalSsds\": 0},\n                },\n                \"secondaryWorkerConfig\": {\"numInstances\": self.worker_preemptible_count, \"isPreemptible\": True},\n                \"softwareConfig\": software_config,\n            },\n        }\n\n        self.dataproc_client.projects().regions().clusters().create(projectId=self.gcloud_project_id, region=self.dataproc_region, body=cluster_conf).execute()\n\n        while True:\n            time.sleep(10)\n            cluster_status = self._get_cluster_status()\n            status = cluster_status[\"status\"][\"state\"]\n            logger.info(\"Creating new dataproc cluster: {} status: {}\".format(self.dataproc_cluster_name, status))\n            if status == \"RUNNING\":\n                break\n            if status == \"ERROR\":\n                raise Exception(cluster_status[\"status\"][\"details\"])\n\n\nclass DeleteDataprocClusterTask(_DataprocBaseTask):\n    \"\"\"\n    Task for deleting a Dataproc cluster.\n    One of the uses for this class is to extend it and have it require a Dataproc task that does a calculation and have\n    that task extend the cluster creation task. This allows you to create chains where you create a cluster,\n    run your job and remove the cluster right away.\n    (Store your input and output files in gs://... instead of hdfs://... 
if you do this).\n    \"\"\"\n\n    def _get_cluster_status(self):\n        try:\n            return (\n                self.dataproc_client.projects()\n                .regions()\n                .clusters()\n                .get(projectId=self.gcloud_project_id, region=self.dataproc_region, clusterName=self.dataproc_cluster_name, fields=\"status\")\n                .execute()\n            )\n        except HttpError as e:\n            if e.resp.status == 404:\n                return None  # We got a 404 so the cluster doesn't exist\n            else:\n                raise e\n\n    def complete(self):\n        return self._get_cluster_status() is None\n\n    def run(self):\n        self.dataproc_client.projects().regions().clusters().delete(\n            projectId=self.gcloud_project_id, region=self.dataproc_region, clusterName=self.dataproc_cluster_name\n        ).execute()\n\n        while True:\n            time.sleep(10)\n            status = self._get_cluster_status()\n            if status is None:\n                logger.info(\"Finished shutting down cluster: {}\".format(self.dataproc_cluster_name))\n                break\n            logger.info(\"Shutting down cluster: {} current status: {}\".format(self.dataproc_cluster_name, status[\"status\"][\"state\"]))\n"
  },
  {
    "path": "luigi/contrib/docker_runner.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2017 Open Targets\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\n\"\"\"\nDocker container wrapper for Luigi.\n\nEnables running a docker container as a task in luigi.\nThis wrapper uses the Docker Python SDK to communicate directly with the\nDocker API avoiding the common pattern to invoke the docker client\nfrom the command line. Using the SDK it is possible to detect and properly\nhandle errors occurring when pulling, starting or running the containers.\nOn top of this, it is possible to mount a single file in the container\nand a temporary directory is created on the host and mounted allowing\nthe handling of files bigger than the container limit.\n\nRequires:\n\n- docker: ``pip install docker``\n\nWritten and maintained by Andrea Pierleoni (@apierleoni).\nContributions by Eliseo Papa (@elipapa).\n\"\"\"\n\nimport logging\nfrom tempfile import mkdtemp\n\nimport luigi\nfrom luigi.local_target import LocalFileSystem\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    import docker\n    from docker.errors import APIError, ContainerError, ImageNotFound\n\nexcept ImportError:\n    logger.warning(\"docker is not installed. 
DockerTask requires docker.\")\n    docker = None\n\n# TODO: may need to implement this logic for remote hosts\n# class dockerconfig(luigi.Config):\n#     '''\n#     this class allows to use the luigi.cfg file to specify the path to the docker config.json.\n#     The docker client should look by default in the main directory,\n#      but on different systems this may need to be specified.\n#     '''\n#     docker_config_path = luigi.Parameter(\n#         default=\"~/.docker/config.json\",\n#         description=\"Path to dockercfg file for authentication\")\n\n\nclass DockerTask(luigi.Task):\n    @property\n    def image(self):\n        return \"alpine\"\n\n    @property\n    def command(self):\n        return \"echo hello world\"\n\n    @property\n    def name(self):\n        return None\n\n    @property\n    def host_config_options(self):\n        \"\"\"\n        Override this to specify host_config options like gpu requests or shm\n        size e.g. `{\"device_requests\": [docker.types.DeviceRequest(count=1, capabilities=[[\"gpu\"]])]}`\n\n        See https://docker-py.readthedocs.io/en/stable/api.html#docker.api.container.ContainerApiMixin.create_host_config\n        \"\"\"\n        return {}\n\n    @property\n    def container_options(self):\n        \"\"\"\n        Override this to specify container options like user or ports e.g.\n        `{\"user\": f\"{os.getuid()}:{os.getgid()}\"}`\n\n        See https://docker-py.readthedocs.io/en/stable/api.html#docker.api.container.ContainerApiMixin.create_container\n        \"\"\"\n        return {}\n\n    @property\n    def environment(self):\n        return {}\n\n    @property\n    def container_tmp_dir(self):\n        return \"/tmp/luigi\"\n\n    @property\n    def binds(self):\n        \"\"\"\n        Override this to mount local volumes, in addition to the /tmp/luigi\n        which gets defined by default. This should return a list of strings.\n        e.g. 
['/hostpath1:/containerpath1', '/hostpath2:/containerpath2']\n        \"\"\"\n        return None\n\n    @property\n    def network_mode(self):\n        return \"\"\n\n    @property\n    def docker_url(self):\n        return None\n\n    @property\n    def auto_remove(self):\n        return True\n\n    @property\n    def force_pull(self):\n        return False\n\n    @property\n    def mount_tmp(self):\n        return True\n\n    def __init__(self, *args, **kwargs):\n        \"\"\"\n        When a new instance of the DockerTask class gets created:\n        - call the parent class __init__ method\n        - start the logger\n        - init an instance of the docker client\n        - create a tmp dir\n        - add the temp dir to the volume binds specified in the task\n        \"\"\"\n        super(DockerTask, self).__init__(*args, **kwargs)\n        self.__logger = logger\n\n        \"\"\"init docker client\n        using the low level API as the higher level API does not allow to mount single\n        files as volumes\n        \"\"\"\n        self._client = docker.APIClient(self.docker_url)\n\n        # add latest tag if nothing else is specified by task\n        if \":\" not in self.image:\n            self._image = \":\".join([self.image, \"latest\"])\n        else:\n            self._image = self.image\n\n        if self.mount_tmp:\n            # create a tmp_dir, NOTE: /tmp needs to be specified for it to work on\n            # macOS, despite what the python documentation says\n            self._host_tmp_dir = mkdtemp(suffix=self.task_id, prefix=\"luigi-docker-tmp-dir-\", dir=\"/tmp\")\n\n            self._binds = [\"{0}:{1}\".format(self._host_tmp_dir, self.container_tmp_dir)]\n        else:\n            self._binds = []\n\n        # update environment property with the (internal) location of tmp_dir\n        self.environment[\"LUIGI_TMP_DIR\"] = self.container_tmp_dir\n\n        # add additional volume binds specified by the user to the tmp_Dir bind\n        
if isinstance(self.binds, str):\n            self._binds.append(self.binds)\n        elif isinstance(self.binds, list):\n            self._binds.extend(self.binds)\n\n        # derive volumes (ie. list of container destination paths) from\n        # specified binds\n        self._volumes = [b.split(\":\")[1] for b in self._binds]\n\n    def run(self):\n\n        # get image if missing\n        if self.force_pull or len(self._client.images(name=self._image)) == 0:\n            logger.info(\"Pulling docker image \" + self._image)\n            try:\n                for logline in self._client.pull(self._image, stream=True):\n                    logger.debug(logline.decode(\"utf-8\"))\n            except APIError as e:\n                self.__logger.warning(\"Error in Docker API: \" + e.explanation)\n                raise\n\n        # remove clashing container if a container with the same name exists\n        if self.auto_remove and self.name:\n            try:\n                self._client.remove_container(self.name, force=True)\n            except APIError as e:\n                self.__logger.warning(\"Ignored error in Docker API: \" + e.explanation)\n\n        # run the container\n        try:\n            logger.debug(\"Creating image: %s command: %s volumes: %s\" % (self._image, self.command, self._binds))\n\n            host_config = self._client.create_host_config(binds=self._binds, network_mode=self.network_mode, **self.host_config_options)\n\n            container = self._client.create_container(\n                self._image,\n                command=self.command,\n                name=self.name,\n                environment=self.environment,\n                volumes=self._volumes,\n                host_config=host_config,\n                **self.container_options,\n            )\n            self._client.start(container[\"Id\"])\n\n            exit_status = self._client.wait(container[\"Id\"])\n            # docker-py>=3.0.0 returns a dict instead of the 
status code directly\n            if type(exit_status) is dict:\n                exit_status = exit_status[\"StatusCode\"]\n\n            if exit_status != 0:\n                stdout = False\n                stderr = True\n                error = self._client.logs(container[\"Id\"], stdout=stdout, stderr=stderr)\n            if self.auto_remove:\n                try:\n                    self._client.remove_container(container[\"Id\"])\n                except docker.errors.APIError:\n                    self.__logger.warning(\"Container \" + container[\"Id\"] + \" could not be removed\")\n            if exit_status != 0:\n                raise ContainerError(container, exit_status, self.command, self._image, error)\n\n        except ContainerError as e:\n            # catch non zero exit status, log it and re-raise\n            container_name = \"\"\n            if self.name:\n                container_name = self.name\n            try:\n                message = e.message\n            except AttributeError:\n                message = str(e)\n            self.__logger.error(\"Container \" + container_name + \" exited with non zero code: \" + message)\n            raise\n        except ImageNotFound:\n            self.__logger.error(\"Image \" + self._image + \" not found\")\n            raise\n        except APIError as e:\n            self.__logger.error(\"Error in Docker API: \" + e.explanation)\n            raise\n\n        # delete temp dir\n        filesys = LocalFileSystem()\n        if self.mount_tmp and filesys.exists(self._host_tmp_dir):\n            filesys.remove(self._host_tmp_dir, recursive=True)\n"
  },
  {
    "path": "luigi/contrib/dropbox.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2019 Jose-Ignacio Riaño Chico\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n#\n\nimport logging\nimport ntpath\nimport os\nimport random\nimport tempfile\nimport time\nfrom contextlib import contextmanager\nfrom functools import wraps\n\nimport luigi.format\nfrom luigi.target import AtomicLocalFile, FileSystem, FileSystemTarget\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    import dropbox.dropbox_client\n    import dropbox.exceptions\n    import dropbox.files\nexcept ImportError:\n    logger.warning(\n        \"Loading Dropbox module without the python package dropbox (https://pypi.org/project/dropbox/). Will crash at runtime if Dropbox functionality is used.\"\n    )\n\n\ndef accept_trailing_slash_in_existing_dirpaths(func):\n    @wraps(func)\n    def wrapped(self, path, *args, **kwargs):\n        if path != \"/\" and path.endswith(\"/\"):\n            logger.warning(\"Dropbox paths should NOT have trailing slashes. 
This causes additional API calls\")\n            logger.warning(\"Consider modifying your calls to {}, so that they don't use paths than end with '/'\".format(func.__name__))\n\n            if self._exists_and_is_dir(path[:-1]):\n                path = path[:-1]\n\n        return func(self, path, *args, **kwargs)\n\n    return wrapped\n\n\ndef accept_trailing_slash(func):\n    @wraps(func)\n    def wrapped(self, path, *args, **kwargs):\n        if path != \"/\" and path.endswith(\"/\"):\n            path = path[:-1]\n        return func(self, path, *args, **kwargs)\n\n    return wrapped\n\n\nclass DropboxClient(FileSystem):\n    \"\"\"\n    Dropbox client for authentication, designed to be used by the :py:class:`DropboxTarget` class.\n    \"\"\"\n\n    def __init__(self, token, user_agent=\"Luigi\", root_namespace_id=None):\n        \"\"\"\n        :param str token: Dropbox Oauth2 Token. See :class:`DropboxTarget` for more information about generating a token\n        :param str root_namespace_id: Root namespace ID for interacting with Team Spaces\n        \"\"\"\n        if not token:\n            raise ValueError(\"The token parameter must contain a valid Dropbox Oauth2 Token\")\n\n        try:\n            conn = dropbox.dropbox_client.Dropbox(oauth2_access_token=token, user_agent=user_agent)\n        except Exception as e:\n            raise Exception(\"Cannot connect to Dropbox. Check your Internet connection and the token. 
\\n\" + repr(e))\n\n        if root_namespace_id:\n            conn = conn.with_path_root(dropbox.common.PathRoot.root(root_namespace_id))\n\n        self.token = token\n        self.conn = conn\n\n    @accept_trailing_slash_in_existing_dirpaths\n    def exists(self, path):\n        if path == \"/\":\n            return True\n        if path.endswith(\"/\"):\n            path = path[:-1]\n            return self._exists_and_is_dir(path)\n\n        try:\n            self.conn.files_get_metadata(path)\n            return True\n        except dropbox.exceptions.ApiError as e:\n            if isinstance(e.error.get_path(), dropbox.files.LookupError):\n                return False\n            else:\n                raise e\n\n    @accept_trailing_slash_in_existing_dirpaths\n    def remove(self, path, recursive=True, skip_trash=True):\n        if not self.exists(path):\n            return False\n        self.conn.files_delete_v2(path)\n        return True\n\n    @accept_trailing_slash\n    def mkdir(self, path, parents=True, raise_if_exists=False):\n        if self.exists(path):\n            if not self.isdir(path):\n                raise luigi.target.NotADirectory()\n            elif raise_if_exists:\n                raise luigi.target.FileAlreadyExists()\n            else:\n                return\n\n        self.conn.files_create_folder_v2(path)\n\n    @accept_trailing_slash_in_existing_dirpaths\n    def isdir(self, path):\n        if path == \"/\":\n            return True\n        try:\n            md = self.conn.files_get_metadata(path)\n            return isinstance(md, dropbox.files.FolderMetadata)\n        except dropbox.exceptions.ApiError as e:\n            if isinstance(e.error.get_path(), dropbox.files.LookupError):\n                return False\n            else:\n                raise e\n\n    @accept_trailing_slash_in_existing_dirpaths\n    def listdir(self, path, **kwargs):\n        dirs = []\n        lister = self.conn.files_list_folder(path, 
recursive=True, **kwargs)\n        dirs.extend(lister.entries)\n        while lister.has_more:\n            lister = self.conn.files_list_folder_continue(lister.cursor)\n            dirs.extend(lister.entries)\n        return [d.path_display for d in dirs]\n\n    @accept_trailing_slash_in_existing_dirpaths\n    def move(self, path, dest):\n        self.conn.files_move_v2(from_path=path, to_path=dest)\n\n    @accept_trailing_slash_in_existing_dirpaths\n    def copy(self, path, dest):\n        self.conn.files_copy_v2(from_path=path, to_path=dest)\n\n    def download_as_bytes(self, path):\n        metadata, response = self.conn.files_download(path)\n        return response.content\n\n    def upload(self, tmp_path, dest_path):\n        with open(tmp_path, \"rb\") as f:\n            file_size = os.path.getsize(tmp_path)\n\n            CHUNK_SIZE = 4 * 1000 * 1000\n            upload_session_start_result = self.conn.files_upload_session_start(f.read(CHUNK_SIZE))\n            commit = dropbox.files.CommitInfo(path=dest_path)\n            cursor = dropbox.files.UploadSessionCursor(session_id=upload_session_start_result.session_id, offset=f.tell())\n\n            if f.tell() >= file_size:\n                self.conn.files_upload_session_finish(f.read(CHUNK_SIZE), cursor, commit)\n                return\n\n            while f.tell() < file_size:\n                if (file_size - f.tell()) <= CHUNK_SIZE:\n                    self.conn.files_upload_session_finish(f.read(CHUNK_SIZE), cursor, commit)\n                else:\n                    self.conn.files_upload_session_append_v2(f.read(CHUNK_SIZE), cursor)\n                    cursor.offset = f.tell()\n\n    def _exists_and_is_dir(self, path):\n        \"\"\"\n        Auxiliary method, used by the 'accept_trailing_slash' and 'accept_trailing_slash_in_existing_dirpaths' decorators\n        :param path: a Dropbox path that does NOT ends with a '/' (even if it is a directory)\n        \"\"\"\n        if path == \"/\":\n          
  return True\n        try:\n            md = self.conn.files_get_metadata(path)\n            is_dir = isinstance(md, dropbox.files.FolderMetadata)\n            return is_dir\n        except dropbox.exceptions.ApiError:\n            return False\n\n\nclass ReadableDropboxFile:\n    def __init__(self, path, client):\n        \"\"\"\n        Represents a file inside the Dropbox cloud which will be read\n\n        :param str path: Dropbpx path of the file to be read (always starting with /)\n        :param DropboxClient client: a DropboxClient object (initialized with a valid token)\n\n        \"\"\"\n        self.path = path\n        self.client = client\n        self.download_file_location = os.path.join(tempfile.mkdtemp(prefix=str(time.time())), ntpath.basename(path))\n        self.fid = None\n        self.closed = False\n\n    def read(self):\n        return self.client.download_as_bytes(self.path)\n\n    def __enter__(self):\n        return self\n\n    def __exit__(self, exc_type, exc, traceback):\n        self.close()\n\n    def __del__(self):\n        self.close()\n        if os.path.exists(self.download_file_location):\n            os.remove(self.download_file_location)\n\n    def close(self):\n        self.closed = True\n\n    def readable(self):\n        return True\n\n    def writable(self):\n        return False\n\n    def seekable(self):\n        return False\n\n\nclass AtomicWritableDropboxFile(AtomicLocalFile):\n    def __init__(self, path, client):\n        \"\"\"\n        Represents a file that will be created inside the Dropbox cloud\n\n        :param str path: Destination path inside Dropbox\n        :param DropboxClient client: a DropboxClient object (initialized with a valid token, for the desired account)\n        \"\"\"\n        super(AtomicWritableDropboxFile, self).__init__(path)\n        self.path = path\n        self.client = client\n\n    def move_to_final_destination(self):\n        \"\"\"\n        After editing the file locally, this 
function uploads it to the Dropbox cloud\n        \"\"\"\n        self.client.upload(self.tmp_path, self.path)\n\n\nclass DropboxTarget(FileSystemTarget):\n    \"\"\"\n    A Dropbox filesystem target.\n    \"\"\"\n\n    def __init__(self, path, token, format=None, user_agent=\"Luigi\", root_namespace_id=None):\n        \"\"\"\n        Create an Dropbox Target for storing data in a dropbox.com account\n\n        **About the path parameter**\n\n        The path must start with '/' and should not end with '/' (even if it is a directory).\n        The path must not contain adjacent slashes ('/files//img.jpg' is an invalid path)\n\n        If the app has 'App folder' access, then / will refer to this app folder (which\n        mean that there is no need to prepend the name of the app to the path)\n        Otherwise, if the app has 'full access', then / will refer to the root of the Dropbox folder\n\n\n        **About the token parameter:**\n\n        The Dropbox target requires a valid OAuth2 token as a parameter (which means that a `Dropbox API app\n        <https://www.dropbox.com/developers/apps>`_ must be created. This app can have 'App folder' access\n        or 'Full Dropbox', as desired).\n\n        Information about generating the token can be read here:\n\n        - https://dropbox-sdk-python.readthedocs.io/en/latest/api/oauth.html#dropbox.oauth.DropboxOAuth2Flow\n        - https://blogs.dropbox.com/developers/2014/05/generate-an-access-token-for-your-own-account/\n\n        :param str path: Remote path in Dropbox (starting with '/').\n        :param str token: a valid OAuth2 Dropbox token.\n        :param luigi.Format format: the luigi format to use (e.g. 
`luigi.format.Nop`)\n        :param str root_namespace_id: Root namespace ID for interacting with Team Spaces\n\n\n        \"\"\"\n        super(DropboxTarget, self).__init__(path)\n\n        if not token:\n            raise ValueError(\"The token parameter must contain a valid Dropbox Oauth2 Token\")\n\n        self.path = path\n        self.token = token\n        self.client = DropboxClient(token, user_agent, root_namespace_id)\n        self.format = format or luigi.format.get_default_format()\n\n    def __str__(self):\n        return self.path\n\n    @property\n    def fs(self):\n        return self.client\n\n    @contextmanager\n    def temporary_path(self):\n        tmp_dir = tempfile.mkdtemp()\n        num = random.randrange(0, 10_000_000_000)\n        temp_path = \"{}{}luigi-tmp-{:010}{}\".format(tmp_dir, os.sep, num, ntpath.basename(self.path))\n\n        yield temp_path\n        # We won't reach here if there was a user exception.\n        self.fs.upload(temp_path, self.path)\n\n    def open(self, mode):\n        if mode not in (\"r\", \"w\"):\n            raise ValueError(\"Unsupported open mode '%s'\" % mode)\n        if mode == \"r\":\n            return self.format.pipe_reader(ReadableDropboxFile(self.path, self.client))\n        else:\n            return self.format.pipe_writer(AtomicWritableDropboxFile(self.path, self.client))\n"
  },
  {
    "path": "luigi/contrib/ecs.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015 Outlier Bio, LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nEC2 Container Service wrapper for Luigi\n\nFrom the AWS website:\n\n  Amazon EC2 Container Service (ECS) is a highly scalable, high performance\n  container management service that supports Docker containers and allows you\n  to easily run applications on a managed cluster of Amazon EC2 instances.\n\nTo use ECS, you create a taskDefinition_ JSON that defines the `docker run`_\ncommand for one or more containers in a task or service, and then submit this\nJSON to the API to run the task.\n\nThis `boto3-powered`_ wrapper allows you to create Luigi Tasks to submit ECS\n``taskDefinition`` s. You can either pass a dict (mapping directly to the\n``taskDefinition`` JSON) OR an Amazon Resource Name (arn) for a previously\nregistered ``taskDefinition``.\n\nRequires:\n\n- boto3 package\n- Amazon AWS credentials discoverable by boto3 (e.g., by using ``aws configure``\n  from awscli_)\n- A running ECS cluster (see `ECS Get Started`_)\n\nWritten and maintained by Jake Feala (@jfeala) for Outlier Bio (@outlierbio)\n\n.. _`docker run`: https://docs.docker.com/reference/commandline/run\n.. _taskDefinition: http://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_defintions.html\n.. _`boto3-powered`: https://boto3.readthedocs.io\n.. _awscli: https://aws.amazon.com/cli\n.. 
_`ECS Get Started`: http://docs.aws.amazon.com/AmazonECS/latest/developerguide/ECS_GetStarted.html\n\n\"\"\"\n\nimport copy\nimport logging\nimport time\n\nimport luigi\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    import boto3\n\n    client = boto3.client(\"ecs\")\nexcept ImportError:\n    logger.warning(\"boto3 is not installed. ECSTasks require boto3\")\n\nPOLL_TIME = 2\n\n\ndef _get_task_statuses(task_ids, cluster):\n    \"\"\"\n    Retrieve task statuses from ECS API\n\n    Returns list of {RUNNING|PENDING|STOPPED} for each id in task_ids\n    \"\"\"\n    response = client.describe_tasks(tasks=task_ids, cluster=cluster)\n\n    # Error checking\n    if response[\"failures\"] != []:\n        raise Exception(\"There were some failures:\\n{0}\".format(response[\"failures\"]))\n    status_code = response[\"ResponseMetadata\"][\"HTTPStatusCode\"]\n    if status_code != 200:\n        msg = \"Task status request received status code {0}:\\n{1}\"\n        raise Exception(msg.format(status_code, response))\n\n    return [t[\"lastStatus\"] for t in response[\"tasks\"]]\n\n\ndef _track_tasks(task_ids, cluster):\n    \"\"\"Poll task status until STOPPED\"\"\"\n    while True:\n        statuses = _get_task_statuses(task_ids, cluster)\n        if all([status == \"STOPPED\" for status in statuses]):\n            logger.info(\"ECS tasks {0} STOPPED\".format(\",\".join(task_ids)))\n            break\n        time.sleep(POLL_TIME)\n        logger.debug(\"ECS task status for tasks {0}: {1}\".format(task_ids, statuses))\n\n\nclass ECSTask(luigi.Task):\n    \"\"\"\n    Base class for an Amazon EC2 Container Service Task\n\n    Amazon ECS requires you to register \"tasks\", which are JSON descriptions\n    for how to issue the ``docker run`` command. 
This Luigi Task can either\n    run a pre-registered ECS taskDefinition, OR register the task on the fly\n    from a Python dict.\n\n    :param task_def_arn: pre-registered task definition ARN (Amazon Resource\n        Name), of the form::\n\n            arn:aws:ecs:<region>:<user_id>:task-definition/<family>:<tag>\n\n    :param task_def: dict describing task in taskDefinition JSON format, for\n        example::\n\n            task_def = {\n                'family': 'hello-world',\n                'volumes': [],\n                'containerDefinitions': [\n                    {\n                        'memory': 1,\n                        'essential': True,\n                        'name': 'hello-world',\n                        'image': 'ubuntu',\n                        'command': ['/bin/echo', 'hello world']\n                    }\n                ]\n            }\n\n    :param cluster: str defining the ECS cluster to use.\n        When this is not defined it will use the default one.\n\n    \"\"\"\n\n    task_def_arn = luigi.OptionalParameter(default=None)\n    task_def = luigi.OptionalParameter(default=None)\n    cluster = luigi.Parameter(default=\"default\")\n\n    @property\n    def ecs_task_ids(self):\n        \"\"\"Expose the ECS task ID\"\"\"\n        if hasattr(self, \"_task_ids\"):\n            return self._task_ids\n\n    @property\n    def command(self):\n        \"\"\"\n        Command passed to the containers\n\n        Override to return list of dicts with keys 'name' and 'command',\n        describing the container names and commands to pass to the container.\n        These values will be specified in the `containerOverrides` property of\n        the `overrides` parameter passed to the runTask API.\n\n        Example::\n\n            [\n                {\n                    'name': 'myContainer',\n                    'command': ['/bin/sleep', '60']\n                }\n            ]\n\n        \"\"\"\n        pass\n\n    @staticmethod\n    def 
update_container_overrides_command(container_overrides, command):\n        \"\"\"\n        Update a list of container overrides with the specified command.\n\n        The specified command will take precedence over any existing commands\n        in `container_overrides` for the same container name. If no existing\n        command yet exists in `container_overrides` for the specified command,\n        it will be added.\n        \"\"\"\n        for colliding_override in filter(lambda x: x[\"name\"] == command[\"name\"], container_overrides):\n            colliding_override[\"command\"] = command[\"command\"]\n            break\n        else:\n            container_overrides.append(command)\n\n    @property\n    def combined_overrides(self):\n        \"\"\"\n        Return single dict combining any provided `overrides` parameters.\n\n        This is used to allow custom `overrides` parameters to be specified in\n        `self.run_task_kwargs` while ensuring that the values specified in\n        `self.command` are honored in `containerOverrides`.\n        \"\"\"\n        overrides = copy.deepcopy(self.run_task_kwargs.get(\"overrides\", {}))\n        if self.command:\n            if \"containerOverrides\" in overrides:\n                for command in self.command:\n                    self.update_container_overrides_command(overrides[\"containerOverrides\"], command)\n            else:\n                overrides[\"containerOverrides\"] = self.command\n        return overrides\n\n    @property\n    def run_task_kwargs(self):\n        \"\"\"\n        Additional keyword arguments to be provided to ECS runTask API.\n\n        Override this property in a subclass to provide additional parameters\n        such as `network_configuration`, `launchType`, etc.\n\n        If the returned `dict` includes an `overrides` value with a nested\n        `containerOverrides` array defining one or more container `command`\n        values, prior to calling `run_task` they will be combined 
with and\n        superseded by any colliding values specified separately in the\n        `command` property.\n\n        Example::\n\n            {\n                'launchType': 'FARGATE',\n                'platformVersion': '1.4.0',\n                'networkConfiguration': {\n                    'awsvpcConfiguration': {\n                        'subnets': [\n                            'subnet-01234567890abcdef',\n                            'subnet-abcdef01234567890'\n                        ],\n                        'securityGroups': [\n                            'sg-abcdef01234567890',\n                        ],\n                        'assignPublicIp': 'ENABLED'\n                    }\n                },\n                'overrides': {\n                    'ephemeralStorage': {\n                        'sizeInGiB': 30\n                    }\n                }\n            }\n        \"\"\"\n        return {}\n\n    def run(self):\n        if (not self.task_def and not self.task_def_arn) or (self.task_def and self.task_def_arn):\n            raise ValueError((\"Either (but not both) a task_def (dict) or task_def_arn (string) must be assigned\"))\n        if not self.task_def_arn:\n            # Register the task and get assigned taskDefinition ID (arn)\n            response = client.register_task_definition(**self.task_def)\n            self.task_def_arn = response[\"taskDefinition\"][\"taskDefinitionArn\"]\n\n        run_task_kwargs = self.run_task_kwargs\n        run_task_kwargs.update(\n            {\n                \"taskDefinition\": self.task_def_arn,\n                \"cluster\": self.cluster,\n                \"overrides\": self.combined_overrides,\n            }\n        )\n\n        # Submit the task to AWS ECS and get assigned task ID\n        # (list containing 1 string)\n        response = client.run_task(**run_task_kwargs)\n\n        if response[\"failures\"]:\n            raise Exception(\", \".join([\"fail to run task {0} reason: 
{1}\".format(failure[\"arn\"], failure[\"reason\"]) for failure in response[\"failures\"]]))\n\n        self._task_ids = [task[\"taskArn\"] for task in response[\"tasks\"]]\n\n        # Wait on task completion\n        _track_tasks(self._task_ids, self.cluster)\n"
  },
  {
    "path": "luigi/contrib/esindex.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nSupport for Elasticsearch (1.0.0 or newer).\n\nProvides an :class:`ElasticsearchTarget` and a :class:`CopyToIndex` template task.\n\nModeled after :class:`luigi.contrib.rdbms.CopyToTable`.\n\nA minimal example (assuming elasticsearch is running on localhost:9200):\n\n.. code-block:: python\n\n    class ExampleIndex(CopyToIndex):\n        index = 'example'\n\n        def docs(self):\n            return [{'_id': 1, 'title': 'An example document.'}]\n\n    if __name__ == '__main__':\n        task = ExampleIndex()\n        luigi.build([task], local_scheduler=True)\n\nAll options:\n\n.. 
code-block:: python\n\n    class ExampleIndex(CopyToIndex):\n        host = 'localhost'\n        port = 9200\n        index = 'example'\n        doc_type = 'default'\n        purge_existing_index = True\n        marker_index_hist_size = 1\n\n        def docs(self):\n            return [{'_id': 1, 'title': 'An example document.'}]\n\n    if __name__ == '__main__':\n        task = ExampleIndex()\n        luigi.build([task], local_scheduler=True)\n\n`Host`, `port`, `index`, `doc_type` parameters are standard elasticsearch.\n\n`purge_existing_index` will delete the index, whenever an update is required.\nThis is useful, when one deals with \"dumps\" that represent the whole data, not just updates.\n\n`marker_index_hist_size` sets the maximum number of entries in the 'marker'\nindex:\n\n* 0 (default) keeps all updates,\n* 1 to only remember the most recent update to the index.\n\nThis can be useful, if an index needs to be recreated, even though\nthe corresponding indexing task has been run sometime in the past - but\na later indexing task might have altered the index in the meantime.\n\nThere are two `luigi.cfg` configuration options:\n\n.. code-block:: ini\n\n    [elasticsearch]\n\n    marker-index = update_log\n    marker-doc-type = entry\n\n\"\"\"\n\n# pylint: disable=F0401,E1101,C0103\nimport abc\nimport datetime\nimport hashlib\nimport itertools\nimport json\nimport logging\n\nimport luigi\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    import elasticsearch\n\n    if elasticsearch.__version__ < (1, 0, 0):\n        logger.warning(\"This module works with elasticsearch 1.0.0 or newer only.\")\n    from elasticsearch.connection import Urllib3HttpConnection\n    from elasticsearch.helpers import bulk\n\nexcept ImportError:\n    logger.warning(\"Loading esindex module without elasticsearch installed. 
Will crash at runtime if esindex functionality is used.\")\n\n\nclass ElasticsearchTarget(luigi.Target):\n    \"\"\"Target for a resource in Elasticsearch.\"\"\"\n\n    marker_index = luigi.configuration.get_config().get(\"elasticsearch\", \"marker-index\", \"update_log\")\n    marker_doc_type = luigi.configuration.get_config().get(\"elasticsearch\", \"marker-doc-type\", \"entry\")\n\n    def __init__(self, host, port, index, doc_type, update_id, marker_index_hist_size=0, http_auth=None, timeout=10, extra_elasticsearch_args=None):\n        \"\"\"\n        :param host: Elasticsearch server host\n        :type host: str\n        :param port: Elasticsearch server port\n        :type port: int\n        :param index: index name\n        :type index: str\n        :param doc_type: doctype name\n        :type doc_type: str\n        :param update_id: an identifier for this data set\n        :type update_id: str\n        :param marker_index_hist_size: list of changes to the index to remember\n        :type marker_index_hist_size: int\n        :param timeout: Elasticsearch connection timeout\n        :type timeout: int\n        :param extra_elasticsearch_args: extra args for Elasticsearch\n        :type extra_elasticsearch_args: dict\n        \"\"\"\n        if extra_elasticsearch_args is None:\n            extra_elasticsearch_args = {}\n\n        self.host = host\n        self.port = port\n        self.http_auth = http_auth\n        self.index = index\n        self.doc_type = doc_type\n        self.update_id = update_id\n        self.marker_index_hist_size = marker_index_hist_size\n        self.timeout = timeout\n        self.extra_elasticsearch_args = extra_elasticsearch_args\n\n        self.es = elasticsearch.Elasticsearch(\n            connection_class=Urllib3HttpConnection,\n            host=self.host,\n            port=self.port,\n            http_auth=self.http_auth,\n            timeout=self.timeout,\n            **self.extra_elasticsearch_args,\n        )\n\n    def 
marker_index_document_id(self):\n        \"\"\"\n        Generate an id for the indicator document.\n        \"\"\"\n        params = \"%s:%s:%s\" % (self.index, self.doc_type, self.update_id)\n        return hashlib.sha1(params.encode(\"utf-8\")).hexdigest()\n\n    def touch(self):\n        \"\"\"\n        Mark this update as complete.\n\n        The document id would be sufficient but,\n        for documentation,\n        we index the parameters `update_id`, `target_index`, `target_doc_type` and `date` as well.\n        \"\"\"\n        self.create_marker_index()\n        self.es.index(\n            index=self.marker_index,\n            doc_type=self.marker_doc_type,\n            id=self.marker_index_document_id(),\n            body={\"update_id\": self.update_id, \"target_index\": self.index, \"target_doc_type\": self.doc_type, \"date\": datetime.datetime.now()},\n        )\n        self.es.indices.flush(index=self.marker_index)\n        self.ensure_hist_size()\n\n    def exists(self):\n        \"\"\"\n        Test, if this task has been run.\n        \"\"\"\n        try:\n            self.es.get(index=self.marker_index, doc_type=self.marker_doc_type, id=self.marker_index_document_id())\n            return True\n        except elasticsearch.NotFoundError:\n            logger.debug(\"Marker document not found.\")\n        except elasticsearch.ElasticsearchException as err:\n            logger.warn(err)\n        return False\n\n    def create_marker_index(self):\n        \"\"\"\n        Create the index that will keep track of the tasks if necessary.\n        \"\"\"\n        if not self.es.indices.exists(index=self.marker_index):\n            self.es.indices.create(index=self.marker_index)\n\n    def ensure_hist_size(self):\n        \"\"\"\n        Shrink the history of updates for\n        a `index/doc_type` combination down to `self.marker_index_hist_size`.\n        \"\"\"\n        if self.marker_index_hist_size == 0:\n            return\n        result = 
self.es.search(\n            index=self.marker_index, doc_type=self.marker_doc_type, body={\"query\": {\"term\": {\"target_index\": self.index}}}, sort=(\"date:desc\",)\n        )\n\n        for i, hit in enumerate(result.get(\"hits\").get(\"hits\"), start=1):\n            if i > self.marker_index_hist_size:\n                marker_document_id = hit.get(\"_id\")\n                self.es.delete(id=marker_document_id, index=self.marker_index, doc_type=self.marker_doc_type)\n        self.es.indices.flush(index=self.marker_index)\n\n\nclass CopyToIndex(luigi.Task):\n    \"\"\"\n    Template task for inserting a data set into Elasticsearch.\n\n    Usage:\n\n    1. Subclass and override the required `index` attribute.\n\n    2. Implement a custom `docs` method, that returns an iterable over the documents.\n       A document can be a JSON string,\n       e.g. from a newline-delimited JSON (ldj) file (default implementation)\n       or some dictionary.\n\n    Optional attributes:\n\n    * doc_type (default),\n    * host (localhost),\n    * port (9200),\n    * settings ({'settings': {}})\n    * mapping (None),\n    * chunk_size (2000),\n    * raise_on_error (True),\n    * purge_existing_index (False),\n    * marker_index_hist_size (0)\n\n    If settings are defined, they are only applied at index creation time.\n    \"\"\"\n\n    @property\n    def host(self):\n        \"\"\"\n        ES hostname.\n        \"\"\"\n        return \"localhost\"\n\n    @property\n    def port(self):\n        \"\"\"\n        ES port.\n        \"\"\"\n        return 9200\n\n    @property\n    def http_auth(self):\n        \"\"\"\n        ES optional http auth information as either ‘:’ separated string or a tuple,\n        e.g. 
`('user', 'pass')` or `\"user:pass\"`.\n        \"\"\"\n        return None\n\n    @property\n    @abc.abstractmethod\n    def index(self):\n        \"\"\"\n        The target index.\n\n        May exist or not.\n        \"\"\"\n        return None\n\n    @property\n    def doc_type(self):\n        \"\"\"\n        The target doc_type.\n        \"\"\"\n        return \"default\"\n\n    @property\n    def mapping(self):\n        \"\"\"\n        Dictionary with custom mapping or `None`.\n        \"\"\"\n        return None\n\n    @property\n    def settings(self):\n        \"\"\"\n        Settings to be used at index creation time.\n        \"\"\"\n        return {\"settings\": {}}\n\n    @property\n    def chunk_size(self):\n        \"\"\"\n        Single API call for this number of docs.\n        \"\"\"\n        return 2000\n\n    @property\n    def raise_on_error(self):\n        \"\"\"\n        Whether to fail fast.\n        \"\"\"\n        return True\n\n    @property\n    def purge_existing_index(self):\n        \"\"\"\n        Whether to delete the `index` completely before any indexing.\n        \"\"\"\n        return False\n\n    @property\n    def marker_index_hist_size(self):\n        \"\"\"\n        Number of event log entries in the marker index. 
0: unlimited.\n        \"\"\"\n        return 0\n\n    @property\n    def timeout(self):\n        \"\"\"\n        Timeout.\n        \"\"\"\n        return 10\n\n    @property\n    def extra_elasticsearch_args(self):\n        \"\"\"\n        Extra arguments to pass to the Elasticsearch constructor\n        \"\"\"\n        return {}\n\n    def docs(self):\n        \"\"\"\n        Return the documents to be indexed.\n\n        Beside the user defined fields, the document may contain an `_index`, `_type` and `_id`.\n        \"\"\"\n        with self.input().open(\"r\") as fobj:\n            for line in fobj:\n                yield line\n\n    # everything below will rarely have to be overridden\n\n    def _docs(self):\n        \"\"\"\n        Since `self.docs` may yield documents that do not explicitly contain `_index` or `_type`,\n        add those attributes here, if necessary.\n        \"\"\"\n        iterdocs = iter(self.docs())\n        first = next(iterdocs)\n        needs_parsing = False\n        if isinstance(first, str):\n            needs_parsing = True\n        elif isinstance(first, dict):\n            pass\n        else:\n            raise RuntimeError(\"Document must be either JSON strings or dict.\")\n        for doc in itertools.chain([first], iterdocs):\n            if needs_parsing:\n                doc = json.loads(doc)\n            if \"_index\" not in doc:\n                doc[\"_index\"] = self.index\n            if \"_type\" not in doc:\n                doc[\"_type\"] = self.doc_type\n            yield doc\n\n    def _init_connection(self):\n        return elasticsearch.Elasticsearch(\n            connection_class=Urllib3HttpConnection,\n            host=self.host,\n            port=self.port,\n            http_auth=self.http_auth,\n            timeout=self.timeout,\n            **self.extra_elasticsearch_args,\n        )\n\n    def create_index(self):\n        \"\"\"\n        Override to provide code for creating the target index.\n\n        By 
default it will be created without any special settings or mappings.\n        \"\"\"\n        es = self._init_connection()\n        if not es.indices.exists(index=self.index):\n            es.indices.create(index=self.index, body=self.settings)\n\n    def delete_index(self):\n        \"\"\"\n        Delete the index, if it exists.\n        \"\"\"\n        es = self._init_connection()\n        if es.indices.exists(index=self.index):\n            es.indices.delete(index=self.index)\n\n    def update_id(self):\n        \"\"\"\n        This id will be a unique identifier for this indexing task.\n        \"\"\"\n        return self.task_id\n\n    def output(self):\n        \"\"\"\n        Returns a ElasticsearchTarget representing the inserted dataset.\n\n        Normally you don't override this.\n        \"\"\"\n        return ElasticsearchTarget(\n            host=self.host,\n            port=self.port,\n            http_auth=self.http_auth,\n            index=self.index,\n            doc_type=self.doc_type,\n            update_id=self.update_id(),\n            marker_index_hist_size=self.marker_index_hist_size,\n            timeout=self.timeout,\n            extra_elasticsearch_args=self.extra_elasticsearch_args,\n        )\n\n    def run(self):\n        \"\"\"\n        Run task, namely:\n\n        * purge existing index, if requested (`purge_existing_index`),\n        * create the index, if missing,\n        * apply mappings, if given,\n        * set refresh interval to -1 (disable) for performance reasons,\n        * bulk index in batches of size `chunk_size` (2000),\n        * set refresh interval to 1s,\n        * refresh Elasticsearch,\n        * create entry in marker index.\n        \"\"\"\n        if self.purge_existing_index:\n            self.delete_index()\n        self.create_index()\n        es = self._init_connection()\n        if self.mapping:\n            es.indices.put_mapping(index=self.index, doc_type=self.doc_type, body=self.mapping)\n        
es.indices.put_settings({\"index\": {\"refresh_interval\": \"-1\"}}, index=self.index)\n\n        bulk(es, self._docs(), chunk_size=self.chunk_size, raise_on_error=self.raise_on_error)\n\n        es.indices.put_settings({\"index\": {\"refresh_interval\": \"1s\"}}, index=self.index)\n        es.indices.refresh()\n        self.output().touch()\n"
  },
  {
    "path": "luigi/contrib/external_daily_snapshot.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2017 Spotify AB.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing,\n# software distributed under the License is distributed on an\n# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n# KIND, either express or implied.  See the License for the\n# specific language governing permissions and limitations\n# under the License.\n#\nfrom __future__ import annotations\n\nimport datetime\nimport logging\nfrom typing import Any\n\nimport luigi\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass ExternalDailySnapshot(luigi.ExternalTask):\n    \"\"\"\n    Abstract class containing a helper method to fetch the latest snapshot.\n\n    Example::\n\n      class MyTask(luigi.Task):\n        def requires(self):\n          return PlaylistContent.latest()\n\n    All tasks subclassing :class:`ExternalDailySnapshot` must have a :class:`luigi.DateParameter`\n    named ``date``.\n\n    You can also provide additional parameters to the class and also configure\n    lookback size.\n\n    Example::\n\n      ServiceLogs.latest(service=\"radio\", lookback=21)\n\n    \"\"\"\n\n    date = luigi.DateParameter()\n    __cache: list[Any] = []\n\n    @classmethod\n    def latest(cls, *args, **kwargs):\n        \"\"\"This is cached so that requires() is deterministic.\"\"\"\n        date = kwargs.pop(\"date\", datetime.date.today())\n        lookback = kwargs.pop(\"lookback\", 14)\n        # hashing kwargs deterministically would be hard. 
Let's just lookup by equality\n        key = (cls, args, kwargs, lookback, date)\n        for k, v in ExternalDailySnapshot.__cache:\n            if k == key:\n                return v\n        val = cls.__latest(date, lookback, args, kwargs)\n        ExternalDailySnapshot.__cache.append((key, val))\n        return val\n\n    @classmethod\n    def __latest(cls, date, lookback, args, kwargs):\n        assert lookback > 0\n        t = None\n        for i in range(lookback):\n            d = date - datetime.timedelta(i)\n            t = cls(date=d, *args, **kwargs)\n            if t.complete():\n                return t\n        logger.debug(\"Could not find last dump for %s (looked back %d days)\", cls.__name__, lookback)\n        return t\n"
  },
  {
    "path": "luigi/contrib/external_program.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2016 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nTemplate tasks for running external programs as luigi tasks.\n\nThis module is primarily intended for when you need to call a single external\nprogram or shell script, and it's enough to specify program arguments and\nenvironment variables.\n\nIf you need to run multiple commands, chain them together or pipe output\nfrom one command to the next, you're probably better off using something like\n`plumbum`_, and wrapping plumbum commands in normal luigi\n:py:class:`~luigi.task.Task` s.\n\n.. _plumbum: https://plumbum.readthedocs.io/\n\"\"\"\n\nimport logging\nimport os\nimport re\nimport signal\nimport subprocess\nimport sys\nimport tempfile\nfrom contextlib import contextmanager\nfrom multiprocessing import Process\nfrom time import sleep\n\nimport luigi\nfrom luigi.parameter import ParameterVisibility\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass ExternalProgramTask(luigi.Task):\n    \"\"\"\n    Template task for running an external program in a subprocess\n\n    The program is run using :py:class:`subprocess.Popen`, with ``args`` passed\n    as a list, generated by :py:meth:`program_args` (where the first element should\n    be the executable). 
See :py:class:`subprocess.Popen` for details.\n\n    You must override :py:meth:`program_args` to specify the arguments you want,\n    and you can optionally override :py:meth:`program_environment` if you want to\n    control the environment variables (see :py:class:`ExternalPythonProgramTask`\n    for an example).\n\n    By default, the output (stdout and stderr) of the run external program\n    is being captured and displayed after the execution has ended. This\n    behaviour can be overridden by passing ``--capture-output False``\n    \"\"\"\n\n    capture_output = luigi.BoolParameter(default=True, significant=False, positional=False)\n\n    stream_for_searching_tracking_url = luigi.parameter.ChoiceParameter(\n        var_type=str,\n        choices=[\"none\", \"stdout\", \"stderr\"],\n        default=\"none\",\n        significant=False,\n        positional=False,\n        visibility=ParameterVisibility.HIDDEN,\n        description=\"Stream for searching tracking URL\",\n    )\n    \"\"\"\n    Used for defining which stream should be tracked for URL, may be set to 'stdout', 'stderr' or 'none'.\n\n    Default value is 'none', so URL tracking is not performed.\n    \"\"\"\n\n    tracking_url_pattern = luigi.OptionalParameter(\n        default=None,\n        significant=False,\n        positional=False,\n        visibility=ParameterVisibility.HIDDEN,\n        description=\"Regex pattern used for searching URL in the logs of the external program\",\n    )\n    \"\"\"\n    Regex pattern used for searching URL in the logs of the external program.\n\n    If a log line matches the regex, the first group in the matching is set as the tracking URL\n    for the job in the web UI. 
Example: 'Job UI is here: (https?://.*)'.\n\n    Default value is None, so URL tracking is not performed.\n    \"\"\"\n\n    def program_args(self):\n        \"\"\"\n        Override this method to map your task parameters to the program arguments\n\n        :return: list to pass as ``args`` to :py:class:`subprocess.Popen`\n        \"\"\"\n        raise NotImplementedError\n\n    def program_environment(self):\n        \"\"\"\n        Override this method to control environment variables for the program\n\n        :return: dict mapping environment variable names to values\n        \"\"\"\n        env = os.environ.copy()\n        return env\n\n    @property\n    def always_log_stderr(self):\n        \"\"\"\n        When True, stderr will be logged even if program execution succeeded\n\n        Override to False to log stderr only when program execution fails.\n        \"\"\"\n        return True\n\n    def _clean_output_file(self, file_object):\n        file_object.seek(0)\n        return \"\".join(map(lambda s: s.decode(\"utf-8\"), file_object.readlines()))\n\n    def build_tracking_url(self, logs_output):\n        \"\"\"\n        This method is intended for transforming pattern match in logs to an URL\n        :param logs_output: Found match of `self.tracking_url_pattern`\n        :return: a tracking URL for the task\n        \"\"\"\n        return logs_output\n\n    def run(self):\n        args = list(map(str, self.program_args()))\n\n        logger.info(\"Running command: %s\", \" \".join(args))\n        env = self.program_environment()\n        kwargs = {\"env\": env}\n        tmp_stdout, tmp_stderr = None, None\n        if self.capture_output:\n            tmp_stdout, tmp_stderr = tempfile.TemporaryFile(), tempfile.TemporaryFile()\n            kwargs.update({\"stdout\": tmp_stdout, \"stderr\": tmp_stderr})\n\n        try:\n            if self.stream_for_searching_tracking_url != \"none\" and self.tracking_url_pattern is not None:\n                with 
self._proc_with_tracking_url_context(proc_args=args, proc_kwargs=kwargs) as proc:\n                    proc.wait()\n            else:\n                proc = subprocess.Popen(args, **kwargs)\n                with ExternalProgramRunContext(proc):\n                    proc.wait()\n            success = proc.returncode == 0\n\n            if self.capture_output:\n                stdout = self._clean_output_file(tmp_stdout)\n                stderr = self._clean_output_file(tmp_stderr)\n\n                if stdout:\n                    logger.info(\"Program stdout:\\n{}\".format(stdout))\n                if stderr:\n                    if self.always_log_stderr or not success:\n                        logger.info(\"Program stderr:\\n{}\".format(stderr))\n            else:\n                stdout, stderr = None, None\n\n            if not success:\n                raise ExternalProgramRunError(\"Program failed with return code={}:\".format(proc.returncode), args, env=env, stdout=stdout, stderr=stderr)\n        finally:\n            if self.capture_output:\n                tmp_stderr.close()\n                tmp_stdout.close()\n\n    @contextmanager\n    def _proc_with_tracking_url_context(self, proc_args, proc_kwargs):\n        time_to_sleep = 0.5\n        file_to_write = proc_kwargs.get(self.stream_for_searching_tracking_url)\n        proc_kwargs.update({self.stream_for_searching_tracking_url: subprocess.PIPE})\n        main_proc = subprocess.Popen(proc_args, **proc_kwargs)\n        pipe_to_read = main_proc.stderr if self.stream_for_searching_tracking_url == \"stderr\" else main_proc.stdout\n\n        def _track_url_by_pattern():\n            \"\"\"\n            Scans the pipe looking for a passed pattern, if the pattern is found, `set_tracking_url` callback is sent.\n            If tmp_stdout is passed, also appends lines to this file.\n            \"\"\"\n            pattern = re.compile(self.tracking_url_pattern)\n            for new_line in 
iter(pipe_to_read.readline, \"\"):\n                if new_line:\n                    if file_to_write:\n                        file_to_write.write(new_line)\n                    match = re.search(pattern, new_line.decode(\"utf-8\"))\n                    if match:\n                        self.set_tracking_url(self.build_tracking_url(match.group(1)))\n                else:\n                    file_to_write.flush()\n                    sleep(time_to_sleep)\n\n        track_proc = Process(target=_track_url_by_pattern)\n        try:\n            track_proc.start()\n            with ExternalProgramRunContext(main_proc):\n                yield main_proc\n        finally:\n            # need to wait a bit to let the subprocess read the last lines\n            track_proc.join(time_to_sleep * 2)\n            if track_proc.is_alive():\n                track_proc.terminate()\n            pipe_to_read.close()\n\n\nclass ExternalProgramRunContext:\n    def __init__(self, proc):\n        self.proc = proc\n\n    def __enter__(self):\n        self.__old_signal = signal.getsignal(signal.SIGTERM)\n        signal.signal(signal.SIGTERM, self.kill_job)\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        if exc_type is KeyboardInterrupt:\n            self.kill_job()\n        signal.signal(signal.SIGTERM, self.__old_signal)\n\n    def kill_job(self, captured_signal=None, stack_frame=None):\n        self.proc.kill()\n        if captured_signal is not None:\n            # adding 128 gives the exit code corresponding to a signal\n            sys.exit(128 + captured_signal)\n\n\nclass ExternalProgramRunError(RuntimeError):\n    def __init__(self, message, args, env=None, stdout=None, stderr=None):\n        super(ExternalProgramRunError, self).__init__(message, args, env, stdout, stderr)\n        self.message = message\n        self.args = args\n        self.env = env\n        self.out = stdout\n        self.err = stderr\n\n    def __str__(self):\n        
info = self.message\n        info += \"\\nCOMMAND: {}\".format(\" \".join(self.args))\n        info += \"\\nSTDOUT: {}\".format(self.out or \"[empty]\")\n        info += \"\\nSTDERR: {}\".format(self.err or \"[empty]\")\n        env_string = None\n        if self.env:\n            env_string = \" \".join([\"=\".join([k, \"'{}'\".format(v)]) for k, v in self.env.items()])\n        info += \"\\nENVIRONMENT: {}\".format(env_string or \"[empty]\")\n        # reset terminal color in case the ENVIRONMENT changes colors\n        info += \"\\033[m\"\n        return info\n\n\nclass ExternalPythonProgramTask(ExternalProgramTask):\n    \"\"\"\n    Template task for running an external Python program in a subprocess\n\n    Simple extension of :py:class:`ExternalProgramTask`, adding two\n    :py:class:`luigi.parameter.Parameter` s for setting a virtualenv and for\n    extending the ``PYTHONPATH``.\n    \"\"\"\n\n    virtualenv = luigi.OptionalParameter(\n        default=None,\n        positional=False,\n        description=\"path to the virtualenv directory to use. 
It should point to \"\n        \"the directory containing the ``bin/activate`` file used for \"\n        \"enabling the virtualenv.\",\n    )\n    extra_pythonpath = luigi.OptionalParameter(\n        default=None, positional=False, description=\"extend the search path for modules by prepending this value to the ``PYTHONPATH`` environment variable.\"\n    )\n\n    def program_environment(self):\n        env = super(ExternalPythonProgramTask, self).program_environment()\n\n        if self.extra_pythonpath:\n            pythonpath = \":\".join([self.extra_pythonpath, env.get(\"PYTHONPATH\", \"\")])\n            env.update({\"PYTHONPATH\": pythonpath})\n\n        if self.virtualenv:\n            # Make the same changes to the env that a normal venv/bin/activate script would\n            path = \":\".join([\"{}/bin\".format(self.virtualenv), env.get(\"PATH\", \"\")])\n            env.update({\"PATH\": path, \"VIRTUAL_ENV\": self.virtualenv})\n            # remove PYTHONHOME env variable, if it exists\n            env.pop(\"PYTHONHOME\", None)\n\n        return env\n"
  },
  {
    "path": "luigi/contrib/ftp.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nThis library is a wrapper of ftplib or pysftp.\nIt is convenient to move data from/to (S)FTP servers.\n\nThere is an example on how to use it (example/ftp_experiment_outputs.py)\n\nYou can also find unittest for each class.\n\nBe aware that normal ftp does not provide secure communication.\n\"\"\"\n\nimport datetime\nimport ftplib\nimport io\nimport logging\nimport os\nimport random\nimport tempfile\n\nimport luigi\nimport luigi.format\nimport luigi.local_target\nimport luigi.target\nfrom luigi.format import FileWrapper\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass RemoteFileSystem(luigi.target.FileSystem):\n    def __init__(self, host, username=None, password=None, port=None, tls=False, timeout=60, sftp=False, pysftp_conn_kwargs=None):\n        self.host = host\n        self.username = username\n        self.password = password\n        self.tls = tls\n        self.timeout = timeout\n        self.sftp = sftp\n        self.pysftp_conn_kwargs = pysftp_conn_kwargs or {}\n\n        if port is None:\n            if self.sftp:\n                self.port = 22\n            else:\n                self.port = 21\n        else:\n            self.port = port\n\n    def _connect(self):\n        \"\"\"\n        Log in to ftp.\n        \"\"\"\n        if self.sftp:\n            self._sftp_connect()\n        else:\n            
self._ftp_connect()\n\n    def _sftp_connect(self):\n        try:\n            import pysftp\n        except ImportError:\n            logger.warning(\"Please install pysftp to use SFTP.\")\n\n        self.conn = pysftp.Connection(self.host, username=self.username, password=self.password, port=self.port, **self.pysftp_conn_kwargs)\n\n    def _ftp_connect(self):\n        if self.tls:\n            self.conn = ftplib.FTP_TLS()\n        else:\n            self.conn = ftplib.FTP()\n        self.conn.connect(self.host, self.port, timeout=self.timeout)\n        self.conn.login(self.username, self.password)\n        if self.tls:\n            self.conn.prot_p()\n\n    def _close(self):\n        \"\"\"\n        Close ftp connection.\n        \"\"\"\n        if self.sftp:\n            self._sftp_close()\n        else:\n            self._ftp_close()\n\n    def _sftp_close(self):\n        self.conn.close()\n\n    def _ftp_close(self):\n        self.conn.quit()\n\n    def exists(self, path, mtime=None):\n        \"\"\"\n        Return `True` if file or directory at `path` exist, False otherwise.\n\n        Additional check on modified time when mtime is passed in.\n\n        Return False if the file's modified time is older mtime.\n        \"\"\"\n        self._connect()\n\n        if self.sftp:\n            exists = self._sftp_exists(path, mtime)\n        else:\n            exists = self._ftp_exists(path, mtime)\n\n        self._close()\n\n        return exists\n\n    def _sftp_exists(self, path, mtime):\n        exists = False\n        if mtime:\n            exists = self.conn.stat(path).st_mtime > mtime\n        elif self.conn.exists(path):\n            exists = True\n        return exists\n\n    def _ftp_exists(self, path, mtime):\n        dirname, fn = os.path.split(path)\n\n        files = self.conn.nlst(dirname)\n\n        exists = False\n        if path in files or fn in files:\n            if mtime:\n                mdtm = self.conn.sendcmd(\"MDTM \" + path)\n           
     modified = datetime.datetime.strptime(mdtm[4:], \"%Y%m%d%H%M%S\")\n                exists = modified > mtime\n            else:\n                exists = True\n        return exists\n\n    def remove(self, path, recursive=True):\n        \"\"\"\n        Remove file or directory at location ``path``.\n\n        :param path: a path within the FileSystem to remove.\n        :type path: str\n        :param recursive: if the path is a directory, recursively remove the directory and\n                          all of its descendants. Defaults to ``True``.\n        :type recursive: bool\n        \"\"\"\n        self._connect()\n\n        if self.sftp:\n            self._sftp_remove(path, recursive)\n        else:\n            self._ftp_remove(path, recursive)\n\n        self._close()\n\n    def _sftp_remove(self, path, recursive):\n        if self.conn.isfile(path):\n            self.conn.unlink(path)\n        else:\n            if not recursive:\n                raise RuntimeError(\"Path is not a regular file, and recursive option is not set\")\n            directories = []\n            # walk the tree, and execute call backs when files,\n            # directories and unknown types are encountered\n            # files must be removed first.  
then directories can be removed\n            # after the files are gone.\n            self.conn.walktree(path, self.conn.unlink, directories.append, self.conn.unlink)\n            for directory in reversed(directories):\n                self.conn.rmdir(directory)\n            self.conn.rmdir(path)\n\n    def _ftp_remove(self, path, recursive):\n        if recursive:\n            self._rm_recursive(self.conn, path)\n        else:\n            try:\n                # try delete file\n                self.conn.delete(path)\n            except ftplib.all_errors:\n                # it is a folder, delete it\n                self.conn.rmd(path)\n\n    def _rm_recursive(self, ftp, path):\n        \"\"\"\n        Recursively delete a directory tree on a remote server.\n\n        Source: https://gist.github.com/artlogic/2632647\n        \"\"\"\n        wd = ftp.pwd()\n\n        # check if it is a file first, because some FTP servers don't return\n        # correctly on ftp.nlst(file)\n        try:\n            ftp.cwd(path)\n        except ftplib.all_errors:\n            # this is a file, we will just delete the file\n            ftp.delete(path)\n            return\n\n        try:\n            names = ftp.nlst()\n        except ftplib.all_errors:\n            # some FTP servers complain when you try and list non-existent paths\n            return\n\n        for name in names:\n            if os.path.split(name)[1] in (\".\", \"..\"):\n                continue\n\n            try:\n                ftp.cwd(name)  # if we can cwd to it, it's a folder\n                ftp.cwd(wd)  # don't try a nuke a folder we're in\n                ftp.cwd(path)  # then go back to where we were\n                self._rm_recursive(ftp, name)\n            except ftplib.all_errors:\n                ftp.delete(name)\n\n        try:\n            ftp.cwd(wd)  # do not delete the folder that we are in\n            ftp.rmd(path)\n        except ftplib.all_errors as e:\n            
print(\"_rm_recursive: Could not remove {0}: {1}\".format(path, e))\n\n    def put(self, local_path, path, atomic=True):\n        \"\"\"\n        Put file from local filesystem to (s)FTP.\n        \"\"\"\n        self._connect()\n\n        if self.sftp:\n            self._sftp_put(local_path, path, atomic)\n        else:\n            self._ftp_put(local_path, path, atomic)\n\n        self._close()\n\n    def _sftp_put(self, local_path, path, atomic):\n        normpath = os.path.normpath(path)\n        directory = os.path.dirname(normpath)\n        self.conn.makedirs(directory)\n\n        if atomic:\n            tmp_path = os.path.join(directory, \"luigi-tmp-{:09d}\".format(random.randrange(0, 10_000_000_000)))\n        else:\n            tmp_path = normpath\n\n        self.conn.put(local_path, tmp_path)\n\n        if atomic:\n            self.conn.rename(tmp_path, normpath)\n\n    def _ftp_put(self, local_path, path, atomic):\n        normpath = os.path.normpath(path)\n        folder = os.path.dirname(normpath)\n\n        # create paths if do not exists\n        for subfolder in folder.split(os.sep):\n            if subfolder and subfolder not in self.conn.nlst():\n                self.conn.mkd(subfolder)\n\n            self.conn.cwd(subfolder)\n\n        # go back to ftp root folder\n        self.conn.cwd(\"/\")\n\n        # random file name\n        if atomic:\n            tmp_path = folder + os.sep + \"luigi-tmp-%09d\" % random.randrange(0, 10_000_000_000)\n        else:\n            tmp_path = normpath\n\n        self.conn.storbinary(\"STOR %s\" % tmp_path, open(local_path, \"rb\"))\n\n        if atomic:\n            self.conn.rename(tmp_path, normpath)\n\n    def get(self, path, local_path):\n        \"\"\"\n        Download file from (s)FTP to local filesystem.\n        \"\"\"\n        normpath = os.path.normpath(local_path)\n        folder = os.path.dirname(normpath)\n        if folder and not os.path.exists(folder):\n            os.makedirs(folder)\n\n      
  tmp_local_path = local_path + \"-luigi-tmp-%09d\" % random.randrange(0, 10_000_000_000)\n\n        # download file\n        self._connect()\n\n        if self.sftp:\n            self._sftp_get(path, tmp_local_path)\n        else:\n            self._ftp_get(path, tmp_local_path)\n\n        self._close()\n\n        os.replace(tmp_local_path, local_path)\n\n    def _sftp_get(self, path, tmp_local_path):\n        self.conn.get(path, tmp_local_path)\n\n    def _ftp_get(self, path, tmp_local_path):\n        self.conn.retrbinary(\"RETR %s\" % path, open(tmp_local_path, \"wb\").write)\n\n    def listdir(self, path=\".\"):\n        \"\"\"\n        Gets an list of the contents of path in (s)FTP\n        \"\"\"\n        self._connect()\n\n        if self.sftp:\n            contents = self._sftp_listdir(path)\n        else:\n            contents = self._ftp_listdir(path)\n\n        self._close()\n\n        return contents\n\n    def _sftp_listdir(self, path):\n        return self.conn.listdir(remotepath=path)\n\n    def _ftp_listdir(self, path):\n        return self.conn.nlst(path)\n\n\nclass AtomicFtpFile(luigi.target.AtomicLocalFile):\n    \"\"\"\n    Simple class that writes to a temp file and upload to ftp on close().\n\n    Also cleans up the temp file if close is not invoked.\n    \"\"\"\n\n    def __init__(self, fs, path):\n        \"\"\"\n        Initializes an AtomicFtpfile instance.\n        :param fs:\n        :param path:\n        :type path: str\n        \"\"\"\n        self._fs = fs\n        super(AtomicFtpFile, self).__init__(path)\n\n    def move_to_final_destination(self):\n        self._fs.put(self.tmp_path, self.path)\n\n    @property\n    def fs(self):\n        return self._fs\n\n\nclass RemoteTarget(luigi.target.FileSystemTarget):\n    \"\"\"\n    Target used for reading from remote files.\n\n    The target is implemented using intermediate files on the local system.\n    On Python2, these files may not be cleaned up.\n    \"\"\"\n\n    def __init__(\n   
     self, path, host, format=None, username=None, password=None, port=None, mtime=None, tls=False, timeout=60, sftp=False, pysftp_conn_kwargs=None\n    ):\n        if format is None:\n            format = luigi.format.get_default_format()\n\n        self.path = path\n        self.mtime = mtime\n        self.format = format\n        self.tls = tls\n        self.timeout = timeout\n        self.sftp = sftp\n        self._fs = RemoteFileSystem(host, username, password, port, tls, timeout, sftp, pysftp_conn_kwargs)\n\n    @property\n    def fs(self):\n        return self._fs\n\n    def open(self, mode):\n        \"\"\"\n        Open the FileSystem target.\n\n        This method returns a file-like object which can either be read from or written to depending\n        on the specified mode.\n\n        :param mode: the mode `r` opens the FileSystemTarget in read-only mode, whereas `w` will\n                     open the FileSystemTarget in write mode. Subclasses can implement\n                     additional options.\n        :type mode: str\n        \"\"\"\n        if mode == \"w\":\n            return self.format.pipe_writer(AtomicFtpFile(self._fs, self.path))\n\n        elif mode == \"r\":\n            temppath = \"{}-luigi-tmp-{:09d}\".format(self.path.lstrip(\"/\"), random.randrange(0, 10_000_000_000))\n            try:\n                # store reference to the TemporaryDirectory because it will be removed on GC\n                self.__temp_dir = tempfile.TemporaryDirectory(prefix=\"luigi-contrib-ftp_\")\n            except AttributeError:\n                # TemporaryDirectory only available in Python3, use old behaviour in Python2\n                # this file will not be cleaned up automatically\n                self.__tmp_path = os.path.join(tempfile.gettempdir(), \"luigi-contrib-ftp\", temppath)\n            else:\n                self.__tmp_path = os.path.join(self.__temp_dir.name, temppath)\n\n            # download file to local\n            
self._fs.get(self.path, self.__tmp_path)\n\n            return self.format.pipe_reader(FileWrapper(io.BufferedReader(io.FileIO(self.__tmp_path, \"r\"))))\n        else:\n            raise Exception(\"mode must be 'r' or 'w' (got: %s)\" % mode)\n\n    def exists(self):\n        return self.fs.exists(self.path, self.mtime)\n\n    def put(self, local_path, atomic=True):\n        self.fs.put(local_path, self.path, atomic)\n\n    def get(self, local_path):\n        self.fs.get(self.path, local_path)\n"
  },
  {
    "path": "luigi/contrib/gcp.py",
    "content": "\"\"\"\nCommon code for GCP (google cloud services) integration\n\"\"\"\n\nimport logging\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    import google.auth\n    import httplib2\nexcept ImportError:\n    logger.warning(\n        \"Loading GCP module without the python packages httplib2, google-auth. \\\n        This *could* crash at runtime if no other credentials are provided.\"\n    )\n\n\ndef get_authenticate_kwargs(oauth_credentials=None, http_=None):\n    \"\"\"Returns a dictionary with keyword arguments for use with discovery\n\n    Prioritizes oauth_credentials or a http client provided by the user\n    If none provided, falls back to default credentials provided by google's command line\n    utilities. If that also fails, tries using httplib2.Http()\n\n    Used by `gcs.GCSClient` and `bigquery.BigQueryClient` to initiate the API Client\n    \"\"\"\n    if oauth_credentials:\n        authenticate_kwargs = {\"credentials\": oauth_credentials}\n    elif http_:\n        authenticate_kwargs = {\"http\": http_}\n    else:\n        # neither http_ or credentials provided\n        try:\n            # try default credentials\n            credentials, _ = google.auth.default()\n            authenticate_kwargs = {\"credentials\": credentials}\n        except google.auth.exceptions.DefaultCredentialsError:\n            # try http using httplib2\n            authenticate_kwargs = {\"http\": httplib2.Http()}\n\n    return authenticate_kwargs\n"
  },
  {
    "path": "luigi/contrib/gcs.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015 Twitter Inc\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"luigi bindings for Google Cloud Storage\"\"\"\n\nimport io\nimport logging\nimport mimetypes\nimport os\nimport tempfile\nimport time\nfrom io import BytesIO\nfrom urllib.parse import urlsplit\n\nfrom tenacity import after_log, retry, retry_if_exception, retry_if_exception_type, stop_after_attempt, wait_exponential\n\nimport luigi.target\nfrom luigi.contrib import gcp\nfrom luigi.format import FileWrapper\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n# Retry when following errors happened\nRETRYABLE_ERRORS = None\n\ntry:\n    import httplib2\n    from googleapiclient import discovery, errors, http\nexcept ImportError:\n    logger.warning(\n        \"Loading GCS module without the python packages googleapiclient & google-auth. 
\\\n        This will crash at runtime if GCS functionality is used.\"\n    )\nelse:\n    RETRYABLE_ERRORS = (httplib2.HttpLib2Error, IOError)\n\n# Number of bytes to send/receive in each request.\nCHUNKSIZE = 10 * 1024 * 1024\n\n# Mimetype to use if one can't be guessed from the file extension.\nDEFAULT_MIMETYPE = \"application/octet-stream\"\n\n# Time to sleep while waiting for eventual consistency to finish.\nEVENTUAL_CONSISTENCY_SLEEP_INTERVAL = 0.1\n\n# Maximum number of sleeps for eventual consistency.\nEVENTUAL_CONSISTENCY_MAX_SLEEPS = 300\n\n# Uri for batch requests\nGCS_BATCH_URI = \"https://storage.googleapis.com/batch/storage/v1\"\n\n\n# Retry configurations. For more details, see https://tenacity.readthedocs.io/en/latest/\ndef is_error_5xx(err):\n    return isinstance(err, errors.HttpError) and err.resp.status >= 500\n\n\ngcs_retry = retry(\n    retry=(retry_if_exception(is_error_5xx) | retry_if_exception_type(RETRYABLE_ERRORS)),\n    wait=wait_exponential(multiplier=1, min=1, max=10),\n    stop=stop_after_attempt(5),\n    reraise=True,\n    after=after_log(logger, logging.WARNING),\n)\n\n\ndef _wait_for_consistency(checker):\n    \"\"\"Eventual consistency: wait until GCS reports something is true.\n\n    This is necessary for e.g. create/delete where the operation might return,\n    but won't be reflected for a bit.\n    \"\"\"\n    for _ in range(EVENTUAL_CONSISTENCY_MAX_SLEEPS):\n        if checker():\n            return\n\n        time.sleep(EVENTUAL_CONSISTENCY_SLEEP_INTERVAL)\n\n    logger.warning(\"Exceeded wait for eventual GCS consistency - this may be abug in the library or something is terribly wrong.\")\n\n\nclass InvalidDeleteException(luigi.target.FileSystemException):\n    pass\n\n\nclass GCSClient(luigi.target.FileSystem):\n    \"\"\"An implementation of a FileSystem over Google Cloud Storage.\n\n       There are several ways to use this class. 
By default it will use the app\n       default credentials, as described at https://developers.google.com/identity/protocols/application-default-credentials .\n       Alternatively, you may pass an google-auth credentials object. e.g. to use a service account::\n\n         credentials = google.auth.jwt.Credentials.from_service_account_info(\n             '012345678912-ThisIsARandomServiceAccountEmail@developer.gserviceaccount.com',\n             'These are the contents of the p12 file that came with the service account',\n             scope='https://www.googleapis.com/auth/devstorage.read_write')\n         client = GCSClient(oauth_credentials=credentails)\n\n        The chunksize parameter specifies how much data to transfer when downloading\n        or uploading files.\n\n    .. warning::\n      By default this class will use \"automated service discovery\" which will require\n      a connection to the web. The google api client downloads a JSON file to \"create\" the\n      library interface on the fly. 
If you want a more hermetic build, you can pass the\n      contents of this file (currently found at https://www.googleapis.com/discovery/v1/apis/storage/v1/rest )\n      as the ``descriptor`` argument.\n    \"\"\"\n\n    def __init__(self, oauth_credentials=None, descriptor=\"\", http_=None, chunksize=CHUNKSIZE, **discovery_build_kwargs):\n        self.chunksize = chunksize\n        authenticate_kwargs = gcp.get_authenticate_kwargs(oauth_credentials, http_)\n\n        build_kwargs = authenticate_kwargs.copy()\n        build_kwargs.update(discovery_build_kwargs)\n\n        if descriptor:\n            self.client = discovery.build_from_document(descriptor, **build_kwargs)\n        else:\n            build_kwargs.setdefault(\"cache_discovery\", False)\n            self.client = discovery.build(\"storage\", \"v1\", **build_kwargs)\n\n    def _path_to_bucket_and_key(self, path):\n        (scheme, netloc, path, _, _) = urlsplit(path)\n        assert scheme == \"gs\"\n        path_without_initial_slash = path[1:]\n        return netloc, path_without_initial_slash\n\n    def _is_root(self, key):\n        return len(key) == 0 or key == \"/\"\n\n    def _add_path_delimiter(self, key):\n        return key if key[-1:] == \"/\" else key + \"/\"\n\n    @gcs_retry\n    def _obj_exists(self, bucket, obj):\n        try:\n            self.client.objects().get(bucket=bucket, object=obj).execute()\n        except errors.HttpError as ex:\n            if ex.resp[\"status\"] == \"404\":\n                return False\n            raise\n        else:\n            return True\n\n    def _list_iter(self, bucket, prefix):\n        request = self.client.objects().list(bucket=bucket, prefix=prefix)\n        response = request.execute()\n\n        while response is not None:\n            for it in response.get(\"items\", []):\n                yield it\n\n            request = self.client.objects().list_next(request, response)\n            if request is None:\n                break\n\n          
  response = request.execute()\n\n    @gcs_retry\n    def _do_put(self, media, dest_path):\n        bucket, obj = self._path_to_bucket_and_key(dest_path)\n\n        request = self.client.objects().insert(bucket=bucket, name=obj, media_body=media)\n        if not media.resumable():\n            return request.execute()\n\n        response = None\n        while response is None:\n            status, response = request.next_chunk()\n            if status:\n                logger.debug(\"Upload progress: %.2f%%\", 100 * status.progress())\n\n        _wait_for_consistency(lambda: self._obj_exists(bucket, obj))\n        return response\n\n    def exists(self, path):\n        bucket, obj = self._path_to_bucket_and_key(path)\n        if self._obj_exists(bucket, obj):\n            return True\n\n        return self.isdir(path)\n\n    def isdir(self, path):\n        bucket, obj = self._path_to_bucket_and_key(path)\n        if self._is_root(obj):\n            try:\n                self.client.buckets().get(bucket=bucket).execute()\n            except errors.HttpError as ex:\n                if ex.resp[\"status\"] == \"404\":\n                    return False\n                raise\n\n        obj = self._add_path_delimiter(obj)\n        if self._obj_exists(bucket, obj):\n            return True\n\n        # Any objects with this prefix\n        resp = self.client.objects().list(bucket=bucket, prefix=obj, maxResults=20).execute()\n        lst = next(iter(resp.get(\"items\", [])), None)\n        return bool(lst)\n\n    def remove(self, path, recursive=True):\n        (bucket, obj) = self._path_to_bucket_and_key(path)\n\n        if self._is_root(obj):\n            raise InvalidDeleteException(\"Cannot delete root of bucket at path {}\".format(path))\n\n        if self._obj_exists(bucket, obj):\n            self.client.objects().delete(bucket=bucket, object=obj).execute()\n            _wait_for_consistency(lambda: not self._obj_exists(bucket, obj))\n            return True\n\n     
   if self.isdir(path):\n            if not recursive:\n                raise InvalidDeleteException(\"Path {} is a directory. Must use recursive delete\".format(path))\n\n            req = http.BatchHttpRequest(batch_uri=GCS_BATCH_URI)\n            for it in self._list_iter(bucket, self._add_path_delimiter(obj)):\n                req.add(self.client.objects().delete(bucket=bucket, object=it[\"name\"]))\n            req.execute()\n\n            _wait_for_consistency(lambda: not self.isdir(path))\n            return True\n\n        return False\n\n    def put(self, filename, dest_path, mimetype=None, chunksize=None):\n        chunksize = chunksize or self.chunksize\n        resumable = os.path.getsize(filename) > 0\n\n        mimetype = mimetype or mimetypes.guess_type(dest_path)[0] or DEFAULT_MIMETYPE\n        media = http.MediaFileUpload(filename, mimetype=mimetype, chunksize=chunksize, resumable=resumable)\n\n        self._do_put(media, dest_path)\n\n    def _forward_args_to_put(self, kwargs):\n        return self.put(**kwargs)\n\n    def put_multiple(self, filepaths, remote_directory, mimetype=None, chunksize=None, num_process=1):\n        if isinstance(filepaths, str):\n            raise ValueError(\"filenames must be a list of strings. 
If you want to put a single file, use the `put(self, filename, ...)` method\")\n\n        put_kwargs_list = [\n            {\n                \"filename\": filepath,\n                \"dest_path\": os.path.join(remote_directory, os.path.basename(filepath)),\n                \"mimetype\": mimetype,\n                \"chunksize\": chunksize,\n            }\n            for filepath in filepaths\n        ]\n\n        if num_process > 1:\n            from contextlib import closing\n            from multiprocessing import Pool\n\n            with closing(Pool(num_process)) as p:\n                return p.map(self._forward_args_to_put, put_kwargs_list)\n        else:\n            for put_kwargs in put_kwargs_list:\n                self._forward_args_to_put(put_kwargs)\n\n    def put_string(self, contents, dest_path, mimetype=None):\n        mimetype = mimetype or mimetypes.guess_type(dest_path)[0] or DEFAULT_MIMETYPE\n        assert isinstance(mimetype, str)\n        if not isinstance(contents, bytes):\n            contents = contents.encode(\"utf-8\")\n        media = http.MediaIoBaseUpload(BytesIO(contents), mimetype, resumable=bool(contents))\n        self._do_put(media, dest_path)\n\n    def mkdir(self, path, parents=True, raise_if_exists=False):\n        if self.exists(path):\n            if raise_if_exists:\n                raise luigi.target.FileAlreadyExists()\n            elif not self.isdir(path):\n                raise luigi.target.NotADirectory()\n            else:\n                return\n\n        self.put_string(b\"\", self._add_path_delimiter(path), mimetype=\"text/plain\")\n\n    def copy(self, source_path, destination_path):\n        src_bucket, src_obj = self._path_to_bucket_and_key(source_path)\n        dest_bucket, dest_obj = self._path_to_bucket_and_key(destination_path)\n\n        if self.isdir(source_path):\n            src_prefix = self._add_path_delimiter(src_obj)\n            dest_prefix = self._add_path_delimiter(dest_obj)\n\n            
source_path = self._add_path_delimiter(source_path)\n            copied_objs = []\n            for obj in self.listdir(source_path):\n                suffix = obj[len(source_path) :]\n\n                self.client.objects().copy(\n                    sourceBucket=src_bucket, sourceObject=src_prefix + suffix, destinationBucket=dest_bucket, destinationObject=dest_prefix + suffix, body={}\n                ).execute()\n                copied_objs.append(dest_prefix + suffix)\n\n            _wait_for_consistency(lambda: all(self._obj_exists(dest_bucket, obj) for obj in copied_objs))\n        else:\n            self.client.objects().copy(\n                sourceBucket=src_bucket, sourceObject=src_obj, destinationBucket=dest_bucket, destinationObject=dest_obj, body={}\n            ).execute()\n            _wait_for_consistency(lambda: self._obj_exists(dest_bucket, dest_obj))\n\n    def rename(self, *args, **kwargs):\n        \"\"\"\n        Alias for ``move()``\n        \"\"\"\n        self.move(*args, **kwargs)\n\n    def move(self, source_path, destination_path):\n        \"\"\"\n        Rename/move an object from one GCS location to another.\n        \"\"\"\n        self.copy(source_path, destination_path)\n        self.remove(source_path)\n\n    def listdir(self, path):\n        \"\"\"\n        Get an iterable with GCS folder contents.\n        Iterable contains paths relative to queried path.\n        \"\"\"\n        bucket, obj = self._path_to_bucket_and_key(path)\n\n        obj_prefix = self._add_path_delimiter(obj)\n        if self._is_root(obj_prefix):\n            obj_prefix = \"\"\n\n        obj_prefix_len = len(obj_prefix)\n        for it in self._list_iter(bucket, obj_prefix):\n            yield self._add_path_delimiter(path) + it[\"name\"][obj_prefix_len:]\n\n    def list_wildcard(self, wildcard_path):\n        \"\"\"Yields full object URIs matching the given wildcard.\n\n        Currently only the '*' wildcard after the last path delimiter is supported.\n\n 
       (If we need \"full\" wildcard functionality we should bring in gsutil dependency with its\n        https://github.com/GoogleCloudPlatform/gsutil/blob/master/gslib/wildcard_iterator.py...)\n        \"\"\"\n        path, wildcard_obj = wildcard_path.rsplit(\"/\", 1)\n        assert \"*\" not in path, \"The '*' wildcard character is only supported after the last '/'\"\n        wildcard_parts = wildcard_obj.split(\"*\")\n        assert len(wildcard_parts) == 2, \"Only one '*' wildcard is supported\"\n\n        for it in self.listdir(path):\n            if (\n                it.startswith(path + \"/\" + wildcard_parts[0])\n                and it.endswith(wildcard_parts[1])\n                and len(it) >= len(path + \"/\" + wildcard_parts[0]) + len(wildcard_parts[1])\n            ):\n                yield it\n\n    @gcs_retry\n    def download(self, path, chunksize=None, chunk_callback=lambda _: False):\n        \"\"\"Downloads the object contents to local file system.\n\n        Optionally stops after the first chunk for which chunk_callback returns True.\n        \"\"\"\n        chunksize = chunksize or self.chunksize\n        bucket, obj = self._path_to_bucket_and_key(path)\n\n        with tempfile.NamedTemporaryFile(delete=False) as fp:\n            # We can't return the tempfile reference because of a bug in python: http://bugs.python.org/issue18879\n            return_fp = _DeleteOnCloseFile(fp.name, \"r\")\n\n            # Special case empty files because chunk-based downloading doesn't work.\n            result = self.client.objects().get(bucket=bucket, object=obj).execute()\n            if int(result[\"size\"]) == 0:\n                return return_fp\n\n            request = self.client.objects().get_media(bucket=bucket, object=obj)\n            downloader = http.MediaIoBaseDownload(fp, request, chunksize=chunksize)\n\n            done = False\n            while not done:\n                _, done = downloader.next_chunk()\n                if 
chunk_callback(fp):\n                    done = True\n\n        return return_fp\n\n\nclass _DeleteOnCloseFile(io.FileIO):\n    def close(self):\n        super(_DeleteOnCloseFile, self).close()\n        try:\n            os.remove(self.name)\n        except OSError:\n            # Catch a potential threading race condition and also allow this\n            # method to be called multiple times.\n            pass\n\n    def readable(self):\n        return True\n\n    def writable(self):\n        return False\n\n    def seekable(self):\n        return True\n\n\nclass AtomicGCSFile(luigi.target.AtomicLocalFile):\n    \"\"\"\n    A GCS file that writes to a temp file and put to GCS on close.\n    \"\"\"\n\n    def __init__(self, path, gcs_client):\n        self.gcs_client = gcs_client\n        super(AtomicGCSFile, self).__init__(path)\n\n    def move_to_final_destination(self):\n        self.gcs_client.put(self.tmp_path, self.path)\n\n\nclass GCSTarget(luigi.target.FileSystemTarget):\n    fs = None\n\n    def __init__(self, path, format=None, client=None):\n        super(GCSTarget, self).__init__(path)\n        if format is None:\n            format = luigi.format.get_default_format()\n\n        self.format = format\n        self.fs = client or GCSClient()\n\n    def open(self, mode=\"r\"):\n        if mode == \"r\":\n            return self.format.pipe_reader(FileWrapper(io.BufferedReader(self.fs.download(self.path))))\n        elif mode == \"w\":\n            return self.format.pipe_writer(AtomicGCSFile(self.path, self.fs))\n        else:\n            raise ValueError(\"Unsupported open mode '{}'\".format(mode))\n\n\nclass GCSFlagTarget(GCSTarget):\n    \"\"\"\n    Defines a target directory with a flag-file (defaults to `_SUCCESS`) used\n    to signify job success.\n\n    This checks for two things:\n\n    * the path exists (just like the GCSTarget)\n    * the _SUCCESS file exists within the directory.\n\n    Because Hadoop outputs into a directory and not a single 
file,\n    the path is assumed to be a directory.\n\n    This is meant to be a handy alternative to AtomicGCSFile.\n\n    The AtomicFile approach can be burdensome for GCS since there are no directories, per se.\n\n    If we have 1,000,000 output files, then we have to rename 1,000,000 objects.\n    \"\"\"\n\n    fs = None\n\n    def __init__(self, path, format=None, client=None, flag=\"_SUCCESS\"):\n        \"\"\"\n        Initializes a GCSFlagTarget.\n\n        :param path: the directory where the files are stored.\n        :type path: str\n        :param client:\n        :type client:\n        :param flag:\n        :type flag: str\n        \"\"\"\n        if format is None:\n            format = luigi.format.get_default_format()\n\n        if path[-1] != \"/\":\n            raise ValueError(\"GCSFlagTarget requires the path to be to a directory.  It must end with a slash ( / ).\")\n        super(GCSFlagTarget, self).__init__(path, format=format, client=client)\n        self.format = format\n        self.fs = client or GCSClient()\n        self.flag = flag\n\n    def exists(self):\n        flag_target = self.path + self.flag\n        return self.fs.exists(flag_target)\n"
  },
  {
    "path": "luigi/contrib/hadoop.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nRun Hadoop Mapreduce jobs using Hadoop Streaming. To run a job, you need\nto subclass :py:class:`luigi.contrib.hadoop.JobTask` and implement a\n``mapper`` and ``reducer`` methods. See :doc:`/example_top_artists` for\nan example of how to run a Hadoop job.\n\"\"\"\n\nimport abc\nimport datetime\nimport glob\nimport hashlib\nimport logging\nimport os\nimport pickle\nimport random\nimport re\nimport shutil\nimport signal\nimport subprocess\nimport sys\nimport tempfile\nimport warnings\nfrom io import StringIO\nfrom itertools import groupby\n\nimport luigi\nimport luigi.contrib.gcs\nimport luigi.contrib.hdfs\nimport luigi.contrib.s3\nfrom luigi import configuration\nfrom luigi.contrib import mrrunner\nfrom luigi.task import Config\n\ntry:\n    # See benchmark at https://gist.github.com/mvj3/02dca2bcc8b0ef1bbfb5\n    import ujson as json\nexcept ImportError:\n    import json\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n_attached_packages = []\n\n\nTRACKING_RE = re.compile(r\"(tracking url|the url to track the job):\\s+(?P<url>.+)$\")\n\n\nclass hadoop(Config):\n    pool = luigi.OptionalParameter(\n        default=None,\n        description=(\"Hadoop pool so use for Hadoop tasks. 
To specify pools per tasks, see BaseHadoopJobTask.pool\"),\n    )\n\n\ndef attach(*packages):\n    \"\"\"\n    Attach a python package to hadoop map reduce tarballs to make those packages available\n    on the hadoop cluster.\n    \"\"\"\n    _attached_packages.extend(packages)\n\n\ndef dereference(f):\n    if os.path.islink(f):\n        # by joining with the dirname we are certain to get the absolute path\n        return dereference(os.path.join(os.path.dirname(f), os.readlink(f)))\n    else:\n        return f\n\n\ndef get_extra_files(extra_files):\n    result = []\n    for f in extra_files:\n        if isinstance(f, str):\n            src, dst = f, os.path.basename(f)\n        elif isinstance(f, tuple):\n            src, dst = f\n        else:\n            raise Exception()\n\n        if os.path.isdir(src):\n            src_prefix = os.path.join(src, \"\")\n            for base, dirs, files in os.walk(src):\n                for f in files:\n                    f_src = os.path.join(base, f)\n                    f_src_stripped = f_src[len(src_prefix) :]\n                    f_dst = os.path.join(dst, f_src_stripped)\n                    result.append((f_src, f_dst))\n        else:\n            result.append((src, dst))\n\n    return result\n\n\ndef create_packages_archive(packages, filename):\n    \"\"\"\n    Create a tar archive which will contain the files for the packages listed in packages.\n    \"\"\"\n    import tarfile\n\n    tar = tarfile.open(filename, \"w\")\n\n    def add(src, dst):\n        logger.debug(\"adding to tar: %s -> %s\", src, dst)\n        tar.add(src, dst)\n\n    def add_files_for_package(sub_package_path, root_package_path, root_package_name):\n        for root, dirs, files in os.walk(sub_package_path):\n            if \".svn\" in dirs:\n                dirs.remove(\".svn\")\n            for f in files:\n                if not f.endswith(\".pyc\") and not f.startswith(\".\"):\n                    add(dereference(root + \"/\" + f), 
root.replace(root_package_path, root_package_name) + \"/\" + f)\n\n    for package in packages:\n        # Put a submodule's entire package in the archive. This is the\n        # magic that usually packages everything you need without\n        # having to attach packages/modules explicitly\n        if not getattr(package, \"__path__\", None) and \".\" in package.__name__:\n            package = __import__(package.__name__.rpartition(\".\")[0], None, None, \"non_empty\")\n\n        n = package.__name__.replace(\".\", \"/\")\n\n        if getattr(package, \"__path__\", None):\n            # TODO: (BUG) picking only the first path does not\n            # properly deal with namespaced packages in different\n            # directories\n            p = package.__path__[0]\n\n            if p.endswith(\".egg\") and os.path.isfile(p):\n                raise \"egg files not supported!!!\"\n                # Add the entire egg file\n                # p = p[:p.find('.egg') + 4]\n                # add(dereference(p), os.path.basename(p))\n\n            else:\n                # include __init__ files from parent projects\n                root = []\n                for parent in package.__name__.split(\".\")[0:-1]:\n                    root.append(parent)\n                    module_name = \".\".join(root)\n                    directory = \"/\".join(root)\n\n                    add(dereference(__import__(module_name, None, None, \"non_empty\").__path__[0] + \"/__init__.py\"), directory + \"/__init__.py\")\n\n                add_files_for_package(p, p, n)\n\n                # include egg-info directories that are parallel:\n                for egg_info_path in glob.glob(p + \"*.egg-info\"):\n                    logger.debug('Adding package metadata to archive for \"%s\" found at \"%s\"', package.__name__, egg_info_path)\n                    add_files_for_package(egg_info_path, p, n)\n\n        else:\n            f = package.__file__\n            if f.endswith(\"pyc\"):\n           
     f = f[:-3] + \"py\"\n            if n.find(\".\") == -1:\n                add(dereference(f), os.path.basename(f))\n            else:\n                add(dereference(f), n + \".py\")\n    tar.close()\n\n\ndef flatten(sequence):\n    \"\"\"\n    A simple generator which flattens a sequence.\n\n    Only one level is flattened.\n\n    .. code-block:: python\n\n        (1, (2, 3), 4) -> (1, 2, 3, 4)\n\n    \"\"\"\n    for item in sequence:\n        if hasattr(item, \"__iter__\") and not isinstance(item, str) and not isinstance(item, bytes):\n            for i in item:\n                yield i\n        else:\n            yield item\n\n\nclass HadoopRunContext:\n    def __init__(self):\n        self.job_id = None\n        self.application_id = None\n\n    def __enter__(self):\n        self.__old_signal = signal.getsignal(signal.SIGTERM)\n        signal.signal(signal.SIGTERM, self.kill_job)\n        return self\n\n    def kill_job(self, captured_signal=None, stack_frame=None):\n        if self.application_id:\n            logger.info(\"Job interrupted, killing application %s\" % self.application_id)\n            subprocess.call([\"yarn\", \"application\", \"-kill\", self.application_id])\n        elif self.job_id:\n            logger.info(\"Job interrupted, killing job %s\", self.job_id)\n            subprocess.call([\"mapred\", \"job\", \"-kill\", self.job_id])\n        if captured_signal is not None:\n            # adding 128 gives the exit code corresponding to a signal\n            sys.exit(128 + captured_signal)\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        if exc_type is KeyboardInterrupt:\n            self.kill_job()\n        signal.signal(signal.SIGTERM, self.__old_signal)\n\n\nclass HadoopJobError(RuntimeError):\n    def __init__(self, message, out=None, err=None):\n        super(HadoopJobError, self).__init__(message, out, err)\n        self.message = message\n        self.out = out\n        self.err = err\n\n    def __str__(self):\n       
 return self.message\n\n\ndef run_and_track_hadoop_job(arglist, tracking_url_callback=None, env=None):\n    \"\"\"\n    Runs the job by invoking the command from the given arglist.\n    Finds tracking urls from the output and attempts to fetch errors using those urls if the job fails.\n    Throws HadoopJobError with information about the error\n    (including stdout and stderr from the process)\n    on failure and returns normally otherwise.\n\n    :param arglist:\n    :param tracking_url_callback:\n    :param env:\n    :return:\n    \"\"\"\n    logger.info(\"%s\", subprocess.list2cmdline(arglist))\n\n    def write_luigi_history(arglist, history):\n        \"\"\"\n        Writes history to a file in the job's output directory in JSON format.\n        Currently just for tracking the job ID in a configuration where\n        no history is stored in the output directory by Hadoop.\n        \"\"\"\n        history_filename = configuration.get_config().get(\"core\", \"history-filename\", \"\")\n        if history_filename and \"-output\" in arglist:\n            output_dir = arglist[arglist.index(\"-output\") + 1]\n            f = luigi.contrib.hdfs.HdfsTarget(os.path.join(output_dir, history_filename)).open(\"w\")\n            f.write(json.dumps(history))\n            f.close()\n\n    def track_process(arglist, tracking_url_callback, env=None):\n        # Dump stdout to a temp file, poll stderr and log it\n        temp_stdout = tempfile.TemporaryFile(\"w+t\")\n        proc = subprocess.Popen(arglist, stdout=temp_stdout, stderr=subprocess.PIPE, env=env, close_fds=True, universal_newlines=True)\n\n        # We parse the output to try to find the tracking URL.\n        # This URL is useful for fetching the logs of the job.\n        tracking_url = None\n        job_id = None\n        application_id = None\n        err_lines = []\n\n        with HadoopRunContext() as hadoop_context:\n            while proc.poll() is None:\n                err_line = proc.stderr.readline()\n  
              err_lines.append(err_line)\n                err_line = err_line.strip()\n                if err_line:\n                    logger.info(\"%s\", err_line)\n                err_line = err_line.lower()\n                tracking_url_match = TRACKING_RE.search(err_line)\n                if tracking_url_match:\n                    tracking_url = tracking_url_match.group(\"url\")\n                    try:\n                        tracking_url_callback(tracking_url)\n                    except Exception as e:\n                        logger.error(\"Error in tracking_url_callback, disabling! %s\", e)\n\n                        def tracking_url_callback(x):\n                            return None\n\n                if err_line.find(\"running job\") != -1:\n                    # hadoop jar output\n                    job_id = err_line.split(\"running job: \")[-1]\n                if err_line.find(\"submitted hadoop job:\") != -1:\n                    # scalding output\n                    job_id = err_line.split(\"submitted hadoop job: \")[-1]\n                if err_line.find(\"submitted application \") != -1:\n                    application_id = err_line.split(\"submitted application \")[-1]\n                hadoop_context.job_id = job_id\n                hadoop_context.application_id = application_id\n\n        # Read the rest + stdout\n        err = \"\".join(err_lines + [an_err_line for an_err_line in proc.stderr])\n        temp_stdout.seek(0)\n        out = \"\".join(temp_stdout.readlines())\n\n        if proc.returncode == 0:\n            write_luigi_history(arglist, {\"job_id\": job_id})\n            return (out, err)\n\n        # Try to fetch error logs if possible\n        message = \"Streaming job failed with exit code %d. 
\" % proc.returncode\n        if not tracking_url:\n            raise HadoopJobError(message + \"Also, no tracking url found.\", out, err)\n\n        try:\n            task_failures = fetch_task_failures(tracking_url)\n        except Exception as e:\n            raise HadoopJobError(message + \"Additionally, an error occurred when fetching data from %s: %s\" % (tracking_url, e), out, err)\n\n        if not task_failures:\n            raise HadoopJobError(message + \"Also, could not fetch output from tasks.\", out, err)\n        else:\n            raise HadoopJobError(message + \"Output from tasks below:\\n%s\" % task_failures, out, err)\n\n    if tracking_url_callback is None:\n\n        def tracking_url_callback(x):\n            return None\n\n    return track_process(arglist, tracking_url_callback, env)\n\n\ndef fetch_task_failures(tracking_url):\n    \"\"\"\n    Uses mechanize to fetch the actual task logs from the task tracker.\n\n    This is highly opportunistic, and we might not succeed.\n    So we set a low timeout and hope it works.\n    If it does not, it's not the end of the world.\n\n    TODO: Yarn has a REST API that we should probably use instead:\n    http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/WebServicesIntro.html\n    \"\"\"\n    import mechanize\n\n    timeout = 3.0\n    failures_url = tracking_url.replace(\"jobdetails.jsp\", \"jobfailures.jsp\") + \"&cause=failed\"\n    logger.debug(\"Fetching data from %s\", failures_url)\n    b = mechanize.Browser()\n    b.open(failures_url, timeout=timeout)\n    links = list(b.links(text_regex=\"Last 4KB\"))  # For some reason text_regex='All' doesn't work... 
no idea why\n    links = random.sample(links, min(10, len(links)))  # Fetch a random subset of all failed tasks, so not to be biased towards the early fails\n    error_text = []\n    for link in links:\n        task_url = link.url.replace(\"&start=-4097\", \"&start=-100000\")  # Increase the offset\n        logger.debug(\"Fetching data from %s\", task_url)\n        b2 = mechanize.Browser()\n        try:\n            r = b2.open(task_url, timeout=timeout)\n            data = r.read()\n        except Exception as e:\n            logger.debug(\"Error fetching data from %s: %s\", task_url, e)\n            continue\n        # Try to get the hex-encoded traceback back from the output\n        for exc in re.findall(r\"luigi-exc-hex=[0-9a-f]+\", data):\n            error_text.append(\"---------- %s:\" % task_url)\n            error_text.append(exc.split(\"=\")[-1].decode(\"hex\"))\n\n    return \"\\n\".join(error_text)\n\n\nclass JobRunner:\n    run_job = NotImplemented\n\n\nclass HadoopJobRunner(JobRunner):\n    \"\"\"\n    Takes care of uploading & executing a Hadoop job using Hadoop streaming.\n\n    TODO: add code to support Elastic Mapreduce (using boto) and local execution.\n    \"\"\"\n\n    def __init__(\n        self,\n        streaming_jar,\n        modules=None,\n        streaming_args=None,\n        libjars=None,\n        libjars_in_hdfs=None,\n        jobconfs=None,\n        input_format=None,\n        output_format=None,\n        end_job_with_atomic_move_dir=True,\n        archives=None,\n    ):\n        def get(x, default):\n            return x is not None and x or default\n\n        self.streaming_jar = streaming_jar\n        self.modules = get(modules, [])\n        self.streaming_args = get(streaming_args, [])\n        self.libjars = get(libjars, [])\n        self.libjars_in_hdfs = get(libjars_in_hdfs, [])\n        self.archives = get(archives, [])\n        self.jobconfs = get(jobconfs, {})\n        self.input_format = input_format\n        
self.output_format = output_format\n        self.end_job_with_atomic_move_dir = end_job_with_atomic_move_dir\n        self.tmp_dir = False\n\n    def run_job(self, job, tracking_url_callback=None):\n        if tracking_url_callback is not None:\n            warnings.warn(\"tracking_url_callback argument is deprecated, task.set_tracking_url is used instead.\", DeprecationWarning)\n\n        packages = [luigi] + self.modules + job.extra_modules() + list(_attached_packages)\n\n        # find the module containing the job\n        packages.append(__import__(job.__module__, None, None, \"dummy\"))\n\n        # find the path to out runner.py\n        runner_path = mrrunner.__file__\n        # assume source is next to compiled\n        if runner_path.endswith(\"pyc\"):\n            runner_path = runner_path[:-3] + \"py\"\n\n        base_tmp_dir = configuration.get_config().get(\"core\", \"tmp-dir\", None)\n        if base_tmp_dir:\n            warnings.warn(\n                \"The core.tmp-dir configuration item is\"\n                \" deprecated, please use the TMPDIR\"\n                \" environment variable if you wish\"\n                \" to control where luigi.contrib.hadoop may\"\n                \" create temporary files and directories.\"\n            )\n            self.tmp_dir = os.path.join(base_tmp_dir, \"hadoop_job_%016x\" % random.getrandbits(64))\n            os.makedirs(self.tmp_dir)\n        else:\n            self.tmp_dir = tempfile.mkdtemp()\n\n        logger.debug(\"Tmp dir: %s\", self.tmp_dir)\n\n        # build arguments\n        config = configuration.get_config()\n        python_executable = config.get(\"hadoop\", \"python-executable\", \"python\")\n        runner_arg = \"mrrunner.pex\" if job.package_binary is not None else \"mrrunner.py\"\n        command = \"{0} {1} {{step}}\".format(python_executable, runner_arg)\n        map_cmd = command.format(step=\"map\")\n        cmb_cmd = command.format(step=\"combiner\")\n        red_cmd = 
command.format(step=\"reduce\")\n\n        output_final = job.output().path\n        # atomic output: replace output with a temporary work directory\n        if self.end_job_with_atomic_move_dir:\n            illegal_targets = (luigi.contrib.s3.S3FlagTarget, luigi.contrib.gcs.GCSFlagTarget)\n            if isinstance(job.output(), illegal_targets):\n                raise TypeError(\"end_job_with_atomic_move_dir is not supported for {}\".format(illegal_targets))\n            output_hadoop = \"{output}-temp-{time}\".format(output=output_final, time=datetime.datetime.now().isoformat().replace(\":\", \"-\"))\n        else:\n            output_hadoop = output_final\n\n        arglist = luigi.contrib.hdfs.load_hadoop_cmd() + [\"jar\", self.streaming_jar]\n\n        # 'libjars' is a generic option, so place it first\n        libjars = [libjar for libjar in self.libjars]\n\n        for libjar in self.libjars_in_hdfs:\n            run_cmd = luigi.contrib.hdfs.load_hadoop_cmd() + [\"fs\", \"-get\", libjar, self.tmp_dir]\n            logger.debug(subprocess.list2cmdline(run_cmd))\n            subprocess.call(run_cmd)\n            libjars.append(os.path.join(self.tmp_dir, os.path.basename(libjar)))\n\n        if libjars:\n            arglist += [\"-libjars\", \",\".join(libjars)]\n\n        # 'archives' is also a generic option\n        archives = []\n        extra_archives = job.extra_archives()\n\n        if self.archives:\n            archives = self.archives\n\n        if extra_archives:\n            archives += extra_archives\n\n        if archives:\n            arglist += [\"-archives\", \",\".join(archives)]\n\n        # Add static files and directories\n        extra_files = get_extra_files(job.extra_files())\n\n        files = []\n        for src, dst in extra_files:\n            dst_tmp = \"%s_%09d\" % (dst.replace(\"/\", \"_\"), random.randint(0, 999999999))\n            files += [\"%s#%s\" % (src, dst_tmp)]\n            # -files doesn't support subdirectories, so 
we need to create the dst_tmp -> dst manually\n            job.add_link(dst_tmp, dst)\n\n        if files:\n            arglist += [\"-files\", \",\".join(files)]\n\n        jobconfs = job.jobconfs()\n\n        for k, v in self.jobconfs.items():\n            jobconfs.append(\"%s=%s\" % (k, v))\n\n        for conf in jobconfs:\n            arglist += [\"-D\", conf]\n\n        arglist += self.streaming_args\n\n        # Add additional non-generic  per-job streaming args\n        extra_streaming_args = job.extra_streaming_arguments()\n        for arg, value in extra_streaming_args:\n            if not arg.startswith(\"-\"):  # safety first\n                arg = \"-\" + arg\n            arglist += [arg, value]\n\n        arglist += [\"-mapper\", map_cmd]\n\n        if job.combiner != NotImplemented:\n            arglist += [\"-combiner\", cmb_cmd]\n        if job.reducer != NotImplemented:\n            arglist += [\"-reducer\", red_cmd]\n        packages_fn = \"mrrunner.pex\" if job.package_binary is not None else \"packages.tar\"\n        files = [\n            runner_path if job.package_binary is None else None,\n            os.path.join(self.tmp_dir, packages_fn),\n            os.path.join(self.tmp_dir, \"job-instance.pickle\"),\n        ]\n\n        for f in filter(None, files):\n            arglist += [\"-file\", f]\n\n        if self.output_format:\n            arglist += [\"-outputformat\", self.output_format]\n        if self.input_format:\n            arglist += [\"-inputformat\", self.input_format]\n\n        allowed_input_targets = (luigi.contrib.hdfs.HdfsTarget, luigi.contrib.s3.S3Target, luigi.contrib.gcs.GCSTarget)\n        for target in luigi.task.flatten(job.input_hadoop()):\n            if not isinstance(target, allowed_input_targets):\n                raise TypeError(\"target must one of: {}\".format(allowed_input_targets))\n            arglist += [\"-input\", target.path]\n\n        allowed_output_targets = (luigi.contrib.hdfs.HdfsTarget, 
luigi.contrib.s3.S3FlagTarget, luigi.contrib.gcs.GCSFlagTarget)\n        if not isinstance(job.output(), allowed_output_targets):\n            raise TypeError(\"output must be one of: {}\".format(allowed_output_targets))\n        arglist += [\"-output\", output_hadoop]\n\n        # submit job\n        if job.package_binary is not None:\n            shutil.copy(job.package_binary, os.path.join(self.tmp_dir, \"mrrunner.pex\"))\n        else:\n            create_packages_archive(packages, os.path.join(self.tmp_dir, \"packages.tar\"))\n\n        job.dump(self.tmp_dir)\n\n        run_and_track_hadoop_job(arglist, tracking_url_callback=job.set_tracking_url)\n\n        if self.end_job_with_atomic_move_dir:\n            luigi.contrib.hdfs.HdfsTarget(output_hadoop).move_dir(output_final)\n        self.finish()\n\n    def finish(self):\n        # FIXME: check for isdir?\n        if self.tmp_dir and os.path.exists(self.tmp_dir):\n            logger.debug(\"Removing directory %s\", self.tmp_dir)\n            shutil.rmtree(self.tmp_dir)\n\n    def __del__(self):\n        self.finish()\n\n\nclass DefaultHadoopJobRunner(HadoopJobRunner):\n    \"\"\"\n    The default job runner just reads from config and sets stuff.\n    \"\"\"\n\n    def __init__(self):\n        config = configuration.get_config()\n        streaming_jar = config.get(\"hadoop\", \"streaming-jar\")\n        super(DefaultHadoopJobRunner, self).__init__(streaming_jar=streaming_jar)\n        # TODO: add more configurable options\n\n\nclass LocalJobRunner(JobRunner):\n    \"\"\"\n    Will run the job locally.\n\n    This is useful for debugging and also unit testing. 
Tries to mimic Hadoop Streaming.\n\n    TODO: integrate with JobTask\n    \"\"\"\n\n    def __init__(self, samplelines=None):\n        self.samplelines = samplelines\n\n    def sample(self, input_stream, n, output):\n        for i, line in enumerate(input_stream):\n            if n is not None and i >= n:\n                break\n            output.write(line)\n\n    def group(self, input_stream):\n        output = StringIO()\n        lines = []\n        for i, line in enumerate(input_stream):\n            parts = line.rstrip(\"\\n\").split(\"\\t\")\n            blob = hashlib.new(\"md5\", str(i).encode(\"ascii\"), usedforsecurity=False).hexdigest()  # pseudo-random blob to make sure the input isn't sorted\n            lines.append((parts[:-1], blob, line))\n        for _, _, line in sorted(lines):\n            output.write(line)\n        output.seek(0)\n        return output\n\n    def run_job(self, job):\n        map_input = StringIO()\n\n        for i in luigi.task.flatten(job.input_hadoop()):\n            self.sample(i.open(\"r\"), self.samplelines, map_input)\n\n        map_input.seek(0)\n\n        if job.reducer == NotImplemented:\n            # Map only job; no combiner, no reducer\n            map_output = job.output().open(\"w\")\n            job.run_mapper(map_input, map_output)\n            map_output.close()\n            return\n\n        # run job now...\n        map_output = StringIO()\n        job.run_mapper(map_input, map_output)\n        map_output.seek(0)\n\n        if job.combiner == NotImplemented:\n            reduce_input = self.group(map_output)\n        else:\n            combine_input = self.group(map_output)\n            combine_output = StringIO()\n            job.run_combiner(combine_input, combine_output)\n            combine_output.seek(0)\n            reduce_input = self.group(combine_output)\n\n        reduce_output = job.output().open(\"w\")\n        job.run_reducer(reduce_input, reduce_output)\n        
reduce_output.close()\n\n\nclass BaseHadoopJobTask(luigi.Task):\n    pool = luigi.OptionalParameter(default=None, significant=False, positional=False)\n    # This value can be set to change the default batching increment. Default is 1 for backwards compatibility.\n    batch_counter_default = 1\n\n    final_mapper = NotImplemented\n    final_combiner = NotImplemented\n    final_reducer = NotImplemented\n\n    mr_priority = NotImplemented\n    package_binary = None\n\n    _counter_dict = {}\n    task_id = None\n\n    def _get_pool(self):\n        \"\"\"Protected method\"\"\"\n        if self.pool:\n            return self.pool\n        if hadoop().pool:\n            return hadoop().pool\n\n    @abc.abstractmethod\n    def job_runner(self):\n        pass\n\n    def jobconfs(self):\n        jcs = []\n        jcs.append(\"mapred.job.name=%s\" % self)\n        if self.mr_priority != NotImplemented:\n            jcs.append(\"mapred.job.priority=%s\" % self.mr_priority())\n        pool = self._get_pool()\n        if pool is not None:\n            # Supporting two schedulers: fair (default) and capacity using the same option\n            scheduler_type = configuration.get_config().get(\"hadoop\", \"scheduler\", \"fair\")\n            if scheduler_type == \"fair\":\n                jcs.append(\"mapred.fairscheduler.pool=%s\" % pool)\n            elif scheduler_type == \"capacity\":\n                jcs.append(\"mapred.job.queue.name=%s\" % pool)\n        return jcs\n\n    def init_local(self):\n        \"\"\"\n        Implement any work to setup any internal datastructure etc here.\n\n        You can add extra input using the requires_local/input_local methods.\n\n        Anything you set on the object will be pickled and available on the Hadoop nodes.\n        \"\"\"\n        pass\n\n    def init_hadoop(self):\n        pass\n\n    # available formats are \"python\" and \"json\".\n    data_interchange_format = \"python\"\n\n    def run(self):\n        # The best solution is 
to store them as lazy `cached_property`, but it\n        # has extraneous dependency. And `property` is slow (need to be\n        # calculated every time when called), so we save them as attributes\n        # directly.\n        self.serialize = DataInterchange[self.data_interchange_format][\"serialize\"]\n        self.internal_serialize = DataInterchange[self.data_interchange_format][\"internal_serialize\"]\n        self.deserialize = DataInterchange[self.data_interchange_format][\"deserialize\"]\n\n        self.init_local()\n        self.job_runner().run_job(self)\n\n    def requires_local(self):\n        \"\"\"\n        Default impl - override this method if you need any local input to be accessible in init().\n        \"\"\"\n        return []\n\n    def requires_hadoop(self):\n        return self.requires()  # default impl\n\n    def input_local(self):\n        return luigi.task.getpaths(self.requires_local())\n\n    def input_hadoop(self):\n        return luigi.task.getpaths(self.requires_hadoop())\n\n    def deps(self):\n        # Overrides the default implementation\n        return luigi.task.flatten(self.requires_hadoop()) + luigi.task.flatten(self.requires_local())\n\n    def on_failure(self, exception):\n        if isinstance(exception, HadoopJobError):\n            return \"\"\"Hadoop job failed with message: {message}\n\n    stdout:\n    {stdout}\n\n\n    stderr:\n    {stderr}\n      \"\"\".format(message=exception.message, stdout=exception.out, stderr=exception.err)\n        else:\n            return super(BaseHadoopJobTask, self).on_failure(exception)\n\n\nDataInterchange = {\n    \"python\": {\"serialize\": str, \"internal_serialize\": repr, \"deserialize\": eval},\n    \"json\": {\"serialize\": json.dumps, \"internal_serialize\": json.dumps, \"deserialize\": json.loads},\n}\n\n\nclass JobTask(BaseHadoopJobTask):\n    jobconf_truncate = 20000\n    n_reduce_tasks = 25\n    reducer = NotImplemented\n\n    def jobconfs(self):\n        jcs = 
super(JobTask, self).jobconfs()\n        if self.reducer == NotImplemented:\n            jcs.append(\"mapred.reduce.tasks=0\")\n        else:\n            jcs.append(\"mapred.reduce.tasks=%s\" % self.n_reduce_tasks)\n        if self.jobconf_truncate >= 0:\n            jcs.append(\"stream.jobconf.truncate.limit=%i\" % self.jobconf_truncate)\n        return jcs\n\n    def init_mapper(self):\n        pass\n\n    def init_combiner(self):\n        pass\n\n    def init_reducer(self):\n        pass\n\n    def _setup_remote(self):\n        self._setup_links()\n\n    def job_runner(self):\n        # We recommend that you define a subclass, override this method and set up your own config\n        \"\"\"\n        Get the MapReduce runner for this job.\n\n        If all outputs are HdfsTargets, the DefaultHadoopJobRunner will be used.\n        Otherwise, the LocalJobRunner which streams all data through the local machine\n        will be used (great for testing).\n        \"\"\"\n        outputs = luigi.task.flatten(self.output())\n        for output in outputs:\n            if not isinstance(output, luigi.contrib.hdfs.HdfsTarget):\n                warnings.warn(\"Job is using one or more non-HdfsTarget outputs\" + \" so it will be run in local mode\")\n                return LocalJobRunner()\n        else:\n            return DefaultHadoopJobRunner()\n\n    def reader(self, input_stream):\n        \"\"\"\n        Reader is a method which iterates over input lines and outputs records.\n\n        The default implementation yields one argument containing the line for each line in the input.\"\"\"\n        for line in input_stream:\n            yield (line,)\n\n    def writer(self, outputs, stdout, stderr=sys.stderr):\n        \"\"\"\n        Writer format is a method which iterates over the output records\n        from the reducer and formats them for output.\n\n        The default implementation outputs tab separated items.\n        \"\"\"\n        for output in outputs:\n      
      try:\n                output = flatten(output)\n                if self.data_interchange_format == \"json\":\n                    # Only dump one json string, and skip another one, maybe key or value.\n                    output = filter(lambda x: x, output)\n                else:\n                    # JSON is already serialized, so we put `self.serialize` in a else statement.\n                    output = map(self.serialize, output)\n                print(\"\\t\".join(output), file=stdout)\n            except BaseException:\n                print(output, file=stderr)\n                raise\n\n    def mapper(self, item):\n        \"\"\"\n        Re-define to process an input item (usually a line of input data).\n\n        Defaults to identity mapper that sends all lines to the same reducer.\n        \"\"\"\n        yield None, item\n\n    combiner = NotImplemented\n\n    def incr_counter(self, *args, **kwargs):\n        \"\"\"\n        Increments a Hadoop counter.\n\n        Since counters can be a bit slow to update, this batches the updates.\n        \"\"\"\n        threshold = kwargs.get(\"threshold\", self.batch_counter_default)\n        if len(args) == 2:\n            # backwards compatibility with existing hadoop jobs\n            group_name, count = args\n            key = (group_name,)\n        else:\n            group, name, count = args\n            key = (group, name)\n\n        ct = self._counter_dict.get(key, 0)\n        ct += count\n        if ct >= threshold:\n            new_arg = list(key) + [ct]\n            self._incr_counter(*new_arg)\n            ct = 0\n        self._counter_dict[key] = ct\n\n    def _flush_batch_incr_counter(self):\n        \"\"\"\n        Increments any unflushed counter values.\n        \"\"\"\n        for key, count in self._counter_dict.items():\n            if count == 0:\n                continue\n            args = list(key) + [count]\n            self._incr_counter(*args)\n            self._counter_dict[key] = 
0\n\n    def _incr_counter(self, *args):\n        \"\"\"\n        Increments a Hadoop counter.\n\n        Note that this seems to be a bit slow, ~1 ms\n\n        Don't overuse this function by updating very frequently.\n        \"\"\"\n        if len(args) == 2:\n            # backwards compatibility with existing hadoop jobs\n            group_name, count = args\n            print(\"reporter:counter:%s,%s\" % (group_name, count), file=sys.stderr)\n        else:\n            group, name, count = args\n            print(\"reporter:counter:%s,%s,%s\" % (group, name, count), file=sys.stderr)\n\n    def extra_modules(self):\n        return []  # can be overridden in subclass\n\n    def extra_files(self):\n        \"\"\"\n        Can be overridden in subclass.\n\n        Each element is either a string, or a pair of two strings (src, dst).\n\n        * `src` can be a directory (in which case everything will be copied recursively).\n        * `dst` can include subdirectories (foo/bar/baz.txt etc)\n\n        Uses Hadoop's -files option so that the same file is reused across tasks.\n        \"\"\"\n        return []\n\n    def extra_streaming_arguments(self):\n        \"\"\"\n        Extra arguments to Hadoop command line.\n        Return here a list of (parameter, value) tuples.\n        \"\"\"\n        return []\n\n    def extra_archives(self):\n        \"\"\"List of paths to archives\"\"\"\n        return []\n\n    def add_link(self, src, dst):\n        if not hasattr(self, \"_links\"):\n            self._links = []\n        self._links.append((src, dst))\n\n    def _setup_links(self):\n        if hasattr(self, \"_links\"):\n            missing = []\n            for src, dst in self._links:\n                d = os.path.dirname(dst)\n                if d:\n                    try:\n                        os.makedirs(d)\n                    except OSError:\n                        pass\n                if not os.path.exists(src):\n                    
missing.append(src)\n                    continue\n                if not os.path.exists(dst):\n                    # If the combiner runs, the file might already exist,\n                    # so no reason to create the link again\n                    os.link(src, dst)\n            if missing:\n                raise HadoopJobError(\"Missing files for distributed cache: \" + \", \".join(missing))\n\n    def dump(self, directory=\"\"):\n        \"\"\"\n        Dump instance to file.\n        \"\"\"\n        with self.no_unpicklable_properties():\n            file_name = os.path.join(directory, \"job-instance.pickle\")\n            if self.__module__ == \"__main__\":\n                d = pickle.dumps(self)\n                module_name = os.path.basename(sys.argv[0]).rsplit(\".\", 1)[0]\n                d = d.replace(b\"(c__main__\", \"(c\" + module_name)\n                open(file_name, \"wb\").write(d)\n\n            else:\n                pickle.dump(self, open(file_name, \"wb\"))\n\n    def _map_input(self, input_stream):\n        \"\"\"\n        Iterate over input and call the mapper for each item.\n        If the job has a parser defined, the return values from the parser will\n        be passed as arguments to the mapper.\n\n        If the input is coded output from a previous run,\n        the arguments will be splitted in key and value.\n        \"\"\"\n        for record in self.reader(input_stream):\n            for output in self.mapper(*record):\n                yield output\n        if self.final_mapper != NotImplemented:\n            for output in self.final_mapper():\n                yield output\n        self._flush_batch_incr_counter()\n\n    def _reduce_input(self, inputs, reducer, final=NotImplemented):\n        \"\"\"\n        Iterate over input, collect values with the same key, and call the reducer for each unique key.\n        \"\"\"\n        for key, values in groupby(inputs, key=lambda x: self.internal_serialize(x[0])):\n            for output 
in reducer(self.deserialize(key), (v[1] for v in values)):\n                yield output\n        if final != NotImplemented:\n            for output in final():\n                yield output\n        self._flush_batch_incr_counter()\n\n    def run_mapper(self, stdin=sys.stdin, stdout=sys.stdout):\n        \"\"\"\n        Run the mapper on the hadoop node.\n        \"\"\"\n        self.init_hadoop()\n        self.init_mapper()\n        outputs = self._map_input((line[:-1] for line in stdin))\n        if self.reducer == NotImplemented:\n            self.writer(outputs, stdout)\n        else:\n            self.internal_writer(outputs, stdout)\n\n    def run_reducer(self, stdin=sys.stdin, stdout=sys.stdout):\n        \"\"\"\n        Run the reducer on the hadoop node.\n        \"\"\"\n        self.init_hadoop()\n        self.init_reducer()\n        outputs = self._reduce_input(self.internal_reader((line[:-1] for line in stdin)), self.reducer, self.final_reducer)\n        self.writer(outputs, stdout)\n\n    def run_combiner(self, stdin=sys.stdin, stdout=sys.stdout):\n        self.init_hadoop()\n        self.init_combiner()\n        outputs = self._reduce_input(self.internal_reader((line[:-1] for line in stdin)), self.combiner, self.final_combiner)\n        self.internal_writer(outputs, stdout)\n\n    def internal_reader(self, input_stream):\n        \"\"\"\n        Reader which uses python eval on each part of a tab separated string.\n        Yields a tuple of python objects.\n        \"\"\"\n        for input_line in input_stream:\n            yield list(map(self.deserialize, input_line.split(\"\\t\")))\n\n    def internal_writer(self, outputs, stdout):\n        \"\"\"\n        Writer which outputs the python repr for each item.\n        \"\"\"\n        for output in outputs:\n            print(\"\\t\".join(map(self.internal_serialize, output)), file=stdout)\n"
  },
  {
    "path": "luigi/contrib/hadoop_jar.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nProvides functionality to run a Hadoop job using a Jar\n\"\"\"\n\nimport logging\nimport os\nimport random\nimport shlex\nimport warnings\n\nimport luigi.contrib.hadoop\nimport luigi.contrib.hdfs\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\ndef fix_paths(job):\n    \"\"\"\n    Coerce input arguments to use temporary files when used for output.\n\n    Return a list of temporary file pairs (tmpfile, destination path) and\n    a list of arguments.\n\n    Converts each HdfsTarget to a string for the path.\n    \"\"\"\n    tmp_files = []\n    args = []\n    for x in job.args():\n        if isinstance(x, luigi.contrib.hdfs.HdfsTarget):  # input/output\n            if x.exists() or not job.atomic_output():  # input\n                args.append(x.path)\n            else:  # output\n                x_path_no_slash = x.path[:-1] if x.path[-1] == \"/\" else x.path\n                y = luigi.contrib.hdfs.HdfsTarget(x_path_no_slash + \"-luigi-tmp-%09d\" % random.randrange(0, 10_000_000_000))\n                tmp_files.append((y, x_path_no_slash))\n                logger.info(\"Using temp path: %s for path %s\", y.path, x.path)\n                args.append(y.path)\n        else:\n            try:\n                # hopefully the target has a path to use\n                args.append(x.path)\n            except AttributeError:\n        
        # if there's no path then hope converting it to a string will work\n                args.append(str(x))\n\n    return (tmp_files, args)\n\n\nclass HadoopJarJobError(Exception):\n    pass\n\n\nclass HadoopJarJobRunner(luigi.contrib.hadoop.JobRunner):\n    \"\"\"\n    JobRunner for `hadoop jar` commands. Used to run a HadoopJarJobTask.\n    \"\"\"\n\n    def __init__(self):\n        pass\n\n    def run_job(self, job, tracking_url_callback=None):\n        if tracking_url_callback is not None:\n            warnings.warn(\"tracking_url_callback argument is deprecated, task.set_tracking_url is used instead.\", DeprecationWarning)\n\n        # TODO(jcrobak): libjars, files, etc. Can refactor out of\n        # hadoop.HadoopJobRunner\n        if not job.jar():\n            raise HadoopJarJobError(\"Jar not defined\")\n\n        hadoop_arglist = luigi.contrib.hdfs.load_hadoop_cmd() + [\"jar\", job.jar()]\n        if job.main():\n            hadoop_arglist.append(job.main())\n\n        jobconfs = job.jobconfs()\n\n        for jc in jobconfs:\n            hadoop_arglist += [\"-D\" + jc]\n\n        (tmp_files, job_args) = fix_paths(job)\n\n        hadoop_arglist += job_args\n\n        ssh_config = job.ssh()\n        if ssh_config:\n            host = ssh_config.get(\"host\", None)\n            key_file = ssh_config.get(\"key_file\", None)\n            username = ssh_config.get(\"username\", None)\n            if not host or not key_file or not username:\n                raise HadoopJarJobError(\"missing some config for HadoopRemoteJarJobRunner\")\n            arglist = [\"ssh\", \"-i\", key_file, \"-o\", \"BatchMode=yes\"]  # no password prompts etc\n            if ssh_config.get(\"no_host_key_check\", False):\n                arglist += [\"-o\", \"UserKnownHostsFile=/dev/null\", \"-o\", \"StrictHostKeyChecking=no\"]\n            arglist.append(\"{}@{}\".format(username, host))\n            hadoop_arglist = [shlex.quote(arg) for arg in hadoop_arglist]\n            
arglist.append(\" \".join(hadoop_arglist))\n        else:\n            if not os.path.exists(job.jar()):\n                logger.error(\"Can't find jar: %s, full path %s\", job.jar(), os.path.abspath(job.jar()))\n                raise HadoopJarJobError(\"job jar does not exist\")\n            arglist = hadoop_arglist\n\n        luigi.contrib.hadoop.run_and_track_hadoop_job(arglist, job.set_tracking_url)\n\n        for a, b in tmp_files:\n            a.move(b)\n\n\nclass HadoopJarJobTask(luigi.contrib.hadoop.BaseHadoopJobTask):\n    \"\"\"\n    A job task for `hadoop jar` commands that define a jar and (optional) main method.\n    \"\"\"\n\n    def jar(self):\n        \"\"\"\n        Path to the jar for this Hadoop Job.\n        \"\"\"\n        return None\n\n    def main(self):\n        \"\"\"\n        optional main method for this Hadoop Job.\n        \"\"\"\n        return None\n\n    def job_runner(self):\n        # We recommend that you define a subclass, override this method and set up your own config\n        return HadoopJarJobRunner()\n\n    def atomic_output(self):\n        \"\"\"\n        If True, then rewrite output arguments to be temp locations and\n        atomically move them into place after the job finishes.\n        \"\"\"\n        return True\n\n    def ssh(self):\n        \"\"\"\n        Set this to run hadoop command remotely via ssh. It needs to be a dict that looks like\n        {\"host\": \"myhost\", \"key_file\": None, \"username\": None, [\"no_host_key_check\": False]}\n        \"\"\"\n        return None\n\n    def args(self):\n        \"\"\"\n        Returns an array of args to pass to the job (after hadoop jar <jar> <main>).\n        \"\"\"\n        return []\n"
  },
  {
    "path": "luigi/contrib/hdfs/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nProvides access to HDFS using the :py:class:`HdfsTarget`, a subclass of :py:class:`~luigi.target.Target`.\nYou can configure what client by setting the \"client\" config under the \"hdfs\" section in the configuration, or using the ``--hdfs-client`` command line option.\n\"hadoopcli\" is the slowest, but should work out of the box.\n\nSince the hdfs functionality is quite big in luigi, it's split into smaller\nfiles under ``luigi/contrib/hdfs/*.py``. 
But for the sake of convenience and\nAPI stability, everything is reexported under :py:mod:`luigi.contrib.hdfs`.\n\"\"\"\n\n# imports\nfrom luigi.contrib.hdfs import clients as hdfs_clients\nfrom luigi.contrib.hdfs import config as hdfs_config\nfrom luigi.contrib.hdfs import error as hdfs_error\nfrom luigi.contrib.hdfs import format as hdfs_format\nfrom luigi.contrib.hdfs import hadoopcli_clients as hdfs_hadoopcli_clients\nfrom luigi.contrib.hdfs import target as hdfs_target\nfrom luigi.contrib.hdfs import webhdfs_client as hdfs_webhdfs_client\n\n# config.py\nhdfs = hdfs_config.hdfs\nload_hadoop_cmd = hdfs_config.load_hadoop_cmd\nget_configured_hadoop_version = hdfs_config.get_configured_hadoop_version\nget_configured_hdfs_client = hdfs_config.get_configured_hdfs_client\ntmppath = hdfs_config.tmppath\n\n\n# clients\nHDFSCliError = hdfs_error.HDFSCliError\ncall_check = hdfs_hadoopcli_clients.HdfsClient.call_check\nHdfsClient = hdfs_hadoopcli_clients.HdfsClient\nWebHdfsClient = hdfs_webhdfs_client.WebHdfsClient\nHdfsClientCdh3 = hdfs_hadoopcli_clients.HdfsClientCdh3\nHdfsClientApache1 = hdfs_hadoopcli_clients.HdfsClientApache1\ncreate_hadoopcli_client = hdfs_hadoopcli_clients.create_hadoopcli_client\nget_autoconfig_client = hdfs_clients.get_autoconfig_client\nexists = hdfs_clients.exists\nrename = hdfs_clients.rename\nremove = hdfs_clients.remove\nmkdir = hdfs_clients.mkdir\nlistdir = hdfs_clients.listdir\n\n\n# format.py\nHdfsReadPipe = hdfs_format.HdfsReadPipe\nHdfsAtomicWritePipe = hdfs_format.HdfsAtomicWritePipe\nHdfsAtomicWriteDirPipe = hdfs_format.HdfsAtomicWriteDirPipe\nPlainFormat = hdfs_format.PlainFormat\nPlainDirFormat = hdfs_format.PlainDirFormat\nPlain = hdfs_format.Plain\nPlainDir = hdfs_format.PlainDir\nCompatibleHdfsFormat = hdfs_format.CompatibleHdfsFormat\n\n\n# target.py\nHdfsTarget = hdfs_target.HdfsTarget\nHdfsFlagTarget = hdfs_target.HdfsFlagTarget\n"
  },
  {
    "path": "luigi/contrib/hdfs/abstract_client.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nModule containing abstract class about hdfs clients.\n\"\"\"\n\nimport abc\n\nimport luigi.target\n\n\nclass HdfsFileSystem(luigi.target.FileSystem, metaclass=abc.ABCMeta):\n    \"\"\"\n    This client uses Apache 2.x syntax for file system commands, which also matched CDH4.\n    \"\"\"\n\n    def rename(self, path, dest):\n        \"\"\"\n        Rename or move a file.\n\n        In hdfs land, \"mv\" is often called rename. So we add an alias for\n        ``move()`` called ``rename()``. 
This is also to keep backward\n        compatibility since ``move()`` became standardized in luigi's\n        filesystem interface.\n        \"\"\"\n        return self.move(path, dest)\n\n    def rename_dont_move(self, path, dest):\n        \"\"\"\n        Override this method with an implementation that uses rename2,\n        which is a rename operation that never moves.\n\n        rename2 -\n        https://github.com/apache/hadoop/blob/ae91b13/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java\n        (lines 483-523)\n        \"\"\"\n        # We only override this method to be able to provide a more specific\n        # docstring.\n        return super(HdfsFileSystem, self).rename_dont_move(path, dest)\n\n    @abc.abstractmethod\n    def remove(self, path, recursive=True, skip_trash=False):\n        pass\n\n    @abc.abstractmethod\n    def chmod(self, path, permissions, recursive=False):\n        pass\n\n    @abc.abstractmethod\n    def chown(self, path, owner, group, recursive=False):\n        pass\n\n    @abc.abstractmethod\n    def count(self, path):\n        \"\"\"\n        Count contents in a directory\n        \"\"\"\n        pass\n\n    @abc.abstractmethod\n    def copy(self, path, destination):\n        pass\n\n    @abc.abstractmethod\n    def put(self, local_path, destination):\n        pass\n\n    @abc.abstractmethod\n    def get(self, path, local_destination):\n        pass\n\n    @abc.abstractmethod\n    def mkdir(self, path, parents=True, raise_if_exists=False):\n        pass\n\n    @abc.abstractmethod\n    def listdir(self, path, ignore_directories=False, ignore_files=False, include_size=False, include_type=False, include_time=False, recursive=False):\n        pass\n\n    @abc.abstractmethod\n    def touchz(self, path):\n        pass\n"
  },
  {
    "path": "luigi/contrib/hdfs/clients.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nThe implementations of the hdfs clients.\n\"\"\"\n\nimport logging\nimport threading\n\nfrom luigi.contrib.hdfs import config as hdfs_config\nfrom luigi.contrib.hdfs import hadoopcli_clients as hdfs_hadoopcli_clients\nfrom luigi.contrib.hdfs import webhdfs_client as hdfs_webhdfs_client\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n_AUTOCONFIG_CLIENT = threading.local()\n\n\ndef get_autoconfig_client(client_cache=_AUTOCONFIG_CLIENT):\n    \"\"\"\n    Creates the client as specified in the `luigi.cfg` configuration.\n    \"\"\"\n    try:\n        return client_cache.client\n    except AttributeError:\n        configured_client = hdfs_config.get_configured_hdfs_client()\n        if configured_client == \"webhdfs\":\n            client_cache.client = hdfs_webhdfs_client.WebHdfsClient()\n        elif configured_client == \"hadoopcli\":\n            client_cache.client = hdfs_hadoopcli_clients.create_hadoopcli_client()\n        else:\n            raise Exception(\"Unknown hdfs client \" + configured_client)\n        return client_cache.client\n\n\ndef _with_ac(method_name):\n    def result(*args, **kwargs):\n        return getattr(get_autoconfig_client(), method_name)(*args, **kwargs)\n\n    return result\n\n\nexists = _with_ac(\"exists\")\nrename = _with_ac(\"rename\")\nremove = _with_ac(\"remove\")\nmkdir = 
_with_ac(\"mkdir\")\nlistdir = _with_ac(\"listdir\")\n"
  },
  {
    "path": "luigi/contrib/hdfs/config.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nYou can configure what client by setting the \"client\" config under the \"hdfs\" section in the configuration, or using the ``--hdfs-client`` command line option.\n\"hadoopcli\" is the slowest, but should work out of the box.\n\"\"\"\n\nimport getpass\nimport os\nimport random\nfrom urllib.parse import urlparse, urlunparse\n\nimport luigi\nimport luigi.configuration\n\n\nclass hdfs(luigi.Config):\n    client_version = luigi.IntParameter(default=None)\n    namenode_host = luigi.OptionalParameter(default=None)\n    namenode_port = luigi.IntParameter(default=None)\n    client = luigi.Parameter(default=\"hadoopcli\")\n    tmp_dir = luigi.OptionalParameter(\n        default=None,\n        config_path=dict(section=\"core\", name=\"hdfs-tmp-dir\"),\n    )\n\n\nclass hadoopcli(luigi.Config):\n    command = luigi.Parameter(\n        default=\"hadoop\",\n        config_path=dict(section=\"hadoop\", name=\"command\"),\n        description='The hadoop command, will run split() on it, so you can pass something like \"hadoop --param\"',\n    )\n    version = luigi.Parameter(default=\"cdh4\", config_path=dict(section=\"hadoop\", name=\"version\"), description=\"Can also be cdh3 or apache1\")\n\n\ndef load_hadoop_cmd():\n    return hadoopcli().command.split()\n\n\ndef get_configured_hadoop_version():\n    \"\"\"\n    CDH4 (hadoop 2+) has a 
slightly different syntax for interacting with hdfs\n    via the command line.\n\n    The default version is CDH4, but one can override\n    this setting with \"cdh3\" or \"apache1\" in the hadoop section of the config\n    in order to use the old syntax.\n    \"\"\"\n    return hadoopcli().version.lower()\n\n\ndef get_configured_hdfs_client():\n    \"\"\"\n    This is a helper that fetches the configuration value for 'client' in\n    the [hdfs] section. It will return the client that retains backwards\n    compatibility when 'client' isn't configured.\n    \"\"\"\n    return hdfs().client\n\n\ndef tmppath(path=None, include_unix_username=True):\n    \"\"\"\n    @param path: target path for which it is needed to generate temporary location\n    @type path: str\n    @type include_unix_username: bool\n    @rtype: str\n\n    Note that include_unix_username might work on windows too.\n    \"\"\"\n    addon = \"luigitemp-%09d\" % random.randrange(0, 10_000_000_000)\n    temp_dir = \"/tmp\"  # default tmp dir if none is specified in config\n\n    # 1. Figure out to which temporary directory to place\n    configured_hdfs_tmp_dir = hdfs().tmp_dir\n    if configured_hdfs_tmp_dir is not None:\n        # config is superior\n        base_dir = configured_hdfs_tmp_dir\n    elif path is not None:\n        # need to copy correct schema and network location\n        parsed = urlparse(path)\n        base_dir = urlunparse((parsed.scheme, parsed.netloc, temp_dir, \"\", \"\", \"\"))\n    else:\n        # just system temporary directory\n        base_dir = temp_dir\n\n    # 2. 
Figure out what to place\n    if path is not None:\n        if path.startswith(temp_dir + \"/\"):\n            # Not 100%, but some protection from directories like /tmp/tmp/file\n            subdir = path[len(temp_dir) :]\n        else:\n            # Protection from /tmp/hdfs:/dir/file\n            parsed = urlparse(path)\n            subdir = parsed.path\n        subdir = subdir.lstrip(\"/\") + \"-\"\n    else:\n        # just return any random temporary location\n        subdir = \"\"\n\n    if include_unix_username:\n        subdir = os.path.join(getpass.getuser(), subdir)\n\n    return os.path.join(base_dir, subdir + addon)\n"
  },
  {
    "path": "luigi/contrib/hdfs/error.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nThe error raised when an hdfs command line call fails.\n\"\"\"\n\n\nclass HDFSCliError(Exception):\n    def __init__(self, command, returncode, stdout, stderr):\n        self.returncode = returncode\n        self.stdout = stdout\n        self.stderr = stderr\n        msg = (\"Command %r failed [exit code %d]\\n---stdout---\\n%s\\n---stderr---\\n%s------------\") % (command, returncode, stdout, stderr)\n        super(HDFSCliError, self).__init__(msg)\n"
  },
  {
    "path": "luigi/contrib/hdfs/format.py",
    "content": "import logging\nimport os\n\nimport luigi.format\nfrom luigi.contrib.hdfs import config as hdfs_config\nfrom luigi.contrib.hdfs.clients import exists, listdir, mkdir, remove, rename\nfrom luigi.contrib.hdfs.config import load_hadoop_cmd\nfrom luigi.contrib.hdfs.error import HDFSCliError\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass HdfsAtomicWriteError(IOError):\n    pass\n\n\nclass HdfsReadPipe(luigi.format.InputPipeProcessWrapper):\n    def __init__(self, path):\n        super(HdfsReadPipe, self).__init__(load_hadoop_cmd() + [\"fs\", \"-cat\", path])\n\n\nclass HdfsAtomicWritePipe(luigi.format.OutputPipeProcessWrapper):\n    \"\"\"\n    File like object for writing to HDFS\n\n    The referenced file is first written to a temporary location and then\n    renamed to final location on close(). If close() isn't called\n    the temporary file will be cleaned up when this object is\n    garbage collected\n\n    TODO: if this is buggy, change it so it first writes to a\n    local temporary file and then uploads it on completion\n    \"\"\"\n\n    def __init__(self, path):\n        self.path = path\n        self.tmppath = hdfs_config.tmppath(self.path)\n        parent_dir = os.path.dirname(self.tmppath)\n        mkdir(parent_dir, parents=True, raise_if_exists=False)\n        super(HdfsAtomicWritePipe, self).__init__(load_hadoop_cmd() + [\"fs\", \"-put\", \"-\", self.tmppath])\n\n    def abort(self):\n        logger.info(\"Aborting %s('%s'). 
Removing temporary file '%s'\", self.__class__.__name__, self.path, self.tmppath)\n        super(HdfsAtomicWritePipe, self).abort()\n        remove(self.tmppath, skip_trash=True)\n\n    def close(self):\n        super(HdfsAtomicWritePipe, self).close()\n        try:\n            if exists(self.path):\n                remove(self.path)\n        except Exception as ex:\n            if isinstance(ex, HDFSCliError) or ex.args[0].contains(\"FileNotFoundException\"):\n                pass\n            else:\n                raise ex\n        if not all(result[\"result\"] for result in rename(self.tmppath, self.path) or []):\n            raise HdfsAtomicWriteError(\"Atomic write to {} failed\".format(self.path))\n\n\nclass HdfsAtomicWriteDirPipe(luigi.format.OutputPipeProcessWrapper):\n    \"\"\"\n    Writes a data<data_extension> file to a directory at <path>.\n    \"\"\"\n\n    def __init__(self, path, data_extension=\"\"):\n        self.path = path\n        self.tmppath = hdfs_config.tmppath(self.path)\n        self.datapath = self.tmppath + (\"/data%s\" % data_extension)\n        super(HdfsAtomicWriteDirPipe, self).__init__(load_hadoop_cmd() + [\"fs\", \"-put\", \"-\", self.datapath])\n\n    def abort(self):\n        logger.info(\"Aborting %s('%s'). 
Removing temporary dir '%s'\", self.__class__.__name__, self.path, self.tmppath)\n        super(HdfsAtomicWriteDirPipe, self).abort()\n        remove(self.tmppath, skip_trash=True)\n\n    def close(self):\n        super(HdfsAtomicWriteDirPipe, self).close()\n        try:\n            if exists(self.path):\n                remove(self.path)\n        except Exception as ex:\n            if isinstance(ex, HDFSCliError) or ex.args[0].contains(\"FileNotFoundException\"):\n                pass\n            else:\n                raise ex\n\n        # it's unlikely to fail in this way but better safe than sorry\n        if not all(result[\"result\"] for result in rename(self.tmppath, self.path) or []):\n            raise HdfsAtomicWriteError(\"Atomic write to {} failed\".format(self.path))\n\n        if os.path.basename(self.tmppath) in map(os.path.basename, listdir(self.path)):\n            remove(self.path)\n            raise HdfsAtomicWriteError(\"Atomic write to {} failed\".format(self.path))\n\n\nclass PlainFormat(luigi.format.Format):\n    input = \"bytes\"\n    output = \"hdfs\"\n\n    def hdfs_writer(self, path):\n        return self.pipe_writer(path)\n\n    def hdfs_reader(self, path):\n        return self.pipe_reader(path)\n\n    def pipe_reader(self, path):\n        return HdfsReadPipe(path)\n\n    def pipe_writer(self, output_pipe):\n        return HdfsAtomicWritePipe(output_pipe)\n\n\nclass PlainDirFormat(luigi.format.Format):\n    input = \"bytes\"\n    output = \"hdfs\"\n\n    def hdfs_writer(self, path):\n        return self.pipe_writer(path)\n\n    def hdfs_reader(self, path):\n        return self.pipe_reader(path)\n\n    def pipe_reader(self, path):\n        # exclude underscore-prefixedfiles/folders (created by MapReduce)\n        return HdfsReadPipe(\"%s/[^_]*\" % path)\n\n    def pipe_writer(self, path):\n        return HdfsAtomicWriteDirPipe(path)\n\n\nPlain = PlainFormat()\nPlainDir = PlainDirFormat()\n\n\nclass 
CompatibleHdfsFormat(luigi.format.Format):\n    output = \"hdfs\"\n\n    def __init__(self, writer, reader, input=None):\n        if input is not None:\n            self.input = input\n\n        self.reader = reader\n        self.writer = writer\n\n    def pipe_writer(self, output):\n        return self.writer(output)\n\n    def pipe_reader(self, input):\n        return self.reader(input)\n\n    def hdfs_writer(self, output):\n        return self.writer(output)\n\n    def hdfs_reader(self, input):\n        return self.reader(input)\n\n    # __getstate__/__setstate__ needed for pickling, because self.reader and\n    # self.writer may be unpickleable instance methods of another format class.\n    # This was mainly to support pickling of standard HdfsTarget instances.\n\n    def __getstate__(self):\n        d = self.__dict__.copy()\n        for attr in (\"reader\", \"writer\"):\n            method = getattr(self, attr)\n            try:\n                # if instance method, pickle instance and method name\n                d[attr] = method.__self__, method.__func__.__name__\n            except AttributeError:\n                pass  # not an instance method\n        return d\n\n    def __setstate__(self, d):\n        self.__dict__ = d\n        for attr in (\"reader\", \"writer\"):\n            try:\n                method_self, method_name = d[attr]\n            except ValueError:\n                continue\n            method = getattr(method_self, method_name)\n            setattr(self, attr, method)\n"
  },
  {
    "path": "luigi/contrib/hdfs/hadoopcli_clients.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nThe implementations of the hdfs clients.\n\"\"\"\n\nimport datetime\nimport logging\nimport os\nimport re\nimport subprocess\nimport warnings\n\nfrom luigi.contrib.hdfs import abstract_client as hdfs_abstract_client\nfrom luigi.contrib.hdfs import config as hdfs_config\nfrom luigi.contrib.hdfs import error as hdfs_error\nfrom luigi.contrib.hdfs.config import load_hadoop_cmd\nfrom luigi.target import FileAlreadyExists\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\ndef create_hadoopcli_client():\n    \"\"\"\n    Given that we want one of the hadoop cli clients,\n    this one will return the right one.\n    \"\"\"\n    version = hdfs_config.get_configured_hadoop_version()\n    if version == \"cdh4\":\n        return HdfsClient()\n    elif version == \"cdh3\":\n        return HdfsClientCdh3()\n    elif version == \"apache1\":\n        return HdfsClientApache1()\n    else:\n        raise ValueError(\"Error: Unknown version specified in Hadoop versionconfiguration parameter\")\n\n\nclass HdfsClient(hdfs_abstract_client.HdfsFileSystem):\n    \"\"\"\n    This client uses Apache 2.x syntax for file system commands, which also matched CDH4.\n    \"\"\"\n\n    recursive_listdir_cmd = [\"-ls\", \"-R\"]\n\n    @staticmethod\n    def call_check(command):\n        p = subprocess.Popen(command, stdout=subprocess.PIPE, 
stderr=subprocess.PIPE, close_fds=True, universal_newlines=True)\n        stdout, stderr = p.communicate()\n        if p.returncode != 0:\n            raise hdfs_error.HDFSCliError(command, p.returncode, stdout, stderr)\n        return stdout\n\n    def exists(self, path):\n        \"\"\"\n        Use ``hadoop fs -stat`` to check file existence.\n        \"\"\"\n\n        cmd = load_hadoop_cmd() + [\"fs\", \"-stat\", path]\n        logger.debug(\"Running file existence check: %s\", subprocess.list2cmdline(cmd))\n        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True, universal_newlines=True)\n        stdout, stderr = p.communicate()\n        if p.returncode == 0:\n            return True\n        else:\n            not_found_pattern = \"^.*No such file or directory$\"\n            not_found_re = re.compile(not_found_pattern)\n            for line in stderr.split(\"\\n\"):\n                if not_found_re.match(line):\n                    return False\n            raise hdfs_error.HDFSCliError(cmd, p.returncode, stdout, stderr)\n\n    def move(self, path, dest):\n        parent_dir = os.path.dirname(dest)\n        if parent_dir != \"\" and not self.exists(parent_dir):\n            self.mkdir(parent_dir)\n        if not isinstance(path, (list, tuple)):\n            path = [path]\n        else:\n            warnings.warn(\"Renaming multiple files at once is not atomic.\", stacklevel=2)\n        self.call_check(load_hadoop_cmd() + [\"fs\", \"-mv\"] + path + [dest])\n\n    def remove(self, path, recursive=True, skip_trash=False):\n        if recursive:\n            cmd = load_hadoop_cmd() + [\"fs\", \"-rm\", \"-r\"]\n        else:\n            cmd = load_hadoop_cmd() + [\"fs\", \"-rm\"]\n\n        if skip_trash:\n            cmd = cmd + [\"-skipTrash\"]\n\n        cmd = cmd + [path]\n        self.call_check(cmd)\n\n    def chmod(self, path, permissions, recursive=False):\n        if recursive:\n            cmd = 
load_hadoop_cmd() + [\"fs\", \"-chmod\", \"-R\", permissions, path]\n        else:\n            cmd = load_hadoop_cmd() + [\"fs\", \"-chmod\", permissions, path]\n        self.call_check(cmd)\n\n    def chown(self, path, owner, group, recursive=False):\n        if owner is None:\n            owner = \"\"\n        if group is None:\n            group = \"\"\n        ownership = \"%s:%s\" % (owner, group)\n        if recursive:\n            cmd = load_hadoop_cmd() + [\"fs\", \"-chown\", \"-R\", ownership, path]\n        else:\n            cmd = load_hadoop_cmd() + [\"fs\", \"-chown\", ownership, path]\n        self.call_check(cmd)\n\n    def count(self, path):\n        cmd = load_hadoop_cmd() + [\"fs\", \"-count\", path]\n        stdout = self.call_check(cmd)\n        lines = stdout.split(\"\\n\")\n        for line in stdout.split(\"\\n\"):\n            if line.startswith(\"OpenJDK 64-Bit Server VM warning\") or line.startswith(\"It's highly recommended\") or not line:\n                lines.pop(lines.index(line))\n            else:\n                (dir_count, file_count, content_size, ppath) = stdout.split()\n        results = {\"content_size\": content_size, \"dir_count\": dir_count, \"file_count\": file_count}\n        return results\n\n    def copy(self, path, destination):\n        self.call_check(load_hadoop_cmd() + [\"fs\", \"-cp\", path, destination])\n\n    def put(self, local_path, destination):\n        self.call_check(load_hadoop_cmd() + [\"fs\", \"-put\", local_path, destination])\n\n    def get(self, path, local_destination):\n        self.call_check(load_hadoop_cmd() + [\"fs\", \"-get\", path, local_destination])\n\n    def getmerge(self, path, local_destination, new_line=False):\n        if new_line:\n            cmd = load_hadoop_cmd() + [\"fs\", \"-getmerge\", \"-nl\", path, local_destination]\n        else:\n            cmd = load_hadoop_cmd() + [\"fs\", \"-getmerge\", path, local_destination]\n        self.call_check(cmd)\n\n    def mkdir(self, 
path, parents=True, raise_if_exists=False):\n        if parents and raise_if_exists:\n            raise NotImplementedError(\"HdfsClient.mkdir can't raise with -p\")\n        try:\n            cmd = load_hadoop_cmd() + [\"fs\", \"-mkdir\"] + ([\"-p\"] if parents else []) + [path]\n            self.call_check(cmd)\n        except hdfs_error.HDFSCliError as ex:\n            if \"File exists\" in ex.stderr:\n                if raise_if_exists:\n                    raise FileAlreadyExists(ex.stderr)\n            else:\n                raise\n\n    def listdir(self, path, ignore_directories=False, ignore_files=False, include_size=False, include_type=False, include_time=False, recursive=False):\n        if not path:\n            path = \".\"  # default to current/home catalog\n\n        if recursive:\n            cmd = load_hadoop_cmd() + [\"fs\"] + self.recursive_listdir_cmd + [path]\n        else:\n            cmd = load_hadoop_cmd() + [\"fs\", \"-ls\", path]\n        lines = self.call_check(cmd).split(\"\\n\")\n\n        for line in lines:\n            if not line:\n                continue\n            elif line.startswith(\"OpenJDK 64-Bit Server VM warning\") or line.startswith(\"It's highly recommended\") or line.startswith(\"Found\"):\n                continue  # \"hadoop fs -ls\" outputs \"Found %d items\" as its first line\n            elif ignore_directories and line[0] == \"d\":\n                continue\n            elif ignore_files and line[0] == \"-\":\n                continue\n            data = line.split(\" \")\n\n            file = data[-1]\n            size = int(data[-4])\n            line_type = line[0]\n            extra_data = ()\n\n            if include_size:\n                extra_data += (size,)\n            if include_type:\n                extra_data += (line_type,)\n            if include_time:\n                time_str = \"%sT%s\" % (data[-3], data[-2])\n                modification_time = datetime.datetime.strptime(time_str, 
\"%Y-%m-%dT%H:%M\")\n                extra_data += (modification_time,)\n\n            if len(extra_data) > 0:\n                yield (file,) + extra_data\n            else:\n                yield file\n\n    def touchz(self, path):\n        self.call_check(load_hadoop_cmd() + [\"fs\", \"-touchz\", path])\n\n\nclass HdfsClientCdh3(HdfsClient):\n    \"\"\"\n    This client uses CDH3 syntax for file system commands.\n    \"\"\"\n\n    def mkdir(self, path, parents=True, raise_if_exists=False):\n        \"\"\"\n        No explicit -p switch, this version of Hadoop always creates parent directories.\n        \"\"\"\n        try:\n            self.call_check(load_hadoop_cmd() + [\"fs\", \"-mkdir\", path])\n        except hdfs_error.HDFSCliError as ex:\n            if \"File exists\" in ex.stderr:\n                if raise_if_exists:\n                    raise FileAlreadyExists(ex.stderr)\n            else:\n                raise\n\n    def remove(self, path, recursive=True, skip_trash=False):\n        if recursive:\n            cmd = load_hadoop_cmd() + [\"fs\", \"-rmr\"]\n        else:\n            cmd = load_hadoop_cmd() + [\"fs\", \"-rm\"]\n\n        if skip_trash:\n            cmd = cmd + [\"-skipTrash\"]\n\n        cmd = cmd + [path]\n        self.call_check(cmd)\n\n\nclass HdfsClientApache1(HdfsClientCdh3):\n    \"\"\"\n    This client uses Apache 1.x syntax for file system commands,\n    which are similar to CDH3 except for the file existence check.\n    \"\"\"\n\n    recursive_listdir_cmd = [\"-lsr\"]\n\n    def exists(self, path):\n        cmd = load_hadoop_cmd() + [\"fs\", \"-test\", \"-e\", path]\n        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)\n        stdout, stderr = p.communicate()\n        if p.returncode == 0:\n            return True\n        elif p.returncode == 1:\n            return False\n        else:\n            raise hdfs_error.HDFSCliError(cmd, p.returncode, stdout, stderr)\n"
  },
  {
    "path": "luigi/contrib/hdfs/target.py",
    "content": "#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nProvides access to HDFS using the :py:class:`HdfsTarget`, a subclass of :py:class:`~luigi.target.Target`.\n\"\"\"\n\nimport random\nimport warnings\nfrom urllib import parse as urlparse\n\nimport luigi\nfrom luigi.contrib.hdfs import clients as hdfs_clients\nfrom luigi.contrib.hdfs import format as hdfs_format\nfrom luigi.contrib.hdfs.config import tmppath\nfrom luigi.target import FileSystemTarget\n\n\nclass HdfsTarget(FileSystemTarget):\n    def __init__(self, path=None, format=None, is_tmp=False, fs=None):\n        if path is None:\n            assert is_tmp\n            path = tmppath()\n        super(HdfsTarget, self).__init__(path)\n\n        if format is None:\n            format = luigi.format.get_default_format() >> hdfs_format.Plain\n\n        old_format = (hasattr(format, \"hdfs_writer\") or hasattr(format, \"hdfs_reader\")) and not hasattr(format, \"output\")\n\n        if not old_format and getattr(format, \"output\", \"\") != \"hdfs\":\n            format = format >> hdfs_format.Plain\n\n        if old_format:\n            warnings.warn(\n                \"hdfs_writer and hdfs_reader method for format is deprecated,specify the property output of your format as 'hdfs' instead\",\n                DeprecationWarning,\n                stacklevel=2,\n            )\n\n            if hasattr(format, \"hdfs_writer\"):\n                
format_writer = format.hdfs_writer\n            else:\n                w_format = format >> hdfs_format.Plain\n                format_writer = w_format.pipe_writer\n\n            if hasattr(format, \"hdfs_reader\"):\n                format_reader = format.hdfs_reader\n            else:\n                r_format = format >> hdfs_format.Plain\n                format_reader = r_format.pipe_reader\n\n            format = hdfs_format.CompatibleHdfsFormat(\n                format_writer,\n                format_reader,\n            )\n\n        else:\n            format = hdfs_format.CompatibleHdfsFormat(\n                format.pipe_writer,\n                format.pipe_reader,\n                getattr(format, \"input\", None),\n            )\n\n        self.format = format\n\n        self.is_tmp = is_tmp\n        (scheme, netloc, path, query, fragment) = urlparse.urlsplit(path)\n        if \":\" in path:\n            raise ValueError(\"colon is not allowed in hdfs filenames\")\n        self._fs = fs or hdfs_clients.get_autoconfig_client()\n\n    def __del__(self):\n        # TODO: not sure is_tmp belongs in Targets construction arguments\n        if self.is_tmp and self.exists():\n            self.remove(skip_trash=True)\n\n    @property\n    def fs(self):\n        return self._fs\n\n    def glob_exists(self, expected_files):\n        ls = list(self.fs.listdir(self.path))\n        if len(ls) == expected_files:\n            return True\n        return False\n\n    def open(self, mode=\"r\"):\n        if mode not in (\"r\", \"w\"):\n            raise ValueError(\"Unsupported open mode '%s'\" % mode)\n\n        if mode == \"r\":\n            return self.format.pipe_reader(self.path)\n        else:\n            return self.format.pipe_writer(self.path)\n\n    def remove(self, skip_trash=False):\n        self.fs.remove(self.path, skip_trash=skip_trash)\n\n    def rename(self, path, raise_if_exists=False):\n        \"\"\"\n        Does not change self.path.\n\n        Unlike 
``move_dir()``, ``rename()`` might cause nested directories.\n        See spotify/luigi#522\n        \"\"\"\n        if isinstance(path, HdfsTarget):\n            path = path.path\n        if raise_if_exists and self.fs.exists(path):\n            raise RuntimeError(\"Destination exists: %s\" % path)\n        self.fs.rename(self.path, path)\n\n    def move(self, path, raise_if_exists=False):\n        \"\"\"\n        Alias for ``rename()``\n        \"\"\"\n        self.rename(path, raise_if_exists=raise_if_exists)\n\n    def move_dir(self, path):\n        \"\"\"\n        Move using :py:class:`~luigi.contrib.hdfs.abstract_client.HdfsFileSystem.rename_dont_move`\n\n        New since after luigi v2.1: Does not change self.path\n\n        One could argue that the implementation should use the\n        mkdir+raise_if_exists approach, but we at Spotify have had more trouble\n        with that over just using plain mv.  See spotify/luigi#557\n        \"\"\"\n        self.fs.rename_dont_move(self.path, path)\n\n    def copy(self, dst_dir):\n        \"\"\"\n        Copy to destination directory.\n        \"\"\"\n        self.fs.copy(self.path, dst_dir)\n\n    def is_writable(self):\n        \"\"\"\n        Currently only works with hadoopcli\n        \"\"\"\n        if \"/\" in self.path:\n            # example path: /log/ap/2013-01-17/00\n            parts = self.path.split(\"/\")\n            # start with the full path and then up the tree until we can check\n            length = len(parts)\n            for part in range(length):\n                path = \"/\".join(parts[0 : length - part]) + \"/\"\n                if self.fs.exists(path):\n                    # if the path exists and we can write there, great!\n                    if self._is_writable(path):\n                        return True\n                    # if it exists and we can't =( sad panda\n                    else:\n                        return False\n            # We went through all parts of the path 
and we still couldn't find\n            # one that exists.\n            return False\n\n    def _is_writable(self, path):\n        test_path = path + \".test_write_access-%09d\" % random.randrange(10_000_000_000)\n        try:\n            self.fs.touchz(test_path)\n            self.fs.remove(test_path, recursive=False)\n            return True\n        except hdfs_clients.HDFSCliError:\n            return False\n\n\nclass HdfsFlagTarget(HdfsTarget):\n    \"\"\"\n    Defines a target directory with a flag-file (defaults to `_SUCCESS`) used\n    to signify job success.\n\n    This checks for two things:\n\n    * the path exists (just like the HdfsTarget)\n    * the _SUCCESS file exists within the directory.\n\n    Because Hadoop outputs into a directory and not a single file,\n    the path is assumed to be a directory.\n    \"\"\"\n\n    def __init__(self, path, format=None, client=None, flag=\"_SUCCESS\"):\n        \"\"\"\n        Initializes a HdfsFlagTarget.\n\n        :param path: the directory where the files are stored.\n        :type path: str\n        :param client:\n        :type client:\n        :param flag:\n        :type flag: str\n        \"\"\"\n        if path[-1] != \"/\":\n            raise ValueError(\"HdfsFlagTarget requires the path to be to a directory.  It must end with a slash ( / ).\")\n        super(HdfsFlagTarget, self).__init__(path, format, client)\n        self.flag = flag\n\n    def exists(self):\n        hadoopSemaphore = self.path + self.flag\n        return self.fs.exists(hadoopSemaphore)\n"
  },
  {
    "path": "luigi/contrib/hdfs/webhdfs_client.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015 VNG Corporation\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nA luigi file system client that wraps around the hdfs-library (a webhdfs\nclient)\n\nNote. This wrapper client is not feature complete yet. As with most software\nthe authors only implement the features they need.  If you need to wrap more of\nthe file system operations, please do and contribute back.\n\"\"\"\n\nimport logging\nimport os\nimport warnings\n\nimport luigi.contrib.target\nfrom luigi.contrib.hdfs import abstract_client as hdfs_abstract_client\nfrom luigi.contrib.hdfs import config as hdfs_config\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass webhdfs(luigi.Config):\n    port = luigi.IntParameter(default=50070, description=\"Port for webhdfs\")\n    user = luigi.Parameter(default=\"\", description=\"Defaults to $USER envvar\", config_path=dict(section=\"hdfs\", name=\"user\"))\n    client_type = luigi.ChoiceParameter(var_type=str, choices=[\"insecure\", \"kerberos\"], default=\"insecure\", description=\"Type of hdfs client to use.\")\n\n\nclass WebHdfsClient(hdfs_abstract_client.HdfsFileSystem):\n    \"\"\"\n    A webhdfs that tries to conform to luigi's interface for file existence.\n\n    The library is using `this api\n    <https://hdfscli.readthedocs.io/en/latest/api.html>`__.\n    \"\"\"\n\n    def __init__(self, host=None, port=None, user=None, client_type=None):\n        self.host = host or 
hdfs_config.hdfs().namenode_host\n        self.port = port or webhdfs().port\n        self.user = user or webhdfs().user or os.environ[\"USER\"]\n        self.client_type = client_type or webhdfs().client_type\n\n    @property\n    def url(self):\n        # the hdfs package allows it to specify multiple namenodes by passing a string containing\n        # multiple namenodes separated by ';'\n        hosts = self.host.split(\";\")\n        urls = [\"http://\" + host + \":\" + str(self.port) for host in hosts]\n        return \";\".join(urls)\n\n    @property\n    def client(self):\n        # A naive benchmark showed that 1000 existence checks took 2.5 secs\n        # when not recreating the client, and 4.0 secs when recreating it. So\n        # not urgent to memoize it. Note that it *might* be issues with process\n        # forking and whatnot (as the one in the snakebite client) if we\n        # memoize it too trivially.\n        if self.client_type == \"kerberos\":\n            from hdfs.ext.kerberos import KerberosClient\n\n            return KerberosClient(url=self.url)\n        else:\n            import hdfs\n\n            return hdfs.InsecureClient(url=self.url, user=self.user)\n\n    def walk(self, path, depth=1):\n        return self.client.walk(path, depth=depth)\n\n    def exists(self, path):\n        \"\"\"\n        Returns true if the path exists and false otherwise.\n        \"\"\"\n        import hdfs\n\n        try:\n            self.client.status(path)\n            return True\n        except hdfs.util.HdfsError as e:\n            if str(e).startswith(\"File does not exist: \"):\n                return False\n            else:\n                raise e\n\n    def upload(self, hdfs_path, local_path, overwrite=False):\n        return self.client.upload(hdfs_path, local_path, overwrite=overwrite)\n\n    def download(self, hdfs_path, local_path, overwrite=False, n_threads=-1):\n        return self.client.download(hdfs_path, local_path, overwrite=overwrite, 
n_threads=n_threads)\n\n    def remove(self, hdfs_path, recursive=True, skip_trash=False):\n        assert skip_trash  # Yes, you need to explicitly say skip_trash=True\n        return self.client.delete(hdfs_path, recursive=recursive)\n\n    def read(self, hdfs_path, offset=0, length=None, buffer_size=None, chunk_size=1024, buffer_char=None):\n        return self.client.read(hdfs_path, offset=offset, length=length, buffer_size=buffer_size, chunk_size=chunk_size, buffer_char=buffer_char)\n\n    def move(self, path, dest):\n        parts = dest.rstrip(\"/\").split(\"/\")\n        if len(parts) > 1:\n            dir_path = \"/\".join(parts[0:-1])\n            if not self.exists(dir_path):\n                self.mkdir(dir_path, parents=True)\n        self.client.rename(path, dest)\n\n    def mkdir(self, path, parents=True, mode=0o755, raise_if_exists=False):\n        \"\"\"\n        Has no return value (just like WebHDFS)\n        \"\"\"\n        if not parents or raise_if_exists:\n            warnings.warn(\"webhdfs mkdir: parents/raise_if_exists not implemented\")\n        permission = int(oct(mode)[2:])  # Convert from int(decimal) to int(octal)\n        self.client.makedirs(path, permission=permission)\n\n    def chmod(self, path, permissions, recursive=False):\n        \"\"\"\n        Raise a NotImplementedError exception.\n        \"\"\"\n        raise NotImplementedError(\"Webhdfs in luigi doesn't implement chmod\")\n\n    def chown(self, path, owner, group, recursive=False):\n        \"\"\"\n        Raise a NotImplementedError exception.\n        \"\"\"\n        raise NotImplementedError(\"Webhdfs in luigi doesn't implement chown\")\n\n    def count(self, path):\n        \"\"\"\n        Raise a NotImplementedError exception.\n        \"\"\"\n        raise NotImplementedError(\"Webhdfs in luigi doesn't implement count\")\n\n    def copy(self, path, destination):\n        \"\"\"\n        Raise a NotImplementedError exception.\n        \"\"\"\n        raise 
NotImplementedError(\"Webhdfs in luigi doesn't implement copy\")\n\n    def put(self, local_path, destination):\n        \"\"\"\n        Restricted version of upload\n        \"\"\"\n        self.upload(local_path, destination)\n\n    def get(self, path, local_destination):\n        \"\"\"\n        Restricted version of download\n        \"\"\"\n        self.download(path, local_destination)\n\n    def listdir(self, path, ignore_directories=False, ignore_files=False, include_size=False, include_type=False, include_time=False, recursive=False):\n        assert not recursive\n        return self.client.list(path, status=False)\n\n    def touchz(self, path):\n        \"\"\"\n        To touchz using the web hdfs \"write\" cmd.\n        \"\"\"\n        self.client.write(path, data=\"\", overwrite=False)\n"
  },
  {
    "path": "luigi/contrib/hive.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport abc\nimport collections\nimport logging\nimport operator\nimport os\nimport re\nimport subprocess\nimport tempfile\nimport warnings\n\nimport luigi\nimport luigi.contrib.hadoop\nfrom luigi.contrib.hdfs import get_autoconfig_client\nfrom luigi.target import FileAlreadyExists, FileSystemTarget\nfrom luigi.task import flatten\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass HiveCommandError(RuntimeError):\n    def __init__(self, message, out=None, err=None):\n        super(HiveCommandError, self).__init__(message, out, err)\n        self.message = message\n        self.out = out\n        self.err = err\n\n\ndef load_hive_cmd():\n    return luigi.configuration.get_config().get(\"hive\", \"command\", \"hive\").split(\" \")\n\n\ndef get_hive_syntax():\n    return luigi.configuration.get_config().get(\"hive\", \"release\", \"cdh4\")\n\n\ndef get_hive_warehouse_location():\n    return luigi.configuration.get_config().get(\"hive\", \"warehouse_location\", \"/user/hive/warehouse\")\n\n\ndef get_ignored_file_masks():\n    return luigi.configuration.get_config().get(\"hive\", \"ignored_file_masks\", None)\n\n\ndef run_hive(args, check_return_code=True):\n    \"\"\"\n    Runs the `hive` from the command line, passing in the given args, and\n    returning stdout.\n\n    With the apache release of Hive, some of the table existence 
checks\n    (which are done using DESCRIBE) do not exit with a return code of 0,\n    so we need an option to ignore the return code and just return stdout for parsing.\n    \"\"\"\n    cmd = load_hive_cmd() + args\n    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n    stdout, stderr = p.communicate()\n    if check_return_code and p.returncode != 0:\n        raise HiveCommandError(\"Hive command: {0} failed with error code: {1}\".format(\" \".join(cmd), p.returncode), stdout, stderr)\n    return stdout.decode(\"utf-8\")\n\n\ndef run_hive_cmd(hivecmd, check_return_code=True):\n    \"\"\"\n    Runs the given hive query and returns stdout.\n    \"\"\"\n    return run_hive([\"-e\", hivecmd], check_return_code)\n\n\ndef run_hive_script(script):\n    \"\"\"\n    Runs the contents of the given script in hive and returns stdout.\n    \"\"\"\n    if not os.path.isfile(script):\n        raise RuntimeError(\"Hive script: {0} does not exist.\".format(script))\n    return run_hive([\"-f\", script])\n\n\ndef _is_ordered_dict(dikt):\n    return isinstance(dikt, (collections.OrderedDict, dict))\n\n\ndef _validate_partition(partition):\n    \"\"\"\n    If partition is set and its size is more than one and not ordered,\n    then we're unable to restore its path in the warehouse\n    \"\"\"\n    if partition and len(partition) > 1 and not _is_ordered_dict(partition):\n        raise ValueError(\"Unable to restore table/partition location\")\n\n\nclass HiveClient(metaclass=abc.ABCMeta):\n    @abc.abstractmethod\n    def table_location(self, table, database=\"default\", partition=None):\n        \"\"\"\n        Returns location of db.table (or db.table.partition). 
partition is a dict of partition key to\n        value.\n        \"\"\"\n        pass\n\n    @abc.abstractmethod\n    def table_schema(self, table, database=\"default\"):\n        \"\"\"\n        Returns list of [(name, type)] for each column in database.table.\n        \"\"\"\n        pass\n\n    @abc.abstractmethod\n    def table_exists(self, table, database=\"default\", partition=None):\n        \"\"\"\n        Returns true if db.table (or db.table.partition) exists. partition is a dict of partition key to\n        value.\n        \"\"\"\n        pass\n\n    @abc.abstractmethod\n    def partition_spec(self, partition):\n        \"\"\"Turn a dict into a string partition specification\"\"\"\n        pass\n\n\nclass HiveCommandClient(HiveClient):\n    \"\"\"\n    Uses `hive` invocations to find information.\n    \"\"\"\n\n    def table_location(self, table, database=\"default\", partition=None):\n        cmd = \"use {0}; describe formatted {1}\".format(database, table)\n        if partition is not None:\n            cmd += \" PARTITION ({0})\".format(self.partition_spec(partition))\n\n        stdout = run_hive_cmd(cmd)\n\n        for line in stdout.split(\"\\n\"):\n            if \"Location:\" in line:\n                return line.split(\"\\t\")[1]\n\n    def table_exists(self, table, database=\"default\", partition=None):\n        if partition is None:\n            stdout = run_hive_cmd('use {0}; show tables like \"{1}\";'.format(database, table))\n\n            return stdout and table.lower() in stdout\n        else:\n            stdout = run_hive_cmd(\n                \"\"\"use %s; show partitions %s partition\n                                (%s)\"\"\"\n                % (database, table, self.partition_spec(partition))\n            )\n\n            if stdout:\n                return True\n            else:\n                return False\n\n    def table_schema(self, table, database=\"default\"):\n        describe = run_hive_cmd(\"use {0}; describe 
{1}\".format(database, table))\n        if not describe or \"does not exist\" in describe:\n            return None\n        return [tuple([x.strip() for x in line.strip().split(\"\\t\")]) for line in describe.strip().split(\"\\n\")]\n\n    def partition_spec(self, partition):\n        \"\"\"\n        Turns a dict into a Hive partition specification string.\n        \"\"\"\n        return \",\".join([\"`{0}`='{1}'\".format(k, v) for (k, v) in sorted(partition.items(), key=operator.itemgetter(0))])\n\n\nclass ApacheHiveCommandClient(HiveCommandClient):\n    \"\"\"\n    A subclass for the HiveCommandClient to (in some cases) ignore the return code from\n    the hive command so that we can just parse the output.\n    \"\"\"\n\n    def table_schema(self, table, database=\"default\"):\n        describe = run_hive_cmd(\"use {0}; describe {1}\".format(database, table), False)\n        if not describe or \"Table not found\" in describe:\n            return None\n        return [tuple([x.strip() for x in line.strip().split(\"\\t\")]) for line in describe.strip().split(\"\\n\")]\n\n\nclass MetastoreClient(HiveClient):\n    def table_location(self, table, database=\"default\", partition=None):\n        with HiveThriftContext() as client:\n            if partition is not None:\n                try:\n                    import hive_metastore.ttypes\n\n                    partition_str = self.partition_spec(partition)\n                    thrift_table = client.get_partition_by_name(database, table, partition_str)\n                except hive_metastore.ttypes.NoSuchObjectException:\n                    return \"\"\n            else:\n                thrift_table = client.get_table(database, table)\n            return thrift_table.sd.location\n\n    def table_exists(self, table, database=\"default\", partition=None):\n        with HiveThriftContext() as client:\n            if partition is None:\n                return table in client.get_all_tables(database)\n            
else:\n                return partition in self._existing_partitions(table, database, client)\n\n    def _existing_partitions(self, table, database, client):\n        def _parse_partition_string(partition_string):\n            partition_def = {}\n            for part in partition_string.split(\"/\"):\n                name, value = part.split(\"=\")\n                partition_def[name] = value\n            return partition_def\n\n        # -1 is max_parts, the # of partition names to return (-1 = unlimited)\n        partition_strings = client.get_partition_names(database, table, -1)\n        return [_parse_partition_string(existing_partition) for existing_partition in partition_strings]\n\n    def table_schema(self, table, database=\"default\"):\n        with HiveThriftContext() as client:\n            return [(field_schema.name, field_schema.type) for field_schema in client.get_schema(database, table)]\n\n    def partition_spec(self, partition):\n        return \"/\".join(\"%s=%s\" % (k, v) for (k, v) in sorted(partition.items(), key=operator.itemgetter(0)))\n\n\nclass HiveThriftContext:\n    \"\"\"\n    Context manager for hive metastore client.\n    \"\"\"\n\n    def __enter__(self):\n        try:\n            # Note that this will only work with a CDH release.\n            # This uses the thrift bindings generated by the ThriftHiveMetastore service in Beeswax.\n            # If using the Apache release of Hive this import will fail.\n            from hive_metastore import ThriftHiveMetastore\n            from thrift.protocol import TBinaryProtocol\n            from thrift.transport import TSocket, TTransport\n\n            config = luigi.configuration.get_config()\n            host = config.get(\"hive\", \"metastore_host\")\n            port = config.getint(\"hive\", \"metastore_port\")\n            transport = TSocket.TSocket(host, port)\n            transport = TTransport.TBufferedTransport(transport)\n            protocol = 
TBinaryProtocol.TBinaryProtocol(transport)\n            transport.open()\n            self.transport = transport\n            return ThriftHiveMetastore.Client(protocol)\n        except ImportError as e:\n            raise Exception(\"Could not import Hive thrift library:\" + str(e))\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        self.transport.close()\n\n\nclass WarehouseHiveClient(HiveClient):\n    \"\"\"\n    Client for managed tables that makes decision based on presence of directory in hdfs\n    \"\"\"\n\n    def __init__(self, hdfs_client=None, warehouse_location=None):\n        self.hdfs_client = hdfs_client or get_autoconfig_client()\n        self.warehouse_location = warehouse_location or get_hive_warehouse_location()\n\n    def table_schema(self, table, database=\"default\"):\n        return NotImplemented\n\n    def table_location(self, table, database=\"default\", partition=None):\n        return os.path.join(self.warehouse_location, database + \".db\", table, self.partition_spec(partition))\n\n    def table_exists(self, table, database=\"default\", partition=None):\n        \"\"\"\n        The table/partition is considered existing if corresponding path in hdfs exists\n        and contains file except those which match pattern set in  `ignored_file_masks`\n        \"\"\"\n        path = self.table_location(table, database, partition)\n        if self.hdfs_client.exists(path):\n            ignored_files = get_ignored_file_masks()\n            if ignored_files is None:\n                return True\n\n            filenames = self.hdfs_client.listdir(path)\n            pattern = re.compile(ignored_files)\n            for filename in filenames:\n                if not pattern.match(filename):\n                    return True\n\n        return False\n\n    def partition_spec(self, partition):\n        _validate_partition(partition)\n        return \"/\".join([\"{}={}\".format(k, v) for (k, v) in (partition or {}).items()])\n\n\ndef 
get_default_client():\n    syntax = get_hive_syntax()\n    if syntax == \"apache\":\n        return ApacheHiveCommandClient()\n    elif syntax == \"metastore\":\n        return MetastoreClient()\n    elif syntax == \"warehouse\":\n        return WarehouseHiveClient()\n    else:\n        return HiveCommandClient()\n\n\nclient = get_default_client()\n\n\nclass HiveQueryTask(luigi.contrib.hadoop.BaseHadoopJobTask):\n    \"\"\"\n    Task to run a hive query.\n    \"\"\"\n\n    # by default, we let hive figure these out.\n    n_reduce_tasks = None\n    bytes_per_reducer = None\n    reducers_max = None\n\n    @abc.abstractmethod\n    def query(self):\n        \"\"\"Text of query to run in hive\"\"\"\n        raise RuntimeError(\"Must implement query!\")\n\n    def hiverc(self):\n        \"\"\"\n        Location of an rc file to run before the query\n        if hiverc-location key is specified in luigi.cfg, will default to the value there\n        otherwise returns None.\n\n        Returning a list of rc files will load all of them in order.\n        \"\"\"\n        return luigi.configuration.get_config().get(\"hive\", \"hiverc-location\", default=None)\n\n    def hivevars(self):\n        \"\"\"\n        Returns a dict of key=value settings to be passed along\n        to the hive command line via --hivevar.\n        This option can be used as a separated namespace for script local variables.\n        See https://cwiki.apache.org/confluence/display/Hive/LanguageManual+VariableSubstitution\n        \"\"\"\n        return {}\n\n    def hiveconfs(self):\n        \"\"\"\n        Returns a dict of key=value settings to be passed along\n        to the hive command line via --hiveconf. 
By default, sets\n        mapred.job.name to task_id and if not None, sets:\n\n        * mapred.reduce.tasks (n_reduce_tasks)\n        * mapred.fairscheduler.pool (pool) or mapred.job.queue.name (pool)\n        * hive.exec.reducers.bytes.per.reducer (bytes_per_reducer)\n        * hive.exec.reducers.max (reducers_max)\n        \"\"\"\n        jcs = {}\n        jcs[\"mapred.job.name\"] = \"'\" + self.task_id + \"'\"\n        if self.n_reduce_tasks is not None:\n            jcs[\"mapred.reduce.tasks\"] = self.n_reduce_tasks\n        if self.pool is not None:\n            # Supporting two schedulers: fair (default) and capacity using the same option\n            scheduler_type = luigi.configuration.get_config().get(\"hadoop\", \"scheduler\", \"fair\")\n            if scheduler_type == \"fair\":\n                jcs[\"mapred.fairscheduler.pool\"] = self.pool\n            elif scheduler_type == \"capacity\":\n                jcs[\"mapred.job.queue.name\"] = self.pool\n        if self.bytes_per_reducer is not None:\n            jcs[\"hive.exec.reducers.bytes.per.reducer\"] = self.bytes_per_reducer\n        if self.reducers_max is not None:\n            jcs[\"hive.exec.reducers.max\"] = self.reducers_max\n        return jcs\n\n    def job_runner(self):\n        return HiveQueryRunner()\n\n\nclass HiveQueryRunner(luigi.contrib.hadoop.JobRunner):\n    \"\"\"\n    Runs a HiveQueryTask by shelling out to hive.\n    \"\"\"\n\n    def prepare_outputs(self, job):\n        \"\"\"\n        Called before job is started.\n\n        If output is a `FileSystemTarget`, create parent directories so the hive command won't fail\n        \"\"\"\n        outputs = flatten(job.output())\n        for o in outputs:\n            if isinstance(o, FileSystemTarget):\n                parent_dir = os.path.dirname(o.path)\n                if parent_dir and not o.fs.exists(parent_dir):\n                    logger.info(\"Creating parent directory %r\", parent_dir)\n                    try:\n            
            # there is a possible race condition\n                        # which needs to be handled here\n                        o.fs.mkdir(parent_dir)\n                    except FileAlreadyExists:\n                        pass\n\n    def get_arglist(self, f_name, job):\n        arglist = load_hive_cmd() + [\"-f\", f_name]\n        hiverc = job.hiverc()\n        if hiverc:\n            if isinstance(hiverc, str):\n                hiverc = [hiverc]\n            for rcfile in hiverc:\n                arglist += [\"-i\", rcfile]\n        hiveconfs = job.hiveconfs()\n        if hiveconfs:\n            for k, v in hiveconfs.items():\n                arglist += [\"--hiveconf\", \"{0}={1}\".format(k, v)]\n        hivevars = job.hivevars()\n        if hivevars:\n            for k, v in hivevars.items():\n                arglist += [\"--hivevar\", \"{0}={1}\".format(k, v)]\n        logger.info(arglist)\n        return arglist\n\n    def run_job(self, job, tracking_url_callback=None):\n        if tracking_url_callback is not None:\n            warnings.warn(\"tracking_url_callback argument is deprecated, task.set_tracking_url is used instead.\", DeprecationWarning)\n\n        self.prepare_outputs(job)\n        with tempfile.NamedTemporaryFile() as f:\n            query = job.query()\n            if isinstance(query, str):\n                query = query.encode(\"utf8\")\n            f.write(query)\n            f.flush()\n            arglist = self.get_arglist(f.name, job)\n            return luigi.contrib.hadoop.run_and_track_hadoop_job(arglist, job.set_tracking_url)\n\n\nclass HivePartitionTarget(luigi.Target):\n    \"\"\"\n    Target representing Hive table or Hive partition\n    \"\"\"\n\n    def __init__(self, table, partition, database=\"default\", fail_missing_table=True, client=None):\n        \"\"\"\n        @param table: Table name\n        @type table: str\n        @param partition: partition specificaton in form of\n        dict of {\"partition_column_1\": 
\"partition_value_1\", \"partition_column_2\": \"partition_value_2\", ... }\n        If `partition` is `None` or `{}` then target is Hive nonpartitioned table\n        @param database: Database name\n        @param fail_missing_table: flag to ignore errors raised due to table nonexistence\n        @param client: `HiveCommandClient` instance. Default if `client is None`\n        \"\"\"\n        self.database = database\n        self.table = table\n        self.partition = partition\n        self.client = client or get_default_client()\n        self.fail_missing_table = fail_missing_table\n\n    def __str__(self):\n        return self.path\n\n    def exists(self):\n        \"\"\"\n        returns `True` if the partition/table exists\n        \"\"\"\n        try:\n            logger.debug(\"Checking Hive table '{d}.{t}' for partition {p}\".format(d=self.database, t=self.table, p=str(self.partition or {})))\n\n            return self.client.table_exists(self.table, self.database, self.partition)\n        except HiveCommandError:\n            if self.fail_missing_table:\n                raise\n            else:\n                if self.client.table_exists(self.table, self.database):\n                    # a real error occurred\n                    raise\n                else:\n                    # oh the table just doesn't exist\n                    return False\n\n    @property\n    def path(self):\n        \"\"\"\n        Returns the path for this HiveTablePartitionTarget's data.\n        \"\"\"\n        location = self.client.table_location(self.table, self.database, self.partition)\n        if not location:\n            raise Exception(\"Couldn't find location for table: {0}\".format(str(self)))\n        return location\n\n\nclass HiveTableTarget(HivePartitionTarget):\n    \"\"\"\n    Target representing non-partitioned table\n    \"\"\"\n\n    def __init__(self, table, database=\"default\", client=None):\n        super(HiveTableTarget, self).__init__(\n            
table=table,\n            partition=None,\n            database=database,\n            fail_missing_table=False,\n            client=client,\n        )\n\n\nclass ExternalHiveTask(luigi.ExternalTask):\n    \"\"\"\n    External task that depends on a Hive table/partition.\n    \"\"\"\n\n    database = luigi.Parameter(default=\"default\")\n    table = luigi.Parameter()\n    partition: luigi.DictParameter = luigi.DictParameter(\n        default={}, description='Python dictionary specifying the target partition e.g. {\"date\": \"2013-01-25\"}'\n    )\n\n    def output(self):\n        return HivePartitionTarget(\n            database=self.database,\n            table=self.table,\n            partition=self.partition,\n        )\n"
  },
  {
    "path": "luigi/contrib/kubernetes.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015 Outlier Bio, LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\n\"\"\"\nKubernetes Job wrapper for Luigi.\n\nFrom the Kubernetes website:\n\n    Kubernetes is an open-source system for automating deployment, scaling,\n    and management of containerized applications.\n\nFor more information about Kubernetes Jobs: http://kubernetes.io/docs/user-guide/jobs/\n\nRequires:\n\n- pykube: ``pip install pykube-ng``\n\nWritten and maintained by Marco Capuccini (@mcapuccini).\n\"\"\"\n\nimport logging\nimport time\nimport uuid\nfrom datetime import datetime\n\nimport luigi\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    from pykube.config import KubeConfig\n    from pykube.http import HTTPClient\n    from pykube.objects import Job, Pod\nexcept ImportError:\n    logger.warning(\"pykube is not installed. 
KubernetesJobTask requires pykube.\")\n\n\nclass kubernetes(luigi.Config):\n    auth_method = luigi.Parameter(default=\"kubeconfig\", description=\"Authorization method to access the cluster\")\n    kubeconfig_path = luigi.Parameter(default=\"~/.kube/config\", description=\"Path to kubeconfig file for cluster authentication\")\n    max_retrials = luigi.IntParameter(default=0, description=\"Max retrials in event of job failure\")\n    kubernetes_namespace = luigi.OptionalParameter(default=None, description=\"K8s namespace in which the job will run\")\n\n\nclass KubernetesJobTask(luigi.Task):\n    __DEFAULT_POLL_INTERVAL = 5  # see __track_job\n    __DEFAULT_POD_CREATION_INTERVAL = 5\n    _kubernetes_config = None  # Needs to be loaded at runtime\n\n    def _init_kubernetes(self):\n        self.__logger = logger\n        self.__logger.debug(\"Kubernetes auth method: \" + self.auth_method)\n        if self.auth_method == \"kubeconfig\":\n            self.__kube_api = HTTPClient(KubeConfig.from_file(self.kubeconfig_path))\n        elif self.auth_method == \"service-account\":\n            self.__kube_api = HTTPClient(KubeConfig.from_service_account())\n        else:\n            raise ValueError(\"Illegal auth_method\")\n        self.job_uuid = str(uuid.uuid4().hex)\n        now = datetime.utcnow()\n        self.uu_name = \"%s-%s-%s\" % (self.name, now.strftime(\"%Y%m%d%H%M%S\"), self.job_uuid[:16])\n\n    @property\n    def auth_method(self):\n        \"\"\"\n        This can be set to ``kubeconfig`` or ``service-account``.\n        It defaults to ``kubeconfig``.\n\n        For more details, please refer to:\n\n        - kubeconfig: http://kubernetes.io/docs/user-guide/kubeconfig-file\n        - service-account: http://kubernetes.io/docs/user-guide/service-accounts\n        \"\"\"\n        return self.kubernetes_config.auth_method\n\n    @property\n    def kubeconfig_path(self):\n        \"\"\"\n        Path to kubeconfig file used for cluster authentication.\n        
It defaults to \"~/.kube/config\", which is the default location\n        when using minikube (http://kubernetes.io/docs/getting-started-guides/minikube).\n        When auth_method is ``service-account`` this property is ignored.\n\n        **WARNING**: For Python versions < 3.5 kubeconfig must point to a Kubernetes API\n        hostname, and NOT to an IP address.\n\n        For more details, please refer to:\n        http://kubernetes.io/docs/user-guide/kubeconfig-file\n        \"\"\"\n        return self.kubernetes_config.kubeconfig_path\n\n    @property\n    def kubernetes_namespace(self):\n        \"\"\"\n        Namespace in Kubernetes where the job will run.\n        It defaults to the default namespace in Kubernetes\n\n        For more details, please refer to:\n        https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/\n        \"\"\"\n        return self.kubernetes_config.kubernetes_namespace\n\n    @property\n    def name(self):\n        \"\"\"\n        A name for this job. This task will automatically append a UUID to the\n        name before to submit to Kubernetes.\n        \"\"\"\n        raise NotImplementedError(\"subclass must define name\")\n\n    @property\n    def labels(self):\n        \"\"\"\n        Return custom labels for kubernetes job.\n        example::\n        ``{\"run_dt\": datetime.date.today().strftime('%F')}``\n        \"\"\"\n        return {}\n\n    @property\n    def spec_schema(self):\n        \"\"\"\n        Kubernetes Job spec schema in JSON format, an example follows.\n\n        .. 
code-block:: javascript\n\n            {\n                \"containers\": [{\n                    \"name\": \"pi\",\n                    \"image\": \"perl\",\n                    \"command\": [\"perl\",  \"-Mbignum=bpi\", \"-wle\", \"print bpi(2000)\"]\n                }],\n                \"restartPolicy\": \"Never\"\n            }\n\n        **restartPolicy**\n\n        - If restartPolicy is not defined, it will be set to \"Never\" by default.\n        - **Warning**: restartPolicy=OnFailure will bypass max_retrials, and restart\n          the container until success, with the risk of blocking the Luigi task.\n\n        For more informations please refer to:\n        http://kubernetes.io/docs/user-guide/pods/multi-container/#the-spec-schema\n        \"\"\"\n        raise NotImplementedError(\"subclass must define spec_schema\")\n\n    @property\n    def max_retrials(self):\n        \"\"\"\n        Maximum number of retrials in case of failure.\n        \"\"\"\n        return self.kubernetes_config.max_retrials\n\n    @property\n    def backoff_limit(self):\n        \"\"\"\n        Maximum number of retries before considering the job as failed.\n        See: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#pod-backoff-failure-policy\n        \"\"\"\n        return 6\n\n    @property\n    def delete_on_success(self):\n        \"\"\"\n        Delete the Kubernetes workload if the job has ended successfully.\n        \"\"\"\n        return True\n\n    @property\n    def print_pod_logs_on_exit(self):\n        \"\"\"\n        Fetch and print the pod logs once the job is completed.\n        \"\"\"\n        return False\n\n    @property\n    def active_deadline_seconds(self):\n        \"\"\"\n        Time allowed to successfully schedule pods.\n        See: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#job-termination-and-cleanup\n        \"\"\"\n        return None\n\n    @property\n    def 
kubernetes_config(self):\n        if not self._kubernetes_config:\n            self._kubernetes_config = kubernetes()\n        return self._kubernetes_config\n\n    @property\n    def poll_interval(self):\n        \"\"\"How often to poll Kubernetes for job status, in seconds.\"\"\"\n        return self.__DEFAULT_POLL_INTERVAL\n\n    @property\n    def pod_creation_wait_interal(self):\n        \"\"\"Delay for initial pod creation for just submitted job in seconds\"\"\"\n        return self.__DEFAULT_POD_CREATION_INTERVAL\n\n    def __track_job(self):\n        \"\"\"Poll job status while active\"\"\"\n        while not self.__verify_job_has_started():\n            time.sleep(self.poll_interval)\n            self.__logger.debug(\"Waiting for Kubernetes job \" + self.uu_name + \" to start\")\n        self.__print_kubectl_hints()\n\n        status = self.__get_job_status()\n        while status == \"RUNNING\":\n            self.__logger.debug(\"Kubernetes job \" + self.uu_name + \" is running\")\n            time.sleep(self.poll_interval)\n            status = self.__get_job_status()\n\n        assert status != \"FAILED\", \"Kubernetes job \" + self.uu_name + \" failed\"\n\n        # status == \"SUCCEEDED\"\n        self.__logger.info(\"Kubernetes job \" + self.uu_name + \" succeeded\")\n        self.signal_complete()\n\n    def signal_complete(self):\n        \"\"\"Signal job completion for scheduler and dependent tasks.\n\n        Touching a system file is an easy way to signal completion. example::\n        .. 
code-block:: python\n\n        with self.output().open('w') as output_file:\n            output_file.write('')\n        \"\"\"\n        pass\n\n    def __get_pods(self):\n        pod_objs = Pod.objects(self.__kube_api, namespace=self.kubernetes_namespace).filter(selector=\"job-name=\" + self.uu_name).response[\"items\"]\n        return [Pod(self.__kube_api, p) for p in pod_objs]\n\n    def __get_job(self):\n        jobs = Job.objects(self.__kube_api, namespace=self.kubernetes_namespace).filter(selector=\"luigi_task_id=\" + self.job_uuid).response[\"items\"]\n        assert len(jobs) == 1, \"Kubernetes job \" + self.uu_name + \" not found\"\n        return Job(self.__kube_api, jobs[0])\n\n    def __print_pod_logs(self):\n        for pod in self.__get_pods():\n            logs = pod.logs(timestamps=True).strip()\n            self.__logger.info(\"Fetching logs from \" + pod.name)\n            if len(logs) > 0:\n                for line in logs.split(\"\\n\"):\n                    self.__logger.info(line)\n\n    def __print_kubectl_hints(self):\n        self.__logger.info(\"To stream Pod logs, use:\")\n        for pod in self.__get_pods():\n            self.__logger.info(\"`kubectl logs -f pod/%s -n %s`\" % (pod.name, pod.namespace))\n\n    def __verify_job_has_started(self):\n        \"\"\"Asserts that the job has successfully started\"\"\"\n        # Verify that the job started\n        self.__get_job()\n\n        # Verify that the pod started\n        pods = self.__get_pods()\n        if not pods:\n            self.__logger.debug(\"No pods found for %s, waiting for cluster state to match the job definition\" % self.uu_name)\n            time.sleep(self.pod_creation_wait_interal)\n            pods = self.__get_pods()\n\n        assert len(pods) > 0, \"No pod scheduled by \" + self.uu_name\n        for pod in pods:\n            status = pod.obj[\"status\"]\n            for cont_stats in status.get(\"containerStatuses\", []):\n                if \"terminated\" in 
cont_stats[\"state\"]:\n                    t = cont_stats[\"state\"][\"terminated\"]\n                    err_msg = \"Pod %s %s (exit code %d). Logs: `kubectl logs pod/%s`\" % (pod.name, t[\"reason\"], t[\"exitCode\"], pod.name)\n                    assert t[\"exitCode\"] == 0, err_msg\n\n                if \"waiting\" in cont_stats[\"state\"]:\n                    wr = cont_stats[\"state\"][\"waiting\"][\"reason\"]\n                    assert wr == \"ContainerCreating\", \"Pod %s %s. Logs: `kubectl logs pod/%s`\" % (pod.name, wr, pod.name)\n\n            for cond in status.get(\"conditions\", []):\n                if \"message\" in cond:\n                    if cond[\"reason\"] == \"ContainersNotReady\":\n                        return False\n                    assert cond[\"status\"] != \"False\", \"[ERROR] %s - %s\" % (cond[\"reason\"], cond[\"message\"])\n        return True\n\n    def __get_job_status(self):\n        \"\"\"Return the Kubernetes job status\"\"\"\n        # Figure out status and return it\n        job = self.__get_job()\n\n        if \"succeeded\" in job.obj[\"status\"] and job.obj[\"status\"][\"succeeded\"] > 0:\n            job.scale(replicas=0)\n            if self.print_pod_logs_on_exit:\n                self.__print_pod_logs()\n            if self.delete_on_success:\n                self.__delete_job_cascade(job)\n            return \"SUCCEEDED\"\n\n        if \"failed\" in job.obj[\"status\"]:\n            failed_cnt = job.obj[\"status\"][\"failed\"]\n            self.__logger.debug(\"Kubernetes job \" + self.uu_name + \" status.failed: \" + str(failed_cnt))\n            if self.print_pod_logs_on_exit:\n                self.__print_pod_logs()\n            if failed_cnt > self.max_retrials:\n                job.scale(replicas=0)  # avoid more retrials\n                return \"FAILED\"\n        return \"RUNNING\"\n\n    def __delete_job_cascade(self, job):\n        delete_options_cascade = {\"kind\": \"DeleteOptions\", \"apiVersion\": 
\"v1\", \"propagationPolicy\": \"Background\"}\n        r = self.__kube_api.delete(json=delete_options_cascade, **job.api_kwargs())\n        if r.status_code != 200:\n            self.__kube_api.raise_for_status(r)\n\n    def run(self):\n        self._init_kubernetes()\n        # Render job\n        job_json = {\n            \"apiVersion\": \"batch/v1\",\n            \"kind\": \"Job\",\n            \"metadata\": {\"name\": self.uu_name, \"labels\": {\"spawned_by\": \"luigi\", \"luigi_task_id\": self.job_uuid}},\n            \"spec\": {\"backoffLimit\": self.backoff_limit, \"template\": {\"metadata\": {\"name\": self.uu_name, \"labels\": {}}, \"spec\": self.spec_schema}},\n        }\n        if self.kubernetes_namespace is not None:\n            job_json[\"metadata\"][\"namespace\"] = self.kubernetes_namespace\n        if self.active_deadline_seconds is not None:\n            job_json[\"spec\"][\"activeDeadlineSeconds\"] = self.active_deadline_seconds\n        # Update user labels\n        job_json[\"metadata\"][\"labels\"].update(self.labels)\n        job_json[\"spec\"][\"template\"][\"metadata\"][\"labels\"].update(self.labels)\n\n        # Add default restartPolicy if not specified\n        if \"restartPolicy\" not in self.spec_schema:\n            job_json[\"spec\"][\"template\"][\"spec\"][\"restartPolicy\"] = \"Never\"\n        # Submit job\n        self.__logger.info(\"Submitting Kubernetes Job: \" + self.uu_name)\n        job = Job(self.__kube_api, job_json)\n        job.create()\n        # Track the Job (wait while active)\n        self.__logger.info(\"Start tracking Kubernetes Job: \" + self.uu_name)\n        self.__track_job()\n\n    def output(self):\n        \"\"\"\n        An output target is necessary for checking job completion unless\n        an alternative complete method is defined.\n\n        Example::\n\n            return luigi.LocalTarget(os.path.join('/tmp', 'example'))\n\n        \"\"\"\n        pass\n"
  },
  {
    "path": "luigi/contrib/lsf.py",
    "content": "# -*- coding: utf-8 -*-\n\n\"\"\"\n.. Copyright 2012-2015 Spotify AB\n   Copyright 2018\n   Copyright 2018 EMBL-European Bioinformatics Institute\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n   http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n\"\"\"\n\nimport logging\nimport os\nimport random\nimport shutil\nimport subprocess\nimport sys\nimport time\n\ntry:\n    # Dill is used for handling pickling and unpickling if there is a deference\n    # in server setups between the LSF submission node and the nodes in the\n    # cluster\n    import dill as pickle\nexcept ImportError:\n    import pickle\n\nimport luigi\nimport luigi.configuration\nfrom luigi.contrib import lsf_runner\nfrom luigi.contrib.hadoop import create_packages_archive\nfrom luigi.task_status import DONE, FAILED, PENDING, RUNNING, UNKNOWN\n\n\"\"\"\nLSF batch system Tasks.\n=======================\n\nWhat's LSF? 
see http://en.wikipedia.org/wiki/Platform_LSF\nand https://wiki.med.harvard.edu/Orchestra/IntroductionToLSF\n\nSee: https://github.com/spotify/luigi/issues/1936\n\nThis extension is modeled after the hadoop.py approach.\nI'll be making a few assumptions, and will try to note them.\n\nGoing into it, the assumptions are:\n\n- You schedule your jobs on an LSF submission node.\n- The 'bjobs' command on an LSF batch submission system returns a standardized format.\n- All nodes have access to the code you're running.\n- The sysadmin won't get pissed if we run a 'bjobs' check every thirty\n  seconds or so per job (there are ways of coalescing the bjobs calls if that's not cool).\n\nThe procedure:\n\n- Pickle the class\n- Construct a bsub argument that runs a generic runner function with the path to the pickled class\n- Runner function loads the class from pickle\n- Runner function hits the work button on it\n\n\"\"\"\n\nLOGGER = logging.getLogger(\"luigi-interface\")\n\n\ndef track_job(job_id):\n    \"\"\"\n    Tracking is done by requesting each job and then searching for whether the job\n    has one of the following states:\n    - \"RUN\",\n    - \"PEND\",\n    - \"SSUSP\",\n    - \"EXIT\"\n    based on the LSF documentation\n    \"\"\"\n    cmd = [\"bjobs\", \"-noheader\", \"-o\", \"stat\", str(job_id)]\n    track_job_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=False)\n    status = track_job_proc.communicate()[0].strip(\"\\n\")\n    return status\n\n\ndef kill_job(job_id):\n    \"\"\"\n    Kill a running LSF job\n    \"\"\"\n    subprocess.call([\"bkill\", job_id])\n\n\nclass LSFJobTask(luigi.Task):\n    \"\"\"\n    Takes care of uploading and executing an LSF job\n    \"\"\"\n\n    n_cpu_flag = luigi.IntParameter(default=2, significant=False)\n    shared_tmp_dir = luigi.Parameter(default=\"/tmp\", significant=False)\n    resource_flag = luigi.Parameter(default=\"mem=8192\", significant=False)\n    memory_flag = luigi.Parameter(default=\"8192\", 
significant=False)\n    queue_flag = luigi.Parameter(default=\"queue_name\", significant=False)\n    runtime_flag = luigi.IntParameter(default=60)\n    job_name_flag = luigi.Parameter(default=\"\")\n    poll_time = luigi.FloatParameter(significant=False, default=5, description=\"specify the wait time to poll bjobs for the job status\")\n    save_job_info = luigi.BoolParameter(default=False)\n    output = luigi.Parameter(default=\"\")\n    extra_bsub_args = luigi.Parameter(default=\"\")\n\n    job_status = None\n\n    def fetch_task_failures(self):\n        \"\"\"\n        Read in the error file from bsub\n        \"\"\"\n        error_file = os.path.join(self.tmp_dir, \"job.err\")\n        if os.path.isfile(error_file):\n            with open(error_file, \"r\") as f_err:\n                errors = f_err.readlines()\n        else:\n            errors = \"\"\n        return errors\n\n    def fetch_task_output(self):\n        \"\"\"\n        Read in the output file\n        \"\"\"\n        # Read in the output file\n        if os.path.isfile(os.path.join(self.tmp_dir, \"job.out\")):\n            with open(os.path.join(self.tmp_dir, \"job.out\"), \"r\") as f_out:\n                outputs = f_out.readlines()\n        else:\n            outputs = \"\"\n        return outputs\n\n    def _init_local(self):\n\n        base_tmp_dir = self.shared_tmp_dir\n\n        random_id = \"%016x\" % random.getrandbits(64)\n        task_name = random_id + self.task_id\n        # If any parameters are directories, if we don't\n        # replace the separators on *nix, it'll create a weird nested directory\n        task_name = task_name.replace(\"/\", \"::\")\n\n        # Max filename length\n        max_filename_length = os.fstatvfs(0).f_namemax\n        self.tmp_dir = os.path.join(base_tmp_dir, task_name[:max_filename_length])\n\n        LOGGER.info(\"Tmp dir: %s\", self.tmp_dir)\n        os.makedirs(self.tmp_dir)\n\n        # Dump the code to be run into a pickle file\n        
LOGGER.debug(\"Dumping pickled class\")\n        self._dump(self.tmp_dir)\n\n        # Make sure that all the class's dependencies are tarred and available\n        LOGGER.debug(\"Tarballing dependencies\")\n        # Grab luigi and the module containing the code to be run\n        packages = [luigi, __import__(self.__module__, None, None, \"dummy\")]\n        create_packages_archive(packages, os.path.join(self.tmp_dir, \"packages.tar\"))\n\n        # Now, pass onto the class's specified init_local() method.\n        self.init_local()\n\n    def init_local(self):\n        \"\"\"\n        Implement any work to setup any internal datastructure etc here.\n        You can add extra input using the requires_local/input_local methods.\n        Anything you set on the object will be pickled and available on the compute nodes.\n        \"\"\"\n        pass\n\n    def run(self):\n        \"\"\"\n        The procedure:\n        - Pickle the class\n        - Tarball the dependencies\n        - Construct a bsub argument that runs a generic runner function with the path to the pickled class\n        - Runner function loads the class from pickle\n        - Runner class untars the dependencies\n        - Runner function hits the button on the class's work() method\n        \"\"\"\n        self._init_local()\n        self._run_job()\n\n    def work(self):\n        \"\"\"\n        Subclass this for where you're doing your actual work.\n\n        Why not run(), like other tasks? 
Because we need run to always be\n        something that the Worker can call, and that's the real logical place to\n        do LSF scheduling.\n        So, the work will happen in work().\n        \"\"\"\n        pass\n\n    def _dump(self, out_dir=\"\"):\n        \"\"\"\n        Dump instance to file.\n        \"\"\"\n        self.job_file = os.path.join(out_dir, \"job-instance.pickle\")\n        if self.__module__ == \"__main__\":\n            dump_inst = pickle.dumps(self)\n            module_name = os.path.basename(sys.argv[0]).rsplit(\".\", 1)[0]\n            dump_inst = dump_inst.replace(\"(c__main__\", \"(c\" + module_name)\n            open(self.job_file, \"w\").write(dump_inst)\n\n        else:\n            pickle.dump(self, open(self.job_file, \"w\"))\n\n    def _run_job(self):\n        \"\"\"\n        Build a bsub argument that will run lsf_runner.py on the directory we've specified.\n        \"\"\"\n\n        args = []\n\n        if isinstance(self.output(), list):\n            log_output = os.path.split(self.output()[0].path)\n        else:\n            log_output = os.path.split(self.output().path)\n\n        args += [\"bsub\", \"-q\", self.queue_flag]\n        args += [\"-n\", str(self.n_cpu_flag)]\n        args += [\"-M\", str(self.memory_flag)]\n        args += [\"-R\", \"rusage[%s]\" % self.resource_flag]\n        args += [\"-W\", str(self.runtime_flag)]\n        if self.job_name_flag:\n            args += [\"-J\", str(self.job_name_flag)]\n        args += [\"-o\", os.path.join(log_output[0], \"job.out\")]\n        args += [\"-e\", os.path.join(log_output[0], \"job.err\")]\n        if self.extra_bsub_args:\n            args += self.extra_bsub_args.split()\n\n        # Find where the runner file is\n        runner_path = os.path.abspath(lsf_runner.__file__)\n\n        args += [runner_path]\n        args += [self.tmp_dir]\n\n        # That should do it. 
Let the world know what we're doing.\n        LOGGER.info(\"### LSF SUBMISSION ARGS: %s\", \" \".join([str(a) for a in args]))\n\n        # Submit the job\n        run_job_proc = subprocess.Popen([str(a) for a in args], stdin=subprocess.PIPE, stdout=subprocess.PIPE, cwd=self.tmp_dir)\n        output = run_job_proc.communicate()[0]\n\n        # ASSUMPTION\n        # The result will be of the format\n        # Job <123> is submitted ot queue <myqueue>\n        # So get the number in those first brackets.\n        # I cannot think of a better workaround that leaves logic on the Task side of things.\n        LOGGER.info(\"### JOB SUBMISSION OUTPUT: %s\", str(output))\n        self.job_id = int(output.split(\"<\")[1].split(\">\")[0])\n        LOGGER.info(\"Job %ssubmitted as job %s\", self.job_name_flag + \" \", str(self.job_id))\n\n        self._track_job()\n\n        # If we want to save the job temporaries, then do so\n        # We'll move them to be next to the job output\n        if self.save_job_info:\n            LOGGER.info(\"Saving up temporary bits\")\n\n            # dest_dir = self.output().path\n            shutil.move(self.tmp_dir, \"/\".join(log_output[0:-1]))\n\n        # Now delete the temporaries, if they're there.\n        self._finish()\n\n    def _track_job(self):\n        time0 = 0\n        while True:\n            # Sleep for a little bit\n            time.sleep(self.poll_time)\n\n            # See what the job's up to\n            # ASSUMPTION\n            lsf_status = track_job(self.job_id)\n            if lsf_status == \"RUN\":\n                self.job_status = RUNNING\n                LOGGER.info(\"Job is running...\")\n                if time0 == 0:\n                    time0 = int(round(time.time()))\n            elif lsf_status == \"PEND\":\n                self.job_status = PENDING\n                LOGGER.info(\"Job is pending...\")\n            elif lsf_status == \"DONE\" or lsf_status == \"EXIT\":\n                # Then the job could 
either be failed or done.\n                errors = self.fetch_task_failures()\n                if not errors:\n                    self.job_status = DONE\n                    LOGGER.info(\"Job is done\")\n                    time1 = int(round(time.time()))\n\n                    # Return a near estimate of the run time, to within +/-\n                    # self.poll_time\n                    job_name = str(self.job_id)\n                    if self.job_name_flag:\n                        job_name = \"%s %s\" % (self.job_name_flag, job_name)\n                    LOGGER.info(\"### JOB COMPLETED: %s in %s seconds\", job_name, str(time1 - time0))\n                else:\n                    self.job_status = FAILED\n                    LOGGER.error(\"Job has FAILED\")\n                    LOGGER.error(\"\\n\\n\")\n                    LOGGER.error(\"Traceback: \")\n                    for error in errors:\n                        LOGGER.error(error)\n                break\n            elif lsf_status == \"SSUSP\":\n                self.job_status = PENDING\n                LOGGER.info(\"Job is suspended (basically, pending)...\")\n\n            else:\n                self.job_status = UNKNOWN\n                LOGGER.info(\"Job status is UNKNOWN!\")\n                LOGGER.info(\"Status is : %s\", lsf_status)\n                break\n\n    def _finish(self):\n        LOGGER.info(\"Cleaning up temporary bits\")\n        if self.tmp_dir and os.path.exists(self.tmp_dir):\n            LOGGER.info(\"Removing directory %s\", self.tmp_dir)\n            shutil.rmtree(self.tmp_dir)\n\n    def __del__(self):\n        pass\n        # self._finish()\n\n\nclass LocalLSFJobTask(LSFJobTask):\n    \"\"\"\n    A local version of JobTask, for easier debugging.\n    \"\"\"\n\n    def run(self):\n        self.init_local()\n        self.work()\n"
  },
  {
    "path": "luigi/contrib/lsf_runner.py",
    "content": "# -*- coding: utf-8 -*-\n\n\"\"\"\n.. Copyright 2012-2015 Spotify AB\n   Copyright 2018\n   Copyright 2018 EMBL-European Bioinformatics Institute\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n   http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n\"\"\"\n\nimport os\nimport sys\n\ntry:\n    # Dill is used for handling pickling and unpickling if there is a deference\n    # in server setups between the LSF submission node and the nodes in the\n    # cluster\n    import dill as pickle\nexcept ImportError:\n    import pickle\nimport logging\n\nfrom luigi.safe_extractor import SafeExtractor\n\n\ndef do_work_on_compute_node(work_dir):\n    # Extract the necessary dependencies\n    extract_packages_archive(work_dir)\n\n    # Open up the pickle file with the work to be done\n    os.chdir(work_dir)\n    with open(\"job-instance.pickle\", \"r\") as pickle_file_handle:\n        job = pickle.load(pickle_file_handle)\n\n    # Do the work contained\n    job.work()\n\n\ndef extract_packages_archive(work_dir):\n    package_file = os.path.join(work_dir, \"packages.tar\")\n    if not os.path.exists(package_file):\n        return\n\n    curdir = os.path.abspath(os.curdir)\n\n    os.chdir(work_dir)\n    extractor = SafeExtractor(work_dir)\n    extractor.safe_extract(package_file)\n    if \"\" not in sys.path:\n        sys.path.insert(0, \"\")\n\n    os.chdir(curdir)\n\n\ndef main(args=sys.argv):\n    \"\"\"Run the work() method from the class instance in the file \"job-instance.pickle\".\"\"\"\n    try:\n        # 
Set up logging.\n        logging.basicConfig(level=logging.WARN)\n        work_dir = args[1]\n        assert os.path.exists(work_dir), \"First argument to lsf_runner.py must be a directory that exists\"\n        do_work_on_compute_node(work_dir)\n    except Exception as exc:\n        # Dump encoded data that we will try to fetch using mechanize\n        print(exc)\n        raise\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "luigi/contrib/mongodb.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2017 Big Datext Inc\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom luigi.target import Target\n\n\nclass MongoTarget(Target):\n    \"\"\"Target for a resource in MongoDB\"\"\"\n\n    def __init__(self, mongo_client, index, collection):\n        \"\"\"\n        :param mongo_client: MongoClient instance\n        :type mongo_client: MongoClient\n        :param index: database index\n        :type index: str\n        :param collection: index collection\n        :type collection: str\n        \"\"\"\n        self._mongo_client = mongo_client\n        self._index = index\n        self._collection = collection\n\n    def __str__(self):\n        return f\"{self._index}/{self._collection}\"\n\n    def get_collection(self):\n        \"\"\"\n        Return targeted mongo collection to query on\n        \"\"\"\n        db_mongo = self._mongo_client[self._index]\n        return db_mongo[self._collection]\n\n    def get_index(self):\n        \"\"\"\n        Return targeted mongo index to query on\n        \"\"\"\n        return self._mongo_client[self._index]\n\n\nclass MongoCellTarget(MongoTarget):\n    \"\"\"Target for a ressource in a specific field from a MongoDB document\"\"\"\n\n    def __init__(self, mongo_client, index, collection, document_id, path):\n        \"\"\"\n        :param document_id: targeted mongo document\n        :type document_id: str\n        :param path: full path to the targeted field 
in the mongo document\n        :type path: str\n        \"\"\"\n        super(MongoCellTarget, self).__init__(mongo_client, index, collection)\n\n        self._document_id = document_id\n        self._path = path\n\n    def exists(self):\n        \"\"\"\n        Test if target has been run\n        Target is considered run if the targeted field exists\n        \"\"\"\n        return self.read() is not None\n\n    def read(self):\n        \"\"\"\n        Read the target value\n        Use $project aggregate operator in order to support nested objects\n        \"\"\"\n        result = self.get_collection().aggregate([{\"$match\": {\"_id\": self._document_id}}, {\"$project\": {\"_value\": \"$\" + self._path, \"_id\": False}}])\n\n        for doc in result:\n            if \"_value\" not in doc:\n                break\n\n            return doc[\"_value\"]\n\n    def write(self, value):\n        \"\"\"\n        Write value to the target\n        \"\"\"\n        self.get_collection().update_one({\"_id\": self._document_id}, {\"$set\": {self._path: value}}, upsert=True)\n\n\nclass MongoRangeTarget(MongoTarget):\n    \"\"\"Target for a level 0 field in a range of documents\"\"\"\n\n    def __init__(self, mongo_client, index, collection, document_ids, field):\n        \"\"\"\n        :param document_ids: targeted mongo documents\n        :type document_ids: list of str\n        :param field: targeted field in documents\n        :type field: str\n        \"\"\"\n        super(MongoRangeTarget, self).__init__(mongo_client, index, collection)\n\n        self._document_ids = document_ids\n        self._field = field\n\n    def exists(self):\n        \"\"\"\n        Test if target has been run\n        Target is considered run if the targeted field exists in ALL documents\n        \"\"\"\n        return not self.get_empty_ids()\n\n    def read(self):\n        \"\"\"\n        Read the targets value\n        \"\"\"\n        cursor = self.get_collection().find({\"_id\": {\"$in\": 
self._document_ids}, self._field: {\"$exists\": True}}, {self._field: True})\n\n        return {doc[\"_id\"]: doc[self._field] for doc in cursor}\n\n    def write(self, values):\n        \"\"\"\n        Write values to the targeted documents\n        Values need to be a dict as : {document_id: value}\n        \"\"\"\n        # Insert only for docs targeted by the target\n        filtered = {_id: value for _id, value in values.items() if _id in self._document_ids}\n\n        if not filtered:\n            return\n\n        bulk = self.get_collection().initialize_ordered_bulk_op()\n        for _id, value in filtered.items():\n            bulk.find({\"_id\": _id}).upsert().update_one({\"$set\": {self._field: value}})\n\n        bulk.execute()\n\n    def get_empty_ids(self):\n        \"\"\"\n        Get documents id with missing targeted field\n        \"\"\"\n        cursor = self.get_collection().find({\"_id\": {\"$in\": self._document_ids}, self._field: {\"$exists\": True}}, {\"_id\": True})\n\n        return set(self._document_ids) - {doc[\"_id\"] for doc in cursor}\n\n\nclass MongoCollectionTarget(MongoTarget):\n    \"\"\"Target for existing collection\"\"\"\n\n    def __init__(self, mongo_client, index, collection):\n        super(MongoCollectionTarget, self).__init__(mongo_client, index, collection)\n\n    def exists(self):\n        \"\"\"\n        Test if target has been run\n        Target is considered run if the targeted collection exists in the database\n        \"\"\"\n        return self.read()\n\n    def read(self):\n        \"\"\"\n        Return if the target collection exists in the database\n        \"\"\"\n        return self._collection in self.get_index().collection_names()\n\n\nclass MongoCountTarget(MongoTarget):\n    \"\"\"Target for documents count\"\"\"\n\n    def __init__(self, mongo_client, index, collection, target_count):\n        \"\"\"\n        :param target_count: Value of the desired item count in the target\n        :type target_count: int\n 
       \"\"\"\n        super(MongoCountTarget, self).__init__(mongo_client, index, collection)\n\n        self._target_count = target_count\n\n    def exists(self):\n        \"\"\"\n        Test if the target has been run\n        Target is considered run if the number of items in the target matches value of self._target_count\n        \"\"\"\n        return self.read() == self._target_count\n\n    def read(self):\n        \"\"\"\n        Using the aggregate method to avoid inaccurate count if using a sharded cluster\n        https://docs.mongodb.com/manual/reference/method/db.collection.count/#behavior\n        \"\"\"\n        for res in self.get_collection().aggregate([{\"$group\": {\"_id\": None, \"count\": {\"$sum\": 1}}}]):\n            return res.get(\"count\", None)\n        return None\n"
  },
  {
    "path": "luigi/contrib/mssqldb.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport logging\n\nimport luigi\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    from pymssql import _mssql\nexcept ImportError:\n    logger.warning(\n        \"Loading MSSQL module without the python package pymssql. \\\n        This will crash at runtime if SQL Server functionality is used.\"\n    )\n\n\nclass MSSqlTarget(luigi.Target):\n    \"\"\"\n    Target for a resource in Microsoft SQL Server.\n    This module is primarily derived from mysqldb.py.  Much of MSSqlTarget,\n    MySqlTarget and PostgresTarget are similar enough to potentially add a\n    RDBMSTarget abstract base class to rdbms.py that these classes could be\n    derived from.\n    \"\"\"\n\n    marker_table = luigi.configuration.get_config().get(\"mssql\", \"marker-table\", \"table_updates\")\n\n    def __init__(self, host, database, user, password, table, update_id):\n        \"\"\"\n        Initializes a MsSqlTarget instance.\n\n        :param host: MsSql server address. 
Possibly a host:port string.\n        :type host: str\n        :param database: database name.\n        :type database: str\n        :param user: database user\n        :type user: str\n        :param password: password for specified user.\n        :type password: str\n        :param update_id: an identifier for this data set.\n        :type update_id: str\n        \"\"\"\n        if \":\" in host:\n            self.host, self.port = host.split(\":\")\n            self.port = int(self.port)\n        else:\n            self.host = host\n            self.port = 1433\n        self.database = database\n        self.user = user\n        self.password = password\n        self.table = table\n        self.update_id = update_id\n\n    def __str__(self):\n        return self.table\n\n    def touch(self, connection=None):\n        \"\"\"\n        Mark this update as complete.\n\n        IMPORTANT, If the marker table doesn't exist,\n        the connection transaction will be aborted and the connection reset.\n        Then the marker table will be created.\n        \"\"\"\n        self.create_marker_table()\n\n        if connection is None:\n            connection = self.connect()\n\n        connection.execute_non_query(\n            \"\"\"IF NOT EXISTS(SELECT 1\n                            FROM {marker_table}\n                            WHERE update_id = %(update_id)s)\n                    INSERT INTO {marker_table} (update_id, target_table)\n                        VALUES (%(update_id)s, %(table)s)\n                ELSE\n                    UPDATE t\n                    SET target_table = %(table)s\n                        , inserted = GETDATE()\n                    FROM {marker_table} t\n                    WHERE update_id = %(update_id)s\n              \"\"\".format(marker_table=self.marker_table),\n            {\"update_id\": self.update_id, \"table\": self.table},\n        )\n\n        # make sure update is properly marked\n        assert self.exists(connection)\n\n    
def exists(self, connection=None):\n        if connection is None:\n            connection = self.connect()\n        try:\n            row = connection.execute_row(\n                \"\"\"SELECT 1 FROM {marker_table}\n                                            WHERE update_id = %s\n                                    \"\"\".format(marker_table=self.marker_table),\n                (self.update_id,),\n            )\n        except _mssql.MssqlDatabaseException as e:\n            # Error number for table doesn't exist\n            if e.number == 208:\n                row = None\n            else:\n                raise\n\n        return row is not None\n\n    def connect(self):\n        \"\"\"\n        Create a SQL Server connection and return a connection object\n        \"\"\"\n        connection = _mssql.connect(user=self.user, password=self.password, server=self.host, port=self.port, database=self.database)\n        return connection\n\n    def create_marker_table(self):\n        \"\"\"\n        Create marker table if it doesn't exist.\n        Use a separate connection since the transaction might have to be reset.\n        \"\"\"\n        connection = self.connect()\n        try:\n            connection.execute_non_query(\n                \"\"\" CREATE TABLE {marker_table} (\n                        id            BIGINT    NOT NULL IDENTITY(1,1),\n                        update_id     VARCHAR(128)  NOT NULL,\n                        target_table  VARCHAR(128),\n                        inserted      DATETIME DEFAULT(GETDATE()),\n                        PRIMARY KEY (update_id)\n                    )\n                \"\"\".format(marker_table=self.marker_table)\n            )\n        except _mssql.MssqlDatabaseException as e:\n            # Table already exists code\n            if e.number == 2714:\n                pass\n            else:\n                raise\n        connection.close()\n"
  },
  {
    "path": "luigi/contrib/mysqldb.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport logging\n\nimport luigi\nfrom luigi.contrib import rdbms\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    import mysql.connector\n    from mysql.connector import Error, errorcode\nexcept ImportError:\n    logger.warning(\n        \"Loading MySQL module without the python package mysql-connector-python. \\\n       This will crash at runtime if MySQL functionality is used.\"\n    )\n\n\nclass MySqlTarget(luigi.Target):\n    \"\"\"\n    Target for a resource in MySql.\n    \"\"\"\n\n    marker_table = luigi.configuration.get_config().get(\"mysql\", \"marker-table\", \"table_updates\")\n\n    def __init__(self, host, database, user, password, table, update_id, **cnx_kwargs):\n        \"\"\"\n        Initializes a MySqlTarget instance.\n\n        :param host: MySql server address. 
Possibly a host:port string.\n        :type host: str\n        :param database: database name.\n        :type database: str\n        :param user: database user\n        :type user: str\n        :param password: password for specified user.\n        :type password: str\n        :param update_id: an identifier for this data set.\n        :type update_id: str\n        :param cnx_kwargs: optional params for mysql connector constructor.\n            See https://dev.mysql.com/doc/connector-python/en/connector-python-connectargs.html.\n        \"\"\"\n        if \":\" in host:\n            self.host, self.port = host.split(\":\")\n            self.port = int(self.port)\n        else:\n            self.host = host\n            self.port = 3306\n        self.database = database\n        self.user = user\n        self.password = password\n        self.table = table\n        self.update_id = update_id\n        self.cnx_kwargs = cnx_kwargs\n\n    def __str__(self):\n        return self.table\n\n    def touch(self, connection=None):\n        \"\"\"\n        Mark this update as complete.\n\n        IMPORTANT, If the marker table doesn't exist,\n        the connection transaction will be aborted and the connection reset.\n        Then the marker table will be created.\n        \"\"\"\n        self.create_marker_table()\n\n        if connection is None:\n            connection = self.connect()\n            connection.autocommit = True  # if connection created here, we commit it here\n\n        connection.cursor().execute(\n            \"\"\"INSERT INTO {marker_table} (update_id, target_table)\n               VALUES (%s, %s)\n               ON DUPLICATE KEY UPDATE\n               update_id = VALUES(update_id)\n            \"\"\".format(marker_table=self.marker_table),\n            (self.update_id, self.table),\n        )\n        # make sure update is properly marked\n        assert self.exists(connection)\n\n    def exists(self, connection=None):\n        if connection is None:\n  
          connection = self.connect()\n            connection.autocommit = True\n        cursor = connection.cursor()\n        try:\n            cursor.execute(\n                \"\"\"SELECT 1 FROM {marker_table}\n                WHERE update_id = %s\n                LIMIT 1\"\"\".format(marker_table=self.marker_table),\n                (self.update_id,),\n            )\n            row = cursor.fetchone()\n        except mysql.connector.Error as e:\n            if e.errno == errorcode.ER_NO_SUCH_TABLE:\n                row = None\n            else:\n                raise\n        return row is not None\n\n    def connect(self, autocommit=False):\n        connection = mysql.connector.connect(\n            user=self.user, password=self.password, host=self.host, port=self.port, database=self.database, autocommit=autocommit, **self.cnx_kwargs\n        )\n        return connection\n\n    def create_marker_table(self):\n        \"\"\"\n        Create marker table if it doesn't exist.\n\n        Using a separate connection since the transaction might have to be reset.\n        \"\"\"\n        connection = self.connect(autocommit=True)\n        cursor = connection.cursor()\n        try:\n            cursor.execute(\n                \"\"\" CREATE TABLE {marker_table} (\n                        id            BIGINT(20)    NOT NULL AUTO_INCREMENT,\n                        update_id     VARCHAR(128)  NOT NULL,\n                        target_table  VARCHAR(128),\n                        inserted      TIMESTAMP DEFAULT NOW(),\n                        PRIMARY KEY (update_id),\n                        KEY id (id)\n                    )\n                \"\"\".format(marker_table=self.marker_table)\n            )\n        except mysql.connector.Error as e:\n            if e.errno == errorcode.ER_TABLE_EXISTS_ERROR:\n                pass\n            else:\n                raise\n        connection.close()\n\n\nclass CopyToTable(rdbms.CopyToTable):\n    \"\"\"\n    Template task 
for inserting a data set into MySQL\n\n    Usage:\n    Subclass and override the required `host`, `database`, `user`,\n    `password`, `table` and `columns` attributes.\n\n    To customize how to access data from an input task, override the `rows` method\n    with a generator that yields each row as a tuple with fields ordered according to `columns`.\n    \"\"\"\n\n    def rows(self):\n        \"\"\"\n        Return/yield tuples or lists corresponding to each row to be inserted.\n        \"\"\"\n        with self.input().open(\"r\") as fobj:\n            for line in fobj:\n                yield line.strip(\"\\n\").split(\"\\t\")\n\n    # everything below will rarely have to be overridden\n\n    def output(self):\n        \"\"\"\n        Returns a MySqlTarget representing the inserted dataset.\n\n        Normally you don't override this.\n        \"\"\"\n        return MySqlTarget(host=self.host, database=self.database, user=self.user, password=self.password, table=self.table, update_id=self.update_id)\n\n    def copy(self, cursor, file=None):\n        values = \"({})\".format(\",\".join([\"%s\" for i in range(len(self.columns))]))\n        columns = \"({})\".format(\",\".join([c[0] for c in self.columns]))\n        query = \"INSERT INTO {} {} VALUES {}\".format(self.table, columns, values)\n        rows = []\n\n        for idx, row in enumerate(self.rows()):\n            rows.append(row)\n\n            if (idx + 1) % self.bulk_size == 0:\n                cursor.executemany(query, rows)\n                rows = []\n\n        cursor.executemany(query, rows)\n\n    def run(self):\n        \"\"\"\n        Inserts data generated by rows() into target table.\n\n        If the target table doesn't exist, self.create_table will be called to attempt to create the table.\n\n        Normally you don't want to override this.\n        \"\"\"\n        if not (self.table and self.columns):\n            raise Exception(\"table and columns need to be specified\")\n\n        
connection = self.output().connect()\n\n        # attempt to copy the data into mysql\n        # if it fails because the target table doesn't exist\n        # try to create it by running self.create_table\n        for attempt in range(2):\n            try:\n                cursor = connection.cursor()\n                print(\"caling init copy...\")\n                self.init_copy(connection)\n                self.copy(cursor)\n                self.post_copy(connection)\n                if self.enable_metadata_columns:\n                    self.post_copy_metacolumns(cursor)\n            except Error as err:\n                if err.errno == errorcode.ER_NO_SUCH_TABLE and attempt == 0:\n                    # if first attempt fails with \"relation not found\", try creating table\n                    # logger.info(\"Creating table %s\", self.table)\n                    connection.reconnect()\n                    self.create_table(connection)\n                else:\n                    raise\n            else:\n                break\n\n        # mark as complete in same transaction\n        self.output().touch(connection)\n        connection.commit()\n        connection.close()\n\n    @property\n    def bulk_size(self):\n        return 10000\n"
  },
  {
    "path": "luigi/contrib/opener.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"OpenerTarget support, allows easier testing and configuration by abstracting\nout the LocalTarget, S3Target, and MockTarget types.\n\nExample:\n\n.. code-block:: python\n\n    from luigi.contrib.opener import OpenerTarget\n\n    OpenerTarget('/local/path.txt')\n    OpenerTarget('s3://zefr/remote/path.txt')\n\n\"\"\"\n\nfrom __future__ import annotations\n\nimport json\nfrom urllib.parse import parse_qs, urlsplit\n\nfrom luigi.contrib.s3 import S3Target\nfrom luigi.local_target import LocalTarget\nfrom luigi.mock import MockTarget\nfrom luigi.target import FileSystemException\n\n__all__ = [\"OpenerError\", \"NoOpenerError\", \"InvalidQuery\", \"OpenerRegistry\", \"Opener\", \"MockOpener\", \"LocalOpener\", \"S3Opener\", \"opener\", \"OpenerTarget\"]\n\n\nclass OpenerError(FileSystemException):\n    \"\"\"The base exception thrown by openers\"\"\"\n\n    pass\n\n\nclass NoOpenerError(OpenerError):\n    \"\"\"Thrown when there is no opener for the given protocol\"\"\"\n\n    pass\n\n\nclass InvalidQuery(OpenerError):\n    \"\"\"Thrown when an opener is passed unexpected arguments\"\"\"\n\n    pass\n\n\nclass OpenerRegistry:\n    def __init__(self, openers=None):\n        \"\"\"An opener registry that  stores a number of opener objects used\n        to parse Target URIs\n\n        :param openers: A list of objects inherited from the 
Opener class.\n        :type openers: list\n\n        \"\"\"\n        if openers is None:\n            openers = []\n\n        self.registry = {}\n        self.openers = {}\n        self.default_opener = \"file\"\n        for opener in openers:\n            self.add(opener)\n\n    def get_opener(self, name):\n        \"\"\"Retrieve an opener for the given protocol\n\n        :param name: name of the opener to open\n        :type name: string\n        :raises NoOpenerError: if no opener has been registered of that name\n\n        \"\"\"\n        if name not in self.registry:\n            raise NoOpenerError(\"No opener for %s\" % name)\n        index = self.registry[name]\n        return self.openers[index]\n\n    def add(self, opener):\n        \"\"\"Adds an opener to the registry\n\n        :param opener: Opener object\n        :type opener: Opener inherited object\n\n        \"\"\"\n\n        index = len(self.openers)\n        self.openers[index] = opener\n        for name in opener.names:\n            self.registry[name] = index\n\n    def open(self, target_uri, **kwargs):\n        \"\"\"Open target uri.\n\n        :param target_uri: Uri to open\n        :type target_uri: string\n\n        :returns: Target object\n\n        \"\"\"\n        target = urlsplit(target_uri, scheme=self.default_opener)\n\n        opener = self.get_opener(target.scheme)\n        query = opener.conform_query(target.query)\n\n        target = opener.get_target(target.scheme, target.path, target.fragment, target.username, target.password, target.hostname, target.port, query, **kwargs)\n        target.opener_path = target_uri\n\n        return target\n\n\nclass Opener:\n    \"\"\"Base class for Opener objects.\"\"\"\n\n    # Dictionary of expected kwargs and flag for json loading values (bool/int)\n    allowed_kwargs: dict[str, bool] = {}\n    # Flag to filter out unexpected kwargs\n    filter_kwargs = True\n\n    @classmethod\n    def conform_query(cls, query):\n        \"\"\"Converts the 
query string from a target uri, uses\n        cls.allowed_kwargs, and cls.filter_kwargs to drive logic.\n\n        :param query: Unparsed query string\n        :type query: urllib.parse.unsplit(uri).query\n        :returns: Dictionary of parsed values, everything in cls.allowed_kwargs\n            with values set to True will be parsed as json strings.\n\n        \"\"\"\n        query = parse_qs(query, keep_blank_values=True)\n\n        # Remove any unexpected keywords from the query string.\n        if cls.filter_kwargs:\n            query = {x: y for x, y in query.items() if x in cls.allowed_kwargs}\n\n        for key, vals in query.items():\n            # Multiple values of the same name could be passed use first\n            # Also params without strings will be treated as true values\n            if cls.allowed_kwargs.get(key, False):\n                val = json.loads(vals[0] or \"true\")\n            else:\n                val = vals[0] or \"true\"\n\n            query[key] = val\n\n        return query\n\n    @classmethod\n    def get_target(cls, scheme, path, fragment, username, password, hostname, port, query, **kwargs):\n        \"\"\"Override this method to use values from the parsed uri to initialize\n        the expected target.\n\n        \"\"\"\n        raise NotImplementedError(\"get_target must be overridden\")\n\n\nclass MockOpener(Opener):\n    \"\"\"Mock target opener, works like LocalTarget but files are all in\n    memory.\n\n    example:\n    * mock://foo/bar.txt\n\n    \"\"\"\n\n    names = [\"mock\"]\n    allowed_kwargs = {\n        \"is_tmp\": True,\n        \"mirror_on_stderr\": True,\n        \"format\": False,\n    }\n\n    @classmethod\n    def get_target(cls, scheme, path, fragment, username, password, hostname, port, query, **kwargs):\n        full_path = (hostname or \"\") + path\n        query.update(kwargs)\n        return MockTarget(full_path, **query)\n\n\nclass LocalOpener(Opener):\n    \"\"\"Local filesystem opener, works with 
any valid system path. This\n    is the default opener and will be used if you don't indicate which opener.\n\n    examples:\n    * file://relative/foo/bar/baz.txt (opens a relative file)\n    * file:///home/user (opens a directory from a absolute path)\n    * foo/bar.baz (file:// is the default opener)\n\n    \"\"\"\n\n    names = [\"file\"]\n    allowed_kwargs = {\n        \"is_tmp\": True,\n        \"format\": False,\n    }\n\n    @classmethod\n    def get_target(cls, scheme, path, fragment, username, password, hostname, port, query, **kwargs):\n        full_path = (hostname or \"\") + path\n        query.update(kwargs)\n        return LocalTarget(full_path, **query)\n\n\nclass S3Opener(Opener):\n    \"\"\"Opens a target stored on Amazon S3 storage\n\n    examples:\n    * s3://bucket/foo/bar.txt\n    * s3://bucket/foo/bar.txt?aws_access_key_id=xxx&aws_secret_access_key=yyy\n\n    \"\"\"\n\n    names = [\"s3\", \"s3n\"]\n    allowed_kwargs = {\n        \"format\": False,\n        \"client\": True,\n    }\n    filter_kwargs = False\n\n    @classmethod\n    def get_target(cls, scheme, path, fragment, username, password, hostname, port, query, **kwargs):\n        query.update(kwargs)\n        return S3Target(\"{scheme}://{hostname}{path}\".format(scheme=scheme, hostname=hostname, path=path), **query)\n\n\nopener = OpenerRegistry(\n    [\n        MockOpener,\n        LocalOpener,\n        S3Opener,\n    ]\n)\n\nOpenerTarget = opener.open\n"
  },
  {
    "path": "luigi/contrib/pai.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2017 Open Targets\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nMicroSoft OpenPAI Job wrapper for Luigi.\n\n  \"OpenPAI is an open source platform that provides complete AI model training and resource management capabilities,\n  it is easy to extend and supports on-premise, cloud and hybrid environments in various scale.\"\n\nFor more information about OpenPAI : https://github.com/Microsoft/pai/, this task is tested against OpenPAI 0.7.1\n\nRequires:\n\n- requests: ``pip install requests``\n\nWritten and maintained by Liu, Dongqing (@liudongqing).\n\"\"\"\n\nimport abc\nimport json\nimport logging\nimport time\nfrom urllib.parse import urljoin\n\nimport luigi\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    import requests as rs\n    from requests.exceptions import HTTPError\n\nexcept ImportError:\n    logger.warning(\"requests is not installed. 
PaiTask requires requests.\")\n\n\ndef slot_to_dict(o):\n    o_dict = {}\n    for key in o.__slots__:\n        if not key.startswith(\"__\"):\n            value = getattr(o, key, None)\n            if value is not None:\n                o_dict[key] = value\n    return o_dict\n\n\nclass PaiJob:\n    \"\"\"\n    The Open PAI job definition.\n    Refer to here https://github.com/Microsoft/pai/blob/master/docs/job_tutorial.md\n    ::\n\n        {\n          \"jobName\":   String,\n          \"image\":     String,\n          \"authFile\":  String,\n          \"dataDir\":   String,\n          \"outputDir\": String,\n          \"codeDir\":   String,\n          \"virtualCluster\": String,\n          \"taskRoles\": [\n            {\n              \"name\":       String,\n              \"taskNumber\": Integer,\n              \"cpuNumber\":  Integer,\n              \"memoryMB\":   Integer,\n              \"shmMB\":      Integer,\n              \"gpuNumber\":  Integer,\n              \"portList\": [\n                {\n                  \"label\": String,\n                  \"beginAt\": Integer,\n                  \"portNumber\": Integer\n                }\n              ],\n              \"command\":    String,\n              \"minFailedTaskCount\": Integer,\n              \"minSucceededTaskCount\": Integer\n            }\n          ],\n          \"gpuType\": String,\n          \"retryCount\": Integer\n        }\n\n    \"\"\"\n\n    __slots__ = (\"jobName\", \"image\", \"authFile\", \"dataDir\", \"outputDir\", \"codeDir\", \"virtualCluster\", \"taskRoles\", \"gpuType\", \"retryCount\")\n\n    def __init__(self, jobName, image, tasks):\n        \"\"\"\n        Initialize a Job with required fields.\n\n        :param jobName: Name for the job, need to be unique\n        :param image: URL pointing to the Docker image for all tasks in the job\n        :param tasks: List of taskRole, one task role at least\n        \"\"\"\n        self.jobName = jobName\n        self.image = 
image\n        if isinstance(tasks, list) and len(tasks) != 0:\n            self.taskRoles = tasks\n        else:\n            raise TypeError(\"you must specify one task at least.\")\n\n\nclass Port:\n    __slots__ = (\"label\", \"beginAt\", \"portNumber\")\n\n    def __init__(self, label, begin_at=0, port_number=1):\n        \"\"\"\n        The Port definition for TaskRole\n\n        :param label: Label name for the port type, required\n        :param begin_at: The port to begin with in the port type, 0 for random selection, required\n        :param port_number: Number of ports for the specific type, required\n        \"\"\"\n        self.label = label\n        self.beginAt = begin_at\n        self.portNumber = port_number\n\n\nclass TaskRole:\n    __slots__ = (\"name\", \"taskNumber\", \"cpuNumber\", \"memoryMB\", \"shmMB\", \"gpuNumber\", \"portList\", \"command\", \"minFailedTaskCount\", \"minSucceededTaskCount\")\n\n    def __init__(self, name, command, taskNumber=1, cpuNumber=1, memoryMB=2048, shmMB=64, gpuNumber=0, portList=[]):\n        \"\"\"\n        The TaskRole of PAI\n\n        :param name: Name for the task role, need to be unique with other roles, required\n        :param command: Executable command for tasks in the task role, can not be empty, required\n        :param taskNumber: Number of tasks for the task role, no less than 1, required\n        :param cpuNumber: CPU number for one task in the task role, no less than 1, required\n        :param shmMB: Shared memory for one task in the task role, no more than memory size, required\n        :param memoryMB: Memory for one task in the task role, no less than 100, required\n        :param gpuNumber: GPU number for one task in the task role, no less than 0, required\n        :param portList: List of portType to use, optional\n        \"\"\"\n        self.name = name\n        self.command = command\n        self.taskNumber = taskNumber\n        self.cpuNumber = cpuNumber\n        self.memoryMB = 
memoryMB\n        self.shmMB = shmMB\n        self.gpuNumber = gpuNumber\n        self.portList = portList\n\n\nclass OpenPai(luigi.Config):\n    pai_url = luigi.Parameter(default=\"http://127.0.0.1:9186\", description=\"rest server url, default is http://127.0.0.1:9186\")\n    username = luigi.Parameter(default=\"admin\", description=\"your username\")\n    password = luigi.Parameter(default=None, description=\"your password\")\n    expiration = luigi.IntParameter(default=3600, description=\"expiration time in seconds\")\n\n\nclass PaiTask(luigi.Task):\n    __POLL_TIME = 5\n\n    @property\n    @abc.abstractmethod\n    def name(self):\n        \"\"\"Name for the job, need to be unique, required\"\"\"\n        return \"SklearnExample\"\n\n    @property\n    @abc.abstractmethod\n    def image(self):\n        \"\"\"URL pointing to the Docker image for all tasks in the job, required\"\"\"\n        return \"openpai/pai.example.sklearn\"\n\n    @property\n    @abc.abstractmethod\n    def tasks(self):\n        \"\"\"List of taskRole, one task role at least, required\"\"\"\n        return []\n\n    @property\n    def auth_file_path(self):\n        \"\"\"Docker registry authentication file existing on HDFS, optional\"\"\"\n        return None\n\n    @property\n    def data_dir(self):\n        \"\"\"Data directory existing on HDFS, optional\"\"\"\n        return None\n\n    @property\n    def code_dir(self):\n        \"\"\"Code directory existing on HDFS, should not contain any data and should be less than 200MB, optional\"\"\"\n        return None\n\n    @property\n    def output_dir(self):\n        \"\"\"Output directory on HDFS, $PAI_DEFAULT_FS_URI/$jobName/output will be used if not specified, optional\"\"\"\n        return \"$PAI_DEFAULT_FS_URI/{0}/output\".format(self.name)\n\n    @property\n    def virtual_cluster(self):\n        \"\"\"The virtual cluster job runs on. 
If omitted, the job will run on default virtual cluster, optional\"\"\"\n        return \"default\"\n\n    @property\n    def gpu_type(self):\n        \"\"\"Specify the GPU type to be used in the tasks. If omitted, the job will run on any gpu type, optional\"\"\"\n        return None\n\n    @property\n    def retry_count(self):\n        \"\"\"Job retry count, no less than 0, optional\"\"\"\n        return 0\n\n    def __init_token(self):\n        self.__openpai = OpenPai()\n\n        request_json = json.dumps({\"username\": self.__openpai.username, \"password\": self.__openpai.password, \"expiration\": self.__openpai.expiration})\n        logger.debug(\"Requesting token from OpenPai\")\n        response = rs.post(urljoin(self.__openpai.pai_url, \"/api/v1/token\"), headers={\"Content-Type\": \"application/json\"}, data=request_json)\n        logger.debug(\"Get token response {0}\".format(response.text))\n        if response.status_code != 200:\n            msg = \"Get token request failed, response is {}\".format(response.text)\n            logger.error(msg)\n            raise Exception(msg)\n        else:\n            self.__token = response.json()[\"token\"]\n\n    def __init__(self, *args, **kwargs):\n        \"\"\"\n        :param pai_url: The rest server url of PAI clusters, default is 'http://127.0.0.1:9186'.\n        :param token: The token used to auth the rest server of PAI.\n        \"\"\"\n        super(PaiTask, self).__init__(*args, **kwargs)\n        self.__init_token()\n\n    def __check_job_status(self):\n        response = rs.get(urljoin(self.__openpai.pai_url, \"/api/v1/jobs/{0}\".format(self.name)))\n        logger.debug(\"Check job response {0}\".format(response.text))\n        if response.status_code == 404:\n            msg = \"Job {0} is not found\".format(self.name)\n            logger.debug(msg)\n            raise HTTPError(msg, response=response)\n        elif response.status_code != 200:\n            msg = \"Get job request failed, response 
is {}\".format(response.text)\n            logger.error(msg)\n            raise HTTPError(msg, response=response)\n        job_state = response.json()[\"jobStatus\"][\"state\"]\n        if job_state in [\"UNKNOWN\", \"WAITING\", \"RUNNING\"]:\n            logger.debug(\"Job {0} is running in state {1}\".format(self.name, job_state))\n            return False\n        else:\n            msg = \"Job {0} finished in state {1}\".format(self.name, job_state)\n            logger.info(msg)\n            if job_state == \"SUCCEED\":\n                return True\n            else:\n                raise RuntimeError(msg)\n\n    def run(self):\n        job = PaiJob(self.name, self.image, self.tasks)\n        job.virtualCluster = self.virtual_cluster\n        job.authFile = self.auth_file_path\n        job.codeDir = self.code_dir\n        job.dataDir = self.data_dir\n        job.outputDir = self.output_dir\n        job.retryCount = self.retry_count\n        job.gpuType = self.gpu_type\n        request_json = json.dumps(job, default=slot_to_dict)\n        logger.debug(\"Submit job request {0}\".format(request_json))\n        response = rs.post(\n            urljoin(self.__openpai.pai_url, \"/api/v1/jobs\"),\n            headers={\"Content-Type\": \"application/json\", \"Authorization\": \"Bearer {}\".format(self.__token)},\n            data=request_json,\n        )\n        logger.debug(\"Submit job response {0}\".format(response.text))\n        # 202 is success for job submission, see https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md\n        if response.status_code != 202:\n            msg = \"Submit job failed, response code is {0}, body is {1}\".format(response.status_code, response.text)\n            logger.error(msg)\n            raise HTTPError(msg, response=response)\n        while not self.__check_job_status():\n            time.sleep(self.__POLL_TIME)\n\n    def output(self):\n        return luigi.contrib.hdfs.HdfsTarget(self.output())\n\n    def 
complete(self):\n        try:\n            return self.__check_job_status()\n        except HTTPError:\n            return False\n        except RuntimeError:\n            return False\n"
  },
  {
    "path": "luigi/contrib/pig.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nApache Pig support.\nExample configuration section in luigi.cfg::\n\n    [pig]\n    # pig home directory\n    home: /usr/share/pig\n\"\"\"\n\nimport logging\nimport os\nimport select\nimport signal\nimport subprocess\nimport sys\nimport tempfile\nfrom contextlib import contextmanager\n\nimport luigi\nfrom luigi import configuration\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass PigJobTask(luigi.Task):\n    def pig_home(self):\n        return configuration.get_config().get(\"pig\", \"home\", \"/usr/share/pig\")\n\n    def pig_command_path(self):\n        return os.path.join(self.pig_home(), \"bin/pig\")\n\n    def pig_env_vars(self):\n        \"\"\"\n        Dictionary of environment variables that should be set when running Pig.\n\n        Ex::\n            return { 'PIG_CLASSPATH': '/your/path' }\n        \"\"\"\n        return {}\n\n    def pig_properties(self):\n        \"\"\"\n        Dictionary of properties that should be set when running Pig.\n\n        Example::\n\n            return { 'pig.additional.jars':'/path/to/your/jar' }\n        \"\"\"\n        return {}\n\n    def pig_parameters(self):\n        \"\"\"\n        Dictionary of parameters that should be set for the Pig job.\n\n        Example::\n\n            return { 'YOUR_PARAM_NAME':'Your param value' }\n        \"\"\"\n        return {}\n\n    def 
pig_options(self):\n        \"\"\"\n        List of options that will be appended to the Pig command.\n\n        Example::\n\n            return ['-x', 'local']\n        \"\"\"\n        return []\n\n    def output(self):\n        raise NotImplementedError(\"subclass should define output path\")\n\n    def pig_script_path(self):\n        \"\"\"\n        Return the path to the Pig script to be run.\n        \"\"\"\n        raise NotImplementedError(\"subclass should define pig_script_path\")\n\n    @contextmanager\n    def _build_pig_cmd(self):\n        opts = self.pig_options()\n\n        def line(k, v):\n            return (\"%s=%s%s\" % (k, v, os.linesep)).encode(\"utf-8\")\n\n        with tempfile.NamedTemporaryFile() as param_file, tempfile.NamedTemporaryFile() as prop_file:\n            if self.pig_parameters():\n                items = self.pig_parameters().items()\n                param_file.writelines(line(k, v) for (k, v) in items)\n                param_file.flush()\n                opts.append(\"-param_file\")\n                opts.append(param_file.name)\n\n            if self.pig_properties():\n                items = self.pig_properties().items()\n                prop_file.writelines(line(k, v) for k, v in items)\n                prop_file.flush()\n                opts.append(\"-propertyFile\")\n                opts.append(prop_file.name)\n\n            cmd = [self.pig_command_path()] + opts + [\"-f\", self.pig_script_path()]\n\n            logger.info(subprocess.list2cmdline(cmd))\n            yield cmd\n\n    def run(self):\n        with self._build_pig_cmd() as cmd:\n            self.track_and_progress(cmd)\n\n    def track_and_progress(self, cmd):\n        temp_stdout = tempfile.TemporaryFile(\"wb\")\n        env = os.environ.copy()\n        env[\"PIG_HOME\"] = self.pig_home()\n        for k, v in self.pig_env_vars().items():\n            env[k] = v\n\n        proc = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, 
env=env)\n        reads = [proc.stderr.fileno(), proc.stdout.fileno()]\n        # tracking the possible problems with this job\n        err_lines = []\n        with PigRunContext():\n            while proc.poll() is None:\n                ret = select.select(reads, [], [])\n                for fd in ret[0]:\n                    if fd == proc.stderr.fileno():\n                        line = proc.stderr.readline().decode(\"utf8\")\n                        err_lines.append(line)\n                    if fd == proc.stdout.fileno():\n                        line_bytes = proc.stdout.readline()\n                        temp_stdout.write(line_bytes)\n                        line = line_bytes.decode(\"utf8\")\n\n                err_line = line.lower()\n                if err_line.find(\"More information at:\") != -1:\n                    logger.info(err_line.split(\"more information at: \")[-1].strip())\n                if err_line.find(\" - \"):\n                    t = err_line.split(\" - \")[-1].strip()\n                    if t != \"\":\n                        logger.info(t)\n\n        # Read the rest + stdout\n        err = \"\".join(err_lines + [an_err_line.decode(\"utf8\") for an_err_line in proc.stderr])\n        if proc.returncode == 0:\n            logger.info(\"Job completed successfully!\")\n        else:\n            logger.error(\"Error when running script:\\n%s\", self.pig_script_path())\n            logger.error(err)\n            raise PigJobError(\"Pig script failed with return value: %s\" % (proc.returncode,), err=err)\n\n\nclass PigRunContext:\n    def __init__(self):\n        self.job_id = None\n\n    def __enter__(self):\n        self.__old_signal = signal.getsignal(signal.SIGTERM)\n        signal.signal(signal.SIGTERM, self.kill_job)\n        return self\n\n    def kill_job(self, captured_signal=None, stack_frame=None):\n        if self.job_id:\n            logger.info(\"Job interrupted, killing job %s\", self.job_id)\n            
subprocess.call([\"pig\", \"-e\", '\"kill %s\"' % self.job_id])\n        if captured_signal is not None:\n            # adding 128 gives the exit code corresponding to a signal\n            sys.exit(128 + captured_signal)\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        if exc_type is KeyboardInterrupt:\n            self.kill_job()\n        signal.signal(signal.SIGTERM, self.__old_signal)\n\n\nclass PigJobError(RuntimeError):\n    def __init__(self, message, out=None, err=None):\n        super(PigJobError, self).__init__(message, out, err)\n        self.message = message\n        self.out = out\n        self.err = err\n\n    def __str__(self):\n        info = self.message\n        if self.out:\n            info += \"\\nSTDOUT: \" + str(self.out)\n        if self.err:\n            info += \"\\nSTDERR: \" + str(self.err)\n        return info\n"
  },
  {
    "path": "luigi/contrib/postgres.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nImplements a subclass of :py:class:`~luigi.target.Target` that writes data to Postgres.\nAlso provides a helper task to copy data into a Postgres table.\n\"\"\"\n\nimport datetime\nimport logging\nimport os\nimport re\nimport tempfile\n\nimport luigi\nfrom luigi.contrib import rdbms\n\nlogger = logging.getLogger(\"luigi-interface\")\n\nDB_DRIVER = os.environ.get(\"LUIGI_PGSQL_DRIVER\", \"psycopg2\")\n\nDB_ERROR_CODES = {}\nERROR_DUPLICATE_TABLE = \"duplicate_table\"\nERROR_UNDEFINED_TABLE = \"undefined_table\"\n\ndbapi = None\n\nif DB_DRIVER == \"psycopg2\":\n    try:\n        import psycopg2 as dbapi\n\n        def update_error_codes():\n            import psycopg2.errorcodes\n\n            DB_ERROR_CODES.update(\n                {\n                    psycopg2.errorcodes.DUPLICATE_TABLE: ERROR_DUPLICATE_TABLE,\n                    psycopg2.errorcodes.UNDEFINED_TABLE: ERROR_UNDEFINED_TABLE,\n                }\n            )\n\n        update_error_codes()\n    except ImportError:\n        pass\n\nif dbapi is None or DB_DRIVER == \"pg8000\":\n    try:\n        import pg8000.core\n        import pg8000.dbapi as dbapi  # noqa: F811\n\n        # pg8000 doesn't have an error code catalog so we need to make our own\n        # from https://www.postgresql.org/docs/8.2/errcodes-appendix.html\n        DB_ERROR_CODES.update({\"42P07\": 
ERROR_DUPLICATE_TABLE, \"42P01\": ERROR_UNDEFINED_TABLE})\n    except ImportError:\n        pass\n\n\nif dbapi is None:\n    logger.warning(\"Loading postgres module without psycopg2 nor pg8000 installed. Will crash at runtime if postgres functionality is used.\")\n\n\ndef _is_pg8000_error(exception):\n    try:\n        return (\n            isinstance(exception, dbapi.DatabaseError)\n            and isinstance(exception.args, tuple)\n            and isinstance(exception.args[0], dict)\n            and pg8000.core.RESPONSE_CODE in exception.args[0]\n        )\n    except NameError:\n        return False\n\n\ndef _pg8000_connection_reset(connection):\n    cursor = connection.cursor()\n    if connection.autocommit:\n        cursor.execute(\"DISCARD ALL\")\n    else:\n        cursor.execute(\"ABORT\")\n        cursor.execute(\"BEGIN TRANSACTION\")\n    cursor.close()\n\n\ndef db_error_code(exception):\n    try:\n        error_code = None\n        if hasattr(exception, \"pgcode\"):\n            error_code = exception.pgcode\n        elif _is_pg8000_error(exception):\n            error_code = exception.args[0][pg8000.core.RESPONSE_CODE]\n\n        return DB_ERROR_CODES.get(error_code)\n    except TypeError as error:\n        error.__cause__ = exception\n        raise error\n\n\nclass MultiReplacer:\n    \"\"\"\n    Object for one-pass replace of multiple words\n\n    Substituted parts will not be matched against other replace patterns, as opposed to when using multipass replace.\n    The order of the items in the replace_pairs input will dictate replacement precedence.\n\n    Constructor arguments:\n    replace_pairs -- list of 2-tuples which hold strings to be replaced and replace string\n\n    Usage:\n\n    .. 
code-block:: python\n\n        >>> replace_pairs = [(\"a\", \"b\"), (\"b\", \"c\")]\n        >>> MultiReplacer(replace_pairs)(\"abcd\")\n        'bccd'\n        >>> replace_pairs = [(\"ab\", \"x\"), (\"a\", \"x\")]\n        >>> MultiReplacer(replace_pairs)(\"ab\")\n        'x'\n        >>> replace_pairs.reverse()\n        >>> MultiReplacer(replace_pairs)(\"ab\")\n        'xb'\n    \"\"\"\n\n    # TODO: move to misc/util module\n\n    def __init__(self, replace_pairs):\n        \"\"\"\n        Initializes a MultiReplacer instance.\n\n        :param replace_pairs: list of 2-tuples which hold strings to be replaced and replace string.\n        :type replace_pairs: tuple\n        \"\"\"\n        replace_list = list(replace_pairs)  # make a copy in case input is iterable\n        self._replace_dict = dict(replace_list)\n        pattern = \"|\".join(re.escape(x) for x, y in replace_list)\n        self._search_re = re.compile(pattern)\n\n    def _replacer(self, match_object):\n        # this method is used as the replace function in the re.sub below\n        return self._replace_dict[match_object.group()]\n\n    def __call__(self, search_string):\n        # using function replacing for a per-result replace\n        return self._search_re.sub(self._replacer, search_string)\n\n\n# these are the escape sequences recognized by postgres COPY\n# according to http://www.postgresql.org/docs/8.1/static/sql-copy.html\ndefault_escape = MultiReplacer([(\"\\\\\", \"\\\\\\\\\"), (\"\\t\", \"\\\\t\"), (\"\\n\", \"\\\\n\"), (\"\\r\", \"\\\\r\"), (\"\\v\", \"\\\\v\"), (\"\\b\", \"\\\\b\"), (\"\\f\", \"\\\\f\")])\n\n\nclass PostgresTarget(luigi.Target):\n    \"\"\"\n    Target for a resource in Postgres.\n\n    This will rarely have to be directly instantiated by the user.\n    \"\"\"\n\n    marker_table = luigi.configuration.get_config().get(\"postgres\", \"marker-table\", \"table_updates\")\n\n    # if not supplied, fall back to default Postgres port\n    DEFAULT_DB_PORT = 5432\n\n    # 
Use DB side timestamps or client side timestamps in the marker_table\n    use_db_timestamps = True\n\n    def __init__(self, host, database, user, password, table, update_id, port=None):\n        \"\"\"\n        Args:\n            host (str): Postgres server address. Possibly a host:port string.\n            database (str): Database name\n            user (str): Database user\n            password (str): Password for specified user\n            update_id (str): An identifier for this data set\n            port (int): Postgres server port.\n\n        \"\"\"\n        if \":\" in host:\n            self.host, self.port = host.split(\":\")\n        else:\n            self.host = host\n            self.port = port or self.DEFAULT_DB_PORT\n        self.database = database\n        self.user = user\n        self.password = password\n        self.table = table\n        self.update_id = update_id\n\n    def __str__(self):\n        return self.table\n\n    def touch(self, connection=None):\n        \"\"\"\n        Mark this update as complete.\n\n        Important: If the marker table doesn't exist, the connection transaction will be aborted\n        and the connection reset.\n        Then the marker table will be created.\n        \"\"\"\n        self.create_marker_table()\n\n        if connection is None:\n            # TODO: test this\n            connection = self.connect()\n            connection.autocommit = True  # if connection created here, we commit it here\n\n        if self.use_db_timestamps:\n            connection.cursor().execute(\n                \"\"\"INSERT INTO {marker_table} (update_id, target_table)\n                   VALUES (%s, %s)\n                \"\"\".format(marker_table=self.marker_table),\n                (self.update_id, self.table),\n            )\n        else:\n            connection.cursor().execute(\n                \"\"\"INSERT INTO {marker_table} (update_id, target_table, inserted)\n                         VALUES (%s, %s, %s);\n         
           \"\"\".format(marker_table=self.marker_table),\n                (self.update_id, self.table, datetime.datetime.now()),\n            )\n\n    def exists(self, connection=None):\n        if connection is None:\n            connection = self.connect()\n            connection.autocommit = True\n        cursor = connection.cursor()\n        try:\n            cursor.execute(\n                \"\"\"SELECT 1 FROM {marker_table}\n                WHERE update_id = %s\n                LIMIT 1\"\"\".format(marker_table=self.marker_table),\n                (self.update_id,),\n            )\n            row = cursor.fetchone()\n        except dbapi.DatabaseError as e:\n            if db_error_code(e) == ERROR_UNDEFINED_TABLE:\n                row = None\n            else:\n                raise\n        return row is not None\n\n    def connect(self):\n        \"\"\"\n        Get a DBAPI 2.0 connection object to the database where the table is.\n        \"\"\"\n        connection = dbapi.connect(host=self.host, port=self.port, database=self.database, user=self.user, password=self.password)\n        connection.set_client_encoding(\"utf-8\")\n        return connection\n\n    def create_marker_table(self):\n        \"\"\"\n        Create marker table if it doesn't exist.\n\n        Using a separate connection since the transaction might have to be reset.\n        \"\"\"\n        connection = self.connect()\n        connection.autocommit = True\n        cursor = connection.cursor()\n        if self.use_db_timestamps:\n            sql = \"\"\" CREATE TABLE {marker_table} (\n                      update_id TEXT PRIMARY KEY,\n                      target_table TEXT,\n                      inserted TIMESTAMP DEFAULT NOW())\n                  \"\"\".format(marker_table=self.marker_table)\n        else:\n            sql = \"\"\" CREATE TABLE {marker_table} (\n                      update_id TEXT PRIMARY KEY,\n                      target_table TEXT,\n                      
inserted TIMESTAMP);\n                  \"\"\".format(marker_table=self.marker_table)\n\n        try:\n            cursor.execute(sql)\n        except dbapi.DatabaseError as e:\n            if db_error_code(e) == ERROR_DUPLICATE_TABLE:\n                pass\n            else:\n                raise\n        connection.close()\n\n    def open(self, mode):\n        raise NotImplementedError(\"Cannot open() PostgresTarget\")\n\n\nclass CopyToTable(rdbms.CopyToTable):\n    \"\"\"\n    Template task for inserting a data set into Postgres\n\n    Usage:\n    Subclass and override the required `host`, `database`, `user`,\n    `password`, `table` and `columns` attributes.\n\n    To customize how to access data from an input task, override the `rows` method\n    with a generator that yields each row as a tuple with fields ordered according to `columns`.\n    \"\"\"\n\n    def rows(self):\n        \"\"\"\n        Return/yield tuples or lists corresponding to each row to be inserted.\n        \"\"\"\n        with self.input().open(\"r\") as fobj:\n            for line in fobj:\n                yield line.strip(\"\\n\").split(\"\\t\")\n\n    def map_column(self, value):\n        \"\"\"\n        Applied to each column of every row returned by `rows`.\n\n        Default behaviour is to escape special characters and identify any self.null_values.\n        \"\"\"\n        if value in self.null_values:\n            return r\"\\\\N\"\n        else:\n            return default_escape(str(value))\n\n    # everything below will rarely have to be overridden\n\n    def output(self):\n        \"\"\"\n        Returns a PostgresTarget representing the inserted dataset.\n\n        Normally you don't override this.\n        \"\"\"\n        return PostgresTarget(\n            host=self.host, database=self.database, user=self.user, password=self.password, table=self.table, update_id=self.update_id, port=self.port\n        )\n\n    def copy(self, cursor, file):\n        if 
isinstance(self.columns[0], str):\n            column_names = self.columns\n        elif len(self.columns[0]) == 2:\n            column_names = [c[0] for c in self.columns]\n        else:\n            raise Exception(\"columns must consist of column strings or (column string, type string) tuples (was %r ...)\" % (self.columns[0],))\n\n        copy_sql = (\"COPY {table} ({column_list}) FROM STDIN WITH (FORMAT text, NULL '{null_string}', DELIMITER '{delimiter}')\").format(\n            table=self.table, delimiter=self.column_separator, null_string=r\"\\\\N\", column_list=\", \".join(column_names)\n        )\n        # cursor.copy_expert is not available in pg8000\n        if hasattr(cursor, \"copy_expert\"):\n            cursor.copy_expert(copy_sql, file)\n        else:\n            cursor.execute(copy_sql, stream=file)\n\n    def run(self):\n        \"\"\"\n        Inserts data generated by rows() into target table.\n\n        If the target table doesn't exist, self.create_table will be called to attempt to create the table.\n\n        Normally you don't want to override this.\n        \"\"\"\n        if not (self.table and self.columns):\n            raise Exception(\"table and columns need to be specified\")\n\n        connection = self.output().connect()\n        # transform all data generated by rows() using map_column and write data\n        # to a temporary file for import using postgres COPY\n        tmp_dir = luigi.configuration.get_config().get(\"postgres\", \"local-tmp-dir\", None)\n        tmp_file = tempfile.TemporaryFile(dir=tmp_dir)\n        n = 0\n        for row in self.rows():\n            n += 1\n            if n % 100000 == 0:\n                logger.info(\"Wrote %d lines\", n)\n            rowstr = self.column_separator.join(self.map_column(val) for val in row)\n            rowstr += \"\\n\"\n            tmp_file.write(rowstr.encode(\"utf-8\"))\n\n        logger.info(\"Done writing, importing at %s\", datetime.datetime.now())\n        
tmp_file.seek(0)\n\n        # attempt to copy the data into postgres\n        # if it fails because the target table doesn't exist\n        # try to create it by running self.create_table\n        for attempt in range(2):\n            try:\n                cursor = connection.cursor()\n                self.init_copy(connection)\n                self.copy(cursor, tmp_file)\n                self.post_copy(connection)\n                if self.enable_metadata_columns:\n                    self.post_copy_metacolumns(cursor)\n            except dbapi.DatabaseError as e:\n                if db_error_code(e) == ERROR_UNDEFINED_TABLE and attempt == 0:\n                    # if first attempt fails with \"relation not found\", try creating table\n                    logger.info(\"Creating table %s\", self.table)\n                    # reset() is a psycopg2-specific method\n                    if hasattr(connection, \"reset\"):\n                        connection.reset()\n                    else:\n                        _pg8000_connection_reset(connection)\n                    self.create_table(connection)\n                else:\n                    raise\n            else:\n                break\n\n        # mark as complete in same transaction\n        self.output().touch(connection)\n\n        # commit and clean up\n        connection.commit()\n        connection.close()\n        tmp_file.close()\n\n\nclass PostgresQuery(rdbms.Query):\n    \"\"\"\n    Template task for querying a Postgres compatible database\n\n    Usage:\n    Subclass and override the required `host`, `database`, `user`, `password`, `table`, and `query` attributes.\n    Optionally one can override the `autocommit` attribute to put the connection for the query in autocommit mode.\n\n    Override the `run` method if your use case requires some action with the query result.\n\n    Task instances require a dynamic `update_id`, e.g. 
via parameter(s), otherwise the query will only execute once\n\n    To customize the query signature as recorded in the database marker table, override the `update_id` property.\n    \"\"\"\n\n    def run(self):\n        connection = self.output().connect()\n        connection.autocommit = self.autocommit\n        cursor = connection.cursor()\n        sql = self.query\n\n        logger.info(\"Executing query from task: {name}\".format(name=self.__class__))\n        cursor.execute(sql)\n\n        # Update marker table\n        self.output().touch(connection)\n\n        # commit and close connection\n        connection.commit()\n        connection.close()\n\n    def output(self):\n        \"\"\"\n        Returns a PostgresTarget representing the executed query.\n\n        Normally you don't override this.\n        \"\"\"\n        return PostgresTarget(\n            host=self.host, database=self.database, user=self.user, password=self.password, table=self.table, update_id=self.update_id, port=self.port\n        )\n"
  },
  {
    "path": "luigi/contrib/presto.py",
    "content": "import inspect\nimport logging\nimport re\nfrom collections import OrderedDict\nfrom contextlib import closing\nfrom enum import Enum\nfrom time import sleep\n\nimport luigi\nfrom luigi.contrib import rdbms\nfrom luigi.task_register import Register\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    from pyhive.exc import DatabaseError\n    from pyhive.presto import Connection, Cursor\nexcept ImportError:\n    logger.warning(\"pyhive[presto] is not installed.\")\n\n\nclass presto(luigi.Config):  # NOQA\n    host = luigi.Parameter(default=\"localhost\", description=\"Presto host\")\n    port = luigi.IntParameter(default=8090, description=\"Presto port\")\n    user = luigi.Parameter(default=\"anonymous\", description=\"Presto user\")\n    catalog = luigi.Parameter(default=\"hive\", description=\"Default catalog\")\n    password = luigi.Parameter(default=None, description=\"User password\")\n    protocol = luigi.Parameter(default=\"https\", description=\"Presto connection protocol\")\n    poll_interval = luigi.FloatParameter(default=1.0, description=\" how often to ask the Presto REST interface for a progress update, defaults to a second\")\n\n\nclass PrestoClient:\n    \"\"\"\n    Helper class wrapping `pyhive.presto.Connection`\n    for executing presto queries and tracking progress\n    \"\"\"\n\n    def __init__(self, connection, sleep_time=1):\n        self.sleep_time = sleep_time\n        self._connection = connection\n        self._status = {\"state\": \"initial\"}\n\n    @property\n    def percentage_progress(self):\n        \"\"\"\n        :return: percentage of query overall progress\n        \"\"\"\n        return self._status.get(\"stats\", {}).get(\"progressPercentage\", 0.1)\n\n    @property\n    def info_uri(self):\n        \"\"\"\n        :return: query UI link\n        \"\"\"\n        return self._status.get(\"infoUri\")\n\n    def execute(self, query, parameters=None, mode=None):\n        \"\"\"\n\n        :param query: 
query to run\n        :param parameters: parameters should be injected in the query\n        :param mode: \"fetch\" - yields rows, \"watch\" - yields log entries\n        :return:\n        \"\"\"\n\n        class Mode(Enum):\n            watch = \"watch\"\n            fetch = \"fetch\"\n\n        _mode = Mode(mode) if mode else Mode.watch\n\n        with closing(self._connection.cursor()) as cursor:\n            cursor.execute(query, parameters)\n            status = self._status\n            while status:\n                sleep(self.sleep_time)\n                status = cursor.poll()\n                if status:\n                    if _mode == Mode.watch:\n                        yield status\n                    self._status = status\n\n            if _mode == Mode.fetch:\n                for row in cursor.fetchall():\n                    yield row\n\n\nclass WithPrestoClient(Register):\n    \"\"\"\n    A metaclass for injecting `PrestoClient` as a `_client` field into a new instance of class `T`\n    Presto connection options are taken from `T`-instance fields\n    Fields should have the same names as in `pyhive.presto.Cursor`\n    \"\"\"\n\n    def __new__(cls, name, bases, attrs):\n        def _client(self):\n            def _kwargs():\n                \"\"\"\n                replace to\n                ```\n                (_self, *args), *_ = inspect.getfullargspec(Cursor.__init__)\n                ```\n                after py2-deprecation\n                \"\"\"\n                args = inspect.getfullargspec(Cursor.__init__)[0][1:]\n                for parameter in args:\n                    val = getattr(self, parameter)\n                    if val:\n                        yield parameter, val\n\n            connection = Connection(**dict(_kwargs()))\n            return PrestoClient(connection=connection)\n\n        attrs.update({\"_client\": property(_client)})\n        return super(cls, WithPrestoClient).__new__(cls, name, bases, attrs)\n\n\nclass 
PrestoTarget(luigi.Target):\n    \"\"\"\n    Target for presto-accessible tables\n    \"\"\"\n\n    def __init__(self, client, catalog, database, table, partition=None):\n        self.catalog = catalog\n        self.database = database\n        self.table = table\n        self.partition = partition\n        self._client = client\n        self._count = None\n\n    def __str__(self):\n        return self.table\n\n    @property\n    def _count_query(self):\n        partition = OrderedDict(self.partition or {1: 1})\n\n        def _clauses():\n            for k in partition.keys():\n                yield \"{} = %s\".format(k)\n\n        clauses = \" AND \".join(_clauses())\n\n        query = \"SELECT COUNT(*) AS cnt FROM {}.{}.{} WHERE {} LIMIT 1\".format(self.catalog, self.database, self.table, clauses)\n        params = list(partition.values())\n        return query, params\n\n    def _table_doesnot_exist(self, exception):\n        pattern = re.compile(r\"line (\\d+):(\\d+): Table {}.{}.{} does not exist\".format(self.catalog, self.database, self.table))\n        try:\n            message = exception.message[\"message\"]\n            if pattern.match(message):\n                return True\n        finally:\n            return False\n\n    def count(self):\n        if not self._count:\n            \"\"\"\n            replace to\n            self._count, *_ = next(self._client.execute(*self.count_query, 'fetch'))\n            after py2 deprecation\n            \"\"\"\n            self._count = next(self._client.execute(*self._count_query, mode=\"fetch\"))[0]\n        return self._count\n\n    def exists(self):\n        \"\"\"\n\n        :return: `True` if given table exists and there are any rows in a given partition\n                 `False` if no rows in the partition exists or table is absent\n        \"\"\"\n        try:\n            return self.count() > 0\n        except DatabaseError as exception:\n            if self._table_doesnot_exist(exception):\n            
    return False\n        except Exception:\n            raise\n\n\nclass PrestoTask(rdbms.Query, metaclass=WithPrestoClient):\n    \"\"\"\n    Task for executing presto queries\n    During its executions tracking url and percentage progress are set\n    \"\"\"\n\n    _tracking_url_set = False\n\n    @property\n    def host(self):\n        return presto().host\n\n    @property\n    def port(self):\n        return presto().port\n\n    @property\n    def user(self):\n        return presto().user\n\n    @property\n    def username(self):\n        return self.user\n\n    @property\n    def schema(self):\n        return self.database\n\n    @property\n    def password(self):\n        return presto().password\n\n    @property\n    def catalog(self):\n        return presto().catalog\n\n    @property\n    def poll_interval(self):\n        return presto().poll_interval\n\n    @property\n    def source(self):\n        return \"pyhive\"\n\n    @property\n    def partition(self):\n        return None\n\n    @property\n    def protocol(self):\n        return \"https\" if self.password else presto().protocol\n\n    @property\n    def session_props(self):\n        return None\n\n    @property\n    def requests_session(self):\n        return None\n\n    @property\n    def requests_kwargs(self):\n        return {\"verify\": False}\n\n    query = None\n\n    def _maybe_set_tracking_url(self):\n        if not self._tracking_url_set:\n            self.set_tracking_url(self._client.info_uri)\n            self._tracking_url_set = True\n\n    def _set_progress(self):\n        self.set_progress_percentage(self._client.percentage_progress)\n\n    def run(self):\n        for _ in self._client.execute(self.query):\n            self._maybe_set_tracking_url()\n            self._set_progress()\n\n    def output(self):\n        return PrestoTarget(\n            client=self._client,\n            catalog=self.catalog,\n            database=self.database,\n            table=self.table,\n            
partition=self.partition,\n        )\n"
  },
  {
    "path": "luigi/contrib/prometheus_metric.py",
    "content": "from prometheus_client import CONTENT_TYPE_LATEST, CollectorRegistry, Counter, Gauge, generate_latest\n\nfrom luigi import parameter\nfrom luigi.metrics import MetricsCollector\nfrom luigi.task import Config\n\n\nclass prometheus(Config):\n    use_task_family_in_labels = parameter.BoolParameter(default=True, parsing=parameter.BoolParameter.EXPLICIT_PARSING)\n    task_parameters_to_use_in_labels = parameter.ListParameter(default=())\n\n\nclass PrometheusMetricsCollector(MetricsCollector):\n    def _generate_task_labels(self, task):\n        return {label: task.family if label == \"family\" else task.params.get(label) for label in self.labels}\n\n    def __init__(self, *args, **kwargs):\n        super(PrometheusMetricsCollector, self).__init__()\n        self.registry = CollectorRegistry()\n        config = prometheus(**kwargs)\n        self.labels = list(config.task_parameters_to_use_in_labels)\n        if config.use_task_family_in_labels:\n            self.labels += [\"family\"]\n        if not self.labels:\n            raise ValueError(\"Prometheus labels cannot be empty (see prometheus configuration)\")\n        self.task_started_counter = Counter(\"luigi_task_started_total\", \"number of started luigi tasks\", self.labels, registry=self.registry)\n        self.task_failed_counter = Counter(\"luigi_task_failed_total\", \"number of failed luigi tasks\", self.labels, registry=self.registry)\n        self.task_disabled_counter = Counter(\"luigi_task_disabled_total\", \"number of disabled luigi tasks\", self.labels, registry=self.registry)\n        self.task_done_counter = Counter(\"luigi_task_done_total\", \"number of done luigi tasks\", self.labels, registry=self.registry)\n        self.task_execution_time = Gauge(\"luigi_task_execution_time_seconds\", \"luigi task execution time in seconds\", self.labels, registry=self.registry)\n\n    def generate_latest(self):\n        return generate_latest(self.registry)\n\n    def handle_task_started(self, 
task):\n        self.task_started_counter.labels(**self._generate_task_labels(task)).inc()\n        self.task_execution_time.labels(**self._generate_task_labels(task))\n\n    def handle_task_failed(self, task):\n        self.task_failed_counter.labels(**self._generate_task_labels(task)).inc()\n        self.task_execution_time.labels(**self._generate_task_labels(task)).set(task.updated - task.time_running)\n\n    def handle_task_disabled(self, task, config):\n        self.task_disabled_counter.labels(**self._generate_task_labels(task)).inc()\n        self.task_execution_time.labels(**self._generate_task_labels(task)).set(task.updated - task.time_running)\n\n    def handle_task_done(self, task):\n        self.task_done_counter.labels(**self._generate_task_labels(task)).inc()\n        # time_running can be `None` if task was already complete\n        if task.time_running is not None:\n            self.task_execution_time.labels(**self._generate_task_labels(task)).set(task.updated - task.time_running)\n\n    def configure_http_handler(self, http_handler):\n        http_handler.set_header(\"Content-Type\", CONTENT_TYPE_LATEST)\n"
  },
  {
    "path": "luigi/contrib/pyspark_runner.py",
    "content": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2020 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nThe pyspark program.\n\nThis module will be run by spark-submit for PySparkTask jobs.\n\nThe first argument is a path to the pickled instance of the PySparkTask,\nother arguments are the ones returned by PySparkTask.app_options()\n\n\"\"\"\n\nimport abc\nimport logging\nimport os\nimport pickle\nimport sys\n\nfrom luigi import configuration\n\n# this prevents the modules in the directory of this script from shadowing global packages\nsys.path.append(sys.path.pop(0))\n\n\nclass _SparkEntryPoint(metaclass=abc.ABCMeta):\n    def __init__(self, conf):\n        self.conf = conf\n\n    @abc.abstractmethod\n    def __enter__(self):\n        pass\n\n    @abc.abstractmethod\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        pass\n\n\nclass SparkContextEntryPoint(_SparkEntryPoint):\n    sc = None\n\n    def __enter__(self):\n        from pyspark import SparkContext\n\n        self.sc = SparkContext(conf=self.conf)\n        return self.sc, self.sc\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        self.sc.stop()\n\n\nclass SparkSessionEntryPoint(_SparkEntryPoint):\n    spark = None\n\n    def _check_major_spark_version(self):\n        from pyspark import __version__ as spark_version\n\n        major_version = int(spark_version.split(\".\")[0])\n        if major_version < 2:\n            raise 
RuntimeError(\n                \"\"\"\n                Apache Spark {} does not support SparkSession entrypoint.\n                Try to set 'pyspark_runner.use_spark_session' to 'False' and switch to old-style syntax\n                \"\"\".format(spark_version)\n            )\n\n    def __enter__(self):\n        self._check_major_spark_version()\n        from pyspark.sql import SparkSession\n\n        self.spark = SparkSession.builder.config(conf=self.conf).enableHiveSupport().getOrCreate()\n\n        return self.spark, self.spark.sparkContext\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        self.spark.stop()\n\n\nclass AbstractPySparkRunner(object):\n    _entry_point_class = None\n\n    def __init__(self, job, *args):\n        # Append job directory to PYTHON_PATH to enable dynamic import\n        # of the module in which the class resides on unpickling\n        sys.path.append(os.path.dirname(job))\n        with open(job, \"rb\") as fd:\n            self.job = pickle.load(fd)\n        self.args = args\n\n    def run(self):\n        from pyspark import SparkConf\n\n        conf = SparkConf()\n        self.job.setup(conf)\n        with self._entry_point_class(conf=conf) as (entry_point, sc):\n            self.job.setup_remote(sc)\n            self.job.main(entry_point, *self.args)\n\n\ndef _pyspark_runner_with(name, entry_point_class):\n    return type(name, (AbstractPySparkRunner,), {\"_entry_point_class\": entry_point_class})\n\n\nPySparkRunner = _pyspark_runner_with(\"PySparkRunner\", SparkContextEntryPoint)\nPySparkSessionRunner = _pyspark_runner_with(\"PySparkSessionRunner\", SparkSessionEntryPoint)\n\n\ndef _use_spark_session():\n    return bool(configuration.get_config().get(\"pyspark_runner\", \"use_spark_session\", False))\n\n\ndef _get_runner_class():\n    if _use_spark_session():\n        return PySparkSessionRunner\n    return PySparkRunner\n\n\nif __name__ == \"__main__\":\n    logging.basicConfig(level=logging.WARN)\n    
_get_runner_class()(*sys.argv[1:]).run()\n"
  },
  {
    "path": "luigi/contrib/rdbms.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nA common module for postgres like databases, such as postgres or redshift\n\"\"\"\n\nfrom __future__ import annotations\n\nimport abc\nimport logging\nfrom typing import Any\n\nimport luigi\nimport luigi.task\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass _MetadataColumnsMixin:\n    \"\"\"Provide an additional behavior that adds columns and values to tables\n\n    This mixin is used to provide an additional behavior that allow a task to\n    add generic metadata columns to every table created for both PSQL and\n    Redshift.\n\n    Example:\n\n        This is a use-case example of how this mixin could come handy and how\n        to use it.\n\n        .. 
code:: python\n\n            class CommonMetaColumnsBehavior:\n                def update_report_execution_date_query(self):\n                    query = \"UPDATE {0} \" \\\n                            \"SET date_param = DATE '{1}' \" \\\n                            \"WHERE date_param IS NULL\".format(self.table, self.date)\n\n                    return query\n\n                @property\n                def metadata_columns(self):\n                    if self.date:\n                        cols.append(('date_param', 'VARCHAR'))\n\n                    return cols\n\n                @property\n                def metadata_queries(self):\n                    queries = [self.update_created_tz_query()]\n                    if self.date:\n                        queries.append(self.update_report_execution_date_query())\n\n                    return queries\n\n\n            class RedshiftCopyCSVToTableFromS3(CommonMetaColumnsBehavior, redshift.S3CopyToTable):\n                \"We have some business override here that would only add noise to the\n                example, so let's assume that this is only a shell.\"\n                pass\n\n\n            class UpdateTableA(RedshiftCopyCSVToTableFromS3):\n                date = luigi.Parameter()\n                table = 'tableA'\n\n                def queries():\n                    return [query_content_for('/queries/deduplicate_dupes.sql')]\n\n\n            class UpdateTableB(RedshiftCopyCSVToTableFromS3):\n                date = luigi.Parameter()\n                table = 'tableB'\n    \"\"\"\n\n    @property\n    def metadata_columns(self):\n        \"\"\"Returns the default metadata columns.\n\n        Those columns are columns that we want each tables to have by default.\n        \"\"\"\n        return []\n\n    @property\n    def metadata_queries(self):\n        return []\n\n    @property\n    def enable_metadata_columns(self):\n        return False\n\n    def _add_metadata_columns(self, connection):\n        cursor 
= connection.cursor()\n\n        for column in self.metadata_columns:\n            if len(column) == 0:\n                raise ValueError(\n                    \"_add_metadata_columns is unable to infer column information from column {column} for {table}\".format(column=column, table=self.table)\n                )\n\n            column_name = column[0]\n            if not self._column_exists(cursor, column_name):\n                logger.info(\"Adding missing metadata column {column} to {table}\".format(column=column, table=self.table))\n                self._add_column_to_table(cursor, column)\n\n    def _column_exists(self, cursor, column_name):\n        if \".\" in self.table:\n            schema, table = self.table.split(\".\")\n            query = (\n                \"SELECT 1 AS column_exists \"\n                \"FROM information_schema.columns \"\n                \"WHERE table_schema = LOWER('{0}') AND table_name = LOWER('{1}') AND column_name = LOWER('{2}') LIMIT 1;\".format(schema, table, column_name)\n            )\n        else:\n            query = \"SELECT 1 AS column_exists FROM information_schema.columns WHERE table_name = LOWER('{0}') AND column_name = LOWER('{1}') LIMIT 1;\".format(\n                self.table, column_name\n            )\n\n        cursor.execute(query)\n        result = cursor.fetchone()\n        return bool(result)\n\n    def _add_column_to_table(self, cursor, column):\n        if len(column) == 1:\n            raise ValueError(\"_add_column_to_table() column type not specified for {column}\".format(column=column[0]))\n        elif len(column) == 2:\n            query = \"ALTER TABLE {table} ADD COLUMN {column};\".format(table=self.table, column=\" \".join(column))\n        elif len(column) == 3:\n            query = \"ALTER TABLE {table} ADD COLUMN {column} ENCODE {encoding};\".format(table=self.table, column=\" \".join(column[0:2]), encoding=column[2])\n        else:\n            raise ValueError(\"_add_column_to_table() found 
no matching behavior for {column}\".format(column=column))\n\n        cursor.execute(query)\n\n    def post_copy_metacolumns(self, cursor):\n        logger.info(\"Executing post copy metadata queries\")\n        for query in self.metadata_queries:\n            cursor.execute(query)\n\n\nclass CopyToTable(luigi.task.MixinNaiveBulkComplete, _MetadataColumnsMixin, luigi.Task):\n    \"\"\"\n    An abstract task for inserting a data set into RDBMS.\n\n    Usage:\n\n        Subclass and override the following attributes:\n\n        * `host`,\n        * `database`,\n        * `user`,\n        * `password`,\n        * `table`\n        * `columns`\n        * `port`\n    \"\"\"\n\n    @property\n    @abc.abstractmethod\n    def host(self):\n        return None\n\n    @property\n    @abc.abstractmethod\n    def database(self):\n        return None\n\n    @property\n    @abc.abstractmethod\n    def user(self):\n        return None\n\n    @property\n    @abc.abstractmethod\n    def password(self):\n        return None\n\n    @property\n    @abc.abstractmethod\n    def table(self):\n        return None\n\n    @property\n    def port(self):\n        return None\n\n    # specify the columns that are to be inserted (same as are returned by columns)\n    # overload this in subclasses with the either column names of columns to import:\n    # e.g. ['id', 'username', 'inserted']\n    # or tuples with column name, postgres column type strings:\n    # e.g. 
[('id', 'SERIAL PRIMARY KEY'), ('username', 'VARCHAR(255)'), ('inserted', 'DATETIME')]\n    columns: list[Any] = []\n\n    # options\n    null_values = (None,)  # container of values that should be inserted as NULL values\n\n    column_separator = \"\\t\"  # how columns are separated in the file copied into postgres\n\n    def create_table(self, connection):\n        \"\"\"\n        Override to provide code for creating the target table.\n\n        By default it will be created using types (optionally) specified in columns.\n\n        If overridden, use the provided connection object for setting up the table in order to\n        create the table and insert data using the same transaction.\n        \"\"\"\n        if len(self.columns[0]) == 1:\n            # only names of columns specified, no types\n            raise NotImplementedError(\"create_table() not implemented for %r and columns types not specified\" % self.table)\n        elif len(self.columns[0]) == 2:\n            # if columns is specified as (name, type) tuples\n            coldefs = \",\".join(\"{name} {type}\".format(name=name, type=type) for name, type in self.columns)\n            query = \"CREATE TABLE {table} ({coldefs})\".format(table=self.table, coldefs=coldefs)\n            connection.cursor().execute(query)\n\n    @property\n    def update_id(self):\n        \"\"\"\n        This update id will be a unique identifier for this insert on this table.\n        \"\"\"\n        return self.task_id\n\n    @abc.abstractmethod\n    def output(self):\n        raise NotImplementedError(\"This method must be overridden\")\n\n    def init_copy(self, connection):\n        \"\"\"\n        Override to perform custom queries.\n\n        Any code here will be performed in the same transaction as the main copy, just prior to copying data.\n        Example use cases include truncating the table or removing all data older than X in the database\n        to keep a rolling window of data available in the table.\n       
 \"\"\"\n\n        # TODO: remove this after sufficient time so most people using the\n        # clear_table attribute will have noticed it doesn't work anymore\n        if hasattr(self, \"clear_table\"):\n            raise Exception(\"The clear_table attribute has been removed. Override init_copy instead!\")\n\n        if self.enable_metadata_columns:\n            self._add_metadata_columns(connection)\n\n    def post_copy(self, connection):\n        \"\"\"\n        Override to perform custom queries.\n\n        Any code here will be performed in the same transaction as the main copy, just after copying data.\n        Example use cases include cleansing data in temp table prior to insertion into real table.\n        \"\"\"\n        pass\n\n    @abc.abstractmethod\n    def copy(self, cursor, file):\n        raise NotImplementedError(\"This method must be overridden\")\n\n\nclass Query(luigi.task.MixinNaiveBulkComplete, luigi.Task):\n    \"\"\"\n    An abstract task for executing an RDBMS query.\n\n    Usage:\n\n        Subclass and override the following attributes:\n\n        * `host`,\n        * `database`,\n        * `user`,\n        * `password`,\n        * `table`,\n        * `query`\n\n        Optionally override:\n\n        * `port`,\n        * `autocommit`\n        * `update_id`\n\n        Subclass and override the following methods:\n\n        * `run`\n        * `output`\n    \"\"\"\n\n    @property\n    @abc.abstractmethod\n    def host(self):\n        \"\"\"\n        Host of the RDBMS. 
Implementation should support `hostname:port`\n        to encode port.\n        \"\"\"\n        return None\n\n    @property\n    def port(self):\n        \"\"\"\n        Override to specify port separately from host.\n        \"\"\"\n        return None\n\n    @property\n    @abc.abstractmethod\n    def database(self):\n        return None\n\n    @property\n    @abc.abstractmethod\n    def user(self):\n        return None\n\n    @property\n    @abc.abstractmethod\n    def password(self):\n        return None\n\n    @property\n    @abc.abstractmethod\n    def table(self):\n        return None\n\n    @property\n    @abc.abstractmethod\n    def query(self):\n        return None\n\n    @property\n    def autocommit(self):\n        return False\n\n    @property\n    def update_id(self):\n        \"\"\"\n        Override to create a custom marker table 'update_id' signature for Query subclass task instances\n        \"\"\"\n        return self.task_id\n\n    @abc.abstractmethod\n    def run(self):\n        raise NotImplementedError(\"This method must be overridden\")\n\n    @abc.abstractmethod\n    def output(self):\n        \"\"\"\n        Override with an RDBMS Target (e.g. PostgresTarget or RedshiftTarget) to record execution in a marker table\n        \"\"\"\n        raise NotImplementedError(\"This method must be overridden\")\n"
  },
  {
    "path": "luigi/contrib/redis_store.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport datetime\nimport logging\n\nfrom luigi.parameter import Parameter\nfrom luigi.target import Target\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    import redis\n\nexcept ImportError:\n    logger.warning(\"Loading redis_store module without redis installed. Will crash at runtime if redis_store functionality is used.\")\n\n\nclass RedisTarget(Target):\n    \"\"\"Target for a resource in Redis.\"\"\"\n\n    marker_prefix = Parameter(default=\"luigi\", config_path=dict(section=\"redis\", name=\"marker-prefix\"))\n\n    def __init__(self, host, port, db, update_id, password=None, socket_timeout=None, expire=None):\n        \"\"\"\n        :param host: Redis server host\n        :type host: str\n        :param port: Redis server port\n        :type port: int\n        :param db: database index\n        :type db: int\n        :param update_id: an identifier for this data hash\n        :type update_id: str\n        :param password: a password to connect to the redis server\n        :type password: str\n        :param socket_timeout: client socket timeout\n        :type socket_timeout: int\n        :param expire: timeout before the target is deleted\n        :type expire: int\n\n        \"\"\"\n        self.host = host\n        self.port = port\n        self.db = db\n        self.password = password\n        
self.socket_timeout = socket_timeout\n        self.update_id = update_id\n        self.expire = expire\n\n        self.redis_client = redis.StrictRedis(\n            host=self.host,\n            port=self.port,\n            password=self.password,\n            db=self.db,\n            socket_timeout=self.socket_timeout,\n        )\n\n    def __str__(self):\n        return self.marker_key()\n\n    def marker_key(self):\n        \"\"\"\n        Generate a key for the indicator hash.\n        \"\"\"\n        return \"%s:%s\" % (self.marker_prefix, self.update_id)\n\n    def touch(self):\n        \"\"\"\n        Mark this update as complete.\n\n        We index the parameters `update_id` and `date`.\n        \"\"\"\n        marker_key = self.marker_key()\n        self.redis_client.hset(marker_key, \"update_id\", self.update_id)\n        self.redis_client.hset(marker_key, \"date\", datetime.datetime.now().isoformat())\n\n        if self.expire is not None:\n            self.redis_client.expire(marker_key, self.expire)\n\n    def exists(self):\n        \"\"\"\n        Test, if this task has been run.\n        \"\"\"\n        return self.redis_client.exists(self.marker_key()) == 1\n"
  },
  {
    "path": "luigi/contrib/redshift.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport abc\nimport json\nimport logging\nimport os\nimport time\n\nimport luigi\nfrom luigi.contrib import postgres, rdbms\nfrom luigi.contrib.s3 import S3PathTask, S3Target\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\ntry:\n    import psycopg2\n    import psycopg2.errorcodes\nexcept ImportError:\n    logger.warning(\"Loading postgres module without psycopg2 installed. Will crash at runtime if postgres functionality is used.\")\n\n\nclass _CredentialsMixin:\n    \"\"\"\n    This mixin is used to provide the same credential properties\n    for AWS to all Redshift tasks. 
It also provides a helper method\n    to generate the credentials string for the task.\n    \"\"\"\n\n    @property\n    def configuration_section(self):\n        \"\"\"\n        Override to change the configuration section used\n        to obtain default credentials.\n        \"\"\"\n        return \"redshift\"\n\n    @property\n    def aws_access_key_id(self):\n        \"\"\"\n        Override to return the key id.\n        \"\"\"\n        return self._get_configuration_attribute(\"aws_access_key_id\")\n\n    @property\n    def aws_secret_access_key(self):\n        \"\"\"\n        Override to return the secret access key.\n        \"\"\"\n        return self._get_configuration_attribute(\"aws_secret_access_key\")\n\n    @property\n    def aws_account_id(self):\n        \"\"\"\n        Override to return the account id.\n        \"\"\"\n        return self._get_configuration_attribute(\"aws_account_id\")\n\n    @property\n    def aws_arn_role_name(self):\n        \"\"\"\n        Override to return the arn role name.\n        \"\"\"\n        return self._get_configuration_attribute(\"aws_arn_role_name\")\n\n    @property\n    def aws_session_token(self):\n        \"\"\"\n        Override to return the session token.\n        \"\"\"\n        return self._get_configuration_attribute(\"aws_session_token\")\n\n    def _get_configuration_attribute(self, attribute):\n        config = luigi.configuration.get_config()\n\n        value = config.get(self.configuration_section, attribute, default=None)\n\n        if not value:\n            value = os.environ.get(attribute.upper(), None)\n\n        return value\n\n    def _credentials(self):\n        \"\"\"\n        Return a credential string for the provided task. 
If no valid\n        credentials are set, raise a NotImplementedError.\n        \"\"\"\n\n        if self.aws_account_id and self.aws_arn_role_name:\n            return \"aws_iam_role=arn:aws:iam::{id}:role/{role}\".format(id=self.aws_account_id, role=self.aws_arn_role_name)\n        elif self.aws_access_key_id and self.aws_secret_access_key:\n            return \"aws_access_key_id={key};aws_secret_access_key={secret}{opt}\".format(\n                key=self.aws_access_key_id, secret=self.aws_secret_access_key, opt=\";token={}\".format(self.aws_session_token) if self.aws_session_token else \"\"\n            )\n        else:\n            raise NotImplementedError(\n                \"Missing Credentials. \"\n                \"Ensure one of the pairs of auth args below are set \"\n                \"in a configuration file, environment variables or by \"\n                \"being overridden in the task: \"\n                \"'aws_access_key_id' AND 'aws_secret_access_key' OR \"\n                \"'aws_account_id' AND 'aws_arn_role_name'\"\n            )\n\n\nclass RedshiftTarget(postgres.PostgresTarget):\n    \"\"\"\n    Target for a resource in Redshift.\n\n    Redshift is similar to postgres with a few adjustments\n    required by redshift.\n    \"\"\"\n\n    marker_table = luigi.configuration.get_config().get(\"redshift\", \"marker-table\", \"table_updates\")\n\n    # if not supplied, fall back to default Redshift port\n    DEFAULT_DB_PORT = 5439\n\n    use_db_timestamps = False\n\n\nclass S3CopyToTable(rdbms.CopyToTable, _CredentialsMixin):\n    \"\"\"\n    Template task for inserting a data set into Redshift from s3.\n\n    Usage:\n\n    * Subclass and override the required attributes:\n\n      * `host`,\n      * `database`,\n      * `user`,\n      * `password`,\n      * `table`,\n      * `columns`,\n      * `s3_load_path`.\n\n    * You can also override the attributes provided by the\n      CredentialsMixin if they are not supplied by your\n      configuration or 
environment variables.\n    \"\"\"\n\n    @abc.abstractmethod\n    def s3_load_path(self):\n        \"\"\"\n        Override to return the load path.\n        \"\"\"\n        return None\n\n    @property\n    @abc.abstractmethod\n    def copy_options(self):\n        \"\"\"\n        Add extra copy options, for example:\n\n        * TIMEFORMAT 'auto'\n        * IGNOREHEADER 1\n        * TRUNCATECOLUMNS\n        * IGNOREBLANKLINES\n        * DELIMITER '\\t'\n        \"\"\"\n        return \"\"\n\n    @property\n    def prune_table(self):\n        \"\"\"\n        Override to set equal to the name of the table which is to be pruned.\n        Intended to be used in conjunction with prune_column and prune_date\n        i.e. copy to temp table, prune production table to prune_column with a date greater than prune_date, then insert into production table from temp table\n        \"\"\"\n        return None\n\n    @property\n    def prune_column(self):\n        \"\"\"\n        Override to set equal to the column of the prune_table which is to be compared\n        Intended to be used in conjunction with prune_table and prune_date\n        i.e. copy to temp table, prune production table to prune_column with a date greater than prune_date, then insert into production table from temp table\n        \"\"\"\n        return None\n\n    @property\n    def prune_date(self):\n        \"\"\"\n        Override to set equal to the date by which prune_column is to be compared\n        Intended to be used in conjunction with prune_table and prune_column\n        i.e. 
copy to temp table, prune production table to prune_column with a date greater than prune_date, then insert into production table from temp table\n        \"\"\"\n        return None\n\n    @property\n    def table_attributes(self):\n        \"\"\"\n        Add extra table attributes, for example:\n\n        DISTSTYLE KEY\n        DISTKEY (MY_FIELD)\n        SORTKEY (MY_FIELD_2, MY_FIELD_3)\n        \"\"\"\n        return \"\"\n\n    @property\n    def table_constraints(self):\n        \"\"\"\n        Add extra table constraints, for example:\n\n        PRIMARY KEY (MY_FIELD, MY_FIELD_2)\n        UNIQUE KEY (MY_FIELD_3)\n        \"\"\"\n        return \"\"\n\n    @property\n    def do_truncate_table(self):\n        \"\"\"\n        Return True if table should be truncated before copying new data in.\n        \"\"\"\n        return False\n\n    def do_prune(self):\n        \"\"\"\n        Return True if prune_table, prune_column, and prune_date are implemented.\n        If only a subset of prune variables are overridden, an exception is raised to remind the user to implement all or none.\n        Prune (data newer than prune_date deleted) before copying new data in.\n        \"\"\"\n        if self.prune_table and self.prune_column and self.prune_date:\n            return True\n        elif self.prune_table or self.prune_column or self.prune_date:\n            raise Exception(\"override zero or all prune variables\")\n        else:\n            return False\n\n    @property\n    def table_type(self):\n        \"\"\"\n        Return table type (i.e. 
'temp').\n        \"\"\"\n        return \"\"\n\n    @property\n    def queries(self):\n        \"\"\"\n        Override to return a list of queries to be executed in order.\n        \"\"\"\n        return []\n\n    def truncate_table(self, connection):\n        query = \"truncate %s\" % self.table\n        cursor = connection.cursor()\n        try:\n            cursor.execute(query)\n        finally:\n            cursor.close()\n\n    def prune(self, connection):\n        query = \"delete from %s where %s >= %s\" % (self.prune_table, self.prune_column, self.prune_date)\n        cursor = connection.cursor()\n        try:\n            cursor.execute(query)\n        finally:\n            cursor.close()\n\n    def create_schema(self, connection):\n        \"\"\"\n        Will create the schema in the database\n        \"\"\"\n        if \".\" not in self.table:\n            return\n\n        query = \"CREATE SCHEMA IF NOT EXISTS {schema_name};\".format(schema_name=self.table.split(\".\")[0])\n        connection.cursor().execute(query)\n\n    def create_table(self, connection):\n        \"\"\"\n        Override to provide code for creating the target table.\n\n        By default it will be created using types (optionally)\n        specified in columns.\n\n        If overridden, use the provided connection object for\n        setting up the table in order to create the table and\n        insert data using the same transaction.\n        \"\"\"\n        if len(self.columns[0]) == 1:\n            # only names of columns specified, no types\n            raise NotImplementedError(\"create_table() not implemented for %r and columns types not specified\" % self.table)\n        elif len(self.columns[0]) == 2:\n            # if columns is specified as (name, type) tuples\n            coldefs = \",\".join(\"{name} {type}\".format(name=name, type=type) for name, type in self.columns)\n\n            table_constraints = \"\"\n            if self.table_constraints != \"\":\n          
      table_constraints = \", \" + self.table_constraints\n\n            query = (\"CREATE {type} TABLE {table} ({coldefs} {table_constraints}) {table_attributes}\").format(\n                type=self.table_type, table=self.table, coldefs=coldefs, table_constraints=table_constraints, table_attributes=self.table_attributes\n            )\n\n            connection.cursor().execute(query)\n        elif len(self.columns[0]) == 3:\n            # if columns is specified as (name, type, encoding) tuples\n            # possible column encodings: https://docs.aws.amazon.com/redshift/latest/dg/c_Compression_encodings.html\n            coldefs = \",\".join(\"{name} {type} ENCODE {encoding}\".format(name=name, type=type, encoding=encoding) for name, type, encoding in self.columns)\n\n            table_constraints = \"\"\n            if self.table_constraints != \"\":\n                table_constraints = \",\" + self.table_constraints\n\n            query = (\"CREATE {type} TABLE {table} ({coldefs} {table_constraints}) {table_attributes}\").format(\n                type=self.table_type, table=self.table, coldefs=coldefs, table_constraints=table_constraints, table_attributes=self.table_attributes\n            )\n\n            connection.cursor().execute(query)\n        else:\n            raise ValueError(\"create_table() found no columns for %r\" % self.table)\n\n    def run(self):\n        \"\"\"\n        If the target table doesn't exist, self.create_table\n        will be called to attempt to create the table.\n        \"\"\"\n        if not (self.table):\n            raise Exception(\"table need to be specified\")\n\n        path = self.s3_load_path()\n        output = self.output()\n        connection = output.connect()\n        cursor = connection.cursor()\n\n        self.init_copy(connection)\n        self.copy(cursor, path)\n        self.post_copy(cursor)\n\n        if self.enable_metadata_columns:\n            self.post_copy_metacolumns(cursor)\n\n        # update 
marker table\n        output.touch(connection)\n        connection.commit()\n\n        # commit and clean up\n        connection.close()\n\n    def copy(self, cursor, f):\n        \"\"\"\n        Defines copying from s3 into redshift.\n\n        If both key-based and role-based credentials are provided, role-based will be used.\n        \"\"\"\n        logger.info(\"Inserting file: %s\", f)\n        colnames = \"\"\n        if self.columns and len(self.columns) > 0:\n            colnames = \",\".join([x[0] for x in self.columns])\n            colnames = \"({})\".format(colnames)\n\n        cursor.execute(\n            \"\"\"\n         COPY {table} {colnames} from '{source}'\n         CREDENTIALS '{creds}'\n         {options}\n         ;\"\"\".format(table=self.table, colnames=colnames, source=f, creds=self._credentials(), options=self.copy_options)\n        )\n\n    def output(self):\n        \"\"\"\n        Returns a RedshiftTarget representing the inserted dataset.\n\n        Normally you don't override this.\n        \"\"\"\n        return RedshiftTarget(host=self.host, database=self.database, user=self.user, password=self.password, table=self.table, update_id=self.update_id)\n\n    def does_schema_exist(self, connection):\n        \"\"\"\n        Determine whether the schema already exists.\n        \"\"\"\n\n        if \".\" in self.table:\n            query = \"select 1 as schema_exists from pg_namespace where nspname = lower(%s) limit 1\"\n        else:\n            return True\n\n        cursor = connection.cursor()\n        try:\n            schema = self.table.split(\".\")[0]\n            cursor.execute(query, [schema])\n            result = cursor.fetchone()\n            return bool(result)\n        finally:\n            cursor.close()\n\n    def does_table_exist(self, connection):\n        \"\"\"\n        Determine whether the table already exists.\n        \"\"\"\n\n        if \".\" in self.table:\n            query = \"select 1 as table_exists from 
information_schema.tables where table_schema = lower(%s) and table_name = lower(%s) limit 1\"\n        else:\n            query = \"select 1 as table_exists from pg_table_def where tablename = lower(%s) limit 1\"\n        cursor = connection.cursor()\n        try:\n            cursor.execute(query, tuple(self.table.split(\".\")))\n            result = cursor.fetchone()\n            return bool(result)\n        finally:\n            cursor.close()\n\n    def init_copy(self, connection):\n        \"\"\"\n        Perform pre-copy sql - such as creating table, truncating, or removing data older than x.\n        \"\"\"\n        if not self.does_schema_exist(connection):\n            logger.info(\"Creating schema for %s\", self.table)\n            self.create_schema(connection)\n\n        if not self.does_table_exist(connection):\n            logger.info(\"Creating table %s\", self.table)\n            self.create_table(connection)\n\n        if self.enable_metadata_columns:\n            self._add_metadata_columns(connection)\n\n        if self.do_truncate_table:\n            logger.info(\"Truncating table %s\", self.table)\n            self.truncate_table(connection)\n\n        if self.do_prune():\n            logger.info(\"Removing %s older than %s from %s\", self.prune_column, self.prune_date, self.prune_table)\n            self.prune(connection)\n\n    def post_copy(self, cursor):\n        \"\"\"\n        Performs post-copy sql - such as cleansing data, inserting into production table (if copied to temp table), etc.\n        \"\"\"\n        logger.info(\"Executing post copy queries\")\n        for query in self.queries:\n            cursor.execute(query)\n\n    def post_copy_metacolums(self, cursor):\n        \"\"\"\n        Performs post-copy to fill metadata columns.\n        \"\"\"\n        logger.info(\"Executing post copy metadata queries\")\n        for query in self.metadata_queries:\n            cursor.execute(query)\n\n\nclass S3CopyJSONToTable(S3CopyToTable, 
_CredentialsMixin):\n    \"\"\"\n    Template task for inserting a JSON data set into Redshift from s3.\n\n    Usage:\n\n        * Subclass and override the required attributes:\n\n            * `host`,\n            * `database`,\n            * `user`,\n            * `password`,\n            * `table`,\n            * `columns`,\n            * `s3_load_path`,\n            * `jsonpath`,\n            * `copy_json_options`.\n\n    * You can also override the attributes provided by the\n      CredentialsMixin if they are not supplied by your\n      configuration or environment variables.\n    \"\"\"\n\n    @property\n    @abc.abstractmethod\n    def jsonpath(self):\n        \"\"\"\n        Override the jsonpath schema location for the table.\n        \"\"\"\n        return \"\"\n\n    @property\n    @abc.abstractmethod\n    def copy_json_options(self):\n        \"\"\"\n        Add extra copy options, for example:\n\n        * GZIP\n        * LZOP\n        \"\"\"\n        return \"\"\n\n    def copy(self, cursor, f):\n        \"\"\"\n        Defines copying JSON from s3 into redshift.\n        \"\"\"\n\n        logger.info(\"Inserting file: %s\", f)\n        cursor.execute(\n            \"\"\"\n         COPY %s from '%s'\n         CREDENTIALS '%s'\n         JSON AS '%s' %s\n         %s\n         ;\"\"\"\n            % (self.table, f, self._credentials(), self.jsonpath, self.copy_json_options, self.copy_options)\n        )\n\n\nclass RedshiftManifestTask(S3PathTask):\n    \"\"\"\n    Generic task to generate a manifest file that can be used\n    in S3CopyToTable in order to copy multiple files from your\n    s3 folder into a redshift table at once.\n\n    For full description on how to use the manifest file see\n    http://docs.aws.amazon.com/redshift/latest/dg/loading-data-files-using-manifest.html\n\n    Usage:\n\n        * requires parameters\n            * path - s3 path to the generated manifest file, including the\n                     name of the generated file\n   
                  to be copied into a redshift table\n            * folder_paths - s3 paths to the folders containing files you wish to be copied\n\n    Output:\n\n        * generated manifest file\n    \"\"\"\n\n    # should be overridden to point to a variety\n    # of folders you wish to copy from\n    folder_paths = luigi.Parameter()\n    text_target = True\n\n    def run(self):\n        entries = []\n        for folder_path in self.folder_paths:\n            s3 = S3Target(folder_path)\n            client = s3.fs\n            for file_name in client.list(s3.path):\n                entries.append({\"url\": \"%s/%s\" % (folder_path, file_name), \"mandatory\": True})\n        manifest = {\"entries\": entries}\n        target = self.output().open(\"w\")\n        dump = json.dumps(manifest)\n        if not self.text_target:\n            dump = dump.encode(\"utf8\")\n        target.write(dump)\n        target.close()\n\n\nclass KillOpenRedshiftSessions(luigi.Task):\n    \"\"\"\n    A task for killing any open Redshift sessions\n    in a given database. 
This is necessary to prevent open user sessions\n    with transactions against the table from blocking drop or truncate\n    table commands.\n\n    Usage:\n\n    Subclass and override the required `host`, `database`,\n    `user`, and `password` attributes.\n    \"\"\"\n\n    # time in seconds to wait before\n    # reconnecting to Redshift if our session is killed too.\n    # 30 seconds is usually fine; 60 is conservative\n    connection_reset_wait_seconds = luigi.IntParameter(default=60)\n\n    @property\n    @abc.abstractmethod\n    def host(self):\n        return None\n\n    @property\n    @abc.abstractmethod\n    def database(self):\n        return None\n\n    @property\n    @abc.abstractmethod\n    def user(self):\n        return None\n\n    @property\n    @abc.abstractmethod\n    def password(self):\n        return None\n\n    @property\n    def update_id(self):\n        \"\"\"\n        This update id will be a unique identifier\n        for this insert on this table.\n        \"\"\"\n        return self.task_id\n\n    def output(self):\n        \"\"\"\n        Returns a RedshiftTarget representing the inserted dataset.\n\n        Normally you don't override this.\n        \"\"\"\n        # uses class name as a meta-table\n        return RedshiftTarget(\n            host=self.host, database=self.database, user=self.user, password=self.password, table=self.__class__.__name__, update_id=self.update_id\n        )\n\n    def run(self):\n        \"\"\"\n        Kill any open Redshift sessions for the given database.\n        \"\"\"\n        connection = self.output().connect()\n        # kill any sessions other than ours and\n        # internal Redshift sessions (rdsdb)\n        query = \"select pg_terminate_backend(process) from STV_SESSIONS where db_name=%s and user_name != 'rdsdb' and process != pg_backend_pid()\"\n        cursor = connection.cursor()\n        logger.info(\"Killing all open Redshift sessions for database: %s\", self.database)\n        try:\n     
       cursor.execute(query, (self.database,))\n            cursor.close()\n            connection.commit()\n        except psycopg2.DatabaseError as e:\n            if e.message and \"EOF\" in e.message:\n                # sometimes this operation kills the current session.\n                # rebuild the connection. Need to pause for 30-60 seconds\n                # before Redshift will allow us back in.\n                connection.close()\n                logger.info(\"Pausing %s seconds for Redshift to reset connection\", self.connection_reset_wait_seconds)\n                time.sleep(self.connection_reset_wait_seconds)\n                logger.info(\"Reconnecting to Redshift\")\n                connection = self.output().connect()\n            else:\n                raise\n\n        try:\n            self.output().touch(connection)\n            connection.commit()\n        finally:\n            connection.close()\n\n        logger.info(\"Done killing all open Redshift sessions for database: %s\", self.database)\n\n\nclass RedshiftQuery(postgres.PostgresQuery):\n    \"\"\"\n    Template task for querying an Amazon Redshift database\n\n    Usage:\n    Subclass and override the required `host`, `database`, `user`, `password`, `table`, and `query` attributes.\n\n    Override the `run` method if your use case requires some action with the query result.\n\n    Task instances require a dynamic `update_id`, e.g. 
via parameter(s), otherwise the query will only execute once\n\n    To customize the query signature as recorded in the database marker table, override the `update_id` property.\n    \"\"\"\n\n    def output(self):\n        \"\"\"\n        Returns a RedshiftTarget representing the executed query.\n\n        Normally you don't override this.\n        \"\"\"\n        return RedshiftTarget(host=self.host, database=self.database, user=self.user, password=self.password, table=self.table, update_id=self.update_id)\n\n\nclass RedshiftUnloadTask(postgres.PostgresQuery, _CredentialsMixin):\n    \"\"\"\n    Template task for running UNLOAD on an Amazon Redshift database\n\n    Usage:\n    Subclass and override the required `host`, `database`, `user`, `password`, `table`, and `query` attributes.\n    Optionally, override the `autocommit` attribute to run the query in autocommit mode - this is necessary to run VACUUM for example.\n    Override the `run` method if your use case requires some action with the query result.\n    Task instances require a dynamic `update_id`, e.g. 
via parameter(s), otherwise the query will only execute once\n    To customize the query signature as recorded in the database marker table, override the `update_id` property.\n    You can also override the attributes provided by the CredentialsMixin if they are not supplied by your configuration or environment variables.\n    \"\"\"\n\n    @property\n    def s3_unload_path(self):\n        \"\"\"\n        Override to return the unload path.\n        \"\"\"\n        return \"\"\n\n    @property\n    def unload_options(self):\n        \"\"\"\n        Add extra or override default unload options:\n        \"\"\"\n        return \"DELIMITER '|' ADDQUOTES GZIP ALLOWOVERWRITE PARALLEL ON\"\n\n    @property\n    def unload_query(self):\n        \"\"\"\n        Default UNLOAD command\n        \"\"\"\n        return \"UNLOAD ( '{query}' ) TO '{s3_unload_path}' credentials '{credentials}' {unload_options};\"\n\n    def run(self):\n        connection = self.output().connect()\n        cursor = connection.cursor()\n\n        unload_query = self.unload_query.format(\n            query=self.query().replace(\"'\", r\"\\'\"), s3_unload_path=self.s3_unload_path, unload_options=self.unload_options, credentials=self._credentials()\n        )\n\n        logger.info(\"Executing unload query from task: {name}\".format(name=self.__class__))\n\n        cursor = connection.cursor()\n        cursor.execute(unload_query)\n        logger.info(cursor.statusmessage)\n\n        # Update marker table\n        self.output().touch(connection)\n        # commit and close connection\n        connection.commit()\n        connection.close()\n\n    def output(self):\n        \"\"\"\n        Returns a RedshiftTarget representing the executed query.\n\n        Normally you don't override this.\n        \"\"\"\n        return RedshiftTarget(host=self.host, database=self.database, user=self.user, password=self.password, table=self.table, update_id=self.update_id)\n"
  },
  {
    "path": "luigi/contrib/s3.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nImplementation of Simple Storage Service support.\n:py:class:`S3Target` is a subclass of the Target class to support S3 file\nsystem operations. The `boto3` library is required to use S3 targets.\n\"\"\"\n\nimport datetime\nimport itertools\nimport logging\nimport os\nimport os.path\nimport warnings\nfrom configparser import NoSectionError\nfrom multiprocessing.pool import ThreadPool\nfrom urllib.parse import urlsplit\n\nfrom luigi import configuration\nfrom luigi.format import get_default_format\nfrom luigi.parameter import OptionalParameter, Parameter\nfrom luigi.target import AtomicLocalFile, FileAlreadyExists, FileSystem, FileSystemException, FileSystemTarget, MissingParentDirectory\nfrom luigi.task import ExternalTask\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    import botocore\n    from boto3.s3.transfer import TransferConfig\nexcept ImportError:\n    logger.warning(\"Loading S3 module without the python package boto3. 
Will crash at runtime if S3 functionality is used.\")\n\n# two different ways of marking a directory\n# with a suffix in S3\nS3_DIRECTORY_MARKER_SUFFIX_0 = \"_$folder$\"\nS3_DIRECTORY_MARKER_SUFFIX_1 = \"/\"\n\n\nclass InvalidDeleteException(FileSystemException):\n    pass\n\n\nclass FileNotFoundException(FileSystemException):\n    pass\n\n\nclass DeprecatedBotoClientException(Exception):\n    pass\n\n\nclass S3Client(FileSystem):\n    \"\"\"\n    boto3-powered S3 client.\n    \"\"\"\n\n    _s3 = None\n    DEFAULT_PART_SIZE = 8388608\n    DEFAULT_THREADS = 100\n\n    def __init__(self, aws_access_key_id=None, aws_secret_access_key=None, aws_session_token=None, **kwargs):\n        options = self._get_s3_config()\n        options.update(kwargs)\n        if aws_access_key_id:\n            options[\"aws_access_key_id\"] = aws_access_key_id\n        if aws_secret_access_key:\n            options[\"aws_secret_access_key\"] = aws_secret_access_key\n        if aws_session_token:\n            options[\"aws_session_token\"] = aws_session_token\n\n        self._options = options\n\n    @property\n    def s3(self):\n        # only import boto3 when needed to allow top-lvl s3 module import\n        import boto3\n\n        options = dict(self._options)\n\n        if self._s3:\n            return self._s3\n\n        aws_access_key_id = options.get(\"aws_access_key_id\")\n        aws_secret_access_key = options.get(\"aws_secret_access_key\")\n\n        # Removing key args would break backwards compatibility\n        role_arn = options.get(\"aws_role_arn\")\n        role_session_name = options.get(\"aws_role_session_name\")\n\n        # In case the aws_session_token is provided use it\n        aws_session_token = options.get(\"aws_session_token\")\n\n        if role_arn and role_session_name:\n            sts_client = boto3.client(\"sts\")\n            assumed_role = sts_client.assume_role(RoleArn=role_arn, RoleSessionName=role_session_name)\n            aws_secret_access_key = 
assumed_role[\"Credentials\"].get(\"SecretAccessKey\")\n            aws_access_key_id = assumed_role[\"Credentials\"].get(\"AccessKeyId\")\n            aws_session_token = assumed_role[\"Credentials\"].get(\"SessionToken\")\n            logger.debug(\"using aws credentials via assumed role {} as defined in luigi config\".format(role_session_name))\n\n        for key in [\"aws_access_key_id\", \"aws_secret_access_key\", \"aws_role_session_name\", \"aws_role_arn\", \"aws_session_token\"]:\n            if key in options:\n                options.pop(key)\n\n        # At this stage, if no credentials provided, boto3 would handle their resolution for us\n        # For finding out about the order in which it tries to find these credentials\n        # please see here details\n        # http://boto3.readthedocs.io/en/latest/guide/configuration.html#configuring-credentials\n\n        if not (aws_access_key_id and aws_secret_access_key):\n            logger.debug(\"no credentials provided, delegating credentials resolution to boto3\")\n\n        try:\n            self._s3 = boto3.resource(\n                \"s3\", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, aws_session_token=aws_session_token, **options\n            )\n        except TypeError as e:\n            logger.error(e.args[0])\n            if \"got an unexpected keyword argument\" in e.args[0]:\n                raise DeprecatedBotoClientException(\"Now using boto3. 
Check that you're passing the correct arguments\")\n            raise\n\n        return self._s3\n\n    @s3.setter\n    def s3(self, value):\n        self._s3 = value\n\n    def exists(self, path):\n        \"\"\"\n        Does provided path exist on S3?\n        \"\"\"\n        (bucket, key) = self._path_to_bucket_and_key(path)\n\n        # root always exists\n        if self._is_root(key):\n            return True\n\n        # file\n        if self._exists(bucket, key):\n            return True\n\n        if self.isdir(path):\n            return True\n\n        logger.debug(\"Path %s does not exist\", path)\n        return False\n\n    def remove(self, path, recursive=True):\n        \"\"\"\n        Remove a file or directory from S3.\n        :param path: File or directory to remove\n        :param recursive: Boolean indicator to remove object and children\n        :return: Boolean indicator denoting success of the removal of 1 or more files\n        \"\"\"\n        if not self.exists(path):\n            logger.debug(\"Could not delete %s; path does not exist\", path)\n            return False\n\n        (bucket, key) = self._path_to_bucket_and_key(path)\n        s3_bucket = self.s3.Bucket(bucket)\n        # root\n        if self._is_root(key):\n            raise InvalidDeleteException(\"Cannot delete root of bucket at path %s\" % path)\n\n        # file\n        if self._exists(bucket, key):\n            self.s3.meta.client.delete_object(Bucket=bucket, Key=key)\n            logger.debug(\"Deleting %s from bucket %s\", key, bucket)\n            return True\n\n        if self.isdir(path) and not recursive:\n            raise InvalidDeleteException(\"Path %s is a directory. 
Must use recursive delete\" % path)\n\n        delete_key_list = [{\"Key\": obj.key} for obj in s3_bucket.objects.filter(Prefix=self._add_path_delimiter(key))]\n\n        # delete the directory marker file if it exists\n        if self._exists(bucket, \"{}{}\".format(key, S3_DIRECTORY_MARKER_SUFFIX_0)):\n            delete_key_list.append({\"Key\": \"{}{}\".format(key, S3_DIRECTORY_MARKER_SUFFIX_0)})\n\n        if len(delete_key_list) > 0:\n            n = 1000\n            for i in range(0, len(delete_key_list), n):\n                self.s3.meta.client.delete_objects(Bucket=bucket, Delete={\"Objects\": delete_key_list[i : i + n]})\n            return True\n\n        return False\n\n    def move(self, source_path, destination_path, **kwargs):\n        \"\"\"\n        Rename/move an object from one S3 location to another.\n        :param source_path: The `s3://` path of the directory or key to copy from\n        :param destination_path: The `s3://` path of the directory or key to copy to\n        :param kwargs: Keyword arguments are passed to the boto3 function `copy`\n        \"\"\"\n        self.copy(source_path, destination_path, **kwargs)\n        self.remove(source_path)\n\n    def get_key(self, path):\n        \"\"\"\n        Returns the object summary at the path\n        \"\"\"\n        (bucket, key) = self._path_to_bucket_and_key(path)\n\n        if self._exists(bucket, key):\n            return self.s3.ObjectSummary(bucket, key)\n\n    def put(self, local_path, destination_s3_path, **kwargs):\n        \"\"\"\n        Put an object stored locally to an S3 path.\n        :param local_path: Path to source local file\n        :param destination_s3_path: URL for target S3 location\n        :param kwargs: Keyword arguments are passed to the boto function `put_object`\n        \"\"\"\n        self._check_deprecated_argument(**kwargs)\n\n        # put the file\n        self.put_multipart(local_path, destination_s3_path, **kwargs)\n\n    def put_string(self, 
content, destination_s3_path, **kwargs):\n        \"\"\"\n        Put a string to an S3 path.\n        :param content: Data str\n        :param destination_s3_path: URL for target S3 location\n        :param kwargs: Keyword arguments are passed to the boto3 function `put_object`\n        \"\"\"\n        self._check_deprecated_argument(**kwargs)\n        (bucket, key) = self._path_to_bucket_and_key(destination_s3_path)\n\n        # put the file\n        self.s3.meta.client.put_object(Key=key, Bucket=bucket, Body=content, **kwargs)\n\n    def put_multipart(self, local_path, destination_s3_path, part_size=DEFAULT_PART_SIZE, **kwargs):\n        \"\"\"\n        Put an object stored locally to an S3 path\n        using S3 multi-part upload (for files > 8Mb).\n        :param local_path: Path to source local file\n        :param destination_s3_path: URL for target S3 location\n        :param part_size: Part size in bytes. Default: 8388608 (8MB)\n        :param kwargs: Keyword arguments are passed to the boto function `upload_fileobj` as ExtraArgs\n        \"\"\"\n        self._check_deprecated_argument(**kwargs)\n\n        from boto3.s3.transfer import TransferConfig\n\n        # default part size for boto3 is 8Mb, changing it to fit part_size\n        # provided as a parameter\n        transfer_config = TransferConfig(multipart_chunksize=part_size)\n\n        (bucket, key) = self._path_to_bucket_and_key(destination_s3_path)\n\n        self.s3.meta.client.upload_fileobj(Fileobj=open(local_path, \"rb\"), Bucket=bucket, Key=key, Config=transfer_config, ExtraArgs=kwargs)\n\n    def copy(self, source_path, destination_path, threads=DEFAULT_THREADS, start_time=None, end_time=None, part_size=DEFAULT_PART_SIZE, **kwargs):\n        \"\"\"\n        Copy object(s) from one S3 location to another. 
Works for individual keys or entire directories.\n        When files are larger than `part_size`, multipart uploading will be used.\n        :param source_path: The `s3://` path of the directory or key to copy from\n        :param destination_path: The `s3://` path of the directory or key to copy to\n        :param threads: Optional argument to define the number of threads to use when copying (min: 3 threads)\n        :param start_time: Optional argument to copy files with modified dates after start_time\n        :param end_time: Optional argument to copy files with modified dates before end_time\n        :param part_size: Part size in bytes\n        :param kwargs: Keyword arguments are passed to the boto function `copy` as ExtraArgs\n        :returns tuple (number_of_files_copied, total_size_copied_in_bytes)\n        \"\"\"\n\n        # don't allow threads to be less than 3\n        threads = 3 if threads < 3 else threads\n\n        if self.isdir(source_path):\n            return self._copy_dir(source_path, destination_path, threads=threads, start_time=start_time, end_time=end_time, part_size=part_size, **kwargs)\n\n        # If the file isn't a directory just perform a simple copy\n        else:\n            return self._copy_file(source_path, destination_path, threads=threads, part_size=part_size, **kwargs)\n\n    def _copy_file(self, source_path, destination_path, threads=DEFAULT_THREADS, part_size=DEFAULT_PART_SIZE, **kwargs):\n        src_bucket, src_key = self._path_to_bucket_and_key(source_path)\n        dst_bucket, dst_key = self._path_to_bucket_and_key(destination_path)\n        transfer_config = TransferConfig(max_concurrency=threads, multipart_chunksize=part_size)\n        item = self.get_key(source_path)\n        copy_source = {\"Bucket\": src_bucket, \"Key\": src_key}\n\n        self.s3.meta.client.copy(copy_source, dst_bucket, dst_key, Config=transfer_config, ExtraArgs=kwargs)\n\n        return 1, item.size\n\n    def _copy_dir(self, source_path, 
destination_path, threads=DEFAULT_THREADS, start_time=None, end_time=None, part_size=DEFAULT_PART_SIZE, **kwargs):\n        start = datetime.datetime.now()\n        copy_jobs = []\n        management_pool = ThreadPool(processes=threads)\n        transfer_config = TransferConfig(max_concurrency=threads, multipart_chunksize=part_size)\n        src_bucket, src_key = self._path_to_bucket_and_key(source_path)\n        dst_bucket, dst_key = self._path_to_bucket_and_key(destination_path)\n        src_prefix = self._add_path_delimiter(src_key)\n        dst_prefix = self._add_path_delimiter(dst_key)\n        key_path_len = len(src_prefix)\n        total_size_bytes = 0\n        total_keys = 0\n        for item in self.list(source_path, start_time=start_time, end_time=end_time, return_key=True):\n            path = item.key[key_path_len:]\n            # prevents copy attempt of empty key in folder\n            if path != \"\" and path != \"/\":\n                total_keys += 1\n                total_size_bytes += item.size\n                copy_source = {\"Bucket\": src_bucket, \"Key\": src_prefix + path}\n                the_kwargs = {\"Config\": transfer_config, \"ExtraArgs\": kwargs}\n                job = management_pool.apply_async(self.s3.meta.client.copy, args=(copy_source, dst_bucket, dst_prefix + path), kwds=the_kwargs)\n                copy_jobs.append(job)\n        # Wait for the pools to finish scheduling all the copies\n        management_pool.close()\n        management_pool.join()\n        # Raise any errors encountered in any of the copy processes\n        for result in copy_jobs:\n            result.get()\n        end = datetime.datetime.now()\n        duration = end - start\n        logger.info(\"%s : Complete : %s total keys copied in %s\" % (datetime.datetime.now(), total_keys, duration))\n        return total_keys, total_size_bytes\n\n    def get(self, s3_path, destination_local_path):\n        \"\"\"\n        Get an object stored in S3 and write it to a 
local path.\n        \"\"\"\n        (bucket, key) = self._path_to_bucket_and_key(s3_path)\n        # download the file\n        self.s3.meta.client.download_file(bucket, key, destination_local_path)\n\n    def get_as_bytes(self, s3_path):\n        \"\"\"\n        Get the contents of an object stored in S3 as bytes\n\n        :param s3_path: URL for target S3 location\n        :return: File contents as pure bytes\n        \"\"\"\n        (bucket, key) = self._path_to_bucket_and_key(s3_path)\n        obj = self.s3.Object(bucket, key)\n        contents = obj.get()[\"Body\"].read()\n        return contents\n\n    def get_as_string(self, s3_path, encoding=\"utf-8\"):\n        \"\"\"\n        Get the contents of an object stored in S3 as string.\n\n        :param s3_path: URL for target S3 location\n        :param encoding: Encoding to decode bytes to string\n        :return: File contents as a string\n        \"\"\"\n        content = self.get_as_bytes(s3_path)\n        return content.decode(encoding)\n\n    def isdir(self, path):\n        \"\"\"\n        Is the parameter S3 path a directory?\n        \"\"\"\n        (bucket, key) = self._path_to_bucket_and_key(path)\n\n        s3_bucket = self.s3.Bucket(bucket)\n\n        # root is a directory\n        if self._is_root(key):\n            return True\n\n        for suffix in (S3_DIRECTORY_MARKER_SUFFIX_0, S3_DIRECTORY_MARKER_SUFFIX_1):\n            try:\n                self.s3.meta.client.get_object(Bucket=bucket, Key=key + suffix)\n            except botocore.exceptions.ClientError as e:\n                if e.response[\"Error\"][\"Code\"] not in [\"NoSuchKey\", \"404\"]:\n                    raise\n            else:\n                return True\n\n        # files with this prefix\n        key_path = self._add_path_delimiter(key)\n        s3_bucket_list_result = list(itertools.islice(s3_bucket.objects.filter(Prefix=key_path), 1))\n        if s3_bucket_list_result:\n            return True\n\n        return False\n\n   
 is_dir = isdir  # compatibility with old version.\n\n    def mkdir(self, path, parents=True, raise_if_exists=False):\n        if raise_if_exists and self.isdir(path):\n            raise FileAlreadyExists()\n\n        bucket, key = self._path_to_bucket_and_key(path)\n        if self._is_root(key):\n            # isdir raises if the bucket doesn't exist; nothing to do here.\n            return\n\n        path = self._add_path_delimiter(path)\n\n        if not parents and not self.isdir(os.path.dirname(path)):\n            raise MissingParentDirectory()\n\n        return self.put_string(\"\", path)\n\n    def listdir(self, path, start_time=None, end_time=None, return_key=False):\n        \"\"\"\n        Get an iterable with S3 folder contents.\n        Iterable contains absolute paths for which queried path is a prefix.\n\n        :param path: URL for target S3 location\n        :param start_time: Optional argument to list files with modified (offset aware) datetime after start_time\n        :param end_time: Optional argument to list files with modified (offset aware) datetime before end_time\n        :param return_key: Optional argument, when set to True will return boto3's ObjectSummary (instead of the filename)\n        \"\"\"\n        (bucket, key) = self._path_to_bucket_and_key(path)\n\n        # grab and validate the bucket\n        s3_bucket = self.s3.Bucket(bucket)\n\n        key_path = self._add_path_delimiter(key)\n        key_path_len = len(key_path)\n        for item in s3_bucket.objects.filter(Prefix=key_path):\n            last_modified_date = item.last_modified\n            if (\n                # neither are defined, list all\n                (not start_time and not end_time)\n                or\n                # start defined, after start\n                (start_time and not end_time and start_time < last_modified_date)\n                or\n                # end defined, prior to end\n                (not start_time and end_time and 
last_modified_date < end_time)\n                or (start_time and end_time and start_time < last_modified_date < end_time)  # both defined, between\n            ):\n                if return_key:\n                    yield item\n                else:\n                    yield self._add_path_delimiter(path) + item.key[key_path_len:]\n\n    def list(self, path, start_time=None, end_time=None, return_key=False):  # backwards compat\n        \"\"\"\n        Get an iterable with S3 folder contents.\n        Iterable contains paths relative to queried path.\n\n        :param path: URL for target S3 location\n        :param start_time: Optional argument to list files with modified (offset aware) datetime after start_time\n        :param end_time: Optional argument to list files with modified (offset aware) datetime before end_time\n        :param return_key: Optional argument, when set to True will return boto3's ObjectSummary (instead of the filename)\n        \"\"\"\n        key_path_len = len(self._add_path_delimiter(path))\n        for item in self.listdir(path, start_time=start_time, end_time=end_time, return_key=return_key):\n            if return_key:\n                yield item\n            else:\n                yield item[key_path_len:]\n\n    @staticmethod\n    def _get_s3_config(key=None):\n        defaults = dict(configuration.get_config().defaults())\n        try:\n            config = dict(configuration.get_config().items(\"s3\"))\n        except (NoSectionError, KeyError):\n            return {}\n        # So what ports etc can be read without us having to specify all dtypes\n        for k, v in config.items():\n            try:\n                config[k] = int(v)\n            except ValueError:\n                pass\n        if key:\n            return config.get(key)\n        section_only = {k: v for k, v in config.items() if k not in defaults or v != defaults[k]}\n\n        return section_only\n\n    @staticmethod\n    def 
_path_to_bucket_and_key(path):\n        (scheme, netloc, path, query, fragment) = urlsplit(path, allow_fragments=False)\n        question_mark_plus_query = \"?\" + query if query else \"\"\n        path_without_initial_slash = path[1:] + question_mark_plus_query\n        return netloc, path_without_initial_slash\n\n    @staticmethod\n    def _is_root(key):\n        return (len(key) == 0) or (key == \"/\")\n\n    @staticmethod\n    def _add_path_delimiter(key):\n        return key if key[-1:] == \"/\" or key == \"\" else key + \"/\"\n\n    @staticmethod\n    def _check_deprecated_argument(**kwargs):\n        \"\"\"\n        If `encrypt_key` or `host` is part of the arguments raise an exception\n        :return: None\n        \"\"\"\n        if \"encrypt_key\" in kwargs:\n            raise DeprecatedBotoClientException(\"encrypt_key deprecated in boto3. Please refer to boto3 documentation for encryption details.\")\n        if \"host\" in kwargs:\n            raise DeprecatedBotoClientException(\n                \"host keyword deprecated and is replaced by region_name in boto3.\\n\"\n                \"example: region_name=us-west-1\\n\"\n                \"For region names, refer to the amazon S3 region documentation\\n\"\n                \"https://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region\"\n            )\n\n    def _exists(self, bucket, key):\n        try:\n            self.s3.Object(bucket, key).load()\n        except botocore.exceptions.ClientError as e:\n            if e.response[\"Error\"][\"Code\"] in [\"NoSuchKey\", \"404\"]:\n                return False\n            else:\n                raise\n\n        return True\n\n\nclass AtomicS3File(AtomicLocalFile):\n    \"\"\"\n    An S3 file that writes to a temp file and puts to S3 on close.\n\n    :param kwargs: Keyword arguments are passed to the boto function `initiate_multipart_upload`\n    \"\"\"\n\n    def __init__(self, path, s3_client, **kwargs):\n        self.s3_client = s3_client\n      
  super(AtomicS3File, self).__init__(path)\n        self.s3_options = kwargs\n\n    def move_to_final_destination(self):\n        self.s3_client.put_multipart(self.tmp_path, self.path, **self.s3_options)\n\n\nclass ReadableS3File:\n    def __init__(self, s3_key):\n        self.s3_key = s3_key.get()[\"Body\"]\n        self.buffer = []\n        self.closed = False\n        self.finished = False\n\n    def read(self, size=None):\n        f = self.s3_key.read(size)\n        return f\n\n    def close(self):\n        self.s3_key.close()\n        self.closed = True\n\n    def __del__(self):\n        self.close()\n\n    def __exit__(self, exc_type, exc, traceback):\n        self.close()\n\n    def __enter__(self):\n        return self\n\n    def _add_to_buffer(self, line):\n        self.buffer.append(line)\n\n    def _flush_buffer(self):\n        output = b\"\".join(self.buffer)\n        self.buffer = []\n        return output\n\n    def readable(self):\n        return True\n\n    def writable(self):\n        return False\n\n    def seekable(self):\n        return False\n\n    def __iter__(self):\n        key_iter = self.s3_key.__iter__()\n\n        has_next = True\n        while has_next:\n            try:\n                # grab the next chunk\n                chunk = next(key_iter)\n\n                # split on newlines, preserving the newline\n                for line in chunk.splitlines(True):\n                    if not line.endswith(os.linesep):\n                        # no newline, so store in buffer\n                        self._add_to_buffer(line)\n                    else:\n                        # newline found, send it out\n                        if self.buffer:\n                            self._add_to_buffer(line)\n                            yield self._flush_buffer()\n                        else:\n                            yield line\n            except StopIteration:\n                # send out anything we have left in the buffer\n                
output = self._flush_buffer()\n                if output:\n                    yield output\n                has_next = False\n        self.close()\n\n\nclass S3Target(FileSystemTarget):\n    \"\"\"\n    Target S3 file object\n\n    :param kwargs: Keyword arguments are passed to the boto function `initiate_multipart_upload`\n    \"\"\"\n\n    fs = None\n\n    def __init__(self, path, format=None, client=None, **kwargs):\n        super(S3Target, self).__init__(path)\n        if format is None:\n            format = get_default_format()\n\n        self.path = path\n        self.format = format\n        self.fs = client or S3Client()\n        self.s3_options = kwargs\n\n    def open(self, mode=\"r\"):\n        if mode not in (\"r\", \"w\"):\n            raise ValueError(\"Unsupported open mode '%s'\" % mode)\n\n        if mode == \"r\":\n            s3_key = self.fs.get_key(self.path)\n            if not s3_key:\n                raise FileNotFoundException(\"Could not find file at %s\" % self.path)\n\n            fileobj = ReadableS3File(s3_key)\n            return self.format.pipe_reader(fileobj)\n        else:\n            return self.format.pipe_writer(AtomicS3File(self.path, self.fs, **self.s3_options))\n\n\nclass S3FlagTarget(S3Target):\n    \"\"\"\n    Defines a target directory with a flag-file (defaults to `_SUCCESS`) used\n    to signify job success.\n\n    This checks for two things:\n\n    * the path exists (just like the S3Target)\n    * the _SUCCESS file exists within the directory.\n\n    Because Hadoop outputs into a directory and not a single file,\n    the path is assumed to be a directory.\n\n    This is meant to be a handy alternative to AtomicS3File.\n\n    The AtomicFile approach can be burdensome for S3 since there are no directories, per se.\n\n    If we have 1,000,000 output files, then we have to rename 1,000,000 objects.\n    \"\"\"\n\n    fs = None\n\n    def __init__(self, path, format=None, client=None, flag=\"_SUCCESS\"):\n        
\"\"\"\n        Initializes a S3FlagTarget.\n\n        :param path: the directory where the files are stored.\n        :type path: str\n        :param client:\n        :type client:\n        :param flag:\n        :type flag: str\n        \"\"\"\n        if format is None:\n            format = get_default_format()\n\n        if path[-1] != \"/\":\n            raise ValueError(\"S3FlagTarget requires the path to be to a directory.  It must end with a slash ( / ).\")\n        super(S3FlagTarget, self).__init__(path, format, client)\n        self.flag = flag\n\n    def exists(self):\n        hadoopSemaphore = self.path + self.flag\n        return self.fs.exists(hadoopSemaphore)\n\n\nclass S3EmrTarget(S3FlagTarget):\n    \"\"\"\n    Deprecated. Use :py:class:`S3FlagTarget`\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        warnings.warn(\"S3EmrTarget is deprecated. Please use S3FlagTarget\")\n        super(S3EmrTarget, self).__init__(*args, **kwargs)\n\n\nclass S3PathTask(ExternalTask):\n    \"\"\"\n    A external task that to require existence of a path in S3.\n    \"\"\"\n\n    path = Parameter()\n\n    def output(self):\n        return S3Target(self.path)\n\n\nclass S3EmrTask(ExternalTask):\n    \"\"\"\n    An external task that requires the existence of EMR output in S3.\n    \"\"\"\n\n    path = Parameter()\n\n    def output(self):\n        return S3EmrTarget(self.path)\n\n\nclass S3FlagTask(ExternalTask):\n    \"\"\"\n    An external task that requires the existence of EMR output in S3.\n    \"\"\"\n\n    path = Parameter()\n    flag = OptionalParameter(default=None)\n\n    def output(self):\n        return S3FlagTarget(self.path, flag=self.flag)\n"
  },
  {
    "path": "luigi/contrib/salesforce.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport abc\nimport csv\nimport logging\nimport re\nimport tempfile\nimport time\nimport warnings\nimport xml.etree.ElementTree as ET\nfrom collections import OrderedDict\nfrom urllib.parse import urlsplit\n\nimport luigi\nfrom luigi import Task\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    import requests\nexcept ImportError:\n    logger.warning(\"This module requires the python package 'requests'.\")\n\n\ndef get_soql_fields(soql):\n    \"\"\"\n    Gets queried columns names.\n    \"\"\"\n    soql_fields = re.search(\"(?<=select)(?s)(.*)(?=from)\", soql, re.IGNORECASE)  # get fields\n    soql_fields = re.sub(\" \", \"\", soql_fields.group())  # remove extra spaces\n    soql_fields = re.sub(\"\\t\", \"\", soql_fields)  # remove tabs\n    fields = re.split(\",|\\n|\\r|\", soql_fields)  # split on commas and newlines\n    fields = [field for field in fields if field != \"\"]  # remove empty strings\n    return fields\n\n\ndef ensure_utf(value):\n    return value.encode(\"utf-8\") if isinstance(value, str) else value\n\n\ndef parse_results(fields, data):\n    \"\"\"\n    Traverses ordered dictionary, calls _traverse_results() to recursively read into the dictionary depth of data\n    \"\"\"\n    master = []\n\n    for record in data[\"records\"]:  # for each 'record' in response\n        row = [None] * len(fields)  # create 
null list the length of number of columns\n        for obj, value in record.items():  # for each obj in record\n            if not isinstance(value, (dict, list, tuple)):  # if not data structure\n                if obj in fields:\n                    row[fields.index(obj)] = ensure_utf(value)\n\n            elif isinstance(value, dict) and obj != \"attributes\":  # traverse down into object\n                path = obj\n                _traverse_results(value, fields, row, path)\n\n        master.append(row)\n    return master\n\n\ndef _traverse_results(value, fields, row, path):\n    \"\"\"\n    Helper method for parse_results().\n\n    Traverses through ordered dict and recursively calls itself when encountering a dictionary\n    \"\"\"\n    for f, v in value.items():  # for each item in obj\n        field_name = \"{path}.{name}\".format(path=path, name=f) if path else f\n\n        if not isinstance(v, (dict, list, tuple)):  # if not data structure\n            if field_name in fields:\n                row[fields.index(field_name)] = ensure_utf(v)\n\n        elif isinstance(v, dict) and f != \"attributes\":  # it is a dict\n            _traverse_results(v, fields, row, field_name)\n\n\nclass salesforce(luigi.Config):\n    \"\"\"\n    Config system to get config vars from 'salesforce' section in configuration file.\n\n    Did not include sandbox_name here, as the user may have multiple sandboxes.\n    \"\"\"\n\n    username = luigi.Parameter(default=\"\")\n    password = luigi.Parameter(default=\"\")\n    security_token = luigi.Parameter(default=\"\")\n\n    # sandbox token\n    sb_security_token = luigi.Parameter(default=\"\")\n\n\nclass QuerySalesforce(Task):\n    @property\n    @abc.abstractmethod\n    def object_name(self):\n        \"\"\"\n        Override to return the SF object we are querying.\n        Must have the SF \"__c\" suffix if it is a customer object.\n        \"\"\"\n        return None\n\n    @property\n    def use_sandbox(self):\n        
\"\"\"\n        Override to specify use of SF sandbox.\n        True iff we should be uploading to a sandbox environment instead of the production organization.\n        \"\"\"\n        return False\n\n    @property\n    def sandbox_name(self):\n        \"\"\"Override to specify the sandbox name if it is intended to be used.\"\"\"\n        return None\n\n    @property\n    @abc.abstractmethod\n    def soql(self):\n        \"\"\"Override to return the raw string SOQL or the path to it.\"\"\"\n        return None\n\n    @property\n    def is_soql_file(self):\n        \"\"\"Override to True if soql property is a file path.\"\"\"\n        return False\n\n    @property\n    def content_type(self):\n        \"\"\"\n        Override to use a different content type. Salesforce allows XML, CSV, ZIP_CSV, or ZIP_XML. Defaults to CSV.\n        \"\"\"\n        return \"CSV\"\n\n    def run(self):\n        if self.use_sandbox and not self.sandbox_name:\n            raise Exception(\"Parameter sf_sandbox_name must be provided when uploading to a Salesforce Sandbox\")\n\n        sf = SalesforceAPI(salesforce().username, salesforce().password, salesforce().security_token, salesforce().sb_security_token, self.sandbox_name)\n\n        job_id = sf.create_operation_job(\"query\", self.object_name, content_type=self.content_type)\n        logger.info(\"Started query job %s in salesforce for object %s\" % (job_id, self.object_name))\n\n        batch_id = \"\"\n        msg = \"\"\n        try:\n            if self.is_soql_file:\n                with open(self.soql, \"r\") as infile:\n                    self.soql = infile.read()\n\n            batch_id = sf.create_batch(job_id, self.soql, self.content_type)\n            logger.info(\"Creating new batch %s to query: %s for job: %s.\" % (batch_id, self.object_name, job_id))\n            status = sf.block_on_batch(job_id, batch_id)\n            if status[\"state\"].lower() == \"failed\":\n                msg = \"Batch failed with message: 
%s\" % status[\"state_message\"]\n                logger.error(msg)\n                # don't raise exception if it's b/c of an included relationship\n                # normal query will execute (with relationship) after bulk job is closed\n                if \"foreign key relationships not supported\" not in status[\"state_message\"].lower():\n                    raise Exception(msg)\n            else:\n                result_ids = sf.get_batch_result_ids(job_id, batch_id)\n\n                # If there's only one result, just download it, otherwise we need to merge the resulting downloads\n                if len(result_ids) == 1:\n                    data = sf.get_batch_result(job_id, batch_id, result_ids[0])\n                    with open(self.output().path, \"wb\") as outfile:\n                        outfile.write(data)\n                else:\n                    # Download each file to disk, and then merge into one.\n                    # Preferring to do it this way so as to minimize memory consumption.\n                    for i, result_id in enumerate(result_ids):\n                        logger.info(\"Downloading batch result %s for batch: %s and job: %s\" % (result_id, batch_id, job_id))\n                        with open(\"%s.%d\" % (self.output().path, i), \"wb\") as outfile:\n                            outfile.write(sf.get_batch_result(job_id, batch_id, result_id))\n\n                    logger.info(\"Merging results of batch %s\" % batch_id)\n                    self.merge_batch_results(result_ids)\n        finally:\n            logger.info(\"Closing job %s\" % job_id)\n            sf.close_job(job_id)\n\n        if \"state_message\" in status and \"foreign key relationships not supported\" in status[\"state_message\"].lower():\n            logger.info(\"Retrying with REST API query\")\n            data_file = sf.query_all(self.soql)\n\n            reader = csv.reader(data_file)\n            with open(self.output().path, \"wb\") as outfile:\n          
      writer = csv.writer(outfile, dialect=\"excel\")\n                for row in reader:\n                    writer.writerow(row)\n\n    def merge_batch_results(self, result_ids):\n        \"\"\"\n        Merges the resulting files of a multi-result batch bulk query.\n        \"\"\"\n        outfile = open(self.output().path, \"w\")\n\n        if self.content_type.lower() == \"csv\":\n            for i, result_id in enumerate(result_ids):\n                with open(\"%s.%d\" % (self.output().path, i), \"r\") as f:\n                    header = f.readline()\n                    if i == 0:\n                        outfile.write(header)\n                    for line in f:\n                        outfile.write(line)\n        else:\n            raise Exception(\"Batch result merging not implemented for %s\" % self.content_type)\n\n        outfile.close()\n\n\nclass SalesforceAPI:\n    \"\"\"\n    Class used to interact with the SalesforceAPI.  Currently provides only the\n    methods necessary for performing a bulk upload operation.\n    \"\"\"\n\n    API_VERSION = 34.0\n    SOAP_NS = \"{urn:partner.soap.sforce.com}\"\n    API_NS = \"{http://www.force.com/2009/06/asyncapi/dataload}\"\n\n    def __init__(self, username, password, security_token, sb_token=None, sandbox_name=None):\n        self.username = username\n        self.password = password\n        self.security_token = security_token\n        self.sb_security_token = sb_token\n        self.sandbox_name = sandbox_name\n\n        if self.sandbox_name:\n            self.username += \".%s\" % self.sandbox_name\n\n        self.session_id = None\n        self.server_url = None\n        self.hostname = None\n\n    def start_session(self):\n        \"\"\"\n        Starts a Salesforce session and determines which SF instance to use for future requests.\n        \"\"\"\n        if self.has_active_session():\n            raise Exception(\"Session already in progress.\")\n\n        response = 
requests.post(self._get_login_url(), headers=self._get_login_headers(), data=self._get_login_xml())\n        response.raise_for_status()\n\n        root = ET.fromstring(response.text)\n        for e in root.iter(\"%ssessionId\" % self.SOAP_NS):\n            if self.session_id:\n                raise Exception(\"Invalid login attempt.  Multiple session ids found.\")\n            self.session_id = e.text\n\n        for e in root.iter(\"%sserverUrl\" % self.SOAP_NS):\n            if self.server_url:\n                raise Exception(\"Invalid login attempt.  Multiple server urls found.\")\n            self.server_url = e.text\n\n        if not self.has_active_session():\n            raise Exception(\"Invalid login attempt resulted in null sessionId [%s] and/or serverUrl [%s].\" % (self.session_id, self.server_url))\n        self.hostname = urlsplit(self.server_url).hostname\n\n    def has_active_session(self):\n        return self.session_id and self.server_url\n\n    def query(self, query, **kwargs):\n        \"\"\"\n        Return the result of a Salesforce SOQL query as a dict decoded from the Salesforce response JSON payload.\n\n        :param query: the SOQL query to send to Salesforce, e.g. \"SELECT id from Lead WHERE email = 'a@b.com'\"\n        \"\"\"\n        params = {\"q\": query}\n        response = requests.get(self._get_norm_query_url(), headers=self._get_rest_headers(), params=params, **kwargs)\n        if response.status_code != requests.codes.ok:\n            raise Exception(response.content)\n\n        return response.json()\n\n    def query_more(self, next_records_identifier, identifier_is_url=False, **kwargs):\n        \"\"\"\n        Retrieves more results from a query that returned more results\n        than the batch maximum. 
Returns a dict decoded from the Salesforce\n        response JSON payload.\n\n        :param next_records_identifier: either the Id of the next Salesforce\n                                     object in the result, or a URL to the\n                                     next record in the result.\n        :param identifier_is_url: True if `next_records_identifier` should be\n                               treated as a URL, False if\n                               `next_records_identifier` should be treated as\n                               an Id.\n        \"\"\"\n        if identifier_is_url:\n            # Don't use `self.base_url` here because the full URI is provided\n            url = \"https://{instance}{next_record_url}\".format(instance=self.hostname, next_record_url=next_records_identifier)\n        else:\n            url = self._get_norm_query_url() + \"{next_record_id}\"\n            url = url.format(next_record_id=next_records_identifier)\n        response = requests.get(url, headers=self._get_rest_headers(), **kwargs)\n\n        response.raise_for_status()\n\n        return response.json()\n\n    def query_all(self, query, **kwargs):\n        \"\"\"\n        Returns the full set of results for the `query`. 
This is a\n        convenience wrapper around `query(...)` and `query_more(...)`.\n        The returned dict is the decoded JSON payload from the final call to\n        Salesforce, but with the `totalSize` field representing the full\n        number of results retrieved and the `records` list representing the\n        full list of records retrieved.\n\n        :param query: the SOQL query to send to Salesforce, e.g.\n                   `SELECT Id FROM Lead WHERE Email = \"waldo@somewhere.com\"`\n        \"\"\"\n        # Make the initial query to Salesforce\n        response = self.query(query, **kwargs)\n\n        # get fields\n        fields = get_soql_fields(query)\n\n        # put fields and first page of results into a temp list to be written to TempFile\n        tmp_list = [fields]\n        tmp_list.extend(parse_results(fields, response))\n\n        tmp_dir = luigi.configuration.get_config().get(\"salesforce\", \"local-tmp-dir\", None)\n        tmp_file = tempfile.TemporaryFile(mode=\"a+b\", dir=tmp_dir)\n\n        writer = csv.writer(tmp_file)\n        writer.writerows(tmp_list)\n\n        # The number of results might have exceeded the Salesforce batch limit\n        # so check whether there are more results and retrieve them if so.\n\n        length = len(response[\"records\"])\n        while not response[\"done\"]:\n            response = self.query_more(response[\"nextRecordsUrl\"], identifier_is_url=True, **kwargs)\n\n            writer.writerows(parse_results(fields, response))\n            length += len(response[\"records\"])\n            if not length % 10000:\n                logger.info(\"Requested {0} lines...\".format(length))\n\n        logger.info(\"Requested a total of {0} lines.\".format(length))\n\n        tmp_file.seek(0)\n        return tmp_file\n\n    # Generic Rest Function\n    def restful(self, path, params):\n        \"\"\"\n        Allows you to make a direct REST call if you know the path\n        Arguments:\n        :param path: 
The path of the request. Example: sobjects/User/ABC123/password\n        :param params: dict of parameters to pass to the path\n        \"\"\"\n\n        url = self._get_norm_base_url() + path\n        response = requests.get(url, headers=self._get_rest_headers(), params=params)\n\n        if response.status_code != 200:\n            raise Exception(response)\n        json_result = response.json(object_pairs_hook=OrderedDict)\n        if len(json_result) == 0:\n            return None\n        else:\n            return json_result\n\n    def create_operation_job(self, operation, obj, external_id_field_name=None, content_type=None):\n        \"\"\"\n        Creates a new SF job for doing any operation (insert, upsert, update, delete, query)\n\n        :param operation: delete, insert, query, upsert, update, hardDelete. Must be lowercase.\n        :param obj: Parent SF object\n        :param external_id_field_name: Optional.\n        \"\"\"\n        if not self.has_active_session():\n            self.start_session()\n\n        response = requests.post(\n            self._get_create_job_url(),\n            headers=self._get_create_job_headers(),\n            data=self._get_create_job_xml(operation, obj, external_id_field_name, content_type),\n        )\n        response.raise_for_status()\n\n        root = ET.fromstring(response.text)\n        job_id = root.find(\"%sid\" % self.API_NS).text\n        return job_id\n\n    def get_job_details(self, job_id):\n        \"\"\"\n        Gets all details for existing job\n\n        :param job_id: job_id as returned by 'create_operation_job(...)'\n        :return: job info as xml\n        \"\"\"\n        response = requests.get(self._get_job_details_url(job_id))\n\n        response.raise_for_status()\n\n        return response\n\n    def abort_job(self, job_id):\n        \"\"\"\n        Abort an existing job. 
When a job is aborted, no more records are processed.\n        Changes to data may already have been committed and aren't rolled back.\n\n        :param job_id: job_id as returned by 'create_operation_job(...)'\n        :return: abort response as xml\n        \"\"\"\n        response = requests.post(self._get_abort_job_url(job_id), headers=self._get_abort_job_headers(), data=self._get_abort_job_xml())\n        response.raise_for_status()\n\n        return response\n\n    def close_job(self, job_id):\n        \"\"\"\n        Closes job\n\n        :param job_id: job_id as returned by 'create_operation_job(...)'\n        :return: close response as xml\n        \"\"\"\n        if not job_id or not self.has_active_session():\n            raise Exception(\"Can not close job without valid job_id and an active session.\")\n\n        response = requests.post(self._get_close_job_url(job_id), headers=self._get_close_job_headers(), data=self._get_close_job_xml())\n        response.raise_for_status()\n\n        return response\n\n    def create_batch(self, job_id, data, file_type):\n        \"\"\"\n        Creates a batch with either a string of data or a file containing data.\n\n        If a file is provided, this will pull the contents of the file_target into memory when running.\n        That shouldn't be a problem for any files that meet the Salesforce single batch upload\n        size limit (10MB) and is done to ensure compressed files can be uploaded properly.\n\n        :param job_id: job_id as returned by 'create_operation_job(...)'\n        :param data:\n\n        :return: Returns batch_id\n        \"\"\"\n        if not job_id or not self.has_active_session():\n            raise Exception(\"Can not create a batch without a valid job_id and an active session.\")\n\n        headers = self._get_create_batch_content_headers(file_type)\n        headers[\"Content-Length\"] = str(len(data))\n\n        response = requests.post(self._get_create_batch_url(job_id), 
headers=headers, data=data)\n        response.raise_for_status()\n\n        root = ET.fromstring(response.text)\n        batch_id = root.find(\"%sid\" % self.API_NS).text\n        return batch_id\n\n    def block_on_batch(self, job_id, batch_id, sleep_time_seconds=5, max_wait_time_seconds=-1):\n        \"\"\"\n        Blocks until @batch_id is completed or failed.\n        :param job_id:\n        :param batch_id:\n        :param sleep_time_seconds:\n        :param max_wait_time_seconds:\n        \"\"\"\n        if not job_id or not batch_id or not self.has_active_session():\n            raise Exception(\"Can not block on a batch without a valid batch_id, job_id and an active session.\")\n\n        start_time = time.time()\n        status = {}\n        while max_wait_time_seconds < 0 or time.time() - start_time < max_wait_time_seconds:\n            status = self._get_batch_info(job_id, batch_id)\n            logger.info(\n                \"Batch %s Job %s in state %s.  %s records processed.  %s records failed.\"\n                % (batch_id, job_id, status[\"state\"], status[\"num_processed\"], status[\"num_failed\"])\n            )\n            if status[\"state\"].lower() in [\"completed\", \"failed\"]:\n                return status\n            time.sleep(sleep_time_seconds)\n\n        raise Exception(\"Batch did not complete in %s seconds.  Final status was: %s\" % (sleep_time_seconds, status))\n\n    def get_batch_results(self, job_id, batch_id):\n        \"\"\"\n        DEPRECATED: Use `get_batch_result_ids`\n        \"\"\"\n        warnings.warn(\"get_batch_results is deprecated and only returns one batch result. 
Please use get_batch_result_ids\")\n        return self.get_batch_result_ids(job_id, batch_id)[0]\n\n    def get_batch_result_ids(self, job_id, batch_id):\n        \"\"\"\n        Get result IDs of a batch that has completed processing.\n\n        :param job_id: job_id as returned by 'create_operation_job(...)'\n        :param batch_id: batch_id as returned by 'create_batch(...)'\n        :return: list of batch result IDs to be used in 'get_batch_result(...)'\n        \"\"\"\n        response = requests.get(self._get_batch_results_url(job_id, batch_id), headers=self._get_batch_info_headers())\n        response.raise_for_status()\n\n        root = ET.fromstring(response.text)\n        result_ids = [r.text for r in root.findall(\"%sresult\" % self.API_NS)]\n\n        return result_ids\n\n    def get_batch_result(self, job_id, batch_id, result_id):\n        \"\"\"\n        Gets result back from Salesforce as whatever type was originally sent in create_batch (xml, or csv).\n        :param job_id:\n        :param batch_id:\n        :param result_id:\n\n        \"\"\"\n        response = requests.get(self._get_batch_result_url(job_id, batch_id, result_id), headers=self._get_session_headers())\n        response.raise_for_status()\n\n        return response.content\n\n    def _get_batch_info(self, job_id, batch_id):\n        response = requests.get(self._get_batch_info_url(job_id, batch_id), headers=self._get_batch_info_headers())\n        response.raise_for_status()\n\n        root = ET.fromstring(response.text)\n\n        result = {\n            \"state\": root.find(\"%sstate\" % self.API_NS).text,\n            \"num_processed\": root.find(\"%snumberRecordsProcessed\" % self.API_NS).text,\n            \"num_failed\": root.find(\"%snumberRecordsFailed\" % self.API_NS).text,\n        }\n        if root.find(\"%sstateMessage\" % self.API_NS) is not None:\n            result[\"state_message\"] = root.find(\"%sstateMessage\" % self.API_NS).text\n        return result\n\n    
def _get_login_url(self):\n        server = \"login\" if not self.sandbox_name else \"test\"\n        return \"https://%s.salesforce.com/services/Soap/u/%s\" % (server, self.API_VERSION)\n\n    def _get_base_url(self):\n        return \"https://%s/services\" % self.hostname\n\n    def _get_bulk_base_url(self):\n        # Expands on Base Url for Bulk\n        return \"%s/async/%s\" % (self._get_base_url(), self.API_VERSION)\n\n    def _get_norm_base_url(self):\n        # Expands on Base Url for Norm\n        return \"%s/data/v%s\" % (self._get_base_url(), self.API_VERSION)\n\n    def _get_norm_query_url(self):\n        # Expands on Norm Base Url\n        return \"%s/query\" % self._get_norm_base_url()\n\n    def _get_create_job_url(self):\n        # Expands on Bulk url\n        return \"%s/job\" % (self._get_bulk_base_url())\n\n    def _get_job_id_url(self, job_id):\n        # Expands on Job Creation url\n        return \"%s/%s\" % (self._get_create_job_url(), job_id)\n\n    def _get_job_details_url(self, job_id):\n        # Expands on basic Job Id url\n        return self._get_job_id_url(job_id)\n\n    def _get_abort_job_url(self, job_id):\n        # Expands on basic Job Id url\n        return self._get_job_id_url(job_id)\n\n    def _get_close_job_url(self, job_id):\n        # Expands on basic Job Id url\n        return self._get_job_id_url(job_id)\n\n    def _get_create_batch_url(self, job_id):\n        # Expands on basic Job Id url\n        return \"%s/batch\" % (self._get_job_id_url(job_id))\n\n    def _get_batch_info_url(self, job_id, batch_id):\n        # Expands on Batch Creation url\n        return \"%s/%s\" % (self._get_create_batch_url(job_id), batch_id)\n\n    def _get_batch_results_url(self, job_id, batch_id):\n        # Expands on Batch Info url\n        return \"%s/result\" % (self._get_batch_info_url(job_id, batch_id))\n\n    def _get_batch_result_url(self, job_id, batch_id, result_id):\n        # Expands on Batch Results url\n        return \"%s/%s\" 
% (self._get_batch_results_url(job_id, batch_id), result_id)\n\n    def _get_login_headers(self):\n        headers = {\"Content-Type\": \"text/xml; charset=UTF-8\", \"SOAPAction\": \"login\"}\n        return headers\n\n    def _get_session_headers(self):\n        headers = {\"X-SFDC-Session\": self.session_id}\n        return headers\n\n    def _get_norm_session_headers(self):\n        headers = {\"Authorization\": \"Bearer %s\" % self.session_id}\n        return headers\n\n    def _get_rest_headers(self):\n        headers = self._get_norm_session_headers()\n        headers[\"Content-Type\"] = \"application/json\"\n        return headers\n\n    def _get_job_headers(self):\n        headers = self._get_session_headers()\n        headers[\"Content-Type\"] = \"application/xml; charset=UTF-8\"\n        return headers\n\n    def _get_create_job_headers(self):\n        return self._get_job_headers()\n\n    def _get_abort_job_headers(self):\n        return self._get_job_headers()\n\n    def _get_close_job_headers(self):\n        return self._get_job_headers()\n\n    def _get_create_batch_content_headers(self, content_type):\n        headers = self._get_session_headers()\n        content_type = \"text/csv\" if content_type.lower() == \"csv\" else \"application/xml\"\n        headers[\"Content-Type\"] = \"%s; charset=UTF-8\" % content_type\n        return headers\n\n    def _get_batch_info_headers(self):\n        return self._get_session_headers()\n\n    def _get_login_xml(self):\n        return \"\"\"<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n            <env:Envelope xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\"\n                xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n                xmlns:env=\"http://schemas.xmlsoap.org/soap/envelope/\">\n              <env:Body>\n                <n1:login xmlns:n1=\"urn:partner.soap.sforce.com\">\n                  <n1:username>%s</n1:username>\n                  <n1:password>%s%s</n1:password>\n                
</n1:login>\n              </env:Body>\n            </env:Envelope>\n        \"\"\" % (self.username, self.password, self.security_token if self.sandbox_name is None else self.sb_security_token)\n\n    def _get_create_job_xml(self, operation, obj, external_id_field_name, content_type):\n        external_id_field_name_element = \"\" if not external_id_field_name else \"\\n<externalIdFieldName>%s</externalIdFieldName>\" % external_id_field_name\n\n        # Note: \"Unable to parse job\" error may be caused by reordering fields.\n        #       ExternalIdFieldName element must be before contentType element.\n        return \"\"\"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n            <jobInfo xmlns=\"http://www.force.com/2009/06/asyncapi/dataload\">\n                <operation>%s</operation>\n                <object>%s</object>\n                %s\n                <contentType>%s</contentType>\n            </jobInfo>\n        \"\"\" % (operation, obj, external_id_field_name_element, content_type)\n\n    def _get_abort_job_xml(self):\n        return \"\"\"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n            <jobInfo xmlns=\"http://www.force.com/2009/06/asyncapi/dataload\">\n              <state>Aborted</state>\n            </jobInfo>\n        \"\"\"\n\n    def _get_close_job_xml(self):\n        return \"\"\"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n            <jobInfo xmlns=\"http://www.force.com/2009/06/asyncapi/dataload\">\n              <state>Closed</state>\n            </jobInfo>\n        \"\"\"\n"
  },
  {
    "path": "luigi/contrib/scalding.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport logging\nimport os\nimport re\nimport subprocess\nimport warnings\n\nimport luigi.configuration\nimport luigi.contrib.hadoop\nimport luigi.contrib.hadoop_jar\nimport luigi.contrib.hdfs\nfrom luigi import LocalTarget\nfrom luigi.task import flatten\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\"\"\"\nScalding support for Luigi.\n\nExample configuration section in luigi.cfg::\n\n    [scalding]\n    # scala home directory, which should include a lib subdir with scala jars.\n    scala-home: /usr/share/scala\n\n    # scalding home directory, which should include a lib subdir with\n    # scalding-*-assembly-* jars as built from the official Twitter build script.\n    scalding-home: /usr/share/scalding\n\n    # provided dependencies, e.g. jars required for compiling but not executing\n    # scalding jobs. Currently required jars:\n    # org.apache.hadoop/hadoop-core/0.20.2\n    # org.slf4j/slf4j-log4j12/1.6.6\n    # log4j/log4j/1.2.15\n    # commons-httpclient/commons-httpclient/3.1\n    # commons-cli/commons-cli/1.2\n    # org.apache.zookeeper/zookeeper/3.3.4\n    scalding-provided: /usr/share/scalding/provided\n\n    # additional jars required.\n    scalding-libjars: /usr/share/scalding/libjars\n\"\"\"\n\n\nclass ScaldingJobRunner(luigi.contrib.hadoop.JobRunner):\n    \"\"\"\n    JobRunner for `pyscald` commands. 
Used to run a ScaldingJobTask.\n    \"\"\"\n\n    def __init__(self):\n        conf = luigi.configuration.get_config()\n\n        default = os.environ.get(\"SCALA_HOME\", \"/usr/share/scala\")\n        self.scala_home = conf.get(\"scalding\", \"scala-home\", default)\n\n        default = os.environ.get(\"SCALDING_HOME\", \"/usr/share/scalding\")\n        self.scalding_home = conf.get(\"scalding\", \"scalding-home\", default)\n        self.provided_dir = conf.get(\"scalding\", \"scalding-provided\", os.path.join(default, \"provided\"))\n        self.libjars_dir = conf.get(\"scalding\", \"scalding-libjars\", os.path.join(default, \"libjars\"))\n\n        self.tmp_dir = LocalTarget(is_tmp=True)\n\n    def _get_jars(self, path):\n        return [os.path.join(path, j) for j in os.listdir(path) if j.endswith(\".jar\")]\n\n    def get_scala_jars(self, include_compiler=False):\n        lib_dir = os.path.join(self.scala_home, \"lib\")\n        jars = [os.path.join(lib_dir, \"scala-library.jar\")]\n\n        # additional jar for scala 2.10 only\n        reflect = os.path.join(lib_dir, \"scala-reflect.jar\")\n        if os.path.exists(reflect):\n            jars.append(reflect)\n\n        if include_compiler:\n            jars.append(os.path.join(lib_dir, \"scala-compiler.jar\"))\n\n        return jars\n\n    def get_scalding_jars(self):\n        lib_dir = os.path.join(self.scalding_home, \"lib\")\n        return self._get_jars(lib_dir)\n\n    def get_scalding_core(self):\n        lib_dir = os.path.join(self.scalding_home, \"lib\")\n        for j in os.listdir(lib_dir):\n            if j.startswith(\"scalding-core-\"):\n                p = os.path.join(lib_dir, j)\n                logger.debug(\"Found scalding-core: %s\", p)\n                return p\n        raise luigi.contrib.hadoop.HadoopJobError(\"Could not find scalding-core.\")\n\n    def get_provided_jars(self):\n        return self._get_jars(self.provided_dir)\n\n    def get_libjars(self):\n        return 
self._get_jars(self.libjars_dir)\n\n    def get_tmp_job_jar(self, source):\n        job_name = os.path.basename(os.path.splitext(source)[0])\n        return os.path.join(self.tmp_dir.path, job_name + \".jar\")\n\n    def get_build_dir(self, source):\n        build_dir = os.path.join(self.tmp_dir.path, \"build\")\n        return build_dir\n\n    def get_job_class(self, source):\n        # find name of the job class\n        # usually the one that matches file name or last class that extends Job\n        job_name = os.path.splitext(os.path.basename(source))[0]\n        package = None\n        job_class = None\n        for line in open(source).readlines():\n            p = re.search(r\"package\\s+([^\\s\\(]+)\", line)\n            if p:\n                package = p.groups()[0]\n            p = re.search(r\"class\\s+([^\\s\\(]+).*extends\\s+.*Job\", line)\n            if p:\n                job_class = p.groups()[0]\n                if job_class == job_name:\n                    break\n        if job_class:\n            if package:\n                job_class = package + \".\" + job_class\n            logger.debug(\"Found scalding job class: %s\", job_class)\n            return job_class\n        else:\n            raise luigi.contrib.hadoop.HadoopJobError(\"Coudl not find scalding job class.\")\n\n    def build_job_jar(self, job):\n        job_jar = job.jar()\n        if job_jar:\n            if not os.path.exists(job_jar):\n                logger.error(\"Can't find jar: %s, full path %s\", job_jar, os.path.abspath(job_jar))\n                raise Exception(\"job jar does not exist\")\n            if not job.job_class():\n                logger.error(\"Undefined job_class()\")\n                raise Exception(\"Undefined job_class()\")\n            return job_jar\n\n        job_src = job.source()\n        if not job_src:\n            logger.error(\"Both source() and jar() undefined\")\n            raise Exception(\"Both source() and jar() undefined\")\n        if not 
os.path.exists(job_src):\n            logger.error(\"Can't find source: %s, full path %s\", job_src, os.path.abspath(job_src))\n            raise Exception(\"job source does not exist\")\n\n        job_src = job.source()\n        job_jar = self.get_tmp_job_jar(job_src)\n\n        build_dir = self.get_build_dir(job_src)\n        if not os.path.exists(build_dir):\n            os.makedirs(build_dir)\n\n        classpath = \":\".join(filter(None, self.get_scalding_jars() + self.get_provided_jars() + self.get_libjars() + job.extra_jars()))\n        scala_cp = \":\".join(self.get_scala_jars(include_compiler=True))\n\n        # compile scala source\n        arglist = [\"java\", \"-cp\", scala_cp, \"scala.tools.nsc.Main\", \"-classpath\", classpath, \"-d\", build_dir, job_src]\n        logger.info(\"Compiling scala source: %s\", subprocess.list2cmdline(arglist))\n        subprocess.check_call(arglist)\n\n        # build job jar file\n        arglist = [\"jar\", \"cf\", job_jar, \"-C\", build_dir, \".\"]\n        logger.info(\"Building job jar: %s\", subprocess.list2cmdline(arglist))\n        subprocess.check_call(arglist)\n        return job_jar\n\n    def run_job(self, job, tracking_url_callback=None):\n        if tracking_url_callback is not None:\n            warnings.warn(\"tracking_url_callback argument is deprecated, task.set_tracking_url is used instead.\", DeprecationWarning)\n\n        job_jar = self.build_job_jar(job)\n        jars = [job_jar] + self.get_libjars() + job.extra_jars()\n        scalding_core = self.get_scalding_core()\n        libjars = \",\".join(filter(None, jars))\n        arglist = luigi.contrib.hdfs.load_hadoop_cmd() + [\"jar\", scalding_core, \"-libjars\", libjars]\n        arglist += [\"-D%s\" % c for c in job.jobconfs()]\n\n        job_class = job.job_class() or self.get_job_class(job.source())\n        arglist += [job_class, \"--hdfs\"]\n\n        # scalding does not parse argument with '=' properly\n        arglist += [\"--name\", 
job.task_id.replace(\"=\", \":\")]\n\n        (tmp_files, job_args) = luigi.contrib.hadoop_jar.fix_paths(job)\n        arglist += job_args\n\n        env = os.environ.copy()\n        jars.append(scalding_core)\n        hadoop_cp = \":\".join(filter(None, jars))\n        env[\"HADOOP_CLASSPATH\"] = hadoop_cp\n        logger.info(\"Submitting Hadoop job: HADOOP_CLASSPATH=%s %s\", hadoop_cp, subprocess.list2cmdline(arglist))\n        luigi.contrib.hadoop.run_and_track_hadoop_job(arglist, job.set_tracking_url, env=env)\n\n        for a, b in tmp_files:\n            a.move(b)\n\n\nclass ScaldingJobTask(luigi.contrib.hadoop.BaseHadoopJobTask):\n    \"\"\"\n    A job task for Scalding that defines a scala source and (optional) main method.\n\n    requires() should return a dictionary where the keys are Scalding argument\n    names and values are sub tasks or lists of subtasks.\n\n    For example:\n\n    .. code-block:: python\n\n        {'input1': A, 'input2': C} => --input1 <Aoutput> --input2 <Coutput>\n        {'input1': [A, B], 'input2': [C]} => --input1 <Aoutput> <Boutput> --input2 <Coutput>\n    \"\"\"\n\n    def relpath(self, current_file, rel_path):\n        \"\"\"\n        Compute path given current file and relative path.\n        \"\"\"\n        script_dir = os.path.dirname(os.path.abspath(current_file))\n        rel_path = os.path.abspath(os.path.join(script_dir, rel_path))\n        return rel_path\n\n    def source(self):\n        \"\"\"\n        Path to the scala source for this Scalding Job\n\n        Either one of source() or jar() must be specified.\n        \"\"\"\n        return None\n\n    def jar(self):\n        \"\"\"\n        Path to the jar file for this Scalding Job\n\n        Either one of source() or jar() must be specified.\n        \"\"\"\n        return None\n\n    def extra_jars(self):\n        \"\"\"\n        Extra jars for building and running this Scalding Job.\n        \"\"\"\n        return []\n\n    def job_class(self):\n        \"\"\"\n 
       optional main job class for this Scalding Job.\n        \"\"\"\n        return None\n\n    def job_runner(self):\n        return ScaldingJobRunner()\n\n    def atomic_output(self):\n        \"\"\"\n        If True, then rewrite output arguments to be temp locations and\n        atomically move them into place after the job finishes.\n        \"\"\"\n        return True\n\n    def requires(self):\n        return {}\n\n    def job_args(self):\n        \"\"\"\n        Extra arguments to pass to the Scalding job.\n        \"\"\"\n        return []\n\n    def args(self):\n        \"\"\"\n        Returns an array of args to pass to the job.\n        \"\"\"\n        arglist = []\n        for k, v in self.requires_hadoop().items():\n            arglist.append(\"--\" + k)\n            arglist.extend([t.output().path for t in flatten(v)])\n        arglist.extend([\"--output\", self.output()])\n        arglist.extend(self.job_args())\n        return arglist\n"
  },
  {
    "path": "luigi/contrib/sge.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"SGE batch system Tasks.\n\nAdapted by Jake Feala (@jfeala) from\n`LSF extension <https://github.com/dattalab/luigi/blob/lsf/luigi/lsf.py>`_\nby Alex Wiltschko (@alexbw)\nMaintained by Jake Feala (@jfeala)\n\nSunGrid Engine is a job scheduler used to allocate compute resources on a\nshared cluster. Jobs are submitted using the ``qsub`` command and monitored\nusing ``qstat``. To get started, install luigi on all nodes.\n\nTo run luigi workflows on an SGE cluster, subclass\n:class:`luigi.contrib.sge.SGEJobTask` as you would any :class:`luigi.Task`,\nbut override the ``work()`` method, instead of ``run()``, to define the job\ncode. Then, run your Luigi workflow from the master node, assigning > 1\n``workers`` in order to distribute the tasks in parallel across the cluster.\n\nThe following is an example usage (and can also be found in ``sge_tests.py``)\n\n.. 
code-block:: python\n\n    import logging\n    import luigi\n    import os\n    from luigi.contrib.sge import SGEJobTask\n\n    logger = logging.getLogger('luigi-interface')\n\n\n    class TestJobTask(SGEJobTask):\n\n        i = luigi.Parameter()\n\n        def work(self):\n            logger.info('Running test job...')\n            with open(self.output().path, 'w') as f:\n                f.write('this is a test')\n\n        def output(self):\n            return luigi.LocalTarget(os.path.join('/home', 'testfile_' + str(self.i)))\n\n\n    if __name__ == '__main__':\n        tasks = [TestJobTask(i=str(i), n_cpu=i+1) for i in range(3)]\n        luigi.build(tasks, local_scheduler=True, workers=3)\n\n\nThe ``n-cpu`` parameter allows you to define different compute resource\nrequirements (or slots, in SGE terms) for each task. In this example, the\nthird Task asks for 3 CPU slots. If your cluster only contains nodes with\n2 CPUs, this task will hang indefinitely in the queue. See the docs for\n:class:`luigi.contrib.sge.SGEJobTask` for other SGE parameters. 
As for any\ntask, you can also set these in your luigi configuration file as shown below.\nThe default values below were matched to the values used by MIT StarCluster,\nan open-source SGE cluster manager for use with Amazon EC2::\n\n    [SGEJobTask]\n    shared-tmp-dir = /home\n    parallel-env = orte\n    n-cpu = 2\n\n\n\"\"\"\n\n\n# This extension is modeled after the hadoop.py approach.\n#\n# Implementation notes\n# The procedure:\n# - Pickle the class\n# - Construct a qsub argument that runs a generic runner function with the path to the pickled class\n# - Runner function loads the class from pickle\n# - Runner function hits the work button on it\n\nimport logging\nimport os\nimport pickle\nimport random\nimport subprocess\nimport sys\nimport time\n\nimport luigi\nfrom luigi.contrib import sge_runner\nfrom luigi.contrib.hadoop import create_packages_archive\n\nlogger = logging.getLogger(\"luigi-interface\")\nlogger.propagate = False\n\nPOLL_TIME = 5  # decided to hard-code rather than configure here\n\n\ndef _parse_qstat_state(qstat_out, job_id):\n    \"\"\"Parse \"state\" column from `qstat` output for given job_id\n\n    Returns state for the *first* job matching job_id. 
Returns 'u' if\n    `qstat` output is empty or job_id is not found.\n\n    \"\"\"\n    if qstat_out.strip() == \"\":\n        return \"u\"\n    lines = qstat_out.split(\"\\n\")\n    # skip past header\n    while not lines.pop(0).startswith(\"---\"):\n        pass\n    for line in lines:\n        if line:\n            job, prior, name, user, state = line.strip().split()[0:5]\n            if int(job) == int(job_id):\n                return state\n    return \"u\"\n\n\ndef _parse_qsub_job_id(qsub_out):\n    \"\"\"Parse job id from qsub output string.\n\n    Assume format:\n\n        \"Your job <job_id> (\"<job_name>\") has been submitted\"\n\n    \"\"\"\n    return int(qsub_out.split()[2])\n\n\ndef _build_qsub_command(cmd, job_name, outfile, errfile, pe, n_cpu):\n    \"\"\"Submit shell command to SGE queue via `qsub`\"\"\"\n    qsub_template = \"\"\"echo {cmd} | qsub -o \":{outfile}\" -e \":{errfile}\" -V -r y -pe {pe} {n_cpu} -N {job_name}\"\"\"\n    return qsub_template.format(cmd=cmd, job_name=job_name, outfile=outfile, errfile=errfile, pe=pe, n_cpu=n_cpu)\n\n\nclass SGEJobTask(luigi.Task):\n    \"\"\"Base class for executing a job on SunGrid Engine\n\n    Override ``work()`` (rather than ``run()``) with your job code.\n\n    Parameters:\n\n    - n_cpu: Number of CPUs (or \"slots\") to allocate for the Task. This\n          value is passed as ``qsub -pe {pe} {n_cpu}``\n    - parallel_env: SGE parallel environment name. The default is \"orte\",\n          the parallel environment installed with MIT StarCluster. If you\n          are using a different cluster environment, check with your\n          sysadmin for the right pe to use. This value is passed as {pe}\n          to the qsub command above.\n    - shared_tmp_dir: Shared drive accessible from all nodes in the cluster.\n          Task classes and dependencies are pickled to a temporary folder on\n          this drive. 
The default is ``/home``, the NFS share location setup\n          by StarCluster\n    - job_name_format: String that can be passed in to customize the job name\n        string passed to qsub; e.g. \"Task123_{task_family}_{n_cpu}...\".\n    - job_name: Exact job name to pass to qsub.\n    - run_locally: Run locally instead of on the cluster.\n    - poll_time: the length of time to wait in order to poll qstat\n    - dont_remove_tmp_dir: Instead of deleting the temporary directory, keep it.\n    - no_tarball: Don't create a tarball of the luigi project directory.  Can be\n        useful to reduce I/O requirements when the luigi directory is accessible\n        from cluster nodes already.\n\n    \"\"\"\n\n    n_cpu = luigi.IntParameter(default=2, significant=False)\n    shared_tmp_dir = luigi.Parameter(default=\"/home\", significant=False)\n    parallel_env = luigi.Parameter(default=\"orte\", significant=False)\n    job_name_format = luigi.Parameter(\n        significant=False, default=None, description=\"A string that can be formatted with class variables to name the job with qsub.\"\n    )\n    job_name = luigi.Parameter(significant=False, default=None, description=\"Explicit job name given via qsub.\")\n    run_locally = luigi.BoolParameter(significant=False, description=\"run locally instead of on the cluster\")\n    poll_time = luigi.IntParameter(significant=False, default=POLL_TIME, description=\"specify the wait time to poll qstat for the job status\")\n    dont_remove_tmp_dir = luigi.BoolParameter(significant=False, description=\"don't delete the temporary directory used (for debugging)\")\n    no_tarball = luigi.BoolParameter(significant=False, description=\"don't tarball (and extract) the luigi project files\")\n\n    def __init__(self, *args, **kwargs):\n        super(SGEJobTask, self).__init__(*args, **kwargs)\n        if self.job_name:\n            # use explicitly provided job name\n            pass\n        elif self.job_name_format:\n            # 
define the job name with the provided format\n            self.job_name = self.job_name_format.format(task_family=self.task_family, **self.__dict__)\n        else:\n            # default to the task family\n            self.job_name = self.task_family\n\n    def _fetch_task_failures(self):\n        if not os.path.exists(self.errfile):\n            logger.info(\"No error file\")\n            return []\n        with open(self.errfile, \"r\") as f:\n            errors = f.readlines()\n        if errors == []:\n            return errors\n        if errors[0].strip() == \"stdin: is not a tty\":  # SGE complains when we submit through a pipe\n            errors.pop(0)\n        return errors\n\n    def _init_local(self):\n\n        # Set up temp folder in shared directory (trim to max filename length)\n        base_tmp_dir = self.shared_tmp_dir\n        random_id = \"%016x\" % random.getrandbits(64)\n        folder_name = self.task_id + \"-\" + random_id\n        self.tmp_dir = os.path.join(base_tmp_dir, folder_name)\n        max_filename_length = os.fstatvfs(0).f_namemax\n        self.tmp_dir = self.tmp_dir[:max_filename_length]\n        logger.info(\"Tmp dir: %s\", self.tmp_dir)\n        os.makedirs(self.tmp_dir)\n\n        # Dump the code to be run into a pickle file\n        logging.debug(\"Dumping pickled class\")\n        self._dump(self.tmp_dir)\n\n        if not self.no_tarball:\n            # Make sure that all the class's dependencies are tarred and available\n            # This is not necessary if luigi is importable from the cluster node\n            logging.debug(\"Tarballing dependencies\")\n            # Grab luigi and the module containing the code to be run\n            packages = [luigi] + [__import__(self.__module__, None, None, \"dummy\")]\n            create_packages_archive(packages, os.path.join(self.tmp_dir, \"packages.tar\"))\n\n    def run(self):\n        if self.run_locally:\n            self.work()\n        else:\n            
self._init_local()\n            self._run_job()\n            # The procedure:\n            # - Pickle the class\n            # - Tarball the dependencies\n            # - Construct a qsub argument that runs a generic runner function with the path to the pickled class\n            # - Runner function loads the class from pickle\n            # - Runner class untars the dependencies\n            # - Runner function hits the button on the class's work() method\n\n    def work(self):\n        \"\"\"Override this method, rather than ``run()``,  for your actual work.\"\"\"\n        pass\n\n    def _dump(self, out_dir=\"\"):\n        \"\"\"Dump instance to file.\"\"\"\n        with self.no_unpicklable_properties():\n            self.job_file = os.path.join(out_dir, \"job-instance.pickle\")\n            if self.__module__ == \"__main__\":\n                d = pickle.dumps(self)\n                module_name = os.path.basename(sys.argv[0]).rsplit(\".\", 1)[0]\n                d = d.replace(\"(c__main__\", \"(c\" + module_name)\n                with open(self.job_file, \"w\") as f:\n                    f.write(d)\n            else:\n                with open(self.job_file, \"wb\") as f:\n                    pickle.dump(self, f)\n\n    def _run_job(self):\n\n        # Build a qsub argument that will run sge_runner.py on the directory we've specified\n        runner_path = sge_runner.__file__\n        if runner_path.endswith(\"pyc\"):\n            runner_path = runner_path[:-3] + \"py\"\n        job_str = 'python {0} \"{1}\" \"{2}\"'.format(runner_path, self.tmp_dir, os.getcwd())  # enclose tmp_dir in quotes to protect from special escape chars\n        if self.no_tarball:\n            job_str += ' \"--no-tarball\"'\n\n        # Build qsub submit command\n        self.outfile = os.path.join(self.tmp_dir, \"job.out\")\n        self.errfile = os.path.join(self.tmp_dir, \"job.err\")\n        submit_cmd = _build_qsub_command(job_str, self.task_family, self.outfile, self.errfile, 
self.parallel_env, self.n_cpu)\n        logger.debug(\"qsub command: \\n\" + submit_cmd)\n\n        # Submit the job and grab job ID\n        output = subprocess.check_output(submit_cmd, shell=True)\n        self.job_id = _parse_qsub_job_id(output)\n        logger.debug(\"Submitted job to qsub with response:\\n\" + output)\n\n        self._track_job()\n\n        # Now delete the temporaries, if they're there.\n        if self.tmp_dir and os.path.exists(self.tmp_dir) and not self.dont_remove_tmp_dir:\n            logger.info(\"Removing temporary directory %s\" % self.tmp_dir)\n            subprocess.call([\"rm\", \"-rf\", self.tmp_dir])\n\n    def _track_job(self):\n        while True:\n            # Sleep for a little bit\n            time.sleep(self.poll_time)\n\n            # See what the job's up to\n            # ASSUMPTION\n            qstat_out = subprocess.check_output([\"qstat\"])\n            sge_status = _parse_qstat_state(qstat_out, self.job_id)\n            if sge_status == \"r\":\n                logger.info(\"Job is running...\")\n            elif sge_status == \"qw\":\n                logger.info(\"Job is pending...\")\n            elif \"E\" in sge_status:\n                logger.error(\"Job has FAILED:\\n\" + \"\\n\".join(self._fetch_task_failures()))\n                break\n            elif sge_status == \"t\" or sge_status == \"u\":\n                # Then the job could either be failed or done.\n                errors = self._fetch_task_failures()\n                if not errors:\n                    logger.info(\"Job is done\")\n                else:\n                    logger.error(\"Job has FAILED:\\n\" + \"\\n\".join(errors))\n                break\n            else:\n                logger.info(\"Job status is UNKNOWN!\")\n                logger.info(\"Status is : %s\" % sge_status)\n                raise Exception(\"job status isn't one of ['r', 'qw', 'E*', 't', 'u']: %s\" % sge_status)\n\n\nclass LocalSGEJobTask(SGEJobTask):\n    \"\"\"A 
local version of SGEJobTask, for easier debugging.\n\n    This version skips the ``qsub`` steps and simply runs ``work()``\n    on the local node, so you don't need to be on an SGE cluster to\n    use your Task in a test workflow.\n    \"\"\"\n\n    def run(self):\n        self.work()\n"
  },
  {
    "path": "luigi/contrib/sge_runner.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nThe SunGrid Engine runner\n\nThe main() function of this module will be executed on the\ncompute node by the submitted job. It accepts the shared temp\nfolder containing the package archive and pickled task to run,\nthe project directory to append to ``sys.path``, and an optional\n``--no-tarball`` flag, and carries out these steps:\n\n- extract tarfile of package dependencies and place on the path\n  (skipped when ``--no-tarball`` is given)\n- unpickle SGEJobTask instance created on the master node\n- run SGEJobTask.work()\n\nOn completion, SGEJobTask on the master node will detect that\nthe job has left the queue, delete the temporary folder, and\nreturn from SGEJobTask.run()\n\"\"\"\n\nimport logging\nimport os\nimport pickle\nimport sys\n\nfrom luigi.safe_extractor import SafeExtractor\n\n\ndef _do_work_on_compute_node(work_dir, tarball=True):\n\n    if tarball:\n        # Extract the necessary dependencies\n        # This can create a lot of I/O overhead when running many SGEJobTasks,\n        # so is optional if the luigi project is accessible from the cluster node\n        _extract_packages_archive(work_dir)\n\n    # Open up the pickle file with the work to be done\n    os.chdir(work_dir)\n    with open(\"job-instance.pickle\", \"r\") as f:\n        job = pickle.load(f)\n\n    # Do the work contained\n    job.work()\n\n\ndef _extract_packages_archive(work_dir):\n    package_file = os.path.join(work_dir, \"packages.tar\")\n    if not 
os.path.exists(package_file):\n        return\n\n    curdir = os.path.abspath(os.curdir)\n\n    os.chdir(work_dir)\n    extractor = SafeExtractor(work_dir)\n    extractor.safe_extract(package_file)\n    if \"\" not in sys.path:\n        sys.path.insert(0, \"\")\n\n    os.chdir(curdir)\n\n\ndef main(args=sys.argv):\n    \"\"\"Run the work() method from the class instance in the file \"job-instance.pickle\".\"\"\"\n    try:\n        tarball = \"--no-tarball\" not in args\n        # Set up logging.\n        logging.basicConfig(level=logging.WARN)\n        work_dir = args[1]\n        assert os.path.exists(work_dir), \"First argument to sge_runner.py must be a directory that exists\"\n        project_dir = args[2]\n        sys.path.append(project_dir)\n        _do_work_on_compute_node(work_dir, tarball)\n    except Exception as e:\n        # Print the exception to stdout before re-raising it\n        print(e)\n        raise\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "luigi/contrib/simulate.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nA module containing classes used to simulate certain behaviors\n\"\"\"\n\nimport hashlib\nimport logging\nimport os\nimport tempfile\nfrom multiprocessing import Value\n\nimport luigi\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass RunAnywayTarget(luigi.Target):\n    \"\"\"\n    A target used to make a task run every time it is called.\n\n    Usage:\n\n    Pass `self` as the first argument in your task's `output`:\n\n    .. code-block:: python\n\n        def output(self):\n            return RunAnywayTarget(self)\n\n    And then mark it as `done` in your task's `run`:\n\n    .. code-block:: python\n\n        def run(self):\n            # Your task execution\n            # ...\n            self.output().done() # will then be considered as \"existing\"\n    \"\"\"\n\n    # Specify the location of the temporary folder storing the state files. 
Subclass to change this value\n    temp_dir = os.path.join(tempfile.gettempdir(), \"luigi-simulate\")\n    temp_time = 24 * 3600  # seconds\n\n    # Unique value (PID of the first encountered target) to separate temporary files between executions and\n    # avoid deletion collision\n    unique = Value(\"i\", 0)\n\n    def __init__(self, task_obj):\n        self.task_id = task_obj.task_id\n\n        if self.unique.value == 0:\n            with self.unique.get_lock():\n                if self.unique.value == 0:\n                    self.unique.value = os.getpid()  # The PID will be unique for every execution of the pipeline\n\n        # Deleting old files > temp_time\n        if os.path.isdir(self.temp_dir):\n            import shutil\n            import time\n\n            limit = time.time() - self.temp_time\n            for fn in os.listdir(self.temp_dir):\n                path = os.path.join(self.temp_dir, fn)\n                if os.path.isdir(path) and os.stat(path).st_mtime < limit:\n                    shutil.rmtree(path)\n                    logger.debug(\"Deleted temporary directory %s\", path)\n\n    def __str__(self):\n        return self.task_id\n\n    def get_path(self):\n        \"\"\"\n        Returns a temporary file path based on a MD5 hash generated with the task's name and its arguments\n        \"\"\"\n        md5_hash = hashlib.new(\"md5\", self.task_id.encode(), usedforsecurity=False).hexdigest()\n        logger.debug(\"Hash %s corresponds to task %s\", md5_hash, self.task_id)\n\n        return os.path.join(self.temp_dir, str(self.unique.value), md5_hash)\n\n    def exists(self):\n        \"\"\"\n        Checks if the file exists\n        \"\"\"\n        return os.path.isfile(self.get_path())\n\n    def done(self):\n        \"\"\"\n        Creates temporary file to mark the task as `done`\n        \"\"\"\n        logger.info(\"Marking %s as done\", self)\n\n        fn = self.get_path()\n        try:\n            
os.makedirs(os.path.dirname(fn))\n        except OSError:\n            pass\n        open(fn, \"w\").close()\n"
  },
  {
    "path": "luigi/contrib/spark.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport collections\nimport importlib\nimport inspect\nimport logging\nimport os\nimport pickle\nimport re\nimport shutil\nimport sys\nimport tarfile\nimport tempfile\n\nfrom luigi import configuration\nfrom luigi.contrib.external_program import ExternalProgramTask\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass SparkSubmitTask(ExternalProgramTask):\n    \"\"\"\n    Template task for running a Spark job\n\n    Supports running jobs on Spark local, standalone, Mesos or Yarn\n\n    See http://spark.apache.org/docs/latest/submitting-applications.html\n    for more information\n\n    \"\"\"\n\n    # Application (.jar or .py file)\n    name = None\n    entry_class = None\n    app = None\n\n    # Only log stderr if spark fails (since stderr is normally quite verbose)\n    always_log_stderr = False\n\n    # Spark applications write its logs into stderr\n    stream_for_searching_tracking_url = \"stderr\"\n\n    @property\n    def tracking_url_pattern(self):\n        if self.deploy_mode == \"cluster\":\n            # in cluster mode client only receives application status once a period of time\n            return r\"tracking URL: (https?://.*)\\s\"\n        else:\n            return r\"Bound (?:.*) to (?:.*), and started at (https?://.*)\\s\"\n\n    def app_options(self):\n        \"\"\"\n        Subclass this method to map your 
task parameters to the app's arguments\n\n        \"\"\"\n        return []\n\n    @property\n    def pyspark_python(self):\n        return None\n\n    @property\n    def pyspark_driver_python(self):\n        return None\n\n    @property\n    def hadoop_user_name(self):\n        return None\n\n    @property\n    def spark_version(self):\n        return \"spark\"\n\n    @property\n    def spark_submit(self):\n        return configuration.get_config().get(self.spark_version, \"spark-submit\", \"spark-submit\")\n\n    @property\n    def master(self):\n        return configuration.get_config().get(self.spark_version, \"master\", None)\n\n    @property\n    def deploy_mode(self):\n        return configuration.get_config().get(self.spark_version, \"deploy-mode\", None)\n\n    @property\n    def jars(self):\n        return self._list_config(configuration.get_config().get(self.spark_version, \"jars\", None))\n\n    @property\n    def packages(self):\n        return self._list_config(configuration.get_config().get(self.spark_version, \"packages\", None))\n\n    @property\n    def py_files(self):\n        return self._list_config(configuration.get_config().get(self.spark_version, \"py-files\", None))\n\n    @property\n    def files(self):\n        return self._list_config(configuration.get_config().get(self.spark_version, \"files\", None))\n\n    @property\n    def _conf(self):\n        conf = collections.OrderedDict(self.conf or {})\n        if self.pyspark_python:\n            conf[\"spark.pyspark.python\"] = self.pyspark_python\n        if self.pyspark_driver_python:\n            conf[\"spark.pyspark.driver.python\"] = self.pyspark_driver_python\n        return conf\n\n    @property\n    def conf(self):\n        return self._dict_config(configuration.get_config().get(self.spark_version, \"conf\", None))\n\n    @property\n    def properties_file(self):\n        return configuration.get_config().get(self.spark_version, \"properties-file\", None)\n\n    @property\n    def 
driver_memory(self):\n        return configuration.get_config().get(self.spark_version, \"driver-memory\", None)\n\n    @property\n    def driver_java_options(self):\n        return configuration.get_config().get(self.spark_version, \"driver-java-options\", None)\n\n    @property\n    def driver_library_path(self):\n        return configuration.get_config().get(self.spark_version, \"driver-library-path\", None)\n\n    @property\n    def driver_class_path(self):\n        return configuration.get_config().get(self.spark_version, \"driver-class-path\", None)\n\n    @property\n    def executor_memory(self):\n        return configuration.get_config().get(self.spark_version, \"executor-memory\", None)\n\n    @property\n    def driver_cores(self):\n        return configuration.get_config().get(self.spark_version, \"driver-cores\", None)\n\n    @property\n    def supervise(self):\n        return bool(configuration.get_config().get(self.spark_version, \"supervise\", False))\n\n    @property\n    def total_executor_cores(self):\n        return configuration.get_config().get(self.spark_version, \"total-executor-cores\", None)\n\n    @property\n    def executor_cores(self):\n        return configuration.get_config().get(self.spark_version, \"executor-cores\", None)\n\n    @property\n    def queue(self):\n        return configuration.get_config().get(self.spark_version, \"queue\", None)\n\n    @property\n    def num_executors(self):\n        return configuration.get_config().get(self.spark_version, \"num-executors\", None)\n\n    @property\n    def archives(self):\n        return self._list_config(configuration.get_config().get(self.spark_version, \"archives\", None))\n\n    @property\n    def hadoop_conf_dir(self):\n        return configuration.get_config().get(self.spark_version, \"hadoop-conf-dir\", None)\n\n    def get_environment(self):\n        env = os.environ.copy()\n        for prop in (\"HADOOP_CONF_DIR\", \"HADOOP_USER_NAME\"):\n            var = getattr(self, 
prop.lower(), None)\n            if var:\n                env[prop] = var\n        return env\n\n    def program_environment(self):\n        return self.get_environment()\n\n    def program_args(self):\n        return self.spark_command() + self.app_command()\n\n    def spark_command(self):\n        command = [self.spark_submit]\n        command += self._text_arg(\"--master\", self.master)\n        command += self._text_arg(\"--deploy-mode\", self.deploy_mode)\n        command += self._text_arg(\"--name\", self.name)\n        command += self._text_arg(\"--class\", self.entry_class)\n        command += self._list_arg(\"--jars\", self.jars)\n        command += self._list_arg(\"--packages\", self.packages)\n        command += self._list_arg(\"--py-files\", self.py_files)\n        command += self._list_arg(\"--files\", self.files)\n        command += self._list_arg(\"--archives\", self.archives)\n        command += self._dict_arg(\"--conf\", self._conf)\n        command += self._text_arg(\"--properties-file\", self.properties_file)\n        command += self._text_arg(\"--driver-memory\", self.driver_memory)\n        command += self._text_arg(\"--driver-java-options\", self.driver_java_options)\n        command += self._text_arg(\"--driver-library-path\", self.driver_library_path)\n        command += self._text_arg(\"--driver-class-path\", self.driver_class_path)\n        command += self._text_arg(\"--executor-memory\", self.executor_memory)\n        command += self._text_arg(\"--driver-cores\", self.driver_cores)\n        command += self._flag_arg(\"--supervise\", self.supervise)\n        command += self._text_arg(\"--total-executor-cores\", self.total_executor_cores)\n        command += self._text_arg(\"--executor-cores\", self.executor_cores)\n        command += self._text_arg(\"--queue\", self.queue)\n        command += self._text_arg(\"--num-executors\", self.num_executors)\n        return command\n\n    def app_command(self):\n        if not self.app:\n            
raise NotImplementedError(\"subclass should define an app (.jar or .py file)\")\n        return [self.app] + self.app_options()\n\n    def _list_config(self, config):\n        if config and isinstance(config, str):\n            return list(map(lambda x: x.strip(), config.split(\",\")))\n\n    def _dict_config(self, config):\n        if config and isinstance(config, str):\n            return dict(map(lambda i: i.split(\"=\", 1), config.split(\"|\")))\n\n    def _text_arg(self, name, value):\n        if value:\n            return [name, value]\n        return []\n\n    def _list_arg(self, name, value):\n        if value and isinstance(value, (list, tuple)):\n            return [name, \",\".join(value)]\n        return []\n\n    def _dict_arg(self, name, value):\n        command = []\n        if value and isinstance(value, dict):\n            for prop, value in value.items():\n                command += [name, \"{0}={1}\".format(prop, value)]\n        return command\n\n    def _flag_arg(self, name, value):\n        if value:\n            return [name]\n        return []\n\n\nclass PySparkTask(SparkSubmitTask):\n    \"\"\"\n    Template task for running an inline PySpark job\n\n    Simply implement the ``main`` method in your subclass\n\n    You can optionally define package names to be distributed to the cluster\n    with ``py_packages`` (uses luigi's global py-packages configuration by default)\n\n    \"\"\"\n\n    # Path to the pyspark program passed to spark-submit\n    app = os.path.join(os.path.dirname(__file__), \"pyspark_runner.py\")\n\n    @property\n    def name(self):\n        return self.__class__.__name__\n\n    @property\n    def py_packages(self):\n        packages = configuration.get_config().get(\"spark\", \"py-packages\", None)\n        if packages:\n            return map(lambda s: s.strip(), packages.split(\",\"))\n\n    @property\n    def files(self):\n        if self.deploy_mode == \"cluster\":\n            return [self.run_pickle]\n\n    
@property\n    def pickle_protocol(self):\n        return configuration.get_config().getint(\"spark\", \"pickle-protocol\", pickle.DEFAULT_PROTOCOL)\n\n    def setup(self, conf):\n        \"\"\"\n        Called by the pyspark_runner with a SparkConf instance that will be used to instantiate the SparkContext\n\n        :param conf: SparkConf\n        \"\"\"\n\n    def setup_remote(self, sc):\n        self._setup_packages(sc)\n\n    def main(self, sc, *args):\n        \"\"\"\n        Called by the pyspark_runner with a SparkContext and any arguments returned by ``app_options()``\n\n        :param sc: SparkContext\n        :param args: arguments list\n        \"\"\"\n        raise NotImplementedError(\"subclass should define a main method\")\n\n    def app_command(self):\n        if self.deploy_mode == \"cluster\":\n            pickle_loc = os.path.basename(self.run_pickle)\n        else:\n            pickle_loc = self.run_pickle\n        return [self.app, pickle_loc] + self.app_options()\n\n    def run(self):\n        path_name_fragment = re.sub(r\"[^\\w]\", \"_\", self.name)\n        self.run_path = tempfile.mkdtemp(prefix=path_name_fragment)\n        self.run_pickle = os.path.join(self.run_path, \".\".join([path_name_fragment, \"pickle\"]))\n        with open(self.run_pickle, \"wb\") as fd:\n            # Copy module file to run path.\n            module_path = os.path.abspath(inspect.getfile(self.__class__))\n            shutil.copy(module_path, os.path.join(self.run_path, \".\"))\n            self._dump(fd)\n        try:\n            super(PySparkTask, self).run()\n        finally:\n            shutil.rmtree(self.run_path)\n\n    def _dump(self, fd):\n        with self.no_unpicklable_properties():\n            if self.__module__ == \"__main__\":\n                d = pickle.dumps(self, protocol=self.pickle_protocol)\n                module_name = os.path.basename(sys.argv[0]).rsplit(\".\", 1)[0]\n                d = d.replace(b\"c__main__\", b\"c\" + 
module_name.encode(\"ascii\"))\n                fd.write(d)\n            else:\n                pickle.dump(self, fd, protocol=self.pickle_protocol)\n\n    def _setup_packages(self, sc):\n        \"\"\"\n        This method compresses and uploads packages to the cluster\n\n        \"\"\"\n        packages = self.py_packages\n        if not packages:\n            return\n        for package in packages:\n            mod = importlib.import_module(package)\n            try:\n                mod_path = mod.__path__[0]\n            except AttributeError:\n                mod_path = mod.__file__\n            os.makedirs(self.run_path, exist_ok=True)\n            tar_path = os.path.join(self.run_path, package + \".tar.gz\")\n            tar = tarfile.open(tar_path, \"w:gz\")\n            tar.add(mod_path, os.path.basename(mod_path))\n            tar.close()\n            sc.addPyFile(tar_path)\n"
  },
  {
    "path": "luigi/contrib/sparkey.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport luigi\n\n\nclass SparkeyExportTask(luigi.Task):\n    \"\"\"\n    A luigi task that writes to a local sparkey log file.\n\n    Subclasses should implement the requires and output methods. The output\n    must be a luigi.LocalTarget.\n\n    The resulting sparkey log file will contain one entry for every line in\n    the input, mapping from the first value to a tab-separated list of the\n    rest of the line.\n\n    To generate a simple key-value index, yield \"key\", \"value\" pairs from the input(s) to this task.\n    \"\"\"\n\n    # the separator used to split input lines\n    separator = \"\\t\"\n\n    def __init__(self, *args, **kwargs):\n        super(SparkeyExportTask, self).__init__(*args, **kwargs)\n\n    def run(self):\n        self._write_sparkey_file()\n\n    def _write_sparkey_file(self):\n        import sparkey\n\n        infile = self.input()\n        outfile = self.output()\n        if not isinstance(outfile, luigi.LocalTarget):\n            raise TypeError(\"output must be a LocalTarget\")\n\n        # write job output to temporary sparkey file\n        temp_output = luigi.LocalTarget(is_tmp=True)\n        w = sparkey.LogWriter(temp_output.path)\n        for line in infile.open(\"r\"):\n            k, v = line.strip().split(self.separator, 1)\n            w[k] = v\n        w.close()\n\n        # move finished 
sparkey file to final destination\n        temp_output.move(outfile.path)\n"
  },
  {
    "path": "luigi/contrib/sqla.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2015 Gouthaman Balaraman\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n#\n\"\"\"\nSupport for SQLAlchemy. Provides SQLAlchemyTarget for storing in databases\nsupported by SQLAlchemy. The user would be responsible for installing the\nrequired database driver to connect using SQLAlchemy.\n\nMinimal example of a job to copy data to database using SQLAlchemy is as shown\nbelow:\n\n.. code-block:: python\n\n    from sqlalchemy import String\n    import luigi\n    from luigi.contrib import sqla\n\n    class SQLATask(sqla.CopyToTable):\n        # columns defines the table schema, with each element corresponding\n        # to a column in the format (args, kwargs) which will be sent to\n        # the sqlalchemy.Column(*args, **kwargs)\n        columns = [\n            ([\"item\", String(64)], {\"primary_key\": True}),\n            ([\"property\", String(64)], {})\n        ]\n        connection_string = \"sqlite://\"  # in memory SQLite database\n        table = \"item_property\"  # name of the table to store data\n\n        def rows(self):\n            for row in [(\"item1\", \"property1\"), (\"item2\", \"property2\")]:\n                yield row\n\n    if __name__ == '__main__':\n        task = SQLATask()\n        luigi.build([task], local_scheduler=True)\n\n\nIf the target table where the data needs to be copied already exists, then\nthe column schema definition can be skipped and instead the reflect flag\ncan 
be set as True. Here is a modified version of the above example:\n\n.. code-block:: python\n\n    from sqlalchemy import String\n    import luigi\n    from luigi.contrib import sqla\n\n    class SQLATask(sqla.CopyToTable):\n        # If database table is already created, then the schema can be loaded\n        # by setting the reflect flag to True\n        reflect = True\n        connection_string = \"sqlite://\"  # in memory SQLite database\n        table = \"item_property\"  # name of the table to store data\n\n        def rows(self):\n            for row in [(\"item1\", \"property1\"), (\"item2\", \"property2\")]:\n                yield row\n\n    if __name__ == '__main__':\n        task = SQLATask()\n        luigi.build([task], local_scheduler=True)\n\n\nIn the above examples, the data that needs to be copied was directly provided by\noverriding the rows method. Alternately, if the data comes from another task, the\nmodified example would look as shown below:\n\n.. code-block:: python\n\n    from sqlalchemy import String\n    import luigi\n    from luigi.contrib import sqla\n    from luigi.mock import MockTarget\n\n    class BaseTask(luigi.Task):\n        def output(self):\n            return MockTarget(\"BaseTask\")\n\n        def run(self):\n            out = self.output().open(\"w\")\n            TASK_LIST = [\"item%d\\\\tproperty%d\\\\n\" % (i, i) for i in range(10)]\n            for task in TASK_LIST:\n                out.write(task)\n            out.close()\n\n    class SQLATask(sqla.CopyToTable):\n        # columns defines the table schema, with each element corresponding\n        # to a column in the format (args, kwargs) which will be sent to\n        # the sqlalchemy.Column(*args, **kwargs)\n        columns = [\n            ([\"item\", String(64)], {\"primary_key\": True}),\n            ([\"property\", String(64)], {})\n        ]\n        connection_string = \"sqlite://\"  # in memory SQLite database\n        table = \"item_property\"  # name of the 
table to store data\n\n        def requires(self):\n            return BaseTask()\n\n    if __name__ == '__main__':\n        task1, task2 = SQLATask(), BaseTask()\n        luigi.build([task1, task2], local_scheduler=True)\n\n\nIn the above example, the output from `BaseTask` is copied into the\ndatabase. Here we did not have to implement the `rows` method because\nby default `rows` implementation assumes every line is a row with\ncolumn values separated by a tab. One can define `column_separator`\noption for the task if the values are say comma separated instead of\ntab separated.\n\nYou can pass in database specific connection arguments by setting the connect_args\ndictionary.  The options will be passed directly to the DBAPI's connect method as\nkeyword arguments.\n\nThe other option to `sqla.CopyToTable` that can be of help with performance aspect is the\n`chunk_size`. The default is 5000. This is the number of rows that will be inserted in\na transaction at a time. Depending on the size of the inserts, this value can be tuned\nfor performance.\n\nSee here for a `tutorial on building task pipelines using luigi\n<http://gouthamanbalaraman.com/blog/building-luigi-task-pipeline.html>`_ and\nusing `SQLAlchemy in workflow pipelines <http://gouthamanbalaraman.com/blog/sqlalchemy-luigi-workflow-pipeline.html>`_.\n\nAuthor: Gouthaman Balaraman\nDate: 01/02/2015\n\"\"\"\n\nimport abc\nimport collections\nimport datetime\nimport itertools\nimport logging\nimport os\n\nimport sqlalchemy\n\nimport luigi\n\n\nclass SQLAlchemyTarget(luigi.Target):\n    \"\"\"\n    Database target using SQLAlchemy.\n\n    This will rarely have to be directly instantiated by the user.\n\n    Typical usage would be to override `luigi.contrib.sqla.CopyToTable` class\n    to create a task to write to the database.\n    \"\"\"\n\n    marker_table = None\n    _engine_dict = {}  # dict of sqlalchemy engine instances\n    Connection = collections.namedtuple(\"Connection\", \"engine pid\")\n\n    def 
__init__(self, connection_string, target_table, update_id, echo=False, connect_args=None):\n        \"\"\"\n        Constructor for the SQLAlchemyTarget.\n\n        :param connection_string: SQLAlchemy connection string\n        :type connection_string: str\n        :param target_table: The table name for the data\n        :type target_table: str\n        :param update_id: An identifier for this data set\n        :type update_id: str\n        :param echo: Flag to setup SQLAlchemy logging\n        :type echo: bool\n        :param connect_args: A dictionary of connection arguments\n        :type connect_args: dict\n        :return:\n        \"\"\"\n        if connect_args is None:\n            connect_args = {}\n\n        self.target_table = target_table\n        self.update_id = update_id\n        self.connection_string = connection_string\n        self.echo = echo\n        self.connect_args = connect_args\n        self.marker_table_bound = None\n\n    def __str__(self):\n        return self.target_table\n\n    @property\n    def engine(self):\n        \"\"\"\n        Return an engine instance, creating it if it doesn't exist.\n\n        Recreate the engine connection if it wasn't originally created\n        by the current process.\n        \"\"\"\n        pid = os.getpid()\n        conn = SQLAlchemyTarget._engine_dict.get(self.connection_string)\n        if not conn or conn.pid != pid:\n            # create and reset connection\n            engine = sqlalchemy.create_engine(self.connection_string, connect_args=self.connect_args, echo=self.echo)\n            SQLAlchemyTarget._engine_dict[self.connection_string] = self.Connection(engine, pid)\n        return SQLAlchemyTarget._engine_dict[self.connection_string].engine\n\n    def touch(self):\n        \"\"\"\n        Mark this update as complete.\n        \"\"\"\n        if self.marker_table_bound is None:\n            self.create_marker_table()\n\n        table = self.marker_table_bound\n        id_exists = 
self.exists()\n        with self.engine.begin() as conn:\n            if not id_exists:\n                ins = table.insert().values(update_id=self.update_id, target_table=self.target_table, inserted=datetime.datetime.now())\n            else:\n                ins = (\n                    table.update()\n                    .where(sqlalchemy.and_(table.c.update_id == self.update_id, table.c.target_table == self.target_table))\n                    .values(update_id=self.update_id, target_table=self.target_table, inserted=datetime.datetime.now())\n                )\n            conn.execute(ins)\n        assert self.exists()\n\n    def exists(self):\n        row = None\n        if self.marker_table_bound is None:\n            self.create_marker_table()\n        with self.engine.begin() as conn:\n            table = self.marker_table_bound\n            s = sqlalchemy.select([table]).where(sqlalchemy.and_(table.c.update_id == self.update_id, table.c.target_table == self.target_table)).limit(1)\n            row = conn.execute(s).fetchone()\n        return row is not None\n\n    def create_marker_table(self):\n        \"\"\"\n        Create marker table if it doesn't exist.\n\n        Using a separate connection since the transaction might have to be reset.\n        \"\"\"\n        if self.marker_table is None:\n            self.marker_table = luigi.configuration.get_config().get(\"sqlalchemy\", \"marker-table\", \"table_updates\")\n\n        engine = self.engine\n\n        with engine.begin() as con:\n            metadata = sqlalchemy.MetaData()\n            if not con.dialect.has_table(con, self.marker_table):\n                self.marker_table_bound = sqlalchemy.Table(\n                    self.marker_table,\n                    metadata,\n                    sqlalchemy.Column(\"update_id\", sqlalchemy.String(128), primary_key=True),\n                    sqlalchemy.Column(\"target_table\", sqlalchemy.String(128)),\n                    sqlalchemy.Column(\"inserted\", 
sqlalchemy.DateTime, default=datetime.datetime.now()),\n                )\n                metadata.create_all(engine)\n            else:\n                metadata.reflect(only=[self.marker_table], bind=engine)\n                self.marker_table_bound = metadata.tables[self.marker_table]\n\n    def open(self, mode):\n        raise NotImplementedError(\"Cannot open() SQLAlchemyTarget\")\n\n\nclass CopyToTable(luigi.Task):\n    \"\"\"\n    An abstract task for inserting a data set into SQLAlchemy RDBMS\n\n    Usage:\n\n    * subclass and override the required `connection_string`, `table` and `columns` attributes.\n    * optionally override the `schema` attribute to use a different schema for\n      the target table.\n    \"\"\"\n\n    _logger = logging.getLogger(\"luigi-interface\")\n\n    echo = False\n    connect_args = {}\n\n    @property\n    @abc.abstractmethod\n    def connection_string(self):\n        return None\n\n    @property\n    @abc.abstractmethod\n    def table(self):\n        return None\n\n    # specify the columns that define the schema. The format for the columns is a list\n    # of tuples. For example :\n    # columns = [\n    #            ([\"id\", sqlalchemy.Integer], dict(primary_key=True)),\n    #            ([\"name\", sqlalchemy.String(64)], {}),\n    #            ([\"value\", sqlalchemy.String(64)], {})\n    #        ]\n    # The tuple (args_list, kwargs_dict) here is the args and kwargs\n    # that need to be passed to sqlalchemy.Column(*args, **kwargs).\n    # If the tables have already been setup by another process, then you can\n    # completely ignore the columns. Instead set the reflect value to True below\n    columns = []\n\n    # Specify the database schema of the target table, if supported by the\n    # RDBMS. 
Note that this doesn't change the schema of the marker table.\n    # The schema MUST already exist in the database, or this will task fail.\n    schema = \"\"\n\n    # options\n    column_separator = \"\\t\"  # how columns are separated in the file copied into postgres\n    chunk_size = 5000  # default chunk size for insert\n    reflect = False  # Set this to true only if the table has already been created by alternate means\n\n    def create_table(self, engine):\n        \"\"\"\n        Override to provide code for creating the target table.\n\n        By default it will be created using types specified in columns.\n        If the table exists, then it binds to the existing table.\n\n        If overridden, use the provided connection object for setting up the table in order to\n        create the table and insert data using the same transaction.\n        :param engine: The sqlalchemy engine instance\n        :type engine: object\n        \"\"\"\n\n        def construct_sqla_columns(columns):\n            retval = [sqlalchemy.Column(*c[0], **c[1]) for c in columns]\n            return retval\n\n        needs_setup = (len(self.columns) == 0) or (False in [len(c) == 2 for c in self.columns]) if not self.reflect else False\n        if needs_setup:\n            # only names of columns specified, no types\n            raise NotImplementedError(\"create_table() not implemented for %r and columns types not specified\" % self.table)\n        else:\n            # if columns is specified as (name, type) tuples\n            with engine.begin() as con:\n                if self.schema:\n                    metadata = sqlalchemy.MetaData(schema=self.schema)\n                else:\n                    metadata = sqlalchemy.MetaData()\n\n                try:\n                    if not con.dialect.has_table(con, self.table, self.schema or None):\n                        sqla_columns = construct_sqla_columns(self.columns)\n                        self.table_bound = 
sqlalchemy.Table(self.table, metadata, *sqla_columns)\n                        metadata.create_all(engine)\n                    else:\n                        full_table = \".\".join([self.schema, self.table]) if self.schema else self.table\n                        metadata.reflect(only=[self.table], bind=engine)\n                        self.table_bound = metadata.tables[full_table]\n                except Exception as e:\n                    self._logger.exception(self.table + str(e))\n\n    def update_id(self):\n        \"\"\"\n        This update id will be a unique identifier for this insert on this table.\n        \"\"\"\n        return self.task_id\n\n    def output(self):\n        return SQLAlchemyTarget(\n            connection_string=self.connection_string, target_table=self.table, update_id=self.update_id(), connect_args=self.connect_args, echo=self.echo\n        )\n\n    def rows(self):\n        \"\"\"\n        Return/yield tuples or lists corresponding to each row to be inserted.\n\n        This method can be overridden for custom file types or formats.\n        \"\"\"\n        with self.input().open(\"r\") as fobj:\n            for line in fobj:\n                yield line.strip(\"\\n\").split(self.column_separator)\n\n    def run(self):\n        self._logger.info(\"Running task copy to table for update id %s for table %s\" % (self.update_id(), self.table))\n        output = self.output()\n        engine = output.engine\n        self.create_table(engine)\n        with engine.begin() as conn:\n            rows = iter(self.rows())\n            ins_rows = [dict(zip((\"_\" + c.key for c in self.table_bound.c), row)) for row in itertools.islice(rows, self.chunk_size)]\n            while ins_rows:\n                self.copy(conn, ins_rows, self.table_bound)\n                ins_rows = [dict(zip((\"_\" + c.key for c in self.table_bound.c), row)) for row in itertools.islice(rows, self.chunk_size)]\n                self._logger.info(\"Finished inserting %d 
rows into SQLAlchemy target\" % len(ins_rows))\n        output.touch()\n        self._logger.info(\"Finished inserting rows into SQLAlchemy target\")\n\n    def copy(self, conn, ins_rows, table_bound):\n        \"\"\"\n        This method does the actual insertion of the rows of data given by ins_rows into the\n        database. A task that needs row updates instead of insertions should overload this method.\n        :param conn: The sqlalchemy connection object\n        :param ins_rows: The dictionary of rows with the keys in the format _<column_name>. For example\n        if you have a table with a column name \"property\", then the key in the dictionary\n        would be \"_property\". This format is consistent with the bindparam usage in sqlalchemy.\n        :param table_bound: The object referring to the table\n        :return:\n        \"\"\"\n        bound_cols = dict((c, sqlalchemy.bindparam(\"_\" + c.key)) for c in table_bound.columns)\n        ins = table_bound.insert().values(bound_cols)\n        conn.execute(ins, ins_rows)\n"
  },
  {
    "path": "luigi/contrib/ssh.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nLight-weight remote execution library and utilities.\n\nThere are some examples in the unittest but I added another that is more\nluigi-specific in the examples directory (examples/ssh_remote_execution.py)\n\n:class:`RemoteContext` is meant to provide functionality similar to that of the\nstandard library subprocess module, but where the commands executed are run on\na remote machine instead, without the user having to think about prefixing\neverything with \"ssh\" and credentials etc.\n\nUsing this mini library (which is just a convenience wrapper for subprocess),\n:class:`RemoteTarget` is created to let you stream data from a remotely stored file using\nthe luigi :class:`~luigi.target.FileSystemTarget` semantics.\n\nAs a bonus, :class:`RemoteContext` also provides a really cool feature that let's you\nset up ssh tunnels super easily using a python context manager (there is an example\nin the integration part of unittests).\n\nThis can be super convenient when you want secure communication using a non-secure\nprotocol or circumvent firewalls (as long as they are open for ssh traffic).\n\"\"\"\n\nimport contextlib\nimport logging\nimport os\nimport posixpath\nimport random\nimport subprocess\n\nimport luigi\nimport luigi.format\nimport luigi.target\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass 
RemoteCalledProcessError(subprocess.CalledProcessError):\n    def __init__(self, returncode, command, host, output=None):\n        super(RemoteCalledProcessError, self).__init__(returncode, command, output)\n        self.host = host\n\n    def __str__(self):\n        return \"Command '%s' on host %s returned non-zero exit status %d\" % (self.cmd, self.host, self.returncode)\n\n\nclass RemoteContext:\n    def __init__(self, host, **kwargs):\n        self.host = host\n        self.username = kwargs.get(\"username\", None)\n        self.key_file = kwargs.get(\"key_file\", None)\n        self.connect_timeout = kwargs.get(\"connect_timeout\", None)\n        self.port = kwargs.get(\"port\", None)\n        self.no_host_key_check = kwargs.get(\"no_host_key_check\", False)\n        self.sshpass = kwargs.get(\"sshpass\", False)\n        self.tty = kwargs.get(\"tty\", False)\n\n    def __repr__(self):\n        return \"%s(%r, %r, %r, %r, %r)\" % (type(self).__name__, self.host, self.username, self.key_file, self.connect_timeout, self.port)\n\n    def __eq__(self, other):\n        return repr(self) == repr(other)\n\n    def __hash__(self):\n        return hash(repr(self))\n\n    def _host_ref(self):\n        if self.username:\n            return \"{0}@{1}\".format(self.username, self.host)\n        else:\n            return self.host\n\n    def _prepare_cmd(self, cmd):\n        connection_cmd = [\"ssh\", self._host_ref(), \"-o\", \"ControlMaster=no\"]\n        if self.sshpass:\n            connection_cmd = [\"sshpass\", \"-e\"] + connection_cmd\n        else:\n            connection_cmd += [\"-o\", \"BatchMode=yes\"]  # no password prompts etc\n        if self.port:\n            connection_cmd.extend([\"-p\", self.port])\n\n        if self.connect_timeout is not None:\n            connection_cmd += [\"-o\", \"ConnectTimeout=%d\" % self.connect_timeout]\n\n        if self.no_host_key_check:\n            connection_cmd += [\"-o\", \"UserKnownHostsFile=/dev/null\", \"-o\", 
\"StrictHostKeyChecking=no\"]\n\n        if self.key_file:\n            connection_cmd.extend([\"-i\", self.key_file])\n\n        if self.tty:\n            connection_cmd.append(\"-t\")\n        return connection_cmd + cmd\n\n    def Popen(self, cmd, **kwargs):\n        \"\"\"\n        Remote Popen.\n        \"\"\"\n        prefixed_cmd = self._prepare_cmd(cmd)\n        return subprocess.Popen(prefixed_cmd, **kwargs)\n\n    def check_output(self, cmd):\n        \"\"\"\n        Execute a shell command remotely and return the output.\n\n        Simplified version of Popen when you only want the output as a string and detect any errors.\n        \"\"\"\n        p = self.Popen(cmd, stdout=subprocess.PIPE)\n        output, _ = p.communicate()\n        if p.returncode != 0:\n            raise RemoteCalledProcessError(p.returncode, cmd, self.host, output=output)\n        return output\n\n    @contextlib.contextmanager\n    def tunnel(self, local_port, remote_port=None, remote_host=\"localhost\"):\n        \"\"\"\n        Open a tunnel between localhost:local_port and remote_host:remote_port via the host specified by this context.\n\n        Remember to close() the returned \"tunnel\" object in order to clean up\n        after yourself when you are done with the tunnel.\n        \"\"\"\n        tunnel_host = \"{0}:{1}:{2}\".format(local_port, remote_host, remote_port)\n        proc = self.Popen(\n            # cat so we can shut down gracefully by closing stdin\n            [\"-L\", tunnel_host, \"echo -n ready && cat\"],\n            stdin=subprocess.PIPE,\n            stdout=subprocess.PIPE,\n        )\n        # make sure to get the data so we know the connection is established\n        ready = proc.stdout.read(5)\n        assert ready == b\"ready\", \"Didn't get ready from remote echo\"\n        yield  # user code executed here\n        proc.communicate()\n        assert proc.returncode == 0, \"Tunnel process did an unclean exit (returncode %s)\" % 
(proc.returncode,)\n\n\nclass RemoteFileSystem(luigi.target.FileSystem):\n    def __init__(self, host, **kwargs):\n        self.remote_context = RemoteContext(host, **kwargs)\n\n    def exists(self, path):\n        \"\"\"\n        Return `True` if file or directory at `path` exist, False otherwise.\n        \"\"\"\n        try:\n            self.remote_context.check_output([\"test\", \"-e\", path])\n        except subprocess.CalledProcessError as e:\n            if e.returncode == 1:\n                return False\n            else:\n                raise\n        return True\n\n    def listdir(self, path):\n        while path.endswith(\"/\"):\n            path = path[:-1]\n\n        path = path or \".\"\n        listing = self.remote_context.check_output([\"find\", \"-L\", path, \"-type\", \"f\"]).splitlines()\n        return [v.decode(\"utf-8\") for v in listing]\n\n    def isdir(self, path):\n        \"\"\"\n        Return `True` if directory at `path` exist, False otherwise.\n        \"\"\"\n        try:\n            self.remote_context.check_output([\"test\", \"-d\", path])\n        except subprocess.CalledProcessError as e:\n            if e.returncode == 1:\n                return False\n            else:\n                raise\n        return True\n\n    def remove(self, path, recursive=True):\n        \"\"\"\n        Remove file or directory at location `path`.\n        \"\"\"\n        if recursive:\n            cmd = [\"rm\", \"-r\", path]\n        else:\n            cmd = [\"rm\", path]\n\n        self.remote_context.check_output(cmd)\n\n    def mkdir(self, path, parents=True, raise_if_exists=False):\n        if self.exists(path):\n            if raise_if_exists:\n                raise luigi.target.FileAlreadyExists()\n            elif not self.isdir(path):\n                raise luigi.target.NotADirectory()\n            else:\n                return\n\n        if parents:\n            cmd = [\"mkdir\", \"-p\", path]\n        else:\n            cmd = 
[\"mkdir\", path, \"2>&1\"]\n\n        try:\n            self.remote_context.check_output(cmd)\n        except subprocess.CalledProcessError as e:\n            if b\"no such file\" in e.output.lower():\n                raise luigi.target.MissingParentDirectory()\n            raise\n\n    def _scp(self, src, dest):\n        cmd = [\"scp\", \"-q\", \"-C\", \"-o\", \"ControlMaster=no\"]\n        if self.remote_context.sshpass:\n            cmd = [\"sshpass\", \"-e\"] + cmd\n        else:\n            cmd.append(\"-B\")\n        if self.remote_context.no_host_key_check:\n            cmd.extend([\"-o\", \"UserKnownHostsFile=/dev/null\", \"-o\", \"StrictHostKeyChecking=no\"])\n        if self.remote_context.key_file:\n            cmd.extend([\"-i\", self.remote_context.key_file])\n        if self.remote_context.port:\n            cmd.extend([\"-P\", self.remote_context.port])\n        if os.path.isdir(src):\n            cmd.extend([\"-r\"])\n        cmd.extend([src, dest])\n        p = subprocess.Popen(cmd)\n        output, _ = p.communicate()\n        if p.returncode != 0:\n            raise subprocess.CalledProcessError(p.returncode, cmd, output=output)\n\n    def put(self, local_path, path):\n        # create parent folder if not exists\n        normpath = posixpath.normpath(path)\n        folder = os.path.dirname(normpath)\n        if folder and not self.exists(folder):\n            self.remote_context.check_output([\"mkdir\", \"-p\", folder])\n\n        tmp_path = path + \"-luigi-tmp-%09d\" % random.randrange(0, 10_000_000_000)\n        self._scp(local_path, \"%s:%s\" % (self.remote_context._host_ref(), tmp_path))\n        self.remote_context.check_output([\"mv\", tmp_path, path])\n\n    def get(self, path, local_path):\n        # Create folder if it does not exist\n        normpath = os.path.normpath(local_path)\n        folder = os.path.dirname(normpath)\n        if folder:\n            try:\n                os.makedirs(folder)\n            except OSError:\n       
         pass\n\n        tmp_local_path = local_path + \"-luigi-tmp-%09d\" % random.randrange(0, 10_000_000_000)\n        self._scp(\"%s:%s\" % (self.remote_context._host_ref(), path), tmp_local_path)\n        os.replace(tmp_local_path, local_path)\n\n\nclass AtomicRemoteFileWriter(luigi.format.OutputPipeProcessWrapper):\n    def __init__(self, fs, path):\n        self._fs = fs\n        self.path = path\n\n        # create parent folder if not exists\n        normpath = os.path.normpath(self.path)\n        folder = os.path.dirname(normpath)\n        if folder:\n            self.fs.mkdir(folder)\n\n        self.__tmp_path = self.path + \"-luigi-tmp-%09d\" % random.randrange(0, 10_000_000_000)\n        super(AtomicRemoteFileWriter, self).__init__(self.fs.remote_context._prepare_cmd([\"cat\", \">\", self.__tmp_path]))\n\n    def __del__(self):\n        super(AtomicRemoteFileWriter, self).__del__()\n\n        try:\n            if self.fs.exists(self.__tmp_path):\n                self.fs.remote_context.check_output([\"rm\", self.__tmp_path])\n        except Exception:\n            # Don't propagate the exception; bad things can happen.\n            logger.exception(\"Failed to delete in-flight file\")\n\n    def close(self):\n        super(AtomicRemoteFileWriter, self).close()\n        self.fs.remote_context.check_output([\"mv\", self.__tmp_path, self.path])\n\n    @property\n    def tmp_path(self):\n        return self.__tmp_path\n\n    @property\n    def fs(self):\n        return self._fs\n\n\nclass RemoteTarget(luigi.target.FileSystemTarget):\n    \"\"\"\n    Target used for reading from remote files.\n\n    The target is implemented using ssh commands streaming data over the network.\n    \"\"\"\n\n    def __init__(self, path, host, format=None, **kwargs):\n        super(RemoteTarget, self).__init__(path)\n        if format is None:\n            format = luigi.format.get_default_format()\n        self.format = format\n        self._fs = RemoteFileSystem(host, 
**kwargs)\n\n    @property\n    def fs(self):\n        return self._fs\n\n    def open(self, mode=\"r\"):\n        if mode == \"w\":\n            file_writer = AtomicRemoteFileWriter(self.fs, self.path)\n            if self.format:\n                return self.format.pipe_writer(file_writer)\n            else:\n                return file_writer\n        elif mode == \"r\":\n            file_reader = luigi.format.InputPipeProcessWrapper(self.fs.remote_context._prepare_cmd([\"cat\", self.path]))\n            if self.format:\n                return self.format.pipe_reader(file_reader)\n            else:\n                return file_reader\n        else:\n            raise Exception(\"mode must be 'r' or 'w' (got: %s)\" % mode)\n\n    def put(self, local_path):\n        self.fs.put(local_path, self.path)\n\n    def get(self, local_path):\n        self.fs.get(self.path, local_path)\n"
  },
  {
    "path": "luigi/contrib/target.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport logging\nfrom types import MethodType\n\nimport luigi.target\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass CascadingClient:\n    \"\"\"\n    A FilesystemClient that will cascade failing function calls through a list of clients.\n\n    Which clients are used are specified at time of construction.\n    \"\"\"\n\n    # This constant member is supposed to include all methods, feel free to add\n    # methods here. 
If you want full control of which methods that should be\n    # created, pass the kwarg to the constructor.\n    ALL_METHOD_NAMES = [\n        \"exists\",\n        \"rename\",\n        \"remove\",\n        \"chmod\",\n        \"chown\",\n        \"count\",\n        \"copy\",\n        \"get\",\n        \"put\",\n        \"mkdir\",\n        \"list\",\n        \"listdir\",\n        \"getmerge\",\n        \"isdir\",\n        \"rename_dont_move\",\n        \"touchz\",\n    ]\n\n    def __init__(self, clients, method_names=None):\n        self.clients = clients\n        if method_names is None:\n            method_names = self.ALL_METHOD_NAMES\n\n        for method_name in method_names:\n            new_method = self._make_method(method_name)\n            real_method = MethodType(new_method, self)\n            setattr(self, method_name, real_method)\n\n    @classmethod\n    def _make_method(cls, method_name):\n        def new_method(self, *args, **kwargs):\n            return self._chained_call(method_name, *args, **kwargs)\n\n        return new_method\n\n    def _chained_call(self, method_name, *args, **kwargs):\n        for i in range(len(self.clients)):\n            client = self.clients[i]\n            try:\n                result = getattr(client, method_name)(*args, **kwargs)\n                return result\n            except luigi.target.FileSystemException:\n                # For exceptions that are semantical, we must throw along\n                raise\n            except BaseException:\n                is_last_iteration = (i + 1) >= len(self.clients)\n                if is_last_iteration:\n                    raise\n                else:\n                    logger.warning(\n                        \"The %s failed to %s, using fallback class %s\", client.__class__.__name__, method_name, self.clients[i + 1].__class__.__name__\n                    )\n"
  },
  {
    "path": "luigi/contrib/webhdfs.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nProvides a :class:`WebHdfsTarget` using the `Python hdfs\n<https://pypi.python.org/pypi/hdfs/>`_\n\nThis module is DEPRECATED and does not play well with rest of luigi's hdfs\ncontrib module. You can consider migrating to\n:class:`luigi.contrib.hdfs.webhdfs_client.WebHdfsClient`\n\"\"\"\n\nimport logging\n\nimport luigi.contrib.hdfs\nfrom luigi.format import get_default_format\nfrom luigi.target import AtomicLocalFile, FileSystemTarget\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass WebHdfsTarget(FileSystemTarget):\n    fs = None\n\n    def __init__(self, path, client=None, format=None):\n        super(WebHdfsTarget, self).__init__(path)\n        path = self.path\n        self.fs = client or WebHdfsClient()\n        if format is None:\n            format = get_default_format()\n\n        self.format = format\n\n    def open(self, mode=\"r\"):\n        if mode not in (\"r\", \"w\"):\n            raise ValueError(\"Unsupported open mode '%s'\" % mode)\n\n        if mode == \"r\":\n            return self.format.pipe_reader(ReadableWebHdfsFile(path=self.path, client=self.fs))\n\n        return self.format.pipe_writer(AtomicWebHdfsFile(path=self.path, client=self.fs))\n\n\nclass ReadableWebHdfsFile:\n    def __init__(self, path, client):\n        self.path = path\n        self.client = client\n        self.generator = 
None\n\n    def read(self):\n        self.generator = self.client.read(self.path)\n        res = list(self.generator)[0]\n        return res\n\n    def readlines(self, char=\"\\n\"):\n        self.generator = self.client.read(self.path, buffer_char=char)\n        return self.generator\n\n    def __enter__(self):\n        return self\n\n    def __exit__(self, exc_type, exc, traceback):\n        self.close()\n\n    def __iter__(self):\n        self.generator = self.readlines(\"\\n\")\n        yield from self.generator\n        self.close()\n\n    def close(self):\n        self.generator.close()\n\n\nclass AtomicWebHdfsFile(AtomicLocalFile):\n    \"\"\"\n    An Hdfs file that writes to a temp file and is put to WebHdfs on close.\n    \"\"\"\n\n    def __init__(self, path, client):\n        self.client = client\n        super(AtomicWebHdfsFile, self).__init__(path)\n\n    def move_to_final_destination(self):\n        if not self.client.exists(self.path):\n            self.client.upload(self.path, self.tmp_path)\n\n\nWebHdfsClient = luigi.contrib.hdfs.WebHdfsClient\n"
  },
  {
    "path": "luigi/date_interval.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\n``luigi.date_interval`` provides convenient classes for date algebra.\nEverything uses ISO 8601 notation, i.e. YYYY-MM-DD for dates, etc.\nThere is a corresponding :class:`luigi.parameter.DateIntervalParameter` that you can use to parse date intervals.\n\nExample::\n\n    class MyTask(luigi.Task):\n        date_interval = luigi.DateIntervalParameter()\n\nNow, you can launch this from the command line using\n``--date-interval 2014-05-10`` or\n``--date-interval 2014-W26`` (using week notation) or\n``--date-interval 2014`` (for a year) and some other notations.\n\"\"\"\n\nimport datetime\nimport re\n\n\nclass DateInterval:\n    \"\"\"\n    The :class:`DateInterval` is the base class with subclasses :class:`Date`, :class:`Week`, :class:`Month`, :class:`Year`, and :class:`Custom`.\n    Note that the :class:`DateInterval` is abstract and should not be used directly: use :class:`Custom` for arbitrary date intervals.\n    The base class features a couple of convenience methods, such as ``next()`` which returns the next consecutive date interval.\n\n    Example::\n\n       x = luigi.date_interval.Week(2013, 52)\n       print(x.next())\n\n    This will print ``2014-W01``.\n\n    All instances of :class:`DateInterval` have attributes ``date_a`` and ``date_b`` set.\n    This represents the half open range of the date interval.\n    For 
instance, a May 2014 is represented as ``date_a = 2014-05-01``, ``date_b = 2014-06-01``.\n    \"\"\"\n\n    def __init__(self, date_a, date_b):\n        self.date_a = date_a\n        self.date_b = date_b\n\n    def dates(self):\n        \"\"\"Returns a list of dates in this date interval.\"\"\"\n        dates = []\n        d = self.date_a\n        while d < self.date_b:\n            dates.append(d)\n            d += datetime.timedelta(1)\n\n        return dates\n\n    def hours(self):\n        \"\"\"Same as dates() but returns 24 times more info: one for each hour.\"\"\"\n        for date in self.dates():\n            for hour in range(24):\n                yield datetime.datetime.combine(date, datetime.time(hour))\n\n    def __str__(self):\n        return self.to_string()\n\n    def __repr__(self):\n        return self.to_string()\n\n    def prev(self):\n        \"\"\"Returns the preceding corresponding date interval (eg. May -> April).\"\"\"\n        return self.from_date(self.date_a - datetime.timedelta(1))\n\n    def next(self):\n        \"\"\"Returns the subsequent corresponding date interval (eg. 
2014 -> 2015).\"\"\"\n        return self.from_date(self.date_b)\n\n    def to_string(self):\n        raise NotImplementedError\n\n    @classmethod\n    def from_date(cls, d):\n        \"\"\"Abstract class method.\n\n        For instance, ``Month.from_date(datetime.date(2012, 6, 6))`` returns a ``Month(2012, 6)``.\"\"\"\n        raise NotImplementedError\n\n    @classmethod\n    def parse(cls, s):\n        \"\"\"Abstract class method.\n\n        For instance, ``Year.parse(\"2014\")`` returns a ``Year(2014)``.\"\"\"\n        raise NotImplementedError\n\n    def __contains__(self, date):\n        return date in self.dates()\n\n    def __iter__(self):\n        for d in self.dates():\n            yield d\n\n    def __hash__(self):\n        return hash(repr(self))\n\n    def __cmp__(self, other):\n        if not isinstance(self, type(other)):\n            # doing this because it's not well defined if eg. 2012-01-01-2013-01-01 == 2012\n            raise TypeError(\"Date interval type mismatch\")\n\n        return (self > other) - (self < other)\n\n    def __lt__(self, other):\n        if not isinstance(self, type(other)):\n            raise TypeError(\"Date interval type mismatch\")\n        return (self.date_a, self.date_b) < (other.date_a, other.date_b)\n\n    def __le__(self, other):\n        if not isinstance(self, type(other)):\n            raise TypeError(\"Date interval type mismatch\")\n        return (self.date_a, self.date_b) <= (other.date_a, other.date_b)\n\n    def __gt__(self, other):\n        if not isinstance(self, type(other)):\n            raise TypeError(\"Date interval type mismatch\")\n        return (self.date_a, self.date_b) > (other.date_a, other.date_b)\n\n    def __ge__(self, other):\n        if not isinstance(self, type(other)):\n            raise TypeError(\"Date interval type mismatch\")\n        return (self.date_a, self.date_b) >= (other.date_a, other.date_b)\n\n    def __eq__(self, other):\n        if not isinstance(other, DateInterval):\n 
           return False\n        if not isinstance(self, type(other)):\n            raise TypeError(\"Date interval type mismatch\")\n        else:\n            return (self.date_a, self.date_b) == (other.date_a, other.date_b)\n\n    def __ne__(self, other):\n        return not self.__eq__(other)\n\n\nclass Date(DateInterval):\n    \"\"\"Most simple :class:`DateInterval` where ``date_b == date_a + datetime.timedelta(1)``.\"\"\"\n\n    def __init__(self, y, m, d):\n        a = datetime.date(y, m, d)\n        b = datetime.date(y, m, d) + datetime.timedelta(1)\n        super(Date, self).__init__(a, b)\n\n    def to_string(self):\n        return self.date_a.strftime(\"%Y-%m-%d\")\n\n    @classmethod\n    def from_date(cls, d):\n        return Date(d.year, d.month, d.day)\n\n    @classmethod\n    def parse(cls, s):\n        if re.match(r\"\\d\\d\\d\\d\\-\\d\\d\\-\\d\\d$\", s):\n            return Date(*map(int, s.split(\"-\")))\n\n\nclass Week(DateInterval):\n    \"\"\"ISO 8601 week. Note that it has some counterintuitive behavior around new year.\n    For instance Monday 29 December 2008 is week 2009-W01, and Sunday 3 January 2010 is week 2009-W53\n    This example was taken from from http://en.wikipedia.org/wiki/ISO_8601#Week_dates\n    \"\"\"\n\n    def __init__(self, y, w):\n        \"\"\"Python datetime does not have a method to convert from ISO weeks, so the constructor uses some stupid brute force\"\"\"\n        for d in range(-10, 370):\n            date = datetime.date(y, 1, 1) + datetime.timedelta(d)\n            if date.isocalendar() == (y, w, 1):\n                date_a = date\n                break\n        else:\n            raise ValueError(\"Invalid week\")\n        date_b = date_a + datetime.timedelta(7)\n        super(Week, self).__init__(date_a, date_b)\n\n    def to_string(self):\n        return \"%d-W%02d\" % self.date_a.isocalendar()[:2]\n\n    @classmethod\n    def from_date(cls, d):\n        return Week(*d.isocalendar()[:2])\n\n    @classmethod\n 
   def parse(cls, s):\n        if re.match(r\"\\d\\d\\d\\d\\-W\\d\\d$\", s):\n            y, w = map(int, s.split(\"-W\"))\n            return Week(y, w)\n\n\nclass Month(DateInterval):\n    def __init__(self, y, m):\n        date_a = datetime.date(y, m, 1)\n        date_b = datetime.date(y + m // 12, 1 + m % 12, 1)\n        super(Month, self).__init__(date_a, date_b)\n\n    def to_string(self):\n        return self.date_a.strftime(\"%Y-%m\")\n\n    @classmethod\n    def from_date(cls, d):\n        return Month(d.year, d.month)\n\n    @classmethod\n    def parse(cls, s):\n        if re.match(r\"\\d\\d\\d\\d\\-\\d\\d$\", s):\n            y, m = map(int, s.split(\"-\"))\n            return Month(y, m)\n\n\nclass Year(DateInterval):\n    def __init__(self, y):\n        date_a = datetime.date(y, 1, 1)\n        date_b = datetime.date(y + 1, 1, 1)\n        super(Year, self).__init__(date_a, date_b)\n\n    def to_string(self):\n        return self.date_a.strftime(\"%Y\")\n\n    @classmethod\n    def from_date(cls, d):\n        return Year(d.year)\n\n    @classmethod\n    def parse(cls, s):\n        if re.match(r\"\\d\\d\\d\\d$\", s):\n            return Year(int(s))\n\n\nclass Custom(DateInterval):\n    \"\"\"Custom date interval (does not implement prev and next methods)\n\n    Actually the ISO 8601 specifies <start>/<end> as the time interval format\n    Not sure if this goes for date intervals as well. In any case slashes will\n    most likely cause problems with paths etc.\n    \"\"\"\n\n    def to_string(self):\n        return \"-\".join([d.strftime(\"%Y-%m-%d\") for d in (self.date_a, self.date_b)])\n\n    @classmethod\n    def parse(cls, s):\n        if re.match(r\"\\d\\d\\d\\d\\-\\d\\d\\-\\d\\d\\-\\d\\d\\d\\d\\-\\d\\d\\-\\d\\d$\", s):\n            x = list(map(int, s.split(\"-\")))\n            date_a = datetime.date(*x[:3])\n            date_b = datetime.date(*x[3:])\n            return Custom(date_a, date_b)\n"
  },
  {
    "path": "luigi/db_task_history.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nProvides a database backend to the central scheduler. This lets you see historical runs.\nSee :ref:`TaskHistory` for information about how to turn on the task history feature.\n\"\"\"\n#\n# Description: Added code for visualization of how long each task takes\n# running-time until it reaches the next status (failed or done)\n# At \"{base_url}/tasklist\", all completed (failed or done) tasks are shown.\n# At \"{base_url}/tasklist\", a user can select one specific task to see\n# how its running-time has changed over time.\n# At \"{base_url}/tasklist/{task_name}\", it visualizes a multi-bar graph\n# that represents the changes of the running-time for a selected task\n# up to the next status (failed or done).\n# This visualization lets us know how the running-time of the specific task\n# has changed over time.\n#\n# Copyright 2015 Naver Corp.\n# Author Yeseul Park (yeseul.park@navercorp.com)\n#\n\nimport datetime\nimport logging\nfrom contextlib import contextmanager\n\nimport sqlalchemy\nimport sqlalchemy.ext.declarative\nimport sqlalchemy.orm\nimport sqlalchemy.orm.collections\nfrom sqlalchemy.engine import reflection\n\nfrom luigi import configuration, task_history\nfrom luigi.task_status import DONE, FAILED, PENDING, RUNNING\n\nBase = sqlalchemy.ext.declarative.declarative_base()\n\nlogger = 
logging.getLogger(\"luigi-interface\")\n\n\nclass DbTaskHistory(task_history.TaskHistory):\n    \"\"\"\n    Task History that writes to a database using sqlalchemy.\n    Also has methods for useful db queries.\n    \"\"\"\n\n    CURRENT_SOURCE_VERSION = 1\n\n    @contextmanager\n    def _session(self, session=None):\n        if session:\n            yield session\n        else:\n            session = self.session_factory()\n            try:\n                yield session\n            except BaseException:\n                session.rollback()\n                raise\n            else:\n                session.commit()\n\n    def __init__(self):\n        config = configuration.get_config()\n        connection_string = config.get(\"task_history\", \"db_connection\")\n        self.engine = sqlalchemy.create_engine(connection_string)\n        self.session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine, expire_on_commit=False)\n        Base.metadata.create_all(self.engine)\n        self.tasks = {}  # task_id -> TaskRecord\n\n        _upgrade_schema(self.engine)\n\n    def task_scheduled(self, task):\n        htask = self._get_task(task, status=PENDING)\n        self._add_task_event(htask, TaskEvent(event_name=PENDING, ts=datetime.datetime.now()))\n\n    def task_finished(self, task, successful):\n        event_name = DONE if successful else FAILED\n        htask = self._get_task(task, status=event_name)\n        self._add_task_event(htask, TaskEvent(event_name=event_name, ts=datetime.datetime.now()))\n\n    def task_started(self, task, worker_host):\n        htask = self._get_task(task, status=RUNNING, host=worker_host)\n        self._add_task_event(htask, TaskEvent(event_name=RUNNING, ts=datetime.datetime.now()))\n\n    def _get_task(self, task, status, host=None):\n        if task.id in self.tasks:\n            htask = self.tasks[task.id]\n            htask.status = status\n            if host:\n                htask.host = host\n        else:\n            htask 
= self.tasks[task.id] = task_history.StoredTask(task, status, host)\n        return htask\n\n    def _add_task_event(self, task, event):\n        for task_record, session in self._find_or_create_task(task):\n            task_record.events.append(event)\n\n    def _find_or_create_task(self, task):\n        with self._session() as session:\n            if task.record_id is not None:\n                logger.debug(\"Finding task with record_id [%d]\", task.record_id)\n                task_record = session.query(TaskRecord).get(task.record_id)\n                if not task_record:\n                    raise Exception(\"Task with record_id, but no matching Task record!\")\n                yield (task_record, session)\n            else:\n                task_record = TaskRecord(task_id=task._task.id, name=task.task_family, host=task.host)\n                for k, v in task.parameters.items():\n                    task_record.parameters[k] = TaskParameter(name=k, value=v)\n                session.add(task_record)\n                yield (task_record, session)\n            if task.host:\n                task_record.host = task.host\n        task.record_id = task_record.id\n\n    def find_all_by_parameters(self, task_name, session=None, **task_params):\n        \"\"\"\n        Find tasks with the given task_name and the same parameters as the kwargs.\n        \"\"\"\n        with self._session(session) as session:\n            query = session.query(TaskRecord).join(TaskEvent).filter(TaskRecord.name == task_name)\n            for k, v in task_params.items():\n                alias = sqlalchemy.orm.aliased(TaskParameter)\n                query = query.join(alias).filter(alias.name == k, alias.value == v)\n\n            tasks = query.order_by(TaskEvent.ts)\n            for task in tasks:\n                # Sanity check\n                assert all(k in task.parameters and v == str(task.parameters[k].value) for k, v in task_params.items())\n\n                yield task\n\n    def 
find_all_by_name(self, task_name, session=None):\n        \"\"\"\n        Find all tasks with the given task_name.\n        \"\"\"\n        return self.find_all_by_parameters(task_name, session)\n\n    def find_latest_runs(self, session=None):\n        \"\"\"\n        Return tasks that have been updated in the past 24 hours.\n        \"\"\"\n        with self._session(session) as session:\n            yesterday = datetime.datetime.now() - datetime.timedelta(days=1)\n            return (\n                session.query(TaskRecord)\n                .join(TaskEvent)\n                .filter(TaskEvent.ts >= yesterday)\n                .group_by(TaskRecord.id, TaskEvent.event_name, TaskEvent.ts)\n                .order_by(TaskEvent.ts.desc())\n                .all()\n            )\n\n    def find_all_runs(self, session=None):\n        \"\"\"\n        Return all tasks that have been updated.\n        \"\"\"\n        with self._session(session) as session:\n            return session.query(TaskRecord).all()\n\n    def find_all_events(self, session=None):\n        \"\"\"\n        Return all running/failed/done events.\n        \"\"\"\n        with self._session(session) as session:\n            return session.query(TaskEvent).all()\n\n    def find_task_by_id(self, id, session=None):\n        \"\"\"\n        Find task with the given record ID.\n        \"\"\"\n        with self._session(session) as session:\n            return session.query(TaskRecord).get(id)\n\n    def find_task_by_task_id(self, task_id, session=None):\n        \"\"\"\n        Find task with the given task ID.\n        \"\"\"\n        with self._session(session) as session:\n            return session.query(TaskRecord).filter(TaskRecord.task_id == task_id).all()[-1]\n\n\nclass TaskParameter(Base):  # type: ignore\n    \"\"\"\n    Table to track luigi.Parameter()s of a Task.\n    \"\"\"\n\n    __tablename__ = \"task_parameters\"\n    task_id = sqlalchemy.Column(sqlalchemy.Integer, 
sqlalchemy.ForeignKey(\"tasks.id\"), primary_key=True)\n    name = sqlalchemy.Column(sqlalchemy.String(128), primary_key=True)\n    value = sqlalchemy.Column(sqlalchemy.Text())\n\n    def __repr__(self):\n        return \"TaskParameter(task_id=%d, name=%s, value=%s)\" % (self.task_id, self.name, self.value)\n\n\nclass TaskEvent(Base):  # type: ignore\n    \"\"\"\n    Table to track when a task is scheduled, starts, finishes, and fails.\n    \"\"\"\n\n    __tablename__ = \"task_events\"\n    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)\n    task_id = sqlalchemy.Column(sqlalchemy.Integer, sqlalchemy.ForeignKey(\"tasks.id\"), index=True)\n    event_name = sqlalchemy.Column(sqlalchemy.String(20))\n    ts = sqlalchemy.Column(sqlalchemy.TIMESTAMP, index=True, nullable=False)\n\n    def __repr__(self):\n        return \"TaskEvent(task_id=%s, event_name=%s, ts=%s\" % (self.task_id, self.event_name, self.ts)\n\n\nclass TaskRecord(Base):  # type: ignore\n    \"\"\"\n    Base table to track information about a luigi.Task.\n\n    References to other tables are available through task.events, task.parameters, etc.\n    \"\"\"\n\n    __tablename__ = \"tasks\"\n    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)\n    task_id = sqlalchemy.Column(sqlalchemy.String(200), index=True)\n    name = sqlalchemy.Column(sqlalchemy.String(128), index=True)\n    host = sqlalchemy.Column(sqlalchemy.String(128))\n    parameters = sqlalchemy.orm.relationship(\n        \"TaskParameter\", collection_class=sqlalchemy.orm.collections.attribute_mapped_collection(\"name\"), cascade=\"all, delete-orphan\"\n    )\n    events = sqlalchemy.orm.relationship(\"TaskEvent\", order_by=(sqlalchemy.desc(TaskEvent.ts), sqlalchemy.desc(TaskEvent.id)), backref=\"task\")\n\n    def __repr__(self):\n        return \"TaskRecord(name=%s, host=%s)\" % (self.name, self.host)\n\n\ndef _upgrade_schema(engine):\n    \"\"\"\n    Ensure the database schema is up to date with the codebase.\n\n    
:param engine: SQLAlchemy engine of the underlying database.\n    \"\"\"\n    inspector = reflection.Inspector.from_engine(engine)\n    with engine.connect() as conn:\n        # Upgrade 1.  Add task_id column and index to tasks\n        if \"task_id\" not in [x[\"name\"] for x in inspector.get_columns(\"tasks\")]:\n            logger.warning(\"Upgrading DbTaskHistory schema: Adding tasks.task_id\")\n            conn.execute(\"ALTER TABLE tasks ADD COLUMN task_id VARCHAR(200)\")\n            conn.execute(\"CREATE INDEX ix_task_id ON tasks (task_id)\")\n\n        # Upgrade 2. Alter value column to be TEXT, note that this is idempotent so no if-guard\n        if \"mysql\" in engine.dialect.name:\n            conn.execute(\"ALTER TABLE task_parameters MODIFY COLUMN value TEXT\")\n        elif \"oracle\" in engine.dialect.name:\n            conn.execute(\"ALTER TABLE task_parameters MODIFY value TEXT\")\n        elif \"mssql\" in engine.dialect.name:\n            conn.execute(\"ALTER TABLE task_parameters ALTER COLUMN value TEXT\")\n        elif \"postgresql\" in engine.dialect.name:\n            if str([x for x in inspector.get_columns(\"task_parameters\") if x[\"name\"] == \"value\"][0][\"type\"]) != \"TEXT\":\n                conn.execute(\"ALTER TABLE task_parameters ALTER COLUMN value TYPE TEXT\")\n        elif \"sqlite\" in engine.dialect.name:\n            # SQLite does not support changing column types. A new database file will need\n            # to be used to pick up this migration change.\n            for i in conn.execute(\"PRAGMA table_info(task_parameters);\").fetchall():\n                if i[\"name\"] == \"value\" and i[\"type\"] != \"TEXT\":\n                    logger.warning(\"SQLite can not change column types. Please use a new database to pickup column type changes.\")\n        else:\n            logger.warning(\"SQLAlchemy dialect {} could not be migrated to the TEXT type\".format(engine.dialect))\n"
  },
  {
    "path": "luigi/event.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"Definitions needed for events. See :ref:`Events` for info on how to use it.\"\"\"\n\n\nclass Event:\n    # TODO nice descriptive subclasses of Event instead of strings? pass their instances to the callback instead of an undocumented arg list?\n    DEPENDENCY_DISCOVERED = \"event.core.dependency.discovered\"  # triggered for every (task, upstream task) pair discovered in a jobflow\n    DEPENDENCY_MISSING = \"event.core.dependency.missing\"\n    DEPENDENCY_PRESENT = \"event.core.dependency.present\"\n    BROKEN_TASK = \"event.core.task.broken\"\n    START = \"event.core.start\"\n    #: This event can be fired by the task itself while running. The purpose is\n    #: for the task to report progress, metadata or any generic info so that\n    #: event handler listening for this can keep track of the progress of running task.\n    PROGRESS = \"event.core.progress\"\n    FAILURE = \"event.core.failure\"\n    SUCCESS = \"event.core.success\"\n    PROCESSING_TIME = \"event.core.processing_time\"\n    TIMEOUT = \"event.core.timeout\"  # triggered if a task times out\n    PROCESS_FAILURE = \"event.core.process_failure\"  # triggered if the process a task is running in dies unexpectedly\n"
  },
  {
    "path": "luigi/execution_summary.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nThis module provide the function :py:func:`summary` that is used for printing\nan `execution summary\n<https://github.com/spotify/luigi/blob/master/examples/execution_summary_example.py>`_\nat the end of luigi invocations.\n\"\"\"\n\nimport collections\nimport enum\nimport functools\nimport textwrap\n\nfrom luigi.parameter import IntParameter\nfrom luigi.task import Config\n\n\nclass execution_summary(Config):\n    summary_length = IntParameter(default=5)\n\n\nclass LuigiStatusCode(enum.Enum):\n    \"\"\"\n    All possible status codes for the attribute ``status`` in :class:`~luigi.execution_summary.LuigiRunResult` when\n    the argument ``detailed_summary=True`` in *luigi.run() / luigi.build*.\n    Here are the codes and what they mean:\n\n    =============================  ==========================================================\n    Status Code Name               Meaning\n    =============================  ==========================================================\n    SUCCESS                        There were no failed tasks or missing dependencies\n    SUCCESS_WITH_RETRY             There were failed tasks but they all succeeded in a retry\n    FAILED                         There were failed tasks\n    FAILED_AND_SCHEDULING_FAILED   There were failed tasks and tasks whose scheduling failed\n    SCHEDULING_FAILED            
  There were tasks whose scheduling failed\n    NOT_RUN                        There were tasks that were not granted run permission by the scheduler\n    MISSING_EXT                    There were missing external dependencies\n    =============================  ==========================================================\n\n    \"\"\"\n\n    SUCCESS = (\":)\", \"there were no failed tasks or missing dependencies\")\n    SUCCESS_WITH_RETRY = (\":)\", \"there were failed tasks but they all succeeded in a retry\")\n    FAILED = (\":(\", \"there were failed tasks\")\n    FAILED_AND_SCHEDULING_FAILED = (\":(\", \"there were failed tasks and tasks whose scheduling failed\")\n    SCHEDULING_FAILED = (\":(\", \"there were tasks whose scheduling failed\")\n    NOT_RUN = (\":|\", \"there were tasks that were not granted run permission by the scheduler\")\n    MISSING_EXT = (\":|\", \"there were missing external dependencies\")\n\n\nclass LuigiRunResult:\n    \"\"\"\n    The result of a call to build/run when passing the detailed_summary=True argument.\n\n    Attributes:\n        - one_line_summary (str): One line summary of the progress.\n        - summary_text (str): Detailed summary of the progress.\n        - status (LuigiStatusCode): Luigi Status Code. See :class:`~luigi.execution_summary.LuigiStatusCode` for what these codes mean.\n        - worker (luigi.worker.worker): Worker object. 
See :class:`~luigi.worker.worker`.\n        - scheduling_succeeded (bool): Boolean which is *True* if all the tasks were scheduled without errors.\n\n    \"\"\"\n\n    def __init__(self, worker, worker_add_run_status=True):\n        self.worker = worker\n        summary_dict = _summary_dict(worker)\n        self.summary_text = _summary_wrap(_summary_format(summary_dict, worker))\n        self.status = _tasks_status(summary_dict)\n        self.one_line_summary = _create_one_line_summary(self.status)\n        self.scheduling_succeeded = worker_add_run_status\n\n    def __str__(self):\n        return \"LuigiRunResult with status {0}\".format(self.status)\n\n    def __repr__(self):\n        return \"LuigiRunResult(status={0!r},worker={1!r},scheduling_succeeded={2!r})\".format(self.status, self.worker, self.scheduling_succeeded)\n\n\ndef _partition_tasks(worker):\n    \"\"\"\n    Takes a worker and sorts out tasks based on their status.\n    Still_pending_not_ext is only used to get upstream_failure, upstream_missing_dependency and run_by_other_worker\n    \"\"\"\n    task_history = worker._add_task_history\n    pending_tasks = {task for (task, status, ext) in task_history if status == \"PENDING\"}\n    set_tasks = {}\n    set_tasks[\"completed\"] = {task for (task, status, ext) in task_history if status == \"DONE\" and task in pending_tasks}\n    set_tasks[\"already_done\"] = {\n        task for (task, status, ext) in task_history if status == \"DONE\" and task not in pending_tasks and task not in set_tasks[\"completed\"]\n    }\n    set_tasks[\"ever_failed\"] = {task for (task, status, ext) in task_history if status == \"FAILED\"}\n    set_tasks[\"failed\"] = set_tasks[\"ever_failed\"] - set_tasks[\"completed\"]\n    set_tasks[\"scheduling_error\"] = {task for (task, status, ext) in task_history if status == \"UNKNOWN\"}\n    set_tasks[\"still_pending_ext\"] = {\n        task\n        for (task, status, ext) in task_history\n        if status == \"PENDING\" and task 
not in set_tasks[\"ever_failed\"] and task not in set_tasks[\"completed\"] and not ext\n    }\n    set_tasks[\"still_pending_not_ext\"] = {\n        task\n        for (task, status, ext) in task_history\n        if status == \"PENDING\" and task not in set_tasks[\"ever_failed\"] and task not in set_tasks[\"completed\"] and ext\n    }\n    set_tasks[\"run_by_other_worker\"] = set()\n    set_tasks[\"upstream_failure\"] = set()\n    set_tasks[\"upstream_missing_dependency\"] = set()\n    set_tasks[\"upstream_run_by_other_worker\"] = set()\n    set_tasks[\"upstream_scheduling_error\"] = set()\n    set_tasks[\"not_run\"] = set()\n    return set_tasks\n\n\ndef _root_task(worker):\n    \"\"\"\n    Return the first task scheduled by the worker, corresponding to the root task\n    \"\"\"\n    return worker._add_task_history[0][0]\n\n\ndef _populate_unknown_statuses(set_tasks):\n    \"\"\"\n    Add the \"upstream_*\" and \"not_run\" statuses my mutating set_tasks.\n    \"\"\"\n    visited = set()\n    for task in set_tasks[\"still_pending_not_ext\"]:\n        _depth_first_search(set_tasks, task, visited)\n\n\ndef _depth_first_search(set_tasks, current_task, visited):\n    \"\"\"\n    This dfs checks why tasks are still pending.\n    \"\"\"\n    visited.add(current_task)\n    if current_task in set_tasks[\"still_pending_not_ext\"]:\n        upstream_failure = False\n        upstream_missing_dependency = False\n        upstream_run_by_other_worker = False\n        upstream_scheduling_error = False\n        for task in current_task._requires():\n            if task not in visited:\n                _depth_first_search(set_tasks, task, visited)\n            if task in set_tasks[\"ever_failed\"] or task in set_tasks[\"upstream_failure\"]:\n                set_tasks[\"upstream_failure\"].add(current_task)\n                upstream_failure = True\n            if task in set_tasks[\"still_pending_ext\"] or task in set_tasks[\"upstream_missing_dependency\"]:\n                
set_tasks[\"upstream_missing_dependency\"].add(current_task)\n                upstream_missing_dependency = True\n            if task in set_tasks[\"run_by_other_worker\"] or task in set_tasks[\"upstream_run_by_other_worker\"]:\n                set_tasks[\"upstream_run_by_other_worker\"].add(current_task)\n                upstream_run_by_other_worker = True\n            if task in set_tasks[\"scheduling_error\"]:\n                set_tasks[\"upstream_scheduling_error\"].add(current_task)\n                upstream_scheduling_error = True\n        if (\n            not upstream_failure\n            and not upstream_missing_dependency\n            and not upstream_run_by_other_worker\n            and not upstream_scheduling_error\n            and current_task not in set_tasks[\"run_by_other_worker\"]\n        ):\n            set_tasks[\"not_run\"].add(current_task)\n\n\ndef _get_str(task_dict, extra_indent):\n    \"\"\"\n    This returns a string for each status\n    \"\"\"\n    summary_length = execution_summary().summary_length\n\n    lines = []\n    task_names = sorted(task_dict.keys())\n    for task_family in task_names:\n        tasks = task_dict[task_family]\n        tasks = sorted(tasks, key=lambda x: str(x))\n        prefix_size = 8 if extra_indent else 4\n        prefix = \" \" * prefix_size\n\n        line = None\n\n        if summary_length > 0 and len(lines) >= summary_length:\n            line = prefix + \"...\"\n            lines.append(line)\n            break\n        if len(tasks[0].get_params()) == 0:\n            line = prefix + \"- {0} {1}()\".format(len(tasks), str(task_family))\n        elif (\n            _get_len_of_params(tasks[0]) > 60\n            or len(str(tasks[0])) > 200\n            or (len(tasks) == 2 and len(tasks[0].get_params()) > 1 and (_get_len_of_params(tasks[0]) > 40 or len(str(tasks[0])) > 100))\n        ):\n            \"\"\"\n            This is to make sure that there is no really long task in the output\n            
\"\"\"\n            line = prefix + \"- {0} {1}(...)\".format(len(tasks), task_family)\n        elif len((tasks[0].get_params())) == 1:\n            attributes = {getattr(task, tasks[0].get_params()[0][0]) for task in tasks}\n            param_class = tasks[0].get_params()[0][1]\n            first, last = _ranging_attributes(attributes, param_class)\n            if first is not None and last is not None and len(attributes) > 3:\n                param_str = \"{0}...{1}\".format(param_class.serialize(first), param_class.serialize(last))\n            else:\n                param_str = \"{0}\".format(_get_str_one_parameter(tasks))\n            line = prefix + \"- {0} {1}({2}={3})\".format(len(tasks), task_family, tasks[0].get_params()[0][0], param_str)\n        else:\n            ranging = False\n            params = _get_set_of_params(tasks)\n            unique_param_keys = list(_get_unique_param_keys(params))\n            if len(unique_param_keys) == 1:\n                (unique_param,) = unique_param_keys\n                attributes = params[unique_param]\n                param_class = unique_param[1]\n                first, last = _ranging_attributes(attributes, param_class)\n                if first is not None and last is not None and len(attributes) > 2:\n                    ranging = True\n                    line = prefix + \"- {0} {1}({2}\".format(len(tasks), task_family, _get_str_ranging_multiple_parameters(first, last, tasks, unique_param))\n            if not ranging:\n                if len(tasks) == 1:\n                    line = prefix + \"- {0} {1}\".format(len(tasks), tasks[0])\n                if len(tasks) == 2:\n                    line = prefix + \"- {0} {1} and {2}\".format(len(tasks), tasks[0], tasks[1])\n                if len(tasks) > 2:\n                    line = prefix + \"- {0} {1} ...\".format(len(tasks), tasks[0])\n        lines.append(line)\n    return \"\\n\".join(lines)\n\n\ndef _get_len_of_params(task):\n    return sum(len(param[0]) 
for param in task.get_params())\n\n\ndef _get_str_ranging_multiple_parameters(first, last, tasks, unique_param):\n    row = \"\"\n    str_unique_param = \"{0}...{1}\".format(unique_param[1].serialize(first), unique_param[1].serialize(last))\n    for param in tasks[0].get_params():\n        row += \"{0}=\".format(param[0])\n        if param[0] == unique_param[0]:\n            row += \"{0}\".format(str_unique_param)\n        else:\n            row += \"{0}\".format(param[1].serialize(getattr(tasks[0], param[0])))\n        if param != tasks[0].get_params()[-1]:\n            row += \", \"\n    row += \")\"\n    return row\n\n\ndef _get_set_of_params(tasks):\n    params = {}\n    for param in tasks[0].get_params():\n        params[param] = {getattr(task, param[0]) for task in tasks}\n    return params\n\n\ndef _get_unique_param_keys(params):\n    for param_key, param_values in params.items():\n        if len(param_values) > 1:\n            yield param_key\n\n\ndef _ranging_attributes(attributes, param_class):\n    \"\"\"\n    Checks if there is a continuous range\n    \"\"\"\n    next_attributes = {param_class.next_in_enumeration(attribute) for attribute in attributes}\n    in_first = attributes.difference(next_attributes)\n    in_second = next_attributes.difference(attributes)\n    if len(in_first) == 1 and len(in_second) == 1:\n        for x in attributes:\n            if {param_class.next_in_enumeration(x)} == in_second:\n                return next(iter(in_first)), x\n    return None, None\n\n\ndef _get_str_one_parameter(tasks):\n    row = \"\"\n    count = 0\n    for task in tasks:\n        if (len(row) >= 30 and count > 2 and count != len(tasks) - 1) or len(row) > 200:\n            row += \"...\"\n            break\n        param = task.get_params()[0]\n        row += \"{0}\".format(param[1].serialize(getattr(task, param[0])))\n        if count < len(tasks) - 1:\n            row += \",\"\n        count += 1\n    return row\n\n\ndef _serialize_first_param(task):\n  
  return task.get_params()[0][1].serialize(getattr(task, task.get_params()[0][0]))\n\n\ndef _get_number_of_tasks_for(status, group_tasks):\n    if status == \"still_pending\":\n        return _get_number_of_tasks(group_tasks[\"still_pending_ext\"]) + _get_number_of_tasks(group_tasks[\"still_pending_not_ext\"])\n    return _get_number_of_tasks(group_tasks[status])\n\n\ndef _get_number_of_tasks(task_dict):\n    return sum(len(tasks) for tasks in task_dict.values())\n\n\ndef _get_comments(group_tasks):\n    \"\"\"\n    Get the human readable comments and quantities for the task types.\n    \"\"\"\n    comments = {}\n    for status, human in _COMMENTS:\n        num_tasks = _get_number_of_tasks_for(status, group_tasks)\n        if num_tasks:\n            space = \"    \" if status in _PENDING_SUB_STATUSES else \"\"\n            comments[status] = \"{space}* {num_tasks} {human}:\\n\".format(space=space, num_tasks=num_tasks, human=human)\n    return comments\n\n\n# Oredered in the sense that they'll be printed in this order\n_ORDERED_STATUSES = (\n    \"already_done\",\n    \"completed\",\n    \"ever_failed\",\n    \"failed\",\n    \"scheduling_error\",\n    \"still_pending\",\n    \"still_pending_ext\",\n    \"run_by_other_worker\",\n    \"upstream_failure\",\n    \"upstream_missing_dependency\",\n    \"upstream_run_by_other_worker\",\n    \"upstream_scheduling_error\",\n    \"not_run\",\n)\n_PENDING_SUB_STATUSES = set(_ORDERED_STATUSES[_ORDERED_STATUSES.index(\"still_pending_ext\") :])\n_COMMENTS = {\n    (\"already_done\", \"complete ones were encountered\"),\n    (\"completed\", \"ran successfully\"),\n    (\"failed\", \"failed\"),\n    (\"scheduling_error\", \"failed scheduling\"),\n    (\"still_pending\", \"were left pending, among these\"),\n    (\"still_pending_ext\", \"were missing external dependencies\"),\n    (\"run_by_other_worker\", \"were being run by another worker\"),\n    (\"upstream_failure\", \"had failed dependencies\"),\n    
(\"upstream_missing_dependency\", \"had missing dependencies\"),\n    (\"upstream_run_by_other_worker\", \"had dependencies that were being run by other worker\"),\n    (\"upstream_scheduling_error\", \"had dependencies whose scheduling failed\"),\n    (\"not_run\", \"was not granted run permission by the scheduler\"),\n}\n\n\ndef _get_run_by_other_worker(worker):\n    \"\"\"\n    This returns a set of the tasks that are being run by other worker\n    \"\"\"\n    task_sets = _get_external_workers(worker).values()\n    return functools.reduce(lambda a, b: a | b, task_sets, set())\n\n\ndef _get_external_workers(worker):\n    \"\"\"\n    This returns a dict with a set of tasks for all of the other workers\n    \"\"\"\n    worker_that_blocked_task = collections.defaultdict(set)\n    get_work_response_history = worker._get_work_response_history\n    for get_work_response in get_work_response_history:\n        if get_work_response[\"task_id\"] is None:\n            for running_task in get_work_response[\"running_tasks\"]:\n                other_worker_id = running_task[\"worker\"]\n                other_task_id = running_task[\"task_id\"]\n                other_task = worker._scheduled_tasks.get(other_task_id)\n                if other_worker_id == worker._id or not other_task:\n                    continue\n                worker_that_blocked_task[other_worker_id].add(other_task)\n    return worker_that_blocked_task\n\n\ndef _group_tasks_by_name_and_status(task_dict):\n    \"\"\"\n    Takes a dictionary with sets of tasks grouped by their status and\n    returns a dictionary with dictionaries with an array of tasks grouped by\n    their status and task name\n    \"\"\"\n    group_status = {}\n    for task in task_dict:\n        if task.task_family not in group_status:\n            group_status[task.task_family] = []\n        group_status[task.task_family].append(task)\n    return group_status\n\n\ndef _summary_dict(worker):\n    set_tasks = _partition_tasks(worker)\n    
set_tasks[\"run_by_other_worker\"] = _get_run_by_other_worker(worker)\n    _populate_unknown_statuses(set_tasks)\n    return set_tasks\n\n\ndef _summary_format(set_tasks, worker):\n    group_tasks = {}\n    for status, task_dict in set_tasks.items():\n        group_tasks[status] = _group_tasks_by_name_and_status(task_dict)\n    comments = _get_comments(group_tasks)\n    num_all_tasks = sum(\n        [\n            len(set_tasks[\"already_done\"]),\n            len(set_tasks[\"completed\"]),\n            len(set_tasks[\"failed\"]),\n            len(set_tasks[\"scheduling_error\"]),\n            len(set_tasks[\"still_pending_ext\"]),\n            len(set_tasks[\"still_pending_not_ext\"]),\n        ]\n    )\n    str_output = \"\"\n    str_output += \"Scheduled {0} tasks of which:\\n\".format(num_all_tasks)\n    for status in _ORDERED_STATUSES:\n        if status not in comments:\n            continue\n        str_output += \"{0}\".format(comments[status])\n        if status != \"still_pending\":\n            str_output += \"{0}\\n\".format(_get_str(group_tasks[status], status in _PENDING_SUB_STATUSES))\n    ext_workers = _get_external_workers(worker)\n    group_tasks_ext_workers = {}\n    for ext_worker, task_dict in ext_workers.items():\n        group_tasks_ext_workers[ext_worker] = _group_tasks_by_name_and_status(task_dict)\n    if len(ext_workers) > 0:\n        str_output += \"\\nThe other workers were:\\n\"\n        count = 0\n        for ext_worker, task_dict in ext_workers.items():\n            if count > 3 and count < len(ext_workers) - 1:\n                str_output += \"    and {0} other workers\".format(len(ext_workers) - count)\n                break\n            str_output += \"    - {0} ran {1} tasks\\n\".format(ext_worker, len(task_dict))\n            count += 1\n        str_output += \"\\n\"\n    if num_all_tasks == sum(\n        [len(set_tasks[\"already_done\"]), len(set_tasks[\"scheduling_error\"]), len(set_tasks[\"still_pending_ext\"]), 
len(set_tasks[\"still_pending_not_ext\"])]\n    ):\n        if len(ext_workers) == 0:\n            str_output += \"\\n\"\n        str_output += \"Did not run any tasks\"\n    one_line_summary = _create_one_line_summary(_tasks_status(set_tasks))\n    str_output += \"\\n{0}\".format(one_line_summary)\n    if num_all_tasks == 0:\n        str_output = \"Did not schedule any tasks\"\n    return str_output\n\n\ndef _create_one_line_summary(status_code):\n    \"\"\"\n    Given a status_code of type LuigiStatusCode which has a tuple value, returns a one line summary\n    \"\"\"\n    return \"This progress looks {0} because {1}\".format(*status_code.value)\n\n\ndef _tasks_status(set_tasks):\n    \"\"\"\n    Given a grouped set of tasks, returns a LuigiStatusCode\n    \"\"\"\n    if set_tasks[\"ever_failed\"]:\n        if not set_tasks[\"failed\"]:\n            return LuigiStatusCode.SUCCESS_WITH_RETRY\n        else:\n            if set_tasks[\"scheduling_error\"]:\n                return LuigiStatusCode.FAILED_AND_SCHEDULING_FAILED\n            return LuigiStatusCode.FAILED\n    elif set_tasks[\"scheduling_error\"]:\n        return LuigiStatusCode.SCHEDULING_FAILED\n    elif set_tasks[\"not_run\"]:\n        return LuigiStatusCode.NOT_RUN\n    elif set_tasks[\"still_pending_ext\"]:\n        return LuigiStatusCode.MISSING_EXT\n    else:\n        return LuigiStatusCode.SUCCESS\n\n\ndef _summary_wrap(str_output):\n    return textwrap.dedent(\"\"\"\n    ===== Luigi Execution Summary =====\n\n    {str_output}\n\n    ===== Luigi Execution Summary =====\n    \"\"\").format(str_output=str_output)\n\n\ndef summary(worker):\n    \"\"\"\n    Given a worker, return a human readable summary of what the worker have\n    done.\n    \"\"\"\n    return _summary_wrap(_summary_format(_summary_dict(worker), worker))\n\n\n# 5\n"
  },
  {
    "path": "luigi/format.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport io\nimport locale\nimport os\nimport re\nimport signal\nimport subprocess\nimport tempfile\nimport warnings\n\n\nclass FileWrapper:\n    \"\"\"\n    Wrap `file` in a \"real\" so stuff can be added to it after creation.\n    \"\"\"\n\n    def __init__(self, file_object):\n        self._subpipe = file_object\n\n    def __getattr__(self, name):\n        # forward calls to 'write', 'close' and other methods not defined below\n        return getattr(self._subpipe, name)\n\n    def __enter__(self, *args, **kwargs):\n        # instead of returning whatever is returned by __enter__ on the subpipe\n        # this returns self, so whatever custom injected methods are still available\n        # this might cause problems with custom file_objects, but seems to work\n        # fine with standard python `file` objects which is the only default use\n        return self\n\n    def __exit__(self, *args, **kwargs):\n        return self._subpipe.__exit__(*args, **kwargs)\n\n    def __iter__(self):\n        return iter(self._subpipe)\n\n\nclass InputPipeProcessWrapper:\n    def __init__(self, command, input_pipe=None):\n        \"\"\"\n        Initializes a InputPipeProcessWrapper instance.\n\n        :param command: a subprocess.Popen instance with stdin=input_pipe and\n                        stdout=subprocess.PIPE.\n                        
Alternatively, just its args argument as a convenience.\n        \"\"\"\n        self._command = command\n\n        self._input_pipe = input_pipe\n        self._original_input = True\n\n        if input_pipe is not None:\n            try:\n                input_pipe.fileno()\n            except (AttributeError, io.UnsupportedOperation):\n                # subprocess require a fileno to work, if not present we copy to disk first\n                self._original_input = False\n                f = tempfile.NamedTemporaryFile(\"wb\", prefix=\"luigi-process_tmp\", delete=False)\n                self._tmp_file = f.name\n                while True:\n                    chunk = input_pipe.read(io.DEFAULT_BUFFER_SIZE)\n                    if not chunk:\n                        break\n                    f.write(chunk)\n                input_pipe.close()\n                f.close()\n                self._input_pipe = FileWrapper(io.BufferedReader(io.FileIO(self._tmp_file, \"r\")))\n\n        self._process = command if isinstance(command, subprocess.Popen) else self.create_subprocess(command)\n        # we want to keep a circular reference to avoid garbage collection\n        # when the object is used in, e.g., pipe.read()\n        self._process._selfref = self\n\n    def create_subprocess(self, command):\n        \"\"\"\n        http://www.chiark.greenend.org.uk/ucgi/~cjwatson/blosxom/2009-07-02-python-sigpipe.html\n        \"\"\"\n\n        def subprocess_setup():\n            # Python installs a SIGPIPE handler by default. 
This is usually not what\n            # non-Python subprocesses expect.\n            signal.signal(signal.SIGPIPE, signal.SIG_DFL)\n\n        return subprocess.Popen(command, stdin=self._input_pipe, stdout=subprocess.PIPE, preexec_fn=subprocess_setup, close_fds=True)\n\n    def _finish(self):\n        # Need to close this before input_pipe to get all SIGPIPE messages correctly\n        self._process.stdout.close()\n        if not self._original_input and os.path.exists(self._tmp_file):\n            os.remove(self._tmp_file)\n\n        if self._input_pipe is not None:\n            self._input_pipe.close()\n\n        self._process.wait()  # deadlock?\n        if self._process.returncode not in (0, 141, 128 - 141):\n            # 141 == 128 + 13 == 128 + SIGPIPE - normally processes exit with 128 + {reiceived SIG}\n            # 128 - 141 == -13 == -SIGPIPE, sometimes python receives -13 for some subprocesses\n            raise RuntimeError(\"Error reading from pipe. Subcommand exited with non-zero exit status %s.\" % self._process.returncode)\n\n    def close(self):\n        self._finish()\n\n    def __del__(self):\n        self._finish()\n\n    def __enter__(self):\n        return self\n\n    def _abort(self):\n        \"\"\"\n        Call _finish, but eat the exception (if any).\n        \"\"\"\n        try:\n            self._finish()\n        except KeyboardInterrupt:\n            raise\n        except BaseException:\n            pass\n\n    def __exit__(self, type, value, traceback):\n        if type:\n            self._abort()\n        else:\n            self._finish()\n\n    def __getattr__(self, name):\n        if name in [\"_process\", \"_input_pipe\"]:\n            raise AttributeError(name)\n        try:\n            return getattr(self._process.stdout, name)\n        except AttributeError:\n            return getattr(self._input_pipe, name)\n\n    def __iter__(self):\n        for line in self._process.stdout:\n            yield line\n        
self._finish()\n\n    def readable(self):\n        return True\n\n    def writable(self):\n        return False\n\n    def seekable(self):\n        return False\n\n\nclass OutputPipeProcessWrapper:\n    WRITES_BEFORE_FLUSH = 10000\n\n    def __init__(self, command, output_pipe=None):\n        self.closed = False\n        self._command = command\n        self._output_pipe = output_pipe\n        self._process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=output_pipe, close_fds=True)\n        self._flushcount = 0\n\n    def write(self, *args, **kwargs):\n        self._process.stdin.write(*args, **kwargs)\n        self._flushcount += 1\n        if self._flushcount == self.WRITES_BEFORE_FLUSH:\n            self._process.stdin.flush()\n            self._flushcount = 0\n\n    def writeLine(self, line):\n        assert \"\\n\" not in line\n        self.write(line + \"\\n\")\n\n    def _finish(self):\n        \"\"\"\n        Closes and waits for subprocess to exit.\n        \"\"\"\n        if self._process.returncode is None:\n            self._process.stdin.flush()\n            self._process.stdin.close()\n            self._process.wait()\n            self.closed = True\n\n    def __del__(self):\n        if not self.closed:\n            self.abort()\n\n    def __exit__(self, type, value, traceback):\n        if type is None:\n            self.close()\n        else:\n            self.abort()\n\n    def __enter__(self):\n        return self\n\n    def close(self):\n        self._finish()\n        if self._process.returncode == 0:\n            if self._output_pipe is not None:\n                self._output_pipe.close()\n        else:\n            raise RuntimeError(\"Error when executing command %s\" % self._command)\n\n    def abort(self):\n        self._finish()\n\n    def __getattr__(self, name):\n        if name in [\"_process\", \"_output_pipe\"]:\n            raise AttributeError(name)\n        try:\n            return getattr(self._process.stdin, name)\n    
    except AttributeError:\n            return getattr(self._output_pipe, name)\n\n    def readable(self):\n        return False\n\n    def writable(self):\n        return True\n\n    def seekable(self):\n        return False\n\n\nclass BaseWrapper:\n    def __init__(self, stream, *args, **kwargs):\n        self._stream = stream\n        try:\n            super(BaseWrapper, self).__init__(stream, *args, **kwargs)\n        except TypeError:\n            pass\n\n    def __getattr__(self, name):\n        if name == \"_stream\":\n            raise AttributeError(name)\n        return getattr(self._stream, name)\n\n    def __enter__(self):\n        self._stream.__enter__()\n        return self\n\n    def __exit__(self, *args):\n        self._stream.__exit__(*args)\n\n    def __iter__(self):\n        try:\n            for line in self._stream:\n                yield line\n        finally:\n            self.close()\n\n\nclass NewlineWrapper(BaseWrapper):\n    def __init__(self, stream, newline=None):\n        if newline is None:\n            self.newline = newline\n        else:\n            self.newline = newline.encode(\"ascii\")\n\n        if self.newline not in (b\"\", b\"\\r\\n\", b\"\\n\", b\"\\r\", None):\n            raise ValueError(\"newline need to be one of {b'', b'\\r\\n', b'\\n', b'\\r', None}\")\n        super(NewlineWrapper, self).__init__(stream)\n\n    def read(self, n=-1):\n        b = self._stream.read(n)\n\n        if self.newline == b\"\":\n            return b\n\n        if self.newline is None:\n            newline = b\"\\n\"\n\n        return re.sub(b\"(\\n|\\r\\n|\\r)\", newline, b)\n\n    def writelines(self, lines):\n        if self.newline is None or self.newline == \"\":\n            newline = os.linesep.encode(\"ascii\")\n        else:\n            newline = self.newline\n\n        self._stream.writelines((re.sub(b\"(\\n|\\r\\n|\\r)\", newline, line) for line in lines))\n\n    def write(self, b):\n        if self.newline is None or 
self.newline == \"\":\n            newline = os.linesep.encode(\"ascii\")\n        else:\n            newline = self.newline\n\n        self._stream.write(re.sub(b\"(\\n|\\r\\n|\\r)\", newline, b))\n\n\nclass MixedUnicodeBytesWrapper(BaseWrapper):\n    \"\"\" \"\"\"\n\n    def __init__(self, stream, encoding=None):\n        if encoding is None:\n            encoding = locale.getpreferredencoding()\n        self.encoding = encoding\n        super(MixedUnicodeBytesWrapper, self).__init__(stream)\n\n    def write(self, b):\n        self._stream.write(self._convert(b))\n\n    def writelines(self, lines):\n        self._stream.writelines((self._convert(line) for line in lines))\n\n    def _convert(self, b):\n        if isinstance(b, str):\n            b = b.encode(self.encoding)\n            warnings.warn(\"Writing unicode to byte stream\", stacklevel=2)\n        return b\n\n\nclass Format:\n    \"\"\"\n    Interface for format specifications.\n    \"\"\"\n\n    @classmethod\n    def pipe_reader(cls, input_pipe):\n        raise NotImplementedError()\n\n    @classmethod\n    def pipe_writer(cls, output_pipe):\n        raise NotImplementedError()\n\n    def __rshift__(a, b):\n        return ChainFormat(a, b)\n\n\nclass ChainFormat(Format):\n    def __init__(self, *args, **kwargs):\n        self.args = args\n        try:\n            self.input = args[0].input\n        except AttributeError:\n            pass\n        try:\n            self.output = args[-1].output\n        except AttributeError:\n            pass\n        if not kwargs.get(\"check_consistency\", True):\n            return\n        for x in range(len(args) - 1):\n            try:\n                if args[x].output != args[x + 1].input:\n                    raise TypeError(\n                        \"The format chaining is not valid, %s expect %s\"\n                        \" but %s provide %s\"\n                        % (\n                            args[x + 1].__class__.__name__,\n                       
     args[x + 1].input,\n                            args[x].__class__.__name__,\n                            args[x].output,\n                        )\n                    )\n            except AttributeError:\n                pass\n\n    def pipe_reader(self, input_pipe):\n        for x in reversed(self.args):\n            input_pipe = x.pipe_reader(input_pipe)\n        return input_pipe\n\n    def pipe_writer(self, output_pipe):\n        for x in reversed(self.args):\n            output_pipe = x.pipe_writer(output_pipe)\n        return output_pipe\n\n\nclass TextWrapper(io.TextIOWrapper):\n    def __exit__(self, *args):\n        # io.TextIOWrapper close the file on __exit__, let the underlying file decide\n        if not self.closed and self.writable():\n            super(TextWrapper, self).flush()\n\n        self._stream.__exit__(*args)\n\n    def __del__(self, *args):\n        # io.TextIOWrapper close the file on __del__, let the underlying file decide\n        if not self.closed and self.writable():\n            super(TextWrapper, self).flush()\n\n        try:\n            self._stream.__del__(*args)\n        except AttributeError:\n            pass\n\n    def __init__(self, stream, *args, **kwargs):\n        self._stream = stream\n        try:\n            super(TextWrapper, self).__init__(stream, *args, **kwargs)\n        except TypeError:\n            pass\n\n    def __getattr__(self, name):\n        if name == \"_stream\":\n            raise AttributeError(name)\n        return getattr(self._stream, name)\n\n    def __enter__(self):\n        self._stream.__enter__()\n        return self\n\n\nclass NopFormat(Format):\n    def pipe_reader(self, input_pipe):\n        return input_pipe\n\n    def pipe_writer(self, output_pipe):\n        return output_pipe\n\n\nclass WrappedFormat(Format):\n    def __init__(self, *args, **kwargs):\n        self.args = args\n        self.kwargs = kwargs\n\n    def pipe_reader(self, input_pipe):\n        return 
self.wrapper_cls(input_pipe, *self.args, **self.kwargs)\n\n    def pipe_writer(self, output_pipe):\n        return self.wrapper_cls(output_pipe, *self.args, **self.kwargs)\n\n\nclass TextFormat(WrappedFormat):\n    input = \"unicode\"\n    output = \"bytes\"\n    wrapper_cls = TextWrapper\n\n\nclass MixedUnicodeBytesFormat(WrappedFormat):\n    output = \"bytes\"\n    wrapper_cls = MixedUnicodeBytesWrapper\n\n\nclass NewlineFormat(WrappedFormat):\n    input = \"bytes\"\n    output = \"bytes\"\n    wrapper_cls = NewlineWrapper\n\n\nclass GzipFormat(Format):\n    input = \"bytes\"\n    output = \"bytes\"\n\n    def __init__(self, compression_level=None):\n        self.compression_level = compression_level\n\n    def pipe_reader(self, input_pipe):\n        return InputPipeProcessWrapper([\"gunzip\"], input_pipe)\n\n    def pipe_writer(self, output_pipe):\n        args = [\"gzip\"]\n        if self.compression_level is not None:\n            args.append(\"-\" + str(int(self.compression_level)))\n        return OutputPipeProcessWrapper(args, output_pipe)\n\n\nclass Bzip2Format(Format):\n    input = \"bytes\"\n    output = \"bytes\"\n\n    def pipe_reader(self, input_pipe):\n        return InputPipeProcessWrapper([\"bzcat\"], input_pipe)\n\n    def pipe_writer(self, output_pipe):\n        return OutputPipeProcessWrapper([\"bzip2\"], output_pipe)\n\n\nText = TextFormat()\nUTF8 = TextFormat(encoding=\"utf8\")\nNop = NopFormat()\nSysNewLine = NewlineFormat()\nGzip = GzipFormat()\nBzip2 = Bzip2Format()\nMixedUnicodeBytes = MixedUnicodeBytesFormat()\n\n\ndef get_default_format():\n    return Text\n"
  },
  {
    "path": "luigi/freezing.py",
    "content": "\"\"\"Internal-only module with immutable data structures.\n\nPlease, do not use it outside of Luigi codebase itself.\n\"\"\"\n\nfrom collections import OrderedDict\n\ntry:\n    from collections.abc import Mapping\nexcept ImportError:\n    from collections import Mapping  # type: ignore\nimport functools\nimport operator\n\n\nclass FrozenOrderedDict(Mapping):\n    \"\"\"\n    It is an immutable wrapper around ordered dictionaries that implements the complete :py:class:`collections.Mapping`\n    interface. It can be used as a drop-in replacement for dictionaries where immutability and ordering are desired.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        self.__dict = OrderedDict(*args, **kwargs)\n        self.__hash = None\n\n    def __getitem__(self, key):\n        return self.__dict[key]\n\n    def __iter__(self):\n        return iter(self.__dict)\n\n    def __len__(self):\n        return len(self.__dict)\n\n    def __repr__(self):\n        # We should use short representation for beautiful console output\n        return repr(dict(self.__dict))\n\n    def __hash__(self):\n        if self.__hash is None:\n            hashes = map(hash, self.items())\n            self.__hash = functools.reduce(operator.xor, hashes, 0)\n\n        return self.__hash\n\n    def get_wrapped(self):\n        return self.__dict\n\n\ndef recursively_freeze(value):\n    \"\"\"\n    Recursively walks ``Mapping``s and ``list``s and converts them to ``FrozenOrderedDict`` and ``tuples``, respectively.\n    \"\"\"\n    if isinstance(value, Mapping):\n        return FrozenOrderedDict(((k, recursively_freeze(v)) for k, v in value.items()))\n    elif isinstance(value, list) or isinstance(value, tuple):\n        return tuple(recursively_freeze(v) for v in value)\n    return value\n\n\ndef recursively_unfreeze(value):\n    \"\"\"\n    Recursively walks ``FrozenOrderedDict``s and ``tuple``s and converts them to ``dict`` and ``list``, respectively.\n    \"\"\"\n    if 
isinstance(value, Mapping):\n        return dict(((k, recursively_unfreeze(v)) for k, v in value.items()))\n    elif isinstance(value, list) or isinstance(value, tuple):\n        return list(recursively_unfreeze(v) for v in value)\n    return value\n"
  },
  {
    "path": "luigi/interface.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nThis module contains the bindings for command line integration and dynamic loading of tasks\n\nIf you don't want to run luigi from the command line. You may use the methods\ndefined in this module to programmatically run luigi.\n\"\"\"\n\nimport logging\nimport os\nimport signal\nimport sys\nimport tempfile\nimport warnings\n\nfrom luigi import lock, parameter, rpc, scheduler, task, worker\nfrom luigi.cmdline_parser import CmdlineParser\nfrom luigi.execution_summary import LuigiRunResult\nfrom luigi.setup_logging import InterfaceLogging\n\n\nclass core(task.Config):\n    \"\"\"Keeps track of a bunch of environment params.\n\n    Uses the internal luigi parameter mechanism.\n    The nice thing is that we can instantiate this class\n    and get an object with all the environment variables set.\n    This is arguably a bit of a hack.\n    \"\"\"\n\n    use_cmdline_section = False\n    ignore_unconsumed = {\n        \"autoload_range\",\n        \"no_configure_logging\",\n    }\n\n    local_scheduler = parameter.BoolParameter(default=False, description=\"Use an in-memory central scheduler. 
Useful for testing.\", always_in_help=True)\n    scheduler_host = parameter.Parameter(\n        default=\"localhost\", description=\"Hostname of machine running remote scheduler\", config_path=dict(section=\"core\", name=\"default-scheduler-host\")\n    )\n    scheduler_port = parameter.IntParameter(\n        default=8082, description=\"Port of remote scheduler api process\", config_path=dict(section=\"core\", name=\"default-scheduler-port\")\n    )\n    scheduler_url = parameter.Parameter(\n        default=\"\",\n        description=\"Full path to remote scheduler\",\n        config_path=dict(section=\"core\", name=\"default-scheduler-url\"),\n    )\n    lock_size = parameter.IntParameter(default=1, description=\"Maximum number of workers running the same command\")\n    no_lock = parameter.BoolParameter(default=False, description=\"Ignore if similar process is already running\")\n    lock_pid_dir = parameter.Parameter(default=os.path.join(tempfile.gettempdir(), \"luigi\"), description=\"Directory to store the pid file\")\n    take_lock = parameter.BoolParameter(default=False, description=\"Signal other processes to stop getting work if already running\")\n    workers = parameter.IntParameter(default=1, description=\"Maximum number of parallel tasks to run\")\n    logging_conf_file = parameter.Parameter(default=\"\", description=\"Configuration file for logging\")\n    log_level = parameter.ChoiceParameter(\n        default=\"DEBUG\",\n        choices=[\"NOTSET\", \"DEBUG\", \"INFO\", \"WARNING\", \"ERROR\", \"CRITICAL\"],\n        description=\"Default log level to use when logging_conf_file is not set\",\n    )\n    module = parameter.Parameter(default=\"\", description=\"Used for dynamic loading of modules\", always_in_help=True)\n    parallel_scheduling = parameter.BoolParameter(default=False, description=\"Use multiprocessing to do scheduling in parallel.\")\n    parallel_scheduling_processes = parameter.IntParameter(\n        default=0, description=\"The 
number of processes to use for scheduling in parallel. By default the number of available CPUs will be used\"\n    )\n    assistant = parameter.BoolParameter(default=False, description=\"Run any task from the scheduler.\")\n    help = parameter.BoolParameter(default=False, description=\"Show most common flags and all task-specific flags\", always_in_help=True)\n    help_all = parameter.BoolParameter(default=False, description=\"Show all command line flags\", always_in_help=True)\n\n\nclass _WorkerSchedulerFactory:\n    def create_local_scheduler(self):\n        return scheduler.Scheduler(prune_on_get_work=True, record_task_history=False)\n\n    def create_remote_scheduler(self, url):\n        return rpc.RemoteScheduler(url)\n\n    def create_worker(self, scheduler, worker_processes, assistant=False):\n        return worker.Worker(scheduler=scheduler, worker_processes=worker_processes, assistant=assistant)\n\n\ndef _schedule_and_run(tasks, worker_scheduler_factory=None, override_defaults=None):\n    \"\"\"\n    :param tasks:\n    :param worker_scheduler_factory:\n    :param override_defaults:\n    :return: True if all tasks and their dependencies were successfully run (or already completed);\n             False if any error occurred. 
It will return a detailed response of type LuigiRunResult\n             instead of a boolean if detailed_summary=True.\n    \"\"\"\n\n    if worker_scheduler_factory is None:\n        worker_scheduler_factory = _WorkerSchedulerFactory()\n    if override_defaults is None:\n        override_defaults = {}\n    env_params = core(**override_defaults)\n\n    InterfaceLogging.setup(env_params)\n\n    kill_signal = signal.SIGUSR1 if env_params.take_lock else None\n    if not env_params.no_lock and not (lock.acquire_for(env_params.lock_pid_dir, env_params.lock_size, kill_signal)):\n        raise PidLockAlreadyTakenExit()\n\n    if env_params.local_scheduler:\n        sch = worker_scheduler_factory.create_local_scheduler()\n    else:\n        if env_params.scheduler_url != \"\":\n            url = env_params.scheduler_url\n        else:\n            url = \"http://{host}:{port:d}/\".format(\n                host=env_params.scheduler_host,\n                port=env_params.scheduler_port,\n            )\n        sch = worker_scheduler_factory.create_remote_scheduler(url=url)\n\n    worker = worker_scheduler_factory.create_worker(scheduler=sch, worker_processes=env_params.workers, assistant=env_params.assistant)\n\n    success = True\n    logger = logging.getLogger(\"luigi-interface\")\n    with worker:\n        for t in tasks:\n            success &= worker.add(t, env_params.parallel_scheduling, env_params.parallel_scheduling_processes)\n        logger.info(\"Done scheduling tasks\")\n        success &= worker.run()\n    luigi_run_result = LuigiRunResult(worker, success)\n    logger.info(luigi_run_result.summary_text)\n    if hasattr(sch, \"close\"):\n        sch.close()\n    return luigi_run_result\n\n\nclass PidLockAlreadyTakenExit(SystemExit):\n    \"\"\"\n    The exception thrown by :py:func:`luigi.run`, when the lock file is inaccessible\n    \"\"\"\n\n    pass\n\n\ndef run(*args, **kwargs):\n    \"\"\"\n    Please dont use. 
Instead use `luigi` binary.\n\n    Run from cmdline using argparse.\n\n    :param use_dynamic_argparse: Deprecated and ignored\n    \"\"\"\n    luigi_run_result = _run(*args, **kwargs)\n    return luigi_run_result if kwargs.get(\"detailed_summary\") else luigi_run_result.scheduling_succeeded\n\n\ndef _run(cmdline_args=None, main_task_cls=None, worker_scheduler_factory=None, use_dynamic_argparse=None, local_scheduler=False, detailed_summary=False):\n    if use_dynamic_argparse is not None:\n        warnings.warn(\"use_dynamic_argparse is deprecated, don't set it.\", DeprecationWarning, stacklevel=2)\n    if cmdline_args is None:\n        cmdline_args = sys.argv[1:]\n\n    if main_task_cls:\n        cmdline_args.insert(0, main_task_cls.task_family)\n    if local_scheduler:\n        cmdline_args.append(\"--local-scheduler\")\n    with CmdlineParser.global_instance(cmdline_args) as cp:\n        return _schedule_and_run([cp.get_task_obj()], worker_scheduler_factory)\n\n\ndef build(tasks, worker_scheduler_factory=None, detailed_summary=False, **env_params):\n    \"\"\"\n    Run internally, bypassing the cmdline parsing.\n\n    Useful if you have some luigi code that you want to run internally.\n    Example:\n\n    .. code-block:: python\n\n        luigi.build([MyTask1(), MyTask2()], local_scheduler=True)\n\n    One notable difference is that `build` defaults to not using\n    the identical process lock. Otherwise, `build` would only be\n    callable once from each process.\n\n    :param tasks:\n    :param worker_scheduler_factory:\n    :param env_params:\n    :return: True if there were no scheduling errors, even if tasks may fail.\n    \"\"\"\n    if \"no_lock\" not in env_params:\n        env_params[\"no_lock\"] = True\n\n    luigi_run_result = _schedule_and_run(tasks, worker_scheduler_factory, override_defaults=env_params)\n    return luigi_run_result if detailed_summary else luigi_run_result.scheduling_succeeded\n"
  },
  {
    "path": "luigi/local_target.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\n:class:`LocalTarget` provides a concrete implementation of a :py:class:`~luigi.target.Target` class that uses files on the local file system\n\"\"\"\n\nimport errno\nimport io\nimport os\nimport random\nimport shutil\nimport tempfile\nimport warnings\n\nfrom luigi.format import FileWrapper, get_default_format\nfrom luigi.target import AtomicLocalFile, FileAlreadyExists, FileSystem, FileSystemTarget, MissingParentDirectory, NotADirectory\n\n\nclass atomic_file(AtomicLocalFile):\n    \"\"\"Simple class that writes to a temp file and moves it on close()\n    Also cleans up the temp file if close is not invoked\n    \"\"\"\n\n    def move_to_final_destination(self):\n        os.replace(self.tmp_path, self.path)\n\n    def generate_tmp_path(self, path):\n        return path + \"-luigi-tmp-%09d\" % random.randrange(0, 10_000_000_000)\n\n\nclass LocalFileSystem(FileSystem):\n    \"\"\"\n    Wrapper for access to file system operations.\n\n    Work in progress - add things as needed.\n    \"\"\"\n\n    def copy(self, old_path, new_path, raise_if_exists=False):\n        if raise_if_exists and os.path.exists(new_path):\n            raise RuntimeError(\"Destination exists: %s\" % new_path)\n        d = os.path.dirname(new_path)\n        if d and not os.path.exists(d):\n            self.mkdir(d)\n        shutil.copy(old_path, new_path)\n\n  
  def exists(self, path):\n        return os.path.exists(path)\n\n    def mkdir(self, path, parents=True, raise_if_exists=False):\n        if self.exists(path):\n            if raise_if_exists:\n                raise FileAlreadyExists()\n            elif not self.isdir(path):\n                raise NotADirectory()\n            else:\n                return\n\n        if parents:\n            try:\n                os.makedirs(path)\n            except OSError as err:\n                # somebody already created the path\n                if err.errno != errno.EEXIST:\n                    raise\n        else:\n            if not os.path.exists(os.path.dirname(path)):\n                raise MissingParentDirectory()\n            os.mkdir(path)\n\n    def isdir(self, path):\n        return os.path.isdir(path)\n\n    def listdir(self, path):\n        for dir_, _, files in os.walk(path):\n            assert dir_.startswith(path)\n            for name in files:\n                yield os.path.join(dir_, name)\n\n    def remove(self, path, recursive=True):\n        if recursive and self.isdir(path):\n            shutil.rmtree(path)\n        else:\n            os.remove(path)\n\n    def move(self, old_path, new_path, raise_if_exists=False):\n        \"\"\"\n        Move file atomically. 
If source and destination are located\n        on different filesystems, atomicity is approximated\n        but cannot be guaranteed.\n        \"\"\"\n        if raise_if_exists and os.path.exists(new_path):\n            raise FileAlreadyExists(\"Destination exists: %s\" % new_path)\n        d = os.path.dirname(new_path)\n        if d and not os.path.exists(d):\n            self.mkdir(d)\n        try:\n            os.replace(old_path, new_path)\n        except OSError as err:\n            if err.errno == errno.EXDEV:\n                new_path_tmp = \"%s-%09d\" % (new_path, random.randint(0, 999999999))\n                shutil.copy(old_path, new_path_tmp)\n                os.replace(new_path_tmp, new_path)\n                os.remove(old_path)\n            else:\n                raise err\n\n    def rename_dont_move(self, path, dest):\n        \"\"\"\n        Rename ``path`` to ``dest``, but don't move it into the ``dest``\n        folder (if it is a folder). This method is just a wrapper around the\n        ``move`` method of LocalTarget.\n        \"\"\"\n        self.move(path, dest, raise_if_exists=True)\n\n\nclass LocalTarget(FileSystemTarget):\n    fs = LocalFileSystem()\n\n    def __init__(self, path=None, format=None, is_tmp=False):\n        if format is None:\n            format = get_default_format()\n\n        if not path:\n            if not is_tmp:\n                raise Exception(\"path or is_tmp must be set\")\n            path = os.path.join(tempfile.gettempdir(), \"luigi-tmp-%09d\" % random.randint(0, 999999999))\n        super(LocalTarget, self).__init__(path)\n        self.format = format\n        self.is_tmp = is_tmp\n\n    def makedirs(self):\n        \"\"\"\n        Create all parent folders if they do not exist.\n        \"\"\"\n        normpath = os.path.normpath(self.path)\n        parentfolder = os.path.dirname(normpath)\n        if parentfolder:\n            try:\n                os.makedirs(parentfolder)\n            except OSError:\n       
         pass\n\n    def open(self, mode=\"r\"):\n        rwmode = mode.replace(\"b\", \"\").replace(\"t\", \"\")\n        if rwmode == \"w\":\n            self.makedirs()\n            return self.format.pipe_writer(atomic_file(self.path))\n\n        elif rwmode == \"r\":\n            fileobj = FileWrapper(io.BufferedReader(io.FileIO(self.path, mode)))\n            return self.format.pipe_reader(fileobj)\n\n        else:\n            raise Exception(\"mode must be 'r' or 'w' (got: %s)\" % mode)\n\n    def move(self, new_path, raise_if_exists=False):\n        self.fs.move(self.path, new_path, raise_if_exists=raise_if_exists)\n\n    def move_dir(self, new_path):\n        self.move(new_path)\n\n    def remove(self):\n        self.fs.remove(self.path)\n\n    def copy(self, new_path, raise_if_exists=False):\n        self.fs.copy(self.path, new_path, raise_if_exists)\n\n    @property\n    def fn(self):\n        warnings.warn(\"Use LocalTarget.path to reference filename\", DeprecationWarning, stacklevel=2)\n        return self.path\n\n    def __del__(self):\n        if hasattr(self, \"is_tmp\") and self.is_tmp and self.exists():\n            self.remove()\n"
  },
  {
    "path": "luigi/lock.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nLocking functionality when launching things from the command line.\nUses a pidfile.\nThis prevents multiple identical workflows to be launched simultaneously.\n\"\"\"\n\nimport errno\nimport hashlib\nimport os\nimport sys\nfrom subprocess import PIPE, Popen\n\n\ndef getpcmd(pid):\n    \"\"\"\n    Returns command of process.\n\n    :param pid:\n    \"\"\"\n    if os.name == \"nt\":\n        # Use wmic command instead of ps on Windows.\n        cmd = \"wmic path win32_process where ProcessID=%s get Commandline 2> nul\" % (pid,)\n        with os.popen(cmd, \"r\") as p:\n            lines = [line for line in p.readlines() if line.strip(\"\\r\\n \") != \"\"]\n            if lines:\n                _, val = lines\n                return val\n    elif sys.platform == \"darwin\":\n        # Use pgrep instead of /proc on macOS.\n        pidfile = \".%d.pid\" % (pid,)\n        with open(pidfile, \"w\") as f:\n            f.write(str(pid))\n        try:\n            p = Popen([\"pgrep\", \"-lf\", \"-F\", pidfile], stdout=PIPE)\n            stdout, _ = p.communicate()\n            line = stdout.decode(\"utf8\").strip()\n            if line:\n                _, scmd = line.split(\" \", 1)\n                return scmd\n        finally:\n            os.unlink(pidfile)\n    else:\n        # Use the /proc filesystem\n        # At least on android 
there have been some issues with not all\n        # process infos being readable. In these cases using the `ps` command\n        # worked. See the pull request at\n        # https://github.com/spotify/luigi/pull/1876\n        try:\n            with open(\"/proc/{0}/cmdline\".format(pid), \"r\") as fh:\n                return fh.read().replace(\"\\0\", \" \").rstrip()\n        except IOError:\n            # the system may not allow reading the command line\n            # of a process owned by another user\n            pass\n\n    # Fallback instead of None, for e.g. Cygwin where -o is an \"unknown option\" for the ps command:\n    return \"[PROCESS_WITH_PID={}]\".format(pid)\n\n\ndef get_info(pid_dir, my_pid=None):\n    # Check the name and pid of this process\n    if my_pid is None:\n        my_pid = os.getpid()\n\n    my_cmd = getpcmd(my_pid)\n    cmd_hash = my_cmd.encode(\"utf8\")\n    pid_file = os.path.join(pid_dir, hashlib.new(\"md5\", cmd_hash, usedforsecurity=False).hexdigest()) + \".pid\"\n\n    return my_pid, my_cmd, pid_file\n\n\ndef acquire_for(pid_dir, num_available=1, kill_signal=None):\n    \"\"\"\n    Makes sure the process is only run once at the same time with the same name.\n\n    Notice that we since we check the process name, different parameters to the same\n    command can spawn multiple processes at the same time, i.e. 
running\n    \"/usr/bin/my_process\" does not prevent anyone from launching\n    \"/usr/bin/my_process --foo bar\".\n    \"\"\"\n\n    my_pid, my_cmd, pid_file = get_info(pid_dir)\n\n    # Create a pid file if it does not exist\n    try:\n        os.mkdir(pid_dir)\n        os.chmod(pid_dir, 0o700)\n    except OSError as exc:\n        if exc.errno != errno.EEXIST:\n            raise\n        pass\n\n    # Let variable \"pids\" be all pids who exist in the .pid-file who are still\n    # about running the same command.\n    pids = {pid for pid in _read_pids_file(pid_file) if getpcmd(pid) == my_cmd}\n\n    if kill_signal is not None:\n        for pid in pids:\n            os.kill(pid, kill_signal)\n        print(\"Sent kill signal to Pids: {}\".format(pids))\n        # We allow for the killer to progress, yet we don't want these to stack\n        # up! So we only allow it once.\n        num_available += 1\n\n    if len(pids) >= num_available:\n        # We are already running under a different pid\n        print(\"Pid(s) {} already running\".format(pids))\n        if kill_signal is not None:\n            print('Note: There have (probably) been 1 other \"--take-lock\" process which continued to run! 
Probably no need to run this one as well.')\n        return False\n\n    _write_pids_file(pid_file, pids | {my_pid})\n\n    return True\n\n\ndef _read_pids_file(pid_file):\n    # First setup a python 2 vs 3 compatibility\n    # http://stackoverflow.com/a/21368622/621449\n    try:\n        FileNotFoundError  # noqa: F823\n    except NameError:\n        # Should only happen on python 2\n        FileNotFoundError = IOError\n    # If the file happen to not exist, simply return\n    # an empty set()\n    try:\n        with open(pid_file, \"r\") as f:\n            return {int(pid_str.strip()) for pid_str in f if pid_str.strip()}\n    except FileNotFoundError:\n        return set()\n\n\ndef _write_pids_file(pid_file, pids_set):\n    with open(pid_file, \"w\") as f:\n        f.writelines(\"{}\\n\".format(pid) for pid in pids_set)\n\n    # Make the .pid-file writable by all (when the os allows for it)\n    if os.name != \"nt\":\n        s = os.stat(pid_file)\n        if os.getuid() == s.st_uid:\n            os.chmod(pid_file, s.st_mode | 0o777)\n"
  },
  {
    "path": "luigi/metrics.py",
    "content": "import abc\nimport importlib\nfrom enum import Enum\n\n\nclass MetricsCollectors(Enum):\n    custom = -1\n    default = 1\n    none = 1\n    datadog = 2\n    prometheus = 3\n\n    @classmethod\n    def get(cls, which, custom_import=None):\n        if which == MetricsCollectors.none:\n            return NoMetricsCollector()\n        elif which == MetricsCollectors.datadog:\n            from luigi.contrib.datadog_metric import DatadogMetricsCollector\n\n            return DatadogMetricsCollector()\n        elif which == MetricsCollectors.prometheus:\n            from luigi.contrib.prometheus_metric import PrometheusMetricsCollector\n\n            return PrometheusMetricsCollector()\n        elif which == MetricsCollectors.custom:\n            if custom_import is None:\n                raise ValueError(f\"MetricsCollectors value ' {which} ' is -1 and custom_import is None\")\n\n            split_import_string = custom_import.split(\".\")\n\n            import_path = \".\".join(split_import_string[:-1])\n            import_class_string = split_import_string[-1]\n\n            mod = importlib.import_module(import_path)\n            metrics_class = getattr(mod, import_class_string)\n\n            if issubclass(metrics_class, MetricsCollector):\n                return metrics_class()\n            else:\n                raise ValueError(f\"Custom Import: {custom_import} is not a subclass of MetricsCollector\")\n        else:\n            raise ValueError(\"MetricsCollectors value ' {0} ' isn't supported\", which)\n\n\nclass MetricsCollector(metaclass=abc.ABCMeta):\n    \"\"\"Abstractable MetricsCollector base class that can be replace by tool\n    specific implementation.\n    \"\"\"\n\n    @abc.abstractmethod\n    def __init__(self):\n        pass\n\n    @abc.abstractmethod\n    def handle_task_started(self, task):\n        pass\n\n    @abc.abstractmethod\n    def handle_task_failed(self, task):\n        pass\n\n    @abc.abstractmethod\n    def 
handle_task_disabled(self, task, config):\n        pass\n\n    @abc.abstractmethod\n    def handle_task_done(self, task):\n        pass\n\n    def handle_task_statistics(self, task, statistics):\n        pass\n\n    def generate_latest(self):\n        return\n\n    def configure_http_handler(self, http_handler):\n        pass\n\n\nclass NoMetricsCollector(MetricsCollector):\n    \"\"\"Empty MetricsCollector when no collector is being used\"\"\"\n\n    def __init__(self):\n        pass\n\n    def handle_task_started(self, task):\n        pass\n\n    def handle_task_failed(self, task):\n        pass\n\n    def handle_task_disabled(self, task, config):\n        pass\n\n    def handle_task_done(self, task):\n        pass\n"
  },
  {
    "path": "luigi/mock.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nThis module provides a class :class:`MockTarget`, an implementation of :py:class:`~luigi.target.Target`.\n:class:`MockTarget` contains all data in-memory.\nThe main purpose is unit testing workflows without writing to disk.\n\"\"\"\n\nimport multiprocessing\nimport sys\nfrom io import BytesIO\n\nfrom luigi import target\nfrom luigi.format import get_default_format\n\n\nclass MockFileSystem(target.FileSystem):\n    \"\"\"\n    MockFileSystem inspects/modifies _data to simulate file system operations.\n    \"\"\"\n\n    _data = None\n\n    def copy(self, path, dest, raise_if_exists=False):\n        \"\"\"\n        Copies the contents of a single file path to dest\n        \"\"\"\n        if raise_if_exists and dest in self.get_all_data():\n            raise RuntimeError(\"Destination exists: %s\" % path)\n        contents = self.get_all_data()[path]\n        self.get_all_data()[dest] = contents\n\n    def get_all_data(self):\n        # This starts a server in the background, so we don't want to do it in the global scope\n        if MockFileSystem._data is None:\n            MockFileSystem._data = multiprocessing.Manager().dict()\n        return MockFileSystem._data\n\n    def get_data(self, fn):\n        return self.get_all_data()[fn]\n\n    def exists(self, path):\n        return MockTarget(path).exists()\n\n    def remove(self, 
path, recursive=True, skip_trash=True):\n        \"\"\"\n        Removes the given mockfile. skip_trash doesn't have any meaning.\n        \"\"\"\n        if recursive:\n            to_delete = []\n            for s in self.get_all_data().keys():\n                if s.startswith(path):\n                    to_delete.append(s)\n            for s in to_delete:\n                self.get_all_data().pop(s)\n        else:\n            self.get_all_data().pop(path)\n\n    def move(self, path, dest, raise_if_exists=False):\n        \"\"\"\n        Moves a single file from path to dest\n        \"\"\"\n        if raise_if_exists and dest in self.get_all_data():\n            raise RuntimeError(\"Destination exists: %s\" % path)\n        contents = self.get_all_data().pop(path)\n        self.get_all_data()[dest] = contents\n\n    def listdir(self, path):\n        \"\"\"\n        listdir does a prefix match of self.get_all_data(), but doesn't yet support globs.\n        \"\"\"\n        return [s for s in self.get_all_data().keys() if s.startswith(path)]\n\n    def isdir(self, path):\n        return any(self.listdir(path))\n\n    def mkdir(self, path, parents=True, raise_if_exists=False):\n        \"\"\"\n        mkdir is a noop.\n        \"\"\"\n        pass\n\n    def clear(self):\n        self.get_all_data().clear()\n\n\nclass MockTarget(target.FileSystemTarget):\n    fs = MockFileSystem()\n\n    def __init__(self, fn, is_tmp=None, mirror_on_stderr=False, format=None):\n        self._mirror_on_stderr = mirror_on_stderr\n        self.path = fn\n        self.format = format or get_default_format()\n\n    def exists(\n        self,\n    ):\n        return self.path in self.fs.get_all_data()\n\n    def move(self, path, raise_if_exists=False):\n        \"\"\"\n        Call MockFileSystem's move command\n        \"\"\"\n        self.fs.move(self.path, path, raise_if_exists)\n\n    def rename(self, *args, **kwargs):\n        \"\"\"\n        Call move to rename self\n        
\"\"\"\n        self.move(*args, **kwargs)\n\n    def open(self, mode=\"r\"):\n        fn = self.path\n        mock_target = self\n\n        class Buffer(BytesIO):\n            # Just to be able to do writing + reading from the same buffer\n\n            _write_line = True\n\n            def set_wrapper(self, wrapper):\n                self.wrapper = wrapper\n\n            def write(self, data):\n                if mock_target._mirror_on_stderr:\n                    if self._write_line:\n                        sys.stderr.write(fn + \": \")\n                    if isinstance(data, bytes):\n                        sys.stderr.write(data.decode(\"utf8\"))\n                    else:\n                        sys.stderr.write(data)\n                    if (data[-1]) == \"\\n\":\n                        self._write_line = True\n                    else:\n                        self._write_line = False\n                super(Buffer, self).write(data)\n\n            def close(self):\n                if mode[0] == \"w\":\n                    try:\n                        mock_target.wrapper.flush()\n                    except AttributeError:\n                        pass\n                    mock_target.fs.get_all_data()[fn] = self.getvalue()\n                super(Buffer, self).close()\n\n            def __exit__(self, exc_type, exc_val, exc_tb):\n                if not exc_type:\n                    self.close()\n\n            def __enter__(self):\n                return self\n\n            def readable(self):\n                return mode[0] == \"r\"\n\n            def writeable(self):\n                return mode[0] == \"w\"\n\n            def seekable(self):\n                return False\n\n        if mode[0] == \"w\":\n            wrapper = self.format.pipe_writer(Buffer())\n            wrapper.set_wrapper(wrapper)\n            return wrapper\n        else:\n            return self.format.pipe_reader(Buffer(self.fs.get_all_data()[fn]))\n"
  },
  {
    "path": "luigi/mypy.py",
    "content": "\"\"\"Plugin that provides support for luigi.Task\n\nThis Code reuses the code from mypy.plugins.dataclasses\nhttps://github.com/python/mypy/blob/0753e2a82dad35034e000609b6e8daa37238bfaa/mypy/plugins/dataclasses.py\n\"\"\"\n\nfrom __future__ import annotations\n\nfrom typing import Callable, Dict, Final, Iterator, List, Literal, Optional\n\nfrom mypy.expandtype import expand_type, expand_type_by_instance\nfrom mypy.nodes import (\n    ARG_NAMED_OPT,\n    ARG_POS,\n    Argument,\n    AssignmentStmt,\n    Block,\n    CallExpr,\n    ClassDef,\n    Context,\n    EllipsisExpr,\n    Expression,\n    FuncDef,\n    IfStmt,\n    JsonDict,\n    MemberExpr,\n    NameExpr,\n    PlaceholderNode,\n    RefExpr,\n    Statement,\n    SymbolTableNode,\n    TempNode,\n    TypeInfo,\n    Var,\n)\nfrom mypy.plugin import (\n    ClassDefContext,\n    FunctionContext,\n    Plugin,\n    SemanticAnalyzerPluginInterface,\n)\nfrom mypy.plugins.common import (\n    add_method_to_class,\n    deserialize_and_fixup_type,\n)\nfrom mypy.server.trigger import make_wildcard_trigger\nfrom mypy.state import state\nfrom mypy.typeops import map_type_from_supertype\nfrom mypy.types import (\n    AnyType,\n    CallableType,\n    Instance,\n    NoneType,\n    Type,\n    TypeOfAny,\n    get_proper_type,\n)\nfrom mypy.typevars import fill_typevars\n\nMETADATA_TAG: Final[str] = \"task\"\n\n\nclass TaskPlugin(Plugin):\n    def get_base_class_hook(self, fullname: str) -> Callable[[ClassDefContext], None] | None:\n        sym = self.lookup_fully_qualified(fullname)\n        if sym and isinstance(sym.node, TypeInfo):\n            if any(base.fullname == \"luigi.task.Task\" for base in sym.node.mro):\n                return self._task_class_maker_callback\n        return None\n\n    def get_function_hook(self, fullname: str) -> Callable[[FunctionContext], Type] | None:\n        \"\"\"Adjust the return type of the `Parameters` function.\"\"\"\n        if self.check_parameter(fullname):\n            
return self._task_parameter_field_callback\n        return None\n\n    def check_parameter(self, fullname):\n        sym = self.lookup_fully_qualified(fullname)\n        if sym and isinstance(sym.node, TypeInfo):\n            return any(base.fullname == \"luigi.parameter.Parameter\" for base in sym.node.mro)\n\n    def _task_class_maker_callback(self, ctx: ClassDefContext) -> None:\n        transformer = TaskTransformer(ctx.cls, ctx.reason, ctx.api, self)\n        transformer.transform()\n\n    def _infer_choice_enum_element_type(self, ctx: FunctionContext, default_type: Instance) -> Type:\n        \"\"\"Infer the element type for Choice/Enum parameter variants.\n\n        Checks the type argument first, then falls back to the 'choices' kwarg.\n        \"\"\"\n        element_type: Type = default_type.args[0] if default_type.args else AnyType(TypeOfAny.unannotated)\n        for i, names in enumerate(ctx.arg_names):\n            for j, name in enumerate(names):\n                if name == \"choices\":\n                    choices_type = get_proper_type(ctx.arg_types[i][j])\n                    if isinstance(choices_type, Instance) and choices_type.args:\n                        element_type = choices_type.args[0]\n        return element_type\n\n    def _task_parameter_field_callback(self, ctx: FunctionContext) -> Type:\n        \"\"\"Extract the type of the `default` argument from the Field function, and use it as the return type.\n\n        In particular:\n        * Retrieve the type of the argument which is specified, and use it as return type for the function.\n        * If no default argument is specified, use the __new__ method's return type from the Parameter class\n          e.g.\n          ```python\n          foo: int = luigi.IntParameter()  # IntParameter.__new__ returns int\n          ```\n        \"\"\"\n        # Try to get the return type from __new__ method\n        default_type = ctx.default_return_type\n        if isinstance(default_type, 
Instance):\n            # Handle Choice/Enum list parameters (ChoiceListParameter, EnumListParameter)\n            if default_type.type.fullname in (\n                \"luigi.parameter.ChoiceListParameter\",\n                \"luigi.parameter.EnumListParameter\",\n            ):\n                element_type = self._infer_choice_enum_element_type(ctx, default_type)\n                return ctx.api.named_generic_type(\"builtins.tuple\", [element_type])\n\n            # Handle Choice/Enum scalar parameters (ChoiceParameter, EnumParameter)\n            if default_type.type.fullname in (\n                \"luigi.parameter.ChoiceParameter\",\n                \"luigi.parameter.EnumParameter\",\n            ):\n                return self._infer_choice_enum_element_type(ctx, default_type)\n\n            # Check if a 'default' argument is explicitly provided\n            try:\n                default_idx = ctx.callee_arg_names.index(\"default\")\n                if ctx.args[default_idx]:\n                    default_arg = ctx.args[default_idx][0]\n                    if not isinstance(default_arg, EllipsisExpr):\n                        return ctx.arg_types[default_idx][0]\n            except ValueError:\n                pass\n\n            # For Parameter subclasses without explicit default, return Any\n            # so that both annotation styles work:\n            #   foo: int = IntParameter()           (resolved type annotation)\n            #   foo: IntParameter = IntParameter()  (parameter type annotation)\n            return AnyType(TypeOfAny.special_form)\n\n        try:\n            default_idx = ctx.callee_arg_names.index(\"default\")\n        except ValueError:\n            return AnyType(TypeOfAny.unannotated)\n\n        default_args = ctx.args[default_idx]\n\n        if default_args:\n            default_type = ctx.arg_types[default_idx][0]\n            default_arg = default_args[0]\n\n            if not isinstance(default_arg, EllipsisExpr):\n                
return default_type\n\n        return AnyType(TypeOfAny.unannotated)\n\n\nclass TaskAttribute:\n    def __init__(\n        self,\n        name: str,\n        has_default: bool,\n        line: int,\n        column: int,\n        type: Type | None,\n        info: TypeInfo,\n        api: SemanticAnalyzerPluginInterface,\n    ) -> None:\n        self.name = name\n        self.has_default = has_default\n        self.line = line\n        self.column = column\n        self.type = type  # Type as __init__ argument\n        self.info = info\n        self._api = api\n\n    def to_argument(self, current_info: TypeInfo, *, of: Literal[\"__init__\",]) -> Argument:\n        if of == \"__init__\":\n            # All arguments to __init__ are keyword-only and optional\n            # This is because gokart can set parameters by configuration'\n            arg_kind = ARG_NAMED_OPT\n        return Argument(\n            variable=self.to_var(current_info),\n            type_annotation=self.expand_type(current_info),\n            initializer=EllipsisExpr() if self.has_default else None,  # Only used by stubgen\n            kind=arg_kind,\n        )\n\n    def expand_type(self, current_info: TypeInfo) -> Type | None:\n        if self.type is not None and self.info.self_type is not None:\n            # In general, it is not safe to call `expand_type()` during semantic analysis,\n            # however this plugin is called very late, so all types should be fully ready.\n            # Also, it is tricky to avoid eager expansion of Self types here (e.g. 
because\n            # we serialize attributes).\n            with state.strict_optional_set(self._api.options.strict_optional):\n                return expand_type(self.type, {self.info.self_type.id: fill_typevars(current_info)})\n        return self.type\n\n    def to_var(self, current_info: TypeInfo) -> Var:\n        return Var(self.name, self.expand_type(current_info))\n\n    def serialize(self) -> JsonDict:\n        assert self.type\n        return {\n            \"name\": self.name,\n            \"has_default\": self.has_default,\n            \"line\": self.line,\n            \"column\": self.column,\n            \"type\": self.type.serialize(),\n        }\n\n    @classmethod\n    def deserialize(cls, info: TypeInfo, data: JsonDict, api: SemanticAnalyzerPluginInterface) -> TaskAttribute:\n        data = data.copy()\n        typ = deserialize_and_fixup_type(data.pop(\"type\"), api)\n        return cls(type=typ, info=info, **data, api=api)\n\n    def expand_typevar_from_subtype(self, sub_type: TypeInfo) -> None:\n        \"\"\"Expands type vars in the context of a subtype when an attribute is inherited\n        from a generic super type.\"\"\"\n        if self.type is not None:\n            with state.strict_optional_set(self._api.options.strict_optional):\n                self.type = map_type_from_supertype(self.type, sub_type, self.info)\n\n\nclass TaskTransformer:\n    \"\"\"Implement the behavior of gokart.Task.\"\"\"\n\n    def __init__(\n        self,\n        cls: ClassDef,\n        reason: Expression | Statement,\n        api: SemanticAnalyzerPluginInterface,\n        task_plugin: TaskPlugin,\n    ) -> None:\n        self._cls = cls\n        self._reason = reason\n        self._api = api\n        self._task_plugin = task_plugin\n\n    def transform(self) -> bool:\n        \"\"\"Apply all the necessary transformations to the underlying gokart.Task\"\"\"\n        info = self._cls.info\n        attributes = self.collect_attributes()\n\n        if 
attributes is None:\n            # Some definitions are not ready. We need another pass.\n            return False\n        for attr in attributes:\n            if attr.type is None:\n                return False\n        # If there are no attributes, it may be that the semantic analyzer has not\n        # processed them yet. In order to work around this, we can simply skip generating\n        # __init__ if there are no attributes, because if the user truly did not define any,\n        # then the object default __init__ with an empty signature will be present anyway.\n        if (\"__init__\" not in info.names or info.names[\"__init__\"].plugin_generated) and attributes:\n            args = [attr.to_argument(info, of=\"__init__\") for attr in attributes]\n            add_method_to_class(self._api, self._cls, \"__init__\", args=args, return_type=NoneType())\n        info.metadata[METADATA_TAG] = {\n            \"attributes\": [attr.serialize() for attr in attributes],\n        }\n\n        return True\n\n    def _get_assignment_statements_from_if_statement(self, stmt: IfStmt) -> Iterator[AssignmentStmt]:\n        for body in stmt.body:\n            if not body.is_unreachable:\n                yield from self._get_assignment_statements_from_block(body)\n        if stmt.else_body is not None and not stmt.else_body.is_unreachable:\n            yield from self._get_assignment_statements_from_block(stmt.else_body)\n\n    def _get_assignment_statements_from_block(self, block: Block) -> Iterator[AssignmentStmt]:\n        for stmt in block.body:\n            if isinstance(stmt, AssignmentStmt):\n                yield stmt\n            elif isinstance(stmt, IfStmt):\n                yield from self._get_assignment_statements_from_if_statement(stmt)\n\n    def collect_attributes(self) -> Optional[List[TaskAttribute]]:\n        \"\"\"Collect all attributes declared in the task and its parents.\n\n        All assignments of the form\n\n          a: SomeType\n          b: 
SomeOtherType = ...\n\n        are collected.\n\n        Return None if some base class hasn't been processed\n        yet and thus we'll need to ask for another pass.\n        \"\"\"\n        cls = self._cls\n\n        # First, collect attributes belonging to any class in the MRO, ignoring duplicates.\n        #\n        # We iterate through the MRO in reverse because attrs defined in the parent must appear\n        # earlier in the attributes list than attrs defined in the child.\n        #\n        # However, we also want attributes defined in the subtype to override ones defined\n        # in the parent. We can implement this via a dict without disrupting the attr order\n        # because dicts preserve insertion order in Python 3.7+.\n        found_attrs: Dict[str, TaskAttribute] = {}\n        for info in reversed(cls.info.mro[1:-1]):\n            if METADATA_TAG not in info.metadata:\n                continue\n            # Each class depends on the set of attributes in its task ancestors.\n            self._api.add_plugin_dependency(make_wildcard_trigger(info.fullname))\n\n            for data in info.metadata[METADATA_TAG][\"attributes\"]:\n                name: str = data[\"name\"]\n\n                attr = TaskAttribute.deserialize(info, data, self._api)\n                # TODO: We shouldn't be performing type operations during the main\n                #       semantic analysis pass, since some TypeInfo attributes might\n                #       still be in flux. 
This should be performed in a later phase.\n                attr.expand_typevar_from_subtype(cls.info)\n                found_attrs[name] = attr\n\n                sym_node = cls.info.names.get(name)\n                if sym_node and sym_node.node and not isinstance(sym_node.node, Var):\n                    self._api.fail(\n                        \"Task attribute may only be overridden by another attribute\",\n                        sym_node.node,\n                    )\n\n        # Second, collect attributes belonging to the current class.\n        current_attr_names: set[str] = set()\n        for stmt in self._get_assignment_statements_from_block(cls.defs):\n            if not self.is_parameter_call(stmt.rvalue):\n                continue\n\n            # a: int, b: str = 1, 'foo' is not supported syntax so we\n            # don't have to worry about it.\n            lhs = stmt.lvalues[0]\n            if not isinstance(lhs, NameExpr):\n                continue\n            sym = cls.info.names.get(lhs.name)\n            if sym is None:\n                # There was probably a semantic analysis error.\n                continue\n\n            node = sym.node\n            assert not isinstance(node, PlaceholderNode)\n\n            assert isinstance(node, Var)\n\n            has_parameter_call, parameter_args = self._collect_parameter_args(stmt.rvalue)\n            has_default = False\n            # Ensure that something like x: int = field() is rejected\n            # after an attribute with a default.\n            if has_parameter_call:\n                has_default = \"default\" in parameter_args\n\n            # All other assignments are already type checked.\n            elif not isinstance(stmt.rvalue, TempNode):\n                has_default = True\n\n            if not has_default:\n                # Make all non-default task attributes implicit because they are de-facto\n                # set on self in the generated __init__(), not in the class body. 
On the other\n                # hand, we don't know how custom task transforms initialize attributes,\n                # so we don't treat them as implicit. This is required to support descriptors\n                # (https://github.com/python/mypy/issues/14868).\n                sym.implicit = True\n\n            current_attr_names.add(lhs.name)\n            with state.strict_optional_set(self._api.options.strict_optional):\n                init_type = self._infer_task_attr_init_type(sym, stmt)\n\n            # When the type annotation is a Parameter type, update the\n            # symbol's type to the resolved type so that mypy uses it\n            # for the __init__ parameter type\n            if init_type is not None and init_type != sym.type:\n                assert isinstance(node, Var)\n                node.type = init_type\n\n            found_attrs[lhs.name] = TaskAttribute(\n                name=lhs.name,\n                has_default=has_default,\n                line=stmt.line,\n                column=stmt.column,\n                type=init_type,\n                info=cls.info,\n                api=self._api,\n            )\n\n        return list(found_attrs.values())\n\n    def _collect_parameter_args(self, expr: Expression) -> tuple[bool, Dict[str, Expression]]:\n        \"\"\"Returns a tuple where the first value represents whether or not\n        the expression is a call to luigi.Parameter()\n        and the second value is a dictionary of the keyword arguments that luigi.Parameter() was called with.\n        \"\"\"\n        if isinstance(expr, CallExpr) and isinstance(expr.callee, RefExpr):\n            args = {}\n            for name, arg in zip(expr.arg_names, expr.args):\n                if name is None:\n                    # NOTE: this is a workaround to get default value from a parameter\n                    self._api.fail(\n                        \"Positional arguments are not allowed for parameters when using the mypy plugin. 
\"\n                        \"Update your code to use named arguments, like luigi.Parameter(default='foo') instead of luigi.Parameter('foo')\",\n                        expr,\n                    )\n                    continue\n                args[name] = arg\n            return True, args\n        return False, {}\n\n    def _infer_task_attr_init_type(self, sym: SymbolTableNode, context: Context) -> Type | None:\n        \"\"\"Infer __init__ argument type for an attribute.\n\n        In particular, possibly use the signature of __set__.\n        \"\"\"\n        default = sym.type\n        t = get_proper_type(sym.type)\n\n        # If the type annotation is a Parameter subclass, resolve to the inner type T\n        # e.g. IntParameter -> int, StrParameter -> str\n        if isinstance(t, Instance):\n            is_param = self._task_plugin.check_parameter(t.type.fullname)\n            if is_param:\n                resolved = self._resolve_parameter_type(t)\n                return resolved\n\n        if sym.implicit:\n            return default\n\n        # Perform a simple-minded inference from the signature of __set__, if present.\n        # We can't use mypy.checkmember here, since this plugin runs before type checking.\n        # We only support some basic scanerios here, which is hopefully sufficient for\n        # the vast majority of use cases.\n        if not isinstance(t, Instance):\n            return default\n\n        setter = t.type.get(\"__set__\")\n\n        if not setter:\n            return default\n\n        if isinstance(setter.node, FuncDef):\n            super_info = t.type.get_containing_type_info(\"__set__\")\n            assert super_info\n            if setter.type:\n                setter_type = get_proper_type(map_type_from_supertype(setter.type, t.type, super_info))\n            else:\n                return AnyType(TypeOfAny.unannotated)\n            if isinstance(setter_type, CallableType) and setter_type.arg_kinds == [\n              
  ARG_POS,\n                ARG_POS,\n                ARG_POS,\n            ]:\n                return expand_type_by_instance(setter_type.arg_types[2], t)\n            else:\n                self._api.fail(f'Unsupported signature for \"__set__\" in \"{t.type.name}\"', context)\n        else:\n            self._api.fail(f'Unsupported \"__set__\" in \"{t.type.name}\"', context)\n\n        return default\n\n    def is_parameter_call(self, expr: Expression) -> bool:\n        \"\"\"Checks if the expression is a call to luigi.Parameter()\"\"\"\n        if not isinstance(expr, CallExpr):\n            return False\n\n        callee = expr.callee\n        fullname = None\n        if isinstance(callee, MemberExpr):\n            type_info = callee.node\n            if type_info is None and isinstance(callee.expr, NameExpr):\n                fullname = f\"{callee.expr.name}.{callee.name}\"\n        elif isinstance(callee, NameExpr):\n            type_info = callee.node\n        else:\n            return False\n\n        if isinstance(type_info, TypeInfo):\n            fullname = type_info.fullname\n\n        return fullname is not None and self._task_plugin.check_parameter(fullname)\n\n    def _resolve_parameter_type(self, t: Instance) -> Type:\n        \"\"\"Resolve a Parameter type annotation to its inner type T.\n\n        e.g. IntParameter -> int, Parameter[str] -> str\n        \"\"\"\n        # Direct Parameter[T] usage (e.g. Parameter[str])\n        if t.type.fullname == \"luigi.parameter.Parameter\" and t.args:\n            return t.args[0]\n\n        # Parameter subclass (e.g. IntParameter extends Parameter[int])\n        for base in t.type.bases:\n            if isinstance(base, Instance) and base.type.fullname == \"luigi.parameter.Parameter\":\n                if base.args:\n                    return base.args[0]\n                break\n\n        return AnyType(TypeOfAny.unannotated)\n\n\ndef plugin(version: str) -> type[Plugin]:\n    return TaskPlugin\n"
  },
  {
    "path": "luigi/notifications.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"Supports sending emails when tasks fail.\n\nThis needs some more documentation.\nSee :doc:`/configuration` for configuration options.\nIn particular using the config `receiver` should set up Luigi so that it will send emails when tasks fail.\n\n.. code-block:: ini\n\n    [email]\n    receiver=foo@bar.baz\n\"\"\"\n\nimport logging\nimport socket\nimport sys\nimport textwrap\n\nimport luigi.parameter\nfrom luigi.task import Config, Task\n\nlogger = logging.getLogger(\"luigi-interface\")\nDEFAULT_CLIENT_EMAIL = \"luigi-client@%s\" % socket.gethostname()\n\n\nclass TestNotificationsTask(Task):\n    \"\"\"\n    You may invoke this task to quickly check if you correctly have setup your\n    notifications Configuration.  You can run:\n\n    .. 
code-block:: console\n\n            $ luigi TestNotificationsTask --local-scheduler --email-force-send\n\n    And then check your email inbox to see if you got an error email or any\n    other kind of notifications that you expected.\n    \"\"\"\n\n    raise_in_complete = luigi.parameter.BoolParameter(description=\"If true, fail in complete() instead of run()\")\n\n    def run(self):\n        raise ValueError(\"Testing notifications triggering\")\n\n    def complete(self):\n        if self.raise_in_complete:\n            raise ValueError(\"Testing notifications triggering\")\n        return False\n\n\nclass email(Config):\n    force_send = luigi.parameter.BoolParameter(default=False, description=\"Send e-mail even from a tty\")\n    format = luigi.parameter.ChoiceParameter(\n        default=\"plain\", config_path=dict(section=\"core\", name=\"email-type\"), choices=(\"plain\", \"html\", \"none\"), description=\"Format type for sent e-mails\"\n    )\n    method = luigi.parameter.ChoiceParameter(\n        default=\"smtp\", config_path=dict(section=\"email\", name=\"type\"), choices=(\"smtp\", \"sendgrid\", \"ses\", \"sns\"), description=\"Method for sending e-mail\"\n    )\n    prefix = luigi.parameter.Parameter(default=\"\", config_path=dict(section=\"core\", name=\"email-prefix\"), description=\"Prefix for subject lines of all e-mails\")\n    receiver = luigi.parameter.Parameter(default=\"\", config_path=dict(section=\"core\", name=\"error-email\"), description=\"Address to send error e-mails to\")\n    traceback_max_length = luigi.parameter.IntParameter(default=5000, description=\"Max length for error traceback\")\n    sender = luigi.parameter.Parameter(\n        default=DEFAULT_CLIENT_EMAIL, config_path=dict(section=\"core\", name=\"email-sender\"), description=\"Address to send e-mails from\"\n    )\n\n\nclass smtp(Config):\n    host = luigi.parameter.Parameter(default=\"localhost\", config_path=dict(section=\"core\", name=\"smtp_host\"), description=\"Hostname 
of smtp server\")\n    local_hostname = luigi.parameter.Parameter(\n        default=None,\n        config_path=dict(section=\"core\", name=\"smtp_local_hostname\"),\n        description=\"If specified, local_hostname is used as the FQDN of the local host in the HELO/EHLO command\",\n    )\n    no_tls = luigi.parameter.BoolParameter(\n        default=False, config_path=dict(section=\"core\", name=\"smtp_without_tls\"), description=\"Do not use TLS in SMTP connections\"\n    )\n    password = luigi.parameter.Parameter(default=None, config_path=dict(section=\"core\", name=\"smtp_password\"), description=\"Password for the SMTP server login\")\n    port = luigi.parameter.IntParameter(default=0, config_path=dict(section=\"core\", name=\"smtp_port\"), description=\"Port number for smtp server\")\n    ssl = luigi.parameter.BoolParameter(default=False, config_path=dict(section=\"core\", name=\"smtp_ssl\"), description=\"Use SSL for the SMTP connection.\")\n    timeout = luigi.parameter.FloatParameter(\n        default=10.0, config_path=dict(section=\"core\", name=\"smtp_timeout\"), description=\"Number of seconds before timing out the smtp connection\"\n    )\n    username = luigi.parameter.Parameter(\n        default=None, config_path=dict(section=\"core\", name=\"smtp_login\"), description=\"Username used to log in to the SMTP host\"\n    )\n\n\nclass sendgrid(Config):\n    apikey = luigi.parameter.Parameter(config_path=dict(section=\"email\", name=\"SENGRID_API_KEY\"), description=\"API key for SendGrid login\")\n\n\ndef generate_email(sender, subject, message, recipients, image_png):\n    from email.mime.image import MIMEImage\n    from email.mime.multipart import MIMEMultipart\n    from email.mime.text import MIMEText\n\n    msg_root = MIMEMultipart(\"related\")\n\n    msg_text = MIMEText(message, email().format, \"utf-8\")\n    msg_root.attach(msg_text)\n\n    if image_png:\n        with open(image_png, \"rb\") as fp:\n            msg_image = MIMEImage(fp.read(), 
\"png\")\n        msg_root.attach(msg_image)\n\n    msg_root[\"Subject\"] = subject\n    msg_root[\"From\"] = sender\n    msg_root[\"To\"] = \",\".join(recipients)\n\n    return msg_root\n\n\ndef wrap_traceback(traceback):\n    \"\"\"\n    For internal use only (until further notice)\n    \"\"\"\n    if email().format == \"html\":\n        try:\n            from pygments import highlight\n            from pygments.formatters import HtmlFormatter\n            from pygments.lexers import PythonTracebackLexer\n\n            with_pygments = True\n        except ImportError:\n            with_pygments = False\n\n        if with_pygments:\n            formatter = HtmlFormatter(noclasses=True)\n            wrapped = highlight(traceback, PythonTracebackLexer(), formatter)\n        else:\n            wrapped = \"<pre>%s</pre>\" % traceback\n    else:\n        wrapped = traceback\n\n    return wrapped\n\n\ndef send_email_smtp(sender, subject, message, recipients, image_png):\n    import smtplib\n\n    smtp_config = smtp()\n    kwargs = dict(\n        host=smtp_config.host,\n        port=smtp_config.port,\n        local_hostname=smtp_config.local_hostname,\n    )\n    if smtp_config.timeout:\n        kwargs[\"timeout\"] = smtp_config.timeout\n\n    try:\n        smtp_conn = smtplib.SMTP_SSL(**kwargs) if smtp_config.ssl else smtplib.SMTP(**kwargs)\n        smtp_conn.ehlo_or_helo_if_needed()\n        if smtp_conn.has_extn(\"starttls\") and not smtp_config.no_tls:\n            smtp_conn.starttls()\n        if smtp_config.username and smtp_config.password:\n            smtp_conn.login(smtp_config.username, smtp_config.password)\n\n        msg_root = generate_email(sender, subject, message, recipients, image_png)\n\n        smtp_conn.sendmail(sender, recipients, msg_root.as_string())\n    except socket.error as exception:\n        logger.error(\"Not able to connect to smtp server: %s\", exception)\n\n\ndef send_email_ses(sender, subject, message, recipients, image_png):\n    
\"\"\"\n    Sends notification through AWS SES.\n\n    Does not handle access keys.  Use either\n      1/ configuration file\n      2/ EC2 instance profile\n\n    See also https://boto3.readthedocs.io/en/latest/guide/configuration.html.\n    \"\"\"\n    from boto3 import client as boto3_client\n\n    client = boto3_client(\"ses\")\n\n    msg_root = generate_email(sender, subject, message, recipients, image_png)\n    response = client.send_raw_email(Source=sender, Destinations=recipients, RawMessage={\"Data\": msg_root.as_string()})\n\n    logger.debug(\n        (\"Message sent to SES.\\nMessageId: {},\\nRequestId: {},\\nHTTPSStatusCode: {}\").format(\n            response[\"MessageId\"], response[\"ResponseMetadata\"][\"RequestId\"], response[\"ResponseMetadata\"][\"HTTPStatusCode\"]\n        )\n    )\n\n\ndef send_email_sendgrid(sender, subject, message, recipients, image_png):\n    import sendgrid as sendgrid_lib\n\n    client = sendgrid_lib.SendGridAPIClient(sendgrid().apikey)\n\n    to_send = sendgrid_lib.Mail(from_email=sender, to_emails=recipients, subject=subject)\n\n    if email().format == \"html\":\n        to_send.add_content(message, \"text/html\")\n    else:\n        to_send.add_content(message, \"text/plain\")\n\n    if image_png:\n        to_send.add_attachment(image_png)\n\n    client.send(to_send)\n\n\ndef _email_disabled_reason():\n    if email().format == \"none\":\n        return \"email format is 'none'\"\n    elif email().force_send:\n        return None\n    elif sys.stdout.isatty():\n        return \"running from a tty\"\n    else:\n        return None\n\n\ndef send_email_sns(sender, subject, message, topic_ARN, image_png):\n    \"\"\"\n    Sends notification through AWS SNS. Takes Topic ARN from recipients.\n\n    Does not handle access keys.  
Use either\n      1/ configuration file\n      2/ EC2 instance profile\n\n    See also https://boto3.readthedocs.io/en/latest/guide/configuration.html.\n    \"\"\"\n    from boto3 import resource as boto3_resource\n\n    sns = boto3_resource(\"sns\")\n    topic = sns.Topic(topic_ARN[0])\n\n    # Subject is max 100 chars\n    if len(subject) > 100:\n        subject = subject[0:48] + \"...\" + subject[-49:]\n\n    response = topic.publish(Subject=subject, Message=message)\n\n    logger.debug(\n        (\"Message sent to SNS.\\nMessageId: {},\\nRequestId: {},\\nHTTPSStatusCode: {}\").format(\n            response[\"MessageId\"], response[\"ResponseMetadata\"][\"RequestId\"], response[\"ResponseMetadata\"][\"HTTPStatusCode\"]\n        )\n    )\n\n\ndef send_email(subject, message, sender, recipients, image_png=None):\n    \"\"\"\n    Decides whether to send notification. Notification is cancelled if there are\n    no recipients or if stdout is onto tty or if in debug mode.\n\n    Dispatches on config value email.method.  
Default is 'smtp'.\n    \"\"\"\n    notifiers = {\n        \"ses\": send_email_ses,\n        \"sendgrid\": send_email_sendgrid,\n        \"smtp\": send_email_smtp,\n        \"sns\": send_email_sns,\n    }\n\n    subject = _prefix(subject)\n    if not recipients or recipients == (None,):\n        return\n\n    if _email_disabled_reason():\n        logger.info(\"Not sending email to %r because %s\", recipients, _email_disabled_reason())\n        return\n\n    # Clean the recipients lists to allow multiple email addresses, comma\n    # separated in luigi.cfg\n    recipients_tmp = []\n    for r in recipients:\n        recipients_tmp.extend([a.strip() for a in r.split(\",\") if a.strip()])\n\n    # Replace original recipients with the clean list\n    recipients = recipients_tmp\n\n    logger.info(\"Sending email to %r\", recipients)\n\n    # Get appropriate sender and call it to send the notification\n    email_sender = notifiers[email().method]\n    email_sender(sender, subject, message, recipients, image_png)\n\n\ndef _email_recipients(additional_recipients=None):\n    receiver = email().receiver\n    recipients = [receiver] if receiver else []\n    if additional_recipients:\n        if isinstance(additional_recipients, str):\n            recipients.append(additional_recipients)\n        else:\n            recipients.extend(additional_recipients)\n    return recipients\n\n\ndef send_error_email(subject, message, additional_recipients=None):\n    \"\"\"\n    Sends an email to the configured error email, if it's configured.\n    \"\"\"\n    recipients = _email_recipients(additional_recipients)\n    sender = email().sender\n    send_email(subject=subject, message=message, sender=sender, recipients=recipients)\n\n\ndef _prefix(subject):\n    \"\"\"\n    If the config has a special prefix for emails then this function adds\n    this prefix.\n    \"\"\"\n    if email().prefix:\n        return \"{} {}\".format(email().prefix, subject)\n    else:\n        return 
subject\n\n\ndef format_task_error(headline, task, command, formatted_exception=None):\n    \"\"\"\n    Format a message body for an error email related to a Task\n\n    :param headline: Summary line for the message\n    :param task: `Task` instance where this error occurred\n    :param formatted_exception: optional string showing traceback\n\n    :return: message body\n    \"\"\"\n\n    if formatted_exception:\n        if len(formatted_exception) > email().traceback_max_length:\n            truncated_exception = formatted_exception[: email().traceback_max_length]\n            formatted_exception = f\"{truncated_exception}...Traceback exceeds max length and has been truncated.\"\n\n    if formatted_exception:\n        formatted_exception = wrap_traceback(formatted_exception)\n    else:\n        formatted_exception = \"\"\n\n    if email().format == \"html\":\n        msg_template = textwrap.dedent(\"\"\"\n        <html>\n        <body>\n        <h2>{headline}</h2>\n\n        <table style=\"border-top: 1px solid black; border-bottom: 1px solid black\">\n        <thead>\n        <tr><th>name</th><td>{name}</td></tr>\n        </thead>\n        <tbody>\n        {param_rows}\n        </tbody>\n        </table>\n        </pre>\n\n        <h2>Command line</h2>\n        <pre>\n        {command}\n        </pre>\n\n        <h2>Traceback</h2>\n        {traceback}\n        </body>\n        </html>\n        \"\"\")\n\n        str_params = task.to_str_params()\n        params = \"\\n\".join(\"<tr><th>{}</th><td>{}</td></tr>\".format(*items) for items in str_params.items())\n        body = msg_template.format(headline=headline, name=task.task_family, param_rows=params, command=command, traceback=formatted_exception)\n    else:\n        msg_template = textwrap.dedent(\"\"\"\\\n        {headline}\n\n        Name: {name}\n\n        Parameters:\n        {params}\n\n        Command line:\n          {command}\n\n        {traceback}\n        \"\"\")\n\n        str_params = 
task.to_str_params()\n        max_width = max([0] + [len(x) for x in str_params.keys()])\n        params = \"\\n\".join(\"  {:{width}}: {}\".format(*items, width=max_width) for items in str_params.items())\n        body = msg_template.format(headline=headline, name=task.task_family, params=params, command=command, traceback=formatted_exception)\n\n    return body\n"
  },
  {
    "path": "luigi/parameter.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"Parameters are one of the core concepts of Luigi.\nAll Parameters sit on :class:`~luigi.task.Task` classes.\nSee :ref:`Parameter` for more info on how to define parameters.\n\"\"\"\n\nimport abc\nimport datetime\nimport json\nimport operator\nimport warnings\nfrom ast import literal_eval\nfrom enum import Enum, IntEnum\nfrom json import JSONEncoder\nfrom pathlib import Path\nfrom typing import (\n    Any,\n    Callable,\n    Dict,\n    Generic,\n    Iterable,\n    Optional,\n    Sequence,\n    Tuple,\n    Type,\n    TypedDict,\n    Union,\n    overload,\n)\n\nfrom typing_extensions import TypeVar, Unpack\n\ntry:\n    import jsonschema\n\n    _JSONSCHEMA_ENABLED = True\nexcept ImportError:\n    _JSONSCHEMA_ENABLED = False\n\nfrom configparser import NoOptionError, NoSectionError\n\nimport luigi\nfrom luigi import configuration, date_interval, task_register\nfrom luigi.cmdline_parser import CmdlineParser\n\nfrom .freezing import FrozenOrderedDict, recursively_freeze, recursively_unfreeze\n\n\nclass _NoValueType:\n    \"\"\"Sentinel class representing \"no default value provided\".\"\"\"\n\n    _instance: \"Optional[_NoValueType]\" = None\n\n    def __new__(cls) -> \"_NoValueType\":\n        if cls._instance is None:\n            cls._instance = super().__new__(cls)\n        return cls._instance\n\n    def __repr__(self) -> str:\n    
    return \"<no_value>\"\n\n\n_no_value = _NoValueType()\n\n\nclass ParameterVisibility(IntEnum):\n    \"\"\"\n    Possible values for the parameter visibility option. Public is the default.\n    See :doc:`/parameters` for more info.\n    \"\"\"\n\n    PUBLIC = 0\n    HIDDEN = 1\n    PRIVATE = 2\n\n    @classmethod\n    def has_value(cls, value):\n        return any(value == item.value for item in cls)\n\n    def serialize(self):\n        return self.value\n\n\nclass ParameterException(Exception):\n    \"\"\"\n    Base exception.\n    \"\"\"\n\n    pass\n\n\nclass MissingParameterException(ParameterException):\n    \"\"\"\n    Exception signifying that there was a missing Parameter.\n    \"\"\"\n\n    pass\n\n\nclass UnknownParameterException(ParameterException):\n    \"\"\"\n    Exception signifying that an unknown Parameter was supplied.\n    \"\"\"\n\n    pass\n\n\nclass DuplicateParameterException(ParameterException):\n    \"\"\"\n    Exception signifying that a Parameter was specified multiple times.\n    \"\"\"\n\n    pass\n\n\nclass OptionalParameterTypeWarning(UserWarning):\n    \"\"\"\n    Warning class for OptionalParameterMixin with wrong type.\n    \"\"\"\n\n    pass\n\n\nclass UnconsumedParameterWarning(UserWarning):\n    \"\"\"Warning class for parameters that are not consumed by the task.\"\"\"\n\n\nT = TypeVar(\"T\", default=str)\n_OptT = TypeVar(\"_OptT\")\n\n\nclass ConfigPath(TypedDict):\n    section: str\n    name: str\n\n\nclass _ParameterKwargs(TypedDict, total=False):\n    is_global: bool\n    significant: bool\n    description: Optional[str]\n    config_path: Optional[ConfigPath]\n    positional: bool\n    always_in_help: bool\n    batch_method: Optional[Callable[[Iterable[Any]], Any]]\n    visibility: ParameterVisibility\n\n\nclass Parameter(Generic[T]):\n    \"\"\"\n    Parameter whose value is a ``str``, and a base class for other parameter types.\n\n    Parameters are objects set on the Task class level to make it possible to 
parameterize tasks.\n    For instance:\n\n    .. code:: python\n\n        class MyTask(luigi.Task):\n            foo = luigi.Parameter()\n\n        class RequiringTask(luigi.Task):\n            def requires(self):\n                return MyTask(foo=\"hello\")\n\n            def run(self):\n                print(self.requires().foo)  # prints \"hello\"\n\n    This makes it possible to instantiate multiple tasks, eg ``MyTask(foo='bar')`` and\n    ``MyTask(foo='baz')``. The task will then have the ``foo`` attribute set appropriately.\n\n    When a task is instantiated, it will first use any argument as the value of the parameter, eg.\n    if you instantiate ``a = TaskA(x=44)`` then ``a.x == 44``. When the value is not provided, the\n    value  will be resolved in this order of falling priority:\n\n        * Any value provided on the command line:\n\n          - To the root task (eg. ``--param xyz``)\n\n          - Then to the class, using the qualified task name syntax (eg. ``--TaskA-param xyz``).\n\n        * With ``[TASK_NAME]>PARAM_NAME: <serialized value>`` syntax. See :ref:`ParamConfigIngestion`\n\n        * Any default value set using the ``default`` flag.\n\n    Parameter objects may be reused, but you must then set the ``positional=False`` flag.\n    \"\"\"\n\n    _counter = 0  # non-atomically increasing counter used for ordering parameters.\n\n    def __init__(\n        self,\n        default: Union[T, _NoValueType] = _no_value,\n        is_global: bool = False,\n        significant: bool = True,\n        description: Optional[str] = None,\n        config_path: Optional[ConfigPath] = None,\n        positional: bool = True,\n        always_in_help: bool = False,\n        batch_method: Optional[Callable[[Iterable[Any]], Any]] = None,\n        visibility: ParameterVisibility = ParameterVisibility.PUBLIC,\n    ):\n        \"\"\"\n        :param default: the default value for this parameter. 
This should match the type of the\n                        Parameter, i.e. ``datetime.date`` for ``DateParameter`` or ``int`` for\n                        ``IntParameter``. By default, no default is stored and\n                        the value must be specified at runtime.\n        :param bool significant: specify ``False`` if the parameter should not be treated as part of\n                                 the unique identifier for a Task. An insignificant Parameter might\n                                 also be used to specify a password or other sensitive information\n                                 that should not be made public via the scheduler. Default:\n                                 ``True``.\n        :param str description: A human-readable string describing the purpose of this Parameter.\n                                For command-line invocations, this will be used as the `help` string\n                                shown to users. Default: ``None``.\n        :param dict config_path: a dictionary with entries ``section`` and ``name``\n                                 specifying a config file entry from which to read the\n                                 default value for this parameter. DEPRECATED.\n                                 Default: ``None``.\n        :param bool positional: If true, you can set the argument as a\n                                positional argument. It's true by default but we recommend\n                                ``positional=False`` for abstract base classes and similar cases.\n        :param bool always_in_help: For the --help option in the command line\n                                    parsing. Set true to always show in --help.\n        :param function(iterable[A])->A batch_method: Method to combine an iterable of parsed\n                                                        parameter values into a single value. 
Used\n                                                        when receiving batched parameter lists from\n                                                        the scheduler. See :ref:`batch_method`\n\n        :param visibility: A Parameter whose value is a :py:class:`~luigi.parameter.ParameterVisibility`.\n                            Default value is ParameterVisibility.PUBLIC\n\n        \"\"\"\n        self._default = default\n        self._batch_method = batch_method\n        if is_global:\n            warnings.warn(\"is_global support is removed. Assuming positional=False\", DeprecationWarning, stacklevel=2)\n            positional = False\n        self.significant = significant  # Whether different values for this parameter will differentiate otherwise equal tasks\n        self.positional = positional\n        self.visibility = visibility if ParameterVisibility.has_value(visibility) else ParameterVisibility.PUBLIC\n\n        self.description = description\n        self.always_in_help = always_in_help\n\n        if config_path is not None and (\"section\" not in config_path or \"name\" not in config_path):\n            raise ParameterException(\"config_path must be a hash containing entries for section and name\")\n        self._config_path = config_path\n\n        self._counter = Parameter._counter  # We need to keep track of this to get the order right (see Task class)\n        Parameter._counter += 1\n\n    @overload\n    def __get__(self, instance: None, owner: Any) -> \"Parameter[T]\": ...\n\n    @overload\n    def __get__(self, instance: Any, owner: Any) -> T: ...\n\n    def __get__(self, instance: Any, owner: Any) -> Any:\n        if instance is None:\n            return self\n        return instance.__dict__[self._attribute_name]\n\n    def __set_name__(self, owner, name):\n        self._attribute_name = name\n\n    def __set__(self, instance: Any, value: T):\n        if self._attribute_name is None:\n            raise RuntimeError(\"Parameter name 
not set. ensure it's defined as a class attribute.\")\n        instance.__dict__[self._attribute_name] = value\n\n    def _get_value_from_config(self, section, name):\n        \"\"\"Loads the default from the config. Returns _no_value if it doesn't exist\"\"\"\n\n        conf = configuration.get_config()\n\n        try:\n            value = conf.get(section, name)\n        except (NoSectionError, NoOptionError, KeyError):\n            return _no_value\n\n        return self.parse(value)\n\n    def _get_value(self, task_name, param_name):\n        for value, warn in self._value_iterator(task_name, param_name):\n            if value != _no_value:\n                if warn:\n                    warnings.warn(warn, DeprecationWarning)\n                return value\n        return _no_value\n\n    def _value_iterator(self, task_name, param_name):\n        \"\"\"\n        Yield the parameter values, with optional deprecation warning as second tuple value.\n\n        The parameter value will be whatever non-_no_value that is yielded first.\n        \"\"\"\n        cp_parser = CmdlineParser.get_instance()\n        if cp_parser:\n            dest = self._parser_global_dest(param_name, task_name)\n            found = getattr(cp_parser.known_args, dest, None)\n            yield (self._parse_or_no_value(found), None)\n        yield (self._get_value_from_config(task_name, param_name), None)\n        if self._config_path:\n            yield (\n                self._get_value_from_config(self._config_path[\"section\"], self._config_path[\"name\"]),\n                \"The use of the configuration [{}] {} is deprecated. 
Please use [{}] {}\".format(\n                    self._config_path[\"section\"], self._config_path[\"name\"], task_name, param_name\n                ),\n            )\n        yield (self._default, None)\n\n    def has_task_value(self, task_name, param_name):\n        return self._get_value(task_name, param_name) != _no_value\n\n    def task_value(self, task_name, param_name):\n        value = self._get_value(task_name, param_name)\n        if value == _no_value:\n            raise MissingParameterException(\"No default specified\")\n        else:\n            return self.normalize(value)\n\n    def _is_batchable(self):\n        return self._batch_method is not None\n\n    def parse(self, x):\n        \"\"\"\n        Parse an individual value from the input.\n\n        The default implementation is the identity function, but subclasses should override\n        this method for specialized parsing.\n\n        :param str x: the value to parse.\n        :return: the parsed value.\n        \"\"\"\n        return x  # default impl\n\n    def _parse_list(self, xs):\n        \"\"\"\n        Parse a list of values from the scheduler.\n\n        Only possible if this is_batchable() is True. This will combine the list into a single\n        parameter value using batch method. 
This should never need to be overridden.\n\n        :param xs: list of values to parse and combine\n        :return: the combined parsed values\n        \"\"\"\n        if not self._is_batchable():\n            raise NotImplementedError(\"No batch method found\")\n        elif not xs:\n            raise ValueError(\"Empty parameter list passed to parse_list\")\n        else:\n            return self._batch_method(map(self.parse, xs))\n\n    def serialize(self, x):\n        \"\"\"\n        Opposite of :py:meth:`parse`.\n\n        Converts the value ``x`` to a string.\n\n        :param x: the value to serialize.\n        \"\"\"\n        return str(x)\n\n    def _warn_on_wrong_param_type(self, param_name, param_value):\n        if self.__class__ != Parameter:\n            return\n        if not isinstance(param_value, str):\n            warnings.warn('Parameter \"{}\" with value \"{}\" is not of type string.'.format(param_name, param_value))\n\n    def normalize(self, x):\n        \"\"\"\n        Given a parsed parameter value, normalizes it.\n\n        The value can either be the result of parse(), the default value or\n        arguments passed into the task's constructor by instantiation.\n\n        This is very implementation defined, but can be used to validate/clamp\n        valid values. For example, if you wanted to only accept even integers,\n        and \"correct\" odd values to the nearest integer, you can implement\n        normalize as ``x // 2 * 2``.\n        \"\"\"\n        return x  # default impl\n\n    def next_in_enumeration(self, value):\n        \"\"\"\n        If your Parameter type has an enumerable ordering of values. You can\n        choose to override this method. This method is used by the\n        :py:mod:`luigi.execution_summary` module for pretty printing\n        purposes. 
Enabling it to pretty print tasks like ``MyTask(num=1),\n        MyTask(num=2), MyTask(num=3)`` to ``MyTask(num=1..3)``.\n\n        :param value: The value\n        :return: The next value, like \"value + 1\". Or ``None`` if there's no enumerable ordering.\n        \"\"\"\n        return None\n\n    def _parse_or_no_value(self, x):\n        if not x:\n            return _no_value\n        else:\n            return self.parse(x)\n\n    @staticmethod\n    def _parser_global_dest(param_name, task_name):\n        return task_name + \"_\" + param_name\n\n    @classmethod\n    def _parser_kwargs(cls, param_name, task_name=None):\n        return {\n            \"action\": \"store\",\n            \"dest\": cls._parser_global_dest(param_name, task_name) if task_name else param_name,\n        }\n\n\nclass OptionalParameterMixin(Generic[_OptT]):\n    \"\"\"\n    Mixin to make a parameter class optional and treat empty string as None.\n    \"\"\"\n\n    expected_type: type = type(None)\n\n    def __init__(\n        self,\n        default: Union[_OptT, None, _NoValueType] = _no_value,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        super().__init__(default=default, **kwargs)  # type: ignore[arg-type, call-arg, misc]\n\n    @overload\n    def __get__(self, instance: None, owner: Any) -> \"Parameter[Optional[_OptT]]\": ...\n\n    @overload\n    def __get__(self, instance: Any, owner: Any) -> Optional[_OptT]: ...\n\n    def __get__(self, instance: Any, owner: Any) -> Any:\n        return super().__get__(instance, owner)  # type: ignore[misc]\n\n    def __set__(self, instance: Any, value: Optional[_OptT]):\n        super().__set__(instance, value)  # type: ignore[misc]\n\n    def serialize(self, x):\n        \"\"\"\n        Parse the given value if the value is not None else return an empty string.\n        \"\"\"\n        if x is None:\n            return \"\"\n        else:\n            return super().serialize(x)\n\n    def parse(self, x):\n        \"\"\"\n        
Parse the given value if it is a string (empty strings are parsed to None).\n        \"\"\"\n        if not isinstance(x, str):\n            return x\n        elif x:\n            return super().parse(x)\n        else:\n            return None\n\n    def normalize(self, x):\n        \"\"\"\n        Normalize the given value if it is not None.\n        \"\"\"\n        if x is None:\n            return None\n        return super().normalize(x)\n\n    def _warn_on_wrong_param_type(self, param_name, param_value):\n        if not isinstance(param_value, self.expected_type) and param_value is not None:\n            try:\n                param_type = \"any type in \" + str([i.__name__ for i in self.expected_type]).replace(\"'\", '\"')\n            except TypeError:\n                param_type = f'type \"{self.expected_type.__name__}\"'\n            warnings.warn(\n                (f'{self.__class__.__name__} \"{param_name}\" with value \"{param_value}\" is not of {param_type} or None.'),\n                OptionalParameterTypeWarning,\n            )\n\n    def next_in_enumeration(self, value):\n        return None\n\n\nclass OptionalParameter(OptionalParameterMixin[str], Parameter[Optional[str]]):\n    \"\"\"Class to parse optional parameters.\"\"\"\n\n    expected_type = str\n\n\nclass OptionalStrParameter(OptionalParameterMixin[str], Parameter[Optional[str]]):\n    \"\"\"Class to parse optional str parameters.\"\"\"\n\n    expected_type = str\n\n\n_UNIX_EPOCH = datetime.datetime.utcfromtimestamp(0)\n\n\nclass _DateParameterBase(Parameter[datetime.date]):\n    \"\"\"\n    Base class Parameter for date (not datetime).\n    \"\"\"\n\n    def __init__(\n        self,\n        default: Union[datetime.date, _NoValueType] = _no_value,\n        interval: int = 1,\n        start: Optional[datetime.date] = None,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        super().__init__(default=default, **kwargs)\n        self.interval = interval\n        self.start = start if 
start is not None else _UNIX_EPOCH.date()\n\n    @property\n    @abc.abstractmethod\n    def date_format(self):\n        \"\"\"\n        Override me with a :py:meth:`~datetime.date.strftime` string.\n        \"\"\"\n        pass\n\n    def parse(self, s):\n        \"\"\"\n        Parses a date string formatted like ``YYYY-MM-DD``.\n        \"\"\"\n        return datetime.datetime.strptime(s, self.date_format).date()\n\n    def serialize(self, dt):\n        \"\"\"\n        Converts the date to a string using the :py:attr:`~_DateParameterBase.date_format`.\n        \"\"\"\n        if dt is None:\n            return str(dt)\n        return dt.strftime(self.date_format)\n\n\nclass DateParameter(_DateParameterBase):\n    \"\"\"\n    Parameter whose value is a :py:class:`~datetime.date`.\n\n    A DateParameter is a Date string formatted ``YYYY-MM-DD``. For example, ``2013-07-10`` specifies\n    July 10, 2013.\n\n    DateParameters are 90% of the time used to be interpolated into file system paths or the like.\n    Here is a gentle reminder of how to interpolate date parameters into strings:\n\n    .. code:: python\n\n        class MyTask(luigi.Task):\n            date = luigi.DateParameter()\n\n            def run(self):\n                templated_path = \"/my/path/to/my/dataset/{date:%Y/%m/%d}/\"\n                instantiated_path = templated_path.format(date=self.date)\n                # print(instantiated_path) --> /my/path/to/my/dataset/2016/06/09/\n                # ... use instantiated_path ...\n\n    To set this parameter to default to the current day. You can write code like this:\n\n    .. 
code:: python\n\n        import datetime\n\n        class MyTask(luigi.Task):\n            date = luigi.DateParameter(default=datetime.date.today())\n    \"\"\"\n\n    date_format = \"%Y-%m-%d\"\n\n    def next_in_enumeration(self, value):\n        return value + datetime.timedelta(days=self.interval)\n\n    def normalize(self, x):\n        if x is None:\n            return None\n\n        if isinstance(x, datetime.datetime):\n            x = x.date()\n\n        delta = (x - self.start).days % self.interval\n        return x - datetime.timedelta(days=delta)\n\n\nclass MonthParameter(DateParameter):\n    \"\"\"\n    Parameter whose value is a :py:class:`~datetime.date`, specified to the month\n    (day of :py:class:`~datetime.date` is \"rounded\" to first of the month).\n\n    A MonthParameter is a Date string formatted ``YYYY-MM``. For example, ``2013-07`` specifies\n    July of 2013. Task objects constructed from code accept :py:class:`~datetime.date` (ignoring the day value) or\n    :py:class:`~luigi.date_interval.Month`.\n    \"\"\"\n\n    date_format = \"%Y-%m\"\n\n    def _add_months(self, date, months):\n        \"\"\"\n        Add ``months`` months to ``date``.\n\n        Unfortunately we can't use timedeltas to add months because timedelta counts in days\n        and there's no foolproof way to add N months in days without counting the number of\n        days per month.\n        \"\"\"\n        year = date.year + (date.month + months - 1) // 12\n        month = (date.month + months - 1) % 12 + 1\n        return datetime.date(year=year, month=month, day=1)\n\n    def next_in_enumeration(self, value):\n        return self._add_months(value, self.interval)\n\n    def normalize(self, x):\n        if x is None:\n            return None\n\n        if isinstance(x, date_interval.Month):\n            x = x.date_a\n\n        months_since_start = (x.year - self.start.year) * 12 + (x.month - self.start.month)\n        months_since_start -= months_since_start % 
self.interval\n\n        return self._add_months(self.start, months_since_start)\n\n\nclass YearParameter(DateParameter):\n    \"\"\"\n    Parameter whose value is a :py:class:`~datetime.date`, specified to the year\n    (day and month of :py:class:`~datetime.date` is \"rounded\" to first day of the year).\n\n    A YearParameter is a Date string formatted ``YYYY``. Task objects constructed from code accept\n    :py:class:`~datetime.date` (ignoring the month and day values) or :py:class:`~luigi.date_interval.Year`.\n    \"\"\"\n\n    date_format = \"%Y\"\n\n    def next_in_enumeration(self, value):\n        return value.replace(year=value.year + self.interval)\n\n    def normalize(self, x):\n        if x is None:\n            return None\n\n        if isinstance(x, date_interval.Year):\n            x = x.date_a\n\n        delta = (x.year - self.start.year) % self.interval\n        return datetime.date(year=x.year - delta, month=1, day=1)\n\n\nclass _DatetimeParameterBase(Parameter[datetime.datetime]):\n    \"\"\"\n    Base class Parameter for datetime\n    \"\"\"\n\n    def __init__(\n        self,\n        default: Union[datetime.datetime, _NoValueType] = _no_value,\n        interval: int = 1,\n        start: Optional[datetime.datetime] = None,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        super().__init__(default=default, **kwargs)\n        self.interval = interval\n        self.start = start if start is not None else _UNIX_EPOCH\n\n    @property\n    @abc.abstractmethod\n    def date_format(self):\n        \"\"\"\n        Override me with a :py:meth:`~datetime.date.strftime` string.\n        \"\"\"\n        pass\n\n    @property\n    @abc.abstractmethod\n    def _timedelta(self):\n        \"\"\"\n        How to move one interval of this type forward (i.e. 
not counting self.interval).\n        \"\"\"\n        pass\n\n    def parse(self, s):\n        \"\"\"\n        Parses a string to a :py:class:`~datetime.datetime`.\n        \"\"\"\n        return datetime.datetime.strptime(s, self.date_format)\n\n    def serialize(self, dt):\n        \"\"\"\n        Converts the date to a string using the :py:attr:`~_DatetimeParameterBase.date_format`.\n        \"\"\"\n        if dt is None:\n            return str(dt)\n        return dt.strftime(self.date_format)\n\n    @staticmethod\n    def _convert_to_dt(dt):\n        if not isinstance(dt, datetime.datetime):\n            dt = datetime.datetime.combine(dt, datetime.time.min)\n        return dt\n\n    def normalize(self, dt):\n        \"\"\"\n        Clamp dt to every Nth :py:attr:`~_DatetimeParameterBase.interval` starting at\n        :py:attr:`~_DatetimeParameterBase.start`.\n        \"\"\"\n        if dt is None:\n            return None\n\n        dt = self._convert_to_dt(dt)\n\n        dt = dt.replace(microsecond=0)  # remove microseconds, to avoid float rounding issues.\n        delta = (dt - self.start).total_seconds()\n        granularity = (self._timedelta * self.interval).total_seconds()\n        return dt - datetime.timedelta(seconds=delta % granularity)\n\n    def next_in_enumeration(self, value):\n        return value + self._timedelta * self.interval\n\n\nclass DateHourParameter(_DatetimeParameterBase):\n    \"\"\"\n    Parameter whose value is a :py:class:`~datetime.datetime` specified to the hour.\n\n    A DateHourParameter is a `ISO 8601 <http://en.wikipedia.org/wiki/ISO_8601>`_ formatted\n    date and time specified to the hour. 
For example, ``2013-07-10T19`` specifies July 10, 2013 at\n    19:00.\n    \"\"\"\n\n    date_format = \"%Y-%m-%dT%H\"  # ISO 8601 is to use 'T'\n    _timedelta = datetime.timedelta(hours=1)\n\n\nclass DateMinuteParameter(_DatetimeParameterBase):\n    \"\"\"\n    Parameter whose value is a :py:class:`~datetime.datetime` specified to the minute.\n\n    A DateMinuteParameter is a `ISO 8601 <http://en.wikipedia.org/wiki/ISO_8601>`_ formatted\n    date and time specified to the minute. For example, ``2013-07-10T1907`` specifies July 10, 2013 at\n    19:07.\n\n    The interval parameter can be used to clamp this parameter to every N minutes, instead of every minute.\n    \"\"\"\n\n    date_format = \"%Y-%m-%dT%H%M\"\n    _timedelta = datetime.timedelta(minutes=1)\n    deprecated_date_format = \"%Y-%m-%dT%HH%M\"\n\n    def parse(self, x):\n        try:\n            value = datetime.datetime.strptime(x, self.deprecated_date_format)\n            warnings.warn('Using \"H\" between hours and minutes is deprecated, omit it instead.', DeprecationWarning, stacklevel=2)\n            return value\n        except ValueError:\n            return super().parse(x)\n\n\nclass DateSecondParameter(_DatetimeParameterBase):\n    \"\"\"\n    Parameter whose value is a :py:class:`~datetime.datetime` specified to the second.\n\n    A DateSecondParameter is a `ISO 8601 <http://en.wikipedia.org/wiki/ISO_8601>`_ formatted\n    date and time specified to the second. 
For example, ``2013-07-10T190738`` specifies July 10, 2013 at\n    19:07:38.\n\n    The interval parameter can be used to clamp this parameter to every N seconds, instead of every second.\n    \"\"\"\n\n    date_format = \"%Y-%m-%dT%H%M%S\"\n    _timedelta = datetime.timedelta(seconds=1)\n\n\nclass StrParameter(Parameter[str]):\n    \"\"\"\n    Parameter whose value is a ``str``.\n    \"\"\"\n\n    def parse(self, x):\n        return str(x)\n\n\nclass IntParameter(Parameter[int]):\n    \"\"\"\n    Parameter whose value is an ``int``.\n    \"\"\"\n\n    def parse(self, x):\n        \"\"\"\n        Parses an ``int`` from the string using ``int()``.\n        \"\"\"\n        return int(x)\n\n    def next_in_enumeration(self, value):\n        return value + 1\n\n\nclass OptionalIntParameter(OptionalParameterMixin[int], IntParameter):  # type: ignore[misc]\n    \"\"\"Class to parse optional int parameters.\"\"\"\n\n    expected_type = int\n\n\nclass FloatParameter(Parameter[float]):\n    \"\"\"\n    Parameter whose value is a ``float``.\n    \"\"\"\n\n    def parse(self, x):\n        \"\"\"\n        Parses a ``float`` from the string using ``float()``.\n        \"\"\"\n        return float(x)\n\n\nclass OptionalFloatParameter(OptionalParameterMixin[float], FloatParameter):  # type: ignore[misc]\n    \"\"\"Class to parse optional float parameters.\"\"\"\n\n    expected_type = float\n\n\nclass BoolParameter(Parameter[bool]):\n    \"\"\"\n    A Parameter whose value is a ``bool``. This parameter has an implicit default value of\n    ``False``. For the command line interface this means that the value is ``False`` unless you\n    add ``\"--the-bool-parameter\"`` to your command without giving a parameter value. This is\n    considered *implicit* parsing (the default). However, in some situations one might want to give\n    the explicit bool value (``\"--the-bool-parameter true|false\"``), e.g. when you configure the\n    default value to be ``True``. 
This is called *explicit* parsing. When omitting the parameter\n    value, it is still considered ``True`` but to avoid ambiguities during argument parsing, make\n    sure to always place bool parameters behind the task family on the command line when using\n    explicit parsing.\n\n    You can toggle between the two parsing modes on a per-parameter base via\n\n    .. code-block:: python\n\n        class MyTask(luigi.Task):\n            implicit_bool = luigi.BoolParameter(parsing=luigi.BoolParameter.IMPLICIT_PARSING)\n            explicit_bool = luigi.BoolParameter(parsing=luigi.BoolParameter.EXPLICIT_PARSING)\n\n    or globally by\n\n    .. code-block:: python\n\n        luigi.BoolParameter.parsing = luigi.BoolParameter.EXPLICIT_PARSING\n\n    for all bool parameters instantiated after this line.\n    \"\"\"\n\n    IMPLICIT_PARSING = \"implicit\"\n    EXPLICIT_PARSING = \"explicit\"\n\n    parsing = IMPLICIT_PARSING\n\n    def __init__(\n        self,\n        default: Union[bool, _NoValueType] = _no_value,\n        parsing: str = IMPLICIT_PARSING,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        self.parsing = parsing\n        super().__init__(default=default, **kwargs)\n        if self._default == _no_value:\n            self._default = False\n\n    def parse(self, x):\n        \"\"\"\n        Parses a ``bool`` from the string, matching 'true' or 'false' ignoring case.\n        \"\"\"\n        s = str(x).lower()\n        if s == \"true\":\n            return True\n        elif s == \"false\":\n            return False\n        else:\n            raise ValueError(\"cannot interpret '{}' as boolean\".format(x))\n\n    def normalize(self, x):\n        try:\n            return self.parse(x)\n        except ValueError:\n            return None\n\n    def _parser_kwargs(self, *args, **kwargs):\n        parser_kwargs = super()._parser_kwargs(*args, **kwargs)\n        if self.parsing == self.IMPLICIT_PARSING:\n            parser_kwargs[\"action\"] = 
\"store_true\"\n        elif self.parsing == self.EXPLICIT_PARSING:\n            parser_kwargs[\"nargs\"] = \"?\"\n            parser_kwargs[\"const\"] = True\n        else:\n            raise ValueError(\"unknown parsing value '{}'\".format(self.parsing))\n        return parser_kwargs\n\n\nclass OptionalBoolParameter(OptionalParameterMixin[bool], BoolParameter):  # type: ignore[misc]\n    \"\"\"Class to parse optional bool parameters.\"\"\"\n\n    expected_type = bool\n\n\nclass DateIntervalParameter(Parameter[date_interval.DateInterval]):\n    \"\"\"\n    A Parameter whose value is a :py:class:`~luigi.date_interval.DateInterval`.\n\n    Date Intervals are specified using the ISO 8601 date notation for dates\n    (eg. \"2015-11-04\"), months (eg. \"2015-05\"), years (eg. \"2015\"), or weeks\n    (eg. \"2015-W35\"). In addition, it also supports arbitrary date intervals\n    provided as two dates separated with a dash (eg. \"2015-11-04-2015-12-04\").\n    \"\"\"\n\n    def parse(self, x):\n        \"\"\"\n        Parses a :py:class:`~luigi.date_interval.DateInterval` from the input.\n\n        see :py:mod:`luigi.date_interval`\n          for details on the parsing of DateIntervals.\n        \"\"\"\n        # TODO: can we use xml.utils.iso8601 or something similar?\n\n        from luigi import date_interval as d\n\n        for cls in [d.Year, d.Month, d.Week, d.Date, d.Custom]:\n            i = cls.parse(x)\n            if i:\n                return i\n\n        raise ValueError(\"Invalid date interval - could not be parsed\")\n\n\nclass TimeDeltaParameter(Parameter[datetime.timedelta]):\n    \"\"\"\n    Class that maps to timedelta using strings in any of the following forms:\n\n     * A bare number is interpreted as duration in seconds.\n     * ``n {w[eek[s]]|d[ay[s]]|h[our[s]]|m[inute[s]|s[second[s]]}`` (e.g. 
\"1 week 2 days\" or \"1 h\")\n        Note: multiple arguments must be supplied in longest to shortest unit order\n     * ISO 8601 duration ``PnDTnHnMnS`` (each field optional, years and months not supported)\n     * ISO 8601 duration ``PnW``\n\n    See https://en.wikipedia.org/wiki/ISO_8601#Durations\n    \"\"\"\n\n    def _apply_regex(self, regex, input):\n        import re\n\n        re_match = re.match(regex, input)\n        if re_match and any(re_match.groups()):\n            kwargs = {}\n            has_val = False\n            for k, v in re_match.groupdict(default=\"0\").items():\n                val = int(v)\n                if val > -1:\n                    has_val = True\n                    kwargs[k] = val\n            if has_val:\n                return datetime.timedelta(**kwargs)\n\n    def _parseIso8601(self, input):\n        def field(key):\n            return r\"(?P<%s>\\d+)%s\" % (key, key[0].upper())\n\n        def optional_field(key):\n            return \"(%s)?\" % field(key)\n\n        # A little loose: ISO 8601 does not allow weeks in combination with other fields, but this regex does (as does python timedelta)\n        regex = \"P(%s|%s(T%s)?)\" % (field(\"weeks\"), optional_field(\"days\"), \"\".join([optional_field(key) for key in [\"hours\", \"minutes\", \"seconds\"]]))\n        return self._apply_regex(regex, input)\n\n    def _parseSimple(self, input):\n        keys = [\"weeks\", \"days\", \"hours\", \"minutes\", \"seconds\"]\n        # Give the digits a regex group name from the keys, then look for text with the first letter of the key,\n        # optionally followed by the rest of the word, with final char (the \"s\") optional\n        regex = \"\".join([r\"((?P<%s>\\d+) ?%s(%s)?(%s)? 
?)?\" % (k, k[0], k[1:-1], k[-1]) for k in keys])\n        return self._apply_regex(regex, input)\n\n    def parse(self, x):\n        \"\"\"\n        Parses a time delta from the input.\n\n        See :py:class:`TimeDeltaParameter` for details on supported formats.\n        \"\"\"\n        try:\n            return datetime.timedelta(seconds=float(x))\n        except ValueError:\n            pass\n        result = self._parseIso8601(x)\n        if not result:\n            result = self._parseSimple(x)\n        if result is not None:\n            return result\n        else:\n            raise ParameterException(\"Invalid time delta - could not parse %s\" % x)\n\n    def serialize(self, x):\n        \"\"\"\n        Converts datetime.timedelta to a string\n\n        :param x: the value to serialize.\n        \"\"\"\n        weeks = x.days // 7\n        days = x.days % 7\n        hours = x.seconds // 3600\n        minutes = (x.seconds % 3600) // 60\n        seconds = (x.seconds % 3600) % 60\n        result = \"{} w {} d {} h {} m {} s\".format(weeks, days, hours, minutes, seconds)\n        return result\n\n    def _warn_on_wrong_param_type(self, param_name, param_value):\n        if self.__class__ != TimeDeltaParameter:\n            return\n        if not isinstance(param_value, datetime.timedelta):\n            warnings.warn('Parameter \"{}\" with value \"{}\" is not of type timedelta.'.format(param_name, param_value))\n\n\nTaskType = TypeVar(\"TaskType\", bound=\"luigi.task.Task\")\n\n\nclass TaskParameter(Parameter[Type[TaskType]]):\n    \"\"\"\n    A parameter that takes another luigi task class.\n\n    When used programatically, the parameter should be specified\n    directly with the :py:class:`luigi.task.Task` (sub) class. Like\n    ``MyMetaTask(my_task_param=my_tasks.MyTask)``. On the command line,\n    you specify the :py:meth:`luigi.task.Task.get_task_family`. Like\n\n    .. 
code-block:: console\n\n            $ luigi --module my_tasks MyMetaTask --my_task_param my_namespace.MyTask\n\n    Where ``my_namespace.MyTask`` is defined in the ``my_tasks`` python module.\n\n    When the :py:class:`luigi.task.Task` class is instantiated to an object,\n    the value will always be a task class (and not a string).\n    \"\"\"\n\n    def parse(self, x):\n        \"\"\"\n        Parse a task_family using the :class:`~luigi.task_register.Register`\n        \"\"\"\n        return task_register.Register.get_task_cls(x)\n\n    def serialize(self, x):\n        \"\"\"\n        Converts the :py:class:`luigi.task.Task` (sub) class to its family name.\n        \"\"\"\n        return x.get_task_family()\n\n\nEnumParameterType = TypeVar(\"EnumParameterType\", bound=Enum)\n\n\nclass EnumParameter(Parameter[EnumParameterType]):\n    \"\"\"\n    A parameter whose value is an :class:`~enum.Enum`.\n\n    In the task definition, use\n\n    .. code-block:: python\n\n        class Model(enum.Enum):\n          Honda = 1\n          Volvo = 2\n\n        class MyTask(luigi.Task):\n          my_param = luigi.EnumParameter(enum=Model)\n\n    At the command line, use,\n\n    .. 
code-block:: console\n\n        $ luigi --module my_tasks MyTask --my-param Honda\n\n    \"\"\"\n\n    def __init__(\n        self,\n        default: Union[EnumParameterType, _NoValueType] = _no_value,\n        *,\n        enum: Optional[Type[EnumParameterType]] = None,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        if enum is None:\n            raise ParameterException(\"An enum class must be specified.\")\n        self._enum = enum\n        super().__init__(default=default, **kwargs)\n\n    def parse(self, x):\n        try:\n            return self._enum[x]\n        except KeyError:\n            raise ValueError(\"Invalid enum value - could not be parsed\")\n\n    def serialize(self, x):\n        return x.name\n\n\nclass EnumListParameter(Parameter[Tuple[EnumParameterType, ...]]):\n    \"\"\"\n    A parameter whose value is a comma-separated list of :class:`~enum.Enum`. Values should come from the same enum.\n\n    Values are taken to be a list, i.e. order is preserved, duplicates may occur, and empty list is possible.\n\n    In the task definition, use\n\n    .. code-block:: python\n\n        class Model(enum.Enum):\n          Honda = 1\n          Volvo = 2\n\n        class MyTask(luigi.Task):\n          my_param = luigi.EnumListParameter(enum=Model)\n\n    At the command line, use,\n\n    .. 
code-block:: console\n\n        $ luigi --module my_tasks MyTask --my-param Honda,Volvo\n\n    \"\"\"\n\n    _sep = \",\"\n\n    def __init__(\n        self,\n        default: Union[Tuple[EnumParameterType, ...], _NoValueType] = _no_value,\n        *,\n        enum: Optional[Type[EnumParameterType]] = None,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        if enum is None:\n            raise ParameterException(\"An enum class must be specified.\")\n        self._enum = enum\n        super().__init__(default=default, **kwargs)\n\n    def parse(self, x):\n        values = [] if x == \"\" else x.split(self._sep)\n\n        for i, v in enumerate(values):\n            try:\n                values[i] = self._enum[v]\n            except KeyError:\n                raise ValueError('Invalid enum value \"{}\" index {} - could not be parsed'.format(v, i))\n\n        return tuple(values)\n\n    def serialize(self, x):\n        return self._sep.join([e.name for e in x])\n\n\nclass _DictParamEncoder(JSONEncoder):\n    \"\"\"\n    JSON encoder for :py:class:`~DictParameter`, which makes :py:class:`~FrozenOrderedDict` JSON serializable.\n    \"\"\"\n\n    def default(self, obj):\n        if isinstance(obj, FrozenOrderedDict):\n            return obj.get_wrapped()\n        return json.JSONEncoder.default(self, obj)\n\n\nDictT = TypeVar(\"DictT\", bound=dict, default=Dict[Any, Any])\n\n\nclass DictParameter(Parameter[DictT]):\n    \"\"\"\n    Parameter whose value is a ``dict``.\n\n    In the task definition, use\n\n    .. code-block:: python\n\n        class MyTask(luigi.Task):\n          tags = luigi.DictParameter()\n\n            def run(self):\n                logging.info(\"Find server with role: %s\", self.tags['role'])\n                server = aws.ec2.find_my_resource(self.tags)\n\n\n    At the command line, use\n\n    .. code-block:: console\n\n        $ luigi --module my_tasks MyTask --tags <JSON string>\n\n    Simple example with two tags:\n\n    .. 
code-block:: console\n\n        $ luigi --module my_tasks MyTask --tags '{\"role\": \"web\", \"env\": \"staging\"}'\n\n    It can be used to define dynamic parameters, when you do not know the exact list of your parameters (e.g. list of\n    tags, that are dynamically constructed outside Luigi), or you have a complex parameter containing logically related\n    values (like a database connection config).\n\n    It is possible to provide a JSON schema that should be validated by the given value:\n\n    .. code-block:: python\n\n        class MyTask(luigi.Task):\n          tags = luigi.DictParameter(\n            schema={\n              \"type\": \"object\",\n              \"patternProperties\": {\n                \".*\": {\"type\": \"string\", \"enum\": [\"web\", \"staging\"]},\n              }\n            }\n          )\n\n          def run(self):\n            logging.info(\"Find server with role: %s\", self.tags['role'])\n            server = aws.ec2.find_my_resource(self.tags)\n\n    Using this schema, the following command will work:\n\n    .. code-block:: console\n\n        $ luigi --module my_tasks MyTask --tags '{\"role\": \"web\", \"env\": \"staging\"}'\n\n    while this command will fail because the parameter is not valid:\n\n    .. code-block:: console\n\n        $ luigi --module my_tasks MyTask --tags '{\"role\": \"UNKNOWN_VALUE\", \"env\": \"staging\"}'\n\n    Finally, the provided schema can be a custom validator:\n\n    .. 
code-block:: python\n\n        custom_validator = jsonschema.Draft4Validator(\n          schema={\n            \"type\": \"object\",\n            \"patternProperties\": {\n              \".*\": {\"type\": \"string\", \"enum\": [\"web\", \"staging\"]},\n            }\n          }\n        )\n\n        class MyTask(luigi.Task):\n          tags = luigi.DictParameter(schema=custom_validator)\n\n          def run(self):\n            logging.info(\"Find server with role: %s\", self.tags['role'])\n            server = aws.ec2.find_my_resource(self.tags)\n\n    \"\"\"\n\n    def __init__(\n        self,\n        default: Union[DictT, _NoValueType] = _no_value,\n        *,\n        schema=None,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        if schema is not None and not _JSONSCHEMA_ENABLED:\n            warnings.warn(\"The 'jsonschema' package is not installed so the parameter can not be validated even though a schema is given.\")\n            self.schema = None\n        else:\n            self.schema = schema\n        super().__init__(default=default, **kwargs)\n\n    def normalize(self, x):\n        \"\"\"\n        Ensure that dictionary parameter is converted to a FrozenOrderedDict so it can be hashed.\n        \"\"\"\n        if self.schema is not None:\n            unfrozen_value = recursively_unfreeze(x)\n            try:\n                self.schema.validate(unfrozen_value)\n                x = unfrozen_value  # Validators may update the instance inplace\n            except AttributeError:\n                jsonschema.validate(instance=unfrozen_value, schema=self.schema)\n        return recursively_freeze(x)\n\n    def parse(self, x):\n        \"\"\"\n        Parses an immutable and ordered ``dict`` from a JSON string using standard JSON library.\n\n        We need to use an immutable dictionary, to create a hashable parameter and also preserve the internal structure\n        of parsing. 
The traversal order of standard ``dict`` is undefined, which can result in various string\n        representations of this parameter, and therefore a different task id for the task containing this parameter.\n        This is because task id contains the hash of parameters' JSON representation.\n\n        :param x: String to be parsed\n        \"\"\"\n        # TOML-based config converts params to Python types itself.\n        if not isinstance(x, str):\n            return x\n        return json.loads(x, object_pairs_hook=FrozenOrderedDict)\n\n    def serialize(self, x):\n        return json.dumps(x, cls=_DictParamEncoder)\n\n\nclass OptionalDictParameter(OptionalParameterMixin[FrozenOrderedDict], DictParameter):  # type: ignore[misc]\n    \"\"\"Class to parse optional dict parameters.\"\"\"\n\n    expected_type = FrozenOrderedDict\n\n\nListT = TypeVar(\"ListT\", bound=tuple, default=Tuple[Any, ...])\n\n\nclass ListParameter(Parameter[ListT]):\n    \"\"\"\n    Parameter whose value is a ``list``.\n\n    In the task definition, use\n\n    .. code-block:: python\n\n        class MyTask(luigi.Task):\n          grades = luigi.ListParameter()\n\n            def run(self):\n                sum = 0\n                for element in self.grades:\n                    sum += element\n                avg = sum / len(self.grades)\n\n\n    At the command line, use\n\n    .. code-block:: console\n\n        $ luigi --module my_tasks MyTask --grades <JSON string>\n\n    Simple example with two grades:\n\n    .. code-block:: console\n\n        $ luigi --module my_tasks MyTask --grades '[100,70]'\n\n    It is possible to provide a JSON schema that should be validated by the given value:\n\n    .. 
code-block:: python\n\n        class MyTask(luigi.Task):\n          grades = luigi.ListParameter(\n            schema={\n              \"type\": \"array\",\n              \"items\": {\n                \"type\": \"number\",\n                \"minimum\": 0,\n                \"maximum\": 10\n              },\n              \"minItems\": 1\n            }\n          )\n\n          def run(self):\n                sum = 0\n                for element in self.grades:\n                    sum += element\n                avg = sum / len(self.grades)\n\n    Using this schema, the following command will work:\n\n    .. code-block:: console\n\n        $ luigi --module my_tasks MyTask --grades '[1, 8.7, 6]'\n\n    while these commands will fail because the parameter is not valid:\n\n    .. code-block:: console\n\n        $ luigi --module my_tasks MyTask --grades '[]'  # must have at least 1 element\n        $ luigi --module my_tasks MyTask --grades '[-999, 999]'  # elements must be in [0, 10]\n\n    Finally, the provided schema can be a custom validator:\n\n    .. 
code-block:: python\n\n        custom_validator = jsonschema.Draft4Validator(\n          schema={\n            \"type\": \"array\",\n            \"items\": {\n              \"type\": \"number\",\n              \"minimum\": 0,\n              \"maximum\": 10\n            },\n            \"minItems\": 1\n          }\n        )\n\n        class MyTask(luigi.Task):\n          grades = luigi.ListParameter(schema=custom_validator)\n\n          def run(self):\n                sum = 0\n                for element in self.grades:\n                    sum += element\n                avg = sum / len(self.grades)\n\n    \"\"\"\n\n    def __init__(\n        self,\n        default: Union[ListT, _NoValueType] = _no_value,\n        *,\n        schema=None,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        if schema is not None and not _JSONSCHEMA_ENABLED:\n            warnings.warn(\"The 'jsonschema' package is not installed so the parameter can not be validated even though a schema is given.\")\n            self.schema = None\n        else:\n            self.schema = schema\n        super().__init__(default=default, **kwargs)\n\n    def normalize(self, x):\n        \"\"\"\n        Ensure that struct is recursively converted to a tuple so it can be hashed.\n\n        :param str x: the value to parse.\n        :return: the normalized (hashable/immutable) value.\n        \"\"\"\n        if self.schema is not None:\n            unfrozen_value = recursively_unfreeze(x)\n            try:\n                self.schema.validate(unfrozen_value)\n                x = unfrozen_value  # Validators may update the instance inplace\n            except AttributeError:\n                jsonschema.validate(instance=unfrozen_value, schema=self.schema)\n        return recursively_freeze(x)\n\n    def parse(self, x):\n        \"\"\"\n        Parse an individual value from the input.\n\n        :param str x: the value to parse.\n        :return: the parsed value.\n        \"\"\"\n        i = 
json.loads(x, object_pairs_hook=FrozenOrderedDict)\n        if i is None:\n            return None\n        return list(i)\n\n    def serialize(self, x):\n        \"\"\"\n        Opposite of :py:meth:`parse`.\n\n        Converts the value ``x`` to a string.\n\n        :param x: the value to serialize.\n        \"\"\"\n        return json.dumps(x, cls=_DictParamEncoder)\n\n\nclass OptionalListParameter(OptionalParameterMixin[ListT], ListParameter):  # type: ignore[misc]\n    \"\"\"Class to parse optional list parameters.\"\"\"\n\n    expected_type = tuple\n\n\nclass TupleParameter(ListParameter[ListT]):\n    \"\"\"\n    Parameter whose value is a ``tuple`` or ``tuple`` of tuples.\n\n    In the task definition, use\n\n    .. code-block:: python\n\n        class MyTask(luigi.Task):\n          book_locations = luigi.TupleParameter()\n\n            def run(self):\n                for location in self.book_locations:\n                    print(\"Go to page %d, line %d\" % (location[0], location[1]))\n\n\n    At the command line, use\n\n    .. code-block:: console\n\n        $ luigi --module my_tasks MyTask --book_locations <JSON string>\n\n    Simple example with three book locations:\n\n    .. 
code-block:: console\n\n        $ luigi --module my_tasks MyTask --book_locations '((12,3),(4,15),(52,1))'\n    \"\"\"\n\n    def parse(self, x):\n        \"\"\"\n        Parse an individual value from the input.\n\n        :param str x: the value to parse.\n        :return: the parsed value.\n        \"\"\"\n        # Since the result of json.dumps(tuple) differs from a tuple string, we must handle either case.\n        # A tuple string may come from a config file or from cli execution.\n\n        # t = ((1, 2), (3, 4))\n        # t_str = '((1,2),(3,4))'\n        # t_json_str = json.dumps(t)\n        # t_json_str == '[[1, 2], [3, 4]]'\n        # json.loads(t_json_str) == t\n        # json.loads(t_str) == ValueError: No JSON object could be decoded\n\n        # Therefore, if json.loads(x) returns a ValueError, try ast.literal_eval(x).\n        # ast.literal_eval(t_str) == t\n        try:\n            # loop required to parse tuple of tuples\n            return tuple(self._convert_iterable_to_tuple(x) for x in json.loads(x, object_pairs_hook=FrozenOrderedDict))\n        except (ValueError, TypeError):\n            result = literal_eval(x)\n            # t_str = '(\"abcd\")'\n            # Ensure that the result is not a string to avoid cases like ('a','b','c','d')\n            if isinstance(result, str):\n                raise ValueError(\"Parsed result cannot be a string\")\n            return tuple(result)  # if this causes an error, let that error be raised.\n\n    def _convert_iterable_to_tuple(self, x):\n        if isinstance(x, str):\n            return x\n        return tuple(x)\n\n\nclass OptionalTupleParameter(OptionalParameterMixin[ListT], TupleParameter):  # type: ignore[misc]\n    \"\"\"Class to parse optional tuple parameters.\"\"\"\n\n    expected_type = tuple\n\n\nNumericalType = TypeVar(\"NumericalType\", int, float)\n\n\nclass NumericalParameter(Parameter[NumericalType]):\n    \"\"\"\n    Parameter whose value is a number of the specified type, e.g. 
``int`` or\n    ``float`` and in the range specified.\n\n    In the task definition, use\n\n    .. code-block:: python\n\n        class MyTask(luigi.Task):\n            my_param_1 = luigi.NumericalParameter(\n                var_type=int, min_value=-3, max_value=7) # -3 <= my_param_1 < 7\n            my_param_2 = luigi.NumericalParameter(\n                var_type=int, min_value=-3, max_value=7, left_op=operator.lt, right_op=operator.le) # -3 < my_param_2 <= 7\n\n    At the command line, use\n\n    .. code-block:: console\n\n        $ luigi --module my_tasks MyTask --my-param-1 -3 --my-param-2 -2\n    \"\"\"\n\n    def __init__(\n        self,\n        default: Union[NumericalType, _NoValueType] = _no_value,\n        *,\n        var_type: Optional[Type[NumericalType]] = None,\n        min_value: Optional[NumericalType] = None,\n        max_value: Optional[NumericalType] = None,\n        left_op=operator.le,\n        right_op=operator.lt,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        \"\"\"\n        :param function var_type: The type of the input variable, e.g. int or float.\n        :param min_value: The minimum value permissible in the accepted values\n                          range.  May be inclusive or exclusive based on left_op parameter.\n                          This should be the same type as var_type.\n        :param max_value: The maximum value permissible in the accepted values\n                          range.  
May be inclusive or exclusive based on right_op parameter.\n                          This should be the same type as var_type.\n        :param function left_op: The comparison operator for the left-most comparison in\n                                 the expression ``min_value left_op value right_op max_value``.\n                                 This operator should generally be either\n                                 ``operator.lt`` or ``operator.le``.\n                                 Default: ``operator.le``.\n        :param function right_op: The comparison operator for the right-most comparison in\n                                  the expression ``min_value left_op value right_op max_value``.\n                                  This operator should generally be either\n                                  ``operator.lt`` or ``operator.le``.\n                                  Default: ``operator.lt``.\n        \"\"\"\n        if var_type is None:\n            raise ParameterException(\"var_type must be specified\")\n        self._var_type: Type[NumericalType] = var_type\n        if min_value is None:\n            raise ParameterException(\"min_value must be specified\")\n        self._min_value: NumericalType = min_value\n        if max_value is None:\n            raise ParameterException(\"max_value must be specified\")\n        self._max_value: NumericalType = max_value\n        self._left_op = left_op\n        self._right_op = right_op\n        self._permitted_range = \"{var_type} in {left_endpoint}{min_value}, {max_value}{right_endpoint}\".format(\n            var_type=self._var_type.__name__,\n            min_value=self._min_value,\n            max_value=self._max_value,\n            left_endpoint=\"[\" if left_op == operator.le else \"(\",\n            right_endpoint=\")\" if right_op == operator.lt else \"]\",\n        )\n        super().__init__(default=default, **kwargs)  # type: ignore[arg-type]\n        if self.description:\n            self.description += \" 
\"\n        else:\n            self.description = \"\"\n        self.description += \"permitted values: \" + self._permitted_range\n\n    def parse(self, x):\n        value = self._var_type(x)\n        if self._left_op(self._min_value, value) and self._right_op(value, self._max_value):\n            return value\n        else:\n            raise ValueError(\"{s} is not in the set of {permitted_range}\".format(s=x, permitted_range=self._permitted_range))\n\n\nclass OptionalNumericalParameter(OptionalParameterMixin[NumericalType], NumericalParameter[NumericalType]):  # type: ignore[misc]\n    \"\"\"Class to parse optional numerical parameters.\"\"\"\n\n    def __init__(\n        self,\n        default: Union[Optional[NumericalType], _NoValueType] = _no_value,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        NumericalParameter.__init__(self, default=default, **kwargs)  # type: ignore[arg-type, misc]\n        self.expected_type = self._var_type\n\n\nChoiceType = TypeVar(\"ChoiceType\", default=str)\n\n\nclass ChoiceParameter(Parameter[ChoiceType]):\n    \"\"\"\n    A parameter which takes two values:\n        1. an instance of :class:`~collections.Iterable` and\n        2. the class of the variables to convert to.\n\n    In the task definition, use\n\n    .. code-block:: python\n\n        class MyTask(luigi.Task):\n            my_param = luigi.ChoiceParameter(choices=[0.1, 0.2, 0.3], var_type=float)\n\n    At the command line, use\n\n    .. code-block:: console\n\n        $ luigi --module my_tasks MyTask --my-param 0.1\n\n    Consider using :class:`~luigi.EnumParameter` for a typed, structured\n    alternative.  
This class can perform the same role when all choices are the\n    same type and transparency of parameter value on the command line is\n    desired.\n    \"\"\"\n\n    def __init__(\n        self,\n        default: Union[ChoiceType, _NoValueType] = _no_value,\n        *,\n        choices: Optional[Sequence[ChoiceType]] = None,\n        var_type: Type[ChoiceType] = str,  # type: ignore[assignment]\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        \"\"\"\n        :param function var_type: The type of the input variable, e.g. str, int,\n                                  float, etc.\n                                  Default: str\n        :param choices: An iterable, all of whose elements are of `var_type` to\n                        restrict parameter choices to.\n        \"\"\"\n        if choices is None:\n            raise ParameterException(\"A choices iterable must be specified\")\n        self._choices = set(choices)\n        self._var_type = var_type\n        assert all(type(choice) is self._var_type for choice in self._choices), \"Invalid type in choices\"\n        super().__init__(default=default, **kwargs)\n        if self.description:\n            self.description += \" \"\n        else:\n            self.description = \"\"\n        self.description += \"Choices: {\" + \", \".join(str(choice) for choice in self._choices) + \"}\"\n\n    def parse(self, x):\n        var = self._var_type(x)\n        return self.normalize(var)\n\n    def normalize(self, x):\n        if x in self._choices:\n            return x\n        else:\n            raise ValueError(\"{var} is not a valid choice from {choices}\".format(var=x, choices=self._choices))\n\n\nclass ChoiceListParameter(ChoiceParameter[ChoiceType]):\n    \"\"\"\n    A parameter which takes two values:\n        1. an instance of :class:`~collections.Iterable` and\n        2. the class of the variables to convert to.\n\n    Values are taken to be a list, i.e. 
order is preserved, duplicates may occur, and empty list is possible.\n\n    In the task definition, use\n\n    .. code-block:: python\n\n        class MyTask(luigi.Task):\n            my_param = luigi.ChoiceListParameter(choices=['foo', 'bar', 'baz'], var_type=str)\n\n    At the command line, use\n\n    .. code-block:: console\n\n        $ luigi --module my_tasks MyTask --my-param foo,bar\n\n    Consider using :class:`~luigi.EnumListParameter` for a typed, structured\n    alternative.  This class can perform the same role when all choices are the\n    same type and transparency of parameter value on the command line is\n    desired.\n    \"\"\"\n\n    _sep = \",\"\n\n    @overload  # type: ignore[override]\n    def __get__(self, instance: None, owner: Any) -> \"Parameter[Tuple[ChoiceType, ...]]\": ...\n\n    @overload\n    def __get__(self, instance: Any, owner: Any) -> Tuple[ChoiceType, ...]: ...\n\n    def __get__(self, instance: Any, owner: Any) -> Any:\n        return super().__get__(instance, owner)\n\n    def __init__(\n        self,\n        default: Union[Tuple[ChoiceType, ...], _NoValueType] = _no_value,\n        var_type: Type[ChoiceType] = str,  # type: ignore[assignment]\n        choices: Optional[Sequence[ChoiceType]] = None,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        super().__init__(default=default, var_type=var_type, choices=choices, **kwargs)  # type: ignore[arg-type]\n\n    def parse(self, x):\n        values = [] if x == \"\" else x.split(self._sep)\n        return self.normalize(map(self._var_type, values))\n\n    def normalize(self, x):\n        values = []\n        for v in x:\n            values.append(super().normalize(v))\n        return tuple(values)\n\n    def serialize(self, x):\n        return self._sep.join(x)\n\n\nclass OptionalChoiceParameter(OptionalParameterMixin[ChoiceType], ChoiceParameter[ChoiceType]):  # type: ignore[misc]\n    \"\"\"Class to parse optional choice parameters.\"\"\"\n\n    def __init__(\n     
   self,\n        default: Union[Optional[ChoiceType], _NoValueType] = _no_value,\n        var_type: Type[ChoiceType] = str,  # type: ignore[assignment]\n        choices: Optional[Sequence[ChoiceType]] = None,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        ChoiceParameter.__init__(self, default=default, var_type=var_type, choices=choices, **kwargs)  # type: ignore[arg-type, misc]\n        self.expected_type = self._var_type\n\n\nclass PathParameter(Parameter[Path]):\n    \"\"\"\n    Parameter whose value is a path.\n\n    In the task definition, use\n\n    .. code-block:: python\n\n        class MyTask(luigi.Task):\n            existing_file_path = luigi.PathParameter(exists=True)\n            new_file_path = luigi.PathParameter()\n\n            def run(self):\n                # Get data from existing file\n                with self.existing_file_path.open(\"r\", encoding=\"utf-8\") as f:\n                    data = f.read()\n\n                # Output message in new file\n                self.new_file_path.parent.mkdir(parents=True, exist_ok=True)\n                with self.new_file_path.open(\"w\", encoding=\"utf-8\") as f:\n                    f.write(\"hello from a PathParameter => \")\n                    f.write(data)\n\n    At the command line, use\n\n    .. 
code-block:: console\n\n        $ luigi --module my_tasks MyTask --existing-file-path <path> --new-file-path <path>\n    \"\"\"\n\n    def __init__(\n        self,\n        default: Union[Path, _NoValueType] = _no_value,\n        *,\n        absolute: bool = False,\n        exists: bool = False,\n        **kwargs: Unpack[_ParameterKwargs],\n    ):\n        \"\"\"\n        :param bool absolute: If set to ``True``, the given path is converted to an absolute path.\n        :param bool exists: If set to ``True``, a :class:`ValueError` is raised if the path does not exist.\n        \"\"\"\n        super().__init__(default=default, **kwargs)\n\n        self.absolute = absolute\n        self.exists = exists\n\n    def normalize(self, x):\n        \"\"\"\n        Normalize the given value to a :class:`pathlib.Path` object.\n        \"\"\"\n        path = Path(x)\n        if self.absolute:\n            path = path.absolute()\n        if self.exists and not path.exists():\n            raise ValueError(f\"The path {path} does not exist.\")\n        return path\n\n\nclass OptionalPathParameter(OptionalParameter, PathParameter):  # type: ignore[misc]\n    \"\"\"Class to parse optional path parameters.\"\"\"\n\n    expected_type = (str, Path)  # type: ignore[assignment]\n"
  },
  {
    "path": "luigi/process.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nContains some helper functions to run luigid in daemon mode\n\"\"\"\n\nimport datetime\nimport logging\nimport logging.handlers\nimport os\n\nrootlogger = logging.getLogger()\nserver_logger = logging.getLogger(\"luigi.server\")\n\n\ndef check_pid(pidfile):\n    if pidfile and os.path.exists(pidfile):\n        try:\n            pid = int(open(pidfile).read().strip())\n            os.kill(pid, 0)\n            return pid\n        except BaseException:\n            return 0\n    return 0\n\n\ndef write_pid(pidfile):\n    server_logger.info(\"Writing pid file\")\n    piddir = os.path.dirname(pidfile)\n    if piddir != \"\":\n        try:\n            os.makedirs(piddir)\n        except OSError:\n            pass\n\n    with open(pidfile, \"w\") as fobj:\n        fobj.write(str(os.getpid()))\n\n\ndef get_log_format():\n    return \"%(asctime)s %(name)s[%(process)s] %(levelname)s: %(message)s\"\n\n\ndef get_spool_handler(filename):\n    handler = logging.handlers.TimedRotatingFileHandler(\n        filename=filename,\n        when=\"d\",\n        encoding=\"utf8\",\n        backupCount=7,  # keep one week of historical logs\n    )\n    formatter = logging.Formatter(get_log_format())\n    handler.setFormatter(formatter)\n    return handler\n\n\ndef _server_already_running(pidfile):\n    existing_pid = check_pid(pidfile)\n    if pidfile and 
existing_pid:\n        return True\n    return False\n\n\ndef daemonize(cmd, pidfile=None, logdir=None, api_port=8082, address=None, unix_socket=None):\n    import daemon\n\n    logdir = logdir or \"/var/log/luigi\"\n    if not os.path.exists(logdir):\n        os.makedirs(logdir)\n\n    log_path = os.path.join(logdir, \"luigi-server.log\")\n\n    # redirect stdout/stderr\n    today = datetime.date.today()\n    stdout_path = os.path.join(logdir, \"luigi-server-{0:%Y-%m-%d}.out\".format(today))\n    stderr_path = os.path.join(logdir, \"luigi-server-{0:%Y-%m-%d}.err\".format(today))\n    stdout_proxy = open(stdout_path, \"a+\")\n    stderr_proxy = open(stderr_path, \"a+\")\n\n    try:\n        ctx = daemon.DaemonContext(\n            stdout=stdout_proxy,\n            stderr=stderr_proxy,\n            working_directory=\".\",\n            initgroups=False,\n        )\n    except TypeError:\n        # Older versions of python-daemon cannot deal with initgroups arg.\n        ctx = daemon.DaemonContext(\n            stdout=stdout_proxy,\n            stderr=stderr_proxy,\n            working_directory=\".\",\n        )\n\n    with ctx:\n        loghandler = get_spool_handler(log_path)\n        rootlogger.addHandler(loghandler)\n\n        if pidfile:\n            server_logger.info(\"Checking pid file\")\n            existing_pid = check_pid(pidfile)\n            if pidfile and existing_pid:\n                server_logger.info(\"Server already running (pid=%s)\", existing_pid)\n                return\n            write_pid(pidfile)\n\n        cmd(api_port=api_port, address=address, unix_socket=unix_socket)\n"
  },
  {
    "path": "luigi/py.typed",
    "content": ""
  },
  {
    "path": "luigi/retcodes.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nModule containing the logic for exit codes for the luigi binary. It's useful\nwhen you in a programmatic way need to know if luigi actually finished the\ngiven task, and if not why.\n\"\"\"\n\nimport logging\nimport sys\n\nimport luigi\nfrom luigi import IntParameter\nfrom luigi.setup_logging import InterfaceLogging\n\n\nclass retcode(luigi.Config):\n    \"\"\"\n    See the :ref:`return codes configuration section <retcode-config>`.\n    \"\"\"\n\n    # default value inconsistent with doc/configuration.rst for backwards compatibility reasons\n    unhandled_exception = IntParameter(\n        default=4,\n        description=\"For internal luigi errors.\",\n    )\n    # default value inconsistent with doc/configuration.rst for backwards compatibility reasons\n    missing_data = IntParameter(\n        default=0,\n        description=\"For when there are incomplete ExternalTask dependencies.\",\n    )\n    # default value inconsistent with doc/configuration.rst for backwards compatibility reasons\n    task_failed = IntParameter(\n        default=0,\n        description=\"\"\"For when a task's run() method fails.\"\"\",\n    )\n    # default value inconsistent with doc/configuration.rst for backwards compatibility reasons\n    already_running = IntParameter(\n        default=0,\n        description='For both local --lock and luigid 
\"lock\"',\n    )\n    # default value inconsistent with doc/configuration.rst for backwards compatibility reasons\n    scheduling_error = IntParameter(\n        default=0,\n        description=\"\"\"For when a task's complete() or requires() fails,\n                                                   or task-limit reached\"\"\",\n    )\n    # default value inconsistent with doc/configuration.rst for backwards compatibility reasons\n    not_run = IntParameter(default=0, description=\"For when a task is not granted run permission by the scheduler.\")\n\n\ndef run_with_retcodes(argv):\n    \"\"\"\n    Run luigi with command line parsing, but raise ``SystemExit`` with the configured exit code.\n\n    Note: Usually you use the luigi binary directly and don't call this function yourself.\n\n    :param argv: Should (conceptually) be ``sys.argv[1:]``\n    \"\"\"\n    logger = logging.getLogger(\"luigi-interface\")\n    with luigi.cmdline_parser.CmdlineParser.global_instance(argv):\n        retcodes = retcode()\n\n    worker = None\n    try:\n        worker = luigi.interface._run(argv).worker\n    except luigi.interface.PidLockAlreadyTakenExit:\n        sys.exit(retcodes.already_running)\n    except Exception:\n        # Some errors occur before logging is set up, we set it up now\n        env_params = luigi.interface.core()\n        InterfaceLogging.setup(env_params)\n        logger.exception(\"Uncaught exception in luigi\")\n        sys.exit(retcodes.unhandled_exception)\n\n    with luigi.cmdline_parser.CmdlineParser.global_instance(argv):\n        task_sets = luigi.execution_summary._summary_dict(worker)\n        root_task = luigi.execution_summary._root_task(worker)\n        non_empty_categories = {k: v for k, v in task_sets.items() if v}.keys()\n\n    def has(status):\n        assert status in luigi.execution_summary._ORDERED_STATUSES\n        return status in non_empty_categories\n\n    codes_and_conds = (\n        (retcodes.missing_data, 
has(\"still_pending_ext\")),\n        (retcodes.task_failed, has(\"failed\")),\n        (retcodes.already_running, has(\"run_by_other_worker\")),\n        (retcodes.scheduling_error, has(\"scheduling_error\")),\n        (retcodes.not_run, has(\"not_run\")),\n    )\n    expected_ret_code = max(code * (1 if cond else 0) for code, cond in codes_and_conds)\n\n    if expected_ret_code == 0 and root_task not in task_sets[\"completed\"] and root_task not in task_sets[\"already_done\"]:\n        sys.exit(retcodes.not_run)\n    else:\n        sys.exit(expected_ret_code)\n"
  },
  {
    "path": "luigi/rpc.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nImplementation of the REST interface between the workers and the server.\nrpc.py implements the client side of it, server.py implements the server side.\nSee :doc:`/central_scheduler` for more info.\n\"\"\"\n\nimport abc\nimport base64\nimport json\nimport logging\nimport os\nimport socket\nfrom urllib.error import URLError\nfrom urllib.parse import urlencode, urljoin, urlparse\nfrom urllib.request import Request, urlopen\n\nfrom tenacity import Retrying, stop_after_attempt, wait_fixed\n\nfrom luigi import configuration\nfrom luigi.scheduler import RPC_METHODS\n\nHAS_UNIX_SOCKET = True\nHAS_REQUESTS = True\n\n\ntry:\n    import requests_unixsocket as requests\nexcept ImportError:\n    HAS_UNIX_SOCKET = False\n    try:\n        import requests\n    except ImportError:\n        HAS_REQUESTS = False\n\n\nlogger = logging.getLogger(\"luigi-interface\")  # TODO: 'interface'?\n\n\ndef _urljoin(base, url):\n    \"\"\"\n    Join relative URLs to base URLs like urllib.parse.urljoin but support\n    arbitrary URIs (esp. 
'http+unix://').\n    base part is fixed or mounted point, every url contains full base part.\n    \"\"\"\n    parsed = urlparse(base)\n    scheme = parsed.scheme\n    return urlparse(urljoin(parsed._replace(scheme=\"http\").geturl(), parsed.path + (url if url[0] == \"/\" else \"/\" + url)))._replace(scheme=scheme).geturl()\n\n\nclass RPCError(Exception):\n    def __init__(self, message, sub_exception=None):\n        super(RPCError, self).__init__(message)\n        self.sub_exception = sub_exception\n\n\nclass _FetcherInterface(metaclass=abc.ABCMeta):\n    @abc.abstractmethod\n    def fetch(self, full_url, body, timeout):\n        pass\n\n    @abc.abstractmethod\n    def close(self):\n        pass\n\n\nclass URLLibFetcher(_FetcherInterface):\n    raises = (URLError, socket.timeout)\n\n    def _create_request(self, full_url, body=None):\n        # when full_url contains basic auth info, extract it and set the Authorization header\n        url = urlparse(full_url)\n        if url.username:\n            # base64 encoding of username:password\n            auth = base64.b64encode(\"{}:{}\".format(url.username, url.password or \"\").encode(\"utf-8\"))\n            auth = auth.decode(\"utf-8\")\n            # update full_url and create a request object with the auth header set\n            full_url = url._replace(netloc=url.netloc.split(\"@\", 1)[-1]).geturl()\n            req = Request(full_url)\n            req.add_header(\"Authorization\", \"Basic {}\".format(auth))\n        else:\n            req = Request(full_url)\n\n        # add the request body\n        if body:\n            req.data = urlencode(body).encode(\"utf-8\")\n\n        return req\n\n    def fetch(self, full_url, body, timeout):\n        req = self._create_request(full_url, body=body)\n        return urlopen(req, timeout=timeout).read().decode(\"utf-8\")\n\n    def close(self):\n        pass\n\n\nclass RequestsFetcher(_FetcherInterface):\n    def __init__(self):\n        from requests import exceptions 
as requests_exceptions\n\n        self.raises = requests_exceptions.RequestException\n        self.session = requests.Session()\n        self.process_id = os.getpid()\n\n    def check_pid(self):\n        # if the process id change changed from when the session was created\n        # a new session needs to be setup since requests isn't multiprocessing safe.\n        if os.getpid() != self.process_id:\n            self.session = requests.Session()\n            self.process_id = os.getpid()\n\n    def fetch(self, full_url, body, timeout):\n        self.check_pid()\n        resp = self.session.post(full_url, data=body, timeout=timeout)\n        resp.raise_for_status()\n        return resp.text\n\n    def close(self):\n        self.session.close()\n\n\nclass RemoteScheduler:\n    \"\"\"\n    Scheduler proxy object. Talks to a RemoteSchedulerResponder.\n    \"\"\"\n\n    def __init__(self, url=\"http://localhost:8082/\", connect_timeout=None):\n        assert not url.startswith(\"http+unix://\") or HAS_UNIX_SOCKET, \"You need to install requests-unixsocket for Unix socket support.\"\n\n        self._url = url.rstrip(\"/\")\n        config = configuration.get_config()\n\n        if connect_timeout is None:\n            connect_timeout = config.getfloat(\"core\", \"rpc-connect-timeout\", 10.0)\n        self._connect_timeout = connect_timeout\n\n        self._rpc_retry_attempts = config.getint(\"core\", \"rpc-retry-attempts\", 3)\n        self._rpc_retry_wait = config.getint(\"core\", \"rpc-retry-wait\", 30)\n        self._rpc_log_retries = config.getboolean(\"core\", \"rpc-log-retries\", True)\n\n        if HAS_REQUESTS:\n            self._fetcher = RequestsFetcher()\n        else:\n            self._fetcher = URLLibFetcher()\n\n    def close(self):\n        self._fetcher.close()\n\n    def _get_retryer(self):\n        def retry_logging(retry_state):\n            if self._rpc_log_retries:\n                logger.warning(\"Failed connecting to remote scheduler %r\", 
self._url, exc_info=True)\n                logger.info(\"Retrying attempt %r of %r (max)\" % (retry_state.attempt_number + 1, self._rpc_retry_attempts))\n                logger.info(\"Wait for %d seconds\" % self._rpc_retry_wait)\n\n        return Retrying(wait=wait_fixed(self._rpc_retry_wait), stop=stop_after_attempt(self._rpc_retry_attempts), reraise=True, after=retry_logging)\n\n    def _fetch(self, url_suffix, body):\n        full_url = _urljoin(self._url, url_suffix)\n        scheduler_retry = self._get_retryer()\n\n        try:\n            response = scheduler_retry(self._fetcher.fetch, full_url, body, self._connect_timeout)\n        except self._fetcher.raises as e:\n            raise RPCError(\"Errors (%d attempts) when connecting to remote scheduler %r\" % (self._rpc_retry_attempts, self._url), e)\n        return response\n\n    def _request(self, url, data, attempts=3, allow_null=True):\n        body = {\"data\": json.dumps(data)}\n\n        for _ in range(attempts):\n            page = self._fetch(url, body)\n            response = json.loads(page)[\"response\"]\n            if allow_null or response is not None:\n                return response\n        raise RPCError(\"Received null response from remote scheduler %r\" % self._url)\n\n\nfor method_name, method in RPC_METHODS.items():\n    setattr(RemoteScheduler, method_name, method)\n"
  },
  {
    "path": "luigi/safe_extractor.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nThis module provides a class `SafeExtractor` that offers a secure way to extract tar files while\nmitigating path traversal vulnerabilities, which can occur when files inside the archive are\ncrafted to escape the intended extraction directory.\n\nThe `SafeExtractor` ensures that the extracted file paths are validated before extraction to\nprevent malicious archives from extracting files outside the intended directory.\n\nClasses:\n    SafeExtractor: A class to securely extract tar files with protection against path traversal attacks.\n\nUsage Example:\n    extractor = SafeExtractor(\"/desired/directory\")\n    extractor.safe_extract(\"archive.tar\")\n\"\"\"\n\nimport os\nimport tarfile\n\n\nclass SafeExtractor:\n    \"\"\"\n    A class to safely extract tar files, ensuring that no path traversal\n    vulnerabilities are exploited.\n\n    Attributes:\n        path (str): The directory to extract files into.\n\n    Methods:\n        _is_within_directory(directory, target):\n            Checks if a target path is within a given directory.\n\n        safe_extract(tar_path, members=None, \\\\*, numeric_owner=False):\n            Safely extracts the contents of a tar file to the specified directory.\n    \"\"\"\n\n    def __init__(self, path=\".\"):\n        \"\"\"\n        Initializes the SafeExtractor with the specified directory 
path.\n\n        Args:\n            path (str): The directory to extract files into. Defaults to the current directory.\n        \"\"\"\n        self.path = path\n\n    @staticmethod\n    def _is_within_directory(directory, target):\n        \"\"\"\n        Checks if a target path is within a given directory.\n\n        Args:\n            directory (str): The directory to check against.\n            target (str): The target path to check.\n\n        Returns:\n            bool: True if the target path is within the directory, False otherwise.\n        \"\"\"\n        abs_directory = os.path.abspath(directory)\n        abs_target = os.path.abspath(target)\n        prefix = os.path.commonprefix([abs_directory, abs_target])\n        return prefix == abs_directory\n\n    def safe_extract(self, tar_path, members=None, *, numeric_owner=False):\n        \"\"\"\n        Safely extracts the contents of a tar file to the specified directory.\n\n        Args:\n            tar_path (str): The path to the tar file to extract.\n            members (list, optional): A list of members to extract. Defaults to None.\n            numeric_owner (bool, optional): If True, only the numeric owner will be used. Defaults to False.\n\n        Raises:\n            RuntimeError: If a path traversal attempt is detected.\n        \"\"\"\n        with tarfile.open(tar_path, \"r\") as tar:\n            for member in tar.getmembers():\n                member_path = os.path.join(self.path, member.name)\n                if not self._is_within_directory(self.path, member_path):\n                    raise RuntimeError(\"Attempted Path Traversal in Tar File\")\n            tar.extractall(self.path, members, numeric_owner=numeric_owner)\n"
  },
  {
    "path": "luigi/scheduler.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nThe system for scheduling tasks and executing them in order.\nDeals with dependencies, priorities, resources, etc.\nThe :py:class:`~luigi.worker.Worker` pulls tasks from the scheduler (usually over the REST interface) and executes them.\nSee :doc:`/central_scheduler` for more info.\n\"\"\"\n\nimport collections\nimport functools\nimport hashlib\nimport inspect\nimport itertools\nimport json\nimport logging\nimport os\nimport pickle\nimport re\nimport time\nimport uuid\nfrom collections.abc import MutableSet\n\nfrom luigi import configuration, notifications, parameter\nfrom luigi import task_history as history\nfrom luigi.batch_notifier import BatchNotifier\nfrom luigi.metrics import MetricsCollectors\nfrom luigi.parameter import ParameterVisibility\nfrom luigi.task import Config\nfrom luigi.task_status import BATCH_RUNNING, DISABLED, DONE, FAILED, PENDING, RUNNING, SUSPENDED, UNKNOWN\n\nlogger = logging.getLogger(__name__)\n\nUPSTREAM_RUNNING = \"UPSTREAM_RUNNING\"\nUPSTREAM_MISSING_INPUT = \"UPSTREAM_MISSING_INPUT\"\nUPSTREAM_FAILED = \"UPSTREAM_FAILED\"\nUPSTREAM_DISABLED = \"UPSTREAM_DISABLED\"\n\nUPSTREAM_SEVERITY_ORDER = (\n    \"\",\n    UPSTREAM_RUNNING,\n    UPSTREAM_MISSING_INPUT,\n    UPSTREAM_FAILED,\n    UPSTREAM_DISABLED,\n)\nUPSTREAM_SEVERITY_KEY = UPSTREAM_SEVERITY_ORDER.index\nSTATUS_TO_UPSTREAM_MAP = {\n    
FAILED: UPSTREAM_FAILED,\n    RUNNING: UPSTREAM_RUNNING,\n    BATCH_RUNNING: UPSTREAM_RUNNING,\n    PENDING: UPSTREAM_MISSING_INPUT,\n    DISABLED: UPSTREAM_DISABLED,\n}\n\nWORKER_STATE_DISABLED = \"disabled\"\nWORKER_STATE_ACTIVE = \"active\"\n\nTASK_FAMILY_RE = re.compile(r\"([^(_]+)[(_]\")\n\nRPC_METHODS = {}\n\n_retry_policy_fields = [\n    \"retry_count\",\n    \"disable_hard_timeout\",\n    \"disable_window\",\n]\nRetryPolicy = collections.namedtuple(\"RetryPolicy\", _retry_policy_fields)  # type: ignore\n\n\ndef _get_empty_retry_policy():\n    return RetryPolicy(*[None] * len(_retry_policy_fields))\n\n\ndef rpc_method(**request_args):\n    def _rpc_method(fn):\n        # If request args are passed, return this function again for use as\n        # the decorator function with the request args attached.\n        args, varargs, varkw, defaults, kwonlyargs, kwonlydefaults, ann = inspect.getfullargspec(fn)\n        assert not varargs\n        first_arg, *all_args = args\n        assert first_arg == \"self\"\n        defaults = dict(zip(reversed(all_args), reversed(defaults or ())))\n        required_args = frozenset(arg for arg in all_args if arg not in defaults)\n        fn_name = fn.__name__\n\n        @functools.wraps(fn)\n        def rpc_func(self, *args, **kwargs):\n            actual_args = defaults.copy()\n            actual_args.update(dict(zip(all_args, args)))\n            actual_args.update(kwargs)\n            if not all(arg in actual_args for arg in required_args):\n                raise TypeError(\"{} takes {} arguments ({} given)\".format(fn_name, len(all_args), len(actual_args)))\n            return self._request(\"/api/{}\".format(fn_name), actual_args, **request_args)\n\n        RPC_METHODS[fn_name] = rpc_func\n        return fn\n\n    return _rpc_method\n\n\nclass scheduler(Config):\n    retry_delay = parameter.FloatParameter(default=900.0)\n    remove_delay = parameter.FloatParameter(default=600.0)\n    worker_disconnect_delay = 
parameter.FloatParameter(default=60.0)\n    state_path = parameter.Parameter(default=\"/var/lib/luigi-server/state.pickle\")\n\n    batch_emails = parameter.BoolParameter(default=False, description=\"Send e-mails in batches rather than immediately\")\n\n    # Jobs are disabled if we see more than retry_count failures in disable_window seconds.\n    # These disables last for disable_persist seconds.\n    disable_window = parameter.IntParameter(default=3600)\n    retry_count = parameter.IntParameter(default=999999999)\n    disable_hard_timeout = parameter.IntParameter(default=999999999)\n    disable_persist = parameter.IntParameter(default=86400)\n    max_shown_tasks = parameter.IntParameter(default=100000)\n    max_graph_nodes = parameter.IntParameter(default=100000)\n\n    record_task_history = parameter.BoolParameter(default=False)\n\n    prune_on_get_work = parameter.BoolParameter(default=False)\n\n    pause_enabled = parameter.BoolParameter(default=True)\n\n    send_messages = parameter.BoolParameter(default=True)\n\n    metrics_collector = parameter.EnumParameter(enum=MetricsCollectors, default=MetricsCollectors.default)\n    metrics_custom_import = parameter.OptionalStrParameter(default=None)\n\n    stable_done_cooldown_secs = parameter.IntParameter(default=10, description=\"Sets cooldown period to avoid running the same task twice\")\n    \"\"\"\n    Sets a cooldown period in seconds after a task was completed, during this period the same task will not accepted by the scheduler.\n    \"\"\"\n\n    def _get_retry_policy(self):\n        return RetryPolicy(self.retry_count, self.disable_hard_timeout, self.disable_window)\n\n\ndef _get_default(x, default):\n    if x is not None:\n        return x\n    else:\n        return default\n\n\nclass OrderedSet(MutableSet):\n    \"\"\"\n    Standard Python OrderedSet recipe found at http://code.activestate.com/recipes/576694/\n\n    Modified to include a peek function to get the last element\n    \"\"\"\n\n    def 
__init__(self, iterable=None):\n        self.end = end = []\n        end += [None, end, end]  # sentinel node for doubly linked list\n        self.map = {}  # key --> [key, prev, next]\n        if iterable is not None:\n            self |= iterable\n\n    def __len__(self):\n        return len(self.map)\n\n    def __contains__(self, key):\n        return key in self.map\n\n    def add(self, key):\n        if key not in self.map:\n            end = self.end\n            curr = end[1]\n            curr[2] = end[1] = self.map[key] = [key, curr, end]\n\n    def discard(self, key):\n        if key in self.map:\n            key, prev, next = self.map.pop(key)\n            prev[2] = next\n            next[1] = prev\n\n    def __iter__(self):\n        end = self.end\n        curr = end[2]\n        while curr is not end:\n            yield curr[0]\n            curr = curr[2]\n\n    def __reversed__(self):\n        end = self.end\n        curr = end[1]\n        while curr is not end:\n            yield curr[0]\n            curr = curr[1]\n\n    def peek(self, last=True):\n        if not self:\n            raise KeyError(\"set is empty\")\n        key = self.end[1][0] if last else self.end[2][0]\n        return key\n\n    def pop(self, last=True):\n        key = self.peek(last)\n        self.discard(key)\n        return key\n\n    def __repr__(self):\n        if not self:\n            return \"%s()\" % (self.__class__.__name__,)\n        return \"%s(%r)\" % (self.__class__.__name__, list(self))\n\n    def __eq__(self, other):\n        if isinstance(other, OrderedSet):\n            return len(self) == len(other) and list(self) == list(other)\n        return set(self) == set(other)\n\n\nclass Task:\n    def __init__(\n        self,\n        task_id,\n        status,\n        deps,\n        resources=None,\n        priority=0,\n        family=\"\",\n        module=None,\n        params=None,\n        param_visibilities=None,\n        accepts_messages=False,\n        
tracking_url=None,\n        status_message=None,\n        progress_percentage=None,\n        retry_policy=\"notoptional\",\n    ):\n        self.id = task_id\n        self.stakeholders = set()  # workers ids that are somehow related to this task (i.e. don't prune while any of these workers are still active)\n        self.workers = OrderedSet()  # workers ids that can perform task - task is 'BROKEN' if none of these workers are active\n        if deps is None:\n            self.deps = set()\n        else:\n            self.deps = set(deps)\n        self.status = status  # PENDING, RUNNING, FAILED or DONE\n        self.time = time.time()  # Timestamp when task was first added\n        self.updated = self.time\n        self.retry = None\n        self.remove = None\n        self.worker_running = None  # the worker id that is currently running the task or None\n        self.time_running = None  # Timestamp when picked up by worker\n        self.expl = None\n        self.priority = priority\n        self.resources = _get_default(resources, {})\n        self.family = family\n        self.module = module\n        self.param_visibilities = _get_default(param_visibilities, {})\n        self.params = {}\n        self.public_params = {}\n        self.hidden_params = {}\n        self.set_params(params)\n        self.accepts_messages = accepts_messages\n        self.retry_policy = retry_policy\n        self.failures = collections.deque()\n        self.first_failure_time = None\n        self.tracking_url = tracking_url\n        self.status_message = status_message\n        self.progress_percentage = progress_percentage\n        self.scheduler_message_responses = {}\n        self.scheduler_disable_time = None\n        self.runnable = False\n        self.batchable = False\n        self.batch_id = None\n\n    def __repr__(self):\n        return \"Task(%r)\" % vars(self)\n\n    def set_params(self, params):\n        self.params = _get_default(params, {})\n        self.public_params = 
{\n            key: value for key, value in self.params.items() if self.param_visibilities.get(key, ParameterVisibility.PUBLIC) == ParameterVisibility.PUBLIC\n        }\n        self.hidden_params = {\n            key: value for key, value in self.params.items() if self.param_visibilities.get(key, ParameterVisibility.PUBLIC) == ParameterVisibility.HIDDEN\n        }\n\n    # TODO(2017-08-10) replace this function with direct calls to batchable\n    # this only exists for backward compatibility\n    def is_batchable(self):\n        try:\n            return self.batchable\n        except AttributeError:\n            return False\n\n    def add_failure(self):\n        \"\"\"\n        Add a failure event with the current timestamp.\n        \"\"\"\n        failure_time = time.time()\n\n        if not self.first_failure_time:\n            self.first_failure_time = failure_time\n\n        self.failures.append(failure_time)\n\n    def num_failures(self):\n        \"\"\"\n        Return the number of failures in the window.\n        \"\"\"\n        min_time = time.time() - self.retry_policy.disable_window\n\n        while self.failures and self.failures[0] < min_time:\n            self.failures.popleft()\n\n        return len(self.failures)\n\n    def has_excessive_failures(self):\n        if self.first_failure_time is not None:\n            if time.time() >= self.first_failure_time + self.retry_policy.disable_hard_timeout:\n                return True\n\n        logger.debug(\"%s task num failures is %s and limit is %s\", self.id, self.num_failures(), self.retry_policy.retry_count)\n        if self.num_failures() >= self.retry_policy.retry_count:\n            logger.debug(\"%s task num failures limit(%s) is exceeded\", self.id, self.retry_policy.retry_count)\n            return True\n\n        return False\n\n    def clear_failures(self):\n        \"\"\"\n        Clear the failures history\n        \"\"\"\n        self.failures.clear()\n        self.first_failure_time = 
None\n\n    @property\n    def pretty_id(self):\n        param_str = \", \".join(\"{}={}\".format(key, value) for key, value in sorted(self.public_params.items()))\n        return \"{}({})\".format(self.family, param_str)\n\n\nclass Worker:\n    \"\"\"\n    Structure for tracking worker activity and keeping their references.\n    \"\"\"\n\n    def __init__(self, worker_id, last_active=None):\n        self.id = worker_id\n        self.reference = None  # reference to the worker in the real world. (Currently a dict containing just the host)\n        self.last_active = last_active or time.time()  # seconds since epoch\n        self.last_get_work = None\n        self.started = time.time()  # seconds since epoch\n        self.tasks = set()  # task objects\n        self.info = {}\n        self.disabled = False\n        self.rpc_messages = []\n\n    def add_info(self, info):\n        self.info.update(info)\n\n    def update(self, worker_reference, get_work=False):\n        if worker_reference:\n            self.reference = worker_reference\n        self.last_active = time.time()\n        if get_work:\n            self.last_get_work = time.time()\n\n    def prune(self, config):\n        # Delete workers that haven't said anything for a while (probably killed)\n        if self.last_active + config.worker_disconnect_delay < time.time():\n            return True\n\n    def get_tasks(self, state, *statuses):\n        num_self_tasks = len(self.tasks)\n        num_state_tasks = sum(len(state._status_tasks[status]) for status in statuses)\n        if num_self_tasks < num_state_tasks:\n            return filter(lambda task: task.status in statuses, self.tasks)\n        else:\n            return filter(lambda task: self.id in task.workers, state.get_active_tasks_by_status(*statuses))\n\n    def is_trivial_worker(self, state):\n        \"\"\"\n        If it's not an assistant having only tasks that are without\n        requirements.\n\n        We have to pass the state parameter for 
optimization reasons.\n        \"\"\"\n        if self.assistant:\n            return False\n        return all(not task.resources for task in self.get_tasks(state, PENDING))\n\n    @property\n    def assistant(self):\n        return self.info.get(\"assistant\", False)\n\n    @property\n    def enabled(self):\n        return not self.disabled\n\n    @property\n    def state(self):\n        if self.enabled:\n            return WORKER_STATE_ACTIVE\n        else:\n            return WORKER_STATE_DISABLED\n\n    def add_rpc_message(self, name, **kwargs):\n        # the message has the format {'name': <function_name>, 'kwargs': <function_kwargs>}\n        self.rpc_messages.append({\"name\": name, \"kwargs\": kwargs})\n\n    def fetch_rpc_messages(self):\n        messages = self.rpc_messages[:]\n        del self.rpc_messages[:]\n        return messages\n\n    def __str__(self):\n        return self.id\n\n\nclass SimpleTaskState:\n    \"\"\"\n    Keep track of the current state and handle persistence.\n\n    The point of this class is to enable other ways to keep state, eg. 
by using a database\n    These will be implemented by creating an abstract base class that this and other classes\n    inherit from.\n    \"\"\"\n\n    def __init__(self, state_path):\n        self._state_path = state_path\n        self._tasks = {}  # map from id to a Task object\n        self._status_tasks = collections.defaultdict(dict)\n        self._active_workers = {}  # map from id to a Worker object\n        self._task_batchers = {}\n        self._metrics_collector = None\n\n    def get_state(self):\n        return self._tasks, self._active_workers, self._task_batchers\n\n    def set_state(self, state):\n        self._tasks, self._active_workers = state[:2]\n        if len(state) >= 3:\n            self._task_batchers = state[2]\n\n    def dump(self):\n        try:\n            with open(self._state_path, \"wb\") as fobj:\n                pickle.dump(self.get_state(), fobj)\n        except IOError:\n            logger.warning(\"Failed saving scheduler state\", exc_info=1)\n        else:\n            logger.info(\"Saved state in %s\", self._state_path)\n\n    # prone to lead to crashes when old state is unpickled with updated code. TODO some kind of version control?\n    def load(self):\n        if os.path.exists(self._state_path):\n            logger.info(\"Attempting to load state from %s\", self._state_path)\n            try:\n                with open(self._state_path, \"rb\") as fobj:\n                    state = pickle.load(fobj)\n            except BaseException:\n                logger.exception(\"Error when loading state. Starting from empty state.\")\n                return\n\n            self.set_state(state)\n            self._status_tasks = collections.defaultdict(dict)\n            for task in self._tasks.values():\n                self._status_tasks[task.status][task.id] = task\n        else:\n            logger.info(\"No prior state file exists at %s. 
Starting with empty state\", self._state_path)\n\n    def get_active_tasks(self):\n        return self._tasks.values()\n\n    def get_active_tasks_by_status(self, *statuses):\n        return itertools.chain.from_iterable(self._status_tasks[status].values() for status in statuses)\n\n    def get_active_task_count_for_status(self, status):\n        if status:\n            return len(self._status_tasks[status])\n        else:\n            return len(self._tasks)\n\n    def get_batch_running_tasks(self, batch_id):\n        assert batch_id is not None\n        return [task for task in self.get_active_tasks_by_status(BATCH_RUNNING) if task.batch_id == batch_id]\n\n    def set_batcher(self, worker_id, family, batcher_args, max_batch_size):\n        self._task_batchers.setdefault(worker_id, {})\n        self._task_batchers[worker_id][family] = (batcher_args, max_batch_size)\n\n    def get_batcher(self, worker_id, family):\n        return self._task_batchers.get(worker_id, {}).get(family, (None, 1))\n\n    def num_pending_tasks(self):\n        \"\"\"\n        Return how many tasks are PENDING + RUNNING. 
O(1).\n        \"\"\"\n        return len(self._status_tasks[PENDING]) + len(self._status_tasks[RUNNING])\n\n    def get_task(self, task_id, default=None, setdefault=None):\n        if setdefault:\n            task = self._tasks.setdefault(task_id, setdefault)\n            self._status_tasks[task.status][task.id] = task\n            return task\n        else:\n            return self._tasks.get(task_id, default)\n\n    def has_task(self, task_id):\n        return task_id in self._tasks\n\n    def re_enable(self, task, config=None):\n        task.scheduler_disable_time = None\n        task.clear_failures()\n        if config:\n            self.set_status(task, FAILED, config)\n            task.clear_failures()\n\n    def set_batch_running(self, task, batch_id, worker_id):\n        self.set_status(task, BATCH_RUNNING)\n        task.batch_id = batch_id\n        task.worker_running = worker_id\n        task.resources_running = task.resources\n        task.time_running = time.time()\n\n    def set_status(self, task, new_status, config=None):\n        if new_status == FAILED:\n            assert config is not None\n\n        if new_status == DISABLED and task.status in (RUNNING, BATCH_RUNNING):\n            return\n\n        remove_on_failure = task.batch_id is not None and not task.batchable\n\n        if task.status == DISABLED:\n            if new_status == DONE:\n                self.re_enable(task)\n\n            # don't allow workers to override a scheduler disable\n            elif task.scheduler_disable_time is not None and new_status != DISABLED:\n                return\n\n        if task.status == RUNNING and task.batch_id is not None and new_status != RUNNING:\n            for batch_task in self.get_batch_running_tasks(task.batch_id):\n                self.set_status(batch_task, new_status, config)\n                batch_task.batch_id = None\n            task.batch_id = None\n\n        if new_status == FAILED and task.status != DISABLED:\n            
task.add_failure()\n            if task.has_excessive_failures():\n                task.scheduler_disable_time = time.time()\n                new_status = DISABLED\n                if not config.batch_emails:\n                    notifications.send_error_email(\n                        \"Luigi Scheduler: DISABLED {task} due to excessive failures\".format(task=task.id),\n                        \"{task} failed {failures} times in the last {window} seconds, so it is being disabled for {persist} seconds\".format(\n                            failures=task.retry_policy.retry_count,\n                            task=task.id,\n                            window=task.retry_policy.disable_window,\n                            persist=config.disable_persist,\n                        ),\n                    )\n        elif new_status == DISABLED:\n            task.scheduler_disable_time = None\n\n        if new_status != task.status:\n            self._status_tasks[task.status].pop(task.id)\n            self._status_tasks[new_status][task.id] = task\n            task.status = new_status\n            task.updated = time.time()\n            self.update_metrics(task, config)\n\n        if new_status == FAILED:\n            task.retry = time.time() + config.retry_delay\n            if remove_on_failure:\n                task.remove = time.time()\n\n    def fail_dead_worker_task(self, task, config, assistants):\n        # If a running worker disconnects, tag all its jobs as FAILED and subject it to the same retry logic\n        if task.status in (BATCH_RUNNING, RUNNING) and task.worker_running and task.worker_running not in task.stakeholders | assistants:\n            logger.info(\n                \"Task %r is marked as running by disconnected worker %r -> marking as FAILED with retry delay of %rs\",\n                task.id,\n                task.worker_running,\n                config.retry_delay,\n            )\n            task.worker_running = None\n            
self.set_status(task, FAILED, config)\n            task.retry = time.time() + config.retry_delay\n\n    def update_status(self, task, config):\n        # Mark tasks with no remaining active stakeholders for deletion\n        if (not task.stakeholders) and (task.remove is None) and (task.status != RUNNING):\n            # We don't check for the RUNNING case, because that is already handled\n            # by the fail_dead_worker_task function.\n            logger.debug(\"Task %r has no stakeholders anymore -> might remove task in %s seconds\", task.id, config.remove_delay)\n            task.remove = time.time() + config.remove_delay\n\n        # Re-enable task after the disable time expires\n        if task.status == DISABLED and task.scheduler_disable_time is not None:\n            if time.time() - task.scheduler_disable_time > config.disable_persist:\n                self.re_enable(task, config)\n\n        # Reset FAILED tasks to PENDING if max timeout is reached, and retry delay is >= 0\n        if task.status == FAILED and config.retry_delay >= 0 and task.retry < time.time():\n            self.set_status(task, PENDING, config)\n\n    def may_prune(self, task):\n        return task.remove and time.time() >= task.remove\n\n    def inactivate_tasks(self, delete_tasks):\n        # The terminology is a bit confusing: we used to \"delete\" tasks when they became inactive,\n        # but with a pluggable state storage, you might very well want to keep some history of\n        # older tasks as well. 
That's why we call it \"inactivate\" (as in the verb)\n        for task in delete_tasks:\n            task_obj = self._tasks.pop(task)\n            self._status_tasks[task_obj.status].pop(task)\n\n    def get_active_workers(self, last_active_lt=None, last_get_work_gt=None):\n        for worker in self._active_workers.values():\n            if last_active_lt is not None and worker.last_active >= last_active_lt:\n                continue\n            last_get_work = worker.last_get_work\n            if last_get_work_gt is not None and (last_get_work is None or last_get_work <= last_get_work_gt):\n                continue\n            yield worker\n\n    def get_assistants(self, last_active_lt=None):\n        return filter(lambda w: w.assistant, self.get_active_workers(last_active_lt))\n\n    def get_worker_ids(self):\n        return self._active_workers.keys()  # only used for unit tests\n\n    def get_worker(self, worker_id):\n        return self._active_workers.setdefault(worker_id, Worker(worker_id))\n\n    def inactivate_workers(self, delete_workers):\n        # Mark workers as inactive\n        for worker in delete_workers:\n            self._active_workers.pop(worker)\n        self._remove_workers_from_tasks(delete_workers)\n\n    def _remove_workers_from_tasks(self, workers, remove_stakeholders=True):\n        for task in self.get_active_tasks():\n            if remove_stakeholders:\n                task.stakeholders.difference_update(workers)\n            task.workers -= workers\n\n    def disable_workers(self, worker_ids):\n        self._remove_workers_from_tasks(worker_ids, remove_stakeholders=False)\n        for worker_id in worker_ids:\n            worker = self.get_worker(worker_id)\n            worker.disabled = True\n            worker.tasks.clear()\n\n    def update_metrics(self, task, config):\n        if task.status == DISABLED:\n            self._metrics_collector.handle_task_disabled(task, config)\n        elif task.status == DONE:\n            
self._metrics_collector.handle_task_done(task)\n        elif task.status == FAILED:\n            self._metrics_collector.handle_task_failed(task)\n\n\nclass Scheduler:\n    \"\"\"\n    Async scheduler that can handle multiple workers, etc.\n\n    Can be run locally or on a server (using RemoteScheduler + server.Server).\n    \"\"\"\n\n    def __init__(self, config=None, resources=None, task_history_impl=None, **kwargs):\n        \"\"\"\n        Keyword Arguments:\n        :param config: an object of class \"scheduler\" or None (in which the global instance will be used)\n        :param resources: a dict of str->int constraints\n        :param task_history_impl: ignore config and use this object as the task history\n        \"\"\"\n        self._config = config or scheduler(**kwargs)\n        self._state = SimpleTaskState(self._config.state_path)\n\n        if task_history_impl:\n            self._task_history = task_history_impl\n        elif self._config.record_task_history:\n            from luigi import db_task_history  # Needs sqlalchemy, thus imported here\n\n            self._task_history = db_task_history.DbTaskHistory()\n        else:\n            self._task_history = history.NopHistory()\n        self._resources = resources or configuration.get_config().getintdict(\"resources\")  # TODO: Can we make this a Parameter?\n        self._make_task = functools.partial(Task, retry_policy=self._config._get_retry_policy())\n        self._worker_requests = {}\n        self._paused = False\n\n        if self._config.batch_emails:\n            self._email_batcher = BatchNotifier()\n\n        self._state._metrics_collector = MetricsCollectors.get(self._config.metrics_collector, self._config.metrics_custom_import)\n\n    def load(self):\n        self._state.load()\n\n    def dump(self):\n        self._state.dump()\n        if self._config.batch_emails:\n            self._email_batcher.send_email()\n\n    @rpc_method()\n    def prune(self):\n        
logger.debug(\"Starting pruning of task graph\")\n        self._prune_workers()\n        self._prune_tasks()\n        self._prune_emails()\n        logger.debug(\"Done pruning task graph\")\n\n    def _prune_workers(self):\n        remove_workers = []\n        for worker in self._state.get_active_workers():\n            if worker.prune(self._config):\n                logger.debug(\"Worker %s timed out (no contact for >=%ss)\", worker, self._config.worker_disconnect_delay)\n                remove_workers.append(worker.id)\n\n        self._state.inactivate_workers(remove_workers)\n\n    def _prune_tasks(self):\n        assistant_ids = {w.id for w in self._state.get_assistants()}\n        remove_tasks = []\n\n        for task in self._state.get_active_tasks():\n            self._state.fail_dead_worker_task(task, self._config, assistant_ids)\n            self._state.update_status(task, self._config)\n            if self._state.may_prune(task):\n                logger.info(\"Removing task %r\", task.id)\n                remove_tasks.append(task.id)\n\n        self._state.inactivate_tasks(remove_tasks)\n\n    def _prune_emails(self):\n        if self._config.batch_emails:\n            self._email_batcher.update()\n\n    def _update_worker(self, worker_id, worker_reference=None, get_work=False):\n        # Keep track of whenever the worker was last active.\n        # For convenience also return the worker object.\n        worker = self._state.get_worker(worker_id)\n        worker.update(worker_reference, get_work=get_work)\n        return worker\n\n    def _update_priority(self, task, prio, worker):\n        \"\"\"\n        Update priority of the given task.\n\n        Priority can only be increased.\n        If the task doesn't exist, a placeholder task is created to preserve priority when the task is later scheduled.\n        \"\"\"\n        task.priority = prio = max(prio, task.priority)\n        for dep in task.deps or []:\n            t = self._state.get_task(dep)\n  
          if t is not None and prio > t.priority:\n                self._update_priority(t, prio, worker)\n\n    @rpc_method()\n    def add_task_batcher(self, worker, task_family, batched_args, max_batch_size=float(\"inf\")):\n        self._state.set_batcher(worker, task_family, batched_args, max_batch_size)\n\n    @rpc_method()\n    def forgive_failures(self, task_id=None):\n        status = PENDING\n        task = self._state.get_task(task_id)\n        if task is None:\n            return {\"task_id\": task_id, \"status\": None}\n\n        # we forgive only failures\n        if task.status == FAILED:\n            # forgive but do not forget\n            self._update_task_history(task, status)\n            self._state.set_status(task, status, self._config)\n        return {\"task_id\": task_id, \"status\": task.status}\n\n    @rpc_method()\n    def mark_as_done(self, task_id=None):\n        status = DONE\n        task = self._state.get_task(task_id)\n        if task is None:\n            return {\"task_id\": task_id, \"status\": None}\n\n        # we can force mark DONE for running or failed tasks\n        if task.status in {RUNNING, FAILED, DISABLED}:\n            self._update_task_history(task, status)\n            self._state.set_status(task, status, self._config)\n        return {\"task_id\": task_id, \"status\": task.status}\n\n    @rpc_method()\n    def add_task(\n        self,\n        task_id=None,\n        status=PENDING,\n        runnable=True,\n        deps=None,\n        new_deps=None,\n        expl=None,\n        resources=None,\n        priority=0,\n        family=\"\",\n        module=None,\n        params=None,\n        param_visibilities=None,\n        accepts_messages=False,\n        assistant=False,\n        tracking_url=None,\n        worker=None,\n        batchable=None,\n        batch_id=None,\n        retry_policy_dict=None,\n        owners=None,\n        **kwargs,\n    ):\n        \"\"\"\n        * add task identified by task_id if it 
doesn't exist\n        * if deps is not None, update dependency list\n        * update status of task\n        * add additional workers/stakeholders\n        * update priority when needed\n        \"\"\"\n        assert worker is not None\n        worker_id = worker\n        worker = self._update_worker(worker_id)\n\n        resources = {} if resources is None else resources.copy()\n\n        if retry_policy_dict is None:\n            retry_policy_dict = {}\n\n        retry_policy = self._generate_retry_policy(retry_policy_dict)\n\n        if worker.enabled:\n            _default_task = self._make_task(\n                task_id=task_id,\n                status=PENDING,\n                deps=deps,\n                resources=resources,\n                priority=priority,\n                family=family,\n                module=module,\n                params=params,\n                param_visibilities=param_visibilities,\n            )\n        else:\n            _default_task = None\n\n        task = self._state.get_task(task_id, setdefault=_default_task)\n\n        if task is None or (task.status != RUNNING and not worker.enabled):\n            return\n\n        # Ignore claims that the task is PENDING if it very recently was marked as DONE.\n        if status == PENDING and task.status == DONE and (time.time() - task.updated) < self._config.stable_done_cooldown_secs:\n            return\n\n        # for setting priority, we'll sometimes create tasks with unset family and params\n        if not task.family:\n            task.family = family\n        if not getattr(task, \"module\", None):\n            task.module = module\n        if not getattr(task, \"param_visibilities\", None):\n            task.param_visibilities = _get_default(param_visibilities, {})\n        if not task.params:\n            task.set_params(params)\n\n        if batch_id is not None:\n            task.batch_id = batch_id\n        if status == RUNNING and not task.worker_running:\n            
task.worker_running = worker_id\n            if batch_id:\n                # copy resources_running of the first batch task\n                batch_tasks = self._state.get_batch_running_tasks(batch_id)\n                task.resources_running = batch_tasks[0].resources_running.copy()\n            task.time_running = time.time()\n\n        if accepts_messages is not None:\n            task.accepts_messages = accepts_messages\n\n        if tracking_url is not None or task.status != RUNNING:\n            task.tracking_url = tracking_url\n            if task.batch_id is not None:\n                for batch_task in self._state.get_batch_running_tasks(task.batch_id):\n                    batch_task.tracking_url = tracking_url\n\n        if batchable is not None:\n            task.batchable = batchable\n\n        if task.remove is not None:\n            task.remove = None  # unmark task for removal so it isn't removed after being added\n\n        if expl is not None:\n            task.expl = expl\n            if task.batch_id is not None:\n                for batch_task in self._state.get_batch_running_tasks(task.batch_id):\n                    batch_task.expl = expl\n\n        task_is_not_running = task.status not in (RUNNING, BATCH_RUNNING)\n        task_started_a_run = status in (DONE, FAILED, RUNNING)\n        running_on_this_worker = task.worker_running == worker_id\n        if task_is_not_running or (task_started_a_run and running_on_this_worker) or new_deps:\n            # don't allow re-scheduling of task while it is running, it must either fail or succeed on the worker actually running it\n            if status != task.status or status == PENDING:\n                # Update the DB only if there was an actual change, to prevent noise.\n                # We also check for status == PENDING b/c that's the default value\n                # (so checking for status != task.status would lie)\n                self._update_task_history(task, status)\n            
self._state.set_status(task, PENDING if status == SUSPENDED else status, self._config)\n\n        if status == FAILED and self._config.batch_emails:\n            batched_params, _ = self._state.get_batcher(worker_id, family)\n            if batched_params:\n                unbatched_params = {param: value for param, value in task.params.items() if param not in batched_params}\n            else:\n                unbatched_params = task.params\n            try:\n                expl_raw = json.loads(expl)\n            except ValueError:\n                expl_raw = expl\n\n            self._email_batcher.add_failure(task.pretty_id, task.family, unbatched_params, expl_raw, owners)\n            if task.status == DISABLED:\n                self._email_batcher.add_disable(task.pretty_id, task.family, unbatched_params, owners)\n\n        if deps is not None:\n            task.deps = set(deps)\n\n        if new_deps is not None:\n            task.deps.update(new_deps)\n\n        if resources is not None:\n            task.resources = resources\n\n        if worker.enabled and not assistant:\n            task.stakeholders.add(worker_id)\n\n            # Task dependencies might not exist yet. 
Let's create dummy tasks for them for now.\n            # Otherwise the task dependencies might end up being pruned if scheduling takes a long time\n            for dep in task.deps or []:\n                t = self._state.get_task(dep, setdefault=self._make_task(task_id=dep, status=UNKNOWN, deps=None, priority=priority))\n                t.stakeholders.add(worker_id)\n\n        self._update_priority(task, priority, worker_id)\n\n        # Because some tasks (non-dynamic dependencies) are `_make_task`ed\n        # before we know their retry_policy, we always set it here\n        task.retry_policy = retry_policy\n\n        if runnable and status != FAILED and worker.enabled:\n            task.workers.add(worker_id)\n            self._state.get_worker(worker_id).tasks.add(task)\n            task.runnable = runnable\n\n    @rpc_method()\n    def announce_scheduling_failure(self, task_name, family, params, expl, owners, **kwargs):\n        if not self._config.batch_emails:\n            return\n        worker_id = kwargs[\"worker\"]\n        batched_params, _ = self._state.get_batcher(worker_id, family)\n        if batched_params:\n            unbatched_params = {param: value for param, value in params.items() if param not in batched_params}\n        else:\n            unbatched_params = params\n        self._email_batcher.add_scheduling_fail(task_name, family, unbatched_params, expl, owners)\n\n    @rpc_method()\n    def add_worker(self, worker, info, **kwargs):\n        self._state.get_worker(worker).add_info(info)\n\n    @rpc_method()\n    def disable_worker(self, worker):\n        self._state.disable_workers({worker})\n\n    @rpc_method()\n    def set_worker_processes(self, worker, n):\n        self._state.get_worker(worker).add_rpc_message(\"set_worker_processes\", n=n)\n\n    @rpc_method()\n    def send_scheduler_message(self, worker, task, content):\n        if not self._config.send_messages:\n            return {\"message_id\": None}\n\n        message_id = 
str(uuid.uuid4())\n        self._state.get_worker(worker).add_rpc_message(\"dispatch_scheduler_message\", task_id=task, message_id=message_id, content=content)\n\n        return {\"message_id\": message_id}\n\n    @rpc_method()\n    def add_scheduler_message_response(self, task_id, message_id, response):\n        if self._state.has_task(task_id):\n            task = self._state.get_task(task_id)\n            task.scheduler_message_responses[message_id] = response\n\n    @rpc_method()\n    def get_scheduler_message_response(self, task_id, message_id):\n        response = None\n        if self._state.has_task(task_id):\n            task = self._state.get_task(task_id)\n            response = task.scheduler_message_responses.pop(message_id, None)\n        return {\"response\": response}\n\n    @rpc_method()\n    def has_task_history(self):\n        return self._config.record_task_history\n\n    @rpc_method()\n    def is_pause_enabled(self):\n        return {\"enabled\": self._config.pause_enabled}\n\n    @rpc_method()\n    def is_paused(self):\n        return {\"paused\": self._paused}\n\n    @rpc_method()\n    def pause(self):\n        if self._config.pause_enabled:\n            self._paused = True\n\n    @rpc_method()\n    def unpause(self):\n        if self._config.pause_enabled:\n            self._paused = False\n\n    @rpc_method()\n    def update_resources(self, **resources):\n        if self._resources is None:\n            self._resources = {}\n        self._resources.update(resources)\n\n    @rpc_method()\n    def update_resource(self, resource, amount):\n        if not isinstance(amount, int) or amount < 0:\n            return False\n        self._resources[resource] = amount\n        return True\n\n    def _generate_retry_policy(self, task_retry_policy_dict):\n        retry_policy_dict = self._config._get_retry_policy()._asdict()\n        retry_policy_dict.update({k: v for k, v in task_retry_policy_dict.items() if v is not None})\n        return 
RetryPolicy(**retry_policy_dict)\n\n    def _has_resources(self, needed_resources, used_resources):\n        if needed_resources is None:\n            return True\n\n        available_resources = self._resources or {}\n        for resource, amount in needed_resources.items():\n            if amount + used_resources[resource] > available_resources.get(resource, 1):\n                return False\n        return True\n\n    def _used_resources(self):\n        used_resources = collections.defaultdict(int)\n        if self._resources is not None:\n            for task in self._state.get_active_tasks_by_status(RUNNING):\n                resources_running = getattr(task, \"resources_running\", task.resources)\n                if resources_running:\n                    for resource, amount in resources_running.items():\n                        used_resources[resource] += amount\n        return used_resources\n\n    def _rank(self, task):\n        \"\"\"\n        Return worker's rank function for task scheduling.\n\n        :return:\n        \"\"\"\n\n        return task.priority, -task.time\n\n    def _schedulable(self, task):\n        if task.status != PENDING:\n            return False\n        for dep in task.deps:\n            dep_task = self._state.get_task(dep, default=None)\n            if dep_task is None or dep_task.status != DONE:\n                return False\n        return True\n\n    def _reset_orphaned_batch_running_tasks(self, worker_id):\n        running_batch_ids = {task.batch_id for task in self._state.get_active_tasks_by_status(RUNNING) if task.worker_running == worker_id}\n        orphaned_tasks = [\n            task\n            for task in self._state.get_active_tasks_by_status(BATCH_RUNNING)\n            if task.worker_running == worker_id and task.batch_id not in running_batch_ids\n        ]\n        for task in orphaned_tasks:\n            self._state.set_status(task, PENDING)\n\n    @rpc_method()\n    def count_pending(self, worker):\n        
worker_id, worker = worker, self._state.get_worker(worker)\n\n        num_pending, num_unique_pending, num_pending_last_scheduled = 0, 0, 0\n        running_tasks = []\n\n        upstream_status_table = {}\n        for task in worker.get_tasks(self._state, RUNNING):\n            if self._upstream_status(task.id, upstream_status_table) == UPSTREAM_DISABLED:\n                continue\n            # Return a list of currently running tasks to the client,\n            # makes it easier to troubleshoot\n            other_worker = self._state.get_worker(task.worker_running)\n            if other_worker is not None:\n                more_info = {\"task_id\": task.id, \"worker\": str(other_worker)}\n                more_info.update(other_worker.info)\n                running_tasks.append(more_info)\n\n        for task in worker.get_tasks(self._state, PENDING, FAILED):\n            if self._upstream_status(task.id, upstream_status_table) == UPSTREAM_DISABLED:\n                continue\n            num_pending += 1\n            num_unique_pending += int(len(task.workers) == 1)\n            num_pending_last_scheduled += int(task.workers.peek(last=True) == worker_id)\n\n        return {\n            \"n_pending_tasks\": num_pending,\n            \"n_unique_pending\": num_unique_pending,\n            \"n_pending_last_scheduled\": num_pending_last_scheduled,\n            \"worker_state\": worker.state,\n            \"running_tasks\": running_tasks,\n        }\n\n    @rpc_method(allow_null=False)\n    def get_work(self, host=None, assistant=False, current_tasks=None, worker=None, **kwargs):\n        # TODO: remove any expired nodes\n\n        # Algo: iterate over all nodes, find the highest priority node with no dependencies and available\n        # resources.\n\n        # Resource checking looks both at currently available resources and at which resources would\n        # be available if all running tasks died and we rescheduled all workers greedily. 
We do both\n        # checks in order to prevent a worker with many low-priority tasks from starving other\n        # workers with higher priority tasks that share the same resources.\n\n        # TODO: remove tasks that can't be done, figure out if the worker has absolutely\n        # nothing it can wait for\n\n        if self._config.prune_on_get_work:\n            self.prune()\n\n        assert worker is not None\n        worker_id = worker\n        worker = self._update_worker(worker_id, worker_reference={\"host\": host}, get_work=True)\n        if not worker.enabled:\n            reply = {\n                \"n_pending_tasks\": 0,\n                \"running_tasks\": [],\n                \"task_id\": None,\n                \"n_unique_pending\": 0,\n                \"worker_state\": worker.state,\n            }\n            return reply\n\n        if assistant:\n            self.add_worker(worker_id, [(\"assistant\", assistant)])\n\n        batched_params, unbatched_params, batched_tasks, max_batch_size = None, None, [], 1\n        best_task = None\n        if current_tasks is not None:\n            ct_set = set(current_tasks)\n            for task in sorted(self._state.get_active_tasks_by_status(RUNNING), key=self._rank):\n                if task.worker_running == worker_id and task.id not in ct_set:\n                    best_task = task\n\n        if current_tasks is not None:\n            # batch running tasks that weren't claimed since the last get_work go back in the pool\n            self._reset_orphaned_batch_running_tasks(worker_id)\n\n        greedy_resources = collections.defaultdict(int)\n\n        worker = self._state.get_worker(worker_id)\n        if self._paused:\n            relevant_tasks = []\n        elif worker.is_trivial_worker(self._state):\n            relevant_tasks = worker.get_tasks(self._state, PENDING, RUNNING)\n            used_resources = collections.defaultdict(int)\n            greedy_workers = dict()  # If there's no resources, 
then they can grab any task\n        else:\n            relevant_tasks = self._state.get_active_tasks_by_status(PENDING, RUNNING)\n            used_resources = self._used_resources()\n            activity_limit = time.time() - self._config.worker_disconnect_delay\n            active_workers = self._state.get_active_workers(last_get_work_gt=activity_limit)\n            greedy_workers = dict((worker.id, worker.info.get(\"workers\", 1)) for worker in active_workers)\n        tasks = list(relevant_tasks)\n        tasks.sort(key=self._rank, reverse=True)\n\n        for task in tasks:\n            if (\n                best_task\n                and batched_params\n                and task.family == best_task.family\n                and len(batched_tasks) < max_batch_size\n                and task.is_batchable()\n                and all(task.params.get(name) == value for name, value in unbatched_params.items())\n                and task.resources == best_task.resources\n                and self._schedulable(task)\n            ):\n                for name, params in batched_params.items():\n                    params.append(task.params.get(name))\n                batched_tasks.append(task)\n            if best_task:\n                continue\n\n            if task.status == RUNNING and (task.worker_running in greedy_workers):\n                greedy_workers[task.worker_running] -= 1\n                for resource, amount in (getattr(task, \"resources_running\", task.resources) or {}).items():\n                    greedy_resources[resource] += amount\n\n            if self._schedulable(task) and self._has_resources(task.resources, greedy_resources):\n                in_workers = (assistant and task.runnable) or worker_id in task.workers\n                if in_workers and self._has_resources(task.resources, used_resources):\n                    best_task = task\n                    batch_param_names, max_batch_size = self._state.get_batcher(worker_id, task.family)\n          
          if batch_param_names and task.is_batchable():\n                        try:\n                            batched_params = {name: [task.params[name]] for name in batch_param_names}\n                            unbatched_params = {name: value for name, value in task.params.items() if name not in batched_params}\n                            batched_tasks.append(task)\n                        except KeyError:\n                            batched_params, unbatched_params = None, None\n                else:\n                    workers = itertools.chain(task.workers, [worker_id]) if assistant else task.workers\n                    for task_worker in workers:\n                        if greedy_workers.get(task_worker, 0) > 0:\n                            # use up a worker\n                            greedy_workers[task_worker] -= 1\n\n                            # keep track of the resources used in greedy scheduling\n                            for resource, amount in (task.resources or {}).items():\n                                greedy_resources[resource] += amount\n\n                            break\n\n        reply = self.count_pending(worker_id)\n\n        if len(batched_tasks) > 1:\n            batch_string = \"|\".join(task.id for task in batched_tasks)\n            batch_id = hashlib.new(\"md5\", batch_string.encode(\"utf-8\"), usedforsecurity=False).hexdigest()\n            for task in batched_tasks:\n                self._state.set_batch_running(task, batch_id, worker_id)\n\n            combined_params = best_task.params.copy()\n            combined_params.update(batched_params)\n\n            reply[\"task_id\"] = None\n            reply[\"task_family\"] = best_task.family\n            reply[\"task_module\"] = getattr(best_task, \"module\", None)\n            reply[\"task_params\"] = combined_params\n            reply[\"batch_id\"] = batch_id\n            reply[\"batch_task_ids\"] = [task.id for task in batched_tasks]\n\n        elif best_task:\n   
         self.update_metrics_task_started(best_task)\n            self._state.set_status(best_task, RUNNING, self._config)\n            best_task.worker_running = worker_id\n            best_task.resources_running = best_task.resources.copy()\n            best_task.time_running = time.time()\n            self._update_task_history(best_task, RUNNING, host=host)\n\n            reply[\"task_id\"] = best_task.id\n            reply[\"task_family\"] = best_task.family\n            reply[\"task_module\"] = getattr(best_task, \"module\", None)\n            reply[\"task_params\"] = best_task.params\n\n        else:\n            reply[\"task_id\"] = None\n\n        return reply\n\n    @rpc_method(attempts=1)\n    def ping(self, **kwargs):\n        worker_id = kwargs[\"worker\"]\n        worker = self._update_worker(worker_id)\n        return {\"rpc_messages\": worker.fetch_rpc_messages()}\n\n    def _upstream_status(self, task_id, upstream_status_table):\n        if task_id in upstream_status_table:\n            return upstream_status_table[task_id]\n        elif self._state.has_task(task_id):\n            task_stack = [task_id]\n\n            while task_stack:\n                dep_id = task_stack.pop()\n                dep = self._state.get_task(dep_id)\n                if dep:\n                    if dep.status == DONE:\n                        continue\n                    if dep_id not in upstream_status_table:\n                        if dep.status == PENDING and dep.deps:\n                            task_stack += [dep_id] + list(dep.deps)\n                            upstream_status_table[dep_id] = \"\"  # will be updated postorder\n                        else:\n                            dep_status = STATUS_TO_UPSTREAM_MAP.get(dep.status, \"\")\n                            upstream_status_table[dep_id] = dep_status\n                    elif upstream_status_table[dep_id] == \"\" and dep.deps:\n                        # This is the postorder update step when we set 
the\n                        # status based on the previously calculated child elements\n                        status = max((upstream_status_table.get(a_task_id, \"\") for a_task_id in dep.deps), key=UPSTREAM_SEVERITY_KEY)\n                        upstream_status_table[dep_id] = status\n            return upstream_status_table[dep_id]\n\n    def _serialize_task(self, task_id, include_deps=True, deps=None):\n        task = self._state.get_task(task_id)\n\n        ret = {\n            \"display_name\": task.pretty_id,\n            \"status\": task.status,\n            \"workers\": list(task.workers),\n            \"worker_running\": task.worker_running,\n            \"time_running\": getattr(task, \"time_running\", None),\n            \"start_time\": task.time,\n            \"last_updated\": getattr(task, \"updated\", task.time),\n            \"params\": task.public_params,\n            \"name\": task.family,\n            \"priority\": task.priority,\n            \"resources\": task.resources,\n            \"resources_running\": getattr(task, \"resources_running\", None),\n            \"tracking_url\": getattr(task, \"tracking_url\", None),\n            \"status_message\": getattr(task, \"status_message\", None),\n            \"progress_percentage\": getattr(task, \"progress_percentage\", None),\n        }\n        if task.status == DISABLED:\n            ret[\"re_enable_able\"] = task.scheduler_disable_time is not None\n        if include_deps:\n            ret[\"deps\"] = list(task.deps if deps is None else deps)\n        if self._config.send_messages and task.status == RUNNING:\n            ret[\"accepts_messages\"] = task.accepts_messages\n        return ret\n\n    @rpc_method()\n    def graph(self, **kwargs):\n        self.prune()\n        serialized = {}\n        seen = set()\n        for task in self._state.get_active_tasks():\n            serialized.update(self._traverse_graph(task.id, seen))\n        return serialized\n\n    def _filter_done(self, 
task_ids):\n        for task_id in task_ids:\n            task = self._state.get_task(task_id)\n            if task is None or task.status != DONE:\n                yield task_id\n\n    def _traverse_graph(self, root_task_id, seen=None, dep_func=None, include_done=True):\n        \"\"\"Returns the dependency graph rooted at task_id\n\n        This does a breadth-first traversal to find the nodes closest to the\n        root before hitting the scheduler.max_graph_nodes limit.\n\n        :param root_task_id: the id of the graph's root\n        :return: A map of task id to serialized node\n        \"\"\"\n\n        if seen is None:\n            seen = set()\n        elif root_task_id in seen:\n            return {}\n\n        if dep_func is None:\n\n            def dep_func(t):\n                return t.deps\n\n        seen.add(root_task_id)\n        serialized = {}\n        queue = collections.deque([root_task_id])\n        while queue:\n            task_id = queue.popleft()\n\n            task = self._state.get_task(task_id)\n            if task is None or not task.family:\n                logger.debug(\"Missing task for id [%s]\", task_id)\n\n                # NOTE : If a dependency is missing from self._state there is no way to deduce the\n                #        task family and parameters.\n                family_match = TASK_FAMILY_RE.match(task_id)\n                family = family_match.group(1) if family_match else UNKNOWN\n                params = {\"task_id\": task_id}\n                serialized[task_id] = {\n                    \"deps\": [],\n                    \"status\": UNKNOWN,\n                    \"workers\": [],\n                    \"start_time\": UNKNOWN,\n                    \"params\": params,\n                    \"name\": family,\n                    \"display_name\": task_id,\n                    \"priority\": 0,\n                }\n            else:\n                deps = dep_func(task)\n                if not include_done:\n              
      deps = list(self._filter_done(deps))\n                serialized[task_id] = self._serialize_task(task_id, deps=deps)\n                for dep in sorted(deps):\n                    if dep not in seen:\n                        seen.add(dep)\n                        queue.append(dep)\n\n            if task_id != root_task_id:\n                del serialized[task_id][\"display_name\"]\n            if len(serialized) >= self._config.max_graph_nodes:\n                break\n\n        return serialized\n\n    @rpc_method()\n    def dep_graph(self, task_id, include_done=True, **kwargs):\n        self.prune()\n        if not self._state.has_task(task_id):\n            return {}\n        return self._traverse_graph(task_id, include_done=include_done)\n\n    @rpc_method()\n    def inverse_dep_graph(self, task_id, include_done=True, **kwargs):\n        self.prune()\n        if not self._state.has_task(task_id):\n            return {}\n        inverse_graph = collections.defaultdict(set)\n        for task in self._state.get_active_tasks():\n            for dep in task.deps:\n                inverse_graph[dep].add(task.id)\n        return self._traverse_graph(task_id, dep_func=lambda t: inverse_graph[t.id], include_done=include_done)\n\n    @rpc_method()\n    def task_list(self, status=\"\", upstream_status=\"\", limit=True, search=None, max_shown_tasks=None, **kwargs):\n        \"\"\"\n        Query for a subset of tasks by status.\n        \"\"\"\n        if not search:\n            count_limit = max_shown_tasks or self._config.max_shown_tasks\n            pre_count = self._state.get_active_task_count_for_status(status)\n            if limit and pre_count > count_limit:\n                return {\"num_tasks\": -1 if upstream_status else pre_count}\n        self.prune()\n\n        result = {}\n        upstream_status_table = {}  # used to memoize upstream status\n        if search is None:\n\n            def filter_func(_):\n                return True\n        else:\n     
       terms = search.split()\n\n            def filter_func(t):\n                return all(term.casefold() in t.pretty_id.casefold() for term in terms)\n\n        tasks = self._state.get_active_tasks_by_status(status) if status else self._state.get_active_tasks()\n        for task in filter(filter_func, tasks):\n            if task.status != PENDING or not upstream_status or upstream_status == self._upstream_status(task.id, upstream_status_table):\n                serialized = self._serialize_task(task.id, include_deps=False)\n                result[task.id] = serialized\n        if limit and len(result) > (max_shown_tasks or self._config.max_shown_tasks):\n            return {\"num_tasks\": len(result)}\n        return result\n\n    def _first_task_display_name(self, worker):\n        task_id = worker.info.get(\"first_task\", \"\")\n        if self._state.has_task(task_id):\n            return self._state.get_task(task_id).pretty_id\n        else:\n            return task_id\n\n    @rpc_method()\n    def worker_list(self, include_running=True, **kwargs):\n        self.prune()\n        workers = [\n            dict(\n                name=worker.id,\n                last_active=worker.last_active,\n                started=worker.started,\n                state=worker.state,\n                first_task_display_name=self._first_task_display_name(worker),\n                num_unread_rpc_messages=len(worker.rpc_messages),\n                **worker.info,\n            )\n            for worker in self._state.get_active_workers()\n        ]\n        workers.sort(key=lambda worker: worker[\"started\"], reverse=True)\n        if include_running:\n            running = collections.defaultdict(dict)\n            for task in self._state.get_active_tasks_by_status(RUNNING):\n                if task.worker_running:\n                    running[task.worker_running][task.id] = self._serialize_task(task.id, include_deps=False)\n\n            num_pending = 
collections.defaultdict(int)\n            num_uniques = collections.defaultdict(int)\n            for task in self._state.get_active_tasks_by_status(PENDING):\n                for worker in task.workers:\n                    num_pending[worker] += 1\n                if len(task.workers) == 1:\n                    num_uniques[list(task.workers)[0]] += 1\n\n            for worker in workers:\n                tasks = running[worker[\"name\"]]\n                worker[\"num_running\"] = len(tasks)\n                worker[\"num_pending\"] = num_pending[worker[\"name\"]]\n                worker[\"num_uniques\"] = num_uniques[worker[\"name\"]]\n                worker[\"running\"] = tasks\n        return workers\n\n    @rpc_method()\n    def resource_list(self):\n        \"\"\"\n        Resources usage info and their consumers (tasks).\n        \"\"\"\n        self.prune()\n        resources = [dict(name=resource, num_total=r_dict[\"total\"], num_used=r_dict[\"used\"]) for resource, r_dict in self.resources().items()]\n        if self._resources is not None:\n            consumers = collections.defaultdict(dict)\n            for task in self._state.get_active_tasks_by_status(RUNNING):\n                if task.status == RUNNING and task.resources:\n                    for resource, amount in task.resources.items():\n                        consumers[resource][task.id] = self._serialize_task(task.id, include_deps=False)\n            for resource in resources:\n                tasks = consumers[resource[\"name\"]]\n                resource[\"num_consumer\"] = len(tasks)\n                resource[\"running\"] = tasks\n        return resources\n\n    def resources(self):\n        \"\"\"get total resources and available ones\"\"\"\n        used_resources = self._used_resources()\n        ret = collections.defaultdict(dict)\n        for resource, total in self._resources.items():\n            ret[resource][\"total\"] = total\n            if resource in used_resources:\n            
    ret[resource][\"used\"] = used_resources[resource]\n            else:\n                ret[resource][\"used\"] = 0\n        return ret\n\n    @rpc_method()\n    def task_search(self, task_str, **kwargs):\n        \"\"\"\n        Query for a subset of tasks by task_id.\n\n        :param task_str:\n        :return:\n        \"\"\"\n        self.prune()\n        result = collections.defaultdict(dict)\n        for task in self._state.get_active_tasks():\n            if task.id.find(task_str) != -1:\n                serialized = self._serialize_task(task.id, include_deps=False)\n                result[task.status][task.id] = serialized\n        return result\n\n    @rpc_method()\n    def re_enable_task(self, task_id):\n        serialized = {}\n        task = self._state.get_task(task_id)\n        if task and task.status == DISABLED and task.scheduler_disable_time:\n            self._state.re_enable(task, self._config)\n            serialized = self._serialize_task(task_id)\n        return serialized\n\n    @rpc_method()\n    def fetch_error(self, task_id, **kwargs):\n        if self._state.has_task(task_id):\n            task = self._state.get_task(task_id)\n            return {\n                \"taskId\": task_id,\n                \"error\": task.expl,\n                \"displayName\": task.pretty_id,\n                \"taskParams\": task.params,\n                \"taskModule\": task.module,\n                \"taskFamily\": task.family,\n            }\n        else:\n            return {\"taskId\": task_id, \"error\": \"\"}\n\n    @rpc_method()\n    def set_task_status_message(self, task_id, status_message):\n        if self._state.has_task(task_id):\n            task = self._state.get_task(task_id)\n            task.status_message = status_message\n            if task.status == RUNNING and task.batch_id is not None:\n                for batch_task in self._state.get_batch_running_tasks(task.batch_id):\n                    batch_task.status_message = 
status_message\n\n    @rpc_method()\n    def get_task_status_message(self, task_id):\n        if self._state.has_task(task_id):\n            task = self._state.get_task(task_id)\n            return {\"taskId\": task_id, \"statusMessage\": task.status_message}\n        else:\n            return {\"taskId\": task_id, \"statusMessage\": \"\"}\n\n    @rpc_method()\n    def set_task_progress_percentage(self, task_id, progress_percentage):\n        if self._state.has_task(task_id):\n            task = self._state.get_task(task_id)\n            task.progress_percentage = progress_percentage\n            if task.status == RUNNING and task.batch_id is not None:\n                for batch_task in self._state.get_batch_running_tasks(task.batch_id):\n                    batch_task.progress_percentage = progress_percentage\n\n    @rpc_method()\n    def get_task_progress_percentage(self, task_id):\n        if self._state.has_task(task_id):\n            task = self._state.get_task(task_id)\n            return {\"taskId\": task_id, \"progressPercentage\": task.progress_percentage}\n        else:\n            return {\"taskId\": task_id, \"progressPercentage\": None}\n\n    @rpc_method()\n    def decrease_running_task_resources(self, task_id, decrease_resources):\n        if self._state.has_task(task_id):\n            task = self._state.get_task(task_id)\n            if task.status != RUNNING:\n                return\n\n            def decrease(resources, decrease_resources):\n                for resource, decrease_amount in decrease_resources.items():\n                    if decrease_amount > 0 and resource in resources:\n                        resources[resource] = max(0, resources[resource] - decrease_amount)\n\n            decrease(task.resources_running, decrease_resources)\n            if task.batch_id is not None:\n                for batch_task in self._state.get_batch_running_tasks(task.batch_id):\n                    decrease(batch_task.resources_running, 
decrease_resources)\n\n    @rpc_method()\n    def get_running_task_resources(self, task_id):\n        if self._state.has_task(task_id):\n            task = self._state.get_task(task_id)\n            return {\"taskId\": task_id, \"resources\": getattr(task, \"resources_running\", None)}\n        else:\n            return {\"taskId\": task_id, \"resources\": None}\n\n    def _update_task_history(self, task, status, host=None):\n        try:\n            if status == DONE or status == FAILED:\n                successful = status == DONE\n                self._task_history.task_finished(task, successful)\n            elif status == PENDING:\n                self._task_history.task_scheduled(task)\n            elif status == RUNNING:\n                self._task_history.task_started(task, host)\n        except BaseException:\n            logger.warning(\"Error saving Task history\", exc_info=True)\n\n    @property\n    def task_history(self):\n        # Used by server.py to expose the calls\n        return self._task_history\n\n    @rpc_method()\n    def update_metrics_task_started(self, task):\n        self._state._metrics_collector.handle_task_started(task)\n\n    @rpc_method()\n    def report_task_statistics(self, task_id, statistics):\n        if self._state.has_task(task_id):\n            task = self._state.get_task(task_id)\n            self._state._metrics_collector.handle_task_statistics(task, statistics)\n"
  },
  {
    "path": "luigi/server.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nSimple REST server that takes commands in a JSON payload\nInterface to the :py:class:`~luigi.scheduler.Scheduler` class.\nSee :doc:`/central_scheduler` for more info.\n\"\"\"\n#\n# Description: Added codes for visualization of how long each task takes\n# running-time until it reaches the next status (failed or done)\n# At \"{base_url}/tasklist\", all completed(failed or done) tasks are shown.\n# At \"{base_url}/tasklist\", a user can select one specific task to see\n# how its running-time has changed over time.\n# At \"{base_url}/tasklist/{task_name}\", it visualizes a multi-bar graph\n# that represents the changes of the running-time for a selected task\n# up to the next status (failed or done).\n# This visualization let us know how the running-time of the specific task\n# has changed over time.\n#\n# Copyright 2015 Naver Corp.\n# Author Yeseul Park (yeseul.park@navercorp.com)\n#\n\nimport atexit\nimport datetime\nimport importlib\nimport json\nimport logging\nimport os\nimport signal\nimport sys\nimport time\n\nimport tornado.httpserver\nimport tornado.ioloop\nimport tornado.netutil\nimport tornado.web\n\nfrom luigi import Config, parameter\nfrom luigi.scheduler import RPC_METHODS, Scheduler\n\nlogger = logging.getLogger(\"luigi.server\")\n\n\nclass cors(Config):\n    enabled = parameter.BoolParameter(default=False, 
description=\"Enables CORS support.\")\n    allowed_origins = parameter.ListParameter(default=(), description=\"A list of allowed origins. Used only if `allow_any_origin` is false.\")\n    allow_any_origin = parameter.BoolParameter(default=False, description=\"Accepts requests from any origin.\")\n    allow_null_origin = parameter.BoolParameter(default=False, description=\"Allows the request to set `null` value of the `Origin` header.\")\n    max_age = parameter.IntParameter(default=86400, description=\"Content of `Access-Control-Max-Age`.\")\n    allowed_methods = parameter.Parameter(default=\"GET, OPTIONS\", description=\"Content of `Access-Control-Allow-Methods`.\")\n    allowed_headers = parameter.Parameter(default=\"Accept, Content-Type, Origin\", description=\"Content of `Access-Control-Allow-Headers`.\")\n    exposed_headers = parameter.Parameter(default=\"\", description=\"Content of `Access-Control-Expose-Headers`.\")\n    allow_credentials = parameter.BoolParameter(default=False, description=\"Indicates that the actual request can include user credentials.\")\n\n    def __init__(self, *args, **kwargs):\n        super(cors, self).__init__(*args, **kwargs)\n        self.allowed_origins = set(i for i in self.allowed_origins if i not in [\"*\", \"null\"])\n\n\nclass RPCHandler(tornado.web.RequestHandler):\n    \"\"\"\n    Handle remote scheduling calls using rpc.RemoteSchedulerResponder.\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        super(RPCHandler, self).__init__(*args, **kwargs)\n        self._cors_config = cors()\n\n    def initialize(self, scheduler):\n        self._scheduler = scheduler\n\n    def options(self, *args):\n        if self._cors_config.enabled:\n            self._handle_cors_preflight()\n\n        self.set_status(204)\n        self.finish()\n\n    def get(self, method):\n        if method not in RPC_METHODS:\n            self.send_error(404)\n            return\n        payload = self.get_argument(\"data\", 
default=\"{}\")\n        arguments = json.loads(payload)\n\n        if hasattr(self._scheduler, method):\n            result = getattr(self._scheduler, method)(**arguments)\n\n            if self._cors_config.enabled:\n                self._handle_cors()\n\n            self.write({\"response\": result})  # wrap all json response in a dictionary\n        else:\n            self.send_error(404)\n\n    post = get\n\n    def _handle_cors_preflight(self):\n        origin = self.request.headers.get(\"Origin\")\n        if not origin:\n            return\n\n        if origin == \"null\":\n            if self._cors_config.allow_null_origin:\n                self.set_header(\"Access-Control-Allow-Origin\", \"null\")\n                self._set_other_cors_headers()\n        else:\n            if self._cors_config.allow_any_origin:\n                self.set_header(\"Access-Control-Allow-Origin\", \"*\")\n                self._set_other_cors_headers()\n            elif origin in self._cors_config.allowed_origins:\n                self.set_header(\"Access-Control-Allow-Origin\", origin)\n                self._set_other_cors_headers()\n\n    def _handle_cors(self):\n        origin = self.request.headers.get(\"Origin\")\n        if not origin:\n            return\n\n        if origin == \"null\":\n            if self._cors_config.allow_null_origin:\n                self.set_header(\"Access-Control-Allow-Origin\", \"null\")\n        else:\n            if self._cors_config.allow_any_origin:\n                self.set_header(\"Access-Control-Allow-Origin\", \"*\")\n            elif origin in self._cors_config.allowed_origins:\n                self.set_header(\"Access-Control-Allow-Origin\", origin)\n                self.set_header(\"Vary\", \"Origin\")\n\n    def _set_other_cors_headers(self):\n        self.set_header(\"Access-Control-Max-Age\", str(self._cors_config.max_age))\n        self.set_header(\"Access-Control-Allow-Methods\", self._cors_config.allowed_methods)\n        
self.set_header(\"Access-Control-Allow-Headers\", self._cors_config.allowed_headers)\n        if self._cors_config.allow_credentials:\n            self.set_header(\"Access-Control-Allow-Credentials\", \"true\")\n        if self._cors_config.exposed_headers:\n            self.set_header(\"Access-Control-Expose-Headers\", self._cors_config.exposed_headers)\n\n\nclass BaseTaskHistoryHandler(tornado.web.RequestHandler):\n    def initialize(self, scheduler):\n        self._scheduler = scheduler\n\n    def get_template_path(self):\n        return importlib.resources.files(\"templates\").name\n\n\nclass AllRunHandler(BaseTaskHistoryHandler):\n    def get(self):\n        all_tasks = self._scheduler.task_history.find_all_runs()\n        tasknames = [task.name for task in all_tasks]\n        # show all tasks with their name list to be selected\n        # why all tasks? the duration of the event history of a selected task\n        # can be more than 24 hours.\n        self.render(\"menu.html\", tasknames=tasknames)\n\n\nclass SelectedRunHandler(BaseTaskHistoryHandler):\n    def get(self, name):\n        statusResults = {}\n        taskResults = []\n        # get all tasks that has been updated\n        all_tasks = self._scheduler.task_history.find_all_runs()\n        # get events history for all tasks\n        all_tasks_event_history = self._scheduler.task_history.find_all_events()\n\n        # build the dictionary tasks with index: id, value: task_name\n        tasks = {task.id: str(task.name) for task in all_tasks}\n\n        for task in all_tasks_event_history:\n            # if the name of user-selected task is in tasks, get its task_id\n            if tasks.get(task.task_id) == str(name):\n                status = str(task.event_name)\n                if status not in statusResults:\n                    statusResults[status] = []\n                # append the id, task_id, ts, y with 0, next_process with null\n                # for the status(running/failed/done) of the 
selected task\n                statusResults[status].append(({\"id\": str(task.id), \"task_id\": str(task.task_id), \"x\": from_utc(str(task.ts)), \"y\": 0, \"next_process\": \"\"}))\n                # append the id, task_name, task_id, status, datetime, timestamp\n                # for the selected task\n                taskResults.append(\n                    {\n                        \"id\": str(task.id),\n                        \"taskName\": str(name),\n                        \"task_id\": str(task.task_id),\n                        \"status\": str(task.event_name),\n                        \"datetime\": str(task.ts),\n                        \"timestamp\": from_utc(str(task.ts)),\n                    }\n                )\n        statusResults = json.dumps(statusResults)\n        taskResults = json.dumps(taskResults)\n        statusResults = tornado.escape.xhtml_unescape(str(statusResults))\n        taskResults = tornado.escape.xhtml_unescape(str(taskResults))\n        self.render(\"history.html\", name=name, statusResults=statusResults, taskResults=taskResults)\n\n\ndef from_utc(utcTime, fmt=None):\n    \"\"\"convert UTC time string to time.struct_time: change datetime.datetime to time, return time.struct_time type\"\"\"\n    if fmt is None:\n        try_formats = [\"%Y-%m-%d %H:%M:%S.%f\", \"%Y-%m-%d %H:%M:%S\"]\n    else:\n        try_formats = [fmt]\n\n    for fmt in try_formats:\n        try:\n            time_struct = datetime.datetime.strptime(utcTime, fmt)\n        except ValueError:\n            pass\n        else:\n            date = int(time.mktime(time_struct.timetuple()))\n            return date\n    else:\n        raise ValueError(\"No UTC format matches {}\".format(utcTime))\n\n\nclass RecentRunHandler(BaseTaskHistoryHandler):\n    def get(self):\n        with self._scheduler.task_history._session(None) as session:\n            tasks = self._scheduler.task_history.find_latest_runs(session)\n            self.render(\"recent.html\", 
tasks=tasks)\n\n\nclass ByNameHandler(BaseTaskHistoryHandler):\n    def get(self, name):\n        with self._scheduler.task_history._session(None) as session:\n            tasks = self._scheduler.task_history.find_all_by_name(name, session)\n            self.render(\"recent.html\", tasks=tasks)\n\n\nclass ByIdHandler(BaseTaskHistoryHandler):\n    def get(self, id):\n        with self._scheduler.task_history._session(None) as session:\n            task = self._scheduler.task_history.find_task_by_id(id, session)\n            self.render(\"show.html\", task=task)\n\n\nclass ByTaskIdHandler(BaseTaskHistoryHandler):\n    def get(self, task_id):\n        with self._scheduler.task_history._session(None) as session:\n            task = self._scheduler.task_history.find_task_by_task_id(task_id, session)\n            self.render(\"show.html\", task=task)\n\n\nclass ByParamsHandler(BaseTaskHistoryHandler):\n    def get(self, name):\n        payload = self.get_argument(\"data\", default=\"{}\")\n        arguments = json.loads(payload)\n        with self._scheduler.task_history._session(None) as session:\n            tasks = self._scheduler.task_history.find_all_by_parameters(name, session=session, **arguments)\n            self.render(\"recent.html\", tasks=tasks)\n\n\nclass RootPathHandler(BaseTaskHistoryHandler):\n    def get(self):\n        # we omit the leading slash in case the visualizer is behind a different\n        # path (as in a reverse proxy setup)\n        #\n        # For example, if luigi is behind my.app.com/my/luigi/, we want / to\n        # redirect relative (so it goes to my.app.com/my/luigi/static/visualizer/index.html)\n        # instead of absolute (which would be my.app.com/static/visualizer/index.html)\n        self.redirect(\"static/visualiser/index.html\")\n\n    def head(self):\n        \"\"\"HEAD endpoint for health checking the scheduler\"\"\"\n        self.set_status(204)\n        self.finish()\n\n\nclass 
MetricsHandler(tornado.web.RequestHandler):\n    def initialize(self, scheduler):\n        self._scheduler = scheduler\n\n    def get(self):\n        metrics_collector = self._scheduler._state._metrics_collector\n        metrics = metrics_collector.generate_latest()\n        if metrics:\n            metrics_collector.configure_http_handler(self)\n            self.write(metrics)\n\n\ndef app(scheduler):\n    settings = {\n        \"static_path\": os.path.join(os.path.dirname(__file__), \"static\"),\n        \"unescape\": tornado.escape.xhtml_unescape,\n        \"compress_response\": True,\n    }\n    handlers = [\n        (r\"/api/(.*)\", RPCHandler, {\"scheduler\": scheduler}),\n        (r\"/\", RootPathHandler, {\"scheduler\": scheduler}),\n        (r\"/tasklist\", AllRunHandler, {\"scheduler\": scheduler}),\n        (r\"/tasklist/(.*?)\", SelectedRunHandler, {\"scheduler\": scheduler}),\n        (r\"/history\", RecentRunHandler, {\"scheduler\": scheduler}),\n        (r\"/history/by_name/(.*?)\", ByNameHandler, {\"scheduler\": scheduler}),\n        (r\"/history/by_id/(.*?)\", ByIdHandler, {\"scheduler\": scheduler}),\n        (r\"/history/by_task_id/(.*?)\", ByTaskIdHandler, {\"scheduler\": scheduler}),\n        (r\"/history/by_params/(.*?)\", ByParamsHandler, {\"scheduler\": scheduler}),\n        (r\"/metrics\", MetricsHandler, {\"scheduler\": scheduler}),\n    ]\n    api_app = tornado.web.Application(handlers, **settings)\n    return api_app\n\n\ndef _init_api(scheduler, api_port=None, address=None, unix_socket=None):\n    api_app = app(scheduler)\n    if unix_socket is not None:\n        api_sockets = [tornado.netutil.bind_unix_socket(unix_socket)]\n    else:\n        api_sockets = tornado.netutil.bind_sockets(api_port, address=address)\n    server = tornado.httpserver.HTTPServer(api_app)\n    server.add_sockets(api_sockets)\n\n    # Return the bound socket names.  
Useful for connecting client in test scenarios.\n    return [s.getsockname() for s in api_sockets]\n\n\ndef run(api_port=8082, address=None, unix_socket=None, scheduler=None):\n    \"\"\"\n    Runs one instance of the API server.\n    \"\"\"\n    if scheduler is None:\n        scheduler = Scheduler()\n\n    # load scheduler state\n    scheduler.load()\n\n    _init_api(\n        scheduler=scheduler,\n        api_port=api_port,\n        address=address,\n        unix_socket=unix_socket,\n    )\n\n    # prune work DAG every 60 seconds\n    pruner = tornado.ioloop.PeriodicCallback(scheduler.prune, 60000)\n    pruner.start()\n\n    def shutdown_handler(signum, frame):\n        exit_handler()\n        sys.exit(0)\n\n    @atexit.register\n    def exit_handler():\n        logger.info(\"Scheduler instance shutting down\")\n        scheduler.dump()\n        stop()\n\n    signal.signal(signal.SIGINT, shutdown_handler)\n    signal.signal(signal.SIGTERM, shutdown_handler)\n    if os.name == \"nt\":\n        signal.signal(signal.SIGBREAK, shutdown_handler)\n    else:\n        signal.signal(signal.SIGQUIT, shutdown_handler)\n\n    logger.info(\"Scheduler starting up\")\n\n    tornado.ioloop.IOLoop.instance().start()\n\n\ndef stop():\n    tornado.ioloop.IOLoop.instance().stop()\n\n\nif __name__ == \"__main__\":\n    run()\n"
  },
  {
    "path": "luigi/setup_logging.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2018 Vote Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nThis module contains helper classes for configuring logging for luigid and\nworkers via command line arguments and options from config files.\n\"\"\"\n\nimport logging\nimport logging.config\nimport os.path\nfrom configparser import NoSectionError\n\nfrom luigi.configuration import LuigiConfigParser, get_config\nfrom luigi.freezing import recursively_unfreeze\n\n\nclass BaseLogging:\n    config = get_config()\n\n    @classmethod\n    def _section(cls, opts):\n        \"\"\"Get logging settings from config file section \"logging\".\"\"\"\n        if isinstance(cls.config, LuigiConfigParser):\n            return False\n        try:\n            logging_config = cls.config[\"logging\"]\n        except (TypeError, KeyError, NoSectionError):\n            return False\n        logging.config.dictConfig(recursively_unfreeze(logging_config))\n        return True\n\n    @classmethod\n    def setup(cls, opts=type(\"opts\", (), {\"background\": None, \"logdir\": None, \"logging_conf_file\": None, \"log_level\": \"DEBUG\"})):\n        \"\"\"Setup logging via CLI params and config.\"\"\"\n        logger = logging.getLogger(\"luigi\")\n\n        if cls._configured:\n            logger.info(\"logging already configured\")\n            return False\n        cls._configured = True\n\n        if cls.config.getboolean(\"core\", \"no_configure_logging\", 
False):\n            logger.info(\"logging disabled in settings\")\n            return False\n\n        configured = cls._cli(opts)\n        if configured:\n            logger = logging.getLogger(\"luigi\")\n            logger.info(\"logging configured via special settings\")\n            return True\n\n        configured = cls._conf(opts)\n        if configured:\n            logger = logging.getLogger(\"luigi\")\n            logger.info(\"logging configured via *.conf file\")\n            return True\n\n        configured = cls._section(opts)\n        if configured:\n            logger = logging.getLogger(\"luigi\")\n            logger.info(\"logging configured via config section\")\n            return True\n\n        configured = cls._default(opts)\n        if configured:\n            logger = logging.getLogger(\"luigi\")\n            logger.info(\"logging configured by default settings\")\n        return configured\n\n\nclass DaemonLogging(BaseLogging):\n    \"\"\"Configure logging for luigid\"\"\"\n\n    _configured = False\n    _log_format = \"%(asctime)s %(name)s[%(process)s] %(levelname)s: %(message)s\"\n\n    @classmethod\n    def _cli(cls, opts):\n        \"\"\"Setup logging via CLI options\n\n        If `--background` -- set INFO level for root logger.\n        If `--logdir` -- set logging with next params:\n            default Luigi's formatter,\n            INFO level,\n            output in logdir in `luigi-server.log` file\n        \"\"\"\n        if opts.background:\n            logging.getLogger().setLevel(logging.INFO)\n            return True\n\n        if opts.logdir:\n            logging.basicConfig(level=logging.INFO, format=cls._log_format, filename=os.path.join(opts.logdir, \"luigi-server.log\"))\n            return True\n\n        return False\n\n    @classmethod\n    def _conf(cls, opts):\n        \"\"\"Setup logging via ini-file from logging_conf_file option.\"\"\"\n        logging_conf = cls.config.get(\"core\", \"logging_conf_file\", 
None)\n        if logging_conf is None:\n            return False\n\n        if not os.path.exists(logging_conf):\n            # FileNotFoundError added only in Python 3.3\n            # https://docs.python.org/3/whatsnew/3.3.html#pep-3151-reworking-the-os-and-io-exception-hierarchy\n            raise OSError(\"Error: Unable to locate specified logging configuration file!\")\n\n        logging.config.fileConfig(logging_conf)\n        return True\n\n    @classmethod\n    def _default(cls, opts):\n        \"\"\"Setup default logger\"\"\"\n        logging.basicConfig(level=logging.INFO, format=cls._log_format)\n        return True\n\n\n# Part of this logic taken from the dropped function \"setup_interface_logging\"\nclass InterfaceLogging(BaseLogging):\n    \"\"\"Configure logging for worker\"\"\"\n\n    _configured = False\n\n    @classmethod\n    def _cli(cls, opts):\n        return False\n\n    @classmethod\n    def _conf(cls, opts):\n        \"\"\"Setup logging via ini-file from logging_conf_file option.\"\"\"\n        if not opts.logging_conf_file:\n            return False\n\n        if not os.path.exists(opts.logging_conf_file):\n            # FileNotFoundError added only in Python 3.3\n            # https://docs.python.org/3/whatsnew/3.3.html#pep-3151-reworking-the-os-and-io-exception-hierarchy\n            raise OSError(\"Error: Unable to locate specified logging configuration file!\")\n\n        logging.config.fileConfig(opts.logging_conf_file, disable_existing_loggers=False)\n        return True\n\n    @classmethod\n    def _default(cls, opts):\n        \"\"\"Setup default logger\"\"\"\n        level = getattr(logging, opts.log_level, logging.DEBUG)\n\n        logger = logging.getLogger(\"luigi-interface\")\n        logger.setLevel(level)\n\n        stream_handler = logging.StreamHandler()\n        stream_handler.setLevel(level)\n\n        formatter = logging.Formatter(\"%(levelname)s: %(message)s\")\n        stream_handler.setFormatter(formatter)\n\n        
logger.addHandler(stream_handler)\n        return True\n"
  },
  {
    "path": "luigi/static/visualiser/css/luigi.css",
    "content": ".nodeCircle {\n  stroke: #fff;\n  stroke-width: 1.5px;\n}\ntext {\n    font-size:8pt;\n}\n\n.link {\n  stroke: #999;\n  stroke-opacity: .6;\n}\n\nsvg {\n    border:1px solid #DDDDDD;\n    overflow: inherit;\n}\n\n.taskRow {\n  word-break:break-all;\n}\n\n\n@-webkit-keyframes flash {\n  0%, 50%, 100% {\n    opacity: 1;\n  }\n\n  25%, 75% {\n    opacity: 0.2;\n  }\n}\n\n@keyframes flash {\n  0%, 50%, 100% {\n    opacity: 1;\n  }\n\n  25%, 75% {\n    opacity: 0.2;\n  }\n}\n\n.RUNNING {\n  -webkit-animation-duration: 5s;\n  animation-duration: 5s;\n  -webkit-animation-fill-mode: both;\n  animation-fill-mode: both;\n  -webkit-animation-iteration-count: 1;\n  animation-iteration-count: 1;\n}\n\n.live.map {\n  width: 100%;\n  height: 600px;\n  background: #333;\n}\n\n.live.map text {\n  font-weight: 300;\n  font-size: 14px;\n}\n\n.live.map .node rect {\n  stroke-width: 1.5px;\n  stroke: #bbb;\n  fill: #666;\n}\n\n.live.map .status {\n  height: 100%;\n  width: 15px;\n  display: block;\n  float: left;\n  border-top-left-radius: 5px;\n  border-bottom-left-radius: 5px;\n  margin-right: 4px;\n}\n\n.live.map .DONE .status {\n  background-color: #7f7;\n}\n\n.live.map .RUNNING .status {\n  background-color: #7f7;\n}\n\n.live.map .PENDING .status {\n  background-color: #FFFF46;\n}\n\n.live.map .ERROR .status {\n  background-color: #f77;\n}\n\n.live.map .FAILED .status {\n  background-color: #dd4b39;\n}\n\n.live.map .RUNNING .queue {\n  color: #f77;\n}\n\n.live.map .DISABLED .status {\n  background-color: #aaaaaa;\n}\n\n.RUNNING {\n  -webkit-animation-name: flash;\n  animation-name: flash;\n}\n\n.live.map .consumers {\n  margin-right: 2px;\n}\n\n.live.map .consumers,\n.live.map .name {\n  margin-top: 4px;\n}\n\n.live.map .consumers:after {\n  content: \"x\";\n}\n\n.live.map .queue {\n  display: block;\n  float: left;\n  width: 130px;\n  height: 20px;\n  font-size: 12px;\n  margin-top: 2px;\n}\n\n.live.map .node g div {\n  width: 200px;\n  height: 40px;\n  color: 
#fff;\n}\n\n.live.map .node g div span.consumers {\n  display: inline-block;\n  width: 20px;\n}\n\n.live.map .edgeLabel text {\n  width: 50px;\n  fill: #fff;\n}\n\n.live.map .edgePath path {\n  stroke: #999;\n  stroke-width: 1.5px;\n  fill: #999;\n}\n\ntd.details-control {\n  cursor: pointer;\n}\n\nspan.status-icon {\n  border-top-left-radius: 2px;\n  border-top-right-radius: 2px;\n  border-bottom-right-radius: 2px;\n  border-bottom-left-radius: 2px;\n  display: inline-block;\n  height: 24px;\n  width: 24px;\n  text-align: center;\n  font-size: 12px;\n  line-height: 24px;\n\n}\n\n#serverSide {\n  float: right;\n  margin: 4px;\n}\n\n.infoBar {\n  min-height: 80px;\n}\n\n#taskTable_filter {\n  margin-top: 9px;\n}\n\n#loadTaskForm input {\n  width: 20em;\n}\n\n#workerList .box-tools > div {\n  display: inline-block;\n}\n\n#workerList .btn-set-workers > span.caret {\n  margin-left: 4px;\n}\n\n#workerList .box-tools > span.label-unread-worker-messages {\n  margin-right: 6px;\n  vertical-align: middle;\n  font-style: italic;\n  color: red;\n}\n\n#resourceList i.resources-collapse {\n  padding-left: 10px;\n}\n\n#clear-task-filter {\n  margin-left: 20px;\n  cursor: pointer;\n}\n\n#clear-task-filter:hover {\n  opacity: 0.9;\n}\n\n#clear-task-filter:active {\n  box-shadow: inset -2px 3px 1px rgba(0,0,0,0.2);\n}\n\n.sidebar-menu li > a.sidebar-folder {\n  font-weight: bold;\n  background-color: #ddd !important;\n}\n\n.sidebar-menu li > a.sidebar-folder:hover {\n  opacity: 0.9;\n}\n\n.sidebar-menu li > a.sidebar-folder.expanded {\n  background-color: rgb(0, 166, 90) !important;\n  color: white !important;\n}\n\n.popover{\n    max-width: 100% !important; \n}\n"
  },
  {
    "path": "luigi/static/visualiser/css/tipsy.css",
    "content": ".tipsy { font-size: 10px; position: absolute; padding: 5px; z-index: 100000; }\n  .tipsy-inner { background-color: #000; color: #FFF; max-width: 200px; padding: 5px 8px 4px 8px; text-align: center; }\n\n  /* Rounded corners */\n  .tipsy-inner { border-radius: 3px; -moz-border-radius: 3px; -webkit-border-radius: 3px; }\n  \n  /* Uncomment for shadow */\n  .tipsy-inner { box-shadow: 0 0 5px #000000; -webkit-box-shadow: 0 0 5px #000000; -moz-box-shadow: 0 0 5px #000000; }\n  \n  .tipsy-arrow { position: absolute; width: 0; height: 0; line-height: 0; border: 5px dashed #000; }\n  \n  /* Rules to colour arrows */\n  .tipsy-arrow-n { border-bottom-color: #000; }\n  .tipsy-arrow-s { border-top-color: #000; }\n  .tipsy-arrow-e { border-left-color: #000; }\n  .tipsy-arrow-w { border-right-color: #000; }\n  \n\t.tipsy-n .tipsy-arrow { top: 0px; left: 50%; margin-left: -5px; border-bottom-style: solid; border-top: none; border-left-color: transparent; border-right-color: transparent; }\n    .tipsy-nw .tipsy-arrow { top: 0; left: 10px; border-bottom-style: solid; border-top: none; border-left-color: transparent; border-right-color: transparent;}\n    .tipsy-ne .tipsy-arrow { top: 0; right: 10px; border-bottom-style: solid; border-top: none;  border-left-color: transparent; border-right-color: transparent;}\n  .tipsy-s .tipsy-arrow { bottom: 0; left: 50%; margin-left: -5px; border-top-style: solid; border-bottom: none;  border-left-color: transparent; border-right-color: transparent; }\n    .tipsy-sw .tipsy-arrow { bottom: 0; left: 10px; border-top-style: solid; border-bottom: none;  border-left-color: transparent; border-right-color: transparent; }\n    .tipsy-se .tipsy-arrow { bottom: 0; right: 10px; border-top-style: solid; border-bottom: none; border-left-color: transparent; border-right-color: transparent; }\n  .tipsy-e .tipsy-arrow { right: 0; top: 50%; margin-top: -5px; border-left-style: solid; border-right: none; border-top-color: transparent; 
border-bottom-color: transparent; }\n  .tipsy-w .tipsy-arrow { left: 0; top: 50%; margin-top: -5px; border-right-style: solid; border-left: none; border-top-color: transparent; border-bottom-color: transparent; }\n"
  },
  {
    "path": "luigi/static/visualiser/index.html",
    "content": "<!DOCTYPE html>\n<html lang=\"en\">\n    <head>\n        <title>Luigi Task Visualiser</title>\n        <link href=\"css/luigi.css\" rel=\"stylesheet\">\n        <script src=\"lib/jquery-1.10.0.min.js\"></script>\n        <link href=\"lib/bootstrap3/css/bootstrap.min.css\" rel=\"stylesheet\">\n        <link href=\"lib/bootstrap3/css/bootstrap-theme.min.css\" rel=\"stylesheet\">\n        <script src=\"lib/bootstrap3/js/bootstrap.min.js\"></script>\n        <link rel=\"stylesheet\" href=\"css/tipsy.css\">\n        <link href=\"lib/AdminLTE/css/AdminLTE.min.css\" rel=\"stylesheet\"/>\n        <link href=\"css/font-awesome.min.css\" rel=\"stylesheet\"/>\n        <link href=\"lib/AdminLTE/css/skin-green-light.min.css\" rel=\"stylesheet\"/>\n        <link href=\"lib/datatables/css/jquery.dataTables.min.css\" rel=\"stylesheet\"/>\n        <link href=\"lib/bootstrap-toggle/css/bootstrap-toggle.min.css\" rel=\"stylesheet\">\n        <script src=\"lib/d3/d3.min.js\" charset=\"utf-8\"></script>\n        <script src=\"lib/d3/dagre-d3.min.js\"></script>\n        <script src=\"lib/mustache.js\"></script>\n        <script src=\"js/util.js\"></script>\n        <script src=\"js/luigi.js\"></script>\n        <script src=\"js/graph.js\"></script>\n        <script src=\"js/visualiserApp.js\"></script>\n        <script src=\"js/tipsy.js\"></script>\n        <script src=\"lib/jquery.slimscroll.min.js\"></script>\n        <script src=\"lib/AdminLTE/js/app.min.js\"></script>\n        <script src=\"lib/datatables/js/jquery.dataTables.min.js\"></script>\n        <script src=\"lib/URI/1.18.2/URI.js\"></script>\n        <script src=\"lib/bootstrap-toggle/js/bootstrap-toggle.min.js\"></script>\n\n        <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n\n        <script type=\"text/template\" name=\"actionsTemplate\">\n            <div class=\"span2\">\n                <a href=\"#tab=graph&taskId={{encodedTaskId}}&hideDone=1\" class=\"btn btn-info 
btn-xs\" title=\"View graph\" data-toggle=\"tooltip\"><i class=\"fa fa-sitemap\"></i></a>\n                {{#error}}<button class=\"btn btn-danger btn-xs showError\" title=\"Show error\" data-toggle=\"tooltip\"><i class=\"fa fa-bug\"></i></button>{{/error}}\n                {{#error}}<button class=\"btn btn-warning btn-xs forgiveFailures\" title=\"Forgive failures\" data-toggle=\"tooltip\"><i class=\"fa fa-ambulance\"></i></button>{{/error}}\n                {{#re_enable}}<button class=\"btn btn-danger btn-xs showError\" title=\"Show error\" data-toggle=\"tooltip\"><i class=\"fa fa-bug\"></i></button>{{/re_enable}}\n                {{#re_enable}}<a class=\"btn btn-warning btn-xs re-enable-button\" title=\"Re-enable\" data-toggle=\"tooltip\" data-task-id=\"{{taskId}}\">Re-enable</a>{{/re_enable}}\n                {{#mark_as_done}}<a class=\"btn btn-success btn-xs markAsDone\" title=\"Mark as done\" data-toggle=\"tooltip\" data-task-id=\"{{taskId}}\"><i class=\"fa fa-fast-forward\"></i></a>{{/mark_as_done}}\n                {{#trackingUrl}}<a target=\"_blank\" href=\"{{trackingUrl}}\" class=\"btn btn-primary btn-xs\" title=\"Track Progress\" data-toggle=\"tooltip\"><i class=\"fa fa-eye\"></i></a>{{/trackingUrl}}\n                {{#statusMessage}}<button class=\"btn btn-primary btn-xs statusMessage\" title=\"Status message\" data-toggle=\"tooltip\" data-task-id=\"{{taskId}}\" data-display-name=\"{{displayName}}\"><i class=\"fa fa-comment\"></i></button>{{/statusMessage}}\n                {{^statusMessage}}\n                  {{#progressPercentage}}<button class=\"btn btn-primary btn-xs statusMessage\" title=\"Status message\" data-toggle=\"tooltip\" data-task-id=\"{{taskId}}\" data-display-name=\"{{displayName}}\"><i class=\"fa fa-comment\"></i></button>\n                  {{/progressPercentage}}\n                {{/statusMessage}}\n                {{#acceptsMessages}}<button class=\"btn btn-default btn-xs schedulerMessage\" title=\"Send message\" 
data-toggle=\"tooltip\" data-task-id=\"{{taskId}}\" data-display-name=\"{{displayName}}\" data-worker=\"{{workerIdRunning}}\"><i class=\"fa fa-paper-plane\"></i></button>{{/acceptsMessages}}\n            </div>\n        </script>\n        <script type=\"text/template\" name=\"errorTemplate\">\n          <div class=\"modal-dialog\">\n            <div class=\"modal-content\">\n              <div class=\"modal-header\">\n                <button type=\"button\" class=\"close\" data-dismiss=\"modal\"><span aria-hidden=\"true\">&times;</span><span class=\"sr-only\">Close</span></button>\n                <h4 class=\"modal-title\" id=\"myModalLabel\">Traceback for {{displayName}}</h4>\n              </div>\n              <div class=\"modal-body\">\n                <pre class=\"pre-scrollable\">{{error}}</pre>\n                Command to re-run:\n                <pre class=\"pre-scrollable\">luigi --module {{taskModule}} {{taskFamily}} {{taskParams}}</pre>\n              </div>\n              <div class=\"modal-footer\">\n                <button type=\"button\" class=\"btn btn-default\" data-dismiss=\"modal\">Close</button>\n              </div>\n            </div>\n          </div>\n        </script>\n        <script type=\"text/template\" name=\"statusMessageTemplate\">\n          <div class=\"modal-dialog\">\n            <div class=\"modal-content\">\n              <div class=\"modal-header\">\n                <button type=\"button\" class=\"close\" data-dismiss=\"modal\"><span aria-hidden=\"true\">&times;</span><span class=\"sr-only\">Close</span></button>\n                <h4 class=\"modal-title\" id=\"myModalLabel\">Status message for {{displayName}}</h4>\n              </div>\n              <div class=\"modal-body\">\n                <pre class=\"pre-scrollable\">{{statusMessage}}</pre>\n                <div class=\"progress\">\n                  <div class=\"progress-bar\" role=\"progressbar\" aria-valuenow=\"{{progressPercentage}}\" aria-valuemin=\"0\" 
aria-valuemax=\"100\" style=\"min-width: 2em;\">\n                  {{progressPercentage}}%\n                  </div>\n                </div>\n              </div>\n              <div class=\"modal-footer\">\n                <button type=\"button\" class=\"btn btn-default\" data-dismiss=\"modal\">Close</button>\n              </div>\n            </div>\n          </div>\n        </script>\n        <script type=\"text/template\" name=\"schedulerMessageTemplate\">\n          <div class=\"modal-dialog\">\n            <div class=\"modal-content\">\n              <div class=\"modal-header\">\n                <button type=\"button\" class=\"close\" data-dismiss=\"modal\"><span aria-hidden=\"true\">&times;</span><span class=\"sr-only\">Close</span></button>\n                <h4 class=\"modal-title\" id=\"myModalLabel\">Send message to {{displayName}}</h4>\n              </div>\n              <div class=\"modal-body\">\n                  <form>\n                    <div class=\"form-group\">\n                      <label for=\"schedulerMessageInput\">Message:</label>\n                      <input type=\"text\" class=\"form-control\" id=\"schedulerMessageInput\" placeholder=\"\">\n                    </div>\n                    <div class=\"form-group\">\n                      <input type=\"checkbox\" class=\"form-check-input\" id=\"schedulerMessageAwaitResponse\">\n                      <label class=\"form-check-label\" for=\"schedulerMessageAwaitResponse\">Await response</label>\n                    </div>\n                  </form>\n                  <div class=\"form-group\" id=\"schedulerMessageResponse\" style=\"display: none;\">\n                    <hr />\n                    <label>Response:</label>\n                    <pre class=\"pre-scrollable\"><i class=\"fa fa-spinner fa-pulse\" id=\"schedulerMessageSpinner\"></i><div></div></pre>\n                  </div>\n                </div>\n                <div class=\"modal-footer\">\n                  <button 
type=\"button\" class=\"btn btn-default\" data-dismiss=\"modal\">Cancel</button>\n                  <button type=\"button\" id=\"schedulerMessageButton\" data-dismiss=\"modal\" class=\"btn btn-primary\">Send</button>\n                </div>\n            </div>\n          </div>\n        </script>\n        <script type=\"text/template\" name=\"workerTemplate\">\n            <div class=\"modal fade\" id=\"setWorkersModal\" tabindex=\"-1\" role=\"dialog\" aria-labelledby=\"setWorkersLabel\">\n              <div class=\"modal-dialog\" role=\"document\">\n                <div class=\"modal-content\">\n                  <div class=\"modal-header\">\n                    <button type=\"button\" class=\"close\" data-dismiss=\"modal\" aria-label=\"Close\"><span aria-hidden=\"true\">&times;</span></button>\n                    <h4 class=\"modal-title\" id=\"setWorkersLabel\">Set workers</h4>\n                  </div>\n                  <div class=\"modal-body\">\n                    <form>\n                      <div class=\"form-group\">\n                        <label for=\"setWorkersInput\">New number of workers:</label>\n                        <input type=\"text\" class=\"form-control\" id=\"setWorkersInput\" placeholder=\"positive number\">\n                      </div>\n                    </form>\n                  </div>\n                  <div class=\"modal-footer\">\n                    <button type=\"button\" class=\"btn btn-default\" data-dismiss=\"modal\">Cancel</button>\n                    <button type=\"button\" id=\"setWorkersButton\" data-dismiss=\"modal\" class=\"btn btn-primary\">Set</button>\n                  </div>\n                </div>\n              </div>\n            </div>\n            <div class=\"modal fade\" id=\"disableWorkerModal\" tabindex=\"-1\" role=\"dialog\" aria-labelledby=\"disableWorkerLabel\">\n              <div class=\"modal-dialog\" role=\"document\">\n                <div class=\"modal-content\">\n                  <div 
class=\"modal-header\">\n                    <button type=\"button\" class=\"close\" data-dismiss=\"modal\" aria-label=\"Close\"><span aria-hidden=\"true\">&times;</span></button>\n                    <h4 class=\"modal-title\" id=\"disableWorkerLabel\">Disable worker?</h4>\n                  </div>\n                  <div class=\"modal-body\">\n                    Are you sure you want to disable this worker?\n                    <p>\n                        A disabled worker will finish its existing tasks but not start new ones.\n                    </p>\n                  </div>\n                  <div class=\"modal-footer\">\n                    <button type=\"button\" class=\"btn btn-default\" data-dismiss=\"modal\">Cancel</button>\n                    <button type=\"button\" id=\"disableWorkerButton\" data-dismiss=\"modal\" class=\"btn btn-danger\">Disable Worker</button>\n                  </div>\n                </div>\n              </div>\n            </div>\n            {{#workerList}}\n            {{#is_disabled}}\n            <div class=\"box box-solid box-default\">\n            {{/is_disabled}}\n            {{^is_disabled}}\n            <div class=\"box\">\n            {{/is_disabled}}\n                <div class=\"box-header with-border\">\n                    <h3 class=\"box-title\">{{name}}</h3>\n                    <div class=\"box-tools pull-right\">\n                      {{#num_unread_rpc_messages}}\n                      <span class=\"label-unread-worker-messages\">{{num_unread_rpc_messages}} unread message(s)</span>\n                      {{/num_unread_rpc_messages}}\n                      {{^is_disabled}}\n                      {{#workers}}\n                      <div class=\"btn-group\">\n                        <button type=\"button\" class=\"btn btn-sm btn-default dropdown-toggle btn-set-workers\" data-toggle=\"dropdown\" aria-haspopup=\"true\" aria-expanded=\"false\">\n                          Workers: <span id=\"label-n-workers\" 
data-worker=\"{{name}}\">{{workers}}</span> <span class=\"caret\"></span>\n                        </button>\n                        <ul class=\"dropdown-menu\">\n                          <li><a href=\"#\" id=\"btn-increment-workers\" data-worker=\"{{name}}\">\n                            <i class=\"glyphicon glyphicon-plus\"></i> Add 1 worker\n                          </a></li>\n                          <li><a href=\"#\" id=\"btn-decrement-workers\" data-worker=\"{{name}}\">\n                            <i class=\"glyphicon glyphicon-minus\"></i> Remove 1 worker\n                          </a></li>\n                          <li><a href=\"#\" id=\"btn-set-workers\" data-toggle=\"modal\" data-target=\"#setWorkersModal\" data-worker=\"{{name}}\">\n                            <i class=\"glyphicon glyphicon-pencil\"></i> Set workers ...\n                          </a></li>\n                        </ul>\n                      </div>\n                      {{/workers}}\n                      <div class=\"button-tooltip\" data-toggle=\"tooltip\" title=\"Disable Worker\">\n                        <button type=\"button\" class=\"btn btn-sm btn-danger btn-disable-worker\" data-toggle=\"modal\" data-target=\"#disableWorkerModal\" data-worker=\"{{name}}\">\n                          <i class=\"fa fa-fire-extinguisher\"></i>\n                        </button>\n                      </div>\n                      {{/is_disabled}}\n                    </div>\n                </div>\n                <div class=\"box-body\">\n                    Started: {{start_time}}<br>\n                    Last Checkin: {{active}}<br>\n                    Root Task: <a href=\"#tab=graph&taskId={{{encoded_first_task}}}&hideDone=1\">{{first_task_display_name}}</a><br>\n                    Running: {{num_running}}<br>\n                    Pending: {{num_pending}}<br>\n                    Unique Pending: {{num_uniques}}<br>\n                    {{#is_disabled}}\n                    This worker 
is <b>disabled</b>. It will not start new tasks.<br>\n                    {{/is_disabled}}\n\n                    {{#num_running}}\n                    <hr>\n                    <table class=\"table table-striped worker-table\">\n                      <thead>\n                        <th>Name</th>\n                        <th>Priority</th>\n                        <th>Resources</th>\n                        <th>Progress</th>\n                        <th>Time</th>\n                        <th>Actions</th>\n                      </thead>\n                      <tbody>\n                      {{#tasks}}\n                      <tr>\n                        <td>{{displayName}}</td>\n                        <td>{{priority}}</td>\n                        <td>{{resources}}</td>\n                        <td>\n                          {{#progressPercentage}}\n                          <div class=\"progress\">\n                            <div class=\"progress-bar taskProgressBar\" role=\"progressbar\" data-task-id=\"{{taskId}}\" style=\"width: {{progressPercentage}}%\" aria-valuenow=\"{{progressPercentage}}\" aria-valuemin=\"0\" aria-valuemax=\"100\">{{progressPercentage}}%</div>\n                          </div>\n                          {{/progressPercentage}}\n                          {{^progressPercentage}}-{{/progressPercentage}}\n                        </td>\n                        <td>{{displayTime}}</td>\n                        <td>\n                          <a href=\"#tab=graph&taskId={{encodedTaskId}}\" class=\"btn btn-info btn-xs\" title=\"View graph\" data-toggle=\"tooltip\" data-action=\"drawGraph\"><i class=\"fa fa-sitemap\"/></a>\n                          {{#trackingUrl}}<a target=\"_blank\" href=\"{{trackingUrl}}\" class=\"btn btn-primary btn-xs\" title=\"Track Progress\" data-toggle=\"tooltip\"><i class=\"fa fa-eye\"></i></a>{{/trackingUrl}}\n                          {{#statusMessage}}<button class=\"btn btn-primary btn-xs statusMessage\" 
title=\"Status message\" data-toggle=\"tooltip\" data-task-id=\"{{taskId}}\" data-display-name=\"{{displayName}}\"><i class=\"fa fa-comment\"></i></button>{{/statusMessage}}\n                          {{^statusMessage}}\n                            {{#progressPercentage}}<button class=\"btn btn-primary btn-xs statusMessage\" title=\"Status message\" data-toggle=\"tooltip\" data-task-id=\"{{taskId}}\" data-display-name=\"{{displayName}}\"><i class=\"fa fa-comment\"></i></button>\n                            {{/progressPercentage}}\n                          {{/statusMessage}}\n                          {{#acceptsMessages}}<button class=\"btn btn-success btn-xs schedulerMessage\" title=\"Send message\" data-toggle=\"tooltip\" data-task-id=\"{{taskId}}\" data-display-name=\"{{displayName}}\" data-worker=\"{{workerIdRunning}}\"><i class=\"fa fa-paper-plane\"></i></button>\n                          {{/acceptsMessages}}\n                        </td>\n                      </tr>\n                      {{/tasks}}\n                      {{/num_running}}\n                      </tbody>\n                    </table>\n                </div>\n            </div>\n            {{/workerList}}\n        </script>\n\n        <script type=\"text/template\" name=\"resourceTemplate\">\n        <div class=\"modal fade\" id=\"setResourcesModal\" tabindex=\"-1\" role=\"dialog\" aria-labelledby=\"setResourcesLabel\">\n          <div class=\"modal-dialog\" role=\"document\">\n            <div class=\"modal-content\">\n              <div class=\"modal-header\">\n                <button type=\"button\" class=\"close\" data-dismiss=\"modal\" aria-label=\"Close\"><span aria-hidden=\"true\">&times;</span></button>\n                <h4 class=\"modal-title\" id=\"setResourcesLabel\">Set resources</h4>\n              </div>\n              <div class=\"modal-body\">\n                <form>\n                  <div class=\"form-group\">\n                    <label for=\"setResourcesInput\">New number 
of resources:</label>\n                    <input type=\"text\" class=\"form-control\" id=\"setResourcesInput\" placeholder=\"non-negative integer\">\n                  </div>\n                </form>\n              </div>\n              <div class=\"modal-footer\">\n                <button type=\"button\" class=\"btn btn-default\" data-dismiss=\"modal\">Cancel</button>\n                <button type=\"button\" id=\"setResourcesButton\" data-dismiss=\"modal\" class=\"btn btn-primary\">Set</button>\n              </div>\n            </div>\n          </div>\n        </div>\n        {{#resources}}\n        <div class=\"box\">\n            <div class=\"box-header with-border\">\n                <h3 class=\"box-title\">{{name}}</h3>\n                <div class=\"box-tools pull-right\">\n                    <div class=\"btn-group\">\n                      <button type=\"button\" class=\"btn btn-sm btn-default dropdown-toggle btn-set-resources\" data-toggle=\"dropdown\" aria-haspopup=\"true\" aria-expanded=\"false\">\n                        Change resources\n                      </button>\n                      <ul class=\"dropdown-menu\">\n                        <li><a href=\"#\" class=\"btn-increment-resources\" data-resource=\"{{name}}\">\n                          <i class=\"glyphicon glyphicon-plus\"></i> Add 1 resource\n                        </a></li>\n                        <li><a href=\"#\" class=\"btn-decrement-resources\" data-resource=\"{{name}}\">\n                          <i class=\"glyphicon glyphicon-minus\"></i> Remove 1 resource\n                        </a></li>\n                        <li><a href=\"#\" class=\"btn-set-resources\" data-toggle=\"modal\" data-target=\"#setResourcesModal\" data-resource=\"{{name}}\">\n                          <i class=\"glyphicon glyphicon-pencil\"></i> Set resources ...\n                        </a></li>\n                      </ul>\n                    </div>\n                    {{#num_consumer}}\n               
     <i class=\"fa fa-navicon resources-collapse\" data-target=\"#collapse-{{name}}\"></i>\n                    {{/num_consumer}}\n                </div><!-- /.box-tools -->\n            </div><!-- /.box-header -->\n            <div class=\"box-body\" id=\"{{name}}-resource-box\">\n                <div class=\"progress\">\n                    <div class=\"progress-bar progress-bar-{{bar_type}}\" style=\"width: {{percent_used}}%\">\n                        <b>{{num_used}}/{{num_total}}</b>\n                    </div>\n                </div>\n\n                {{#num_consumer}}\n                <div class=\"collapse resource-box\" id=\"collapse-{{name}}\" data-resource=\"{{name}}\">\n                    <table class=\"table table-striped worker-table\">\n                        <thead>\n                            <th>Name</th>\n                            <th>Priority</th>\n                            <th>Time</th>\n                            <th>Actions</th>\n                        </thead>\n                        <tbody>\n                            {{#tasks}}\n                            <tr>\n                                <td>{{displayName}}</td>\n                                <td>{{priority}}</td>\n                                <td>{{displayTime}}</td>\n                                <td><a href=\"#tab=graph&taskId={{taskId}}&hideDone=1\" class=\"btn btn-info btn-xs\" title=\"View graph\" data-toggle=\"tooltip\" data-action=\"drawGraph\"><i class=\"fa fa-sitemap\"/></a></td>\n                            </tr>\n                            {{/tasks}}\n                        </tbody>\n                    </table>\n                </div>\n                {{/num_consumer}}\n            </div><!-- /.box-body -->\n        </div><!-- /.box -->\n        {{/resources}}\n        </script>\n\n        <script type=\"text/template\" name=\"sidebarTemplate\">\n            <ul class=\"sidebar-menu\">\n                <li class=\"header\">TASK FAMILIES <span 
class=\"badge\" id=\"clear-task-filter\">Clear selection</span></li>\n                {{#tasks}}\n                <li>\n                  <a href=\"#\" class=\"sidebar-folder\">{{name}}</a>\n                  <ul class=\"sidebar-menu\" style=\"display: none\">\n                    {{#tasks}}\n                    <li data-task=\"{{name}}\"><a href=\"#\"><span class=\"badge\">{{count}}</span> {{name}}</a>\n                    </li>\n                    {{/tasks}}\n                  </ul>\n                </li>\n                {{/tasks}}\n            </ul>\n        </script>\n\n        <script type=\"text/template\" name=\"warningsTemplate\">\n           <div class=\"callout callout-danger\">\n            <p>\n              Too many {{#missingCategories}} <strong>{{name}}</strong>, {{/missingCategories}}  tasks to display.\n            </p>\n            <p>\n              Task family counts only include displayed tasks.  Use <strong>Filter on Server</strong> to constrain search.\n            </p>\n          </div>\n        </script>\n\n        <script type=\"text/template\" name=\"currentFilterTemplate\">\n          <div class=\"callout callout-info\">\n            Displaying {{#catNames}}<strong>{{ name }}</strong>, {{/catNames}} tasks\n            {{#taskFamily}}of family <strong>{{taskFamily}}</strong>{{/taskFamily}} {{#tableFilter}}filtered by <strong>\"{{tableFilter}}\"</strong>{{/tableFilter}}.\n          </div>\n        </script>\n\n        <script type=\"text/template\" name=\"topNavbarItem\">\n          <li>\n            <a class=\"js-nav-link\" href=\"{{href}}\" {{#dataTab}}data-tab=\"{{dataTab}}\"{{/dataTab}}>\n              {{label}}\n            </a>\n          </li>\n        </script>\n\n    </head>\n    <body class=\"skin-green-light fixed\">\n        <div class=\"modal fade\" id=\"errorModal\" tabindex=\"-1\" role=\"dialog\" aria-labelledby=\"myModalLabel\" aria-hidden=\"true\">\n        </div>\n        <div class=\"modal fade\" 
id=\"statusMessageModal\" tabindex=\"-1\" role=\"dialog\" aria-labelledby=\"myModalLabel\" aria-hidden=\"true\">\n        </div>\n        <div class=\"modal fade\" id=\"schedulerMessageModal\" tabindex=\"-1\" role=\"dialog\" aria-labelledby=\"myModalLabel\" aria-hidden=\"true\">\n        </div>\n\n        <div class=\"wrapper\">\n            <div class=\"main-header\">\n                <a class=\"logo\" href=\"#\">Luigi Task Status</a>\n                <nav class=\"navbar navbar-static-top\">\n                    <a href=\"#\" class=\"sidebar-toggle\" data-toggle=\"offcanvas\" role=\"button\">\n                        <span class=\"sr-only\">Toggle navigation</span>\n                    </a>\n                    <div class=\"container-fluid\">\n                        <div class=\"navbar-header\">\n                            <button type=\"button\" class=\"navbar-toggle collapsed\" data-toggle=\"collapse\" data-target=\"#bs-example-navbar-collapse-1\">\n                                <span class=\"sr-only\">Toggle navigation</span>\n                                <span class=\"icon-bar\"></span>\n                                <span class=\"icon-bar\"></span>\n                                <span class=\"icon-bar\"></span>\n                            </button>\n                        </div>\n                        <div class=\"collapse navbar-collapse\">\n                            <ul class=\"nav navbar-nav\" id=\"topNavbar\">\n                                <li><a class=\"js-nav-link\" href=\"#tab=tasks\" data-tab=\"taskList\">Task List</a></li>\n                                <li><a class=\"js-nav-link\" href=\"#tab=graph\" data-tab=\"dependencyGraph\">Dependency Graph</a></li>\n                                <li><a class=\"js-nav-link\" href=\"#tab=workers\" data-tab=\"workerList\">Workers</a></li>\n                                <li><a class=\"js-nav-link\" href=\"#tab=resource\" data-tab=\"resourceList\">Resources</a></li>\n                       
     </ul>\n                            <form class=\"navbar-form navbar-right\" id=\"pause-form\">\n                            </form>\n                        </div>\n                    </div>\n                </nav>\n            </div>\n            <div class=\"main-sidebar\">\n                <div class=\"sidebar\" id=\"familySidebar\">\n                </div>\n            </div>\n\n\n        <div class=\"content-wrapper\">\n          <div class=\"content\">\n\n        <div class=\"tab-content\">\n            <section id=\"taskList\" class=\"container-fluid tab-pane active\">\n                <div class=\"row\">\n                  <div class=\"col-md-3 col-sm-6 col-xs-12\">\n                    <div class=\"info-box status-info\" data-color='yellow' data-category='PENDING' id=\"PENDING_info\">\n                      <span class=\"info-box-icon bg-yellow\"><i class=\"fa fa-spinner fa-pulse\"></i></span>\n                      <div class=\"info-box-content\">\n                        <span class=\"info-box-text\">Pending Tasks</span>\n                        <span class=\"info-box-number\">?</span>\n                      </div><!-- /.info-box-content -->\n                    </div><!-- /.info-box -->\n                  </div>\n\n                  <div class=\"col-md-3 col-sm-6 col-xs-12\">\n                    <div class=\"info-box status-info\" data-color='aqua' data-category='RUNNING' id=\"RUNNING_info\">\n                      <span class=\"info-box-icon bg-aqua\"><i class=\"fa fa-spinner fa-pulse\"></i></span>\n                      <div class=\"info-box-content\">\n                        <span class=\"info-box-text\">Running Tasks</span>\n                        <span class=\"info-box-number\">?</span>\n                      </div><!-- /.info-box-content -->\n                    </div><!-- /.info-box -->\n                  </div>\n\n                  <div class=\"col-md-3 col-sm-6 col-xs-12\">\n                    <div class=\"info-box status-info\" 
data-color='purple' data-category='BATCH_RUNNING' id=\"BATCH_RUNNING_info\">\n                      <span class=\"info-box-icon bg-purple\"><i class=\"fa fa-spinner fa-pulse\"></i></span>\n                      <div class=\"info-box-content\">\n                        <span class=\"info-box-text\">Batch Running Tasks</span>\n                        <span class=\"info-box-number\">?</span>\n                      </div><!-- /.info-box-content -->\n                    </div><!-- /.info-box -->\n                  </div>\n\n                  <div class=\"col-md-3 col-sm-6 col-xs-12\">\n                    <div class=\"info-box status-info\" data-color='green' data-category='DONE' id=\"DONE_info\">\n                      <span class=\"info-box-icon bg-green\"><i class=\"fa fa-spinner fa-pulse\"></i></span>\n                      <div class=\"info-box-content\">\n                        <span class=\"info-box-text\">Done Tasks</span>\n                        <span class=\"info-box-number\">?</span>\n                      </div><!-- /.info-box-content -->\n                    </div><!-- /.info-box -->\n                  </div>\n\n                  <div class=\"col-md-3 col-sm-6 col-xs-12\">\n                    <div class=\"info-box status-info\" data-color='red' data-category='FAILED' id=\"FAILED_info\">\n                      <span class=\"info-box-icon bg-red\"><i class=\"fa fa-spinner fa-pulse\"></i></span>\n                      <div class=\"info-box-content\">\n                        <span class=\"info-box-text\">Failed Tasks</span>\n                        <span class=\"info-box-number\">?</span>\n                      </div><!-- /.info-box-content -->\n                    </div><!-- /.info-box -->\n                  </div>\n\n                  <div class=\"col-md-3 col-sm-6 col-xs-12\">\n                    <div class=\"info-box status-info\" data-color='maroon' data-category='UPSTREAM_FAILED' id=\"UPSTREAM_FAILED_info\">\n                      <span 
class=\"info-box-icon bg-maroon\"><i class=\"fa fa-spinner fa-pulse\"></i></span>\n                      <div class=\"info-box-content\">\n                        <span class=\"info-box-text\">Upstream Failure</span>\n                        <span class=\"info-box-number\">?</span>\n                      </div><!-- /.info-box-content -->\n                    </div><!-- /.info-box -->\n                  </div>\n\n                  <div class=\"col-md-3 col-sm-6 col-xs-12\">\n                    <div class=\"info-box status-info\" data-color='gray' data-category='DISABLED' id=\"DISABLED_info\">\n                      <span class=\"info-box-icon bg-gray\"><i class=\"fa fa-spinner fa-pulse\"></i></span>\n                      <div class=\"info-box-content\">\n                        <span class=\"info-box-text\">Disabled Tasks</span>\n                        <span class=\"info-box-number\">?</span>\n                      </div><!-- /.info-box-content -->\n                    </div><!-- /.info-box -->\n                  </div>\n\n                  <div class=\"col-md-3 col-sm-6 col-xs-12\">\n                    <div class=\"info-box status-info\" data-color='gray' data-category='UPSTREAM_DISABLED' id=\"UPSTREAM_DISABLED_info\">\n                      <span class=\"info-box-icon bg-gray\"><i class=\"fa fa-spinner fa-pulse\"></i></span>\n                      <div class=\"info-box-content\">\n                        <span class=\"info-box-text\">Upstream Disabled</span>\n                        <span class=\"info-box-number\">?</span>\n                      </div><!-- /.info-box-content -->\n                    </div><!-- /.info-box -->\n                  </div>\n\n                </div>\n                <div class=\"container-fluid infoBar\">\n                  <div id=\"currentFilter\" class=\"col-md-6 col-sm-12 col-xs-12\"></div>\n                  <div id=\"warnings\" class=\"col-md-6 col-sm-12 col-xs-12\"></div>\n                </div>\n                <div 
class=\"col-md-8 col-md-offset-2\">\n                    <div id=\"checkboxes\"></div>\n                </div>\n\n                <table id=\"taskTable\" class=\"table table-striped\">\n                  <thead>\n                    <th><!-- Category --></th>\n                    <th>Name</th>\n                    <th>Details</th>\n                    <th>Priority</th>\n                    <th>Time</th>\n                    <th>Actions</th>\n                  </thead>\n                </table>\n\n\n            </section>\n            <section id=\"dependencyGraph\" class=\"tab-pane\">\n              <div class=\"container-fluid\">\n                    <div class=\"form-group col-md-6 col-sm-4\">\n                      <form class=\"form-inline\" id=\"loadTaskForm\">\n                        <input id=\"js-task-id\" type=\"text\" class=\"search-query form-control\" placeholder=\"TaskId\">\n                        <button type=\"submit\" class=\"btn btn-default form-control\">Show task details</button>\n                      </form>\n                    </div>\n                    <form class=\"form-inline\" id=\"visForm\">\n                      <div class=\"form-group col-md-3\">\n                        <label class=\"btn btn-default\" for=\"invertCheckbox\">Show Upstream Dependencies\n                            <input type=\"checkbox\" id=\"invertCheckbox\"/>\n                        </label>\n                        <label class=\"btn btn-default\" for=\"hideDoneCheckbox\">Hide Done\n                            <input type=\"checkbox\" id=\"hideDoneCheckbox\"/>\n                        </label>\n                      </div>\n                      <div class=\"form-group col-md-3\">\n                        <label>Visualisation Type</label>\n                        <div id=\"toggleVisButtons\" class=\"btn-group\" data-toggle=\"buttons\">\n                            <label class=\"btn btn-default\">\n                                <input type=\"radio\" 
name=\"vis-type\" value=\"d3\"/> D3\n                            </label>\n                            <label class=\"btn btn-default\">\n                                <input type=\"radio\" name=\"vis-type\" value=\"svg\"/> SVG\n                            </label>\n                        </div>\n                       </div>\n                    </form>\n                </div>\n\n                <div id=\"searchError\">\n                </div>\n                <div id=\"graphContainer\" class=\"container-fluid\">\n                    <h4 id=\"dependencyTitle\"></h4>\n                    <h5>Dependency Graph</h5>\n                    <div id=\"graphPlaceholder\"></div>\n                </div>\n            </section>\n            <section id=\"workerList\" class=\"container-fluid tab-pane active\">\n            </section>\n            <section id=\"resourceList\" class=\"tab-pane\">\n            </section>\n        </div> <!-- /.tab-content -->\n        </div> <!-- /.content -->\n        </div> <!-- /.content-wrapper -->\n\n        </div> <!-- /.wrapper -->\n\n\n        <script>\n            visualiserApp(new LuigiAPI(\"../../api\"));\n        </script>\n    </body>\n</html>\n"
  },
  {
    "path": "luigi/static/visualiser/js/graph.js",
    "content": "Graph = (function() {\n    var statusColors = {\n        \"FAILED\":\"#DD0000\",\n        \"RUNNING\":\"#0044DD\",\n        \"BATCH_RUNNING\":\"#BB00BB\",\n        \"PENDING\":\"#EEBB00\",\n        \"DONE\":\"#00DD00\",\n        \"DISABLED\":\"#808080\",\n        \"UNKNOWN\":\"#000000\",\n        \"TRUNCATED\":\"#FF00FF\"\n    };\n\n    /* Line height for items in task status legend */\n    var legendLineHeight = 20;\n\n    /* Height of vertical space between nodes */\n    var nodeHeight = 10;\n\n    /* Amount of horizontal space given for each node */\n    var nodeWidth = 200;\n\n    /* Random horizontal offset for each row */\n    var jitterWidth = 100;\n\n    /* Calculate minimum SVG height required for legend */\n    var legendMaxY = (function () {\n        return Object.keys(statusColors).length * legendLineHeight + ( legendLineHeight / 2 )\n    })();\n\n    var legendWidth = 110;\n\n    function nodeFromTask(task) {\n        var deps = task.deps;\n        deps.sort();\n        return {\n            name: task.name,\n            taskId: task.taskId,\n            status: task.status,\n            trackingUrl: this.hashBase + task.taskId,\n            deps: deps,\n            params: task.params,\n            priority: task.priority,\n            depth: -1\n        };\n    }\n\n    /* Convert array to dict by indexing on propertyName */\n    function uniqueIndexByProperty(data, propertyName) {\n        var nodeIndex = {};\n        $.each(data, function(i, dataPoint) {\n            nodeIndex[dataPoint[propertyName]] = i;\n        });\n        return nodeIndex;\n    }\n\n    /* Create edges between the supplied node using the deps property of each node */\n    function createDependencyEdges(nodes, nodeIndex) {\n        var edges = [];\n        $.each(nodes, function(i, task) {\n            $.each(task.deps, function(j, dep) {\n                if (nodeIndex[dep]) {\n                    edges.push({\n                        source: 
nodes[nodeIndex[task.taskId]],\n                        target: nodes[nodeIndex[dep]]\n                    });\n                }\n            });\n        });\n        return edges;\n    }\n    /* Compute the depth of each node for layout purposes */\n    function computeDepth(nodes, nodeIndex) {\n        var selfDependencies = false\n        function descend(n, depth) {\n            if (n.depth === undefined || depth > n.depth) {\n                n.depth = depth;\n                $.each(n.deps, function(i, dep) {\n                    if (nodeIndex[dep]) {\n                        var child_node = nodes[nodeIndex[dep]]\n                        descend(child_node, depth + 1);\n                        if (!selfDependencies && n.name == child_node.name) {\n                            selfDependencies = true;\n                        }\n                    }\n                });\n            }\n        }\n        descend(nodes[0], 0);\n        return selfDependencies\n    }\n\n    /* Group tasks, so all tasks with the same name appear at the same depth. 
*/\n    function groupTasks(nodes) {\n\n        // compute average assigned depth\n        var taskDepths = {};\n        $.each(nodes, function(i, n) {\n            if (taskDepths[n.name] === undefined) {\n                taskDepths[n.name] = [n.depth];\n            } else {\n                taskDepths[n.name].push(n.depth);\n            }\n        });\n        var averages = [];\n        $.each(taskDepths, function(key, array) {\n            var total = 0;\n            for (var i in array) total += array[i];\n            var mean = total / array.length;\n            averages.push([key, mean]);\n        });\n\n        // sort tasks\n        averages.sort( function(first, second) {\n            return first[1] - second[1];\n        });\n\n        // reassign task depths and node depths\n        var classDepths = {}\n        $.each(averages, function(i, a) {\n            classDepths[a[0]] = i;\n        });\n\n        $.each(nodes, function(i, n) {\n            n.depth = classDepths[n.name];\n        });\n        return classDepths\n    }\n\n    /* Compute the depth of each node for layout purposes, returns the number\n       of nodes at each depth level (for layout purposes) */\n    function computeRows(nodes, nodeIndex) {\n        var selfDependencies = computeDepth(nodes, nodeIndex)\n\n        if (!selfDependencies) {\n            var classDepths = groupTasks(nodes)\n        }\n\n        var rowSizes = [];\n        function placeNodes(n, depth) {\n            if (rowSizes[depth] === undefined) {\n                rowSizes[depth] = 0;\n            }\n            if (n.xOrder === undefined && depth === n.depth) {\n                n.xOrder = rowSizes[depth];\n                rowSizes[depth]++;\n                $.each(n.deps, function(i, dep) {\n                    if (nodeIndex[dep]) {\n                        var next_node = nodes[nodeIndex[dep]]\n                        var next_depth = (selfDependencies ? 
depth + 1 : classDepths[next_node.name])\n                        placeNodes(next_node, next_depth);\n                    }\n                });\n            }\n        }\n        placeNodes(nodes[0], 0);\n\n        return rowSizes;\n    }\n    /* Format nodes according to their depth and horizontal sort order.\n       Algorithm: evenly distribute nodes along each depth level, offsetting each\n       by the text line height to prevent overlapping text. This is done within\n       multiple columns to keep the levels from being too tall. The column width\n       is at least nodeWidth to ensure readability. The height of each level is\n       determined by number of nodes divided by number of columns, rounded up. */\n    function layoutNodes(nodes, rowSizes) {\n        var numCols = Math.max(2, Math.floor((graphWidth - jitterWidth) / nodeWidth));\n        function rowStartPosition(depth) {\n            if (depth === 0) return 20;\n            var rowHeight = Math.ceil(rowSizes[depth-1] / numCols);\n            return rowStartPosition(depth-1)+Math.max(rowHeight * nodeHeight + 100);\n        }\n        var jitter = []\n        for (var i in rowSizes) {\n            jitter[i] = Math.ceil(Math.random() * jitterWidth)\n        }\n        $.each(nodes, function(i, node) {\n            var numRows = Math.ceil(rowSizes[node.depth] / numCols);\n            var levelCols = Math.ceil(rowSizes[node.depth] / numRows);\n            var row = node.xOrder % numRows;\n            var col = node.xOrder / numRows;\n            node.x =\n                ((col + 1) / (levelCols + 1))\n                * (graphWidth - jitterWidth - nodeWidth)\n                + jitter[node.depth];\n            node.y = rowStartPosition(node.depth) + row * nodeHeight;\n        });\n    }\n\n    /* Parses a list of tasks to a graph format */\n    function createGraph(tasks, hashBase) {\n        if (tasks.length === 0) return {nodes: [], links: []};\n\n        this.hashBase = hashBase;\n        var nodes = 
$.map(tasks, nodeFromTask);\n        var nodeIndex = uniqueIndexByProperty(nodes, \"taskId\");\n\n        var rowSizes = computeRows(nodes, nodeIndex);\n\n        nodes = $.map(nodes, function(node) { return node.depth >= 0 ? node: null; });\n\n        layoutNodes(nodes, rowSizes);\n\n        // We need to re-index nodes after filtering\n        nodeIndex = uniqueIndexByProperty(nodes, \"taskId\");\n        var edges = createDependencyEdges(nodes, nodeIndex);\n\n        return {\n            nodes: nodes,\n            links: edges\n        };\n    }\n\n    function findBounds(nodes) {\n        var maxX = 0;\n        var maxY = legendMaxY;\n        $.each(nodes, function(i, node) {\n            if (node.x>maxX) maxX = node.x;\n            if (node.y>maxY) maxY = node.y;\n        });\n        return {\n            x:maxX,\n            y:maxY\n        };\n    }\n\n    var graphWidth = window.innerWidth - 80;\n\n    function DependencyGraph(containerElement) {\n        this.svg = $(svgElement(\"svg\")).appendTo($(containerElement));\n    }\n\n\n    /* We need custom element creators for svg nodes and xlink attributes because jQuery doesn't support\n       namespaces properly */\n    function svgElement(name) {\n        return document.createElementNS(\"http://www.w3.org/2000/svg\", name);\n    }\n\n    function svgLink(url) {\n        var element = svgElement(\"a\");\n        element.setAttributeNS(\"http://www.w3.org/1999/xlink\", \"href\", url);\n        return element;\n    }\n\n    DependencyGraph.prototype.renderGraph = function() {\n        var self = this;\n\n        $.each(this.graph.links, function(i, link) {\n            var line = $(svgElement(\"line\"))\n                        .attr(\"class\",\"link\")\n                        .attr(\"x1\", link.source.x)\n                        .attr(\"y1\", link.source.y)\n                        .attr(\"x2\", link.target.x)\n                        .attr(\"y2\", link.target.y)\n                        
.appendTo(self.svg);\n        });\n\n        $.each(this.graph.nodes, function(i, node) {\n            var g = $(svgElement(\"g\"))\n                .addClass(\"node\")\n                .attr(\"transform\", \"translate(\" + node.x + \",\" + node.y +\")\")\n                .appendTo(self.svg);\n\n            $(svgElement(\"circle\"))\n                .addClass(\"nodeCircle\")\n                .attr(\"r\", 7)\n                .attr(\"fill\", statusColors[node.status])\n                .appendTo(g);\n            $(svgLink(node.trackingUrl))\n                .append(\n                    $(svgElement(\"text\"))\n                    .text(escapeHtml(node.name))\n                    .attr(\"y\", 3))\n                .attr(\"class\",\"graph-node-a\")\n                .attr(\"data-task-status\", node.status)\n                .attr(\"data-task-id\", node.taskId)\n                .appendTo(g);\n\n            var titleText = node.name;\n            var content = $.map(node.params, function (value, name) { return escapeHtml(name + \": \" + value); }).join(\"<br>\");\n            g.attr(\"title\", titleText)\n                .popover({\n                    trigger: 'hover',\n                    container: 'body',\n                    html: true,\n                    placement: 'top',\n                    content: content\n                });\n        });\n\n        // Legend for Task status\n        var legend = $(svgElement(\"g\"))\n                .addClass(\"legend\")\n                .appendTo(self.svg);\n\n        $(svgElement(\"rect\"))\n            .attr(\"x\", -1)\n            .attr(\"y\", -1)\n            .attr(\"width\", legendWidth + \"px\")\n            .attr(\"height\", legendMaxY + \"px\")\n            .attr(\"fill\", \"#FFF\")\n            .attr(\"stroke\", \"#DDD\")\n            .appendTo(legend);\n\n        var x = 0;\n        $.each(statusColors, function(key, color) {\n            var c = $(svgElement(\"circle\"))\n                .addClass(\"nodeCircle\")\n  
              .attr(\"r\", 7)\n                .attr(\"cx\", legendLineHeight)\n                .attr(\"cy\", (legendLineHeight-4)+(x*legendLineHeight))\n                .attr(\"fill\", color)\n                .appendTo(legend);\n\n            $(svgElement(\"text\"))\n                .text(escapeHtml(key.charAt(0).toUpperCase() + key.substring(1).toLowerCase().replace(/_./gi, function (x) { return \" \" + x[1].toUpperCase(); })))\n                .attr(\"x\", legendLineHeight + 14)\n                .attr(\"y\", legendLineHeight+(x*legendLineHeight))\n                .appendTo(legend);\n\n            x++;\n        });\n    };\n\n    DependencyGraph.prototype.updateData = function(taskList, hashBase) {\n        $('.popover').popover('destroy');\n        this.graph = createGraph(taskList, hashBase);\n        bounds = findBounds(this.graph.nodes);\n        this.renderGraph();\n        this.svg.attr(\"height\", bounds.y+10);\n        this.svg.attr(\"width\", graphWidth+10);\n        this.svg[0].setAttributeNS(\"http://www.w3.org/2000/svg\", \"preserveAspectRatio\", \"xMidYMid meet\");\n        this.svg[0].setAttributeNS(\"http://www.w3.org/2000/svg\", \"viewBox\", \"0 0 \" + graphWidth + \" \" + (bounds.y+10));\n    };\n\n    return {\n        DependencyGraph: DependencyGraph,\n        testableMethods: {\n            nodeFromTask: nodeFromTask,\n            uniqueIndexByProperty: uniqueIndexByProperty,\n            createDependencyEdges: createDependencyEdges,\n            computeDepth: computeDepth,\n            computeRows: computeRows,\n            createGraph: createGraph,\n            findBounds: findBounds\n        }\n    };\n})();\n"
  },
  {
    "path": "luigi/static/visualiser/js/luigi.js",
    "content": "var LuigiAPI = (function() {\n    function LuigiAPI (urlRoot) {\n        this.urlRoot = urlRoot;\n    }\n\n    function flatten(response, rootId) {\n        var flattened = [];\n        // Make the requested taskId the first in the list\n        if (rootId && response[rootId]) {\n            var rootNode = response[rootId];\n            rootNode.taskId=rootId;\n            flattened.push(rootNode);\n            delete response[rootId];\n        }\n        $.each(response, function(key, value) {\n            value.taskId = key;\n            flattened.push(value);\n        });\n        return flattened;\n    }\n\n    function flatten_running(response) {\n        $.each(response, function(key, value) {\n            value.running = flatten(value.running);\n        });\n        return response;\n    }\n\n    function jsonRPC(url, paramObject, callback) {\n        return $.ajax(url, {\n            data: {data: JSON.stringify(paramObject)},\n            method: \"GET\",\n            success: callback,\n            dataType: \"json\"\n        });\n    }\n\n    function searchTerm() {\n        // FIXME : leaky API.  
This shouldn't rely on the DOM.\n        if ($('#serverSideCheckbox')[0].checked) {\n            return $('#taskTable_filter').find('input').val();\n        }\n        else {\n            return '';\n        }\n    }\n\n    LuigiAPI.prototype.getDependencyGraph = function (taskId, callback, include_done) {\n        return jsonRPC(this.urlRoot + \"/dep_graph\", {task_id: taskId, include_done: include_done}, function(response) {\n            callback(flatten(response.response, taskId));\n        });\n    };\n\n    LuigiAPI.prototype.getInverseDependencyGraph = function (taskId, callback, include_done) {\n        return jsonRPC(this.urlRoot + \"/inverse_dep_graph\", {task_id: taskId, include_done: include_done}, function(response) {\n            callback(flatten(response.response, taskId));\n        });\n    };\n\n    LuigiAPI.prototype.forgiveFailures = function (taskId, callback) {\n        return jsonRPC(this.urlRoot + \"/forgive_failures\", {task_id: taskId}, function(response) {\n            callback(flatten(response.response));\n        });\n    };\n\n    LuigiAPI.prototype.markAsDone = function (taskId, callback) {\n        return jsonRPC(this.urlRoot + \"/mark_as_done\", {task_id: taskId}, function(response) {\n            callback(flatten(response.response));\n        });\n    };\n\n    LuigiAPI.prototype.getFailedTaskList = function(callback) {\n        return jsonRPC(this.urlRoot + \"/task_list\", {status: \"FAILED\", upstream_status: \"\", search: searchTerm()}, function(response) {\n            callback(flatten(response.response));\n        });\n    };\n\n    LuigiAPI.prototype.getUpstreamFailedTaskList = function(callback) {\n        return jsonRPC(this.urlRoot + \"/task_list\", {status: \"PENDING\", upstream_status: \"UPSTREAM_FAILED\", search: searchTerm()}, function(response) {\n            callback(flatten(response.response));\n        });\n    };\n\n    LuigiAPI.prototype.getDoneTaskList = function(callback) {\n        return jsonRPC(this.urlRoot + 
\"/task_list\", {status: \"DONE\", upstream_status: \"\", search: searchTerm()}, function(response) {\n            callback(flatten(response.response));\n        });\n    };\n\n    LuigiAPI.prototype.reEnable = function(taskId, callback) {\n        return jsonRPC(this.urlRoot + \"/re_enable_task\", {task_id: taskId}, function(response) {\n            callback(response.response);\n        });\n    };\n\n    LuigiAPI.prototype.getErrorTrace = function(taskId, callback) {\n        return jsonRPC(this.urlRoot + \"/fetch_error\", {task_id: taskId}, function(response) {\n            callback(response.response);\n        });\n    };\n\n    LuigiAPI.prototype.getTaskStatusMessage = function(taskId, callback) {\n        return jsonRPC(this.urlRoot + \"/get_task_status_message\", {task_id: taskId}, function(response) {\n            callback(response.response);\n        });\n    };\n\n    LuigiAPI.prototype.getTaskProgressPercentage = function(taskId, callback) {\n        return jsonRPC(this.urlRoot + \"/get_task_progress_percentage\", {task_id: taskId}, function(response) {\n            callback(response.response);\n        });\n    };\n\n    LuigiAPI.prototype.getRunningTaskList = function(callback) {\n        return jsonRPC(this.urlRoot + \"/task_list\", {status: \"RUNNING\", upstream_status: \"\", search: searchTerm()}, function(response) {\n            callback(flatten(response.response));\n        });\n    };\n\n    LuigiAPI.prototype.getBatchRunningTaskList = function(callback) {\n        return jsonRPC(this.urlRoot + \"/task_list\", {status: \"BATCH_RUNNING\", upstream_status: \"\", search: searchTerm()}, function(response) {\n            callback(flatten(response.response));\n        });\n    };\n\n    LuigiAPI.prototype.getPendingTaskList = function(callback) {\n        return jsonRPC(this.urlRoot + \"/task_list\", {status: \"PENDING\", upstream_status: \"\", search: searchTerm()}, function(response) {\n            callback(flatten(response.response));\n        
});\n    };\n\n    LuigiAPI.prototype.getDisabledTaskList = function(callback) {\n        jsonRPC(this.urlRoot + \"/task_list\", {status: \"DISABLED\", upstream_status: \"\", search: searchTerm()}, function(response) {\n            callback(flatten(response.response));\n        });\n    };\n\n    LuigiAPI.prototype.getUpstreamDisabledTaskList = function(callback) {\n        jsonRPC(this.urlRoot + \"/task_list\", {status: \"PENDING\", upstream_status: \"UPSTREAM_DISABLED\", search: searchTerm()}, function(response) {\n            callback(flatten(response.response));\n        });\n    };\n\n    LuigiAPI.prototype.getWorkerList = function(callback) {\n        jsonRPC(this.urlRoot + \"/worker_list\", {}, function(response) {\n            callback(flatten_running(response.response));\n        });\n    };\n\n    LuigiAPI.prototype.getResourceList = function(callback) {\n        jsonRPC(this.urlRoot + \"/resource_list\", {}, function(response) {\n            callback(flatten_running(response.response));\n        });\n    };\n\n    LuigiAPI.prototype.disableWorker = function(workerId) {\n        jsonRPC(this.urlRoot + \"/disable_worker\", {'worker': workerId});\n    };\n\n    LuigiAPI.prototype.setWorkerProcesses = function(workerId, n, callback) {\n        var data = {worker: workerId, n: n};\n        jsonRPC(this.urlRoot + \"/set_worker_processes\", data, function(response) {\n            callback();\n        });\n    };\n\n    LuigiAPI.prototype.sendSchedulerMessage = function(workerId, taskId, content, callback) {\n        var data = {worker: workerId, task: taskId, content: content};\n        jsonRPC(this.urlRoot + \"/send_scheduler_message\", data, function(response) {\n            if (callback) {\n                callback(response.response.message_id);\n            }\n        });\n    };\n\n    LuigiAPI.prototype.getSchedulerMessageResponse = function(taskId, messageId, callback) {\n        var data = {task_id: taskId, message_id: messageId};\n        
jsonRPC(this.urlRoot + \"/get_scheduler_message_response\", data, function(response) {\n            callback(response.response.response);\n        });\n    };\n\n    LuigiAPI.prototype.isPauseEnabled = function(callback) {\n        jsonRPC(this.urlRoot + '/is_pause_enabled', {}, function(response) {\n            callback(response.response.enabled);\n        });\n    };\n\n    LuigiAPI.prototype.hasTaskHistory = function(callback) {\n        jsonRPC(this.urlRoot + '/has_task_history', {}, function(response) {\n            callback(response.response);\n        });\n    };\n\n    LuigiAPI.prototype.pause = function() {\n        jsonRPC(this.urlRoot + '/pause');\n    };\n\n    LuigiAPI.prototype.unpause = function() {\n        jsonRPC(this.urlRoot + '/unpause');\n    };\n\n    LuigiAPI.prototype.isPaused = function(callback) {\n        jsonRPC(this.urlRoot + \"/is_paused\", {}, function(response) {\n            callback(!response.response.paused);\n        });\n    };\n\n    LuigiAPI.prototype.updateResource = function(resource, n, callback) {\n        var data = {'resource': resource, 'amount': n};\n        jsonRPC(this.urlRoot + \"/update_resource\", data, function(response) {\n            callback();\n        });\n    };\n\n    return LuigiAPI;\n})();\n"
  },
  {
    "path": "luigi/static/visualiser/js/test/graph_test.js",
    "content": "module(\"graph.js\");\n\ntest(\"nodeFromTask\", function() {\n    var task = {\n        deps: [\"B1\",\"C1\"],\n        taskId: \"A1\",\n        status: \"DONE\",\n        name: \"A\",\n        params: {},\n        priority: 0,\n    };\n    var expected = {\n        taskId: \"A1\",\n        status: \"DONE\",\n        trackingUrl: \"#A1\",\n        deps: [\"B1\",\"C1\"],\n        depth: -1,\n        name: \"A\",\n        params: {},\n        priority: 0,\n    };\n    let graph = {\n        hashBase: \"#\"\n    }\n    deepEqual(Graph.testableMethods.nodeFromTask.bind(graph)(task), expected);\n});\n\ntest(\"uniqueIndexByProperty\", function() {\n    var input = [\n        {a:\"x\", b:100},\n        {a:\"y\", b:101},\n        {a:\"z\", b:102}\n    ];\n    var expected = {\n        \"x\": 0,\n        \"y\": 1,\n        \"z\": 2\n    };\n    deepEqual(Graph.testableMethods.uniqueIndexByProperty(input, \"a\"), expected);\n});\n\ntest(\"createDependencyEdges\", function() {\n    var A = {taskId: \"A\", deps: [\"B\",\"C\"]};\n    var B = {taskId: \"B\", deps: [\"D\"]};\n    var C = {taskId: \"C\", deps: []};\n    var D = {taskId: \"D\", deps: []};\n    var nodes = [A,B,C,D];\n    var nodeIndex = {\"A\":0, \"B\":1, \"C\":2, \"D\":3};\n    var edges = Graph.testableMethods.createDependencyEdges(nodes, nodeIndex);\n    var expected = [\n        {source: A, target: B},\n        {source: A, target: C},\n        {source: B, target: D}\n    ];\n    deepEqual(edges, expected);\n});\n\ntest(\"computeDepth\", function() {\n    var A = {taskId: \"A\", deps: [\"B\",\"C\"], depth:-1};\n    var B = {taskId: \"B\", deps: [\"D\"], depth:-1};\n    var C = {taskId: \"C\", deps: [], depth:-1};\n    var D = {taskId: \"D\", deps: [], depth:-1};\n    var E = {taskId: \"C\", deps: [], depth:-1};\n    var nodes = [A,B,C,D,E];\n    var nodeIndex = {\"A\":0, \"B\":1, \"C\":2, \"D\":3};\n    Graph.testableMethods.computeDepth(nodes, nodeIndex);\n    equal(A.depth, 0);\n    
equal(B.depth, 1);\n    equal(C.depth, 1);\n    equal(D.depth, 2);\n    equal(E.depth, -1);\n});\n\ntest(\"computeRowsSelfDeps\", function () {\n    var A1 = {name: \"A\", taskId: \"A1\", deps: [\"A2\"], depth: -1}\n    var A2 = {name: \"A\", taskId: \"A2\", deps: [], depth: -1}\n    var nodes = [A1, A2]\n    var nodeIndex = {\"A1\": 0, \"A2\": 1}\n    var rowSizes = Graph.testableMethods.computeRows(nodes, nodeIndex)\n    equal(A1.depth, 0)\n    equal(A2.depth, 1)\n    deepEqual(rowSizes, [1, 1])\n});\n\ntest(\"computeRowsGrouped\", function() {\n    var A0 = {name: \"A\", taskId: \"A0\", deps: [\"D0\", \"B0\"], depth: -1}\n    var B0 = {name: \"B\", taskId: \"B0\", deps: [\"C1\", \"C2\"], depth: -1}\n    var C1 = {name: \"C\", taskId: \"C1\", deps: [\"D1\", \"E1\"], depth: -1}\n    var C2 = {name: \"C\", taskId: \"C2\", deps: [\"D2\", \"E2\"], depth: -1}\n    var D0 = {name: \"D\", taskId: \"D0\", deps: [], depth: -1}\n    var D1 = {name: \"D\", taskId: \"D1\", deps: [], depth: -1}\n    var D2 = {name: \"D\", taskId: \"D2\", deps: [], depth: -1}\n    var E1 = {name: \"E\", taskId: \"E1\", deps: [], depth: -1}\n    var E2 = {name: \"E\", taskId: \"E2\", deps: [], depth: -1}\n    var nodes = [A0, B0, C1, C2, D0, D1, D2, E1, E2]\n    var nodeIndex = {\"A0\": 0, \"B0\": 1, \"C1\": 2, \"C2\": 3, \"D0\": 4, \"D1\": 5, \"D2\": 6, \"E1\": 7, \"E2\": 8}\n    var rowSizes = Graph.testableMethods.computeRows(nodes, nodeIndex)\n    equal(A0.depth, 0)\n    equal(B0.depth, 1)\n    equal(C1.depth, 2)\n    equal(C2.depth, 2)\n    equal(D0.depth, 3)\n    equal(D1.depth, 3)\n    equal(D2.depth, 3)\n    equal(E1.depth, 4)\n    equal(E2.depth, 4)\n    deepEqual(rowSizes, [1, 1, 2, 3, 2])\n});\n\ntest(\"createGraph\", function() {\n    var tasks = [\n        {taskId: \"A\", deps: [\"B\",\"C\"], status: \"PENDING\"},\n        {taskId: \"B\", deps: [\"D\"], status: \"RUNNING\"},\n        {taskId: \"C\", deps: [], status: \"DONE\"},\n        {taskId: \"D\", deps: [], status: 
\"DONE\"},\n        {taskId: \"E\", deps: [], status: \"DONE\"}\n    ];\n    var graph = Graph.testableMethods.createGraph(tasks);\n    equal(graph.nodes.length, 4);\n    equal(graph.links.length, 3);\n    $.each(graph.nodes, function() {\n        notEqual(this.x, 0);\n        notEqual(this.y, 0);\n    });\n\n    // TODO: more assertions\n});\n"
  },
  {
    "path": "luigi/static/visualiser/js/tipsy.js",
    "content": "// tipsy, facebook style tooltips for jquery\n// version 1.0.0a\n// (c) 2008-2010 jason frame [jason@onehackoranother.com]\n// released under the MIT license\n\n(function($) {\n    \n    function maybeCall(thing, ctx) {\n        return (typeof thing == 'function') ? (thing.call(ctx)) : thing;\n    }\n    \n    function Tipsy(element, options) {\n        this.$element = $(element);\n        this.options = options;\n        this.enabled = true;\n        this.fixTitle();\n    }\n    \n    Tipsy.prototype = {\n        show: function() {\n            var title = this.getTitle();\n            if (title && this.enabled) {\n                var $tip = this.tip();\n                \n                $tip.find('.tipsy-inner')[this.options.html ? 'html' : 'text'](title);\n                $tip[0].className = 'tipsy'; // reset classname in case of dynamic gravity\n                $tip.remove().css({top: 0, left: 0, visibility: 'hidden', display: 'block'}).prependTo(document.body);\n                \n                var pos = $.extend({}, this.$element.offset(), {\n                    width: this.$element[0].offsetWidth || 0,\n                    height: this.$element[0].offsetHeight || 0\n                });\n\n                if (typeof this.$element[0].nearestViewportElement == 'object') {\n                    // SVG\n\t\t\t\t\tvar el = this.$element[0];\n                    var rect = el.getBoundingClientRect();\n\t\t\t\t\tpos.width = rect.width;\n\t\t\t\t\tpos.height = rect.height;\n                }\n\n                \n                var actualWidth = $tip[0].offsetWidth,\n                    actualHeight = $tip[0].offsetHeight,\n                    gravity = maybeCall(this.options.gravity, this.$element[0]);\n                \n                var tp;\n                switch (gravity.charAt(0)) {\n                    case 'n':\n                        tp = {top: pos.top + pos.height + this.options.offset, left: pos.left + pos.width / 2 - actualWidth / 2};\n 
                       break;\n                    case 's':\n                        tp = {top: pos.top - actualHeight - this.options.offset, left: pos.left + pos.width / 2 - actualWidth / 2};\n                        break;\n                    case 'e':\n                        tp = {top: pos.top + pos.height / 2 - actualHeight / 2, left: pos.left - actualWidth - this.options.offset};\n                        break;\n                    case 'w':\n                        tp = {top: pos.top + pos.height / 2 - actualHeight / 2, left: pos.left + pos.width + this.options.offset};\n                        break;\n                }\n                \n                if (gravity.length == 2) {\n                    if (gravity.charAt(1) == 'w') {\n                        tp.left = pos.left + pos.width / 2 - 15;\n                    } else {\n                        tp.left = pos.left + pos.width / 2 - actualWidth + 15;\n                    }\n                }\n                \n                $tip.css(tp).addClass('tipsy-' + gravity);\n                $tip.find('.tipsy-arrow')[0].className = 'tipsy-arrow tipsy-arrow-' + gravity.charAt(0);\n                if (this.options.className) {\n                    $tip.addClass(maybeCall(this.options.className, this.$element[0]));\n                }\n                \n                if (this.options.fade) {\n                    $tip.stop().css({opacity: 0, display: 'block', visibility: 'visible'}).animate({opacity: this.options.opacity});\n                } else {\n                    $tip.css({visibility: 'visible', opacity: this.options.opacity});\n                }\n\n                var t = this;\n                var set_hovered  = function(set_hover){\n                    return function(){\n                        t.$tip.stop();\n                        t.tipHovered = set_hover;\n                        if (!set_hover){\n                            if (t.options.delayOut === 0) {\n                                
t.hide();\n                            } else {\n                                setTimeout(function() { \n                                    if (t.hoverState == 'out') t.hide(); }, t.options.delayOut);\n                            }\n                        }\n                    };\n                };\n               $tip.hover(set_hovered(true), set_hovered(false));\n            }\n        },\n        \n        hide: function() {\n            if (this.options.fade) {\n                this.tip().stop().fadeOut(function() { $(this).remove(); });\n            } else {\n                this.tip().remove();\n            }\n        },\n        \n        fixTitle: function() {\n            var $e = this.$element;\n            \n            if ($e.attr('title') || typeof($e.attr('original-title')) != 'string') {\n                $e.attr('original-title', $e.attr('title') || '').removeAttr('title');\n            }\n            if (typeof $e.context.nearestViewportElement == 'object'){                                                        \n                if ($e.children('title').length){\n                    $e.append('<original-title>' + ($e.children('title').text() || '') + '</original-title>')\n                        .children('title').remove();\n                }\n            }\n        },\n        \n        getTitle: function() {\n            \n            var title, $e = this.$element, o = this.options;\n            this.fixTitle();\n\n            if (typeof o.title == 'string') {\n                var title_name = o.title == 'title' ? 
'original-title' : o.title;\n                if ($e.children(title_name).length){\n                    title = $e.children(title_name).html();\n                } else{\n                    title = $e.attr(title_name);\n                }\n                \n            } else if (typeof o.title == 'function') {\n                title = o.title.call($e[0]);\n            }\n            title = ('' + title).replace(/(^\\s*|\\s*$)/, \"\");\n            return title || o.fallback;\n        },\n        \n        tip: function() {\n            if (!this.$tip) {\n                this.$tip = $('<div class=\"tipsy\"></div>').html('<div class=\"tipsy-arrow\"></div><div class=\"tipsy-inner\"></div>');\n            }\n            return this.$tip;\n        },\n        \n        validate: function() {\n            if (!this.$element[0].parentNode) {\n                this.hide();\n                this.$element = null;\n                this.options = null;\n            }\n        },\n        \n        enable: function() { this.enabled = true; },\n        disable: function() { this.enabled = false; },\n        toggleEnabled: function() { this.enabled = !this.enabled; }\n    };\n    \n    $.fn.tipsy = function(options) {\n        \n        if (options === true) {\n            return this.data('tipsy');\n        } else if (typeof options == 'string') {\n            var tipsy = this.data('tipsy');\n            if (tipsy) tipsy[options]();\n            return this;\n        }\n        \n        options = $.extend({}, $.fn.tipsy.defaults, options);\n\n        if (options.hoverlock && options.delayOut === 0) {\n\t    options.delayOut = 100;\n\t}\n        \n        function get(ele) {\n            var tipsy = $.data(ele, 'tipsy');\n            if (!tipsy) {\n                tipsy = new Tipsy(ele, $.fn.tipsy.elementOptions(ele, options));\n                $.data(ele, 'tipsy', tipsy);\n            }\n            return tipsy;\n        }\n        \n        function enter() {\n            var 
tipsy = get(this);\n            tipsy.hoverState = 'in';\n            if (options.delayIn === 0) {\n                tipsy.show();\n            } else {\n                tipsy.fixTitle();\n                setTimeout(function() { if (tipsy.hoverState == 'in') tipsy.show(); }, options.delayIn);\n            }\n        }\n        \n        function leave() {\n            var tipsy = get(this);\n            tipsy.hoverState = 'out';\n            if (options.delayOut === 0) {\n                tipsy.hide();\n            } else {\n                var to = function() {\n                    if (!tipsy.tipHovered || !options.hoverlock){\n                        if (tipsy.hoverState == 'out') tipsy.hide(); \n                    }\n                };\n                setTimeout(to, options.delayOut);\n            }    \n        }\n\n        if (options.trigger != 'manual') {\n            var binder = options.live ? 'live' : 'bind',\n                eventIn = options.trigger == 'hover' ? 'mouseenter' : 'focus',\n                eventOut = options.trigger == 'hover' ? 'mouseleave' : 'blur';\n            this[binder](eventIn, enter)[binder](eventOut, leave);\n        }\n        \n        return this;\n        \n    };\n    \n    $.fn.tipsy.defaults = {\n        className: null,\n        delayIn: 0,\n        delayOut: 0,\n        fade: false,\n        fallback: '',\n        gravity: 'n',\n        html: false,\n        live: false,\n        offset: 0,\n        opacity: 0.8,\n        title: 'title',\n        trigger: 'hover',\n        hoverlock: false\n    };\n    \n    // Overwrite this method to provide options on a per-element basis.\n    // For example, you could store the gravity in a 'tipsy-gravity' attribute:\n    // return $.extend({}, options, {gravity: $(ele).attr('tipsy-gravity') || 'n' });\n    // (remember - do not modify 'options' in place!)\n    $.fn.tipsy.elementOptions = function(ele, options) {\n        return $.metadata ? 
$.extend({}, options, $(ele).metadata()) : options;\n    };\n    \n    $.fn.tipsy.autoNS = function() {\n        return $(this).offset().top > ($(document).scrollTop() + $(window).height() / 2) ? 's' : 'n';\n    };\n    \n    $.fn.tipsy.autoWE = function() {\n        return $(this).offset().left > ($(document).scrollLeft() + $(window).width() / 2) ? 'e' : 'w';\n    };\n    \n    /**\n     * yields a closure of the supplied parameters, producing a function that takes\n     * no arguments and is suitable for use as an autogravity function like so:\n     *\n     *     $('a').tipsy({gravity: $.fn.tipsy.autoBounds(50, 'sw')});\n     *\n     * @param margin (int) - distance from the viewable region edge that an\n     *        element should be before setting its tooltip's gravity to be away\n     *        from that edge.\n     * @param prefer (string, e.g. 'n', 'sw', 'w') - the direction to prefer\n     *        if there are no viewable region edges affecting the tooltip's\n     *        gravity. It will try to vary from this minimally, for example,\n     *        if 'sw' is preferred and an element is near the right viewable \n     *        region edge, but not the top edge, it will set the gravity for\n     *        that element's tooltip to be 'se', preserving the southern\n     *        component.\n     */\n     $.fn.tipsy.autoBounds = function(margin, prefer) {\n\t\treturn function() {\n\t\t\tvar dir = {ns: prefer[0], ew: (prefer.length > 1 ? prefer[1] : false)},\n\t\t\t    boundTop = $(document).scrollTop() + margin,\n\t\t\t    boundLeft = $(document).scrollLeft() + margin,\n\t\t\t    $this = $(this);\n\n\t\t\tif ($this.offset().top < boundTop) dir.ns = 'n';\n\t\t\tif ($this.offset().left < boundLeft) dir.ew = 'w';\n\t\t\tif ($(window).width() + $(document).scrollLeft() - $this.offset().left < margin) dir.ew = 'e';\n\t\t\tif ($(window).height() + $(document).scrollTop() - $this.offset().top < margin) dir.ns = 's';\n\n\t\t\treturn dir.ns + (dir.ew ? dir.ew : '');\n\t\t};\n    };\n})(jQuery);"
  },
  {
    "path": "luigi/static/visualiser/js/util.js",
    "content": "function escapeHtml(unsafe) {\n  return unsafe\n    .replace(/&/g, \"&amp;\")\n    .replace(/</g, \"&lt;\")\n    .replace(/>/g, \"&gt;\")\n    .replace(/\"/g, \"&quot;\")\n    .replace(/'/g, \"&#039;\");\n}\n"
  },
  {
    "path": "luigi/static/visualiser/js/visualiserApp.js",
    "content": "function visualiserApp(luigi) {\n    var templates = {};\n    var typingTimer = 0;\n    var dt; // DataTable instantiated in $(document).ready()\n    var missingCategories = {};\n    var currentFilter = {\n        taskFamily: \"\",\n        taskCategory: [],\n        tableFilter: \"\"\n    };\n    var taskIcons = {\n        PENDING: 'pause',\n        RUNNING: 'play',\n        BATCH_RUNNING: 'play',\n        DONE: 'check',\n        FAILED: 'times',\n        UPSTREAM_FAILED: 'warning',\n        DISABLED: 'minus-circle',\n        UPSTREAM_DISABLED: 'warning'\n    };\n    var VISTYPE_DEFAULT = 'svg';\n\n    /*\n     * Updates view of the Visualization type.\n     */\n    function updateVisType(newVisType) {\n        $('#toggleVisButtons label').removeClass('active');\n        var visTypeInput = $('#toggleVisButtons input[value=\"' + newVisType + '\"]');\n        visTypeInput.parent().addClass('active');\n        visTypeInput.prop('checked', true);\n    }\n\n    function loadTemplates() {\n        $(\"script[type='text/template']\").each(function(i, element) {\n            var name = $(element).attr(\"name\");\n            var content = $(element).text();\n            templates[name] = content;\n        });\n    }\n\n    function renderTemplate(templateName, dataObject) {\n        return $(\"<div>\").html(Mustache.render(templates[templateName], dataObject));\n    }\n\n\n    function formatTime(dateObject) {\n        return dateObject.getHours() + \":\" + dateObject.getMinutes() + \":\" + dateObject.getSeconds();\n    }\n\n    function taskToDisplayTask(task) {\n        var taskName = task.name;\n        var taskParams = JSON.stringify(task.params);\n        var displayTime = new Date(Math.floor(task.last_updated*1000)).toLocaleString();\n        var time_running = -1;\n        if (task.status == \"RUNNING\" && \"time_running\" in task) {\n            var current_time = new Date().getTime();\n            var minutes_running = Math.round((current_time - 
task.time_running * 1000) / 1000 / 60);\n            time_running = task.time_running;\n            displayTime += \" | \" + minutes_running + \" minutes\";\n        }\n        return {\n            taskId: task.taskId,\n            encodedTaskId: encodeURIComponent(task.taskId),\n            taskName: taskName,\n            taskParams: taskParams,\n            displayName: task.display_name,\n            priority: task.priority,\n            resources: JSON.stringify(task.resources_running || task.resources).replace(/,\"/g, ', \"'),\n            displayTime: displayTime,\n            displayTimestamp: task.last_updated,\n            timeRunning: time_running,\n            trackingUrl: task.tracking_url,\n            status: task.status,\n            graph: (task.status == \"PENDING\" || task.status == \"RUNNING\" || task.status == \"DONE\"),\n            error: task.status == \"FAILED\",\n            re_enable: task.status == \"DISABLED\" && task.re_enable_able,\n            mark_as_done: (task.status == \"RUNNING\" || task.status == \"FAILED\" || task.status == \"DISABLED\"),\n            statusMessage: task.status_message,\n            progressPercentage: task.progress_percentage,\n            acceptsMessages: task.accepts_messages,\n            workerIdRunning: task.worker_running,\n        };\n    }\n\n    function taskCategoryIcon(category) {\n        var iconClass;\n        var iconColor;\n        switch (category) {\n            case 'PENDING':\n                iconClass = 'fa-pause';\n                iconColor = 'yellow';\n                break;\n            case 'RUNNING':\n                iconClass = 'fa-play';\n                iconColor = 'aqua';\n                break;\n            case 'BATCH_RUNNING':\n                iconClass = 'fa-play';\n                iconColor = 'purple';\n                break;\n            case 'DONE':\n                iconClass = 'fa-check';\n                iconColor = 'green';\n                break;\n            case 
'FAILED':\n                iconClass = 'fa-times';\n                iconColor = 'red';\n                break;\n            case 'DISABLED':\n                iconClass = 'fa-minus-circle';\n                iconColor = 'gray';\n                break;\n            case 'UPSTREAM_FAILED':\n                iconClass = 'fa-warning';\n                iconColor = 'maroon';\n                break;\n            case 'UPSTREAM_DISABLED':\n                iconClass = 'fa-warning';\n                iconColor = 'gray';\n                break;\n            default:\n                iconClass = 'fa-bug';\n                iconColor = 'orange';\n                break;\n        }\n        return '<span class=\"status-icon bg-' + iconColor + '\"><i class=\"fa ' + iconClass + '\"></i></span>';\n    }\n\n    /**\n     * Filter table by all activated info boxes.\n     */\n    function filterByCategory(dt, activeBoxes) {\n        if (activeBoxes === undefined) {\n            activeBoxes = getActiveBoxes();\n        }\n        currentFilter.taskCategory = activeBoxes;\n        dt.column(0).search(categoryQuery(activeBoxes), regex=true).draw();\n    }\n\n    function categoryQuery(activeBoxes) {\n        // Searched content will be <icon> <category>.\n        return '\\\\b(' + activeBoxes.join('|') + ')\\\\b';\n    }\n\n    function getActiveBoxes() {\n        var infoBoxes = $('.info-box');\n\n        var activeBoxes = [];\n        infoBoxes.each(function (i) {\n            if (infoBoxes[i].dataset.on === 'yes') {\n                activeBoxes.push(infoBoxes[i].dataset.category);\n            }\n        });\n        return activeBoxes;\n    }\n\n    function filterByTaskFamily(taskFamily, dt) {\n        currentFilter.taskFamily = taskFamily;\n        if (taskFamily === \"\") {\n            dt.column(1).search('').draw();\n        }\n        else {\n            dt.column(1).search('^' + taskFamily + '$', regex = true).draw();\n        }\n    }\n\n    function toggleInfoBox(infoBox, 
activate) {\n        var infoBoxColor = infoBox.dataset.color;\n        var infoBoxIcon = $(infoBox).find('.info-box-icon');\n        var colorClass = 'bg-' + infoBoxColor;\n\n        if ((infoBox.dataset.on === undefined) || (infoBox.dataset.on === 'no') || activate) {\n            infoBox.dataset.on = 'yes';\n            infoBoxIcon.removeClass(colorClass);\n            $(infoBox).addClass(colorClass);\n        }\n        else {\n            infoBox.dataset.on = 'no';\n            $(infoBox).removeClass(colorClass);\n            infoBoxIcon.addClass(colorClass);\n        }\n    }\n\n    function renderSidebar(tasks) {\n        // tasks is a list of task names\n        var counts = {};\n        $.each(tasks, function(i) {\n            var name = tasks[i];\n            if (counts[name] === undefined) {\n                counts[name] = 0;\n            }\n            counts[name] += 1;\n        });\n        var taskList = [];\n        $.each(counts, function (name) {\n            var dotIndex = name.indexOf('.');\n            var prefix = 'Others';\n            if (dotIndex > 0) {\n                prefix = name.slice(0, dotIndex);\n            }\n            var prefixList = taskList.find(function (pref) {\n                return pref.name == prefix;\n            })\n            if (prefixList) {\n                prefixList.tasks.push({name: name, count: counts[name]});\n            } else {\n                prefixList = {\n                    name: prefix,\n                    tasks: [{name: name, count: counts[name]}]\n                }\n                taskList.push(prefixList);\n            }\n\n        });\n        taskList.sort(function(a,b){\n            if (a.name == 'Others') {\n                if (b.name == 'Others') {\n                    return 0;\n                }\n                return 1;\n            } else if (b.name == 'Others') {\n                return -1;\n            }\n            return a.name.localeCompare(b.name);\n        });\n        
taskList.forEach(function(p){\n            p.tasks.sort(function(a,b){\n                return a.name.localeCompare(b.name);\n            });\n        });\n        return renderTemplate(\"sidebarTemplate\", {\"tasks\": taskList});\n    }\n\n    function selectSidebarItem(item) {\n        var sidebarItems = $('.sidebar').find('li');\n        sidebarItems.each(function (i) {\n            var item2 = sidebarItems[i];\n            if (item2.dataset.task === undefined) {\n                return;\n            }\n            if (item === item2) {\n                if ($(item2).hasClass('active')) {\n                    // item is active, deselect\n                    $(item2).removeClass('active');\n                    $(item2).find('.badge').removeClass('bg-green');\n                }\n                else {\n                    // select item\n                    $(item2).addClass('active');\n                    $(item2).find('.badge').addClass('bg-green');\n                }\n            }\n            else {\n                // clear any selection\n                $(item2).removeClass('active');\n                $(item2).find('.badge').removeClass('bg-green');\n            }\n        });\n    }\n\n    function renderWarnings() {\n        return renderTemplate(\n            \"warningsTemplate\",\n            {missingCategories: $.map(missingCategories, function (v, k) {return v;})}\n        );\n    }\n\n    function processWorker(worker) {\n        worker.encoded_first_task = encodeURIComponent(worker.first_task);\n        worker.tasks = worker.running.map(taskToDisplayTask);\n        worker.tasks.sort(function(task1, task2) { return task1.timeRunning - task2.timeRunning; });\n        worker.start_time = new Date(worker.started * 1000).toLocaleString();\n        worker.active = new Date(worker.last_active * 1000).toLocaleString();\n        worker.is_disabled = worker.state === 'disabled';\n        return worker;\n    }\n\n    function renderWorkers(workers) {\n        
return renderTemplate(\"workerTemplate\", {\"workerList\": workers.map(processWorker)});\n    }\n\n    function processResource(resource) {\n        resource.tasks = resource.running.map(taskToDisplayTask);\n        resource.percent_used = 100 * resource.num_used / resource.num_total;\n        if (resource.percent_used >= 100) {\n            resource.bar_type = 'danger';\n            resource.percent_used = 100;\n        } else if (resource.percent_used > 50) {\n            resource.bar_type = 'warning';\n        } else {\n            resource.bar_type = 'success';\n        }\n        return resource;\n    }\n\n    function renderResources(resources) {\n        return renderTemplate(\"resourceTemplate\", {\n            \"resources\": resources.map(processResource).sort(function(r1, r2) {\n                if (r1.percent_used > r2.percent_used)\n                    return -1;\n                else if (r1.percent_used < r2.percent_used)\n                    return 1;\n                else if (r1.num_used > r2.num_used)\n                    return -1;\n                else if (r1.num_used < r2.num_used)\n                    return 1;\n                else if (r1.name < r2.name)\n                    return -1;\n                else if (r1.name > r2.name)\n                    return 1;\n                else\n                    return 0;\n            })\n        });\n    }\n\n    function switchTab(tabId) {\n        $(\".tabButton\").parent().removeClass(\"active\");\n        $(\".tab-pane\").removeClass(\"active\");\n        $(\"#\" + tabId).addClass(\"active\");\n        $(\".navbar-nav li\").removeClass(\"active\");\n        $(\".js-nav-link[data-tab=\" + tabId + \"]\").parent().addClass(\"active\");\n        updateSidebar(tabId);\n    }\n\n    function showErrorTrace(data) {\n        data.error = decodeError(data.error);\n        if (data.taskParams) {\n          data.taskParams = Object.entries(data.taskParams).map(([k,v]) => `--${k.replace(/_/g, '-')} 
${JSON.stringify(v)}`).join(\" \");\n        }\n        $(\"#errorModal\").empty().append(renderTemplate(\"errorTemplate\", data));\n        $(\"#errorModal\").modal({});\n    }\n\n    function showStatusMessage(data) {\n        $(\"#statusMessageModal\").empty().append(renderTemplate(\"statusMessageTemplate\", data));\n        $(\"#statusMessageModal\").modal({});\n        var refreshInterval = setInterval(function() {\n                if ($(\"#statusMessageModal\").is(\":hidden\"))\n                    clearInterval(refreshInterval);\n                else {\n                    luigi.getTaskStatusMessage(data.taskId, function(data) {\n                        if (data.statusMessage === null)\n                            $(\"#statusMessageModal pre\").hide();\n                        else {\n                            $(\"#statusMessageModal pre\").html(data.statusMessage).show();\n                        }\n                    });\n                    luigi.getTaskProgressPercentage(data.taskId, function(data) {\n                        // show or hide the progress bar container in the message modal\n                        $(\"#statusMessageModal .progress\").toggle(data.progressPercentage !== null);\n\n                        // adjust the status of both progress bars (message modal and worker list)\n                        var value = data.progressPercentage || 0;\n                        var progressBars = $('#statusMessageModal .progress-bar, ' +\n                            '.worker-table tbody .taskProgressBar[data-task-id=\"' + data.taskId + '\"]');\n                        progressBars.attr('aria-valuenow', value)\n                            .text(value + '%')\n                            .css({'width': value + '%'});\n                    });\n                }\n            },\n            500\n        );\n    }\n\n    function showSchedulerMessageModal(data) {\n        var $modal = $(\"#schedulerMessageModal\");\n\n        
$modal.empty().append(renderTemplate(\"schedulerMessageTemplate\", data));\n        var $input = $modal.find(\"#schedulerMessageInput\");\n        var $send = $modal.find(\"#schedulerMessageButton\");\n        var $awaitResponse = $modal.find(\"#schedulerMessageAwaitResponse\");\n        var $responseContainer = $modal.find(\"#schedulerMessageResponse\");\n        var $responseSpinner = $responseContainer.find(\"pre > i\");\n        var $responseContent = $responseContainer.find(\"pre > div\");\n\n        $input.on(\"keypress\", function($event) {\n            if (event.keyCode == 13) {\n                $send.trigger(\"click\");\n                $event.preventDefault();\n            }\n        });\n\n        $send.on(\"click\", function($event) {\n            var content = $input.val();\n            var awaitResponse = $awaitResponse.prop(\"checked\");\n            if (content && data.worker) {\n                if (awaitResponse) {\n                    $responseContainer.show();\n                    $responseSpinner.show();\n                    $responseContent.empty();\n                    luigi.sendSchedulerMessage(data.worker, data.taskId, content, function(messageId) {\n                        var interval = window.setInterval(function() {\n                            luigi.getSchedulerMessageResponse(data.taskId, messageId, function(response) {\n                                if (response != null) {\n                                    clearInterval(interval);\n                                    $responseSpinner.hide();\n                                    $responseContent.html(response);\n                                }\n                            });\n                        }, 1000);\n                    });\n                    $event.stopPropagation();\n                } else {\n                    $responseContainer.hide();\n                    luigi.sendSchedulerMessage(data.worker, data.taskId, content);\n                }\n            }\n        
});\n\n        $modal.on(\"shown.bs.modal\", function() {\n            $input.focus();\n        });\n\n        $modal.modal({});\n    }\n\n    function preProcessGraph(dependencyGraph) {\n        var extraNodes = [];\n        var seen = {};\n        $.each(dependencyGraph, function(i, node) {\n            seen[node.taskId] = true;\n        });\n        $.each(dependencyGraph, function(i, node) {\n            $.each(node.deps, function(j, dep) {\n                if (!seen[dep]) {\n                    seen[dep] = true;\n                    var paramsStrs = (/\\((.*)\\)/.exec(dep) || ['', ''])[1].split(', ');\n                    var params = {};\n                    $.each(paramsStrs, function(i, param) {\n                        if (param !== \"\") {\n                            var kv = param.split('=');\n                            params[kv[0]] = kv[1];\n                        }\n                    });\n\n                    extraNodes.push({\n                        name: (/(\\w+)\\(/.exec(dep) || [])[1],\n                        taskId: dep,\n                        deps: [],\n                        params: params,\n                        status: \"TRUNCATED\"\n                    });\n                }\n            });\n        });\n        return dependencyGraph.concat(extraNodes);\n    }\n\n    function makeGraphCallback(visType, taskId, paint) {\n        function depGraphCallbackD3(dependencyGraph) {\n            $(\"#searchError\").empty();\n            $(\"#searchError\").removeClass();\n            if(dependencyGraph.length > 0) {\n                $(\"#dependencyTitle\").text(dependencyGraph[0].display_name);\n                if(dependencyGraph != '{}'){\n                    for (var id in dependencyGraph) {\n                        if (dependencyGraph[id].deps.length > 0) {\n                            //console.log(asingInput(dependencyGraph, id));\n                            dependencyGraph[id].inputQueue = asingInput(dependencyGraph, id);\n      
                      dependencyGraph[id].inputThroughput = 50;\n                            dependencyGraph[id].count = 5;\n                            dependencyGraph[id].consumers = 1;\n                        }else{\n                            dependencyGraph[id].inputThroughput = 50;\n                            dependencyGraph[id].count = 5;\n                            dependencyGraph[id].consumers = 1;\n                        }\n                    }\n                }\n            } else {\n                $(\"#searchError\").addClass(\"alert alert-error\");\n                $(\"#searchError\").text(\"Couldn't find task \" + taskId);\n            }\n            drawGraphETL(dependencyGraph, paint);\n            bindGraphEvents();\n        }\n\n        function depGraphCallback (dependencyGraph) {\n            $(\"#graphPlaceholder svg\").empty();\n            $(\"#searchError\").empty();\n            $(\"#searchError\").removeClass();\n            if(dependencyGraph.length > 0) {\n                $(\"#dependencyTitle\").text(dependencyGraph[0].display_name);\n                var hashBaseObj = URI.parseQuery(location.hash.replace('#', ''));\n                delete hashBaseObj.taskId;\n                var hashBase = '#' + URI.buildQuery(hashBaseObj) + '&taskId=';\n                $(\"#graphPlaceholder\").get(0).graph.updateData(dependencyGraph, hashBase);\n                $(\"#graphContainer\").show();\n                bindGraphEvents();\n            } else {\n                $(\"#searchError\").addClass(\"alert alert-error\");\n                $(\"#searchError\").text(\"Couldn't find task \" + taskId);\n            }\n        }\n\n        function processedCallback(callback) {\n            function processed(dependencyGraph) {\n                return callback(preProcessGraph(dependencyGraph));\n            }\n            return processed;\n        }\n\n        if (visType == 'd3') {\n            return processedCallback(depGraphCallbackD3);\n        }\n   
     else {\n            return processedCallback(depGraphCallback);\n        }\n    }\n\n    function processHashChange(paint) {\n        var hash = decodeURIComponent(location.hash);\n        // Convert fragment params to object.\n        var fragmentQuery = URI.parseQuery(location.hash.replace('#', '')); // \"http://example.org/#!/foo/bar/baz.html\");\n\n        if (fragmentQuery.tab == \"workers\") {\n            switchTab(\"workerList\");\n        } else if (fragmentQuery.tab == \"resources\") {\n            expandResources(fragmentQuery.resources);\n            switchTab(\"resourceList\");\n        } else if (fragmentQuery.tab == \"graph\") {\n            var taskId = fragmentQuery.taskId;\n            var hideDone = fragmentQuery.hideDone === '1' ? true : false;\n\n            // Populate fields with values from hash.\n            $('#hideDoneCheckbox').prop('checked', hideDone);\n            $(\"#invertCheckbox\").prop('checked', fragmentQuery.invert === '1' ? true : false);\n            $(\"#js-task-id\").val(fragmentQuery.taskId);\n\n            // Empty errors.\n            $(\"#searchError\").empty();\n            $(\"#searchError\").removeClass();\n\n            var visType = fragmentQuery.visType || VISTYPE_DEFAULT;\n            if (taskId) {\n                var depGraphCallback = makeGraphCallback(visType, taskId, paint);\n\n                if (fragmentQuery.invert) {\n                    luigi.getInverseDependencyGraph(taskId, depGraphCallback, !hideDone);\n                } else {\n                    luigi.getDependencyGraph(taskId, depGraphCallback, !hideDone);\n                }\n            }\n            updateVisType(visType);\n            initVisualisation(visType);\n            switchTab(\"dependencyGraph\");\n        } else {\n            // Tasks tab.\n\n            // Populate fields with values from hash.\n            if (fragmentQuery.length) {\n                $('select[name=taskTable_length]').val(fragmentQuery.length);\n            
}\n            $(\"#serverSideCheckbox\").prop('checked', fragmentQuery.filterOnServer === '1' ? true : false);\n            dt.search(fragmentQuery.search__search);\n\n            $('#familySidebar li').removeClass('active');\n            $('#familySidebar li .badge').removeClass('bg-green');\n            if (fragmentQuery.family) {\n                family_item = $('#familySidebar li[data-task=\"' + fragmentQuery.family + '\"]');\n                family_item.addClass('active');\n                family_item.find('.badge').addClass('bg-green');\n                filterByTaskFamily(fragmentQuery.family, dt);\n            }\n\n            if (fragmentQuery.statuses) {\n                var statuses = JSON.parse(fragmentQuery.statuses);\n                $.each(statuses, function (status) {\n                    toggleInfoBox($('#' + statuses[status] + '_info')[0], true);\n                });\n                filterByCategory(dt, statuses);\n            }\n\n            if (fragmentQuery.order) {\n                dt.order([fragmentQuery.order.split(',')]);\n            }\n            dt.draw();\n            switchTab(\"taskList\");\n        }\n    }\n\n    function bindGraphEvents() {\n        var fragmentQuery = URI.parseQuery(location.hash.replace('#', ''));\n        var visType = fragmentQuery.visType;\n        if (visType === 'd3') {\n            $('.node').click(function(event) {\n                var taskDiv = $(this).find('.taskNode');\n                var taskId = taskDiv.attr(\"data-task-id\");\n                event.preventDefault();\n                // NOTE : hasClass() not reliable inside SVG\n                if ($(this).attr('class').match(/\\bFAILED\\b/)) {\n                    luigi.getErrorTrace(taskId, function (error) {\n                        showErrorTrace(error);\n                    });\n                }\n                else {\n                    fragmentQuery['taskId'] = taskId;\n                    window.location.href = 'index.html#' + 
URI.buildQuery(fragmentQuery);\n                }\n            });\n        }\n        else {\n            $(\".graph-node-a\").click(function(event) {\n                var taskId = $(this).attr(\"data-task-id\");\n                var status = $(this).attr(\"data-task-status\");\n                if (status == \"FAILED\") {\n                    event.preventDefault();\n                    luigi.getErrorTrace(taskId, function(error) {\n                       showErrorTrace(error);\n                    });\n                }\n            });\n        }\n    }\n\n    function bindListEvents() {\n        $(window).on('hashchange', processHashChange);\n\n        $('#serverSideCheckbox').click(function(e) {\n            e.preventDefault();\n            changeState('filterOnServer', this.checked ? '1' : null);\n            updateTasks();\n        });\n\n        $(\"#invertCheckbox\").click(function(e) {\n            e.preventDefault();\n            changeState('invert', this.checked ? '1' : null);\n        });\n\n        $('#hideDoneCheckbox').click(function(e) {\n            // Copy checkbox value to hash.\n            e.preventDefault();\n            changeState('hideDone', this.checked ? '1' : null);\n        });\n        $(\"a[href=#list]\").click(function() { location.hash=\"\"; });\n        $(\"#loadTaskForm\").submit(function(event) {\n            event.preventDefault();\n            var taskId = $(this).find(\"input\").val();\n            changeState('taskId', taskId.length > 0 ? 
taskId : null);\n        });\n\n        $('.info-box').on('click', function () {\n            toggleInfoBox(this);\n            filterByCategory(dt);\n        });\n\n        $('input[name=vis-type]').on('change', function () {\n            changeState('visType', $(this).val());\n        });\n\n        /*\n          Note: The #filter-input element is used by LuigiAPI to constrain requests to the server.\n          When the accompanying button is pressed we force a reload.\n         */\n        $('#serverSide').on('change', 'label', function () {\n            updateTasks();\n        });\n    }\n\n    function asingInput(worker, id){\n        if (worker[id].deps.length > 0) {\n            //console.log(worker[id].deps);\n            return worker[id].deps;\n        }\n    }\n\n    function getDurations(tasks, listId){\n        var durations = {};\n        for (var i = 0; i < listId.length; i++) {\n            for (var j = 0; j < tasks.length; j++) {\n                if (listId[i] === tasks[j].taskId) {\n                    // The duration of the task from when it started running to when it finished.\n                    var finishTime = new Date(tasks[j].last_updated*1000);\n                    var startTime = new Date(tasks[j].time_running*1000);\n                    durations[listId[i]] = new Date(finishTime - startTime);\n                }\n            }\n        }\n        return durations;\n    }\n\n    function getParam(tasks, id){\n        for (var i = 0; i < tasks.length; i++) {\n            if (tasks[i].taskId === id) {\n                return tasks[i].worker_running;\n            }\n        }\n    }\n\n    function getStatusTasks(tasks){\n        var status;\n        for (var i = 0; i < tasks.length; i++) {\n            if (tasks[i].status === \"DONE\") {\n                status = true;\n            } else {\n                return false;\n            }\n        }\n        return status;\n    }\n\n    function drawGraphETL(tasks, paint){\n        // Set up 
zoom support\n        var svg = d3.select(\"#mysvg\");\n        var inner = svg.select(\"g\"),\n            zoom = d3.behavior.zoom().on(\"zoom\", function() {\n            inner.attr(\"transform\", \"translate(\" + d3.event.translate + \")\" +\n                \"scale(\" + d3.event.scale + \")\");\n            });\n        svg.call(zoom);\n\n        // Create map of taskId to task\n        var taskIdMap = {};\n        $.each(tasks, function (i, task) {\n            taskIdMap[task.taskId] = task;\n        });\n\n        var render = new dagreD3.render();\n        // Left-to-right layout\n        var g = new dagreD3.graphlib.Graph();\n        g.setGraph({\n            nodesep: 70,\n            ranksep: 50,\n            rankdir: \"LR\",\n            marginx: 20,\n            marginy: 20,\n            height: 400,\n            ranker: \"longest-path\"\n        });\n\n        function draw(isUpdate) {\n            for (var id in tasks) {\n                var task = tasks[id];\n                var className = task.status;\n\n                var html = \"<div class='taskNode' data-task-id='\" + task.taskId + \"'>\";\n                html += \"<span class=status></span>\";\n                html += \"<span class=name>\"+task.name+\"</span>\";\n                html += \"<span class=queue><span class=counter>\"+ task.status +\"</span></span>\";\n                html += \"</div>\";\n                g.setNode(task.taskId, {\n                    labelType: \"html\",\n                    label: html,\n                    rx: 5,\n                    ry: 5,\n                    padding: 0,\n                    class: className\n                });\n                if (task.inputQueue) {\n                    for (var i =  0; i < task.inputQueue.length; i++) {\n                        // Destination node may not be in tasks if this is an inverted graph\n                        if (taskIdMap[task.inputQueue[i]] !== undefined) {\n                            if (task.status === 
\"DONE\") {\n                                var durations = getDurations(tasks, task.inputQueue);\n                                var duration = durations[task.inputQueue[i]];\n                                var oneDayInMilliseconds = 24 * 60 * 60 * 1000;\n                                var durationLabel;\n                                if (duration.getTime() < oneDayInMilliseconds) {\n                                    // Label task duration in stripped ISO format (hh:mm:ss.f)\n                                    durationLabel = duration.toISOString().substr(11, 12);\n                                } else {\n                                    durationLabel = \"> 24h\";\n                                }\n                                g.setEdge(task.inputQueue[i], task.taskId, {\n                                    label: durationLabel,\n                                    width: 40\n                                });\n                            } else {\n                                g.setEdge(task.inputQueue[i], task.taskId, {\n                                    width: 40\n                                });\n                            }\n                        }\n                    }\n\n                }\n            }\n            var styleTooltip = function(name, description) {\n                return \"<p class='name'>\" + name + \"</p><p class='description'>\" + description + \"</p>\";\n            };\n            inner.call(render, g);\n            if(paint){\n                // Zoom and scale to fit\n                var zoomScale = zoom.scale();\n                var graphWidth = g.graph().width + 80;\n                var graphHeight = g.graph().height + 40;\n                var width = parseInt(svg.style(\"width\").replace(/px/, \"\"));\n                var height = parseInt(svg.style(\"height\").replace(/px/, \"\"));\n                zoomScale = Math.min(width / graphWidth, height / graphHeight);\n                var translate = 
[(width/2) - ((graphWidth*zoomScale)/2), (height/2) - ((graphHeight*zoomScale)/2)];\n                zoom.translate(translate);\n                zoom.scale(zoomScale);\n                zoom.event(isUpdate ? svg.transition().duration(3000) : d3.select(\"#mysvg\"));\n            }\n\n            inner.selectAll(\"g.node\")\n                .attr(\"title\", function(v) { return styleTooltip(v, getParam(tasks, v)); })\n                .each(function(v) { $(this).tipsy({ gravity: \"w\", opacity: 1, html: true }); });\n        }\n        draw();\n    }\n\n    /*\n       DataTables functions\n     */\n    // Remove tasks of a given category and add new ones.\n    function updateTaskCategory(dt, category, tasks) {\n        var taskMap = {};\n\n        var mostImportantCategory = function (cat1, cat2) {\n            var priorities = [\n                'RUNNING',\n                'BATCH_RUNNING',\n                'DONE',\n                'PENDING',\n                'UPSTREAM_DISABLED',\n                'UPSTREAM_FAILED',\n                'DISABLED',\n                'FAILED'\n            ];\n            // NOTE : -1 indicates not in list\n            var i1 = priorities.indexOf(cat1);\n            var i2 = priorities.indexOf(cat2);\n            var ret;\n            if (i1 > i2) {\n                ret = cat1;\n            }\n            else {\n                ret = cat2;\n            }\n            return ret;\n        };\n\n        dt.rows(function (i, data) {\n            taskMap[data.taskId] = data.category;\n            return data.category === category;\n        }).remove();\n\n        var taskCount;\n        /* Check for integers in tasks.  This indicates max-shown-tasks was exceeded */\n        if (tasks.length === 1 && typeof(tasks[0]) === 'number') {\n            taskCount = tasks[0] === -1 ? 
'unknown' : tasks[0];\n            missingCategories[category] = {name: category, count: taskCount};\n        }\n        else {\n            var displayTasks = tasks.map(taskToDisplayTask);\n            displayTasks = displayTasks.filter(function (obj) {\n                if (obj === null) {\n                    return false;\n                }\n                if (category === mostImportantCategory(category, taskMap[obj.taskId])) {\n                    obj.category = category;\n                    return true;\n                }\n                return false;\n            });\n            dt.rows.add(displayTasks);\n            taskCount = displayTasks.length;\n            delete missingCategories[category];\n        }\n\n        $('#'+category+'_info').find('.info-box-number').html(taskCount);\n        $('#'+category+'_info i.fa').removeClass().addClass('fa fa-'+taskIcons[category]);\n    }\n\n    function updateCurrentFilter() {\n        var content;\n        currentFilter.tableFilter = dt.search();\n\n        if ((currentFilter.tableFilter === \"\") &&\n            ($.isEmptyObject(currentFilter.taskCategory)) &&\n            (currentFilter.taskFamily === \"\")) {\n\n            content = '';\n        }\n        else {\n            if (currentFilter.taskCategory !== \"\") {\n                currentFilter.catNames = $.map(currentFilter.taskCategory, function (x) {\n                    return {name: x};\n                });\n            }\n\n            content = renderTemplate('currentFilterTemplate', currentFilter);\n        }\n\n        $('#currentFilter').html(content);\n    }\n\n    function initVisualisation(newVisType) {\n\n        // Prepare graphPlaceholder for D3 code\n        if (newVisType == 'd3') {\n            $('#graphPlaceholder').empty();\n            $('#graphPlaceholder').html('<div class=\"live map\"><svg width=\"100%\" height=\"100%\" id=\"mysvg\"><g/></svg></div>');\n        }\n        else {\n            $('#graphPlaceholder').empty();\n    
        var graph = new Graph.DependencyGraph($(\"#graphPlaceholder\")[0]);\n            $(\"#graphPlaceholder\")[0].graph = graph;\n        }\n    }\n\n    function updateTasks() {\n        $('.status-info .info-box-number').text('?');\n        $('.status-info i.fa').removeClass().addClass('fa fa-spinner fa-pulse');\n\n        var ajax1 = luigi.getRunningTaskList(function(runningTasks) {\n            updateTaskCategory(dt, 'RUNNING', runningTasks);\n        });\n\n        var ajax2 = luigi.getBatchRunningTaskList(function(batchRunningTasks) {\n            updateTaskCategory(dt, 'BATCH_RUNNING', batchRunningTasks);\n        });\n\n        var ajax3 = luigi.getFailedTaskList(function(failedTasks) {\n            updateTaskCategory(dt, 'FAILED', failedTasks);\n        });\n\n        var ajax4 = luigi.getUpstreamFailedTaskList(function(upstreamFailedTasks) {\n            updateTaskCategory(dt, 'UPSTREAM_FAILED', upstreamFailedTasks);\n        });\n\n        var ajax5 = luigi.getDisabledTaskList(function(disabledTasks) {\n            updateTaskCategory(dt, 'DISABLED', disabledTasks);\n        });\n\n        var ajax6 = luigi.getUpstreamDisabledTaskList(function(upstreamDisabledTasks) {\n            updateTaskCategory(dt, 'UPSTREAM_DISABLED', upstreamDisabledTasks);\n        });\n\n        var ajax7 = luigi.getPendingTaskList(function(pendingTasks) {\n            updateTaskCategory(dt, 'PENDING', pendingTasks);\n        });\n\n        var ajax8 = luigi.getDoneTaskList(function(doneTasks) {\n            updateTaskCategory(dt, 'DONE', doneTasks);\n        });\n\n        $.when(ajax1, ajax2, ajax3, ajax4, ajax5, ajax6, ajax7, ajax8).done(function () {\n            dt.draw();\n\n            $('.sidebar').html(renderSidebar(dt.column(1).data()));\n            var selectedFamily = $('.sidebar-menu').find('li[data-task=\"' + currentFilter.taskFamily + '\"]')[0];\n            selectSidebarItem(selectedFamily);\n\n            if (selectedFamily) {\n                var selectedUl 
= $(selectedFamily).parent();\n                selectedUl.show();\n                selectedUl.prev().addClass('expanded');\n            } else {\n                var others = $('.sidebar-folder:contains(Others)')\n                others.addClass('expanded')\n                others.next().show()\n            }\n\n            $('.sidebar-menu').on('click', 'li:not(.sidebar-folder)', function (e) {\n                e.stopPropagation();\n                if (this.dataset.task) {\n                    selectSidebarItem(this);\n                    if ($(this).hasClass('active')) {\n                        filterByTaskFamily(this.dataset.task, dt);\n                    }\n                    else {\n                        filterByTaskFamily(\"\", dt);\n                    }\n                }\n            });\n\n            $('.sidebar-menu').on('click', '.sidebar-folder', function () {\n                const ul = this.nextElementSibling;\n                $(ul).slideToggle()\n                this.classList.toggle('expanded')\n            })\n\n            $('#clear-task-filter').on('click', function () {\n                filterByTaskFamily(\"\", dt);\n            });\n\n            if ($.isEmptyObject(missingCategories)) {\n                $('#warnings').html('');\n            }\n            else {\n                $('#warnings').html(renderWarnings());\n            }\n\n            processHashChange();\n        });\n    }\n\n    function updateSidebar(tabName) {\n        if (tabName === 'taskList') {\n            $('body').removeClass('sidebar-collapse');\n        }\n        else {\n            $('body').addClass('sidebar-collapse');\n        }\n    }\n\n    // Error strings may or may not be JSON encoded, depending on client version\n    // Decoding an unencoded string may raise an exception.\n    function decodeError(error) {\n        var decoded;\n        try {\n            decoded = JSON.parse(error);\n        }\n        catch (e) {\n            decoded = error;\n     
   }\n        return decoded;\n    }\n\n    /**\n     * Return HTML of a task parameter dictionary\n     * @param params: task parameter dictionary\n     */\n    function renderParams(params) {\n        var htmls = [];\n        for (var key in params) {\n            htmls.push('<span class=\"param-name\">' + escapeHtml(key) +\n                '</span>=<span class=\"param-value\">' + escapeHtml(params[key]) + '</span>');\n        }\n        return htmls.join(', ');\n    }\n\n    /**\n     * Updates the number of worker processes of a worker\n     * @param worker: the id of the worker\n     * @param n: the number of processes to set\n     */\n    function updateWorkerProcesses(worker, n) {\n        n = Math.max(1, n);\n\n        // the spinner is just for visual feedback\n        var $label = $('#workerList').find('#label-n-workers[data-worker=\"' + worker + '\"]');\n        $label.html('<i class=\"fa fa-spinner fa-spin\" aria-hidden=\"true\"></i>');\n\n        luigi.setWorkerProcesses(worker, n, function() {\n            $label.text(n);\n        });\n    }\n\n    /**\n     * Updates the number of units of a given resource available in the scheduler\n     * @param resource: the name of the resource\n     * @param n: the number of units to set the resource limit to\n     */\n    function updateResourceCount(resource, n) {\n        var progressBar = $('#' + resource + '-resource-box .progress-bar');\n        var used = /(\\S+)\\//.exec(progressBar.text())[1];\n        nVal = parseInt(n);\n        if (isNaN(nVal) || nVal < 0) {\n            return;\n        }\n        usedVal = parseInt(used);\n        width = Math.floor(100 * usedVal / nVal);\n        if (width < 0) {\n            width = 0;\n        }\n        if (width > 100) {\n            width = 100;\n        }\n        luigi.updateResource(resource, n, function() {\n            progressBar.text(usedVal + '/' + nVal);\n            progressBar.attr('style', 'width: ' + width + '%');\n        });\n    }\n\n    /**\n 
    * Returns the current units of a resource used\n     * @param resource: the name of the resource\n     */\n    function currentResourceCount(resource) {\n        var progressBar = $('#' + resource + '-resource-box .progress-bar');\n        var count = /\\/(\\S+)/.exec(progressBar.text())[1];\n        return parseInt(count);\n    }\n\n    function changeState(key, value) {\n        var fragmentQuery = URI.parseQuery(location.hash.replace('#', ''));\n        if (value) {\n            fragmentQuery[key] = value;\n        } else {\n            delete fragmentQuery[key];\n        }\n        location.hash = '#' + URI.buildQuery(fragmentQuery);\n    }\n\n   function expandedResources() {\n        return $('.resource-box.in').toArray().map(function (val) { return val.dataset.resource; });\n    }\n\n    function expandResources(resources) {\n        if (resources === undefined) {\n            resources = [];\n        } else {\n            resources = JSON.parse(resources);\n        }\n        $('.resource-box').each(function (i, item) {\n            if (resources.indexOf(item.dataset.resource) === -1) {\n                $(item).collapse('hide');\n            } else {\n                $(item).collapse('show');\n            }\n        });\n    }\n\n    /**\n     * Create the pause/unpause toggle\n     */\n    function createPauseToggle(checked) {\n        var check = checked ? 
\" checked\" : \"\";\n        var html = $('<input id=\"pause\" type=\"checkbox\"' + check + ' data-toggle=\"toggle\">');\n        $('#pause-form').append(html);\n        $('#pause').bootstrapToggle({\n            on: 'Running',\n            off: 'Paused',\n            onstyle: 'success',\n            offstyle: 'danger'\n        });\n        $('#pause').change(function() {\n            if (this.checked) {\n                luigi.unpause();\n            } else {\n                luigi.pause();\n            }\n        })\n    }\n\n    $(document).ready(function() {\n        loadTemplates();\n\n        luigi.hasTaskHistory(function(hasTaskHistory) {\n            if (hasTaskHistory) {\n                $('#topNavbar').append(renderTemplate('topNavbarItem', {\n                    label: \"History\",\n                    href: \"../../history\",\n                }).children()[0]);\n            }\n        });\n\n        luigi.isPauseEnabled(function(enabled) {\n            if (enabled) {\n                luigi.isPaused(createPauseToggle);\n            }\n        });\n\n        luigi.getWorkerList(function(workers) {\n            $(\"#workerList\").append(renderWorkers(workers));\n\n            $('.worker-table tbody').on('click', 'td .statusMessage', function() {\n                var data = $(this).data();\n                showStatusMessage(data);\n            });\n\n            $('.worker-table tbody').on('click', 'td .schedulerMessage', function() {\n                var data = $(this).data();\n                showSchedulerMessageModal(data);\n            });\n        });\n\n        luigi.getResourceList(function(resources) {\n            $(\"#resourceList\").append(renderResources(resources));\n            expandResources(URI.parseQuery(location.hash.replace('#', '')).resources);\n            $('.resources-collapse').click(function (e) {\n                e.preventDefault();\n                var collapse_block = $(this.dataset.target);\n                if 
(collapse_block.hasClass('collapsing')) {\n                    return;\n                }\n                var resource = collapse_block.attr('data-resource');\n                var resourceList = expandedResources();\n                var resourceIdx = resourceList.indexOf(resource);\n                if (resourceIdx === -1) {\n                    resourceList.push(resource);\n                } else {\n                    resourceList.splice(resourceIdx, 1);\n                }\n                changeState('resources', resourceList.length > 0 ? JSON.stringify(resourceList) : null);\n                collapse_block.collapse('toggle');\n            });\n        });\n\n        dt = $('#taskTable').DataTable({\n            stateSave: true,\n            stateSaveCallback: function(settings, data) {\n                // Save data table state to browser's hash.\n                var state = URI.parseQuery(location.hash.replace('#', ''));\n\n                if (data.search.search) {\n                    state.search__search = data.search.search;\n                } else {\n                    delete state.search__search;\n                }\n\n                var family_search = data.columns[1].search.search;\n                if (family_search) {\n                    state.family = family_search.substring(1, family_search.length - 1);\n                } else {\n                    delete state.family;\n                }\n\n                if (currentFilter.taskCategory.length > 0) {\n                    state.statuses = JSON.stringify(currentFilter.taskCategory);\n                } else {\n                    delete state.statuses;\n                }\n\n                if (data.order && data.order.length) {\n                    state.order = '' + data.order[0][0] + ',' + data.order[0][1];\n                }\n\n                if (data.length && data.length !== 10) {\n                    // Keep in hash only if length is not default.\n                    state.length = 
data.length;\n                } else {\n                    delete state.length;\n                }\n\n                if (state.filterOnServer) {\n                    state.filterOnServer = '1';\n                }\n                location.hash = '#' + URI.buildQuery(state);\n            },\n            stateLoadCallback: function(settings) {\n                // Restore datatable state from browser's hash.\n                var fragmentQuery = URI.parseQuery(location.hash.replace('#', ''));\n\n                var order = [];\n                if (fragmentQuery.order) {\n                    order = [fragmentQuery.order.split(',')];\n                }\n\n                var family_search = {};\n                if (fragmentQuery.family) {\n                    family_search = {'search': '^' + fragmentQuery.family + '$', 'regex': true};\n                }\n\n                var status_search = {};\n                if (fragmentQuery.statuses) {\n                    var statuses = JSON.parse(fragmentQuery.statuses);\n                    currentFilter.taskCategory = statuses;\n                    status_search = {'search': categoryQuery(statuses), 'regex': true};\n                }\n\n                // Prepare state for datatable.\n                var o = {\n                    order: order,                 // Table rows order.\n                    length: fragmentQuery.length, // Entries on page.\n                    start: 0,                     // Pagination initial page.\n                    time: new Date().getTime(),   // Current time to help datatable.js to handle asynchronous.\n                    columns: [\n                        {visible: true, search: status_search},\n                        {visible: true, search: family_search},  // Name column\n                        {visible: true, search: {}},  // Details column\n                        {visible: true, search: {}},  // Priority column\n                        {visible: true, search: {}},  // Time 
column\n                        {visible: true, search: {}}   // Actions column\n                    ],\n                    // Search input state.\n                    search: {\n                        caseInsensitive: true,\n                        search: fragmentQuery.search__search\n                    }\n                };\n\n                return o;\n            },\n            dom: 'l<\"#serverSide\">frtip',\n            language: {\n                search: 'Filter table:'\n            },\n            columns: [\n                {\n                    data: 'category',\n                    render: function (data, type, row) {\n                        return taskCategoryIcon(data) + ' ' + data;\n                    }\n                },\n                {data: 'taskName'},\n                {\n                    data: 'taskParams',\n                    render: function(data, type, row) {\n                        var params = JSON.parse(data);\n                        if (row.resources !== '{}') {\n                            return '<div>' + renderParams(params) + '</div><div>' + row.resources + '</div>';\n                        } else {\n                            return '<div>' + renderParams(params) + '</div>';\n                        }\n                    }\n                },\n                {data: 'priority', width: \"2em\"},\n                {data: 'displayTime'},\n                {\n                    className: 'details-control',\n                    orderable: false,\n                    data: null,\n                    render: function (data, type, row) {\n                        return Mustache.render(templates.actionsTemplate, row);\n                    }\n                }\n            ]\n        });\n\n        dt.on('draw', updateCurrentFilter);\n\n        $('#serverSide').html('<form class=\"form-inline\"><label class=\"btn btn-default\" for=\"serverSideCheckbox\">' +\n                      'Filter on Server <input type=\"checkbox\" 
id=\"serverSideCheckbox\"/>' +\n                      '</label></form>');\n\n        // If using server-side filter we need to updateTasks every time the filter changes\n\n        $('#taskTable_filter').on('keyup paste', 'input', function () {\n            if ($('#serverSideCheckbox')[0].checked) {\n                clearTimeout(typingTimer);\n                if ($(this).val) {\n                    typingTimer = setTimeout(updateTasks, 400);\n                }\n            }\n        });\n\n        processHashChange();\n        updateTasks();\n        bindListEvents();\n\n        $('#taskTable tbody').on('click', 'td.details-control .showError', function () {\n            var tr = $(this).closest('tr');\n            var row = dt.row( tr );\n            var data = row.data();\n            luigi.getErrorTrace(data.taskId, function(error) {\n                showErrorTrace(error);\n            });\n        } );\n\n        $('#taskTable tbody').on('click', 'td.details-control .forgiveFailures', function (ev) {\n            var that = $(this);\n            var tr = that.closest('tr');\n            var row = dt.row( tr );\n            var data = row.data();\n            luigi.forgiveFailures(data.taskId, function(data) {\n                if (ev.altKey) {\n                    updateTasks(); // update may not be cheap\n                } else {\n                    that.tooltip('hide');\n                    that.remove();\n                }\n            });\n        } );\n\n        $('#taskTable tbody').on('click', 'td.details-control .markAsDone', function (ev) {\n            var that = $(this);\n            var tr = that.closest('tr');\n            var row = dt.row( tr );\n            var data = row.data();\n            luigi.markAsDone(data.taskId, function(data) {\n                if (ev.altKey) {\n                    updateTasks(); // update may not be cheap\n                } else {\n                    that.tooltip('hide');\n                    that.remove();\n         
       }\n            });\n        } );\n\n        $('#taskTable tbody').on('click', 'td.details-control .re-enable-button', function (ev) {\n            var that = $(this);\n            luigi.reEnable(that.attr(\"data-task-id\"), function(data) {\n                if (ev.altKey) {\n                    updateTasks(); // update may not be cheap\n                } else {\n                    that.tooltip('hide');\n                    that.remove();\n                }\n            });\n        });\n\n        $('#taskTable tbody').on('click', 'td.details-control .statusMessage', function () {\n            var data = $(this).data();\n            showStatusMessage(data);\n        });\n\n        $('#taskTable tbody').on('click', 'td.details-control .schedulerMessage', function () {\n            var data = $(this).data();\n            showSchedulerMessageModal(data);\n        });\n\n        $('.navbar-nav').on('click', 'a', function () {\n            var tabName = $(this).data('tab');\n            updateSidebar(tabName);\n        });\n\n        $('#workerList').on('show.bs.modal', '#disableWorkerModal', function (event) {\n            var triggerButton = $(event.relatedTarget);\n            $('#disableWorkerButton').data('trigger', triggerButton);\n        });\n\n        $('#workerList').on('click', '#disableWorkerButton', function() {\n            var triggerButton = $(this).data('trigger');\n            var worker = triggerButton.data('worker');\n\n            luigi.disableWorker(worker);\n\n            // show the worker as disabled in the visualiser\n            var box = triggerButton.parents('.box').addClass('box-solid box-default');\n\n            // remove the worker tools\n            box.find('.box-tools').remove();\n        });\n\n        $('#workerList').on('click', '#btn-increment-workers', function($event) {\n            var worker = $(this).data(\"worker\");\n            var $label = $('#workerList').find('#label-n-workers[data-worker=\"' + worker + '\"]');\n 
           var n = parseInt($label.text());\n            if (!isNaN(n)) {\n                updateWorkerProcesses(worker, n + 1);\n            }\n            $event.preventDefault();\n        });\n\n        $('#workerList').on('click', '#btn-decrement-workers', function($event) {\n            var worker = $(this).data(\"worker\");\n            var $label = $('#workerList').find('#label-n-workers[data-worker=\"' + worker + '\"]');\n            var n = parseInt($label.text());\n            if (!isNaN(n)) {\n                updateWorkerProcesses(worker, n - 1);\n            }\n            $event.preventDefault();\n        });\n\n        $('#workerList').on('show.bs.modal', '#setWorkersModal', function($event) {\n            $('#setWorkersButton').data('worker', $($event.relatedTarget).data('worker'));\n            var $input = $(this).find('#setWorkersInput').on('keypress', function($event) {\n                if (event.keyCode == 13) {\n                    $('#workerList').find('#setWorkersButton').trigger('click');\n                }\n                $event.stopPropagation();\n            });\n            setTimeout(function() {\n                $input.focus();\n            }.bind(this), 600);\n        });\n\n        $('#workerList').on('hidden.bs.modal', '#setWorkersModal', function() {\n            $(this).find('#setWorkersInput').off('keypress').val('');\n        });\n\n        $('#workerList').on('click', '#setWorkersButton', function($event) {\n            var worker = $(this).data('worker');\n            var n = parseInt($(\"#setWorkersInput\").val());\n            if (!isNaN(n)) {\n                updateWorkerProcesses(worker, n);\n            }\n            $event.preventDefault();\n        });\n\n        $('#resourceList').on('click', '.btn-increment-resources', function($event) {\n            $event.preventDefault();\n            var resource = $(this).data('resource');\n            var count = currentResourceCount(resource);\n            
updateResourceCount(resource, count + 1);\n        });\n\n        $('#resourceList').on('click', '.btn-decrement-resources', function($event) {\n            $event.preventDefault();\n            var resource = $(this).data('resource');\n            var count = currentResourceCount(resource);\n            updateResourceCount(resource, count - 1);\n        });\n\n        $('#resourceList').on('show.bs.modal', '#setResourcesModal', function($event) {\n            $('#setResourcesButton').data('resource', $($event.relatedTarget).data('resource'));\n            var $input = $(this).find('#setResourcesInput').on('keypress', function($event) {\n                if (event.keyCode == 13) {\n                    $('#resourceList').find('#setResourcesButton').trigger('click');\n                }\n                $event.stopPropagation();\n            });\n            setTimeout(function() {\n                $input.focus();\n            }.bind(this), 600);\n        });\n\n        $('#resourceList').on('hidden.bs.modal', '#setResourcesModal', function() {\n            $(this).find('#setResourcesInput').off('keypress').val('');\n        });\n\n        $('#resourceList').on('click', '#setResourcesButton', function($event) {\n            var resource = $(this).data('resource');\n            var n = parseInt($(\"#setResourcesInput\").val());\n            updateResourceCount(resource, n);\n            $event.preventDefault();\n        });\n        $('.js-nav-link').click(function(e) {\n            // User followed tab from navigation link. 
Copy state from fields to hash.\n            e.preventDefault();\n            var state = {};\n            var tabId = $(this).attr('data-tab');\n\n            if (tabId == 'taskList') {\n                var order = dt.order();\n                var search = dt.search();\n                state.tab = 'tasks';\n\n                if ($('select[name=taskTable_length]').val() !== '10') {\n                    // Add length to hash only if the value is not default.\n                    state.length = $('select[name=taskTable_length]').val();\n                }\n\n                if ($('#serverSideCheckbox').is(':checked')) {\n                    state.filterOnServer = '1';\n                }\n\n                var family = $('#familySidebar li.active').attr('data-task');\n                if (family) {\n                    state.family = family;\n                } else {\n                    delete state.family;\n                }\n\n                if (currentFilter.taskCategory.length > 0) {\n                    state.statuses = JSON.stringify(currentFilter.taskCategory);\n                } else {\n                    delete state.statuses;\n                }\n\n                if (search) {\n                    state.search__search = search;\n                }\n\n                if (order.length > 0) {\n                    state.order = '' + order[0][0] + ',' + order[0][1];\n                }\n\n            } else if (tabId == 'dependencyGraph') {\n                state.tab = 'graph';\n\n                // Get state from fields.\n\n                if ($('#hideDoneCheckbox').is(':checked')) {\n                    state.hideDone = '1';\n                }\n\n                if ($('#idTaskForm input.search-query').val()) {\n                    state.taskId = $('#idTaskForm input.search-query').val();\n                }\n\n                if ($('#invertCheckbox').is(':checked')) {\n                    state.invert = '1';\n                }\n\n                state.visType = 
$('input[name=vis-type]:checked').val();\n            } else if (tabId == 'workerList') {\n                state.tab = 'workers';\n            } else if (tabId == 'resourceList') {\n                state.resources = JSON.stringify(expandedResources());\n                state.tab = 'resources';\n            }\n\n            location.hash = '#' + URI.buildQuery(state);\n        });\n\n        processHashChange();\n    });\n}\n"
  },
  {
    "path": "luigi/static/visualiser/lib/URI/1.18.2/URI.js",
    "content": "/*!\n * URI.js - Mutating URLs\n *\n * Version: 1.18.2\n *\n * Author: Rodney Rehm\n * Web: http://medialize.github.io/URI.js/\n *\n * Licensed under\n *   MIT License http://www.opensource.org/licenses/mit-license\n *\n */\n(function (root, factory) {\n  'use strict';\n  // https://github.com/umdjs/umd/blob/master/returnExports.js\n  if (typeof exports === 'object') {\n    // Node\n    module.exports = factory(require('./punycode'), require('./IPv6'), require('./SecondLevelDomains'));\n  } else if (typeof define === 'function' && define.amd) {\n    // AMD. Register as an anonymous module.\n    define(['./punycode', './IPv6', './SecondLevelDomains'], factory);\n  } else {\n    // Browser globals (root is window)\n    root.URI = factory(root.punycode, root.IPv6, root.SecondLevelDomains, root);\n  }\n}(this, function (punycode, IPv6, SLD, root) {\n  'use strict';\n  /*global location, escape, unescape */\n  // FIXME: v2.0.0 renamce non-camelCase properties to uppercase\n  /*jshint camelcase: false */\n\n  // save current URI variable, if any\n  var _URI = root && root.URI;\n\n  function URI(url, base) {\n    var _urlSupplied = arguments.length >= 1;\n    var _baseSupplied = arguments.length >= 2;\n\n    // Allow instantiation without the 'new' keyword\n    if (!(this instanceof URI)) {\n      if (_urlSupplied) {\n        if (_baseSupplied) {\n          return new URI(url, base);\n        }\n\n        return new URI(url);\n      }\n\n      return new URI();\n    }\n\n    if (url === undefined) {\n      if (_urlSupplied) {\n        throw new TypeError('undefined is not a valid argument for URI');\n      }\n\n      if (typeof location !== 'undefined') {\n        url = location.href + '';\n      } else {\n        url = '';\n      }\n    }\n\n    this.href(url);\n\n    // resolve to base according to http://dvcs.w3.org/hg/url/raw-file/tip/Overview.html#constructor\n    if (base !== undefined) {\n      return this.absoluteTo(base);\n    }\n\n    return 
this;\n  }\n\n  URI.version = '1.18.2';\n\n  var p = URI.prototype;\n  var hasOwn = Object.prototype.hasOwnProperty;\n\n  function escapeRegEx(string) {\n    // https://github.com/medialize/URI.js/commit/85ac21783c11f8ccab06106dba9735a31a86924d#commitcomment-821963\n    return string.replace(/([.*+?^=!:${}()|[\\]\\/\\\\])/g, '\\\\$1');\n  }\n\n  function getType(value) {\n    // IE8 doesn't return [Object Undefined] but [Object Object] for undefined value\n    if (value === undefined) {\n      return 'Undefined';\n    }\n\n    return String(Object.prototype.toString.call(value)).slice(8, -1);\n  }\n\n  function isArray(obj) {\n    return getType(obj) === 'Array';\n  }\n\n  function filterArrayValues(data, value) {\n    var lookup = {};\n    var i, length;\n\n    if (getType(value) === 'RegExp') {\n      lookup = null;\n    } else if (isArray(value)) {\n      for (i = 0, length = value.length; i < length; i++) {\n        lookup[value[i]] = true;\n      }\n    } else {\n      lookup[value] = true;\n    }\n\n    for (i = 0, length = data.length; i < length; i++) {\n      /*jshint laxbreak: true */\n      var _match = lookup && lookup[data[i]] !== undefined\n        || !lookup && value.test(data[i]);\n      /*jshint laxbreak: false */\n      if (_match) {\n        data.splice(i, 1);\n        length--;\n        i--;\n      }\n    }\n\n    return data;\n  }\n\n  function arrayContains(list, value) {\n    var i, length;\n\n    // value may be string, number, array, regexp\n    if (isArray(value)) {\n      // Note: this can be optimized to O(n) (instead of current O(m * n))\n      for (i = 0, length = value.length; i < length; i++) {\n        if (!arrayContains(list, value[i])) {\n          return false;\n        }\n      }\n\n      return true;\n    }\n\n    var _type = getType(value);\n    for (i = 0, length = list.length; i < length; i++) {\n      if (_type === 'RegExp') {\n        if (typeof list[i] === 'string' && list[i].match(value)) {\n          return true;\n      
  }\n      } else if (list[i] === value) {\n        return true;\n      }\n    }\n\n    return false;\n  }\n\n  function arraysEqual(one, two) {\n    if (!isArray(one) || !isArray(two)) {\n      return false;\n    }\n\n    // arrays can't be equal if they have different amount of content\n    if (one.length !== two.length) {\n      return false;\n    }\n\n    one.sort();\n    two.sort();\n\n    for (var i = 0, l = one.length; i < l; i++) {\n      if (one[i] !== two[i]) {\n        return false;\n      }\n    }\n\n    return true;\n  }\n\n  function trimSlashes(text) {\n    var trim_expression = /^\\/+|\\/+$/g;\n    return text.replace(trim_expression, '');\n  }\n\n  URI._parts = function() {\n    return {\n      protocol: null,\n      username: null,\n      password: null,\n      hostname: null,\n      urn: null,\n      port: null,\n      path: null,\n      query: null,\n      fragment: null,\n      // state\n      duplicateQueryParameters: URI.duplicateQueryParameters,\n      escapeQuerySpace: URI.escapeQuerySpace\n    };\n  };\n  // state: allow duplicate query parameters (a=1&a=1)\n  URI.duplicateQueryParameters = false;\n  // state: replaces + with %20 (space in query strings)\n  URI.escapeQuerySpace = true;\n  // static properties\n  URI.protocol_expression = /^[a-z][a-z0-9.+-]*$/i;\n  URI.idn_expression = /[^a-z0-9\\.-]/i;\n  URI.punycode_expression = /(xn--)/i;\n  // well, 333.444.555.666 matches, but it sure ain't no IPv4 - do we care?\n  URI.ip4_expression = /^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}$/;\n  // credits to Rich Brown\n  // source: http://forums.intermapper.com/viewtopic.php?p=1096#1096\n  // specification: http://www.ietf.org/rfc/rfc4291.txt\n  URI.ip6_expression = 
/^\\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)){3}))|:)))(%.+)?\\s*$/;\n  // expression used is \"gruber revised\" (@gruber v2) determined to be the\n  // best solution in a regex-golf we did a couple of ages ago at\n  // * http://mathiasbynens.be/demo/url-regex\n  // * http://rodneyrehm.de/t/url-regex.html\n  URI.find_uri_expression = /\\b((?:[a-z][\\w-]+:(?:\\/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}\\/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:'\".,<>?«»“”‘’]))/ig;\n  URI.findUri = {\n    // valid \"scheme://\" or \"www.\"\n    start: /\\b(?:([a-z][a-z0-9.+-]*:\\/\\/)|www\\.)/gi,\n    // everything up to the next whitespace\n    end: /[\\s\\r\\n]|$/,\n    // trim trailing punctuation captured by end RegExp\n    trim: /[`!()\\[\\]{};:'\".,<>?«»“”„‘’]+$/\n  };\n  // 
http://www.iana.org/assignments/uri-schemes.html\n  // http://en.wikipedia.org/wiki/List_of_TCP_and_UDP_port_numbers#Well-known_ports\n  URI.defaultPorts = {\n    http: '80',\n    https: '443',\n    ftp: '21',\n    gopher: '70',\n    ws: '80',\n    wss: '443'\n  };\n  // allowed hostname characters according to RFC 3986\n  // ALPHA DIGIT \"-\" \".\" \"_\" \"~\" \"!\" \"$\" \"&\" \"'\" \"(\" \")\" \"*\" \"+\" \",\" \";\" \"=\" %encoded\n  // I've never seen a (non-IDN) hostname other than: ALPHA DIGIT . -\n  URI.invalid_hostname_characters = /[^a-zA-Z0-9\\.-]/;\n  // map DOM Elements to their URI attribute\n  URI.domAttributes = {\n    'a': 'href',\n    'blockquote': 'cite',\n    'link': 'href',\n    'base': 'href',\n    'script': 'src',\n    'form': 'action',\n    'img': 'src',\n    'area': 'href',\n    'iframe': 'src',\n    'embed': 'src',\n    'source': 'src',\n    'track': 'src',\n    'input': 'src', // but only if type=\"image\"\n    'audio': 'src',\n    'video': 'src'\n  };\n  URI.getDomAttribute = function(node) {\n    if (!node || !node.nodeName) {\n      return undefined;\n    }\n\n    var nodeName = node.nodeName.toLowerCase();\n    // <input> should only expose src for type=\"image\"\n    if (nodeName === 'input' && node.type !== 'image') {\n      return undefined;\n    }\n\n    return URI.domAttributes[nodeName];\n  };\n\n  function escapeForDumbFirefox36(value) {\n    // https://github.com/medialize/URI.js/issues/91\n    return escape(value);\n  }\n\n  // encoding / decoding according to RFC3986\n  function strictEncodeURIComponent(string) {\n    // see https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Global_Objects/encodeURIComponent\n    return encodeURIComponent(string)\n      .replace(/[!'()*]/g, escapeForDumbFirefox36)\n      .replace(/\\*/g, '%2A');\n  }\n  URI.encode = strictEncodeURIComponent;\n  URI.decode = decodeURIComponent;\n  URI.iso8859 = function() {\n    URI.encode = escape;\n    URI.decode = unescape;\n  };\n  URI.unicode 
= function() {\n    URI.encode = strictEncodeURIComponent;\n    URI.decode = decodeURIComponent;\n  };\n  URI.characters = {\n    pathname: {\n      encode: {\n        // RFC3986 2.1: For consistency, URI producers and normalizers should\n        // use uppercase hexadecimal digits for all percent-encodings.\n        expression: /%(24|26|2B|2C|3B|3D|3A|40)/ig,\n        map: {\n          // -._~!'()*\n          '%24': '$',\n          '%26': '&',\n          '%2B': '+',\n          '%2C': ',',\n          '%3B': ';',\n          '%3D': '=',\n          '%3A': ':',\n          '%40': '@'\n        }\n      },\n      decode: {\n        expression: /[\\/\\?#]/g,\n        map: {\n          '/': '%2F',\n          '?': '%3F',\n          '#': '%23'\n        }\n      }\n    },\n    reserved: {\n      encode: {\n        // RFC3986 2.1: For consistency, URI producers and normalizers should\n        // use uppercase hexadecimal digits for all percent-encodings.\n        expression: /%(21|23|24|26|27|28|29|2A|2B|2C|2F|3A|3B|3D|3F|40|5B|5D)/ig,\n        map: {\n          // gen-delims\n          '%3A': ':',\n          '%2F': '/',\n          '%3F': '?',\n          '%23': '#',\n          '%5B': '[',\n          '%5D': ']',\n          '%40': '@',\n          // sub-delims\n          '%21': '!',\n          '%24': '$',\n          '%26': '&',\n          '%27': '\\'',\n          '%28': '(',\n          '%29': ')',\n          '%2A': '*',\n          '%2B': '+',\n          '%2C': ',',\n          '%3B': ';',\n          '%3D': '='\n        }\n      }\n    },\n    urnpath: {\n      // The characters under `encode` are the characters called out by RFC 2141 as being acceptable\n      // for usage in a URN. RFC2141 also calls out \"-\", \".\", and \"_\" as acceptable characters, but\n      // these aren't encoded by encodeURIComponent, so we don't have to call them out here. 
Also\n      // note that the colon character is not featured in the encoding map; this is because URI.js\n      // gives the colons in URNs semantic meaning as the delimiters of path segements, and so it\n      // should not appear unencoded in a segment itself.\n      // See also the note above about RFC3986 and capitalalized hex digits.\n      encode: {\n        expression: /%(21|24|27|28|29|2A|2B|2C|3B|3D|40)/ig,\n        map: {\n          '%21': '!',\n          '%24': '$',\n          '%27': '\\'',\n          '%28': '(',\n          '%29': ')',\n          '%2A': '*',\n          '%2B': '+',\n          '%2C': ',',\n          '%3B': ';',\n          '%3D': '=',\n          '%40': '@'\n        }\n      },\n      // These characters are the characters called out by RFC2141 as \"reserved\" characters that\n      // should never appear in a URN, plus the colon character (see note above).\n      decode: {\n        expression: /[\\/\\?#:]/g,\n        map: {\n          '/': '%2F',\n          '?': '%3F',\n          '#': '%23',\n          ':': '%3A'\n        }\n      }\n    }\n  };\n  URI.encodeQuery = function(string, escapeQuerySpace) {\n    var escaped = URI.encode(string + '');\n    if (escapeQuerySpace === undefined) {\n      escapeQuerySpace = URI.escapeQuerySpace;\n    }\n\n    return escapeQuerySpace ? escaped.replace(/%20/g, '+') : escaped;\n  };\n  URI.decodeQuery = function(string, escapeQuerySpace) {\n    string += '';\n    if (escapeQuerySpace === undefined) {\n      escapeQuerySpace = URI.escapeQuerySpace;\n    }\n\n    try {\n      return URI.decode(escapeQuerySpace ? 
string.replace(/\\+/g, '%20') : string);\n    } catch(e) {\n      // we're not going to mess with weird encodings,\n      // give up and return the undecoded original string\n      // see https://github.com/medialize/URI.js/issues/87\n      // see https://github.com/medialize/URI.js/issues/92\n      return string;\n    }\n  };\n  // generate encode/decode path functions\n  var _parts = {'encode':'encode', 'decode':'decode'};\n  var _part;\n  var generateAccessor = function(_group, _part) {\n    return function(string) {\n      try {\n        return URI[_part](string + '').replace(URI.characters[_group][_part].expression, function(c) {\n          return URI.characters[_group][_part].map[c];\n        });\n      } catch (e) {\n        // we're not going to mess with weird encodings,\n        // give up and return the undecoded original string\n        // see https://github.com/medialize/URI.js/issues/87\n        // see https://github.com/medialize/URI.js/issues/92\n        return string;\n      }\n    };\n  };\n\n  for (_part in _parts) {\n    URI[_part + 'PathSegment'] = generateAccessor('pathname', _parts[_part]);\n    URI[_part + 'UrnPathSegment'] = generateAccessor('urnpath', _parts[_part]);\n  }\n\n  var generateSegmentedPathFunction = function(_sep, _codingFuncName, _innerCodingFuncName) {\n    return function(string) {\n      // Why pass in names of functions, rather than the function objects themselves? The\n      // definitions of some functions (but in particular, URI.decode) will occasionally change due\n      // to URI.js having ISO8859 and Unicode modes. 
Passing in the name and getting it will ensure\n      // that the functions we use here are \"fresh\".\n      var actualCodingFunc;\n      if (!_innerCodingFuncName) {\n        actualCodingFunc = URI[_codingFuncName];\n      } else {\n        actualCodingFunc = function(string) {\n          return URI[_codingFuncName](URI[_innerCodingFuncName](string));\n        };\n      }\n\n      var segments = (string + '').split(_sep);\n\n      for (var i = 0, length = segments.length; i < length; i++) {\n        segments[i] = actualCodingFunc(segments[i]);\n      }\n\n      return segments.join(_sep);\n    };\n  };\n\n  // This takes place outside the above loop because we don't want, e.g., encodeUrnPath functions.\n  URI.decodePath = generateSegmentedPathFunction('/', 'decodePathSegment');\n  URI.decodeUrnPath = generateSegmentedPathFunction(':', 'decodeUrnPathSegment');\n  URI.recodePath = generateSegmentedPathFunction('/', 'encodePathSegment', 'decode');\n  URI.recodeUrnPath = generateSegmentedPathFunction(':', 'encodeUrnPathSegment', 'decode');\n\n  URI.encodeReserved = generateAccessor('reserved', 'encode');\n\n  URI.parse = function(string, parts) {\n    var pos;\n    if (!parts) {\n      parts = {};\n    }\n    // [protocol\"://\"[username[\":\"password]\"@\"]hostname[\":\"port]\"/\"?][path][\"?\"querystring][\"#\"fragment]\n\n    // extract fragment\n    pos = string.indexOf('#');\n    if (pos > -1) {\n      // escaping?\n      parts.fragment = string.substring(pos + 1) || null;\n      string = string.substring(0, pos);\n    }\n\n    // extract query\n    pos = string.indexOf('?');\n    if (pos > -1) {\n      // escaping?\n      parts.query = string.substring(pos + 1) || null;\n      string = string.substring(0, pos);\n    }\n\n    // extract protocol\n    if (string.substring(0, 2) === '//') {\n      // relative-scheme\n      parts.protocol = null;\n      string = string.substring(2);\n      // extract \"user:pass@host:port\"\n      string = 
URI.parseAuthority(string, parts);\n    } else {\n      pos = string.indexOf(':');\n      if (pos > -1) {\n        parts.protocol = string.substring(0, pos) || null;\n        if (parts.protocol && !parts.protocol.match(URI.protocol_expression)) {\n          // : may be within the path\n          parts.protocol = undefined;\n        } else if (string.substring(pos + 1, pos + 3) === '//') {\n          string = string.substring(pos + 3);\n\n          // extract \"user:pass@host:port\"\n          string = URI.parseAuthority(string, parts);\n        } else {\n          string = string.substring(pos + 1);\n          parts.urn = true;\n        }\n      }\n    }\n\n    // what's left must be the path\n    parts.path = string;\n\n    // and we're done\n    return parts;\n  };\n  URI.parseHost = function(string, parts) {\n    // Copy chrome, IE, opera backslash-handling behavior.\n    // Back slashes before the query string get converted to forward slashes\n    // See: https://github.com/joyent/node/blob/386fd24f49b0e9d1a8a076592a404168faeecc34/lib/url.js#L115-L124\n    // See: https://code.google.com/p/chromium/issues/detail?id=25916\n    // https://github.com/medialize/URI.js/pull/233\n    string = string.replace(/\\\\/g, '/');\n\n    // extract host:port\n    var pos = string.indexOf('/');\n    var bracketPos;\n    var t;\n\n    if (pos === -1) {\n      pos = string.length;\n    }\n\n    if (string.charAt(0) === '[') {\n      // IPv6 host - http://tools.ietf.org/html/draft-ietf-6man-text-addr-representation-04#section-6\n      // I claim most client software breaks on IPv6 anyways. 
To simplify things, URI only accepts\n      // IPv6+port in the format [2001:db8::1]:80 (for the time being)\n      bracketPos = string.indexOf(']');\n      parts.hostname = string.substring(1, bracketPos) || null;\n      parts.port = string.substring(bracketPos + 2, pos) || null;\n      if (parts.port === '/') {\n        parts.port = null;\n      }\n    } else {\n      var firstColon = string.indexOf(':');\n      var firstSlash = string.indexOf('/');\n      var nextColon = string.indexOf(':', firstColon + 1);\n      if (nextColon !== -1 && (firstSlash === -1 || nextColon < firstSlash)) {\n        // IPv6 host contains multiple colons - but no port\n        // this notation is actually not allowed by RFC 3986, but we're a liberal parser\n        parts.hostname = string.substring(0, pos) || null;\n        parts.port = null;\n      } else {\n        t = string.substring(0, pos).split(':');\n        parts.hostname = t[0] || null;\n        parts.port = t[1] || null;\n      }\n    }\n\n    if (parts.hostname && string.substring(pos).charAt(0) !== '/') {\n      pos++;\n      string = '/' + string;\n    }\n\n    return string.substring(pos) || '/';\n  };\n  URI.parseAuthority = function(string, parts) {\n    string = URI.parseUserinfo(string, parts);\n    return URI.parseHost(string, parts);\n  };\n  URI.parseUserinfo = function(string, parts) {\n    // extract username:password\n    var firstSlash = string.indexOf('/');\n    var pos = string.lastIndexOf('@', firstSlash > -1 ? firstSlash : string.length - 1);\n    var t;\n\n    // authority@ must come before /path\n    if (pos > -1 && (firstSlash === -1 || pos < firstSlash)) {\n      t = string.substring(0, pos).split(':');\n      parts.username = t[0] ? URI.decode(t[0]) : null;\n      t.shift();\n      parts.password = t[0] ? 
URI.decode(t.join(':')) : null;\n      string = string.substring(pos + 1);\n    } else {\n      parts.username = null;\n      parts.password = null;\n    }\n\n    return string;\n  };\n  URI.parseQuery = function(string, escapeQuerySpace) {\n    if (!string) {\n      return {};\n    }\n\n    // throw out the funky business - \"?\"[name\"=\"value\"&\"]+\n    string = string.replace(/&+/g, '&').replace(/^\\?*&*|&+$/g, '');\n\n    if (!string) {\n      return {};\n    }\n\n    var items = {};\n    var splits = string.split('&');\n    var length = splits.length;\n    var v, name, value;\n\n    for (var i = 0; i < length; i++) {\n      v = splits[i].split('=');\n      name = URI.decodeQuery(v.shift(), escapeQuerySpace);\n      // no \"=\" is null according to http://dvcs.w3.org/hg/url/raw-file/tip/Overview.html#collect-url-parameters\n      value = v.length ? URI.decodeQuery(v.join('='), escapeQuerySpace) : null;\n\n      if (hasOwn.call(items, name)) {\n        if (typeof items[name] === 'string' || items[name] === null) {\n          items[name] = [items[name]];\n        }\n\n        items[name].push(value);\n      } else {\n        items[name] = value;\n      }\n    }\n\n    return items;\n  };\n\n  URI.build = function(parts) {\n    var t = '';\n\n    if (parts.protocol) {\n      t += parts.protocol + ':';\n    }\n\n    if (!parts.urn && (t || parts.hostname)) {\n      t += '//';\n    }\n\n    t += (URI.buildAuthority(parts) || '');\n\n    if (typeof parts.path === 'string') {\n      if (parts.path.charAt(0) !== '/' && typeof parts.hostname === 'string') {\n        t += '/';\n      }\n\n      t += parts.path;\n    }\n\n    if (typeof parts.query === 'string' && parts.query) {\n      t += '?' 
+ parts.query;\n    }\n\n    if (typeof parts.fragment === 'string' && parts.fragment) {\n      t += '#' + parts.fragment;\n    }\n    return t;\n  };\n  URI.buildHost = function(parts) {\n    var t = '';\n\n    if (!parts.hostname) {\n      return '';\n    } else if (URI.ip6_expression.test(parts.hostname)) {\n      t += '[' + parts.hostname + ']';\n    } else {\n      t += parts.hostname;\n    }\n\n    if (parts.port) {\n      t += ':' + parts.port;\n    }\n\n    return t;\n  };\n  URI.buildAuthority = function(parts) {\n    return URI.buildUserinfo(parts) + URI.buildHost(parts);\n  };\n  URI.buildUserinfo = function(parts) {\n    var t = '';\n\n    if (parts.username) {\n      t += URI.encode(parts.username);\n    }\n\n    if (parts.password) {\n      t += ':' + URI.encode(parts.password);\n    }\n\n    if (t) {\n      t += '@';\n    }\n\n    return t;\n  };\n  URI.buildQuery = function(data, duplicateQueryParameters, escapeQuerySpace) {\n    // according to http://tools.ietf.org/html/rfc3986 or http://labs.apache.org/webarch/uri/rfc/rfc3986.html\n    // being »-._~!$&'()*+,;=:@/?« %HEX and alnum are allowed\n    // the RFC explicitly states ?/foo being a valid use case, no mention of parameter syntax!\n    // URI.js treats the query string as being application/x-www-form-urlencoded\n    // see http://www.w3.org/TR/REC-html40/interact/forms.html#form-content-type\n\n    var t = '';\n    var unique, key, i, length;\n    for (key in data) {\n      if (hasOwn.call(data, key) && key) {\n        if (isArray(data[key])) {\n          unique = {};\n          for (i = 0, length = data[key].length; i < length; i++) {\n            if (data[key][i] !== undefined && unique[data[key][i] + ''] === undefined) {\n              t += '&' + URI.buildQueryParameter(key, data[key][i], escapeQuerySpace);\n              if (duplicateQueryParameters !== true) {\n                unique[data[key][i] + ''] = true;\n              }\n            }\n          }\n        } else if (data[key] 
!== undefined) {\n          t += '&' + URI.buildQueryParameter(key, data[key], escapeQuerySpace);\n        }\n      }\n    }\n\n    return t.substring(1);\n  };\n  URI.buildQueryParameter = function(name, value, escapeQuerySpace) {\n    // http://www.w3.org/TR/REC-html40/interact/forms.html#form-content-type -- application/x-www-form-urlencoded\n    // don't append \"=\" for null values, according to http://dvcs.w3.org/hg/url/raw-file/tip/Overview.html#url-parameter-serialization\n    return URI.encodeQuery(name, escapeQuerySpace) + (value !== null ? '=' + URI.encodeQuery(value, escapeQuerySpace) : '');\n  };\n\n  URI.addQuery = function(data, name, value) {\n    if (typeof name === 'object') {\n      for (var key in name) {\n        if (hasOwn.call(name, key)) {\n          URI.addQuery(data, key, name[key]);\n        }\n      }\n    } else if (typeof name === 'string') {\n      if (data[name] === undefined) {\n        data[name] = value;\n        return;\n      } else if (typeof data[name] === 'string') {\n        data[name] = [data[name]];\n      }\n\n      if (!isArray(value)) {\n        value = [value];\n      }\n\n      data[name] = (data[name] || []).concat(value);\n    } else {\n      throw new TypeError('URI.addQuery() accepts an object, string as the name parameter');\n    }\n  };\n  URI.removeQuery = function(data, name, value) {\n    var i, length, key;\n\n    if (isArray(name)) {\n      for (i = 0, length = name.length; i < length; i++) {\n        data[name[i]] = undefined;\n      }\n    } else if (getType(name) === 'RegExp') {\n      for (key in data) {\n        if (name.test(key)) {\n          data[key] = undefined;\n        }\n      }\n    } else if (typeof name === 'object') {\n      for (key in name) {\n        if (hasOwn.call(name, key)) {\n          URI.removeQuery(data, key, name[key]);\n        }\n      }\n    } else if (typeof name === 'string') {\n      if (value !== undefined) {\n        if (getType(value) === 'RegExp') {\n          if 
(!isArray(data[name]) && value.test(data[name])) {\n            data[name] = undefined;\n          } else {\n            data[name] = filterArrayValues(data[name], value);\n          }\n        } else if (data[name] === String(value) && (!isArray(value) || value.length === 1)) {\n          data[name] = undefined;\n        } else if (isArray(data[name])) {\n          data[name] = filterArrayValues(data[name], value);\n        }\n      } else {\n        data[name] = undefined;\n      }\n    } else {\n      throw new TypeError('URI.removeQuery() accepts an object, string, RegExp as the first parameter');\n    }\n  };\n  URI.hasQuery = function(data, name, value, withinArray) {\n    switch (getType(name)) {\n      case 'String':\n        // Nothing to do here\n        break;\n\n      case 'RegExp':\n        for (var key in data) {\n          if (hasOwn.call(data, key)) {\n            if (name.test(key) && (value === undefined || URI.hasQuery(data, key, value))) {\n              return true;\n            }\n          }\n        }\n\n        return false;\n\n      case 'Object':\n        for (var _key in name) {\n          if (hasOwn.call(name, _key)) {\n            if (!URI.hasQuery(data, _key, name[_key])) {\n              return false;\n            }\n          }\n        }\n\n        return true;\n\n      default:\n        throw new TypeError('URI.hasQuery() accepts a string, regular expression or object as the name parameter');\n    }\n\n    switch (getType(value)) {\n      case 'Undefined':\n        // true if exists (but may be empty)\n        return name in data; // data[name] !== undefined;\n\n      case 'Boolean':\n        // true if exists and non-empty\n        var _booly = Boolean(isArray(data[name]) ? 
data[name].length : data[name]);\n        return value === _booly;\n\n      case 'Function':\n        // allow complex comparison\n        return !!value(data[name], name, data);\n\n      case 'Array':\n        if (!isArray(data[name])) {\n          return false;\n        }\n\n        var op = withinArray ? arrayContains : arraysEqual;\n        return op(data[name], value);\n\n      case 'RegExp':\n        if (!isArray(data[name])) {\n          return Boolean(data[name] && data[name].match(value));\n        }\n\n        if (!withinArray) {\n          return false;\n        }\n\n        return arrayContains(data[name], value);\n\n      case 'Number':\n        value = String(value);\n        /* falls through */\n      case 'String':\n        if (!isArray(data[name])) {\n          return data[name] === value;\n        }\n\n        if (!withinArray) {\n          return false;\n        }\n\n        return arrayContains(data[name], value);\n\n      default:\n        throw new TypeError('URI.hasQuery() accepts undefined, boolean, string, number, RegExp, Function as the value parameter');\n    }\n  };\n\n\n  URI.joinPaths = function() {\n    var input = [];\n    var segments = [];\n    var nonEmptySegments = 0;\n\n    for (var i = 0; i < arguments.length; i++) {\n      var url = new URI(arguments[i]);\n      input.push(url);\n      var _segments = url.segment();\n      for (var s = 0; s < _segments.length; s++) {\n        if (typeof _segments[s] === 'string') {\n          segments.push(_segments[s]);\n        }\n\n        if (_segments[s]) {\n          nonEmptySegments++;\n        }\n      }\n    }\n\n    if (!segments.length || !nonEmptySegments) {\n      return new URI('');\n    }\n\n    var uri = new URI('').segment(segments);\n\n    if (input[0].path() === '' || input[0].path().slice(0, 1) === '/') {\n      uri.path('/' + uri.path());\n    }\n\n    return uri.normalize();\n  };\n\n  URI.commonPath = function(one, two) {\n    var length = Math.min(one.length, 
two.length);\n    var pos;\n\n    // find first non-matching character\n    for (pos = 0; pos < length; pos++) {\n      if (one.charAt(pos) !== two.charAt(pos)) {\n        pos--;\n        break;\n      }\n    }\n\n    if (pos < 1) {\n      return one.charAt(0) === two.charAt(0) && one.charAt(0) === '/' ? '/' : '';\n    }\n\n    // revert to last /\n    if (one.charAt(pos) !== '/' || two.charAt(pos) !== '/') {\n      pos = one.substring(0, pos).lastIndexOf('/');\n    }\n\n    return one.substring(0, pos + 1);\n  };\n\n  URI.withinString = function(string, callback, options) {\n    options || (options = {});\n    var _start = options.start || URI.findUri.start;\n    var _end = options.end || URI.findUri.end;\n    var _trim = options.trim || URI.findUri.trim;\n    var _attributeOpen = /[a-z0-9-]=[\"']?$/i;\n\n    _start.lastIndex = 0;\n    while (true) {\n      var match = _start.exec(string);\n      if (!match) {\n        break;\n      }\n\n      var start = match.index;\n      if (options.ignoreHtml) {\n        // attribut(e=[\"']?$)\n        var attributeOpen = string.slice(Math.max(start - 3, 0), start);\n        if (attributeOpen && _attributeOpen.test(attributeOpen)) {\n          continue;\n        }\n      }\n\n      var end = start + string.slice(start).search(_end);\n      var slice = string.slice(start, end).replace(_trim, '');\n      if (options.ignore && options.ignore.test(slice)) {\n        continue;\n      }\n\n      end = start + slice.length;\n      var result = callback(slice, start, end, string);\n      if (result === undefined) {\n        _start.lastIndex = end;\n        continue;\n      }\n\n      result = String(result);\n      string = string.slice(0, start) + result + string.slice(end);\n      _start.lastIndex = start + result.length;\n    }\n\n    _start.lastIndex = 0;\n    return string;\n  };\n\n  URI.ensureValidHostname = function(v) {\n    // Theoretically URIs allow percent-encoding in Hostnames (according to RFC 3986)\n    // they are 
not part of DNS and therefore ignored by URI.js\n\n    if (v.match(URI.invalid_hostname_characters)) {\n      // test punycode\n      if (!punycode) {\n        throw new TypeError('Hostname \"' + v + '\" contains characters other than [A-Z0-9.-] and Punycode.js is not available');\n      }\n\n      if (punycode.toASCII(v).match(URI.invalid_hostname_characters)) {\n        throw new TypeError('Hostname \"' + v + '\" contains characters other than [A-Z0-9.-]');\n      }\n    }\n  };\n\n  // noConflict\n  URI.noConflict = function(removeAll) {\n    if (removeAll) {\n      var unconflicted = {\n        URI: this.noConflict()\n      };\n\n      if (root.URITemplate && typeof root.URITemplate.noConflict === 'function') {\n        unconflicted.URITemplate = root.URITemplate.noConflict();\n      }\n\n      if (root.IPv6 && typeof root.IPv6.noConflict === 'function') {\n        unconflicted.IPv6 = root.IPv6.noConflict();\n      }\n\n      if (root.SecondLevelDomains && typeof root.SecondLevelDomains.noConflict === 'function') {\n        unconflicted.SecondLevelDomains = root.SecondLevelDomains.noConflict();\n      }\n\n      return unconflicted;\n    } else if (root.URI === this) {\n      root.URI = _URI;\n    }\n\n    return this;\n  };\n\n  p.build = function(deferBuild) {\n    if (deferBuild === true) {\n      this._deferred_build = true;\n    } else if (deferBuild === undefined || this._deferred_build) {\n      this._string = URI.build(this._parts);\n      this._deferred_build = false;\n    }\n\n    return this;\n  };\n\n  p.clone = function() {\n    return new URI(this);\n  };\n\n  p.valueOf = p.toString = function() {\n    return this.build(false)._string;\n  };\n\n\n  function generateSimpleAccessor(_part){\n    return function(v, build) {\n      if (v === undefined) {\n        return this._parts[_part] || '';\n      } else {\n        this._parts[_part] = v || null;\n        this.build(!build);\n        return this;\n      }\n    };\n  }\n\n  function 
generatePrefixAccessor(_part, _key){\n    return function(v, build) {\n      if (v === undefined) {\n        return this._parts[_part] || '';\n      } else {\n        if (v !== null) {\n          v = v + '';\n          if (v.charAt(0) === _key) {\n            v = v.substring(1);\n          }\n        }\n\n        this._parts[_part] = v;\n        this.build(!build);\n        return this;\n      }\n    };\n  }\n\n  p.protocol = generateSimpleAccessor('protocol');\n  p.username = generateSimpleAccessor('username');\n  p.password = generateSimpleAccessor('password');\n  p.hostname = generateSimpleAccessor('hostname');\n  p.port = generateSimpleAccessor('port');\n  p.query = generatePrefixAccessor('query', '?');\n  p.fragment = generatePrefixAccessor('fragment', '#');\n\n  p.search = function(v, build) {\n    var t = this.query(v, build);\n    return typeof t === 'string' && t.length ? ('?' + t) : t;\n  };\n  p.hash = function(v, build) {\n    var t = this.fragment(v, build);\n    return typeof t === 'string' && t.length ? ('#' + t) : t;\n  };\n\n  p.pathname = function(v, build) {\n    if (v === undefined || v === true) {\n      var res = this._parts.path || (this._parts.hostname ? '/' : '');\n      return v ? (this._parts.urn ? URI.decodeUrnPath : URI.decodePath)(res) : res;\n    } else {\n      if (this._parts.urn) {\n        this._parts.path = v ? URI.recodeUrnPath(v) : '';\n      } else {\n        this._parts.path = v ? 
URI.recodePath(v) : '/';\n      }\n      this.build(!build);\n      return this;\n    }\n  };\n  p.path = p.pathname;\n  p.href = function(href, build) {\n    var key;\n\n    if (href === undefined) {\n      return this.toString();\n    }\n\n    this._string = '';\n    this._parts = URI._parts();\n\n    var _URI = href instanceof URI;\n    var _object = typeof href === 'object' && (href.hostname || href.path || href.pathname);\n    if (href.nodeName) {\n      var attribute = URI.getDomAttribute(href);\n      href = href[attribute] || '';\n      _object = false;\n    }\n\n    // window.location is reported to be an object, but it's not the sort\n    // of object we're looking for:\n    // * location.protocol ends with a colon\n    // * location.query != object.search\n    // * location.hash != object.fragment\n    // simply serializing the unknown object should do the trick\n    // (for location, not for everything...)\n    if (!_URI && _object && href.pathname !== undefined) {\n      href = href.toString();\n    }\n\n    if (typeof href === 'string' || href instanceof String) {\n      this._parts = URI.parse(String(href), this._parts);\n    } else if (_URI || _object) {\n      var src = _URI ? 
href._parts : href;\n      for (key in src) {\n        if (hasOwn.call(this._parts, key)) {\n          this._parts[key] = src[key];\n        }\n      }\n    } else {\n      throw new TypeError('invalid input');\n    }\n\n    this.build(!build);\n    return this;\n  };\n\n  // identification accessors\n  p.is = function(what) {\n    var ip = false;\n    var ip4 = false;\n    var ip6 = false;\n    var name = false;\n    var sld = false;\n    var idn = false;\n    var punycode = false;\n    var relative = !this._parts.urn;\n\n    if (this._parts.hostname) {\n      relative = false;\n      ip4 = URI.ip4_expression.test(this._parts.hostname);\n      ip6 = URI.ip6_expression.test(this._parts.hostname);\n      ip = ip4 || ip6;\n      name = !ip;\n      sld = name && SLD && SLD.has(this._parts.hostname);\n      idn = name && URI.idn_expression.test(this._parts.hostname);\n      punycode = name && URI.punycode_expression.test(this._parts.hostname);\n    }\n\n    switch (what.toLowerCase()) {\n      case 'relative':\n        return relative;\n\n      case 'absolute':\n        return !relative;\n\n      // hostname identification\n      case 'domain':\n      case 'name':\n        return name;\n\n      case 'sld':\n        return sld;\n\n      case 'ip':\n        return ip;\n\n      case 'ip4':\n      case 'ipv4':\n      case 'inet4':\n        return ip4;\n\n      case 'ip6':\n      case 'ipv6':\n      case 'inet6':\n        return ip6;\n\n      case 'idn':\n        return idn;\n\n      case 'url':\n        return !this._parts.urn;\n\n      case 'urn':\n        return !!this._parts.urn;\n\n      case 'punycode':\n        return punycode;\n    }\n\n    return null;\n  };\n\n  // component specific input validation\n  var _protocol = p.protocol;\n  var _port = p.port;\n  var _hostname = p.hostname;\n\n  p.protocol = function(v, build) {\n    if (v !== undefined) {\n      if (v) {\n        // accept trailing ://\n        v = v.replace(/:(\\/\\/)?$/, '');\n\n        if 
(!v.match(URI.protocol_expression)) {\n          throw new TypeError('Protocol \"' + v + '\" contains characters other than [A-Z0-9.+-] or doesn\\'t start with [A-Z]');\n        }\n      }\n    }\n    return _protocol.call(this, v, build);\n  };\n  p.scheme = p.protocol;\n  p.port = function(v, build) {\n    if (this._parts.urn) {\n      return v === undefined ? '' : this;\n    }\n\n    if (v !== undefined) {\n      if (v === 0) {\n        v = null;\n      }\n\n      if (v) {\n        v += '';\n        if (v.charAt(0) === ':') {\n          v = v.substring(1);\n        }\n\n        if (v.match(/[^0-9]/)) {\n          throw new TypeError('Port \"' + v + '\" contains characters other than [0-9]');\n        }\n      }\n    }\n    return _port.call(this, v, build);\n  };\n  p.hostname = function(v, build) {\n    if (this._parts.urn) {\n      return v === undefined ? '' : this;\n    }\n\n    if (v !== undefined) {\n      var x = {};\n      var res = URI.parseHost(v, x);\n      if (res !== '/') {\n        throw new TypeError('Hostname \"' + v + '\" contains characters other than [A-Z0-9.-]');\n      }\n\n      v = x.hostname;\n    }\n    return _hostname.call(this, v, build);\n  };\n\n  // compound accessors\n  p.origin = function(v, build) {\n    if (this._parts.urn) {\n      return v === undefined ? '' : this;\n    }\n\n    if (v === undefined) {\n      var protocol = this.protocol();\n      var authority = this.authority();\n      if (!authority) {\n        return '';\n      }\n\n      return (protocol ? protocol + '://' : '') + this.authority();\n    } else {\n      var origin = URI(v);\n      this\n        .protocol(origin.protocol())\n        .authority(origin.authority())\n        .build(!build);\n      return this;\n    }\n  };\n  p.host = function(v, build) {\n    if (this._parts.urn) {\n      return v === undefined ? '' : this;\n    }\n\n    if (v === undefined) {\n      return this._parts.hostname ? 
URI.buildHost(this._parts) : '';\n    } else {\n      var res = URI.parseHost(v, this._parts);\n      if (res !== '/') {\n        throw new TypeError('Hostname \"' + v + '\" contains characters other than [A-Z0-9.-]');\n      }\n\n      this.build(!build);\n      return this;\n    }\n  };\n  p.authority = function(v, build) {\n    if (this._parts.urn) {\n      return v === undefined ? '' : this;\n    }\n\n    if (v === undefined) {\n      return this._parts.hostname ? URI.buildAuthority(this._parts) : '';\n    } else {\n      var res = URI.parseAuthority(v, this._parts);\n      if (res !== '/') {\n        throw new TypeError('Hostname \"' + v + '\" contains characters other than [A-Z0-9.-]');\n      }\n\n      this.build(!build);\n      return this;\n    }\n  };\n  p.userinfo = function(v, build) {\n    if (this._parts.urn) {\n      return v === undefined ? '' : this;\n    }\n\n    if (v === undefined) {\n      var t = URI.buildUserinfo(this._parts);\n      return t ? t.substring(0, t.length -1) : t;\n    } else {\n      if (v[v.length-1] !== '@') {\n        v += '@';\n      }\n\n      URI.parseUserinfo(v, this._parts);\n      this.build(!build);\n      return this;\n    }\n  };\n  p.resource = function(v, build) {\n    var parts;\n\n    if (v === undefined) {\n      return this.path() + this.search() + this.hash();\n    }\n\n    parts = URI.parse(v);\n    this._parts.path = parts.path;\n    this._parts.query = parts.query;\n    this._parts.fragment = parts.fragment;\n    this.build(!build);\n    return this;\n  };\n\n  // fraction accessors\n  p.subdomain = function(v, build) {\n    if (this._parts.urn) {\n      return v === undefined ? 
'' : this;\n    }\n\n    // convenience, return \"www\" from \"www.example.org\"\n    if (v === undefined) {\n      if (!this._parts.hostname || this.is('IP')) {\n        return '';\n      }\n\n      // grab domain and add another segment\n      var end = this._parts.hostname.length - this.domain().length - 1;\n      return this._parts.hostname.substring(0, end) || '';\n    } else {\n      var e = this._parts.hostname.length - this.domain().length;\n      var sub = this._parts.hostname.substring(0, e);\n      var replace = new RegExp('^' + escapeRegEx(sub));\n\n      if (v && v.charAt(v.length - 1) !== '.') {\n        v += '.';\n      }\n\n      if (v) {\n        URI.ensureValidHostname(v);\n      }\n\n      this._parts.hostname = this._parts.hostname.replace(replace, v);\n      this.build(!build);\n      return this;\n    }\n  };\n  p.domain = function(v, build) {\n    if (this._parts.urn) {\n      return v === undefined ? '' : this;\n    }\n\n    if (typeof v === 'boolean') {\n      build = v;\n      v = undefined;\n    }\n\n    // convenience, return \"example.org\" from \"www.example.org\"\n    if (v === undefined) {\n      if (!this._parts.hostname || this.is('IP')) {\n        return '';\n      }\n\n      // if hostname consists of 1 or 2 segments, it must be the domain\n      var t = this._parts.hostname.match(/\\./g);\n      if (t && t.length < 2) {\n        return this._parts.hostname;\n      }\n\n      // grab tld and add another segment\n      var end = this._parts.hostname.length - this.tld(build).length - 1;\n      end = this._parts.hostname.lastIndexOf('.', end -1) + 1;\n      return this._parts.hostname.substring(end) || '';\n    } else {\n      if (!v) {\n        throw new TypeError('cannot set domain empty');\n      }\n\n      URI.ensureValidHostname(v);\n\n      if (!this._parts.hostname || this.is('IP')) {\n        this._parts.hostname = v;\n      } else {\n        var replace = new RegExp(escapeRegEx(this.domain()) + '$');\n        
this._parts.hostname = this._parts.hostname.replace(replace, v);\n      }\n\n      this.build(!build);\n      return this;\n    }\n  };\n  p.tld = function(v, build) {\n    if (this._parts.urn) {\n      return v === undefined ? '' : this;\n    }\n\n    if (typeof v === 'boolean') {\n      build = v;\n      v = undefined;\n    }\n\n    // return \"org\" from \"www.example.org\"\n    if (v === undefined) {\n      if (!this._parts.hostname || this.is('IP')) {\n        return '';\n      }\n\n      var pos = this._parts.hostname.lastIndexOf('.');\n      var tld = this._parts.hostname.substring(pos + 1);\n\n      if (build !== true && SLD && SLD.list[tld.toLowerCase()]) {\n        return SLD.get(this._parts.hostname) || tld;\n      }\n\n      return tld;\n    } else {\n      var replace;\n\n      if (!v) {\n        throw new TypeError('cannot set TLD empty');\n      } else if (v.match(/[^a-zA-Z0-9-]/)) {\n        if (SLD && SLD.is(v)) {\n          replace = new RegExp(escapeRegEx(this.tld()) + '$');\n          this._parts.hostname = this._parts.hostname.replace(replace, v);\n        } else {\n          throw new TypeError('TLD \"' + v + '\" contains characters other than [A-Z0-9]');\n        }\n      } else if (!this._parts.hostname || this.is('IP')) {\n        throw new ReferenceError('cannot set TLD on non-domain host');\n      } else {\n        replace = new RegExp(escapeRegEx(this.tld()) + '$');\n        this._parts.hostname = this._parts.hostname.replace(replace, v);\n      }\n\n      this.build(!build);\n      return this;\n    }\n  };\n  p.directory = function(v, build) {\n    if (this._parts.urn) {\n      return v === undefined ? 
'' : this;\n    }\n\n    if (v === undefined || v === true) {\n      if (!this._parts.path && !this._parts.hostname) {\n        return '';\n      }\n\n      if (this._parts.path === '/') {\n        return '/';\n      }\n\n      var end = this._parts.path.length - this.filename().length - 1;\n      var res = this._parts.path.substring(0, end) || (this._parts.hostname ? '/' : '');\n\n      return v ? URI.decodePath(res) : res;\n\n    } else {\n      var e = this._parts.path.length - this.filename().length;\n      var directory = this._parts.path.substring(0, e);\n      var replace = new RegExp('^' + escapeRegEx(directory));\n\n      // fully qualifier directories begin with a slash\n      if (!this.is('relative')) {\n        if (!v) {\n          v = '/';\n        }\n\n        if (v.charAt(0) !== '/') {\n          v = '/' + v;\n        }\n      }\n\n      // directories always end with a slash\n      if (v && v.charAt(v.length - 1) !== '/') {\n        v += '/';\n      }\n\n      v = URI.recodePath(v);\n      this._parts.path = this._parts.path.replace(replace, v);\n      this.build(!build);\n      return this;\n    }\n  };\n  p.filename = function(v, build) {\n    if (this._parts.urn) {\n      return v === undefined ? '' : this;\n    }\n\n    if (v === undefined || v === true) {\n      if (!this._parts.path || this._parts.path === '/') {\n        return '';\n      }\n\n      var pos = this._parts.path.lastIndexOf('/');\n      var res = this._parts.path.substring(pos+1);\n\n      return v ? 
URI.decodePathSegment(res) : res;\n    } else {\n      var mutatedDirectory = false;\n\n      if (v.charAt(0) === '/') {\n        v = v.substring(1);\n      }\n\n      if (v.match(/\\.?\\//)) {\n        mutatedDirectory = true;\n      }\n\n      var replace = new RegExp(escapeRegEx(this.filename()) + '$');\n      v = URI.recodePath(v);\n      this._parts.path = this._parts.path.replace(replace, v);\n\n      if (mutatedDirectory) {\n        this.normalizePath(build);\n      } else {\n        this.build(!build);\n      }\n\n      return this;\n    }\n  };\n  p.suffix = function(v, build) {\n    if (this._parts.urn) {\n      return v === undefined ? '' : this;\n    }\n\n    if (v === undefined || v === true) {\n      if (!this._parts.path || this._parts.path === '/') {\n        return '';\n      }\n\n      var filename = this.filename();\n      var pos = filename.lastIndexOf('.');\n      var s, res;\n\n      if (pos === -1) {\n        return '';\n      }\n\n      // suffix may only contain alnum characters (yup, I made this up.)\n      s = filename.substring(pos+1);\n      res = (/^[a-z0-9%]+$/i).test(s) ? s : '';\n      return v ? URI.decodePathSegment(res) : res;\n    } else {\n      if (v.charAt(0) === '.') {\n        v = v.substring(1);\n      }\n\n      var suffix = this.suffix();\n      var replace;\n\n      if (!suffix) {\n        if (!v) {\n          return this;\n        }\n\n        this._parts.path += '.' + URI.recodePath(v);\n      } else if (!v) {\n        replace = new RegExp(escapeRegEx('.' + suffix) + '$');\n      } else {\n        replace = new RegExp(escapeRegEx(suffix) + '$');\n      }\n\n      if (replace) {\n        v = URI.recodePath(v);\n        this._parts.path = this._parts.path.replace(replace, v);\n      }\n\n      this.build(!build);\n      return this;\n    }\n  };\n  p.segment = function(segment, v, build) {\n    var separator = this._parts.urn ? 
':' : '/';\n    var path = this.path();\n    var absolute = path.substring(0, 1) === '/';\n    var segments = path.split(separator);\n\n    if (segment !== undefined && typeof segment !== 'number') {\n      build = v;\n      v = segment;\n      segment = undefined;\n    }\n\n    if (segment !== undefined && typeof segment !== 'number') {\n      throw new Error('Bad segment \"' + segment + '\", must be 0-based integer');\n    }\n\n    if (absolute) {\n      segments.shift();\n    }\n\n    if (segment < 0) {\n      // allow negative indexes to address from the end\n      segment = Math.max(segments.length + segment, 0);\n    }\n\n    if (v === undefined) {\n      /*jshint laxbreak: true */\n      return segment === undefined\n        ? segments\n        : segments[segment];\n      /*jshint laxbreak: false */\n    } else if (segment === null || segments[segment] === undefined) {\n      if (isArray(v)) {\n        segments = [];\n        // collapse empty elements within array\n        for (var i=0, l=v.length; i < l; i++) {\n          if (!v[i].length && (!segments.length || !segments[segments.length -1].length)) {\n            continue;\n          }\n\n          if (segments.length && !segments[segments.length -1].length) {\n            segments.pop();\n          }\n\n          segments.push(trimSlashes(v[i]));\n        }\n      } else if (v || typeof v === 'string') {\n        v = trimSlashes(v);\n        if (segments[segments.length -1] === '') {\n          // empty trailing elements have to be overwritten\n          // to prevent results such as /foo//bar\n          segments[segments.length -1] = v;\n        } else {\n          segments.push(v);\n        }\n      }\n    } else {\n      if (v) {\n        segments[segment] = trimSlashes(v);\n      } else {\n        segments.splice(segment, 1);\n      }\n    }\n\n    if (absolute) {\n      segments.unshift('');\n    }\n\n    return this.path(segments.join(separator), build);\n  };\n  p.segmentCoded = function(segment, 
v, build) {\n    var segments, i, l;\n\n    if (typeof segment !== 'number') {\n      build = v;\n      v = segment;\n      segment = undefined;\n    }\n\n    if (v === undefined) {\n      segments = this.segment(segment, v, build);\n      if (!isArray(segments)) {\n        segments = segments !== undefined ? URI.decode(segments) : undefined;\n      } else {\n        for (i = 0, l = segments.length; i < l; i++) {\n          segments[i] = URI.decode(segments[i]);\n        }\n      }\n\n      return segments;\n    }\n\n    if (!isArray(v)) {\n      v = (typeof v === 'string' || v instanceof String) ? URI.encode(v) : v;\n    } else {\n      for (i = 0, l = v.length; i < l; i++) {\n        v[i] = URI.encode(v[i]);\n      }\n    }\n\n    return this.segment(segment, v, build);\n  };\n\n  // mutating query string\n  var q = p.query;\n  p.query = function(v, build) {\n    if (v === true) {\n      return URI.parseQuery(this._parts.query, this._parts.escapeQuerySpace);\n    } else if (typeof v === 'function') {\n      var data = URI.parseQuery(this._parts.query, this._parts.escapeQuerySpace);\n      var result = v.call(this, data);\n      this._parts.query = URI.buildQuery(result || data, this._parts.duplicateQueryParameters, this._parts.escapeQuerySpace);\n      this.build(!build);\n      return this;\n    } else if (v !== undefined && typeof v !== 'string') {\n      this._parts.query = URI.buildQuery(v, this._parts.duplicateQueryParameters, this._parts.escapeQuerySpace);\n      this.build(!build);\n      return this;\n    } else {\n      return q.call(this, v, build);\n    }\n  };\n  p.setQuery = function(name, value, build) {\n    var data = URI.parseQuery(this._parts.query, this._parts.escapeQuerySpace);\n\n    if (typeof name === 'string' || name instanceof String) {\n      data[name] = value !== undefined ? 
value : null;\n    } else if (typeof name === 'object') {\n      for (var key in name) {\n        if (hasOwn.call(name, key)) {\n          data[key] = name[key];\n        }\n      }\n    } else {\n      throw new TypeError('URI.addQuery() accepts an object, string as the name parameter');\n    }\n\n    this._parts.query = URI.buildQuery(data, this._parts.duplicateQueryParameters, this._parts.escapeQuerySpace);\n    if (typeof name !== 'string') {\n      build = value;\n    }\n\n    this.build(!build);\n    return this;\n  };\n  p.addQuery = function(name, value, build) {\n    var data = URI.parseQuery(this._parts.query, this._parts.escapeQuerySpace);\n    URI.addQuery(data, name, value === undefined ? null : value);\n    this._parts.query = URI.buildQuery(data, this._parts.duplicateQueryParameters, this._parts.escapeQuerySpace);\n    if (typeof name !== 'string') {\n      build = value;\n    }\n\n    this.build(!build);\n    return this;\n  };\n  p.removeQuery = function(name, value, build) {\n    var data = URI.parseQuery(this._parts.query, this._parts.escapeQuerySpace);\n    URI.removeQuery(data, name, value);\n    this._parts.query = URI.buildQuery(data, this._parts.duplicateQueryParameters, this._parts.escapeQuerySpace);\n    if (typeof name !== 'string') {\n      build = value;\n    }\n\n    this.build(!build);\n    return this;\n  };\n  p.hasQuery = function(name, value, withinArray) {\n    var data = URI.parseQuery(this._parts.query, this._parts.escapeQuerySpace);\n    return URI.hasQuery(data, name, value, withinArray);\n  };\n  p.setSearch = p.setQuery;\n  p.addSearch = p.addQuery;\n  p.removeSearch = p.removeQuery;\n  p.hasSearch = p.hasQuery;\n\n  // sanitizing URLs\n  p.normalize = function() {\n    if (this._parts.urn) {\n      return this\n        .normalizeProtocol(false)\n        .normalizePath(false)\n        .normalizeQuery(false)\n        .normalizeFragment(false)\n        .build();\n    }\n\n    return this\n      .normalizeProtocol(false)\n     
 .normalizeHostname(false)\n      .normalizePort(false)\n      .normalizePath(false)\n      .normalizeQuery(false)\n      .normalizeFragment(false)\n      .build();\n  };\n  p.normalizeProtocol = function(build) {\n    if (typeof this._parts.protocol === 'string') {\n      this._parts.protocol = this._parts.protocol.toLowerCase();\n      this.build(!build);\n    }\n\n    return this;\n  };\n  p.normalizeHostname = function(build) {\n    if (this._parts.hostname) {\n      if (this.is('IDN') && punycode) {\n        this._parts.hostname = punycode.toASCII(this._parts.hostname);\n      } else if (this.is('IPv6') && IPv6) {\n        this._parts.hostname = IPv6.best(this._parts.hostname);\n      }\n\n      this._parts.hostname = this._parts.hostname.toLowerCase();\n      this.build(!build);\n    }\n\n    return this;\n  };\n  p.normalizePort = function(build) {\n    // remove port of it's the protocol's default\n    if (typeof this._parts.protocol === 'string' && this._parts.port === URI.defaultPorts[this._parts.protocol]) {\n      this._parts.port = null;\n      this.build(!build);\n    }\n\n    return this;\n  };\n  p.normalizePath = function(build) {\n    var _path = this._parts.path;\n    if (!_path) {\n      return this;\n    }\n\n    if (this._parts.urn) {\n      this._parts.path = URI.recodeUrnPath(this._parts.path);\n      this.build(!build);\n      return this;\n    }\n\n    if (this._parts.path === '/') {\n      return this;\n    }\n\n    _path = URI.recodePath(_path);\n\n    var _was_relative;\n    var _leadingParents = '';\n    var _parent, _pos;\n\n    // handle relative paths\n    if (_path.charAt(0) !== '/') {\n      _was_relative = true;\n      _path = '/' + _path;\n    }\n\n    // handle relative files (as opposed to directories)\n    if (_path.slice(-3) === '/..' 
|| _path.slice(-2) === '/.') {\n      _path += '/';\n    }\n\n    // resolve simples\n    _path = _path\n      .replace(/(\\/(\\.\\/)+)|(\\/\\.$)/g, '/')\n      .replace(/\\/{2,}/g, '/');\n\n    // remember leading parents\n    if (_was_relative) {\n      _leadingParents = _path.substring(1).match(/^(\\.\\.\\/)+/) || '';\n      if (_leadingParents) {\n        _leadingParents = _leadingParents[0];\n      }\n    }\n\n    // resolve parents\n    while (true) {\n      _parent = _path.search(/\\/\\.\\.(\\/|$)/);\n      if (_parent === -1) {\n        // no more ../ to resolve\n        break;\n      } else if (_parent === 0) {\n        // top level cannot be relative, skip it\n        _path = _path.substring(3);\n        continue;\n      }\n\n      _pos = _path.substring(0, _parent).lastIndexOf('/');\n      if (_pos === -1) {\n        _pos = _parent;\n      }\n      _path = _path.substring(0, _pos) + _path.substring(_parent + 3);\n    }\n\n    // revert to relative\n    if (_was_relative && this.is('relative')) {\n      _path = _leadingParents + _path.substring(1);\n    }\n\n    this._parts.path = _path;\n    this.build(!build);\n    return this;\n  };\n  p.normalizePathname = p.normalizePath;\n  p.normalizeQuery = function(build) {\n    if (typeof this._parts.query === 'string') {\n      if (!this._parts.query.length) {\n        this._parts.query = null;\n      } else {\n        this.query(URI.parseQuery(this._parts.query, this._parts.escapeQuerySpace));\n      }\n\n      this.build(!build);\n    }\n\n    return this;\n  };\n  p.normalizeFragment = function(build) {\n    if (!this._parts.fragment) {\n      this._parts.fragment = null;\n      this.build(!build);\n    }\n\n    return this;\n  };\n  p.normalizeSearch = p.normalizeQuery;\n  p.normalizeHash = p.normalizeFragment;\n\n  p.iso8859 = function() {\n    // expect unicode input, iso8859 output\n    var e = URI.encode;\n    var d = URI.decode;\n\n    URI.encode = escape;\n    URI.decode = decodeURIComponent;\n    try 
{\n      this.normalize();\n    } finally {\n      URI.encode = e;\n      URI.decode = d;\n    }\n    return this;\n  };\n\n  p.unicode = function() {\n    // expect iso8859 input, unicode output\n    var e = URI.encode;\n    var d = URI.decode;\n\n    URI.encode = strictEncodeURIComponent;\n    URI.decode = unescape;\n    try {\n      this.normalize();\n    } finally {\n      URI.encode = e;\n      URI.decode = d;\n    }\n    return this;\n  };\n\n  p.readable = function() {\n    var uri = this.clone();\n    // removing username, password, because they shouldn't be displayed according to RFC 3986\n    uri.username('').password('').normalize();\n    var t = '';\n    if (uri._parts.protocol) {\n      t += uri._parts.protocol + '://';\n    }\n\n    if (uri._parts.hostname) {\n      if (uri.is('punycode') && punycode) {\n        t += punycode.toUnicode(uri._parts.hostname);\n        if (uri._parts.port) {\n          t += ':' + uri._parts.port;\n        }\n      } else {\n        t += uri.host();\n      }\n    }\n\n    if (uri._parts.hostname && uri._parts.path && uri._parts.path.charAt(0) !== '/') {\n      t += '/';\n    }\n\n    t += uri.path(true);\n    if (uri._parts.query) {\n      var q = '';\n      for (var i = 0, qp = uri._parts.query.split('&'), l = qp.length; i < l; i++) {\n        var kv = (qp[i] || '').split('=');\n        q += '&' + URI.decodeQuery(kv[0], this._parts.escapeQuerySpace)\n          .replace(/&/g, '%26');\n\n        if (kv[1] !== undefined) {\n          q += '=' + URI.decodeQuery(kv[1], this._parts.escapeQuerySpace)\n            .replace(/&/g, '%26');\n        }\n      }\n      t += '?' 
+ q.substring(1);\n    }\n\n    t += URI.decodeQuery(uri.hash(), true);\n    return t;\n  };\n\n  // resolving relative and absolute URLs\n  p.absoluteTo = function(base) {\n    var resolved = this.clone();\n    var properties = ['protocol', 'username', 'password', 'hostname', 'port'];\n    var basedir, i, p;\n\n    if (this._parts.urn) {\n      throw new Error('URNs do not have any generally defined hierarchical components');\n    }\n\n    if (!(base instanceof URI)) {\n      base = new URI(base);\n    }\n\n    if (!resolved._parts.protocol) {\n      resolved._parts.protocol = base._parts.protocol;\n    }\n\n    if (this._parts.hostname) {\n      return resolved;\n    }\n\n    for (i = 0; (p = properties[i]); i++) {\n      resolved._parts[p] = base._parts[p];\n    }\n\n    if (!resolved._parts.path) {\n      resolved._parts.path = base._parts.path;\n      if (!resolved._parts.query) {\n        resolved._parts.query = base._parts.query;\n      }\n    } else {\n      if (resolved._parts.path.substring(-2) === '..') {\n        resolved._parts.path += '/';\n      }\n\n      if (resolved.path().charAt(0) !== '/') {\n        basedir = base.directory();\n        basedir = basedir ? basedir : base.path().indexOf('/') === 0 ? '/' : '';\n        resolved._parts.path = (basedir ? 
(basedir + '/') : '') + resolved._parts.path;\n        resolved.normalizePath();\n      }\n    }\n\n    resolved.build();\n    return resolved;\n  };\n  p.relativeTo = function(base) {\n    var relative = this.clone().normalize();\n    var relativeParts, baseParts, common, relativePath, basePath;\n\n    if (relative._parts.urn) {\n      throw new Error('URNs do not have any generally defined hierarchical components');\n    }\n\n    base = new URI(base).normalize();\n    relativeParts = relative._parts;\n    baseParts = base._parts;\n    relativePath = relative.path();\n    basePath = base.path();\n\n    if (relativePath.charAt(0) !== '/') {\n      throw new Error('URI is already relative');\n    }\n\n    if (basePath.charAt(0) !== '/') {\n      throw new Error('Cannot calculate a URI relative to another relative URI');\n    }\n\n    if (relativeParts.protocol === baseParts.protocol) {\n      relativeParts.protocol = null;\n    }\n\n    if (relativeParts.username !== baseParts.username || relativeParts.password !== baseParts.password) {\n      return relative.build();\n    }\n\n    if (relativeParts.protocol !== null || relativeParts.username !== null || relativeParts.password !== null) {\n      return relative.build();\n    }\n\n    if (relativeParts.hostname === baseParts.hostname && relativeParts.port === baseParts.port) {\n      relativeParts.hostname = null;\n      relativeParts.port = null;\n    } else {\n      return relative.build();\n    }\n\n    if (relativePath === basePath) {\n      relativeParts.path = '';\n      return relative.build();\n    }\n\n    // determine common sub path\n    common = URI.commonPath(relativePath, basePath);\n\n    // If the paths have nothing in common, return a relative URL with the absolute path.\n    if (!common) {\n      return relative.build();\n    }\n\n    var parents = baseParts.path\n      .substring(common.length)\n      .replace(/[^\\/]*$/, '')\n      .replace(/.*?\\//g, '../');\n\n    relativeParts.path = (parents + 
relativeParts.path.substring(common.length)) || './';\n\n    return relative.build();\n  };\n\n  // comparing URIs\n  p.equals = function(uri) {\n    var one = this.clone();\n    var two = new URI(uri);\n    var one_map = {};\n    var two_map = {};\n    var checked = {};\n    var one_query, two_query, key;\n\n    one.normalize();\n    two.normalize();\n\n    // exact match\n    if (one.toString() === two.toString()) {\n      return true;\n    }\n\n    // extract query string\n    one_query = one.query();\n    two_query = two.query();\n    one.query('');\n    two.query('');\n\n    // definitely not equal if not even non-query parts match\n    if (one.toString() !== two.toString()) {\n      return false;\n    }\n\n    // query parameters have the same length, even if they're permuted\n    if (one_query.length !== two_query.length) {\n      return false;\n    }\n\n    one_map = URI.parseQuery(one_query, this._parts.escapeQuerySpace);\n    two_map = URI.parseQuery(two_query, this._parts.escapeQuerySpace);\n\n    for (key in one_map) {\n      if (hasOwn.call(one_map, key)) {\n        if (!isArray(one_map[key])) {\n          if (one_map[key] !== two_map[key]) {\n            return false;\n          }\n        } else if (!arraysEqual(one_map[key], two_map[key])) {\n          return false;\n        }\n\n        checked[key] = true;\n      }\n    }\n\n    for (key in two_map) {\n      if (hasOwn.call(two_map, key)) {\n        if (!checked[key]) {\n          // two contains a parameter not present in one\n          return false;\n        }\n      }\n    }\n\n    return true;\n  };\n\n  // state\n  p.duplicateQueryParameters = function(v) {\n    this._parts.duplicateQueryParameters = !!v;\n    return this;\n  };\n\n  p.escapeQuerySpace = function(v) {\n    this._parts.escapeQuerySpace = !!v;\n    return this;\n  };\n\n  return URI;\n}));\n"
  },
  {
    "path": "luigi/static/visualiser/lib/mustache.js",
    "content": "/*!\n * mustache.js - Logic-less {{mustache}} templates with JavaScript\n * http://github.com/janl/mustache.js\n */\n\n/*global define: false*/\n\n(function (root, factory) {\n  if (typeof exports === \"object\" && exports) {\n    factory(exports); // CommonJS\n  } else {\n    var mustache = {};\n    factory(mustache);\n    if (typeof define === \"function\" && define.amd) {\n      define(mustache); // AMD\n    } else {\n      root.Mustache = mustache; // <script>\n    }\n  }\n}(this, function (mustache) {\n\n  var whiteRe = /\\s*/;\n  var spaceRe = /\\s+/;\n  var nonSpaceRe = /\\S/;\n  var eqRe = /\\s*=/;\n  var curlyRe = /\\s*\\}/;\n  var tagRe = /#|\\^|\\/|>|\\{|&|=|!/;\n\n  // Workaround for https://issues.apache.org/jira/browse/COUCHDB-577\n  // See https://github.com/janl/mustache.js/issues/189\n  var RegExp_test = RegExp.prototype.test;\n  function testRegExp(re, string) {\n    return RegExp_test.call(re, string);\n  }\n\n  function isWhitespace(string) {\n    return !testRegExp(nonSpaceRe, string);\n  }\n\n  var Object_toString = Object.prototype.toString;\n  var isArray = Array.isArray || function (obj) {\n    return Object_toString.call(obj) === '[object Array]';\n  };\n\n  function escapeRegExp(string) {\n    return string.replace(/[\\-\\[\\]{}()*+?.,\\\\\\^$|#\\s]/g, \"\\\\$&\");\n  }\n\n  var entityMap = {\n    \"&\": \"&amp;\",\n    \"<\": \"&lt;\",\n    \">\": \"&gt;\",\n    '\"': '&quot;',\n    \"'\": '&#39;',\n    \"/\": '&#x2F;'\n  };\n\n  function escapeHtml(string) {\n    return String(string).replace(/[&<>\"'\\/]/g, function (s) {\n      return entityMap[s];\n    });\n  }\n\n  function Scanner(string) {\n    this.string = string;\n    this.tail = string;\n    this.pos = 0;\n  }\n\n  /**\n   * Returns `true` if the tail is empty (end of string).\n   */\n  Scanner.prototype.eos = function () {\n    return this.tail === \"\";\n  };\n\n  /**\n   * Tries to match the given regular expression at the current position.\n   * Returns the 
matched text if it can match, the empty string otherwise.\n   */\n  Scanner.prototype.scan = function (re) {\n    var match = this.tail.match(re);\n\n    if (match && match.index === 0) {\n      this.tail = this.tail.substring(match[0].length);\n      this.pos += match[0].length;\n      return match[0];\n    }\n\n    return \"\";\n  };\n\n  /**\n   * Skips all text until the given regular expression can be matched. Returns\n   * the skipped string, which is the entire tail if no match can be made.\n   */\n  Scanner.prototype.scanUntil = function (re) {\n    var match, pos = this.tail.search(re);\n\n    switch (pos) {\n    case -1:\n      match = this.tail;\n      this.pos += this.tail.length;\n      this.tail = \"\";\n      break;\n    case 0:\n      match = \"\";\n      break;\n    default:\n      match = this.tail.substring(0, pos);\n      this.tail = this.tail.substring(pos);\n      this.pos += pos;\n    }\n\n    return match;\n  };\n\n  function Context(view, parent) {\n    this.view = view || {};\n    this.parent = parent;\n    this._cache = {};\n  }\n\n  Context.make = function (view) {\n    return (view instanceof Context) ? 
view : new Context(view);\n  };\n\n  Context.prototype.push = function (view) {\n    return new Context(view, this);\n  };\n\n  Context.prototype.lookup = function (name) {\n    var value = this._cache[name];\n\n    if (!value) {\n      if (name == '.') {\n        value = this.view;\n      } else {\n        var context = this;\n\n        while (context) {\n          if (name.indexOf('.') > 0) {\n            value = context.view;\n            var names = name.split('.'), i = 0;\n            while (value && i < names.length) {\n              value = value[names[i++]];\n            }\n          } else {\n            value = context.view[name];\n          }\n\n          if (value != null) break;\n\n          context = context.parent;\n        }\n      }\n\n      this._cache[name] = value;\n    }\n\n    if (typeof value === 'function') value = value.call(this.view);\n\n    return value;\n  };\n\n  function Writer() {\n    this.clearCache();\n  }\n\n  Writer.prototype.clearCache = function () {\n    this._cache = {};\n    this._partialCache = {};\n  };\n\n  Writer.prototype.compile = function (template, tags) {\n    var fn = this._cache[template];\n\n    if (!fn) {\n      var tokens = mustache.parse(template, tags);\n      fn = this._cache[template] = this.compileTokens(tokens, template);\n    }\n\n    return fn;\n  };\n\n  Writer.prototype.compilePartial = function (name, template, tags) {\n    var fn = this.compile(template, tags);\n    this._partialCache[name] = fn;\n    return fn;\n  };\n\n  Writer.prototype.getPartial = function (name) {\n    if (!(name in this._partialCache) && this._loadPartial) {\n      this.compilePartial(name, this._loadPartial(name));\n    }\n\n    return this._partialCache[name];\n  };\n\n  Writer.prototype.compileTokens = function (tokens, template) {\n    var self = this;\n    return function (view, partials) {\n      if (partials) {\n        if (typeof partials === 'function') {\n          self._loadPartial = partials;\n        } else {\n  
        for (var name in partials) {\n            self.compilePartial(name, partials[name]);\n          }\n        }\n      }\n\n      return renderTokens(tokens, self, Context.make(view), template);\n    };\n  };\n\n  Writer.prototype.render = function (template, view, partials) {\n    return this.compile(template)(view, partials);\n  };\n\n  /**\n   * Low-level function that renders the given `tokens` using the given `writer`\n   * and `context`. The `template` string is only needed for templates that use\n   * higher-order sections to extract the portion of the original template that\n   * was contained in that section.\n   */\n  function renderTokens(tokens, writer, context, template) {\n    var buffer = '';\n\n    var token, tokenValue, value;\n    for (var i = 0, len = tokens.length; i < len; ++i) {\n      token = tokens[i];\n      tokenValue = token[1];\n\n      switch (token[0]) {\n      case '#':\n        value = context.lookup(tokenValue);\n\n        if (typeof value === 'object') {\n          if (isArray(value)) {\n            for (var j = 0, jlen = value.length; j < jlen; ++j) {\n              buffer += renderTokens(token[4], writer, context.push(value[j]), template);\n            }\n          } else if (value) {\n            buffer += renderTokens(token[4], writer, context.push(value), template);\n          }\n        } else if (typeof value === 'function') {\n          var text = template == null ? null : template.slice(token[3], token[5]);\n          value = value.call(context.view, text, function (template) {\n            return writer.render(template, context);\n          });\n          if (value != null) buffer += value;\n        } else if (value) {\n          buffer += renderTokens(token[4], writer, context, template);\n        }\n\n        break;\n      case '^':\n        value = context.lookup(tokenValue);\n\n        // Use JavaScript's definition of falsy. 
Include empty arrays.\n        // See https://github.com/janl/mustache.js/issues/186\n        if (!value || (isArray(value) && value.length === 0)) {\n          buffer += renderTokens(token[4], writer, context, template);\n        }\n\n        break;\n      case '>':\n        value = writer.getPartial(tokenValue);\n        if (typeof value === 'function') buffer += value(context);\n        break;\n      case '&':\n        value = context.lookup(tokenValue);\n        if (value != null) buffer += value;\n        break;\n      case 'name':\n        value = context.lookup(tokenValue);\n        if (value != null) buffer += mustache.escape(value);\n        break;\n      case 'text':\n        buffer += tokenValue;\n        break;\n      }\n    }\n\n    return buffer;\n  }\n\n  /**\n   * Forms the given array of `tokens` into a nested tree structure where\n   * tokens that represent a section have two additional items: 1) an array of\n   * all tokens that appear in that section and 2) the index in the original\n   * template that represents the end of that section.\n   */\n  function nestTokens(tokens) {\n    var tree = [];\n    var collector = tree;\n    var sections = [];\n\n    var token;\n    for (var i = 0, len = tokens.length; i < len; ++i) {\n      token = tokens[i];\n      switch (token[0]) {\n      case '#':\n      case '^':\n        sections.push(token);\n        collector.push(token);\n        collector = token[4] = [];\n        break;\n      case '/':\n        var section = sections.pop();\n        section[5] = token[2];\n        collector = sections.length > 0 ? 
sections[sections.length - 1][4] : tree;\n        break;\n      default:\n        collector.push(token);\n      }\n    }\n\n    return tree;\n  }\n\n  /**\n   * Combines the values of consecutive text tokens in the given `tokens` array\n   * to a single token.\n   */\n  function squashTokens(tokens) {\n    var squashedTokens = [];\n\n    var token, lastToken;\n    for (var i = 0, len = tokens.length; i < len; ++i) {\n      token = tokens[i];\n      if (token) {\n        if (token[0] === 'text' && lastToken && lastToken[0] === 'text') {\n          lastToken[1] += token[1];\n          lastToken[3] = token[3];\n        } else {\n          lastToken = token;\n          squashedTokens.push(token);\n        }\n      }\n    }\n\n    return squashedTokens;\n  }\n\n  function escapeTags(tags) {\n    return [\n      new RegExp(escapeRegExp(tags[0]) + \"\\\\s*\"),\n      new RegExp(\"\\\\s*\" + escapeRegExp(tags[1]))\n    ];\n  }\n\n  /**\n   * Breaks up the given `template` string into a tree of token objects. If\n   * `tags` is given here it must be an array with two string values: the\n   * opening and closing tags used in the template (e.g. [\"<%\", \"%>\"]). Of\n   * course, the default is to use mustaches (i.e. 
Mustache.tags).\n   */\n  function parseTemplate(template, tags) {\n    template = template || '';\n    tags = tags || mustache.tags;\n\n    if (typeof tags === 'string') tags = tags.split(spaceRe);\n    if (tags.length !== 2) throw new Error('Invalid tags: ' + tags.join(', '));\n\n    var tagRes = escapeTags(tags);\n    var scanner = new Scanner(template);\n\n    var sections = [];     // Stack to hold section tokens\n    var tokens = [];       // Buffer to hold the tokens\n    var spaces = [];       // Indices of whitespace tokens on the current line\n    var hasTag = false;    // Is there a {{tag}} on the current line?\n    var nonSpace = false;  // Is there a non-space char on the current line?\n\n    // Strips all whitespace tokens array for the current line\n    // if there was a {{#tag}} on it and otherwise only space.\n    function stripSpace() {\n      if (hasTag && !nonSpace) {\n        while (spaces.length) {\n          delete tokens[spaces.pop()];\n        }\n      } else {\n        spaces = [];\n      }\n\n      hasTag = false;\n      nonSpace = false;\n    }\n\n    var start, type, value, chr, token;\n    while (!scanner.eos()) {\n      start = scanner.pos;\n\n      // Match any text between tags.\n      value = scanner.scanUntil(tagRes[0]);\n      if (value) {\n        for (var i = 0, len = value.length; i < len; ++i) {\n          chr = value.charAt(i);\n\n          if (isWhitespace(chr)) {\n            spaces.push(tokens.length);\n          } else {\n            nonSpace = true;\n          }\n\n          tokens.push(['text', chr, start, start + 1]);\n          start += 1;\n\n          // Check for whitespace on the current line.\n          if (chr == '\\n') stripSpace();\n        }\n      }\n\n      // Match the opening tag.\n      if (!scanner.scan(tagRes[0])) break;\n      hasTag = true;\n\n      // Get the tag type.\n      type = scanner.scan(tagRe) || 'name';\n      scanner.scan(whiteRe);\n\n      // Get the tag value.\n      if (type === '=') 
{\n        value = scanner.scanUntil(eqRe);\n        scanner.scan(eqRe);\n        scanner.scanUntil(tagRes[1]);\n      } else if (type === '{') {\n        value = scanner.scanUntil(new RegExp('\\\\s*' + escapeRegExp('}' + tags[1])));\n        scanner.scan(curlyRe);\n        scanner.scanUntil(tagRes[1]);\n        type = '&';\n      } else {\n        value = scanner.scanUntil(tagRes[1]);\n      }\n\n      // Match the closing tag.\n      if (!scanner.scan(tagRes[1])) throw new Error('Unclosed tag at ' + scanner.pos);\n\n      token = [type, value, start, scanner.pos];\n      tokens.push(token);\n\n      if (type === '#' || type === '^') {\n        sections.push(token);\n      } else if (type === '/') {\n        // Check section nesting.\n        if (sections.length === 0) throw new Error('Unopened section \"' + value + '\" at ' + start);\n        var openSection = sections.pop();\n        if (openSection[1] !== value) throw new Error('Unclosed section \"' + openSection[1] + '\" at ' + start);\n      } else if (type === 'name' || type === '{' || type === '&') {\n        nonSpace = true;\n      } else if (type === '=') {\n        // Set the tags for the next time around.\n        tags = value.split(spaceRe);\n        if (tags.length !== 2) throw new Error('Invalid tags at ' + start + ': ' + tags.join(', '));\n        tagRes = escapeTags(tags);\n      }\n    }\n\n    // Make sure there are no open sections when we're done.\n    var openSection = sections.pop();\n    if (openSection) throw new Error('Unclosed section \"' + openSection[1] + '\" at ' + scanner.pos);\n\n    tokens = squashTokens(tokens);\n\n    return nestTokens(tokens);\n  }\n\n  mustache.name = \"mustache.js\";\n  mustache.version = \"0.7.2\";\n  mustache.tags = [\"{{\", \"}}\"];\n\n  mustache.Scanner = Scanner;\n  mustache.Context = Context;\n  mustache.Writer = Writer;\n\n  mustache.parse = parseTemplate;\n\n  // Export the escaping function so that the user may override it.\n  // See 
https://github.com/janl/mustache.js/issues/244\n  mustache.escape = escapeHtml;\n\n  // All Mustache.* functions use this writer.\n  var defaultWriter = new Writer();\n\n  /**\n   * Clears all cached templates and partials in the default writer.\n   */\n  mustache.clearCache = function () {\n    return defaultWriter.clearCache();\n  };\n\n  /**\n   * Compiles the given `template` to a reusable function using the default\n   * writer.\n   */\n  mustache.compile = function (template, tags) {\n    return defaultWriter.compile(template, tags);\n  };\n\n  /**\n   * Compiles the partial with the given `name` and `template` to a reusable\n   * function using the default writer.\n   */\n  mustache.compilePartial = function (name, template, tags) {\n    return defaultWriter.compilePartial(name, template, tags);\n  };\n\n  /**\n   * Compiles the given array of tokens (the output of a parse) to a reusable\n   * function using the default writer.\n   */\n  mustache.compileTokens = function (tokens, template) {\n    return defaultWriter.compileTokens(tokens, template);\n  };\n\n  /**\n   * Renders the `template` with the given `view` and `partials` using the\n   * default writer.\n   */\n  mustache.render = function (template, view, partials) {\n    return defaultWriter.render(template, view, partials);\n  };\n\n  // This is here for backwards compatibility with 0.4.x.\n  mustache.to_html = function (template, view, partials, send) {\n    var result = mustache.render(template, view, partials);\n\n    if (typeof send === \"function\") {\n      send(result);\n    } else {\n      return result;\n    }\n  };\n\n}));"
  },
  {
    "path": "luigi/static/visualiser/mockdata/dep_graph",
    "content": "{\n    \"response\": {\n        \"FactorTask(product=12)\": {\n            \"deps\": [\n                \"FactorTask(product=2)\",\n                \"FactorTask(product=6)\"\n            ],\n            \"start_time\": 1369300552.60482,\n            \"status\": \"PENDING\",\n            \"workers\": [\n                \"worker-641996460\"\n            ]\n        },\n        \"FactorTask(product=2)\": {\n            \"deps\": [],\n            \"start_time\": 1369300552.60741,\n            \"status\": \"FAILED\",\n            \"workers\": [\n                \"worker-641996460\"\n            ]\n        },\n        \"FactorTask(product=3)\": {\n            \"deps\": [],\n            \"start_time\": 1369300552.61154,\n            \"status\": \"PENDING\",\n            \"workers\": [\n                \"worker-641996460\"\n            ]\n        },\n        \"FactorTask(product=6)\": {\n            \"deps\": [\n                \"FactorTask(product=2)\",\n                \"FactorTask(product=3)\"\n            ],\n            \"start_time\": 1369300552.609396,\n            \"status\": \"DONE\",\n            \"workers\": [\n                \"worker-641996460\"\n            ]\n        }\n    }\n}\n"
  },
  {
    "path": "luigi/static/visualiser/mockdata/fetch_error",
    "content": "{\n    \"response\": {\n        \"taskId\": \"FactorTask(product=2)\",\n        \"error\": \"Runtime error:\\nTraceback (most recent call last):\\n  File '/Users/davw/projects/luigi-core/luigi/worker.py', line 164, in _run_task\\n    task.run()\\n  File '/Users/davw/projects/luigi-core/test/scheduler_visualisation_test.py', line 62, in run\\n    raise Exception('Error Message')\\nException: Error Message\\n\"\n    }\n}"
  },
  {
    "path": "luigi/static/visualiser/mockdata/task_list",
    "content": "{\n    \"response\": {\n        \"FactorTask(product=12)\": {\n            \"deps\": [\n                \"FactorTask(product=2)\",\n                \"FactorTask(product=6)\"\n            ],\n            \"start_time\": 1369300552.60482,\n            \"status\": \"PENDING\",\n            \"workers\": [\n                \"worker-641996460\"\n            ]\n        },\n        \"FactorTask(product=2)\": {\n            \"deps\": [],\n            \"start_time\": 1369300552.60741,\n            \"status\": \"FAILED\",\n            \"workers\": [\n                \"worker-641996460\"\n            ]\n        },\n        \"FactorTask(product=3)\": {\n            \"deps\": [],\n            \"start_time\": 1369300552.61154,\n            \"status\": \"PENDING\",\n            \"workers\": [\n                \"worker-641996460\"\n            ]\n        },\n        \"FactorTask(product=6)\": {\n            \"deps\": [\n                \"FactorTask(product=2)\",\n                \"FactorTask(product=3)\"\n            ],\n            \"start_time\": 1369300552.609396,\n            \"status\": \"DONE\",\n            \"workers\": [\n                \"worker-641996460\"\n            ]\n        }\n    }\n}\n"
  },
  {
    "path": "luigi/static/visualiser/test.html",
    "content": "<!DOCTYPE html>\n<html>\n    <head>\n        <title>Luigi Visualiser Tests</title>\n        <link rel=\"stylesheet\" href=\"http://code.jquery.com/qunit/qunit-1.11.0.css\">\n    </head>\n    </body>\n        <div id=\"qunit\"></div>\n        <div id=\"qunit-fixture\"></div>\n        <script src=\"http://code.jquery.com/qunit/qunit-1.11.0.js\"></script>\n        <script src=\"lib/jquery-1.10.0.min.js\"></script>\n        <script src=\"js/graph.js\"></script>\n        <script src=\"js/test/graph_test.js\"></script>\n    </body>\n</html>"
  },
  {
    "path": "luigi/target.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nThe abstract :py:class:`Target` class.\nIt is a central concept of Luigi and represents the state of the workflow.\n\"\"\"\n\nimport abc\nimport io\nimport logging\nimport os\nimport random\nimport tempfile\nimport warnings\nfrom contextlib import contextmanager\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass Target(metaclass=abc.ABCMeta):\n    \"\"\"\n    A Target is a resource generated by a :py:class:`~luigi.task.Task`.\n\n    For example, a Target might correspond to a file in HDFS or data in a database. 
The Target\n    interface defines one method that must be overridden: :py:meth:`exists`, which signifies if the\n    Target has been created or not.\n\n    Typically, a :py:class:`~luigi.task.Task` will define one or more Targets as output, and the Task\n    is considered complete if and only if each of its output Targets exist.\n    \"\"\"\n\n    @abc.abstractmethod\n    def exists(self):\n        \"\"\"\n        Returns ``True`` if the :py:class:`Target` exists and ``False`` otherwise.\n        \"\"\"\n        pass\n\n\nclass FileSystemException(Exception):\n    \"\"\"\n    Base class for generic file system exceptions.\n    \"\"\"\n\n    pass\n\n\nclass FileAlreadyExists(FileSystemException):\n    \"\"\"\n    Raised when a file system operation can't be performed because\n    a directory exists but is required to not exist.\n    \"\"\"\n\n    pass\n\n\nclass MissingParentDirectory(FileSystemException):\n    \"\"\"\n    Raised when a parent directory doesn't exist.\n    (Imagine mkdir without -p)\n    \"\"\"\n\n    pass\n\n\nclass NotADirectory(FileSystemException):\n    \"\"\"\n    Raised when a file system operation can't be performed because\n    an expected directory is actually a file.\n    \"\"\"\n\n    pass\n\n\nclass FileSystem(metaclass=abc.ABCMeta):\n    \"\"\"\n    FileSystem abstraction used in conjunction with :py:class:`FileSystemTarget`.\n\n    Typically, a FileSystem is associated with instances of a :py:class:`FileSystemTarget`. 
The\n    instances of the :py:class:`FileSystemTarget` will delegate methods such as\n    :py:meth:`FileSystemTarget.exists` and :py:meth:`FileSystemTarget.remove` to the FileSystem.\n\n    Methods of FileSystem raise :py:class:`FileSystemException` if there is a problem completing the\n    operation.\n    \"\"\"\n\n    @abc.abstractmethod\n    def exists(self, path):\n        \"\"\"\n        Return ``True`` if file or directory at ``path`` exist, ``False`` otherwise\n\n        :param str path: a path within the FileSystem to check for existence.\n        \"\"\"\n        pass\n\n    @abc.abstractmethod\n    def remove(self, path, recursive=True, skip_trash=True):\n        \"\"\"Remove file or directory at location ``path``\n\n        :param str path: a path within the FileSystem to remove.\n        :param bool recursive: if the path is a directory, recursively remove the directory and all\n                               of its descendants. Defaults to ``True``.\n        \"\"\"\n        pass\n\n    def mkdir(self, path, parents=True, raise_if_exists=False):\n        \"\"\"\n        Create directory at location ``path``\n\n        Creates the directory at ``path`` and implicitly create parent\n        directories if they do not already exist.\n\n        :param str path: a path within the FileSystem to create as a directory.\n        :param bool parents: Create parent directories when necessary. When\n                             parents=False and the parent directory doesn't\n                             exist, raise luigi.target.MissingParentDirectory\n        :param bool raise_if_exists: raise luigi.target.FileAlreadyExists if\n                                     the folder already exists.\n        \"\"\"\n        raise NotImplementedError(\"mkdir() not implemented on {0}\".format(self.__class__.__name__))\n\n    def isdir(self, path):\n        \"\"\"\n        Return ``True`` if the location at ``path`` is a directory. 
If not, return ``False``.\n\n        :param str path: a path within the FileSystem to check as a directory.\n\n        *Note*: This method is optional, not all FileSystem subclasses implement it.\n        \"\"\"\n        raise NotImplementedError(\"isdir() not implemented on {0}\".format(self.__class__.__name__))\n\n    def listdir(self, path):\n        \"\"\"Return a list of files rooted in path.\n\n        This returns an iterable of the files rooted at ``path``. This is intended to be a\n        recursive listing.\n\n        :param str path: a path within the FileSystem to list.\n\n        *Note*: This method is optional, not all FileSystem subclasses implement it.\n        \"\"\"\n        raise NotImplementedError(\"listdir() not implemented on {0}\".format(self.__class__.__name__))\n\n    def move(self, path, dest):\n        \"\"\"\n        Move a file, as one would expect.\n        \"\"\"\n        raise NotImplementedError(\"move() not implemented on {0}\".format(self.__class__.__name__))\n\n    def rename_dont_move(self, path, dest):\n        \"\"\"\n        Potentially rename ``path`` to ``dest``, but don't move it into the\n        ``dest`` folder (if it is a folder).  This relates to :ref:`AtomicWrites`.\n\n        This method has a reasonable but not bullet proof default\n        implementation.  
It will just do ``move()`` if the file doesn't\n        ``exists()`` already.\n        \"\"\"\n        warnings.warn(\"File system {} client doesn't support atomic mv.\".format(self.__class__.__name__))\n        if self.exists(dest):\n            raise FileAlreadyExists()\n        self.move(path, dest)\n\n    def rename(self, *args, **kwargs):\n        \"\"\"\n        Alias for ``move()``\n        \"\"\"\n        self.move(*args, **kwargs)\n\n    def copy(self, path, dest):\n        \"\"\"\n        Copy a file or a directory with contents.\n        Currently, LocalFileSystem and MockFileSystem support only single file\n        copying but S3Client copies either a file or a directory as required.\n        \"\"\"\n        raise NotImplementedError(\"copy() not implemented on {0}\".format(self.__class__.__name__))\n\n\nclass FileSystemTarget(Target):\n    \"\"\"\n    Base class for FileSystem Targets like :class:`~luigi.local_target.LocalTarget` and :class:`~luigi.contrib.hdfs.HdfsTarget`.\n\n    A FileSystemTarget has an associated :py:class:`FileSystem` to which certain operations can be\n    delegated. By default, :py:meth:`exists` and :py:meth:`remove` are delegated to the\n    :py:class:`FileSystem`, which is determined by the :py:attr:`fs` property.\n\n    Methods of FileSystemTarget raise :py:class:`FileSystemException` if there is a problem\n    completing the operation.\n\n    Usage:\n        .. 
code-block:: python\n\n            target = FileSystemTarget('~/some_file.txt')\n            target = FileSystemTarget(pathlib.Path('~') / 'some_file.txt')\n            target.exists()  # False\n    \"\"\"\n\n    def __init__(self, path):\n        \"\"\"\n        Initializes a FileSystemTarget instance.\n\n        :param path: the path associated with this FileSystemTarget.\n        \"\"\"\n        # cast to str to allow path to be objects like pathlib.PosixPath and py._path.local.LocalPath\n        self.path = str(path)\n\n    def __str__(self):\n        return self.path\n\n    @property\n    @abc.abstractmethod\n    def fs(self):\n        \"\"\"\n        The :py:class:`FileSystem` associated with this FileSystemTarget.\n        \"\"\"\n        raise NotImplementedError()\n\n    @abc.abstractmethod\n    def open(self, mode):\n        \"\"\"\n        Open the FileSystem target.\n\n        This method returns a file-like object which can either be read from or written to depending\n        on the specified mode.\n\n        :param str mode: the mode `r` opens the FileSystemTarget in read-only mode, whereas `w` will\n                         open the FileSystemTarget in write mode. Subclasses can implement\n                         additional options. 
Using `b` is not supported; initialize with\n                         `format=Nop` instead.\n        \"\"\"\n        pass\n\n    def exists(self):\n        \"\"\"\n        Returns ``True`` if the path for this FileSystemTarget exists; ``False`` otherwise.\n\n        This method is implemented by using :py:attr:`fs`.\n        \"\"\"\n        path = self.path\n        if \"*\" in path or \"?\" in path or \"[\" in path or \"{\" in path:\n            logger.warning(\"Using wildcards in path %s might lead to processing of an incomplete dataset; override exists() to suppress the warning.\", path)\n        return self.fs.exists(path)\n\n    def remove(self):\n        \"\"\"\n        Remove the resource at the path specified by this FileSystemTarget.\n\n        This method is implemented by using :py:attr:`fs`.\n        \"\"\"\n        self.fs.remove(self.path)\n\n    @contextmanager\n    def temporary_path(self):\n        \"\"\"\n        A context manager that enables a reasonably short, general and\n        magic-less way to solve the :ref:`AtomicWrites`.\n\n         * On *entering*, it will create the parent directories so the\n           temporary_path is writeable right away.\n           This step uses :py:meth:`FileSystem.mkdir`.\n         * On *exiting*, it will move the temporary file if there was no exception thrown.\n           This step uses :py:meth:`FileSystem.rename_dont_move`\n\n        The file system operations will be carried out by calling them on :py:attr:`fs`.\n\n        The typical use case looks like this:\n\n        .. 
code:: python\n\n            class MyTask(luigi.Task):\n                def output(self):\n                    return MyFileSystemTarget(...)\n\n                def run(self):\n                    with self.output().temporary_path() as self.temp_output_path:\n                        run_some_external_command(output_path=self.temp_output_path)\n        \"\"\"\n        num = random.randrange(0, 10_000_000_000)\n        slashless_path = self.path.rstrip(\"/\").rstrip(\"\\\\\")\n        _temp_path = \"{}-luigi-tmp-{:010}{}\".format(slashless_path, num, self._trailing_slash())\n        # TODO: os.path doesn't make sense here as it's os-dependent\n        tmp_dir = os.path.dirname(slashless_path)\n        if tmp_dir:\n            self.fs.mkdir(tmp_dir, parents=True, raise_if_exists=False)\n\n        yield _temp_path\n        # We won't reach here if there was an user exception.\n        self.fs.rename_dont_move(_temp_path, self.path)\n\n    def _touchz(self):\n        with self.open(\"w\"):\n            pass\n\n    def _trailing_slash(self):\n        # I suppose one day schema-like paths, like\n        # file:///path/blah.txt?params=etc can be parsed too\n        return self.path[-1] if self.path[-1] in r\"\\/\" else \"\"\n\n\nclass AtomicLocalFile(io.BufferedWriter):\n    \"\"\"Abstract class to create a Target that creates\n    a temporary file in the local filesystem before\n    moving it to its final destination.\n\n    This class is just for the writing part of the Target. 
See\n    :class:`luigi.local_target.LocalTarget` for example\n    \"\"\"\n\n    def __init__(self, path):\n        self.__tmp_path = self.generate_tmp_path(path)\n        self.path = path\n        super(AtomicLocalFile, self).__init__(io.FileIO(self.__tmp_path, \"w\"))\n\n    def close(self):\n        super(AtomicLocalFile, self).close()\n        self.move_to_final_destination()\n\n    def generate_tmp_path(self, path):\n        return os.path.join(tempfile.gettempdir(), \"luigi-s3-tmp-%09d\" % random.randrange(0, 10_000_000_000))\n\n    def move_to_final_destination(self):\n        raise NotImplementedError()\n\n    def __del__(self):\n        if os.path.exists(self.tmp_path):\n            os.remove(self.tmp_path)\n\n    @property\n    def tmp_path(self):\n        return self.__tmp_path\n\n    def __exit__(self, exc_type, exc, traceback):\n        \"Close/commit the file if there are no exception\"\n        if exc_type:\n            return\n        return super(AtomicLocalFile, self).__exit__(exc_type, exc, traceback)\n"
  },
  {
    "path": "luigi/task.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nThe abstract :py:class:`Task` class.\nIt is a central concept of Luigi and represents the state of the workflow.\nSee :doc:`/tasks` for an overview.\n\"\"\"\n\nimport copy\nimport functools\nimport hashlib\nimport json\nimport logging\nimport re\nimport traceback\nimport warnings\nfrom collections import OrderedDict, deque\nfrom contextlib import contextmanager\nfrom typing import Any, Dict, Optional\n\nfrom typing_extensions import dataclass_transform\n\nimport luigi\nfrom luigi import configuration, parameter\nfrom luigi.parameter import ParameterVisibility, UnconsumedParameterWarning\nfrom luigi.task_register import Register\n\nParameter = parameter.Parameter\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nTASK_ID_INCLUDE_PARAMS = 3\nTASK_ID_TRUNCATE_PARAMS = 16\nTASK_ID_TRUNCATE_HASH = 10\nTASK_ID_INVALID_CHAR_REGEX = re.compile(r\"[^A-Za-z0-9_]\")\n_SAME_AS_PYTHON_MODULE = \"_same_as_python_module\"\n\n\ndef namespace(namespace=None, scope=\"\"):\n    \"\"\"\n    Call to set namespace of tasks declared after the call.\n\n    It is often desired to call this function with the keyword argument\n    ``scope=__name__``.\n\n    The ``scope`` keyword makes it so that this call is only effective for task\n    classes with a matching [*]_ ``__module__``. 
The default value for\n    ``scope`` is the empty string, which means all classes. Multiple calls with\n    the same scope simply replace each other.\n\n    The namespace of a :py:class:`Task` can also be changed by specifying the property\n    ``task_namespace``.\n\n    .. code-block:: python\n\n        class Task2(luigi.Task):\n            task_namespace = 'namespace2'\n\n    This explicit setting takes priority over whatever is set in the\n    ``namespace()`` method, and it's also inherited through normal python\n    inheritance.\n\n    There's no equivalent way to set the ``task_family``.\n\n    *New since Luigi 2.6.0:* ``scope`` keyword argument.\n\n    .. [*] When there are multiple levels of matching module scopes like\n           ``a.b`` vs ``a.b.c``, the more specific one (``a.b.c``) wins.\n    .. seealso:: The new and better scaling :py:func:`auto_namespace`\n    \"\"\"\n    Register._default_namespace_dict[scope] = namespace or \"\"\n\n\ndef auto_namespace(scope=\"\"):\n    \"\"\"\n    Same as :py:func:`namespace`, but instead of a constant namespace, it will\n    be set to the ``__module__`` of the task class. This is desirable for these\n    reasons:\n\n     * Two tasks with the same name will not have conflicting task families\n     * It's more pythonic, as modules are Python's recommended way to\n       do namespacing.\n     * It's traceable. When you see the full name of a task, you can immediately\n       identify where it is defined.\n\n    We recommend calling this function from your package's outermost\n    ``__init__.py`` file. The file contents could look like this:\n\n    .. code-block:: python\n\n        import luigi\n\n        luigi.auto_namespace(scope=__name__)\n\n    To reset an ``auto_namespace()`` call, you can use\n    ``namespace(scope='my_scope')``.  
But this will not be\n    needed (and is also discouraged) if you use the ``scope`` kwarg.\n\n    *New since Luigi 2.6.0.*\n    \"\"\"\n    namespace(namespace=_SAME_AS_PYTHON_MODULE, scope=scope)\n\n\ndef task_id_str(task_family, params):\n    \"\"\"\n    Returns a canonical string used to identify a particular task\n\n    :param task_family: The task family (class name) of the task\n    :param params: a dict mapping parameter names to their serialized values\n    :return: A unique, shortened identifier corresponding to the family and params\n    \"\"\"\n    # task_id is a concatenation of task family, the first values of the first 3 parameters\n    # sorted by parameter name and a md5hash of the family/parameters as a canonicalised json.\n    param_str = json.dumps(params, separators=(\",\", \":\"), sort_keys=True)\n    param_hash = hashlib.new(\"md5\", param_str.encode(\"utf-8\"), usedforsecurity=False).hexdigest()\n\n    param_summary = \"_\".join(p[:TASK_ID_TRUNCATE_PARAMS] for p in (params[p] for p in sorted(params)[:TASK_ID_INCLUDE_PARAMS]))\n    param_summary = TASK_ID_INVALID_CHAR_REGEX.sub(\"_\", param_summary)\n\n    return \"{}_{}_{}\".format(task_family, param_summary, param_hash[:TASK_ID_TRUNCATE_HASH])\n\n\nclass BulkCompleteNotImplementedError(NotImplementedError):\n    \"\"\"This is here to trick pylint.\n\n    pylint thinks anything raising NotImplementedError needs to be implemented\n    in any subclass. bulk_complete isn't like that. 
This tricks pylint into\n    thinking that the default implementation is a valid implementation and not\n    an abstract method.\"\"\"\n\n    pass\n\n\n@dataclass_transform(eq_default=False, order_default=False, kw_only_default=True, field_specifiers=(Parameter,))\nclass Task(metaclass=Register):\n    \"\"\"\n    This is the base class of all Luigi Tasks, the base unit of work in Luigi.\n\n    A Luigi Task describes a unit of work.\n\n    The key methods of a Task, which must be implemented in a subclass are:\n\n    * :py:meth:`run` - the computation done by this task.\n    * :py:meth:`requires` - the list of Tasks that this Task depends on.\n    * :py:meth:`output` - the output :py:class:`Target` that this Task creates.\n\n    Each :py:class:`~luigi.Parameter` of the Task should be declared as members:\n\n    .. code:: python\n\n        class MyTask(luigi.Task):\n            count = luigi.IntParameter()\n            second_param = luigi.Parameter()\n\n    In addition to any declared properties and methods, there are a few\n    non-declared properties, which are created by the :py:class:`Register`\n    metaclass:\n\n    \"\"\"\n\n    _event_callbacks: Dict[Any, Any] = {}\n\n    #: Priority of the task: the scheduler should favor available\n    #: tasks with higher priority values first.\n    #: See :ref:`Task.priority`\n    priority = 0\n    disabled = False\n\n    #: Resources used by the task. Should be formatted like {\"scp\": 1} to indicate that the\n    #: task requires 1 unit of the scp resource.\n    resources: Dict[str, Any] = {}\n\n    #: Number of seconds after which to time out the run function.\n    #: No timeout if set to 0.\n    #: Defaults to 0 or worker-timeout value in config\n    worker_timeout: Optional[int] = None\n\n    #: Maximum number of tasks to run together as a batch. 
Infinite by default\n    max_batch_size = float(\"inf\")\n\n    @property\n    def batchable(self):\n        \"\"\"\n        True if this instance can be run as part of a batch. By default, True\n        if it has any batched parameters\n        \"\"\"\n        return bool(self.batch_param_names())\n\n    @property\n    def retry_count(self):\n        \"\"\"\n        Override this positive integer to have different ``retry_count`` at task level.\n        Check :ref:`scheduler-config`\n        \"\"\"\n        return None\n\n    @property\n    def disable_hard_timeout(self):\n        \"\"\"\n        Override this positive integer to have different ``disable_hard_timeout`` at task level.\n        Check :ref:`scheduler-config`\n        \"\"\"\n        return None\n\n    @property\n    def disable_window(self):\n        \"\"\"\n        Override this positive integer to have different ``disable_window`` at task level.\n        Check :ref:`scheduler-config`\n        \"\"\"\n        return None\n\n    @property\n    def disable_window_seconds(self):\n        warnings.warn(\"Use of `disable_window_seconds` has been deprecated, use `disable_window` instead\", DeprecationWarning)\n        return self.disable_window\n\n    @property\n    def owner_email(self):\n        \"\"\"\n        Override this to send out additional error emails to task owner, in addition to the one\n        defined in the global configuration. This should return a string or a list of strings. e.g.\n        'test@example.com' or ['test1@example.com', 'test2@example.com']\n        \"\"\"\n        return None\n\n    def _owner_list(self):\n        \"\"\"\n        Turns the owner_email property into a list. 
This should not be overridden.\n        \"\"\"\n        owner_email = self.owner_email\n        if owner_email is None:\n            return []\n        elif isinstance(owner_email, str):\n            return owner_email.split(\",\")\n        else:\n            return owner_email\n\n    @property\n    def use_cmdline_section(self):\n        \"\"\"Property used by core config such as `--workers` etc.\n        These will be exposed without the class as prefix.\"\"\"\n        return True\n\n    @classmethod\n    def event_handler(cls, event):\n        \"\"\"\n        Decorator for adding event handlers.\n        \"\"\"\n\n        def wrapped(callback):\n            cls._event_callbacks.setdefault(cls, {}).setdefault(event, set()).add(callback)\n            return callback\n\n        return wrapped\n\n    @classmethod\n    def remove_event_handler(cls, event, callback):\n        \"\"\"\n        Function to remove the event handler registered previously by the cls.event_handler decorator.\n        \"\"\"\n        cls._event_callbacks[cls][event].remove(callback)\n\n    def trigger_event(self, event, *args, **kwargs):\n        \"\"\"\n        Trigger that calls all of the specified events associated with this class.\n        \"\"\"\n        for event_class, event_callbacks in self._event_callbacks.items():\n            if not isinstance(self, event_class):\n                continue\n            for callback in event_callbacks.get(event, []):\n                try:\n                    # callbacks are protected\n                    callback(*args, **kwargs)\n                except KeyboardInterrupt:\n                    return\n                except BaseException:\n                    logger.exception(\"Error in event callback for %r\", event)\n\n    @property\n    def accepts_messages(self):\n        \"\"\"\n        For configuring which scheduler messages can be received. When falsy, this task does not\n        accept any message. 
When True, all messages are accepted.\n        \"\"\"\n        return False\n\n    @property\n    def task_module(self):\n        \"\"\"Returns what Python module to import to get access to this class.\"\"\"\n        # TODO(erikbern): we should think about a language-agnostic mechanism\n        return self.__class__.__module__\n\n    _visible_in_registry = True  # TODO: Consider using in luigi.util as well\n\n    __not_user_specified = \"__not_user_specified\"\n\n    # This is here just to help pylint, the Register metaclass will always set\n    # this value anyway.\n    _namespace_at_class_time = None\n\n    task_namespace = __not_user_specified\n    \"\"\"\n    This value can be overridden to set the namespace that will be used.\n    (See :ref:`Task.namespaces_famlies_and_ids`)\n    If it's not specified and you try to read this value anyway, it will return\n    garbage. Please use :py:meth:`get_task_namespace` to read the namespace.\n\n    Note that setting this value with ``@property`` will not work, because this\n    is a class level value.\n    \"\"\"\n\n    @classmethod\n    def get_task_namespace(cls):\n        \"\"\"\n        The task namespace for the given class.\n\n        Note: You normally don't want to override this.\n        \"\"\"\n        if cls.task_namespace != cls.__not_user_specified:\n            return cls.task_namespace\n        elif cls._namespace_at_class_time == _SAME_AS_PYTHON_MODULE:\n            return cls.__module__\n        return cls._namespace_at_class_time\n\n    @property\n    def task_family(self):\n        \"\"\"\n        DEPRECATED since after 2.4.0. 
See :py:meth:`get_task_family` instead.\n        Hopefully there will be less meta magic in Luigi.\n\n        Convenience method since a property on the metaclass isn't directly\n        accessible through the class instances.\n        \"\"\"\n        return self.__class__.task_family\n\n    @classmethod\n    def get_task_family(cls):\n        \"\"\"\n        The task family for the given class.\n\n        If ``task_namespace`` is not set, then it's simply the name of the\n        class.  Otherwise, ``<task_namespace>.`` is prefixed to the class name.\n\n        Note: You normally don't want to override this.\n        \"\"\"\n        if not cls.get_task_namespace():\n            return cls.__name__\n        else:\n            return \"{}.{}\".format(cls.get_task_namespace(), cls.__name__)\n\n    @classmethod\n    def get_params(cls):\n        \"\"\"\n        Returns all of the Parameters for this Task.\n        \"\"\"\n        # We want to do this here and not at class instantiation, or else there is no room to extend classes dynamically\n        params = []\n        for param_name in dir(cls):\n            param_obj = getattr(cls, param_name)\n            if not isinstance(param_obj, Parameter):\n                continue\n\n            params.append((param_name, param_obj))\n\n        # The order the parameters are created matters. 
See Parameter class\n        params.sort(key=lambda t: t[1]._counter)\n        return params\n\n    @classmethod\n    def batch_param_names(cls):\n        return [name for name, p in cls.get_params() if p._is_batchable()]\n\n    @classmethod\n    def get_param_names(cls, include_significant=False):\n        return [name for name, p in cls.get_params() if include_significant or p.significant]\n\n    @classmethod\n    def get_param_values(cls, params, args, kwargs):\n        \"\"\"\n        Get the values of the parameters from the args and kwargs.\n\n        :param params: list of (param_name, Parameter).\n        :param args: positional arguments\n        :param kwargs: keyword arguments.\n        :returns: list of `(name, value)` tuples, one for each parameter.\n        \"\"\"\n        result = {}\n\n        params_dict = dict(params)\n\n        task_family = cls.get_task_family()\n\n        # In case any exceptions are thrown, create a helpful description of how the Task was invoked\n        # TODO: should we detect non-reprable arguments? 
These will lead to mysterious errors\n        exc_desc = \"%s[args=%s, kwargs=%s]\" % (task_family, args, kwargs)\n\n        # Fill in the positional arguments\n        positional_params = [(n, p) for n, p in params if p.positional]\n        for i, arg in enumerate(args):\n            if i >= len(positional_params):\n                raise parameter.UnknownParameterException(\"%s: takes at most %d parameters (%d given)\" % (exc_desc, len(positional_params), len(args)))\n            param_name, param_obj = positional_params[i]\n            result[param_name] = param_obj.normalize(arg)\n\n        # Then the keyword arguments\n        for param_name, arg in kwargs.items():\n            if param_name in result:\n                raise parameter.DuplicateParameterException(\"%s: parameter %s was already set as a positional parameter\" % (exc_desc, param_name))\n            if param_name not in params_dict:\n                raise parameter.UnknownParameterException(\"%s: unknown parameter %s\" % (exc_desc, param_name))\n            result[param_name] = params_dict[param_name].normalize(arg)\n\n        # Then use the defaults for anything not filled in\n        for param_name, param_obj in params:\n            if param_name not in result:\n                try:\n                    has_task_value = param_obj.has_task_value(task_family, param_name)\n                except Exception as exc:\n                    raise ValueError(\"%s: Error when parsing the default value of '%s'\" % (exc_desc, param_name)) from exc\n                if not has_task_value:\n                    raise parameter.MissingParameterException(\"%s: requires the '%s' parameter to be set\" % (exc_desc, param_name))\n                result[param_name] = param_obj.task_value(task_family, param_name)\n\n        def list_to_tuple(x):\n            \"\"\"Make tuples out of lists and sets to allow hashing\"\"\"\n            if isinstance(x, list) or isinstance(x, set):\n                return tuple(x)\n           
 else:\n                return x\n\n        # Check for unconsumed parameters\n        conf = configuration.get_config()\n        if not hasattr(cls, \"_unconsumed_params\"):\n            cls._unconsumed_params = set()\n        if task_family in conf.sections():\n            ignore_unconsumed = getattr(cls, \"ignore_unconsumed\", set())\n            for key, value in conf[task_family].items():\n                key = key.replace(\"-\", \"_\")\n                composite_key = f\"{task_family}_{key}\"\n                if key not in result and key not in ignore_unconsumed and composite_key not in cls._unconsumed_params:\n                    warnings.warn(\n                        f\"The configuration contains the parameter '{key}' with value '{value}' that is not consumed by the task '{task_family}'.\",\n                        UnconsumedParameterWarning,\n                    )\n                    cls._unconsumed_params.add(composite_key)\n\n        # Sort it by the correct order and make a list\n        return [(param_name, list_to_tuple(result[param_name])) for param_name, param_obj in params]\n\n    def __init__(self, *args, **kwargs):\n        params = self.get_params()\n        param_values = self.get_param_values(params, args, kwargs)\n\n        # Set all values on class instance\n        for key, value in param_values:\n            setattr(self, key, value)\n\n        # Register kwargs as an attribute on the class. 
Might be useful\n        self.param_kwargs = dict(param_values)\n\n        self._warn_on_wrong_param_types()\n        self.task_id = task_id_str(self.get_task_family(), self.to_str_params(only_significant=True, only_public=True))\n        self.__hash = hash(self.task_id)\n\n        self.set_tracking_url = None\n        self.set_status_message = None\n        self.set_progress_percentage = None\n\n    @property\n    def param_args(self):\n        warnings.warn(\"Use of param_args has been deprecated.\", DeprecationWarning)\n        return tuple(self.param_kwargs[k] for k, v in self.get_params())\n\n    def initialized(self):\n        \"\"\"\n        Returns ``True`` if the Task is initialized and ``False`` otherwise.\n        \"\"\"\n        return hasattr(self, \"task_id\")\n\n    def _warn_on_wrong_param_types(self):\n        params = dict(self.get_params())\n        for param_name, param_value in self.param_kwargs.items():\n            params[param_name]._warn_on_wrong_param_type(param_name, param_value)\n\n    @classmethod\n    def from_str_params(cls, params_str):\n        \"\"\"\n        Creates an instance from a str->str hash.\n\n        :param params_str: dict of param name -> value as string.\n        \"\"\"\n        kwargs = {}\n        for param_name, param in cls.get_params():\n            if param_name in params_str:\n                param_str = params_str[param_name]\n                if isinstance(param_str, list):\n                    kwargs[param_name] = param._parse_list(param_str)\n                else:\n                    kwargs[param_name] = param.parse(param_str)\n\n        return cls(**kwargs)\n\n    def to_str_params(self, only_significant=False, only_public=False):\n        \"\"\"\n        Convert all parameters to a str->str hash.\n        \"\"\"\n        params_str = {}\n        params = dict(self.get_params())\n        for param_name, param_value in self.param_kwargs.items():\n            if (\n                ((not only_significant) or 
params[param_name].significant)\n                and ((not only_public) or params[param_name].visibility == ParameterVisibility.PUBLIC)\n                and params[param_name].visibility != ParameterVisibility.PRIVATE\n            ):\n                params_str[param_name] = params[param_name].serialize(param_value)\n\n        return params_str\n\n    def _get_param_visibilities(self):\n        param_visibilities = {}\n        params = dict(self.get_params())\n        for param_name, param_value in self.param_kwargs.items():\n            if params[param_name].visibility != ParameterVisibility.PRIVATE:\n                param_visibilities[param_name] = params[param_name].visibility.serialize()\n\n        return param_visibilities\n\n    def clone(self, cls=None, **kwargs):\n        \"\"\"\n        Creates a new instance from an existing instance where some of the args have changed.\n\n        There's at least two scenarios where this is useful (see test/clone_test.py):\n\n        * remove a lot of boiler plate when you have recursive dependencies and lots of args\n        * there's task inheritance and some logic is on the base class\n\n        :param cls:\n        :param kwargs:\n        :return:\n        \"\"\"\n        if cls is None:\n            cls = self.__class__\n\n        new_k = {}\n        for param_name, param_class in cls.get_params():\n            if param_name in kwargs:\n                new_k[param_name] = kwargs[param_name]\n            elif hasattr(self, param_name):\n                new_k[param_name] = getattr(self, param_name)\n\n        return cls(**new_k)\n\n    def __hash__(self):\n        return self.__hash\n\n    def __repr__(self):\n        \"\"\"\n        Build a task representation like `MyTask(param1=1.5, param2='5')`\n        \"\"\"\n        params = self.get_params()\n        param_values = self.get_param_values(params, [], self.param_kwargs)\n\n        # Build up task id\n        repr_parts = []\n        param_objs = dict(params)\n    
    for param_name, param_value in param_values:\n            if param_objs[param_name].significant:\n                repr_parts.append(\"%s=%s\" % (param_name, param_objs[param_name].serialize(param_value)))\n\n        task_str = \"{}({})\".format(self.get_task_family(), \", \".join(repr_parts))\n\n        return task_str\n\n    def __eq__(self, other):\n        return self.__class__ == other.__class__ and self.task_id == other.task_id\n\n    def complete(self):\n        \"\"\"\n        If the task has any outputs, return ``True`` if all outputs exist.\n        Otherwise, return ``False``.\n\n        However, you may freely override this method with custom logic.\n        \"\"\"\n        outputs = flatten(self.output())\n        if len(outputs) == 0:\n            warnings.warn(\"Task %r without outputs has no custom complete() method\" % self, stacklevel=2)\n            return False\n\n        return all(map(lambda output: output.exists(), outputs))\n\n    @classmethod\n    def bulk_complete(cls, parameter_tuples):\n        \"\"\"\n        Returns those of parameter_tuples for which this Task is complete.\n\n        Override (with an efficient implementation) for efficient scheduling\n        with range tools. Keep the logic consistent with that of complete().\n        \"\"\"\n        raise BulkCompleteNotImplementedError()\n\n    def output(self):\n        \"\"\"\n        The output that this Task produces.\n\n        The output of the Task determines if the Task needs to be run--the task\n        is considered finished iff the outputs all exist. Subclasses should\n        override this method to return a single :py:class:`Target` or a list of\n        :py:class:`Target` instances.\n\n        Implementation note\n          If running multiple workers, the output must be a resource that is accessible\n          by all workers, such as a DFS or database. 
Otherwise, workers might compute\n          the same output since they don't see the work done by other workers.\n\n        See :ref:`Task.output`\n        \"\"\"\n        return []  # default impl\n\n    def requires(self):\n        \"\"\"\n        The Tasks that this Task depends on.\n\n        A Task will only run if all of the Tasks that it requires are completed.\n        If your Task does not require any other Tasks, then you don't need to\n        override this method. Otherwise, a subclass can override this method\n        to return a single Task, a list of Task instances, or a dict whose\n        values are Task instances.\n\n        See :ref:`Task.requires`\n        \"\"\"\n        return []  # default impl\n\n    def _requires(self):\n        \"\"\"\n        Override in \"template\" tasks which themselves are supposed to be\n        subclassed and thus have their requires() overridden (name preserved to\n        provide consistent end-user experience), yet need to introduce\n        (non-input) dependencies.\n\n        Must return an iterable which among others contains the _requires() of\n        the superclass.\n        \"\"\"\n        return flatten(self.requires())  # base impl\n\n    def process_resources(self):\n        \"\"\"\n        Override in \"template\" tasks which provide common resource functionality\n        but allow subclasses to specify additional resources while preserving\n        the name for consistent end-user experience.\n        \"\"\"\n        return self.resources  # default impl\n\n    def input(self):\n        \"\"\"\n        Returns the outputs of the Tasks returned by :py:meth:`requires`\n\n        See :ref:`Task.input`\n\n        :return: a list of :py:class:`Target` objects which are specified as\n                 outputs of all required Tasks.\n        \"\"\"\n        return getpaths(self.requires())\n\n    def deps(self):\n        \"\"\"\n        Internal method used by the scheduler.\n\n        Returns the flattened 
list of requires.\n        \"\"\"\n        # used by scheduler\n        return flatten(self._requires())\n\n    def run(self):\n        \"\"\"\n        The task run method, to be overridden in a subclass.\n\n        See :ref:`Task.run`\n        \"\"\"\n        pass  # default impl\n\n    def on_failure(self, exception):\n        \"\"\"\n        Override for custom error handling.\n\n        This method gets called if an exception is raised in :py:meth:`run`.\n        The returned value of this method is json encoded and sent to the scheduler\n        as the `expl` argument. Its string representation will be used as the\n        body of the error email sent out if any.\n\n        Default behavior is to return a string representation of the stack trace.\n        \"\"\"\n\n        traceback_string = traceback.format_exc()\n        return \"Runtime error:\\n%s\" % traceback_string\n\n    def on_success(self):\n        \"\"\"\n        Override for doing custom completion handling for a larger class of tasks\n\n        This method gets called when :py:meth:`run` completes without raising any exceptions.\n\n        The returned value is json encoded and sent to the scheduler as the `expl` argument.\n\n        Default behavior is to send a None value\"\"\"\n        pass\n\n    @contextmanager\n    def no_unpicklable_properties(self):\n        \"\"\"\n        Remove unpicklable properties before dump task and resume them after.\n\n        This method could be called in subtask's dump method, to ensure unpicklable\n        properties won't break dump.\n\n        This method is a context-manager which can be called as below:\n\n        .. 
code-block:: python\n\n            class DummyTask(luigi.Task):\n\n                def _dump(self):\n                    with self.no_unpicklable_properties():\n                        pickle.dumps(self)\n\n        \"\"\"\n        unpicklable_properties = tuple(luigi.worker.TaskProcess.forward_reporter_attributes.values())\n        reserved_properties = {}\n        for property_name in unpicklable_properties:\n            if hasattr(self, property_name):\n                reserved_properties[property_name] = getattr(self, property_name)\n                setattr(self, property_name, \"placeholder_during_pickling\")\n\n        yield\n\n        for property_name, value in reserved_properties.items():\n            setattr(self, property_name, value)\n\n\nclass MixinNaiveBulkComplete:\n    \"\"\"\n    Enables a Task to be efficiently scheduled with e.g. range tools, by providing a bulk_complete implementation which checks completeness in a loop.\n\n    Applicable to tasks whose completeness checking is cheap.\n\n    This doesn't exploit output location specific APIs for speed advantage, nevertheless removes redundant scheduler roundtrips.\n    \"\"\"\n\n    @classmethod\n    def bulk_complete(cls, parameter_tuples):\n        generated_tuples = []\n        for parameter_tuple in parameter_tuples:\n            if isinstance(parameter_tuple, (list, tuple)):\n                if cls(*parameter_tuple).complete():\n                    generated_tuples.append(parameter_tuple)\n            elif isinstance(parameter_tuple, dict):\n                if cls(**parameter_tuple).complete():\n                    generated_tuples.append(parameter_tuple)\n            else:\n                if cls(parameter_tuple).complete():\n                    generated_tuples.append(parameter_tuple)\n        return generated_tuples\n\n\nclass DynamicRequirements(object):\n    \"\"\"\n    Wraps dynamic requirements yielded in tasks' run methods to control how completeness checks of\n    (e.g.) 
large chunks of tasks are performed. Besides the wrapped *requirements*, instances of\n    this class can be passed an optional function *custom_complete* that might implement an\n    optimized check for completeness. If set, the function will be called with a single argument,\n    *complete_fn*, which should be used to perform the per-task check. Example:\n\n    .. code-block:: python\n\n        class SomeTaskWithDynamicRequirements(luigi.Task):\n            ...\n\n            def run(self):\n                large_chunk_of_tasks = [OtherTask(i=i) for i in range(10000)]\n\n                def custom_complete(complete_fn):\n                    # example: assume OtherTask always writes into the same directory, so just check\n                    #          if the first task is complete, and compare basenames for the rest\n                    if not complete_fn(large_chunk_of_tasks[0]):\n                        return False\n                    paths = [task.output().path for task in large_chunk_of_tasks]\n                    basenames = os.listdir(os.path.dirname(paths[0]))  # a single fs call\n                    return all(os.path.basename(path) in basenames for path in paths)\n\n                yield DynamicRequirements(large_chunk_of_tasks, custom_complete)\n\n    .. py:attribute:: requirements\n\n        The original, wrapped requirements.\n\n    .. 
py:attribute:: custom_complete\n\n       The optional, custom function performing the completeness check of the wrapped requirements.\n    \"\"\"\n\n    def __init__(self, requirements, custom_complete=None):\n        super().__init__()\n\n        # store attributes\n        self.requirements = requirements\n        self.custom_complete = custom_complete\n\n        # cached flat requirements and paths\n        self._flat_requirements = None\n        self._paths = None\n\n    @property\n    def flat_requirements(self):\n        if self._flat_requirements is None:\n            self._flat_requirements = flatten(self.requirements)\n        return self._flat_requirements\n\n    @property\n    def paths(self):\n        if self._paths is None:\n            self._paths = getpaths(self.requirements)\n        return self._paths\n\n    def complete(self, complete_fn=None):\n        # default completeness check\n        if complete_fn is None:\n\n            def complete_fn(task):\n                return task.complete()\n\n        # use the custom complete function when set\n        if self.custom_complete:\n            return self.custom_complete(complete_fn)\n\n        # default implementation\n        return all(complete_fn(t) for t in self.flat_requirements)\n\n\nclass ExternalTask(Task):\n    \"\"\"\n    Subclass for references to external dependencies.\n\n    An ExternalTask does not have a `run` implementation, which signifies to\n    the framework that this Task's :py:meth:`output` is generated outside of\n    Luigi.\n    \"\"\"\n\n    run = None\n\n\ndef externalize(taskclass_or_taskobject):\n    \"\"\"\n    Returns an externalized version of a Task. You may pass either an\n    instantiated task object or a task class. Some examples:\n\n    .. 
code-block:: python\n\n        class RequiringTask(luigi.Task):\n            def requires(self):\n                task_object = self.clone(MyTask)\n                return externalize(task_object)\n\n            ...\n\n    Here's mostly equivalent code, but ``externalize`` is applied to a task\n    class instead.\n\n    .. code-block:: python\n\n        @luigi.util.requires(externalize(MyTask))\n        class RequiringTask(luigi.Task):\n            pass\n            ...\n\n    Of course, it may also be used directly on classes and objects (for example\n    for reexporting or other usage).\n\n    .. code-block:: python\n\n        MyTask = externalize(MyTask)\n        my_task_2 = externalize(MyTask2(param='foo'))\n\n    If you however want a task class to be external from the beginning, you're\n    better off inheriting :py:class:`ExternalTask` rather than :py:class:`Task`.\n\n    This function tries to be side-effect free by creating a copy of the class\n    or the object passed in and then modify that object. In particular this\n    code shouldn't do anything.\n\n    .. 
code-block:: python\n\n        externalize(MyTask)  # BAD: This does nothing (as after luigi 2.4.0)\n    \"\"\"\n    copied_value = copy.copy(taskclass_or_taskobject)\n    if copied_value is taskclass_or_taskobject:\n        # Assume it's a class\n        clazz = taskclass_or_taskobject\n\n        @_task_wraps(clazz)\n        class _CopyOfClass(clazz):\n            # How to copy a class: http://stackoverflow.com/a/9541120/621449\n            _visible_in_registry = False\n\n        _CopyOfClass.run = None\n        return _CopyOfClass\n    else:\n        # We assume it's an object\n        copied_value.run = None\n        return copied_value\n\n\nclass WrapperTask(Task):\n    \"\"\"\n    Use for tasks that only wrap other tasks and that by definition are done if all their requirements exist.\n    \"\"\"\n\n    def complete(self):\n        return all(r.complete() for r in flatten(self.requires()))\n\n\nclass Config(Task):\n    \"\"\"\n    Class for configuration. See :ref:`ConfigClasses`.\n    \"\"\"\n\n    # TODO: let's refactor Task & Config so that it inherits from a common\n    # ParamContainer base class\n    pass\n\n\ndef getpaths(struct):\n    \"\"\"\n    Maps all Tasks in a structured data object to their .output().\n    \"\"\"\n    if isinstance(struct, Task):\n        return struct.output()\n    elif isinstance(struct, dict):\n        return struct.__class__((k, getpaths(v)) for k, v in struct.items())\n    elif isinstance(struct, (list, tuple)):\n        return struct.__class__(getpaths(r) for r in struct)\n    else:\n        # Remaining case: assume struct is iterable...\n        try:\n            return [getpaths(r) for r in struct]\n        except TypeError:\n            raise Exception(\"Cannot map %s to Task/dict/list\" % str(struct))\n\n\ndef flatten(struct):\n    \"\"\"\n    Creates a flat list of all items in structured output (dicts, lists, items):\n\n    .. 
code-block:: python\n\n        >>> sorted(flatten({'a': 'foo', 'b': 'bar'}))\n        ['bar', 'foo']\n        >>> sorted(flatten(['foo', ['bar', 'troll']]))\n        ['bar', 'foo', 'troll']\n        >>> flatten('foo')\n        ['foo']\n        >>> flatten(42)\n        [42]\n    \"\"\"\n    if struct is None:\n        return []\n    flat = []\n    if isinstance(struct, dict):\n        for _, result in struct.items():\n            flat += flatten(result)\n        return flat\n    if isinstance(struct, str):\n        return [struct]\n\n    try:\n        # if iterable\n        iterator = iter(struct)\n    except TypeError:\n        return [struct]\n\n    for result in iterator:\n        flat += flatten(result)\n    return flat\n\n\ndef flatten_output(task):\n    \"\"\"\n    Lists all output targets by recursively walking output-less (wrapper) tasks.\n    \"\"\"\n\n    output_tasks = OrderedDict()  # OrderedDict used as ordered set\n    tasks_to_process = deque([task])\n    while tasks_to_process:\n        current_task = tasks_to_process.popleft()\n        if flatten(current_task.output()):\n            if current_task not in output_tasks:\n                output_tasks[current_task] = None\n        else:\n            tasks_to_process.extend(flatten(current_task.requires()))\n\n    return flatten(task.output() for task in output_tasks)\n\n\ndef _task_wraps(task_class):\n    # In order to make the behavior of a wrapper class nicer, we set the name of the\n    # new class to the wrapped class, and copy over the docstring and module as well.\n    # This makes it possible to pickle the wrapped class etc.\n    # Btw, this is a slight abuse of functools.wraps. It's meant to be used only for\n    # functions, but it works for classes too, if you pass updated=[]\n    assigned = functools.WRAPPER_ASSIGNMENTS + (\"_namespace_at_class_time\",)\n    return functools.wraps(task_class, assigned=assigned, updated=[])\n"
  },
  {
    "path": "luigi/task_history.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nAbstract class for task history.\nCurrently the only subclass is :py:class:`~luigi.db_task_history.DbTaskHistory`.\n\"\"\"\n\nimport abc\nimport logging\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass StoredTask:\n    \"\"\"\n    Interface for methods on TaskHistory\n    \"\"\"\n\n    # TODO : do we need this task as distinct from luigi.scheduler.Task?\n    #        this only records host and record_id in addition to task parameters.\n\n    def __init__(self, task, status, host=None):\n        self._task = task\n        self.status = status\n        self.record_id = None\n        self.host = host\n\n    @property\n    def task_family(self):\n        return self._task.family\n\n    @property\n    def parameters(self):\n        return self._task.params\n\n\nclass TaskHistory(metaclass=abc.ABCMeta):\n    \"\"\"\n    Abstract Base Class for updating the run history of a task\n    \"\"\"\n\n    @abc.abstractmethod\n    def task_scheduled(self, task):\n        pass\n\n    @abc.abstractmethod\n    def task_finished(self, task, successful):\n        pass\n\n    @abc.abstractmethod\n    def task_started(self, task, worker_host):\n        pass\n\n    # TODO(erikbern): should web method (find_latest_runs etc) be abstract?\n\n\nclass NopHistory(TaskHistory):\n    def task_scheduled(self, task):\n        pass\n\n    def 
task_finished(self, task, successful):\n        pass\n\n    def task_started(self, task, worker_host):\n        pass\n"
  },
  {
    "path": "luigi/task_register.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nDefine the centralized register of all :class:`~luigi.task.Task` classes.\n\"\"\"\n\nimport abc\nimport logging\nfrom typing import Any, Dict, List\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass TaskClassException(Exception):\n    pass\n\n\nclass TaskClassNotFoundException(TaskClassException):\n    pass\n\n\nclass TaskClassAmbigiousException(TaskClassException):\n    pass\n\n\nclass Register(abc.ABCMeta):\n    \"\"\"\n    The Metaclass of :py:class:`Task`.\n\n    Acts as a global registry of Tasks with the following properties:\n\n    1. Cache instances of objects so that eg. ``X(1, 2, 3)`` always returns the\n       same object.\n    2. Keep track of all subclasses of :py:class:`Task` and expose them.\n    \"\"\"\n\n    __instance_cache: Dict[str, Any] = {}\n    _default_namespace_dict: Dict[str, Any] = {}\n    _reg: List[Any] = []\n    AMBIGUOUS_CLASS = object()  # Placeholder denoting an error\n    \"\"\"If this value is returned by :py:meth:`_get_reg` then there is an\n    ambiguous task name (two :py:class:`Task` have the same name). 
This denotes\n    an error.\"\"\"\n\n    def __new__(metacls, classname, bases, classdict, **kwargs):\n        \"\"\"\n        Custom class creation for namespacing.\n\n        Also register all subclasses.\n\n        When the set or inherited namespace evaluates to ``None``, set the task namespace to\n        whatever the currently declared namespace is.\n        \"\"\"\n        cls = super(Register, metacls).__new__(metacls, classname, bases, classdict, **kwargs)\n        cls._namespace_at_class_time = metacls._get_namespace(cls.__module__)\n        metacls._reg.append(cls)\n        return cls\n\n    def __call__(cls, *args, **kwargs):\n        \"\"\"\n        Custom class instantiation utilizing instance cache.\n\n        If a Task has already been instantiated with the same parameters,\n        the previous instance is returned to reduce number of object instances.\n        \"\"\"\n\n        def instantiate():\n            return super(Register, cls).__call__(*args, **kwargs)\n\n        h = cls.__instance_cache\n\n        if h is None:  # disabled\n            return instantiate()\n\n        params = cls.get_params()\n        param_values = cls.get_param_values(params, args, kwargs)\n\n        k = (cls, tuple(param_values))\n\n        try:\n            hash(k)\n        except TypeError:\n            logger.debug(\"Not all parameter values are hashable so instance isn't coming from the cache\")\n            return instantiate()  # unhashable types in parameters\n\n        if k not in h:\n            h[k] = instantiate()\n\n        return h[k]\n\n    @classmethod\n    def clear_instance_cache(cls):\n        \"\"\"\n        Clear/Reset the instance cache.\n        \"\"\"\n        cls.__instance_cache = {}\n\n    @classmethod\n    def disable_instance_cache(cls):\n        \"\"\"\n        Disables the instance cache.\n        \"\"\"\n        cls.__instance_cache = None\n\n    @property\n    def task_family(cls):\n        \"\"\"\n        Internal note: This function 
will be deleted soon.\n        \"\"\"\n        task_namespace = cls.get_task_namespace()\n        if not task_namespace:\n            return cls.__name__\n        else:\n            return f\"{task_namespace}.{cls.__name__}\"\n\n    @classmethod\n    def _get_reg(cls):\n        \"\"\"Return all of the registered classes.\n\n        :return:  a ``dict`` of task_family -> class\n        \"\"\"\n        # We have to do this on-demand in case task names have changed later\n        reg = dict()\n        for task_cls in cls._reg:\n            if not task_cls._visible_in_registry:\n                continue\n\n            name = task_cls.get_task_family()\n            if name in reg and (\n                reg[name] == Register.AMBIGUOUS_CLASS  # Check so issubclass doesn't crash\n                or not issubclass(task_cls, reg[name])\n            ):\n                # Registering two different classes - this means we can't instantiate them by name\n                # The only exception is if one class is a subclass of the other. 
In that case, we\n                # instantiate the most-derived class (this fixes some issues with decorator wrappers).\n                reg[name] = Register.AMBIGUOUS_CLASS\n            else:\n                reg[name] = task_cls\n\n        return reg\n\n    @classmethod\n    def _set_reg(cls, reg):\n        \"\"\"The writing complement of _get_reg\"\"\"\n        cls._reg = [task_cls for task_cls in reg.values() if task_cls is not cls.AMBIGUOUS_CLASS]\n\n    @classmethod\n    def task_names(cls):\n        \"\"\"\n        List of task names as strings\n        \"\"\"\n        return sorted(cls._get_reg().keys())\n\n    @classmethod\n    def tasks_str(cls):\n        \"\"\"\n        Human-readable register contents dump.\n        \"\"\"\n        return \",\".join(cls.task_names())\n\n    @classmethod\n    def get_task_cls(cls, name):\n        \"\"\"\n        Returns an unambiguous class or raises an exception.\n        \"\"\"\n        task_cls = cls._get_reg().get(name)\n        if not task_cls:\n            raise TaskClassNotFoundException(cls._missing_task_msg(name))\n\n        if task_cls == cls.AMBIGUOUS_CLASS:\n            raise TaskClassAmbigiousException(\"Task %r is ambiguous\" % name)\n        return task_cls\n\n    @classmethod\n    def get_all_params(cls):\n        \"\"\"\n        Compiles and returns all parameters for all :py:class:`Task`.\n\n        :return: a generator of tuples (TODO: we should make this more elegant)\n        \"\"\"\n        for task_name, task_cls in cls._get_reg().items():\n            if task_cls == cls.AMBIGUOUS_CLASS:\n                continue\n            for param_name, param_obj in task_cls.get_params():\n                yield task_name, (not task_cls.use_cmdline_section), param_name, param_obj\n\n    @staticmethod\n    def _editdistance(a, b):\n        \"\"\"Simple unweighted Levenshtein distance\"\"\"\n        r0 = range(0, len(b) + 1)\n        r1 = [0] * (len(b) + 1)\n\n        for i in range(0, len(a)):\n            
r1[0] = i + 1\n\n            for j in range(0, len(b)):\n                c = 0 if a[i] is b[j] else 1\n                r1[j + 1] = min(r1[j] + 1, r0[j + 1] + 1, r0[j] + c)\n\n            r0 = r1[:]\n\n        return r1[len(b)]\n\n    @classmethod\n    def _missing_task_msg(cls, task_name):\n        weighted_tasks = [(Register._editdistance(task_name, task_name_2), task_name_2) for task_name_2 in cls.task_names()]\n        ordered_tasks = sorted(weighted_tasks, key=lambda pair: pair[0])\n        candidates = [task for (dist, task) in ordered_tasks if dist <= 5 and dist < len(task)]\n        if candidates:\n            return \"No task %s. Did you mean:\\n%s\" % (task_name, \"\\n\".join(candidates))\n        else:\n            return \"No task %s. Candidates are: %s\" % (task_name, cls.tasks_str())\n\n    @classmethod\n    def _get_namespace(mcs, module_name):\n        for parent in mcs._module_parents(module_name):\n            entry = mcs._default_namespace_dict.get(parent)\n            if entry:\n                return entry\n        return \"\"  # Default if nothing specifies\n\n    @staticmethod\n    def _module_parents(module_name):\n        \"\"\"\n        >>> list(Register._module_parents('a.b'))\n        ['a.b', 'a', '']\n        \"\"\"\n        spl = module_name.split(\".\")\n        for i in range(len(spl), 0, -1):\n            yield \".\".join(spl[0:i])\n        if module_name:\n            yield \"\"\n\n\ndef load_task(module, task_name, params_str):\n    \"\"\"\n    Imports task dynamically given a module and a task name.\n    \"\"\"\n    if module is not None:\n        __import__(module)\n    task_cls = Register.get_task_cls(task_name)\n    return task_cls.from_str_params(params_str)\n"
  },
  {
    "path": "luigi/task_status.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nPossible values for a Task's status in the Scheduler\n\"\"\"\n\nPENDING = \"PENDING\"\nFAILED = \"FAILED\"\nDONE = \"DONE\"\nRUNNING = \"RUNNING\"\nBATCH_RUNNING = \"BATCH_RUNNING\"\nSUSPENDED = \"SUSPENDED\"  # Only kept for backward compatibility with old clients\nUNKNOWN = \"UNKNOWN\"\nDISABLED = \"DISABLED\"\n"
  },
  {
    "path": "luigi/templates/history.html",
    "content": "<!--\n@Copyright 2015 Naver Corp.\n@Author Yeseul Park (yeseul.park@navercorp.com)\n-->\n\n<!-- Extend our site layout -->\n{% extends \"layout.html\" %}\n{% block content %}\n    <h1> {{name}} History </h1>\n\t<br>\n\t<div class=\"row\">\n\t<style>\n\t\t#chart svg {\n\t\t\theight: 800px;\n\t\t\twidth: 1200px;\n\t\t\tmargin-top: 80px;\n\t\t\tfont-family: Arial, sans-serif;\n\t\t\tfont-size: 9px;\n\t\t\toverflow: scroll;\n\t\t}\n\t\t\n\t</style>\n\t<div id=\"chart\">\n\t\t<svg></svg>\n\t</div>\n\t{% if statusResults and taskResults %}\n\t<!--h5> {{ statusResults }} and {{ taskResults }}</h5 -->\n\t<script>\n\t\tvar status_results  =  unescape(\"{{ statusResults }}\");\n\t\tvar status_results_str = String(status_results).replace(/&quot;/ig,'\"');\n\t\tvar status_data = JSON.parse(status_results_str);\n\t\tvar running_data = status_data.RUNNING;\n\t\tvar done_data = status_data.DONE;\n\t\tvar failed_data = status_data.FAILED;\n\t\tvar task_results = unescape(\"{{ taskResults }}\");\n\t\tvar task_results_str = String(task_results).replace(/&quot;/ig,'\"');\n\t\tvar task_data = JSON.parse(task_results_str);\n\t\tvar dateSet = new Set();\n\t\tfor (var i = 0; i < running_data.length; i++) {\n\t\t\tvar id = running_data[i].id;\n\t\t\t//millisecond converion of time for x-axis\n\t\t\trunning_data[i].x = running_data[i].x * 1000;\n\t\t\tdateSet.add(running_data[i].x);\n\t\t\tfor (var j = 0; j < task_data.length; j++) {\n\t\t\t\tvar latter_id = parseInt(id) + 1;\n\t\t\t\tif (latter_id == task_data[j].id) {\n\t\t\t\t\t//calculate time spent for running task with millisecond conversion of time for y-axis\n\t\t\t\t\trunning_data[i].y = task_data[j].timestamp * 1000 - running_data[i].x;\n\t\t\t\t\t//get the next process after completing running status\n\t\t\t\t\trunning_data[i].next_process = task_data[j].status;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tvar refined_failed_data = {\"key\": \"FAILED\", \"color\": \"#CC0000\", \"values\":[]};\n\t\tvar refined_done_data = 
{\"key\": \"DONE\", \"color\": \"#669900\", \"values\":[]};\n\t\tvar dateSetL = [];\n\t\tdateSet.forEach(function(d) {\n\t\t\tdateSetL.push(d);\n\t\t});\n\t\tfor (var i = 0; i < running_data.length; i++) {\n\t\t\t//push each running data whose next process is \"Failed\" into refined_failed_data\n\t\t\tif (running_data[i].next_process == \"FAILED\") {\n\t\t\t\tto_be_failed_data = [running_data[i].x, running_data[i].y, running_data[i].id, running_data[i].task_id];\n\t\t\t\trefined_failed_data[\"values\"].push(to_be_failed_data);\n\t\t\t} \n\t\t\t//push each running data whose next process is \"Done\" into refined_done_data\n\t\t\tif (running_data[i].next_process == \"DONE\") {\n\t\t\t\tto_be_done_data = [running_data[i].x, running_data[i].y, running_data[i].id, running_data[i].task_id];\n\t\t\t\trefined_done_data[\"values\"].push(to_be_done_data);\n\t\t\t}\n\t\t}\n\t\tvar merged_data = [];\n\t\tif (refined_failed_data.values.length > 0 && refined_done_data.values.length > 0) merged_data = [refined_failed_data, refined_done_data];\n\t\telse if (refined_failed_data.values.length > 0 && refined_done_data.values.length == 0) merged_data = [refined_failed_data];\n\t\telse if (refined_failed_data.values.length == 0 && refined_done_data.values.length > 0) merged_data = [refined_done_data];\n\t\tnv.addGraph(function() {\n\t\t\tvar\th = nv.utils.windowSize().height/2;\n\t\t\tvar chart = nv.models.multiBarChart()\n\t\t\t.margin({top: 30, right: 100, bottom: 30, left: 100})\n\t\t\t.x(function(d, i) { return i; })\n\t\t\t.y(function(d) { if (d[1] > 0) return d[1]; })\n\t\t\t.height(h);\n\t\t\t\n\t\t\tchart.xAxis\n\t\t\t.axisLabel(\"Datetime\")\n\t\t\t.rotateLabels(-45)\n\t\t\t.showMaxMin(false)\n\t\t\t.ticks(refined_failed_data.values.length+refined_done_data.values.length)\n\t\t\t.tickFormat(function(d, i) {\n\t\t\t\tvar dx = (merged_data[0].values[d] && merged_data[0].values[d][0]) || (merged_data[1].values[d] && merged_data[1].values[d][0]);\n\t\t\t\treturn d3.time.format('%x 
%X')(new Date(dx));\n\t\t\t});\n\t\t\tchart.yAxis\n\t\t\t.axisLabel(\"Processing Time (H:M:S)\")\n\t\t\t.tickFormat(function(d) {\n\t\t\t\tvar dy = new Date();\n\t\t\t\tdy.setHours(0, 0, 0, d);\n\t\t\t\treturn d3.time.format('%X')(dy);\n\t\t\t});\n\t\t\tchart.tooltipContent(function(key, x, y, e, graph) {\n\t\t\t\ttooltip_str = '<center><b>Result status: '+key+'</b></center>'+'<center>processing time: '+y+'</center><center> on '+x+'</center>';\n\t\t\t\treturn tooltip_str;\n\t\t\t});\n\t\n\t\t\td3.select('#chart svg')\n\t\t\t\t.datum(merged_data)\n\t\t\t\t.transition()\n\t\t\t\t.duration(500)\n\t\t\t\t.call(chart);\n\n\t\t\tnv.utils.windowResize(chart.update);\n\n\t\t\treturn chart;\n\t\t});\n\t</script>\n\t{% end %}\n</div>\n{% end %}\n"
  },
  {
    "path": "luigi/templates/layout.html",
    "content": "<!--\n@Copyright 2015 Naver Corp.\n@Author Yeseul Park (yeseul.park@navercorp.com)\n-->\n\n<!DOCTYPE html>\n<html lang=\"en\">\n  <head>\n    <meta http-equiv=\"content-type\" content=\"text/html\" charset=\"utf-8\">\n\t<title> Luigi History Viewer </title>\n    <!-- Derived from example at http://twitter.github.com/bootstrap/examples/sticky-footer.html -->\n    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n    <meta name=\"description\" content=\"view luigi task history\">\n    <meta name=\"author\" content=\"ypark\">\n    <!-- CSS -->\n    <style type=\"text/css\">\n\n      /* Sticky footer styles\n      -------------------------------------------------- */\n\n      html,\n      body {\n        height: 100%;\n        /* The html and body elements cannot have any padding or margin. */\n      }\n\n      /* Wrapper for page content to push down footer */\n      #wrap {\n        min-height: 90%;\n        /*height: auto !important;*/\n        height: 730px;\n        /* Negative indent footer by it's height */\n        margin: 0 auto -60px;\n\t\toverflow: scroll;\n      }\n\n      /* Set the fixed height of the footer here */\n      #push,\n      #footer {\n        height: 60px;\n      }\n      #footer {\n        background-color: #f5f5f5;\n      }\n\n      /* Lastly, apply responsive CSS fixes as necessary */\n      @media (max-width: 767px) {\n        #footer {\n          margin-left: -20px;\n          margin-right: -20px;\n          padding-left: 20px;\n          padding-right: 20px;\n        }\n      }\n\n      /* Custom page CSS\n      -------------------------------------------------- */\n      /* Not required for template or sticky footer method. 
*/\n\n      .container {\n        width: auto;\n        max-width: 1200px;\n        margin-bottom: 20px;\n      }\n      .container .credit {\n        margin: 20px 0;\n      }\n      \n      .container[role=\"main\"] {\n          padding-bottom: 60px;\n      }\n      \n      #footer {\n          position: fixed;\n          bottom: 0;\n          left: 0;\n          right: 0;\n      }\n      #proper-content {\n\t\t  overflow: auto;\n\t  }\n      .lead { margin-top: -17px; margin-bottom: 13px; }\n\n    </style>\n    <link href=\"{{ static_url(\"visualiser/lib/bootstrap3/css/bootstrap.min.css\") }}\" rel=\"stylesheet\">\n    <link href=\"https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.css\" rel=\"stylesheet\">\n    <script src=\"{{ static_url(\"visualiser/lib/jquery-1.10.0.min.js\") }}\"></script>\n    <script src=\"{{ static_url(\"visualiser/lib/bootstrap3/js/bootstrap.min.js\") }}\"></script>\n  </head>\n  <body>\n\n\t<!-- Wrap all page content here -->\n\t<div id=\"wrap\">\n    \t<script src=\"https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js\"></script>\n    \t<script src=\"https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.js\"></script>\n \n\t\t<div class=\"container\">\n\t\t\t<div class=\"wrapper\">\n\t\t\t\t<div class=\"proper-content\">\n        \t\t\t{% block content %}{% end %}\n\t\t\t\t</div><!-- /.proper-content -->\n\t\t\t\t<div class=\"push\"></div>\n\t\t\t</div><!-- /.wrapper -->\n\t  \t</div> <!-- /container -->\n\t</div> <!-- /wrap -->\n  </body>\n</html>\n"
  },
  {
    "path": "luigi/templates/menu.html",
    "content": "<!--\n@Copyright 2015 Naver Corp.\n@Author Yeseul Park (yeseul.park@navercorp.com)\n-->\n\n<!-- Extend our site layout -->\n{% extends \"layout.html\" %}\n\n<!-- Parse a pig tuple being passed through in string format -->\n\n{% block content %}\n\t\n  <div class=\"container\">\n  {% if tasknames %}\n  <h3 style=\"margin-top: 20px;\">[ Task History ]</h3>\n  <ul class=\"nav nav-pills\">\n  {% for item in tasknames %}\n\t  <li class=\"active\">\n \t  \t<a style=\"margin: 3px;\" href=\"/tasklist/{{ item }}\">{{ item }}</a>\n  \t  </li>\n  {% end %}\n  </ul>\n  {% end %}\n  </div>\n\n{% end %}\n"
  },
  {
    "path": "luigi/templates/recent.html",
    "content": "{% extends \"layout.html\" %}\n{% block content %}\n<h2>Luigi Task History</h2>\n<table class=\"table table-striped table-bordered\">\n  <thead>\n    <tr>\n    <th>Name</th>\n    <th>Host</th>\n    <th>Last Action</th>\n    <th>Status</th>\n    <th>Parameters</th>\n  </tr>\n  </thead>\n  <tbody>\n    {% for task in tasks %}\n      <tr>\n        <td><a href=\"/history/by_id/{{task.id}}\">{{task.name}}</a></td>\n        <td>{{task.host}}</td>\n        <td>{{task.events[0].ts}}</td>\n        <td>{{task.events[0].event_name}}</td>\n\t\t<td>{% for (k, param) in task.parameters.items() %}\n\t\t\t\t<table class=\"table-condensed\"><tr><td>{{k}}</td><td>{{param.value}}</td></tr></table>\n    \t\t{% end %}</td>\n      </tr>\n    {% end %}\n  </tbody>\n</table>\n{% end %}\n"
  },
  {
    "path": "luigi/templates/show.html",
    "content": "{% extends \"layout.html\" %}\n{% block content %}\n<div class=\"row\">\n  <div class=\"span6\">\n    <h3>Info</h3>\n    <table class=\"table table-striped table-bordered\">\n      <tbody>\n        <tr>\n          <td>Task Id</td>\n          <td>{{task.id}}</td>\n        </tr>\n        <tr>\n          <td>Task Name</td>\n          <td>{{task.name}}</td>\n        </tr>\n        <tr>\n          <td>Host</td>\n          <td>{{task.host}}</td>\n        </tr>\n        <tr>\n          <td>More</td>\n          <td><a href=\"/history/by_name/{{task.name}}\">All \"{{task.name}}\" runs</a></td>\n        </tr>\n      </tbody>\n    </table>\n  </div>\n</div>\n<h3>Parameters</h3>\n<table class=\"table table-striped table-bordered\">\n  <thead>\n    <tr>\n      <th>Name</th>\n      <th>Value</th>\n    </tr>\n  </thead>\n  <tbody>\n    {% for (k, param) in task.parameters.items() %}\n      <tr>\n        <td>{{k}}</td>\n        <td>{{param.value}}</td>\n      </tr>\n    {% end %}\n  </tbody>\n</table>\n<h3>Actions</h3>\n<table class=\"table table-striped table-bordered\">\n  <thead>\n    <tr>\n      <th>Status</th>\n      <th>Action Time</th>\n    </tr>\n  </thead>\n  <tbody>\n    {% for event in task.events %}\n      <tr>\n        <td>{{event.event_name}}</td>\n        <td>{{event.ts}}</td>\n      </tr>\n    {% end %}\n\n  </tbody>\n</table>\n{% end %}\n"
  },
  {
    "path": "luigi/tools/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n# Copyright (c) 2014 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\n\"\"\"\nSort of a standard library for doing stuff with Tasks at a somewhat abstract level.\n\nSubmodule introduced to stop growing util.py unstructured.\n\"\"\"\n"
  },
  {
    "path": "luigi/tools/deps.py",
    "content": "#!/usr/bin/env python\n\n\n# Finds all tasks and task outputs on the dependency paths from the given downstream task T\n# up to the given source/upstream task S (optional). If the upstream task is not given,\n# all upstream tasks on all dependency paths of T will be returned.\n\n# Terms:\n# if  the execution of Task T depends on the output of task S on a dependency graph,\n#  T is called a downstream/sink task, S is called an upstream/source task.\n\n# This is a useful and practical way to find all upstream tasks of task T.\n# For example suppose you have a daily computation that starts with a task named Daily.\n# And suppose you have another task named Aggregate. Daily triggers a few tasks\n# which eventually trigger Aggregate. Now, suppose you find a bug in Aggregate.\n# You fixed the bug and now you want to rerun it, including all its upstream deps.\n#\n# To do that you run:\n#      bin/deps.py --module daily_module Aggregate --daily-param1 xxx --upstream-family Daily\n#\n# This will output all the tasks on the dependency path between Daily and Aggregate. In\n# effect, this is how you find all upstream tasks for Aggregate. Now you can delete its\n# output and run Aggregate again. Daily will eventually trigger Aggregate and all tasks on\n# the way.\n#\n# The same code here might be used as a CLI tool as well as a python module.\n# In python, invoke find_deps(task, upstream_name) to get a set of all task instances on the\n# paths between task T and upstream task S. 
You can then use the task instances to delete their output or\n# perform other computation based on that.\n#\n# Example:\n#\n# PYTHONPATH=$PYTHONPATH:/path/to/your/luigi/tasks bin/deps.py \\\n# --module my.tasks  MyDownstreamTask\n# --downstream_task_param1 123456\n# [--upstream-family MyUpstreamTask]\n#\n\nimport sys\nfrom collections.abc import Iterable\n\nimport luigi.interface\nfrom luigi import parameter\nfrom luigi.cmdline_parser import CmdlineParser\nfrom luigi.contrib.postgres import PostgresTarget\nfrom luigi.contrib.s3 import S3Target\nfrom luigi.contrib.ssh import RemoteTarget\nfrom luigi.target import FileSystemTarget\nfrom luigi.task import flatten\n\n\ndef get_task_requires(task):\n    return set(flatten(task.requires()))\n\n\ndef dfs_paths(start_task, goal_task_family, path=None):\n    if path is None:\n        path = [start_task]\n    if start_task.task_family == goal_task_family or goal_task_family is None:\n        for item in path:\n            yield item\n    for next in get_task_requires(start_task) - set(path):\n        for t in dfs_paths(next, goal_task_family, path + [next]):\n            yield t\n\n\nclass upstream(luigi.task.Config):\n    \"\"\"\n    Used to provide the parameter upstream-family\n    \"\"\"\n\n    family = parameter.OptionalParameter(default=None)\n\n\ndef find_deps(task, upstream_task_family):\n    \"\"\"\n    Finds all dependencies that start with the given task and have a path\n    to upstream_task_family\n\n    Returns all deps on all paths between task and upstream\n    \"\"\"\n    return {t for t in dfs_paths(task, upstream_task_family)}\n\n\ndef find_deps_cli():\n    \"\"\"\n    Finds all tasks on all paths from provided CLI task\n    \"\"\"\n    cmdline_args = sys.argv[1:]\n    with CmdlineParser.global_instance(cmdline_args) as cp:\n        return find_deps(cp.get_task_obj(), upstream().family)\n\n\ndef get_task_output_description(task_output):\n    \"\"\"\n    Returns a task's output as a string\n    \"\"\"\n    
output_description = \"n/a\"\n\n    if isinstance(task_output, RemoteTarget):\n        output_description = \"[SSH] {0}:{1}\".format(task_output._fs.remote_context.host, task_output.path)\n    elif isinstance(task_output, S3Target):\n        output_description = \"[S3] {0}\".format(task_output.path)\n    elif isinstance(task_output, FileSystemTarget):\n        output_description = \"[FileSystem] {0}\".format(task_output.path)\n    elif isinstance(task_output, PostgresTarget):\n        output_description = \"[DB] {0}:{1}\".format(task_output.host, task_output.table)\n    else:\n        output_description = \"to be determined\"\n\n    return output_description\n\n\ndef main():\n    deps = find_deps_cli()\n    for task in deps:\n        task_output = task.output()\n\n        if isinstance(task_output, dict):\n            output_descriptions = [get_task_output_description(output) for label, output in task_output.items()]\n        elif isinstance(task_output, Iterable):\n            output_descriptions = [get_task_output_description(output) for output in task_output]\n        else:\n            output_descriptions = [get_task_output_description(task_output)]\n\n        print(\"   TASK: {0}\".format(task))\n        for desc in output_descriptions:\n            print(\"                       : {0}\".format(desc))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "luigi/tools/deps_tree.py",
    "content": "# -*- coding: utf-8 -*-\n\"\"\"\nThis module parses commands exactly the same as the luigi task runner. You must specify the module, the task and task parameters.\nInstead of executing a task, this module prints the significant parameters and state of the task and its dependencies in a tree format.\nUse this to visualize the execution plan in the terminal.\n\n.. code-block:: none\n\n    $ luigi-deps-tree --module foo_complex examples.Foo\n    ...\n    └─--[Foo-{} (PENDING)]\n        |---[Bar-{'num': '0'} (PENDING)]\n        |   |---[Bar-{'num': '4'} (PENDING)]\n        |   └─--[Bar-{'num': '5'} (PENDING)]\n        |---[Bar-{'num': '1'} (PENDING)]\n        └─--[Bar-{'num': '2'} (PENDING)]\n            └─--[Bar-{'num': '6'} (PENDING)]\n                |---[Bar-{'num': '7'} (PENDING)]\n                |   |---[Bar-{'num': '9'} (PENDING)]\n                |   └─--[Bar-{'num': '10'} (PENDING)]\n                |       └─--[Bar-{'num': '11'} (PENDING)]\n                └─--[Bar-{'num': '8'} (PENDING)]\n                    └─--[Bar-{'num': '12'} (PENDING)]\n\"\"\"\n\nimport sys\nimport warnings\n\nfrom luigi.cmdline_parser import CmdlineParser\nfrom luigi.task import flatten\n\n\nclass bcolors:\n    \"\"\"\n    colored output for task status\n    \"\"\"\n\n    OKBLUE = \"\\033[94m\"\n    OKGREEN = \"\\033[92m\"\n    ENDC = \"\\033[0m\"\n\n\ndef print_tree(task, indent=\"\", last=True):\n    \"\"\"\n    Return a string representation of the tasks, their statuses/parameters in a dependency tree format\n    \"\"\"\n    # dont bother printing out warnings about tasks with no output\n    with warnings.catch_warnings():\n        warnings.filterwarnings(action=\"ignore\", message=\"Task .* without outputs has no custom complete\\\\(\\\\) method\")\n        is_task_complete = task.complete()\n    is_complete = (bcolors.OKGREEN + \"COMPLETE\" if is_task_complete else bcolors.OKBLUE + \"PENDING\") + bcolors.ENDC\n    name = task.__class__.__name__\n    params = 
task.to_str_params(only_significant=True)\n    result = \"\\n\" + indent\n    if last:\n        result += \"└─--\"\n        indent += \"    \"\n    else:\n        result += \"|---\"\n        indent += \"|   \"\n    result += \"[{0}-{1} ({2})]\".format(name, params, is_complete)\n    children = flatten(task.requires())\n    for index, child in enumerate(children):\n        result += print_tree(child, indent, (index + 1) == len(children))\n    return result\n\n\ndef main():\n    cmdline_args = sys.argv[1:]\n    with CmdlineParser.global_instance(cmdline_args) as cp:\n        task = cp.get_task_obj()\n        print(print_tree(task))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "luigi/tools/luigi_grep.py",
    "content": "#!/usr/bin/env python\n\nimport argparse\nimport json\nfrom collections import defaultdict\nfrom urllib.request import urlopen\n\n\nclass LuigiGrep:\n    def __init__(self, host, port):\n        self._host = host\n        self._port = port\n\n    @property\n    def graph_url(self):\n        return \"http://{0}:{1}/api/graph\".format(self._host, self._port)\n\n    def _fetch_json(self):\n        \"\"\"Returns the json representation of the dep graph\"\"\"\n        print(\"Fetching from url: \" + self.graph_url)\n        resp = urlopen(self.graph_url).read()\n        return json.loads(resp.decode(\"utf-8\"))\n\n    def _build_results(self, jobs, job):\n        job_info = jobs[job]\n        deps = job_info[\"deps\"]\n        deps_status = defaultdict(list)\n        for j in deps:\n            if j in jobs:\n                deps_status[jobs[j][\"status\"]].append(j)\n            else:\n                deps_status[\"UNKNOWN\"].append(j)\n        return {\"name\": job, \"status\": job_info[\"status\"], \"deps_by_status\": deps_status}\n\n    def prefix_search(self, job_name_prefix):\n        \"\"\"Searches for jobs matching the given ``job_name_prefix``.\"\"\"\n        json = self._fetch_json()\n        jobs = json[\"response\"]\n        for job in jobs:\n            if job.startswith(job_name_prefix):\n                yield self._build_results(jobs, job)\n\n    def status_search(self, status):\n        \"\"\"Searches for jobs matching the given ``status``.\"\"\"\n        json = self._fetch_json()\n        jobs = json[\"response\"]\n        for job in jobs:\n            job_info = jobs[job]\n            if job_info[\"status\"].lower() == status.lower():\n                yield self._build_results(jobs, job)\n\n\ndef main():\n    parser = argparse.ArgumentParser(\"luigi-grep is used to search for workflows using the luigi scheduler's json api\")\n    parser.add_argument(\"--scheduler-host\", default=\"localhost\", help=\"hostname of the luigi scheduler\")\n 
   parser.add_argument(\"--scheduler-port\", default=\"8082\", help=\"port of the luigi scheduler\")\n    parser.add_argument(\"--prefix\", help=\"prefix of a task query to search for\", default=None)\n    parser.add_argument(\"--status\", help=\"search for jobs with the given status\", default=None)\n\n    args = parser.parse_args()\n    grep = LuigiGrep(args.scheduler_host, args.scheduler_port)\n\n    results = []\n    if args.prefix:\n        results = grep.prefix_search(args.prefix)\n    elif args.status:\n        results = grep.status_search(args.status)\n\n    for job in results:\n        print(\"{name}: {status}, Dependencies:\".format(name=job[\"name\"], status=job[\"status\"]))\n        for status, jobs in job[\"deps_by_status\"].items():\n            print(\"  status={status}\".format(status=status))\n            for job in jobs:\n                print(\"    {job}\".format(job=job))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "luigi/tools/range.py",
    "content": "# -*- coding: utf-8 -*-\n# Copyright (c) 2014 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\n\"\"\"\nProduces contiguous completed ranges of recurring tasks.\n\nSee ``RangeDaily`` and ``RangeHourly`` for basic usage.\n\nCaveat - if gaps accumulate, their causes (e.g. missing dependencies) going\nunmonitored/unmitigated, then this will eventually keep retrying the same gaps\nover and over and make no progress to more recent times. (See ``task_limit``\nand ``reverse`` parameters.)\nTODO foolproof against that kind of misuse?\n\"\"\"\n\nimport functools\nimport itertools\nimport logging\nimport re\nimport time\nimport warnings\nfrom collections import Counter\nfrom datetime import date, datetime, timedelta\n\nfrom dateutil.relativedelta import relativedelta\n\nimport luigi\nfrom luigi.parameter import ParameterException\nfrom luigi.target import FileSystemTarget\nfrom luigi.task import Register, flatten_output\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nclass RangeEvent(luigi.Event):  # Not sure if subclassing currently serves a purpose. Stringly typed, events are.\n    \"\"\"\n    Events communicating useful metrics.\n\n    ``COMPLETE_COUNT`` would normally be nondecreasing, and its derivative\n    would describe performance (how many instances complete\n    invocation-over-invocation).\n\n    ``COMPLETE_FRACTION`` reaching 1 would be a telling event in case of a\n    backfill with defined start and stop. 
Would not be strikingly useful for a\n    typical recurring task without stop defined, fluctuating close to 1.\n\n    ``DELAY`` is measured from the first found missing datehour till (current\n    time + hours_forward), or till stop if it is defined. In hours for Hourly.\n    TBD different units for other frequencies?\n    TODO any different for reverse mode? From first missing till last missing?\n    From last gap till stop?\n    \"\"\"\n\n    COMPLETE_COUNT = \"event.tools.range.complete.count\"\n    COMPLETE_FRACTION = \"event.tools.range.complete.fraction\"\n    DELAY = \"event.tools.range.delay\"\n\n\nclass RangeBase(luigi.WrapperTask):\n    \"\"\"\n    Produces a contiguous completed range of a recurring task.\n\n    Made for the common use case where a task is parameterized by e.g.\n    ``DateParameter``, and assurance is needed that any gaps arising from\n    downtime are eventually filled.\n\n    Emits events that one can use to monitor gaps and delays.\n\n    At least one of start and stop needs to be specified.\n\n    (This is quite an abstract base class for subclasses with different\n    datetime parameter classes, e.g. ``DateParameter``, ``DateHourParameter``,\n    ..., and different parameter naming, e.g. days_back/forward,\n    hours_back/forward, ..., as well as different documentation wording,\n    to improve user experience.)\n\n    Subclasses will need to use the ``of`` parameter when overriding methods.\n    \"\"\"\n\n    # TODO lift the single parameter constraint by passing unknown parameters through WrapperTask?\n    of = luigi.TaskParameter(description=\"task name to be completed. The task must take a single datetime parameter\")\n    of_params = luigi.DictParameter(default=dict(), description=\"Arguments to be provided to the 'of' class when instantiating\")\n    # The common parameters 'start' and 'stop' have type (e.g. 
DateParameter,\n    # DateHourParameter) dependent on the concrete subclass, cumbersome to\n    # define here generically without dark magic. Refer to the overrides.\n    start = luigi.Parameter()\n    stop = luigi.Parameter()\n    reverse = luigi.BoolParameter(\n        default=False,\n        description=\"specifies the preferred order for catching up. False - work from the oldest missing outputs onward; True - from the newest backward\",\n    )\n    task_limit = luigi.IntParameter(default=50, description=\"how many of 'of' tasks to require. Guards against scheduling insane amounts of tasks in one go\")\n    # TODO overridable exclude_datetimes or something...\n    now = luigi.IntParameter(default=None, description=\"set to override current time. In seconds since epoch\")\n    param_name = luigi.Parameter(\n        default=None,\n        description=\"parameter name used to pass in parameterized value. Defaults to None, meaning use first positional parameter\",\n        positional=False,\n    )\n\n    @property\n    def of_cls(self):\n        \"\"\"\n        DONT USE. Will be deleted soon. 
Use ``self.of``!\n        \"\"\"\n        if isinstance(self.of, str):\n            warnings.warn('When using Range programatically, dont pass \"of\" param as string!')\n            return Register.get_task_cls(self.of)\n        return self.of\n\n    # a bunch of datetime arithmetic building blocks that need to be provided in subclasses\n    def datetime_to_parameter(self, dt):\n        raise NotImplementedError\n\n    def parameter_to_datetime(self, p):\n        raise NotImplementedError\n\n    def datetime_to_parameters(self, dt):\n        \"\"\"\n        Given a date-time, will produce a dictionary of of-params combined with the ranged task parameter\n        \"\"\"\n        raise NotImplementedError\n\n    def parameters_to_datetime(self, p):\n        \"\"\"\n        Given a dictionary of parameters, will extract the ranged task parameter value\n        \"\"\"\n        raise NotImplementedError\n\n    def moving_start(self, now):\n        \"\"\"\n        Returns a datetime from which to ensure contiguousness in the case when\n        start is None or unfeasibly far back.\n        \"\"\"\n        raise NotImplementedError\n\n    def moving_stop(self, now):\n        \"\"\"\n        Returns a datetime till which to ensure contiguousness in the case when\n        stop is None or unfeasibly far forward.\n        \"\"\"\n        raise NotImplementedError\n\n    def finite_datetimes(self, finite_start, finite_stop):\n        \"\"\"\n        Returns the individual datetimes in interval [finite_start, finite_stop)\n        for which task completeness should be required, as a sorted list.\n        \"\"\"\n        raise NotImplementedError\n\n    def _emit_metrics(self, missing_datetimes, finite_start, finite_stop):\n        \"\"\"\n        For consistent metrics one should consider the entire range, but\n        it is open (infinite) if stop or start is None.\n\n        Hence make do with metrics respective to the finite simplification.\n        \"\"\"\n        datetimes 
= self.finite_datetimes(\n            finite_start if self.start is None else min(finite_start, self.parameter_to_datetime(self.start)),\n            finite_stop if self.stop is None else max(finite_stop, self.parameter_to_datetime(self.stop)),\n        )\n\n        delay_in_jobs = len(datetimes) - datetimes.index(missing_datetimes[0]) if datetimes and missing_datetimes else 0\n        self.trigger_event(RangeEvent.DELAY, self.of_cls.task_family, delay_in_jobs)\n\n        expected_count = len(datetimes)\n        complete_count = expected_count - len(missing_datetimes)\n        self.trigger_event(RangeEvent.COMPLETE_COUNT, self.of_cls.task_family, complete_count)\n        self.trigger_event(RangeEvent.COMPLETE_FRACTION, self.of_cls.task_family, float(complete_count) / expected_count if expected_count else 1)\n\n    def _format_datetime(self, dt):\n        return self.datetime_to_parameter(dt)\n\n    def _format_range(self, datetimes):\n        param_first = self._format_datetime(datetimes[0])\n        param_last = self._format_datetime(datetimes[-1])\n        return \"[%s, %s]\" % (param_first, param_last)\n\n    def _instantiate_task_cls(self, param):\n        return self.of(**self._task_parameters(param))\n\n    @property\n    def _param_name(self):\n        if self.param_name is None:\n            return next(x[0] for x in self.of.get_params() if x[1].positional)\n        else:\n            return self.param_name\n\n    def _task_parameters(self, param):\n        kwargs = dict(**self.of_params)\n        kwargs[self._param_name] = param\n        return kwargs\n\n    def requires(self):\n        # cache because we anticipate a fair amount of computation\n        if hasattr(self, \"_cached_requires\"):\n            return self._cached_requires\n\n        if not self.start and not self.stop:\n            raise ParameterException(\"At least one of start and stop needs to be specified\")\n        if not self.start and not self.reverse:\n            raise 
ParameterException(\"Either start needs to be specified or reverse needs to be True\")\n        if self.start and self.stop and self.start > self.stop:\n            raise ParameterException(\"Can't have start > stop\")\n        # TODO check overridden complete() and exists()\n\n        now = datetime.utcfromtimestamp(time.time() if self.now is None else self.now)\n\n        moving_start = self.moving_start(now)\n        finite_start = moving_start if self.start is None else max(self.parameter_to_datetime(self.start), moving_start)\n        moving_stop = self.moving_stop(now)\n        finite_stop = moving_stop if self.stop is None else min(self.parameter_to_datetime(self.stop), moving_stop)\n\n        datetimes = self.finite_datetimes(finite_start, finite_stop) if finite_start <= finite_stop else []\n\n        if datetimes:\n            logger.debug(\"Actually checking if range %s of %s is complete\", self._format_range(datetimes), self.of_cls.task_family)\n            missing_datetimes = sorted(self._missing_datetimes(datetimes))\n            logger.debug(\n                \"Range %s lacked %d of expected %d %s instances\", self._format_range(datetimes), len(missing_datetimes), len(datetimes), self.of_cls.task_family\n            )\n        else:\n            missing_datetimes = []\n            logger.debug(\"Empty range. 
No %s instances expected\", self.of_cls.task_family)\n\n        self._emit_metrics(missing_datetimes, finite_start, finite_stop)\n\n        if self.reverse:\n            required_datetimes = missing_datetimes[-self.task_limit :]\n        else:\n            required_datetimes = missing_datetimes[: self.task_limit]\n        if required_datetimes:\n            logger.debug(\n                \"Requiring %d missing %s instances in range %s\", len(required_datetimes), self.of_cls.task_family, self._format_range(required_datetimes)\n            )\n        if self.reverse:\n            required_datetimes.reverse()  # TODO priorities, so that within the batch tasks are ordered too\n\n        self._cached_requires = [self._instantiate_task_cls(self.datetime_to_parameter(d)) for d in required_datetimes]\n        return self._cached_requires\n\n    def missing_datetimes(self, finite_datetimes):\n        \"\"\"\n        Override in subclasses to do bulk checks.\n\n        Returns a sorted list.\n\n        This is a conservative base implementation that brutally checks completeness, instance by instance.\n\n        Inadvisable as it may be slow.\n        \"\"\"\n        return [d for d in finite_datetimes if not self._instantiate_task_cls(self.datetime_to_parameter(d)).complete()]\n\n    def _missing_datetimes(self, finite_datetimes):\n        \"\"\"\n        Backward compatible wrapper. 
Will be deleted eventually (stated on Dec 2015)\n        \"\"\"\n        try:\n            return self.missing_datetimes(finite_datetimes)\n        except TypeError as ex:\n            if \"missing_datetimes()\" in repr(ex):\n                warnings.warn(\"In your Range* subclass, missing_datetimes() should only take 1 argument (see latest docs)\")\n                return self.missing_datetimes(self.of_cls, finite_datetimes)\n            else:\n                raise\n\n\nclass RangeDailyBase(RangeBase):\n    \"\"\"\n    Produces a contiguous completed range of a daily recurring task.\n    \"\"\"\n\n    start = luigi.DateParameter(default=None, description=\"beginning date, inclusive. Default: None - work backward forever (requires reverse=True)\")\n    stop = luigi.DateParameter(default=None, description=\"ending date, exclusive. Default: None - work forward forever\")\n    days_back = luigi.IntParameter(\n        default=100,  # slightly more than three months\n        description=(\n            \"extent to which contiguousness is to be assured into \"\n            \"past, in days from current time. Prevents infinite loop \"\n            \"when start is none. If the dataset has limited retention\"\n            \" (i.e. old outputs get removed), this should be set \"\n            \"shorter to that, too, to prevent the oldest outputs \"\n            \"flapping. Increase freely if you intend to process old \"\n            \"dates - worker's memory is the limit\"\n        ),\n    )\n    days_forward = luigi.IntParameter(\n        default=0,\n        description=\"extent to which contiguousness is to be assured into future, in days from current time. 
Prevents infinite loop when stop is none\",\n    )\n\n    def datetime_to_parameter(self, dt):\n        return dt.date()\n\n    def parameter_to_datetime(self, p):\n        return datetime(p.year, p.month, p.day)\n\n    def datetime_to_parameters(self, dt):\n        \"\"\"\n        Given a date-time, will produce a dictionary of of-params combined with the ranged task parameter\n        \"\"\"\n        return self._task_parameters(dt.date())\n\n    def parameters_to_datetime(self, p):\n        \"\"\"\n        Given a dictionary of parameters, will extract the ranged task parameter value\n        \"\"\"\n        dt = p[self._param_name]\n        return datetime(dt.year, dt.month, dt.day)\n\n    def moving_start(self, now):\n        return now - timedelta(days=self.days_back)\n\n    def moving_stop(self, now):\n        return now + timedelta(days=self.days_forward)\n\n    def finite_datetimes(self, finite_start, finite_stop):\n        \"\"\"\n        Simply returns the points in time that correspond to turn of day.\n        \"\"\"\n        date_start = datetime(finite_start.year, finite_start.month, finite_start.day)\n        dates = []\n        for i in itertools.count():\n            t = date_start + timedelta(days=i)\n            if t >= finite_stop:\n                return dates\n            if t >= finite_start:\n                dates.append(t)\n\n\nclass RangeHourlyBase(RangeBase):\n    \"\"\"\n    Produces a contiguous completed range of an hourly recurring task.\n    \"\"\"\n\n    start = luigi.DateHourParameter(default=None, description=\"beginning datehour, inclusive. Default: None - work backward forever (requires reverse=True)\")\n    stop = luigi.DateHourParameter(default=None, description=\"ending datehour, exclusive. 
Default: None - work forward forever\")\n    hours_back = luigi.IntParameter(\n        default=100 * 24,  # slightly more than three months\n        description=(\n            \"extent to which contiguousness is to be assured into \"\n            \"past, in hours from current time. Prevents infinite \"\n            \"loop when start is none. If the dataset has limited \"\n            \"retention (i.e. old outputs get removed), this should \"\n            \"be set shorter to that, too, to prevent the oldest \"\n            \"outputs flapping. Increase freely if you intend to \"\n            \"process old dates - worker's memory is the limit\"\n        ),\n    )\n    # TODO always entire interval for reprocessings (fixed start and stop)?\n    hours_forward = luigi.IntParameter(\n        default=0,\n        description=\"extent to which contiguousness is to be assured into future, in hours from current time. Prevents infinite loop when stop is none\",\n    )\n\n    def datetime_to_parameter(self, dt):\n        return dt\n\n    def parameter_to_datetime(self, p):\n        return p\n\n    def datetime_to_parameters(self, dt):\n        \"\"\"\n        Given a date-time, will produce a dictionary of of-params combined with the ranged task parameter\n        \"\"\"\n        return self._task_parameters(dt)\n\n    def parameters_to_datetime(self, p):\n        \"\"\"\n        Given a dictionary of parameters, will extract the ranged task parameter value\n        \"\"\"\n        return p[self._param_name]\n\n    def moving_start(self, now):\n        return now - timedelta(hours=self.hours_back)\n\n    def moving_stop(self, now):\n        return now + timedelta(hours=self.hours_forward)\n\n    def finite_datetimes(self, finite_start, finite_stop):\n        \"\"\"\n        Simply returns the points in time that correspond to whole hours.\n        \"\"\"\n        datehour_start = datetime(finite_start.year, finite_start.month, finite_start.day, finite_start.hour)\n        
datehours = []\n        for i in itertools.count():\n            t = datehour_start + timedelta(hours=i)\n            if t >= finite_stop:\n                return datehours\n            if t >= finite_start:\n                datehours.append(t)\n\n    def _format_datetime(self, dt):\n        return luigi.DateHourParameter().serialize(dt)\n\n\nclass RangeByMinutesBase(RangeBase):\n    \"\"\"\n    Produces a contiguous completed range of an recurring tasks separated a specified number of minutes.\n    \"\"\"\n\n    start = luigi.DateMinuteParameter(\n        default=None, description=\"beginning date-hour-minute, inclusive. Default: None - work backward forever (requires reverse=True)\"\n    )\n    stop = luigi.DateMinuteParameter(default=None, description=\"ending date-hour-minute, exclusive. Default: None - work forward forever\")\n    minutes_back = luigi.IntParameter(\n        default=60 * 24,  # one day\n        description=(\n            \"extent to which contiguousness is to be assured into \"\n            \"past, in minutes from current time. Prevents infinite \"\n            \"loop when start is none. If the dataset has limited \"\n            \"retention (i.e. old outputs get removed), this should \"\n            \"be set shorter to that, too, to prevent the oldest \"\n            \"outputs flapping. Increase freely if you intend to \"\n            \"process old dates - worker's memory is the limit\"\n        ),\n    )\n    minutes_forward = luigi.IntParameter(\n        default=0,\n        description=\"extent to which contiguousness is to be assured into future, in minutes from current time. Prevents infinite loop when stop is none\",\n    )\n\n    minutes_interval = luigi.IntParameter(default=1, description=\"separation between events in minutes. 
It must evenly divide 60\")\n\n    def datetime_to_parameter(self, dt):\n        return dt\n\n    def parameter_to_datetime(self, p):\n        return p\n\n    def datetime_to_parameters(self, dt):\n        \"\"\"\n        Given a date-time, will produce a dictionary of of-params combined with the ranged task parameter\n        \"\"\"\n        return self._task_parameters(dt)\n\n    def parameters_to_datetime(self, p):\n        \"\"\"\n        Given a dictionary of parameters, will extract the ranged task parameter value\n        \"\"\"\n        dt = p[self._param_name]\n        return datetime(dt.year, dt.month, dt.day, dt.hour, dt.minute)\n\n    def moving_start(self, now):\n        return now - timedelta(minutes=self.minutes_back)\n\n    def moving_stop(self, now):\n        return now + timedelta(minutes=self.minutes_forward)\n\n    def finite_datetimes(self, finite_start, finite_stop):\n        \"\"\"\n        Simply returns the points in time that correspond to a whole number of minutes intervals.\n        \"\"\"\n        # Validate that the minutes_interval can divide 60 and it is greater than 0 and lesser than 60\n        if not (0 < self.minutes_interval < 60):\n            raise ParameterException(\"minutes-interval must be within 0..60\")\n        if 60 % self.minutes_interval != 0:\n            raise ParameterException(\"minutes-interval does not evenly divide 60\")\n        # start of a complete interval, e.g. 
20:13 and the interval is 5 -> 20:10\n        start_minute = int(finite_start.minute / self.minutes_interval) * self.minutes_interval\n        datehour_start = datetime(year=finite_start.year, month=finite_start.month, day=finite_start.day, hour=finite_start.hour, minute=start_minute)\n        datehours = []\n        for i in itertools.count():\n            t = datehour_start + timedelta(minutes=i * self.minutes_interval)\n            if t >= finite_stop:\n                return datehours\n            if t >= finite_start:\n                datehours.append(t)\n\n    def _format_datetime(self, dt):\n        return luigi.DateMinuteParameter().serialize(dt)\n\n\ndef _constrain_glob(glob, paths, limit=5):\n    \"\"\"\n    Tweaks glob into a list of more specific globs that together still cover paths and not too much extra.\n\n    Saves us minutes long listings for long dataset histories.\n\n    Specifically, in this implementation the leftmost occurrences of \"[0-9]\"\n    give rise to a few separate globs that each specialize the expression to\n    digits that actually occur in paths.\n    \"\"\"\n\n    def digit_set_wildcard(chars):\n        \"\"\"\n        Makes a wildcard expression for the set, a bit readable, e.g. 
[1-5].\n        \"\"\"\n        chars = sorted(chars)\n        if len(chars) > 1 and ord(chars[-1]) - ord(chars[0]) == len(chars) - 1:\n            return \"[%s-%s]\" % (chars[0], chars[-1])\n        else:\n            return \"[%s]\" % \"\".join(chars)\n\n    current = {glob: paths}\n    while True:\n        pos = list(current.keys())[0].find(\"[0-9]\")\n        if pos == -1:\n            # no wildcard expressions left to specialize in the glob\n            return list(current.keys())\n        char_sets = {}\n        for g, p in current.items():\n            char_sets[g] = sorted({path[pos] for path in p})\n        if sum(len(s) for s in char_sets.values()) > limit:\n            return [g.replace(\"[0-9]\", digit_set_wildcard(char_sets[g]), 1) for g in current]\n        for g, s in char_sets.items():\n            for c in s:\n                new_glob = g.replace(\"[0-9]\", c, 1)\n                new_paths = list(filter(lambda p: p[pos] == c, current[g]))\n                current[new_glob] = new_paths\n            del current[g]\n\n\ndef most_common(items):\n    [(element, counter)] = Counter(items).most_common(1)\n    return element, counter\n\n\ndef _get_per_location_glob(tasks, outputs, regexes):\n    \"\"\"\n    Builds a glob listing existing output paths.\n\n    Esoteric reverse engineering, but worth it given that (compared to an\n    equivalent contiguousness guarantee by naive complete() checks)\n    requests to the filesystem are cut by orders of magnitude, and users\n    don't even have to retrofit existing tasks anyhow.\n    \"\"\"\n    paths = [o.path for o in outputs]\n    # naive, because some matches could be confused by numbers earlier\n    # in path, e.g. 
/foo/fifa2000k/bar/2000-12-31/00\n    matches = [r.search(p) for r, p in zip(regexes, paths)]\n\n    for m, p, t in zip(matches, paths, tasks):\n        if m is None:\n            raise NotImplementedError(\"Couldn't deduce datehour representation in output path %r of task %s\" % (p, t))\n\n    n_groups = len(matches[0].groups())\n    # the most common position of every group is likely\n    # to be conclusive hit or miss\n    positions = [most_common((m.start(i), m.end(i)) for m in matches)[0] for i in range(1, n_groups + 1)]\n\n    glob = list(paths[0])  # FIXME sanity check that it's the same for all paths\n    for start, end in positions:\n        glob = glob[:start] + [\"[0-9]\"] * (end - start) + glob[end:]\n    # chop off the last path item\n    # (wouldn't need to if `hadoop fs -ls -d` equivalent were available)\n    return \"\".join(glob).rsplit(\"/\", 1)[0]\n\n\ndef _get_filesystems_and_globs(datetime_to_task, datetime_to_re):\n    \"\"\"\n    Yields a (filesystem, glob) tuple per every output location of task.\n\n    The task can have one or several FileSystemTarget outputs.\n\n    For convenience, the task can be a luigi.WrapperTask,\n    in which case outputs of all its dependencies are considered.\n    \"\"\"\n    # probe some scattered datetimes unlikely to all occur in paths, other than by being sincere datetime parameter's representations\n    # TODO limit to [self.start, self.stop) so messages are less confusing? 
Done trivially it can kill correctness\n    sample_datetimes = [datetime(y, m, d, h) for y in range(2000, 2050, 10) for m in range(1, 4) for d in range(5, 8) for h in range(21, 24)]\n    regexes = [re.compile(datetime_to_re(d)) for d in sample_datetimes]\n    sample_tasks = [datetime_to_task(d) for d in sample_datetimes]\n    sample_outputs = [flatten_output(t) for t in sample_tasks]\n\n    for o, t in zip(sample_outputs, sample_tasks):\n        if len(o) != len(sample_outputs[0]):\n            raise NotImplementedError(\"Outputs must be consistent over time, sorry; was %r for %r and %r for %r\" % (o, t, sample_outputs[0], sample_tasks[0]))\n            # TODO fall back on requiring last couple of days? to avoid astonishing blocking when changes like that are deployed\n            # erm, actually it's not hard to test entire hours_back..hours_forward and split into consistent subranges FIXME?\n        for target in o:\n            if not isinstance(target, FileSystemTarget):\n                raise NotImplementedError(\"Output targets must be instances of FileSystemTarget; was %r for %r\" % (target, t))\n\n    for o in zip(*sample_outputs):  # transposed, so here we're iterating over logical outputs, not datetimes\n        glob = _get_per_location_glob(sample_tasks, o, regexes)\n        yield o[0].fs, glob\n\n\ndef _list_existing(filesystem, glob, paths):\n    \"\"\"\n    Get all the paths that do in fact exist. 
Returns a set of all existing paths.\n\n    Takes a luigi.target.FileSystem object, a str which represents a glob and\n    a list of strings representing paths.\n    \"\"\"\n    globs = _constrain_glob(glob, paths)\n    time_start = time.time()\n    listing = []\n    for g in sorted(globs):\n        logger.debug(\"Listing %s\", g)\n        if filesystem.exists(g):\n            listing.extend(filesystem.listdir(g))\n    logger.debug(\"%d %s listings took %f s to return %d items\", len(globs), filesystem.__class__.__name__, time.time() - time_start, len(listing))\n    return set(listing)\n\n\ndef infer_bulk_complete_from_fs(datetimes, datetime_to_task, datetime_to_re):\n    \"\"\"\n    Efficiently determines missing datetimes by filesystem listing.\n\n    The current implementation works for the common case of a task writing\n    output to a ``FileSystemTarget`` whose path is built using strftime with\n    format like '...%Y...%m...%d...%H...', without custom ``complete()`` or\n    ``exists()``.\n\n    (Eventually Luigi could have ranges of completion as first-class citizens.\n    Then this listing business could be factored away/be provided for\n    explicitly in target API or some kind of a history server.)\n    \"\"\"\n    filesystems_and_globs_by_location = _get_filesystems_and_globs(datetime_to_task, datetime_to_re)\n    paths_by_datetime = [[o.path for o in flatten_output(datetime_to_task(d))] for d in datetimes]\n    listing = set()\n    for (f, g), p in zip(filesystems_and_globs_by_location, zip(*paths_by_datetime)):  # transposed, so here we're iterating over logical outputs, not datetimes\n        listing |= _list_existing(f, g, p)\n\n    # quickly learn everything that's missing\n    missing_datetimes = []\n    for d, p in zip(datetimes, paths_by_datetime):\n        if not set(p) <= listing:\n            missing_datetimes.append(d)\n\n    return missing_datetimes\n\n\nclass RangeMonthly(RangeBase):\n    \"\"\"\n    Produces a contiguous completed range of 
a monthly recurring task.\n\n    Unlike the Range* classes with shorter intervals, this class does not perform bulk optimisation.\n    It is assumed that the number of months is low enough not to motivate the increased complexity.\n    Hence, there is no class RangeMonthlyBase.\n    \"\"\"\n\n    start = luigi.MonthParameter(default=None, description=\"beginning month, inclusive. Default: None - work backward forever (requires reverse=True)\")\n    stop = luigi.MonthParameter(default=None, description=\"ending month, exclusive. Default: None - work forward forever\")\n    months_back = luigi.IntParameter(\n        default=13,  # Little over a year\n        description=(\n            \"extent to which contiguousness is to be assured into \"\n            \"past, in months from current time. Prevents infinite loop \"\n            \"when start is none. If the dataset has limited retention\"\n            \" (i.e. old outputs get removed), this should be set \"\n            \"shorter to that, too, to prevent the oldest outputs \"\n            \"flapping. Increase freely if you intend to process old \"\n            \"dates - worker's memory is the limit\"\n        ),\n    )\n    months_forward = luigi.IntParameter(\n        default=0,\n        description=\"extent to which contiguousness is to be assured into future, in months from current time. 
Prevents infinite loop when stop is none\",\n    )\n\n    def datetime_to_parameter(self, dt):\n        return date(dt.year, dt.month, 1)\n\n    def parameter_to_datetime(self, p):\n        return datetime(p.year, p.month, 1)\n\n    def datetime_to_parameters(self, dt):\n        \"\"\"\n        Given a date-time, will produce a dictionary of of-params combined with the ranged task parameter\n        \"\"\"\n        return self._task_parameters(dt.date())\n\n    def parameters_to_datetime(self, p):\n        \"\"\"\n        Given a dictionary of parameters, will extract the ranged task parameter value\n        \"\"\"\n        dt = p[self._param_name]\n        return datetime(dt.year, dt.month, 1)\n\n    def _format_datetime(self, dt):\n        return dt.strftime(\"%Y-%m\")\n\n    def moving_start(self, now):\n        return self._align(now) - relativedelta(months=self.months_back)\n\n    def moving_stop(self, now):\n        return self._align(now) + relativedelta(months=self.months_forward)\n\n    def _align(self, dt):\n        return datetime(dt.year, dt.month, 1)\n\n    def finite_datetimes(self, finite_start, finite_stop):\n        \"\"\"\n        Simply returns the points in time that correspond to turn of month.\n        \"\"\"\n        start_date = self._align(finite_start)\n        aligned_stop = self._align(finite_stop)\n        dates = []\n        for m in itertools.count():\n            t = start_date + relativedelta(months=m)\n            if t >= aligned_stop:\n                return dates\n            if t >= finite_start:\n                dates.append(t)\n\n\nclass RangeDaily(RangeDailyBase):\n    \"\"\"Efficiently produces a contiguous completed range of a daily recurring\n    task that takes a single ``DateParameter``.\n\n    Falls back to infer it from output filesystem listing to facilitate the\n    common case usage.\n\n    Convenient to use even from command line, like:\n\n    .. 
code-block:: console\n\n        luigi --module your.module RangeDaily --of YourActualTask --start 2014-01-01\n    \"\"\"\n\n    def missing_datetimes(self, finite_datetimes):\n        try:\n            cls_with_params = functools.partial(self.of, **self.of_params)\n            complete_parameters = self.of.bulk_complete.__func__(cls_with_params, map(self.datetime_to_parameter, finite_datetimes))\n            return set(finite_datetimes) - set(map(self.parameter_to_datetime, complete_parameters))\n        except NotImplementedError:\n            return infer_bulk_complete_from_fs(\n                finite_datetimes, lambda d: self._instantiate_task_cls(self.datetime_to_parameter(d)), lambda d: d.strftime(\"(%Y).*(%m).*(%d)\")\n            )\n\n\nclass RangeHourly(RangeHourlyBase):\n    \"\"\"Efficiently produces a contiguous completed range of an hourly recurring\n    task that takes a single ``DateHourParameter``.\n\n    Benefits from ``bulk_complete`` information to efficiently cover gaps.\n\n    Falls back to infer it from output filesystem listing to facilitate the\n    common case usage.\n\n    Convenient to use even from command line, like:\n\n    .. 
code-block:: console\n\n        luigi --module your.module RangeHourly --of YourActualTask --start 2014-01-01T00\n    \"\"\"\n\n    def missing_datetimes(self, finite_datetimes):\n        try:\n            # TODO: Why is there a list() here but not for the RangeDaily??\n            cls_with_params = functools.partial(self.of, **self.of_params)\n            complete_parameters = self.of.bulk_complete.__func__(cls_with_params, list(map(self.datetime_to_parameter, finite_datetimes)))\n            return set(finite_datetimes) - set(map(self.parameter_to_datetime, complete_parameters))\n        except NotImplementedError:\n            return infer_bulk_complete_from_fs(\n                finite_datetimes, lambda d: self._instantiate_task_cls(self.datetime_to_parameter(d)), lambda d: d.strftime(\"(%Y).*(%m).*(%d).*(%H)\")\n            )\n\n\nclass RangeByMinutes(RangeByMinutesBase):\n    \"\"\"Efficiently produces a contiguous completed range of an recurring\n    task every interval minutes that takes a single ``DateMinuteParameter``.\n\n    Benefits from ``bulk_complete`` information to efficiently cover gaps.\n\n    Falls back to infer it from output filesystem listing to facilitate the\n    common case usage.\n\n    Convenient to use even from command line, like:\n\n    .. 
code-block:: console\n\n        luigi --module your.module RangeByMinutes --of YourActualTask --start 2014-01-01T0123\n    \"\"\"\n\n    def missing_datetimes(self, finite_datetimes):\n        try:\n            cls_with_params = functools.partial(self.of, **self.of_params)\n            complete_parameters = self.of.bulk_complete.__func__(cls_with_params, map(self.datetime_to_parameter, finite_datetimes))\n            return set(finite_datetimes) - set(map(self.parameter_to_datetime, complete_parameters))\n        except NotImplementedError:\n            return infer_bulk_complete_from_fs(\n                finite_datetimes, lambda d: self._instantiate_task_cls(self.datetime_to_parameter(d)), lambda d: d.strftime(\"(%Y).*(%m).*(%d).*(%H).*(%M)\")\n            )\n"
  },
  {
    "path": "luigi/util.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\n============================================================\nUsing ``inherits`` and ``requires`` to ease parameter pain\n============================================================\n\nMost luigi plumbers will find themselves in an awkward task parameter situation\nat some point or another.  Consider the following \"parameter explosion\"\nproblem:\n\n.. code-block:: python\n\n    class TaskA(luigi.ExternalTask):\n        param_a = luigi.Parameter()\n\n        def output(self):\n            return luigi.LocalTarget('/tmp/log-{t.param_a}'.format(t=self))\n\n    class TaskB(luigi.Task):\n        param_b = luigi.Parameter()\n        param_a = luigi.Parameter()\n\n        def requires(self):\n            return TaskA(param_a=self.param_a)\n\n    class TaskC(luigi.Task):\n        param_c = luigi.Parameter()\n        param_b = luigi.Parameter()\n        param_a = luigi.Parameter()\n\n        def requires(self):\n            return TaskB(param_b=self.param_b, param_a=self.param_a)\n\n\nIn work flows requiring many tasks to be chained together in this manner,\nparameter handling can spiral out of control.  Each downstream task becomes\nmore burdensome than the last.  Refactoring becomes more difficult.  
There\nare several ways one might try and avoid the problem.\n\n**Approach 1**:  Parameters via command line or config instead of :func:`~luigi.task.Task.requires`.\n\n.. code-block:: python\n\n    class TaskA(luigi.ExternalTask):\n        param_a = luigi.Parameter()\n\n        def output(self):\n            return luigi.LocalTarget('/tmp/log-{t.param_a}'.format(t=self))\n\n    class TaskB(luigi.Task):\n        param_b = luigi.Parameter()\n\n        def requires(self):\n            return TaskA()\n\n    class TaskC(luigi.Task):\n        param_c = luigi.Parameter()\n\n        def requires(self):\n            return TaskB()\n\n\nThen run in the shell like so:\n\n.. code-block:: bash\n\n    luigi --module my_tasks TaskC --param-c foo --TaskB-param-b bar --TaskA-param-a baz\n\n\nRepetitive parameters have been eliminated, but at the cost of making the job's\ncommand line interface slightly clunkier.  Often this is a reasonable\ntrade-off.\n\nBut parameters can't always be refactored out every class.  Downstream\ntasks might also need to use some of those parameters.  For example,\nif ``TaskC`` needs to use ``param_a`` too, then ``param_a`` would still need\nto be repeated.\n\n\n**Approach 2**:  Use a common parameter class\n\n.. code-block:: python\n\n    class Params(luigi.Config):\n        param_c = luigi.Parameter()\n        param_b = luigi.Parameter()\n        param_a = luigi.Parameter()\n\n    class TaskA(Params, luigi.ExternalTask):\n        def output(self):\n            return luigi.LocalTarget('/tmp/log-{t.param_a}'.format(t=self))\n\n    class TaskB(Params):\n        def requires(self):\n            return TaskA()\n\n    class TaskB(Params):\n        def requires(self):\n            return TaskB()\n\n\nThis looks great at first glance, but a couple of issues lurk. Now ``TaskA``\nand ``TaskB`` have unnecessary significant parameters.  Significant parameters\nhelp define the identity of a task.  
Identical tasks are prevented from\nrunning at the same time by the central planner.  This helps preserve the\nidempotent and atomic nature of luigi tasks.  Unnecessary significant task\nparameters confuse a task's identity.  Under the right circumstances, task\nidentity confusion could lead to that task running when it shouldn't, or\nfailing to run when it should.\n\nThis approach should only be used when all of the parameters of the config\nclass, are significant (or all insignificant) for all of its subclasses.\n\nAnd wait a second... there's a bug in the above code.  See it?\n\n``TaskA`` won't behave as an ``ExternalTask`` because the parent classes are\nspecified in the wrong order.  This contrived example is easy to fix (by\nswapping the ordering of the parents of ``TaskA``), but real world cases can be\nmore difficult to both spot and fix.  Inheriting from multiple classes\nderived from :class:`~luigi.task.Task` should be undertaken with caution and avoided\nwhere possible.\n\n\n**Approach 3**: Use :class:`~luigi.util.inherits` and :class:`~luigi.util.requires`\n\nThe :class:`~luigi.util.inherits` class decorator in this module copies parameters (and\nnothing else) from one task class to another, and avoids direct pythonic\ninheritance.\n\n.. code-block:: python\n\n    import luigi\n    from luigi.util import inherits\n\n    class TaskA(luigi.ExternalTask):\n        param_a = luigi.Parameter()\n\n        def output(self):\n            return luigi.LocalTarget('/tmp/log-{t.param_a}'.format(t=self))\n\n    @inherits(TaskA)\n    class TaskB(luigi.Task):\n        param_b = luigi.Parameter()\n\n        def requires(self):\n            t = self.clone(TaskA)  # or t = self.clone_parent()\n\n            # Wait... whats this clone thingy do?\n            #\n            # Pass it a task class.  It calls that task.  
And when it does, it\n            # supplies all parameters (and only those parameters) common to\n            # the caller and callee!\n            #\n            # The call to clone is equivalent to the following (note the\n            # fact that clone avoids passing param_b).\n            #\n            #   return TaskA(param_a=self.param_a)\n\n            return t\n\n    @inherits(TaskB)\n    class TaskC(luigi.Task):\n        param_c = luigi.Parameter()\n\n        def requires(self):\n            return self.clone(TaskB)\n\n\nThis totally eliminates the need to repeat parameters, avoids inheritance\nissues, and keeps the task command line interface as simple (as it can be,\nanyway).  Refactoring task parameters is also much easier.\n\nThe :class:`~luigi.util.requires` helper function can reduce this pattern even further.   It\ndoes everything :class:`~luigi.util.inherits` does,\nand also attaches a :class:`~luigi.util.requires` method\nto your task (still all without pythonic inheritance).\n\nBut how does it know how to invoke the upstream task?  It uses :func:`~luigi.task.Task.clone`\nbehind the scenes!\n\n.. code-block:: python\n\n    import luigi\n    from luigi.util import inherits, requires\n\n    class TaskA(luigi.ExternalTask):\n        param_a = luigi.Parameter()\n\n        def output(self):\n            return luigi.LocalTarget('/tmp/log-{t.param_a}'.format(t=self))\n\n    @requires(TaskA)\n    class TaskB(luigi.Task):\n        param_b = luigi.Parameter()\n\n        # The class decorator does this for me!\n        # def requires(self):\n        #     return self.clone(TaskA)\n\nUse these helper functions effectively to avoid unnecessary\nrepetition and dodge a few potentially nasty workflow pitfalls at the same\ntime. 
Brilliant!\n\"\"\"\n\nimport datetime\nimport logging\n\nfrom luigi import parameter, task\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\ndef common_params(task_instance, task_cls):\n    \"\"\"\n    Grab all the values in task_instance that are found in task_cls.\n    \"\"\"\n    if not isinstance(task_cls, task.Register):\n        raise TypeError(\"task_cls must be an uninstantiated Task\")\n\n    task_instance_param_names = dict(task_instance.get_params()).keys()\n    task_cls_params_dict = dict(task_cls.get_params())\n    task_cls_param_names = task_cls_params_dict.keys()\n    common_param_names = set(task_instance_param_names).intersection(set(task_cls_param_names))\n    common_param_vals = [(key, task_cls_params_dict[key]) for key in common_param_names]\n    common_kwargs = dict((key, task_instance.param_kwargs[key]) for key in common_param_names)\n    vals = dict(task_instance.get_param_values(common_param_vals, [], common_kwargs))\n    return vals\n\n\nclass inherits:\n    \"\"\"\n    Task inheritance.\n\n    *New after Luigi 2.7.6:* multiple arguments support.\n\n    Usage:\n\n    .. 
code-block:: python\n\n        class AnotherTask(luigi.Task):\n            m = luigi.IntParameter()\n\n        class YetAnotherTask(luigi.Task):\n            n = luigi.IntParameter()\n\n        @inherits(AnotherTask)\n        class MyFirstTask(luigi.Task):\n            def requires(self):\n               return self.clone_parent()\n\n            def run(self):\n               print self.m # this will be defined\n               # ...\n\n        @inherits(AnotherTask, YetAnotherTask)\n        class MySecondTask(luigi.Task):\n            def requires(self):\n               return self.clone_parents()\n\n            def run(self):\n               print self.n # this will be defined\n               # ...\n    \"\"\"\n\n    def __init__(self, *tasks_to_inherit, **kw_tasks_to_inherit):\n        super(inherits, self).__init__()\n        if not tasks_to_inherit and not kw_tasks_to_inherit:\n            raise TypeError(\"tasks_to_inherit or kw_tasks_to_inherit must contain at least one task\")\n        if tasks_to_inherit and kw_tasks_to_inherit:\n            raise TypeError(\"Only one of tasks_to_inherit or kw_tasks_to_inherit may be present\")\n        self.tasks_to_inherit = tasks_to_inherit\n        self.kw_tasks_to_inherit = kw_tasks_to_inherit\n\n    def __call__(self, task_that_inherits):\n        # Get all parameter objects from each of the underlying tasks\n        task_iterator = self.tasks_to_inherit or self.kw_tasks_to_inherit.values()\n        for task_to_inherit in task_iterator:\n            for param_name, param_obj in task_to_inherit.get_params():\n                # Check if the parameter exists in the inheriting task\n                if not hasattr(task_that_inherits, param_name):\n                    # If not, add it to the inheriting task\n                    setattr(task_that_inherits, param_name, param_obj)\n\n        # Modify task_that_inherits by adding methods\n\n        # Handle unnamed tasks as a list, named as a dictionary\n        if 
self.tasks_to_inherit:\n\n            def clone_parent(_self, **kwargs):\n                return _self.clone(cls=self.tasks_to_inherit[0], **kwargs)\n\n            task_that_inherits.clone_parent = clone_parent\n\n            def clone_parents(_self, **kwargs):\n                return [_self.clone(cls=task_to_inherit, **kwargs) for task_to_inherit in self.tasks_to_inherit]\n\n            task_that_inherits.clone_parents = clone_parents\n        elif self.kw_tasks_to_inherit:\n            # Even if there is just one named task, return a dictionary\n            def clone_parents(_self, **kwargs):\n                return {task_name: _self.clone(cls=task_to_inherit, **kwargs) for task_name, task_to_inherit in self.kw_tasks_to_inherit.items()}\n\n            task_that_inherits.clone_parents = clone_parents\n\n        return task_that_inherits\n\n\nclass requires:\n    \"\"\"\n    Same as :class:`~luigi.util.inherits`, but also auto-defines the requires method.\n\n    *New after Luigi 2.7.6:* multiple arguments support.\n\n    \"\"\"\n\n    def __init__(self, *tasks_to_require, **kw_tasks_to_require):\n        super(requires, self).__init__()\n\n        self.tasks_to_require = tasks_to_require\n        self.kw_tasks_to_require = kw_tasks_to_require\n\n    def __call__(self, task_that_requires):\n        task_that_requires = inherits(*self.tasks_to_require, **self.kw_tasks_to_require)(task_that_requires)\n\n        # Modify task_that_requires by adding requires method.\n        # If only one task is required, this single task is returned.\n        # Otherwise, list of tasks is returned\n        def requires(_self):\n            return _self.clone_parent() if len(self.tasks_to_require) == 1 else _self.clone_parents()\n\n        task_that_requires.requires = requires\n\n        return task_that_requires\n\n\nclass copies:\n    \"\"\"\n    Auto-copies a task.\n\n    Usage:\n\n    .. 
code-block:: python\n\n        @copies(MyTask):\n        class CopyOfMyTask(luigi.Task):\n            def output(self):\n               return LocalTarget(self.date.strftime('/var/xyz/report-%Y-%m-%d'))\n    \"\"\"\n\n    def __init__(self, task_to_copy):\n        super(copies, self).__init__()\n        self.requires_decorator = requires(task_to_copy)\n\n    def __call__(self, task_that_copies):\n        task_that_copies = self.requires_decorator(task_that_copies)\n\n        # Modify task_that_copies by subclassing it and adding methods\n        @task._task_wraps(task_that_copies)\n        class Wrapped(task_that_copies):\n            def run(_self):\n                i, o = _self.input(), _self.output()\n                f = o.open(\"w\")  # TODO: assert that i, o are Target objects and not complex datastructures\n                for line in i.open(\"r\"):\n                    f.write(line)\n                f.close()\n\n        return Wrapped\n\n\ndef delegates(task_that_delegates):\n    \"\"\"Lets a task call methods on subtask(s).\n\n    The way this works is that the subtask is run as a part of the task, but\n    the task itself doesn't have to care about the requirements of the subtasks.\n    The subtask doesn't exist from the scheduler's point of view, and\n    its dependencies are instead required by the main task.\n\n    Example:\n\n    .. 
code-block:: python\n\n        class PowersOfN(luigi.Task):\n            n = luigi.IntParameter()\n            def f(self, x): return x ** self.n\n\n        @delegates\n        class T(luigi.Task):\n            def subtasks(self): return PowersOfN(5)\n            def run(self): print self.subtasks().f(42)\n    \"\"\"\n    if not hasattr(task_that_delegates, \"subtasks\"):\n        # This method can (optionally) define a couple of delegate tasks that\n        # will be accessible as interfaces, meaning that the task can access\n        # those tasks and run methods defined on them, etc\n        raise AttributeError('%s needs to implement the method \"subtasks\"' % task_that_delegates)\n\n    @task._task_wraps(task_that_delegates)\n    class Wrapped(task_that_delegates):\n        def deps(self):\n            # Overrides method in base class\n            return task.flatten(self.requires()) + task.flatten([t.deps() for t in task.flatten(self.subtasks())])\n\n        def run(self):\n            for t in task.flatten(self.subtasks()):\n                t.run()\n            task_that_delegates.run(self)\n\n    return Wrapped\n\n\ndef previous(task):\n    \"\"\"\n    Return a previous Task of the same family.\n\n    By default checks if this task family only has one non-global parameter and if\n    it is a DateParameter, DateHourParameter or DateIntervalParameter in which case\n    it returns with the time decremented by 1 (hour, day or interval)\n    \"\"\"\n    params = task.get_params()\n    previous_params = {}\n    previous_date_params = {}\n\n    for param_name, param_obj in params:\n        param_value = getattr(task, param_name)\n\n        if isinstance(param_obj, parameter.DateParameter):\n            previous_date_params[param_name] = param_value - datetime.timedelta(days=1)\n        elif isinstance(param_obj, parameter.DateSecondParameter):\n            previous_date_params[param_name] = param_value - datetime.timedelta(seconds=1)\n        elif 
isinstance(param_obj, parameter.DateMinuteParameter):\n            previous_date_params[param_name] = param_value - datetime.timedelta(minutes=1)\n        elif isinstance(param_obj, parameter.DateHourParameter):\n            previous_date_params[param_name] = param_value - datetime.timedelta(hours=1)\n        elif isinstance(param_obj, parameter.DateIntervalParameter):\n            previous_date_params[param_name] = param_value.prev()\n        else:\n            previous_params[param_name] = param_value\n\n    previous_params.update(previous_date_params)\n\n    if len(previous_date_params) == 0:\n        raise NotImplementedError(\"No task parameter - can't determine previous task\")\n    elif len(previous_date_params) > 1:\n        raise NotImplementedError(\"Too many date-related task parameters - can't determine previous task\")\n    else:\n        return task.clone(**previous_params)\n\n\ndef get_previous_completed(task, max_steps=10):\n    prev = task\n    for _ in range(max_steps):\n        prev = previous(prev)\n        logger.debug(\"Checking if %s is complete\", prev)\n        if prev.complete():\n            return prev\n    return None\n"
  },
  {
    "path": "luigi/worker.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nThe worker communicates with the scheduler and does two things:\n\n1. Sends all tasks that has to be run\n2. Gets tasks from the scheduler that should be run\n\nWhen running in local mode, the worker talks directly to a :py:class:`~luigi.scheduler.Scheduler` instance.\nWhen you run a central server, the worker will talk to the scheduler using a :py:class:`~luigi.rpc.RemoteScheduler` instance.\n\nEverything in this module is private to luigi and may change in incompatible\nways between versions. 
The exception is the exception types and the\n:py:class:`worker` config class.\n\"\"\"\n\nimport collections\nimport collections.abc\nimport contextlib\nimport datetime\nimport functools\nimport getpass\nimport importlib\nimport json\nimport logging\nimport multiprocessing\nimport os\nimport queue as Queue\nimport random\nimport signal\nimport socket\nimport subprocess\nimport sys\nimport threading\nimport time\nimport traceback\n\nfrom luigi import notifications\nfrom luigi.event import Event\nfrom luigi.parameter import BoolParameter, FloatParameter, IntParameter, OptionalParameter, Parameter, TimeDeltaParameter\nfrom luigi.scheduler import DISABLED, DONE, FAILED, PENDING, UNKNOWN, WORKER_STATE_ACTIVE, WORKER_STATE_DISABLED, RetryPolicy, Scheduler\nfrom luigi.target import Target\nfrom luigi.task import Config, DynamicRequirements, Task, flatten\nfrom luigi.task_register import TaskClassException, load_task\nfrom luigi.task_status import RUNNING\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n# Prevent fork() from being called during a C-level getaddrinfo() which uses a process-global mutex,\n# that may not be unlocked in child process, resulting in the process being locked indefinitely.\nfork_lock = threading.Lock()\n\n# Why we assert on _WAIT_INTERVAL_EPS:\n# multiprocessing.Queue.get() is undefined for timeout=0 it seems:\n# https://docs.python.org/3.4/library/multiprocessing.html#multiprocessing.Queue.get.\n# I also tried with really low epsilon, but then ran into the same issue where\n# the test case \"test_external_dependency_worker_is_patient\" got stuck. 
So I\n# unscientifically just set the final value to a floating point number that\n# \"worked for me\".\n_WAIT_INTERVAL_EPS = 0.00001\n\n\ndef _is_external(task):\n    return task.run is None or task.run == NotImplemented\n\n\ndef _get_retry_policy_dict(task):\n    return RetryPolicy(task.retry_count, task.disable_hard_timeout, task.disable_window)._asdict()\n\n\nclass TaskException(Exception):\n    pass\n\n\nGetWorkResponse = collections.namedtuple(\n    \"GetWorkResponse\",\n    (\n        \"task_id\",\n        \"running_tasks\",\n        \"n_pending_tasks\",\n        \"n_unique_pending\",\n        \"n_pending_last_scheduled\",\n        \"worker_state\",\n    ),\n)\n\n\nclass TaskProcess(multiprocessing.Process):\n    \"\"\"Wrap all task execution in this class.\n\n    Mainly for convenience since this is run in a separate process.\"\"\"\n\n    # mapping of status_reporter attributes to task attributes that are added to tasks\n    # before they actually run, and removed afterwards\n    forward_reporter_attributes = {\n        \"update_tracking_url\": \"set_tracking_url\",\n        \"update_status_message\": \"set_status_message\",\n        \"update_progress_percentage\": \"set_progress_percentage\",\n        \"decrease_running_resources\": \"decrease_running_resources\",\n        \"scheduler_messages\": \"scheduler_messages\",\n    }\n\n    def __init__(\n        self,\n        task,\n        worker_id,\n        result_queue,\n        status_reporter,\n        use_multiprocessing=False,\n        worker_timeout=0,\n        check_unfulfilled_deps=True,\n        check_complete_on_run=False,\n        task_completion_cache=None,\n    ):\n        super(TaskProcess, self).__init__()\n        self.task = task\n        self.worker_id = worker_id\n        self.result_queue = result_queue\n        self.status_reporter = status_reporter\n        self.worker_timeout = task.worker_timeout if task.worker_timeout is not None else worker_timeout\n        self.timeout_time = 
time.time() + self.worker_timeout if self.worker_timeout else None\n        self.use_multiprocessing = use_multiprocessing or self.timeout_time is not None\n        self.check_unfulfilled_deps = check_unfulfilled_deps\n        self.check_complete_on_run = check_complete_on_run\n        self.task_completion_cache = task_completion_cache\n\n        # completeness check using the cache\n        self.check_complete = functools.partial(check_complete_cached, completion_cache=task_completion_cache)\n\n    def _run_get_new_deps(self):\n        task_gen = self.task.run()\n\n        if not isinstance(task_gen, collections.abc.Generator):\n            return None\n\n        next_send = None\n        while True:\n            try:\n                if next_send is None:\n                    requires = next(task_gen)\n                else:\n                    requires = task_gen.send(next_send)\n            except StopIteration:\n                return None\n\n            # if requires is not a DynamicRequirements, create one to use its default behavior\n            if not isinstance(requires, DynamicRequirements):\n                requires = DynamicRequirements(requires)\n\n            if not requires.complete(self.check_complete):\n                # not all requirements are complete, return them which adds them to the tree\n                new_deps = [(t.task_module, t.task_family, t.to_str_params()) for t in requires.flat_requirements]\n                return new_deps\n\n            # get the next generator result\n            next_send = requires.paths\n\n    def run(self):\n        logger.info(\"[pid %s] Worker %s running   %s\", os.getpid(), self.worker_id, self.task)\n\n        if self.use_multiprocessing:\n            # Need to have different random seeds if running in separate processes\n            processID = os.getpid()\n            currentTime = time.time()\n            random.seed(processID * currentTime)\n\n        status = FAILED\n        expl = \"\"\n        
missing = []\n        new_deps = []\n        try:\n            # Verify that all the tasks are fulfilled! For external tasks we\n            # don't care about unfulfilled dependencies, because we are just\n            # checking completeness of self.task so outputs of dependencies are\n            # irrelevant.\n            if self.check_unfulfilled_deps and not _is_external(self.task):\n                missing = []\n                for dep in self.task.deps():\n                    if not self.check_complete(dep):\n                        nonexistent_outputs = [output for output in flatten(dep.output()) if not output.exists()]\n                        if nonexistent_outputs:\n                            missing.append(f\"{dep.task_id} ({', '.join(map(str, nonexistent_outputs))})\")\n                        else:\n                            missing.append(dep.task_id)\n                if missing:\n                    deps = \"dependency\" if len(missing) == 1 else \"dependencies\"\n                    raise RuntimeError(\"Unfulfilled %s at run time: %s\" % (deps, \", \".join(missing)))\n            self.task.trigger_event(Event.START, self.task)\n            t0 = time.time()\n            status = None\n\n            if _is_external(self.task):\n                # External task\n                if self.check_complete(self.task):\n                    status = DONE\n                else:\n                    status = FAILED\n                    expl = \"Task is an external data dependency and data does not exist (yet?).\"\n            else:\n                with self._forward_attributes():\n                    new_deps = self._run_get_new_deps()\n                if not new_deps:\n                    if not self.check_complete_on_run:\n                        # update the cache\n                        if self.task_completion_cache is not None:\n                            self.task_completion_cache[self.task.task_id] = True\n                        status = DONE\n     
               elif self.check_complete(self.task):\n                        status = DONE\n                    else:\n                        raise TaskException(\"Task finished running, but complete() is still returning false.\")\n                else:\n                    status = PENDING\n\n            if new_deps:\n                logger.info(\"[pid %s] Worker %s new requirements      %s\", os.getpid(), self.worker_id, self.task)\n            elif status == DONE:\n                self.task.trigger_event(Event.PROCESSING_TIME, self.task, time.time() - t0)\n                expl = self.task.on_success()\n                logger.info(\"[pid %s] Worker %s done      %s\", os.getpid(), self.worker_id, self.task)\n                self.task.trigger_event(Event.SUCCESS, self.task)\n\n        except KeyboardInterrupt:\n            raise\n        except BaseException as ex:\n            status = FAILED\n            expl = self._handle_run_exception(ex)\n\n        finally:\n            self.result_queue.put((self.task.task_id, status, expl, missing, new_deps))\n\n    def _handle_run_exception(self, ex):\n        logger.exception(\"[pid %s] Worker %s failed    %s\", os.getpid(), self.worker_id, self.task)\n        self.task.trigger_event(Event.FAILURE, self.task, ex)\n        return self.task.on_failure(ex)\n\n    def _recursive_terminate(self):\n        import psutil\n\n        try:\n            parent = psutil.Process(self.pid)\n            children = parent.children(recursive=True)\n\n            # terminate parent. 
Give it a chance to clean up\n            super(TaskProcess, self).terminate()\n            parent.wait()\n\n            # terminate children\n            for child in children:\n                try:\n                    child.terminate()\n                except psutil.NoSuchProcess:\n                    continue\n        except psutil.NoSuchProcess:\n            return\n\n    def terminate(self):\n        \"\"\"Terminate this process and its subprocesses.\"\"\"\n        # default terminate() doesn't cleanup child processes, it orphans them.\n        try:\n            return self._recursive_terminate()\n        except ImportError:\n            return super(TaskProcess, self).terminate()\n\n    @contextlib.contextmanager\n    def _forward_attributes(self):\n        # forward configured attributes to the task\n        for reporter_attr, task_attr in self.forward_reporter_attributes.items():\n            setattr(self.task, task_attr, getattr(self.status_reporter, reporter_attr))\n        try:\n            yield self\n        finally:\n            # reset attributes again\n            for reporter_attr, task_attr in self.forward_reporter_attributes.items():\n                setattr(self.task, task_attr, None)\n\n\n# This code and the task_process_context config key currently feels a bit ad-hoc.\n# Discussion on generalizing it into a plugin system: https://github.com/spotify/luigi/issues/1897\nclass ContextManagedTaskProcess(TaskProcess):\n    def __init__(self, context, *args, **kwargs):\n        super(ContextManagedTaskProcess, self).__init__(*args, **kwargs)\n        self.context = context\n\n    def run(self):\n        if self.context:\n            logger.debug(\"Importing module and instantiating \" + self.context)\n            module_path, class_name = self.context.rsplit(\".\", 1)\n            module = importlib.import_module(module_path)\n            cls = getattr(module, class_name)\n\n            with cls(self):\n                
super(ContextManagedTaskProcess, self).run()\n        else:\n            super(ContextManagedTaskProcess, self).run()\n\n\nclass TaskStatusReporter:\n    \"\"\"\n    Reports task status information to the scheduler.\n\n    This object must be pickle-able for passing to `TaskProcess` on systems\n    where fork method needs to pickle the process object (e.g.  Windows).\n    \"\"\"\n\n    def __init__(self, scheduler, task_id, worker_id, scheduler_messages):\n        self._task_id = task_id\n        self._worker_id = worker_id\n        self._scheduler = scheduler\n        self.scheduler_messages = scheduler_messages\n\n    def update_tracking_url(self, tracking_url):\n        self._scheduler.add_task(task_id=self._task_id, worker=self._worker_id, status=RUNNING, tracking_url=tracking_url)\n\n    def update_status_message(self, message):\n        self._scheduler.set_task_status_message(self._task_id, message)\n\n    def update_progress_percentage(self, percentage):\n        self._scheduler.set_task_progress_percentage(self._task_id, percentage)\n\n    def decrease_running_resources(self, decrease_resources):\n        self._scheduler.decrease_running_task_resources(self._task_id, decrease_resources)\n\n    def report_task_statistics(self, statistics):\n        self._scheduler.report_task_statistics(self._task_id, statistics)\n\n\nclass SchedulerMessage:\n    \"\"\"\n    Message object that is build by the the :py:class:`Worker` when a message from the scheduler is\n    received and passed to the message queue of a :py:class:`Task`.\n    \"\"\"\n\n    def __init__(self, scheduler, task_id, message_id, content, **payload):\n        super(SchedulerMessage, self).__init__()\n\n        self._scheduler = scheduler\n        self._task_id = task_id\n        self._message_id = message_id\n\n        self.content = content\n        self.payload = payload\n\n    def __str__(self):\n        return str(self.content)\n\n    def __eq__(self, other):\n        return self.content == 
other\n\n    def respond(self, response):\n        self._scheduler.add_scheduler_message_response(self._task_id, self._message_id, response)\n\n\nclass SingleProcessPool:\n    \"\"\"\n    Dummy process pool for using a single processor.\n\n    Imitates the api of multiprocessing.Pool using single-processor equivalents.\n    \"\"\"\n\n    def apply_async(self, function, args):\n        return function(*args)\n\n    def close(self):\n        pass\n\n    def join(self):\n        pass\n\n\nclass DequeQueue(collections.deque):\n    \"\"\"\n    deque wrapper implementing the Queue interface.\n    \"\"\"\n\n    def put(self, obj, block=None, timeout=None):\n        return self.append(obj)\n\n    def get(self, block=None, timeout=None):\n        try:\n            return self.pop()\n        except IndexError:\n            raise Queue.Empty\n\n\nclass AsyncCompletionException(Exception):\n    \"\"\"\n    Exception indicating that something went wrong with checking complete.\n    \"\"\"\n\n    def __init__(self, trace):\n        self.trace = trace\n\n\nclass TracebackWrapper:\n    \"\"\"\n    Class to wrap tracebacks so we can know they're not just strings.\n    \"\"\"\n\n    def __init__(self, trace):\n        self.trace = trace\n\n\ndef check_complete_cached(task, completion_cache=None):\n    # check if cached and complete\n    cache_key = task.task_id\n    if completion_cache is not None and completion_cache.get(cache_key):\n        return True\n\n    # (re-)check the status\n    is_complete = task.complete()\n\n    # tell the cache when complete\n    if completion_cache is not None and is_complete:\n        completion_cache[cache_key] = is_complete\n\n    return is_complete\n\n\ndef check_complete(task, out_queue, completion_cache=None):\n    \"\"\"\n    Checks if task is complete, puts the result to out_queue, optionally using the completion cache.\n    \"\"\"\n    logger.debug(\"Checking if %s is complete\", task)\n    try:\n        is_complete = 
check_complete_cached(task, completion_cache)\n    except Exception:\n        is_complete = TracebackWrapper(traceback.format_exc())\n    out_queue.put((task, is_complete))\n\n\nclass worker(Config):\n    # NOTE: `section.config-variable` in the config_path argument is deprecated in favor of `worker.config_variable`\n\n    id = Parameter(default=\"\", description=\"Override the auto-generated worker_id\")\n    ping_interval = FloatParameter(default=1.0, config_path=dict(section=\"core\", name=\"worker-ping-interval\"))\n    keep_alive = BoolParameter(default=False, config_path=dict(section=\"core\", name=\"worker-keep-alive\"))\n    count_uniques = BoolParameter(\n        default=False,\n        config_path=dict(section=\"core\", name=\"worker-count-uniques\"),\n        description=\"worker-count-uniques means that we will keep a worker alive only if it has a unique pending task, as well as having keep-alive true\",\n    )\n    count_last_scheduled = BoolParameter(default=False, description=\"Keep a worker alive only if there are pending tasks which it was the last to schedule.\")\n    wait_interval = FloatParameter(default=1.0, config_path=dict(section=\"core\", name=\"worker-wait-interval\"))\n    wait_jitter = FloatParameter(default=5.0)\n\n    max_keep_alive_idle_duration = TimeDeltaParameter(default=datetime.timedelta(0))\n\n    max_reschedules = IntParameter(default=1, config_path=dict(section=\"core\", name=\"worker-max-reschedules\"))\n    timeout = IntParameter(default=0, config_path=dict(section=\"core\", name=\"worker-timeout\"))\n    task_limit = IntParameter(default=None, config_path=dict(section=\"core\", name=\"worker-task-limit\"))\n    retry_external_tasks = BoolParameter(\n        default=False,\n        config_path=dict(section=\"core\", name=\"retry-external-tasks\"),\n        description=\"If true, incomplete external tasks will be retested for completion while Luigi is running.\",\n    )\n    send_failure_email = BoolParameter(default=True, 
description=\"If true, send e-mails directly from the workeron failure\")\n    no_install_shutdown_handler = BoolParameter(default=False, description=\"If true, the SIGUSR1 shutdown handler willNOT be install on the worker\")\n    check_unfulfilled_deps = BoolParameter(default=True, description=\"If true, check for completeness of dependencies before running a task\")\n    check_complete_on_run = BoolParameter(\n        default=False,\n        description=\"If true, only mark tasks as done after running if they are complete. \"\n        \"Regardless of this setting, the worker will always check if external \"\n        \"tasks are complete before marking them as done.\",\n    )\n    force_multiprocessing = BoolParameter(default=False, description=\"If true, use multiprocessing also when running with 1 worker\")\n    task_process_context = OptionalParameter(\n        default=None,\n        description=\"If set to a fully qualified class name, the class will \"\n        \"be instantiated with a TaskProcess as its constructor parameter and \"\n        \"applied as a context manager around its run() call, so this can be \"\n        \"used for obtaining high level customizable monitoring or logging of \"\n        \"each individual Task run.\",\n    )\n    cache_task_completion = BoolParameter(\n        default=False,\n        description=\"If true, cache the response of successful completion checks \"\n        \"of tasks assigned to a worker. 
This can especially speed up tasks with \"\n        \"dynamic dependencies but assumes that the completion status does not change \"\n        \"after it was true the first time.\",\n    )\n\n\nclass KeepAliveThread(threading.Thread):\n    \"\"\"\n    Periodically tell the scheduler that the worker still lives.\n    \"\"\"\n\n    def __init__(self, scheduler, worker_id, ping_interval, rpc_message_callback):\n        super(KeepAliveThread, self).__init__()\n        self._should_stop = threading.Event()\n        self._scheduler = scheduler\n        self._worker_id = worker_id\n        self._ping_interval = ping_interval\n        self._rpc_message_callback = rpc_message_callback\n\n    def stop(self):\n        self._should_stop.set()\n\n    def run(self):\n        while True:\n            self._should_stop.wait(self._ping_interval)\n            if self._should_stop.is_set():\n                logger.info(\"Worker %s was stopped. Shutting down Keep-Alive thread\" % self._worker_id)\n                break\n            with fork_lock:\n                response = None\n                try:\n                    response = self._scheduler.ping(worker=self._worker_id)\n                except BaseException:  # httplib.BadStatusLine:\n                    logger.warning(\"Failed pinging scheduler\")\n\n                # handle rpc messages\n                if response:\n                    for message in response[\"rpc_messages\"]:\n                        self._rpc_message_callback(message)\n\n\ndef rpc_message_callback(fn):\n    fn.is_rpc_message_callback = True\n    return fn\n\n\nclass Worker:\n    \"\"\"\n    Worker object communicates with a scheduler.\n\n    Simple class that talks to a scheduler and:\n\n    * tells the scheduler what it has to do + its dependencies\n    * asks for stuff to do (pulls it in a loop and runs it)\n    \"\"\"\n\n    def __init__(self, scheduler=None, worker_id=None, worker_processes=1, assistant=False, **kwargs):\n        if scheduler is 
None:\n            scheduler = Scheduler()\n\n        self.worker_processes = int(worker_processes)\n        self._worker_info = self._generate_worker_info()\n\n        self._config = worker(**kwargs)\n\n        worker_id = worker_id or self._config.id or self._generate_worker_id(self._worker_info)\n\n        assert self._config.wait_interval >= _WAIT_INTERVAL_EPS, \"[worker] wait_interval must be positive\"\n        assert self._config.wait_jitter >= 0.0, \"[worker] wait_jitter must be equal or greater than zero\"\n\n        self._id = worker_id\n        self._scheduler = scheduler\n        self._assistant = assistant\n        self._stop_requesting_work = False\n\n        self.host = socket.gethostname()\n        self._scheduled_tasks = {}\n        self._suspended_tasks = {}\n        self._batch_running_tasks = {}\n        self._batch_families_sent = set()\n\n        self._first_task = None\n\n        self.add_succeeded = True\n        self.run_succeeded = True\n\n        self.unfulfilled_counts = collections.defaultdict(int)\n\n        # note that ``signal.signal(signal.SIGUSR1, fn)`` only works inside the main execution thread, which is why we\n        # provide the ability to conditionally install the hook.\n        if not self._config.no_install_shutdown_handler:\n            try:\n                signal.signal(signal.SIGUSR1, self.handle_interrupt)\n                signal.siginterrupt(signal.SIGUSR1, False)\n            except AttributeError:\n                pass\n\n        # Keep info about what tasks are running (could be in other processes)\n        self._task_result_queue = multiprocessing.Queue()\n        self._running_tasks = {}\n        self._idle_since = None\n\n        # mp-safe dictionary for caching completation checks across task processes\n        self._task_completion_cache = None\n        if self._config.cache_task_completion:\n            self._task_completion_cache = multiprocessing.Manager().dict()\n\n        # Stuff for execution_summary\n 
       self._add_task_history = []\n        self._get_work_response_history = []\n\n    def _add_task(self, *args, **kwargs):\n        \"\"\"\n        Call ``self._scheduler.add_task``, but store the values too so we can\n        implement :py:func:`luigi.execution_summary.summary`.\n        \"\"\"\n        task_id = kwargs[\"task_id\"]\n        status = kwargs[\"status\"]\n        runnable = kwargs[\"runnable\"]\n        task = self._scheduled_tasks.get(task_id)\n        if task:\n            self._add_task_history.append((task, status, runnable))\n            kwargs[\"owners\"] = task._owner_list()\n\n        if task_id in self._batch_running_tasks:\n            for batch_task in self._batch_running_tasks.pop(task_id):\n                self._add_task_history.append((batch_task, status, True))\n\n        if task and kwargs.get(\"params\"):\n            kwargs[\"param_visibilities\"] = task._get_param_visibilities()\n\n        self._scheduler.add_task(*args, **kwargs)\n\n        logger.info(\"Informed scheduler that task   %s   has status   %s\", task_id, status)\n\n    def __enter__(self):\n        \"\"\"\n        Start the KeepAliveThread.\n        \"\"\"\n        self._keep_alive_thread = KeepAliveThread(self._scheduler, self._id, self._config.ping_interval, self._handle_rpc_message)\n        self._keep_alive_thread.daemon = True\n        self._keep_alive_thread.start()\n        return self\n\n    def __exit__(self, type, value, traceback):\n        \"\"\"\n        Stop the KeepAliveThread and kill still running tasks.\n        \"\"\"\n        self._keep_alive_thread.stop()\n        self._keep_alive_thread.join()\n        for task in self._running_tasks.values():\n            if task.is_alive():\n                task.terminate()\n        self._task_result_queue.close()\n        return False  # Don't suppress exception\n\n    def _generate_worker_info(self):\n        # Generate as much info as possible about the worker\n        # Some of these calls might not be 
available on all OS's\n        args = [(\"salt\", \"%09d\" % random.randrange(0, 10_000_000_000)), (\"workers\", self.worker_processes)]\n        try:\n            args += [(\"host\", socket.gethostname())]\n        except BaseException:\n            pass\n        try:\n            args += [(\"username\", getpass.getuser())]\n        except BaseException:\n            pass\n        try:\n            args += [(\"pid\", os.getpid())]\n        except BaseException:\n            pass\n        try:\n            sudo_user = os.getenv(\"SUDO_USER\")\n            if sudo_user:\n                args.append((\"sudo_user\", sudo_user))\n        except BaseException:\n            pass\n        return args\n\n    def _generate_worker_id(self, worker_info):\n        worker_info_str = \", \".join([\"{}={}\".format(k, v) for k, v in worker_info])\n        return \"Worker({})\".format(worker_info_str)\n\n    def _validate_task(self, task):\n        if not isinstance(task, Task):\n            raise TaskException(\"Can not schedule non-task %s\" % task)\n\n        if not task.initialized():\n            # we can't get the repr of it since it's not initialized...\n            raise TaskException(\"Task of class %s not initialized. 
Did you override __init__ and forget to call super(...).__init__?\" % task.__class__.__name__)\n\n    def _log_complete_error(self, task, tb):\n        log_msg = \"Will not run {task} or any dependencies due to error in complete() method:\\n{tb}\".format(task=task, tb=tb)\n        logger.warning(log_msg)\n\n    def _log_dependency_error(self, task, tb):\n        log_msg = \"Will not run {task} or any dependencies due to error in deps() method:\\n{tb}\".format(task=task, tb=tb)\n        logger.warning(log_msg)\n\n    def _log_unexpected_error(self, task):\n        logger.exception(\"Luigi unexpected framework error while scheduling %s\", task)  # needs to be called from within except clause\n\n    def _announce_scheduling_failure(self, task, expl):\n        try:\n            self._scheduler.announce_scheduling_failure(\n                worker=self._id,\n                task_name=str(task),\n                family=task.task_family,\n                params=task.to_str_params(only_significant=True),\n                expl=expl,\n                owners=task._owner_list(),\n            )\n        except Exception:\n            formatted_traceback = traceback.format_exc()\n            self._email_unexpected_error(task, formatted_traceback)\n            raise\n\n    def _email_complete_error(self, task, formatted_traceback):\n        self._announce_scheduling_failure(task, formatted_traceback)\n        if self._config.send_failure_email:\n            self._email_error(\n                task,\n                formatted_traceback,\n                subject=\"Luigi: {task} failed scheduling. 
Host: {host}\",\n                headline=\"Will not run {task} or any dependencies due to error in complete() method\",\n            )\n\n    def _email_dependency_error(self, task, formatted_traceback):\n        self._announce_scheduling_failure(task, formatted_traceback)\n        if self._config.send_failure_email:\n            self._email_error(\n                task,\n                formatted_traceback,\n                subject=\"Luigi: {task} failed scheduling. Host: {host}\",\n                headline=\"Will not run {task} or any dependencies due to error in deps() method\",\n            )\n\n    def _email_unexpected_error(self, task, formatted_traceback):\n        # this sends even if failure e-mails are disabled, as they may indicate\n        # a more severe failure that may not reach other alerting methods such\n        # as scheduler batch notification\n        self._email_error(\n            task,\n            formatted_traceback,\n            subject=\"Luigi: Framework error while scheduling {task}. Host: {host}\",\n            headline=\"Luigi framework error\",\n        )\n\n    def _email_task_failure(self, task, formatted_traceback):\n        if self._config.send_failure_email:\n            self._email_error(\n                task,\n                formatted_traceback,\n                subject=\"Luigi: {task} FAILED. Host: {host}\",\n                headline=\"A task failed when running. 
Most likely run() raised an exception.\",\n            )\n\n    def _email_error(self, task, formatted_traceback, subject, headline):\n        formatted_subject = subject.format(task=task, host=self.host)\n        formatted_headline = headline.format(task=task, host=self.host)\n        command = subprocess.list2cmdline(sys.argv)\n        message = notifications.format_task_error(formatted_headline, task, command, formatted_traceback)\n        notifications.send_error_email(formatted_subject, message, task.owner_email)\n\n    def _handle_task_load_error(self, exception, task_ids):\n        msg = \"Cannot find task(s) sent by scheduler: {}\".format(\",\".join(task_ids))\n        logger.exception(msg)\n        subject = \"Luigi: {}\".format(msg)\n        error_message = notifications.wrap_traceback(exception)\n        for task_id in task_ids:\n            self._add_task(\n                worker=self._id,\n                task_id=task_id,\n                status=FAILED,\n                runnable=False,\n                expl=error_message,\n            )\n        notifications.send_error_email(subject, error_message)\n\n    def add(self, task, multiprocess=False, processes=0):\n        \"\"\"\n        Add a Task for the worker to check and possibly schedule and run.\n\n        Returns True if task and its dependencies were successfully scheduled or completed before.\n        \"\"\"\n        if self._first_task is None and hasattr(task, \"task_id\"):\n            self._first_task = task.task_id\n        self.add_succeeded = True\n        if multiprocess:\n            queue = multiprocessing.Manager().Queue()\n            pool = multiprocessing.Pool(processes=processes if processes > 0 else None)\n        else:\n            queue = DequeQueue()\n            pool = SingleProcessPool()\n        self._validate_task(task)\n        pool.apply_async(check_complete, [task, queue, self._task_completion_cache])\n\n        # we track queue size ourselves because len(queue) won't 
work for multiprocessing\n        queue_size = 1\n        try:\n            seen = {task.task_id}\n            while queue_size:\n                current = queue.get()\n                queue_size -= 1\n                item, is_complete = current\n                for next in self._add(item, is_complete):\n                    if next.task_id not in seen:\n                        self._validate_task(next)\n                        seen.add(next.task_id)\n                        pool.apply_async(check_complete, [next, queue, self._task_completion_cache])\n                        queue_size += 1\n        except (KeyboardInterrupt, TaskException):\n            raise\n        except Exception as ex:\n            self.add_succeeded = False\n            formatted_traceback = traceback.format_exc()\n            self._log_unexpected_error(task)\n            task.trigger_event(Event.BROKEN_TASK, task, ex)\n            self._email_unexpected_error(task, formatted_traceback)\n            raise\n        finally:\n            pool.close()\n            pool.join()\n        return self.add_succeeded\n\n    def _add_task_batcher(self, task):\n        family = task.task_family\n        if family not in self._batch_families_sent:\n            task_class = type(task)\n            batch_param_names = task_class.batch_param_names()\n            if batch_param_names:\n                self._scheduler.add_task_batcher(\n                    worker=self._id,\n                    task_family=family,\n                    batched_args=batch_param_names,\n                    max_batch_size=task.max_batch_size,\n                )\n            self._batch_families_sent.add(family)\n\n    def _add(self, task, is_complete):\n        if self._config.task_limit is not None and len(self._scheduled_tasks) >= self._config.task_limit:\n            logger.warning(\"Will not run %s or any dependencies due to exceeded task-limit of %d\", task, self._config.task_limit)\n            deps = None\n            
status = UNKNOWN\n            runnable = False\n\n        else:\n            formatted_traceback = None\n            try:\n                self._check_complete_value(is_complete)\n            except KeyboardInterrupt:\n                raise\n            except AsyncCompletionException as ex:\n                formatted_traceback = ex.trace\n            except BaseException:\n                formatted_traceback = traceback.format_exc()\n\n            if formatted_traceback is not None:\n                self.add_succeeded = False\n                self._log_complete_error(task, formatted_traceback)\n                task.trigger_event(Event.DEPENDENCY_MISSING, task)\n                self._email_complete_error(task, formatted_traceback)\n                deps = None\n                status = UNKNOWN\n                runnable = False\n\n            elif is_complete:\n                deps = None\n                status = DONE\n                runnable = False\n                task.trigger_event(Event.DEPENDENCY_PRESENT, task)\n\n            elif _is_external(task):\n                deps = None\n                status = PENDING\n                runnable = self._config.retry_external_tasks\n                task.trigger_event(Event.DEPENDENCY_MISSING, task)\n                logger.warning(\"Data for %s does not exist (yet?). 
The task is an external data dependency, so it cannot be run from this luigi process.\", task)\n\n            else:\n                try:\n                    deps = task.deps()\n                    self._add_task_batcher(task)\n                except Exception as ex:\n                    formatted_traceback = traceback.format_exc()\n                    self.add_succeeded = False\n                    self._log_dependency_error(task, formatted_traceback)\n                    task.trigger_event(Event.BROKEN_TASK, task, ex)\n                    self._email_dependency_error(task, formatted_traceback)\n                    deps = None\n                    status = UNKNOWN\n                    runnable = False\n                else:\n                    status = PENDING\n                    runnable = True\n\n            if task.disabled:\n                status = DISABLED\n\n            if deps:\n                for d in deps:\n                    self._validate_dependency(d)\n                    task.trigger_event(Event.DEPENDENCY_DISCOVERED, task, d)\n                    yield d  # return additional tasks to add\n\n                deps = [d.task_id for d in deps]\n\n        self._scheduled_tasks[task.task_id] = task\n        self._add_task(\n            worker=self._id,\n            task_id=task.task_id,\n            status=status,\n            deps=deps,\n            runnable=runnable,\n            priority=task.priority,\n            resources=task.process_resources(),\n            params=task.to_str_params(),\n            family=task.task_family,\n            module=task.task_module,\n            batchable=task.batchable,\n            retry_policy_dict=_get_retry_policy_dict(task),\n            accepts_messages=task.accepts_messages,\n        )\n\n    def _validate_dependency(self, dependency):\n        if isinstance(dependency, Target):\n            raise Exception(\"requires() can not return Target objects. 
Wrap it in an ExternalTask class\")\n        elif not isinstance(dependency, Task):\n            raise Exception(\"requires() must return Task objects but {} is a {}\".format(dependency, type(dependency)))\n\n    def _check_complete_value(self, is_complete):\n        if is_complete not in (True, False):\n            if isinstance(is_complete, TracebackWrapper):\n                raise AsyncCompletionException(is_complete.trace)\n            raise Exception(\"Return value of Task.complete() must be boolean (was %r)\" % is_complete)\n\n    def _add_worker(self):\n        self._worker_info.append((\"first_task\", self._first_task))\n        self._scheduler.add_worker(self._id, self._worker_info)\n\n    def _log_remote_tasks(self, get_work_response):\n        logger.debug(\"Done\")\n        logger.debug(\"There are no more tasks to run at this time\")\n        if get_work_response.running_tasks:\n            for r in get_work_response.running_tasks:\n                logger.debug(\"%s is currently run by worker %s\", r[\"task_id\"], r[\"worker\"])\n        elif get_work_response.n_pending_tasks:\n            logger.debug(\"There are %s pending tasks possibly being run by other workers\", get_work_response.n_pending_tasks)\n            if get_work_response.n_unique_pending:\n                logger.debug(\"There are %i pending tasks unique to this worker\", get_work_response.n_unique_pending)\n            if get_work_response.n_pending_last_scheduled:\n                logger.debug(\"There are %i pending tasks last scheduled by this worker\", get_work_response.n_pending_last_scheduled)\n\n    def _get_work_task_id(self, get_work_response):\n        if get_work_response.get(\"task_id\") is not None:\n            return get_work_response[\"task_id\"]\n        elif \"batch_id\" in get_work_response:\n            try:\n                task = load_task(\n                    module=get_work_response.get(\"task_module\"),\n                    
task_name=get_work_response[\"task_family\"],\n                    params_str=get_work_response[\"task_params\"],\n                )\n            except Exception as ex:\n                self._handle_task_load_error(ex, get_work_response[\"batch_task_ids\"])\n                self.run_succeeded = False\n                return None\n\n            self._scheduler.add_task(\n                worker=self._id,\n                task_id=task.task_id,\n                module=get_work_response.get(\"task_module\"),\n                family=get_work_response[\"task_family\"],\n                params=task.to_str_params(),\n                status=RUNNING,\n                batch_id=get_work_response[\"batch_id\"],\n            )\n            return task.task_id\n        else:\n            return None\n\n    def _get_work(self):\n        if self._stop_requesting_work:\n            return GetWorkResponse(None, 0, 0, 0, 0, WORKER_STATE_DISABLED)\n\n        if self.worker_processes > 0:\n            logger.debug(\"Asking scheduler for work...\")\n            r = self._scheduler.get_work(\n                worker=self._id,\n                host=self.host,\n                assistant=self._assistant,\n                current_tasks=list(self._running_tasks.keys()),\n            )\n        else:\n            logger.debug(\"Checking if tasks are still pending\")\n            r = self._scheduler.count_pending(worker=self._id)\n\n        running_tasks = r[\"running_tasks\"]\n        task_id = self._get_work_task_id(r)\n\n        self._get_work_response_history.append(\n            {\n                \"task_id\": task_id,\n                \"running_tasks\": running_tasks,\n            }\n        )\n\n        if task_id is not None and task_id not in self._scheduled_tasks:\n            logger.info(\"Did not schedule %s, will load it dynamically\", task_id)\n\n            try:\n                # TODO: we should obtain the module name from the server!\n                
self._scheduled_tasks[task_id] = load_task(module=r.get(\"task_module\"), task_name=r[\"task_family\"], params_str=r[\"task_params\"])\n            except TaskClassException as ex:\n                self._handle_task_load_error(ex, [task_id])\n                task_id = None\n                self.run_succeeded = False\n\n        if task_id is not None and \"batch_task_ids\" in r:\n            batch_tasks = filter(None, [self._scheduled_tasks.get(batch_id) for batch_id in r[\"batch_task_ids\"]])\n            self._batch_running_tasks[task_id] = batch_tasks\n\n        return GetWorkResponse(\n            task_id=task_id,\n            running_tasks=running_tasks,\n            n_pending_tasks=r[\"n_pending_tasks\"],\n            n_unique_pending=r[\"n_unique_pending\"],\n            # TODO: For a tiny amount of time (a month?) we'll keep forwards compatibility\n            #  That is, you can use a newer client than server (Sep 2016)\n            n_pending_last_scheduled=r.get(\"n_pending_last_scheduled\", 0),\n            worker_state=r.get(\"worker_state\", WORKER_STATE_ACTIVE),\n        )\n\n    def _run_task(self, task_id):\n        if task_id in self._running_tasks:\n            logger.debug(\"Got already running task id {} from scheduler, taking a break\".format(task_id))\n            next(self._sleeper())\n            return\n\n        task = self._scheduled_tasks[task_id]\n\n        task_process = self._create_task_process(task)\n\n        self._running_tasks[task_id] = task_process\n\n        if task_process.use_multiprocessing:\n            with fork_lock:\n                task_process.start()\n        else:\n            # Run in the same process\n            task_process.run()\n\n    def _create_task_process(self, task):\n        message_queue = multiprocessing.Queue() if task.accepts_messages else None\n        reporter = TaskStatusReporter(self._scheduler, task.task_id, self._id, message_queue)\n        use_multiprocessing = 
self._config.force_multiprocessing or bool(self.worker_processes > 1)\n        return ContextManagedTaskProcess(\n            self._config.task_process_context,\n            task,\n            self._id,\n            self._task_result_queue,\n            reporter,\n            use_multiprocessing=use_multiprocessing,\n            worker_timeout=self._config.timeout,\n            check_unfulfilled_deps=self._config.check_unfulfilled_deps,\n            check_complete_on_run=self._config.check_complete_on_run,\n            task_completion_cache=self._task_completion_cache,\n        )\n\n    def _purge_children(self):\n        \"\"\"\n        Find dead children and put a response on the result queue.\n\n        :return:\n        \"\"\"\n        for task_id, p in self._running_tasks.items():\n            if not p.is_alive() and p.exitcode:\n                error_msg = \"Task {} died unexpectedly with exit code {}\".format(task_id, p.exitcode)\n                p.task.trigger_event(Event.PROCESS_FAILURE, p.task, error_msg)\n            elif p.timeout_time is not None and time.time() > float(p.timeout_time) and p.is_alive():\n                p.terminate()\n                error_msg = \"Task {} timed out after {} seconds and was terminated.\".format(task_id, p.worker_timeout)\n                p.task.trigger_event(Event.TIMEOUT, p.task, error_msg)\n            else:\n                continue\n\n            logger.info(error_msg)\n            self._task_result_queue.put((task_id, FAILED, error_msg, [], []))\n\n    def _handle_next_task(self):\n        \"\"\"\n        We have to catch three ways a task can be \"done\":\n\n        1. normal execution: the task runs/fails and puts a result back on the queue,\n        2. new dependencies: the task yielded new deps that were not complete and\n           will be rescheduled and dependencies added,\n        3. 
child process dies: we need to catch this separately.\n        \"\"\"\n        self._idle_since = None\n        while True:\n            self._purge_children()  # Deal with subprocess failures\n\n            try:\n                task_id, status, expl, missing, new_requirements = self._task_result_queue.get(timeout=self._config.wait_interval)\n            except Queue.Empty:\n                return\n\n            task = self._scheduled_tasks[task_id]\n            if not task or task_id not in self._running_tasks:\n                continue\n                # Not a running task. Probably already removed.\n                # Maybe it yielded something?\n\n            # external task if run not implemented, retry-able if config option is enabled.\n            external_task_retryable = _is_external(task) and self._config.retry_external_tasks\n            if status == FAILED and not external_task_retryable:\n                self._email_task_failure(task, expl)\n\n            new_deps = []\n            if new_requirements:\n                new_req = [load_task(module, name, params) for module, name, params in new_requirements]\n                for t in new_req:\n                    self.add(t)\n                new_deps = [t.task_id for t in new_req]\n\n            self._add_task(\n                worker=self._id,\n                task_id=task_id,\n                status=status,\n                expl=json.dumps(expl),\n                resources=task.process_resources(),\n                runnable=None,\n                params=task.to_str_params(),\n                family=task.task_family,\n                module=task.task_module,\n                new_deps=new_deps,\n                assistant=self._assistant,\n                retry_policy_dict=_get_retry_policy_dict(task),\n            )\n\n            self._running_tasks.pop(task_id)\n\n            # re-add task to reschedule missing dependencies\n            if missing:\n                reschedule = True\n\n                
# keep out of infinite loops by not rescheduling too many times\n                for task_id in missing:\n                    self.unfulfilled_counts[task_id] += 1\n                    if self.unfulfilled_counts[task_id] > self._config.max_reschedules:\n                        reschedule = False\n                if reschedule:\n                    self.add(task)\n\n            self.run_succeeded &= (status == DONE) or (len(new_deps) > 0)\n            return\n\n    def _sleeper(self):\n        # TODO is exponential backoff necessary?\n        while True:\n            jitter = self._config.wait_jitter\n            wait_interval = self._config.wait_interval + random.uniform(0, jitter)\n            logger.debug(\"Sleeping for %f seconds\", wait_interval)\n            time.sleep(wait_interval)\n            yield\n\n    def _keep_alive(self, get_work_response):\n        \"\"\"\n        Returns true if a worker should stay alive given the latest get_work response.\n\n        If worker-keep-alive is not set, this will always return false.\n        For an assistant, it will always return the value of worker-keep-alive.\n        Otherwise, it will return true for nonzero n_pending_tasks.\n\n        If worker-count-uniques is true, it will also\n        require that one of the tasks is unique to this worker.\n        \"\"\"\n        if not self._config.keep_alive:\n            return False\n        elif self._assistant:\n            return True\n        elif self._config.count_last_scheduled:\n            return get_work_response.n_pending_last_scheduled > 0\n        elif self._config.count_uniques:\n            return get_work_response.n_unique_pending > 0\n        elif get_work_response.n_pending_tasks == 0:\n            return False\n        elif not self._config.max_keep_alive_idle_duration:\n            return True\n        elif not self._idle_since:\n            return True\n        else:\n            time_to_shutdown = self._idle_since + self._config.max_keep_alive_idle_duration - 
datetime.datetime.now()\n            logger.debug(\"[%s] %s until shutdown\", self._id, time_to_shutdown)\n            return time_to_shutdown > datetime.timedelta(0)\n\n    def handle_interrupt(self, signum, _):\n        \"\"\"\n        Stops the assistant from asking for more work on SIGUSR1\n        \"\"\"\n        if signum == signal.SIGUSR1:\n            self._start_phasing_out()\n\n    def _start_phasing_out(self):\n        \"\"\"\n        Go into a mode where we don't ask for more work and quit once existing\n        tasks are done.\n        \"\"\"\n        self._config.keep_alive = False\n        self._stop_requesting_work = True\n\n    def run(self):\n        \"\"\"\n        Returns True if all scheduled tasks were executed successfully.\n        \"\"\"\n        logger.info(\"Running Worker with %d processes\", self.worker_processes)\n\n        sleeper = self._sleeper()\n        self.run_succeeded = True\n\n        self._add_worker()\n\n        while True:\n            while len(self._running_tasks) >= self.worker_processes > 0:\n                logger.debug(\"%d running tasks, waiting for next task to finish\", len(self._running_tasks))\n                self._handle_next_task()\n\n            get_work_response = self._get_work()\n\n            if get_work_response.worker_state == WORKER_STATE_DISABLED:\n                self._start_phasing_out()\n\n            if get_work_response.task_id is None:\n                if not self._stop_requesting_work:\n                    self._log_remote_tasks(get_work_response)\n                if len(self._running_tasks) == 0:\n                    self._idle_since = self._idle_since or datetime.datetime.now()\n                    if self._keep_alive(get_work_response):\n                        next(sleeper)\n                        continue\n                    else:\n                        break\n                else:\n                    self._handle_next_task()\n                    continue\n\n            # task_id is 
not None:\n            logger.debug(\"Pending tasks: %s\", get_work_response.n_pending_tasks)\n            self._run_task(get_work_response.task_id)\n\n        while len(self._running_tasks):\n            logger.debug(\"Shut down Worker, %d more tasks to go\", len(self._running_tasks))\n            self._handle_next_task()\n\n        return self.run_succeeded\n\n    def _handle_rpc_message(self, message):\n        logger.info(\"Worker %s got message %s\" % (self._id, message))\n\n        # the message is a dict {'name': <function_name>, 'kwargs': <function_kwargs>}\n        name = message[\"name\"]\n        kwargs = message[\"kwargs\"]\n\n        # find the function and check if it's callable and configured to work\n        # as a message callback\n        func = getattr(self, name, None)\n        tpl = (self._id, name)\n        if not callable(func):\n            logger.error(\"Worker %s has no function '%s'\" % tpl)\n        elif not getattr(func, \"is_rpc_message_callback\", False):\n            logger.error(\"Worker %s function '%s' is not available as rpc message callback\" % tpl)\n        else:\n            logger.info(\"Worker %s successfully dispatched rpc message to function '%s'\" % tpl)\n            func(**kwargs)\n\n    @rpc_message_callback\n    def set_worker_processes(self, n):\n        # set the new value\n        self.worker_processes = max(1, n)\n\n        # tell the scheduler\n        self._scheduler.add_worker(self._id, {\"workers\": self.worker_processes})\n\n    @rpc_message_callback\n    def dispatch_scheduler_message(self, task_id, message_id, content, **kwargs):\n        task_id = str(task_id)\n        if task_id in self._running_tasks:\n            task_process = self._running_tasks[task_id]\n            if task_process.status_reporter.scheduler_messages:\n                message = SchedulerMessage(self._scheduler, task_id, message_id, content, **kwargs)\n                task_process.status_reporter.scheduler_messages.put(message)\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[build-system]\nrequires = ['hatchling', 'hatch-fancy-pypi-readme']\nbuild-backend = 'hatchling.build'\n\n[project]\nname = \"luigi\"\ndescription = \"Workflow mgmgt + task scheduling + dependency resolution.\"\nauthors = [\n  {name = \"The Luigi Authors\"}\n]\nlicense = {file = \"LICENSE\"}\nrequires-python = \">=3.10, <3.14\"\ndependencies = [\n  \"python-dateutil>=2.7.5,<3\",\n  \"tenacity>=9\",\n  \"tornado>=5.0,<7\",\n  \"python-daemon<2.2.0; sys_platform == 'win32'\",\n  \"python-daemon; sys_platform != 'win32'\",\n  \"typing-extensions>=4.12.2\",\n]\nclassifiers = [\n  \"Development Status :: 5 - Production/Stable\",\n  \"Environment :: Console\",\n  \"Environment :: Web Environment\",\n  \"Intended Audience :: Developers\",\n  \"Intended Audience :: System Administrators\",\n  \"License :: OSI Approved :: Apache Software License\",\n  \"Programming Language :: Python :: 3.10\",\n  \"Programming Language :: Python :: 3.11\",\n  \"Programming Language :: Python :: 3.12\",\n  \"Programming Language :: Python :: 3.13\",\n  \"Topic :: System :: Monitoring\",\n]\ndynamic = [\"version\", \"readme\"]\n\n[project.urls]\nHomepage = \"https://github.com/spotify/luigi\"\n\n[project.scripts]\nluigi = \"luigi.cmdline:luigi_run\"\nluigid = \"luigi.cmdline:luigid\"\nluigi-grep = \"luigi.tools.luigi_grep:main\"\nluigi-deps = \"luigi.tools.deps:main\"\nluigi-deps-tree = \"luigi.tools.deps_tree:main\"\n\n[project.optional-dependencies]\njsonschema = [\"jsonschema\"]\nprometheus = [\"prometheus-client>=0.5,<0.25\"]\ntoml = [\"toml<2.0.0\"]\n\n[dependency-groups]\n# groups and dependencies should be sort in lexicographical order\ncdh = [\n  \"hdfs>=2.0.4,<3.0.0\",\n]\n\ncommon = [\n  \"avro-python3\",\n  \"azure-storage-blob<=12.20.0\",\n  \"boto>=2.42,<3.0\",\n  \"boto3>=1.11.0\",\n  \"codecov>=1.4.0\",\n  \"coverage>=5.0,<6\",\n  \"datadog==0.22.0\",\n  \"docker>=2.1.0\",\n  \"elasticsearch>=1.0.0,<2.0.0\",\n  \"google-compute-engine\",\n  
\"HTTPretty==0.8.10\",\n  \"hypothesis>=6.7.0,<7.0.0\",\n  \"jsonschema\",\n  \"mock<2.0\",\n  \"moto>=1.3.10,<5.0\",\n  \"mypy\",\n  \"mysql-connector-python\",\n  \"prometheus-client>=0.5.0,<0.25\",\n  \"psutil<4.0\",\n  \"pygments\",\n  \"pyhive[presto]==0.6.1\",\n  \"pymongo==3.4.0\",\n  \"pytest\",\n  \"pytest-cov\",\n  \"pytest-xdist\",\n  \"requests>=2.20.0,<=2.31.0\",\n  \"responses<1.0.0\",\n  \"s3transfer>=0.3,<4.0\",\n  \"selenium==3.0.2\",\n  \"sqlalchemy<1.4\",\n  \"toml<2.0.0\",\n  \"types-python-dateutil\",\n  \"types-requests\",\n  \"types-toml\",\n]\n\ndocs = [\n  \"azure-storage-blob<=12.28.0\",\n  \"jinja2>=3.1,<4\",\n  \"mypy\",\n  \"prometheus-client>=0.5.0,<0.25\",\n  \"Sphinx>=9.0,<10; python_version >= '3.12'\",\n  \"sphinx-rtd-theme>=2.0; python_version >= '3.12'\",\n  \"sqlalchemy\",\n]\n\ndropbox = [\n  \"dropbox>=11.0.0\",\n]\n\ngcloud = [\n  \"google-api-python-client>=1.6.6,<2.0\",\n  \"google-auth==1.4.1\",\n  \"google-auth-httplib2==0.0.3\",\n]\n\nhdp = [\n  \"hdfs>=2.0.4,<3.0.0\",\n]\n\nlint = [\n  \"ruff\",\n]\n\npostgres = [\n  \"pg8000>=1.23.0\",\n  \"psycopg2<3.0\",\n]\n\nunixsocket = [\n  \"requests-unixsocket<1.0\",\n]\n\n# for tox test dependencies\ntest_cdh = [\n  {include-group = \"cdh\"},\n  {include-group = \"common\"},\n]\n\ntest_dropbox = [\n  {include-group = \"dropbox\"},\n  {include-group = \"common\"},\n]\n\ntest_gcloud = [\n  {include-group = \"gcloud\"},\n  {include-group = \"common\"},\n]\ntest_hdp = [\n  {include-group = \"hdp\"},\n  {include-group = \"common\"},\n]\n\ntest_postgres = [\n  {include-group = \"postgres\"},\n  {include-group = \"common\"},\n]\n\ntest_unixsocket = [\n  {include-group = \"unixsocket\"},\n  {include-group = \"common\"},\n]\n\nvisualizer = [\n  \"mock<2.0\",\n  \"selenium==3.0.2\"\n]\n\n# for local development\ndev = [\n  {include-group = \"gcloud\"},\n  {include-group = \"postgres\"},\n  {include-group = \"dropbox\"},\n  {include-group = \"cdh\"}, # same deps as hdp\n  {include-group 
= \"unixsocket\"},\n  {include-group = \"common\"},\n  {include-group = \"lint\"},\n]\n\n[tool.mypy]\n# Keep this set to the minimum supported Python version (see requires-python in [project])\npython_version = \"3.10\"\nignore_missing_imports = true\n# Gradually tighten: remove a module from the ignore list below after fixing its errors\n\n[[tool.mypy.overrides]]\nmodule = [\n    \"luigi.contrib.gcs\",\n    \"luigi.contrib.hadoop\",\n    \"luigi.contrib.hdfs.config\",\n    \"luigi.contrib.postgres\",\n    \"luigi.contrib.redis_store\",\n    \"luigi.contrib.spark\",\n    \"luigi.contrib.sqla\",\n    \"luigi.interface\",\n    \"luigi.notifications\",\n    \"luigi.tools.range\",\n    \"luigi.worker\",\n]\nignore_errors = true\n\n[tool.ruff]\nline-length = 160\nexclude = [\"doc\"]\n\n[tool.ruff.lint]\nselect = [\n    \"E\",  # pycodestyle errors\n    \"F\",  # pyflakes\n    \"I\",  # isort\n    \"W\",  # pycodestyle warnings\n]\n\n[tool.ruff.lint.isort]\nknown-first-party = [\"luigi\"]\n\n[tool.uv]\ndefault-groups = ['dev']\ncache-keys = [ { file = \"pyproject.toml\" }, { git = true } ]\n\n[tool.hatch.version]\npath = \"luigi/__version__.py\"\n\n[tool.hatch.build.targets.sdist]\ninclude = [\n  \"/LICENSE\",\n  \"/README.rst\",\n  \"/examples\",\n  \"/luigi\",\n  \"/test\",\n]\n\n[tool.hatch.metadata.hooks.fancy-pypi-readme]\ncontent-type = \"text/x-rst\"\n# construct the PyPI readme from the note below followed by README.rst\nfragments = [\n  {text = \"\\n.. note::\\n\\tFor the latest source, discussion, etc, please visit the\\n\\t`GitHub repository <https://github.com/spotify/luigi>`_\\n\"},\n  {path = \"README.rst\"},\n]\n"
  },
  {
    "path": "scripts/ci/conditional_tox.sh",
    "content": "#!/usr/bin/env bash\n\nset -ex\n\nENDENV=$(echo $TOXENV | tail -c 7)\nif [[ $ENDENV == gcloud ]]\nthen\n  [[ $DIDNT_CREATE_GCP_CREDS = 1 ]] || tox\nelse\n  tox --hashseed 1\nfi\n"
  },
  {
    "path": "scripts/ci/install_start_azurite.sh",
    "content": "#!/usr/bin/env bash\n\necho \"$DOCKERHUB_TOKEN\" | docker login -u spotifyci --password-stdin\n\ndocker pull mcr.microsoft.com/azure-storage/azurite\nmkdir -p blob_emulator\n$1/stop_azurite.sh\ndocker run -p 10000:10000 -v blob_emulator:/data -e AZURITE_ACCOUNTS=devstoreaccount1:YXp1cml0ZQ== -d mcr.microsoft.com/azure-storage/azurite azurite-blob -l /data --blobHost 0.0.0.0 --blobPort 10000\n"
  },
  {
    "path": "scripts/ci/setup_hadoop_env.sh",
    "content": "#!/usr/bin/env bash\n\nHADOOP_DISTRO=${HADOOP_DISTRO:-\"hdp\"}\n\nONLY_DOWNLOAD=${ONLY_DOWNLOAD:-false}\nONLY_EXTRACT=${ONLY_EXTRACT:-false}\n\nwhile test $# -gt 0; do\n    case \"$1\" in\n        -h|--help)\n            echo \"Setup environment for snakebite tests\"\n            echo \" \"\n            echo \"options:\"\n            echo -e \"\\t-h, --help            show brief help\"\n            echo -e \"\\t-o, --only-download   just download hadoop tar(s)\"\n            echo -e \"\\t-e, --only-extract    just extract hadoop tar(s)\"\n            echo -e \"\\t-d, --distro          select distro (hdp|cdh)\"\n            exit 0\n            ;;\n        -o|--only-download)\n            shift\n            ONLY_DOWNLOAD=true\n            ;;\n        -e|--only-extract)\n            shift\n            ONLY_EXTRACT=true\n            ;;\n        -d|--distro)\n            shift\n            if test $# -gt 0; then\n                HADOOP_DISTRO=$1\n            else\n                echo \"No Hadoop distro specified - abort\" >&2\n                exit 1\n            fi\n            shift\n            ;;\n        *)\n            echo \"Unknown options: $1\" >&2\n            exit 1\n            ;;\n    esac\ndone\n\nif $ONLY_DOWNLOAD && $ONLY_EXTRACT; then\n    echo \"Both only-download and only-extract specified - abort\" >&2\n    exit 1\nfi\n\nmkdir -p $HADOOP_HOME\n\nif [ $HADOOP_DISTRO = \"cdh\" ]; then\n    URL=\"http://archive.cloudera.com/cdh5/cdh/5/hadoop-latest.tar.gz\"\nelif [ $HADOOP_DISTRO = \"hdp\" ]; then\n    # This site provides good URLs:\n    # https://github.com/saltstack-formulas/hadoop-formula/blob/5034a2204da691eceb9c2d8cd8260f11d5cc06f3/hadoop/settings.sls\n    URL=\"http://public-repo-1.hortonworks.com/HDP/centos6/2.x/updates/2.2.6.0/tars/hadoop-2.6.0.2.2.6.0-2800.tar.gz\"\nelse\n    echo \"No/bad HADOOP_DISTRO='${HADOOP_DISTRO}' specified\" >&2\n    exit 1\nfi\n\nif ! $ONLY_EXTRACT && [ ! 
-e ${HADOOP_HOME}/hadoop.tar.gz ] ; then\n    echo \"Downloading Hadoop from $URL to ${HADOOP_HOME}/hadoop.tar.gz\"\n    curl -z ${HADOOP_HOME}/hadoop.tar.gz -o ${HADOOP_HOME}/hadoop.tar.gz -L $URL\n\n    if [ $? != 0 ]; then\n        echo \"Failed to download Hadoop from $URL - abort\" >&2\n        exit 1\n    fi\nfi\n\nif $ONLY_DOWNLOAD; then\n    exit 0\nfi\n\necho \"Extracting ${HADOOP_HOME}/hadoop.tar.gz into $HADOOP_HOME\"\ntar zxf ${HADOOP_HOME}/hadoop.tar.gz --strip-components 1 -C $HADOOP_HOME\n\nif [ $? != 0 ]; then\n    echo \"Failed to extract Hadoop from ${HADOOP_HOME}/hadoop.tar.gz to ${HADOOP_HOME} - abort\" >&2\n    exit 1\nfi\n"
  },
  {
    "path": "scripts/ci/stop_azurite.sh",
    "content": "#!/usr/bin/env bash\ndocker stop \"$(docker ps -q --filter ancestor=mcr.microsoft.com/azure-storage/azurite)\""
  },
  {
    "path": "test/_mysqldb_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport mysql.connector\nfrom helpers import unittest\n\nfrom luigi.contrib.mysqldb import MySqlTarget\n\nhost = \"localhost\"\nport = 3306\ndatabase = \"luigi_test\"\nusername = None\npassword = None\ntable_updates = \"table_updates\"\n\n\ndef _create_test_database():\n    con = mysql.connector.connect(user=username, password=password, host=host, port=port, autocommit=True)\n    con.cursor().execute(\"CREATE DATABASE IF NOT EXISTS %s\" % database)\n\n\n_create_test_database()\ntarget = MySqlTarget(host, database, username, password, \"\", \"update_id\")\n\n\nclass MySqlTargetTest(unittest.TestCase):\n    def test_touch_and_exists(self):\n        drop()\n        self.assertFalse(target.exists(), \"Target should not exist before touching it\")\n        target.touch()\n        self.assertTrue(target.exists(), \"Target should exist after touching it\")\n\n\ndef drop():\n    con = target.connect(autocommit=True)\n    con.cursor().execute(\"DROP TABLE IF EXISTS %s\" % table_updates)\n"
  },
  {
    "path": "test/_test_ftp.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n# this is an integration test. to run this test requires that an actuall FTP server\n# is running somewhere. to run a local ftp server do the following\n# pip install pyftpdlib==1.5.0\n# mkdir /tmp/luigi-test-ftp/\n# sudo python -m _test_ftp\n\n\nimport datetime\nimport ftplib\nimport os\nimport shutil\nimport sys\nfrom io import StringIO\n\nfrom helpers import unittest\n\nfrom luigi.contrib.ftp import RemoteFileSystem, RemoteTarget\n\n# dumb files\nFILE1 = \"\"\"this is file1\"\"\"\nFILE2 = \"\"\"this is file2\"\"\"\nFILE3 = \"\"\"this is file3\"\"\"\n\nHOST = \"localhost\"\nUSER = \"luigi\"\nPWD = \"some_password\"\n\n\nclass TestFTPFilesystem(unittest.TestCase):\n    def setUp(self):\n        \"\"\"Creates structure\n\n        /test\n        /test/file1\n        /test/hola/\n        /test/hola/file2\n        /test/hola/singlefile\n        /test/hola/file3\n        \"\"\"\n        # create structure\n        ftp = ftplib.FTP(HOST, USER, PWD)\n        ftp.cwd(\"/\")\n        ftp.mkd(\"test\")\n        ftp.cwd(\"test\")\n        ftp.mkd(\"hola\")\n        ftp.cwd(\"hola\")\n        f2 = StringIO(FILE2)\n        ftp.storbinary(\"STOR file2\", f2)  # send the file\n        f3 = StringIO(FILE3)\n        ftp.storbinary(\"STOR file3\", f3)  # send the file\n        ftp.cwd(\"..\")\n        f1 = StringIO(FILE1)\n        ftp.storbinary(\"STOR 
file1\", f1)  # send the file\n        ftp.close()\n\n    def test_file_remove(self):\n        \"\"\"Delete with recursive deactivated\"\"\"\n        rfs = RemoteFileSystem(HOST, USER, PWD)\n        rfs.remove(\"/test/hola/file3\", recursive=False)\n        rfs.remove(\"/test/hola/file2\", recursive=False)\n        rfs.remove(\"/test/hola\", recursive=False)\n        rfs.remove(\"/test/file1\", recursive=False)\n        rfs.remove(\"/test\", recursive=False)\n\n        ftp = ftplib.FTP(HOST, USER, PWD)\n        list_dir = ftp.nlst()\n\n        self.assertFalse(\"test\" in list_dir)\n\n    def test_recursive_remove(self):\n        \"\"\"Test FTP filesystem removing files recursive\"\"\"\n        rfs = RemoteFileSystem(HOST, USER, PWD)\n        rfs.remove(\"/test\")\n\n        ftp = ftplib.FTP(HOST, USER, PWD)\n        list_dir = ftp.nlst()\n\n        self.assertFalse(\"test\" in list_dir)\n\n\nclass TestFTPFilesystemUpload(unittest.TestCase):\n    def test_single(self):\n        \"\"\"Test upload file with creation of intermediate folders\"\"\"\n        ftp_path = \"/test/nest/luigi-test\"\n        local_filepath = \"/tmp/luigi-test-ftp\"\n\n        # create local temp file\n        with open(local_filepath, \"w\") as outfile:\n            outfile.write(\"something to fill\")\n\n        rfs = RemoteFileSystem(HOST, USER, PWD)\n        rfs.put(local_filepath, ftp_path)\n\n        # manually connect to ftp\n        ftp = ftplib.FTP(HOST, USER, PWD)\n        ftp.cwd(\"/test/nest\")\n        list_dir = ftp.nlst()\n        # file is successfuly created\n        self.assertTrue(\"luigi-test\" in list_dir)\n\n        # delete tmp files\n        ftp.delete(\"luigi-test\")\n        ftp.cwd(\"/\")\n        ftp.rmd(\"/test/nest\")\n        ftp.rmd(\"test\")\n        os.remove(local_filepath)\n        ftp.close()\n\n\nclass TestRemoteTarget(unittest.TestCase):\n    def test_put(self):\n        \"\"\"Test RemoteTarget put method with uploading to an FTP\"\"\"\n        
local_filepath = \"/tmp/luigi-remotetarget-write-test\"\n        remote_file = \"/test/example.put.file\"\n\n        # create local temp file\n        with open(local_filepath, \"w\") as outfile:\n            outfile.write(\"something to fill\")\n\n        remotetarget = RemoteTarget(remote_file, HOST, username=USER, password=PWD)\n        remotetarget.put(local_filepath)\n\n        # manually connect to ftp\n        ftp = ftplib.FTP(HOST, USER, PWD)\n        ftp.cwd(\"/test\")\n        list_dir = ftp.nlst()\n\n        # file is successfuly created\n        self.assertTrue(remote_file.split(\"/\")[-1] in list_dir)\n\n        # clean\n        os.remove(local_filepath)\n        ftp.delete(remote_file)\n        ftp.cwd(\"/\")\n        ftp.rmd(\"test\")\n        ftp.close()\n\n    def test_get(self):\n        \"\"\"Test Remote target get method downloading a file from ftp\"\"\"\n        local_filepath = \"/tmp/luigi-remotetarget-read-test\"\n        tmp_filepath = \"/tmp/tmp-luigi-remotetarget-read-test\"\n        remote_file = \"/test/example.get.file\"\n\n        # create local temp file\n        with open(tmp_filepath, \"w\") as outfile:\n            outfile.write(\"something to fill\")\n\n        # manualy upload to ftp\n        ftp = ftplib.FTP(HOST, USER, PWD)\n        ftp.mkd(\"test\")\n        ftp.storbinary(\"STOR %s\" % remote_file, open(tmp_filepath, \"rb\"))\n        ftp.close()\n\n        # execute command\n        remotetarget = RemoteTarget(remote_file, HOST, username=USER, password=PWD)\n        remotetarget.get(local_filepath)\n\n        # make sure that it can open file\n        with remotetarget.open(\"r\") as fin:\n            self.assertEqual(fin.read(), \"something to fill\")\n\n        # check for cleaning temporary files\n        if sys.version_info >= (3, 2):\n            # cleanup uses tempfile.TemporaryDirectory only available in 3.2+\n            temppath = remotetarget._RemoteTarget__tmp_path\n            
self.assertTrue(os.path.exists(temppath))\n            remotetarget = None  # garbage collect remotetarget\n            self.assertFalse(os.path.exists(temppath))\n\n        # file is successfuly created\n        self.assertTrue(os.path.exists(local_filepath))\n\n        # test RemoteTarget with mtime\n        ts = datetime.datetime.now() - datetime.timedelta(days=2)\n        delayed_remotetarget = RemoteTarget(remote_file, HOST, username=USER, password=PWD, mtime=ts)\n        self.assertTrue(delayed_remotetarget.exists())\n\n        ts = datetime.datetime.now() + datetime.timedelta(days=2)  # who knows what timezone it is in\n        delayed_remotetarget = RemoteTarget(remote_file, HOST, username=USER, password=PWD, mtime=ts)\n        self.assertFalse(delayed_remotetarget.exists())\n\n        # clean\n        os.remove(local_filepath)\n        os.remove(tmp_filepath)\n        ftp = ftplib.FTP(HOST, USER, PWD)\n        ftp.delete(remote_file)\n        ftp.cwd(\"/\")\n        ftp.rmd(\"test\")\n        ftp.close()\n\n\ndef _run_ftp_server():\n    from pyftpdlib.authorizers import DummyAuthorizer\n    from pyftpdlib.handlers import FTPHandler\n    from pyftpdlib.servers import FTPServer\n\n    # Instantiate a dummy authorizer for managing 'virtual' users\n    authorizer = DummyAuthorizer()\n\n    tmp_folder = \"/tmp/luigi-test-ftp-server/\"\n    if os.path.exists(tmp_folder):\n        shutil.rmtree(tmp_folder)\n    os.mkdir(tmp_folder)\n\n    authorizer.add_user(USER, PWD, tmp_folder, perm=\"elradfmwM\")\n    handler = FTPHandler\n    handler.authorizer = authorizer\n    address = (\"localhost\", 21)\n    server = FTPServer(address, handler)\n    server.serve_forever()\n\n\nif __name__ == \"__main__\":\n    _run_ftp_server()\n"
  },
  {
    "path": "test/auto_namespace_test/__init__.py",
    "content": "import luigi\n\nluigi.auto_namespace(scope=__name__)\n"
  },
  {
    "path": "test/auto_namespace_test/my_namespace_test.py",
    "content": "from helpers import LuigiTestCase\n\nimport luigi\n\n\nclass MyNamespaceTest(LuigiTestCase):\n    def test_auto_namespace_scope(self):\n        class MyTask(luigi.Task):\n            pass\n\n        self.assertTrue(self.run_locally([\"auto_namespace_test.my_namespace_test.MyTask\"]))\n        self.assertEqual(MyTask.get_task_namespace(), \"auto_namespace_test.my_namespace_test\")\n"
  },
  {
    "path": "test/batch_notifier_test.py",
    "content": "# coding=utf-8\nimport unittest\nfrom smtplib import SMTPServerDisconnected\n\nimport mock\n\nimport luigi.batch_notifier\n\nBATCH_NOTIFIER_DEFAULTS = {\n    \"error_lines\": 0,\n    \"error_messages\": 0,\n    \"group_by_error_messages\": False,\n}\n\n\nclass BatchNotifier(luigi.batch_notifier.BatchNotifier):\n    \"\"\"BatchNotifier class with defaults that produce smaller output for testing\"\"\"\n\n    def __init__(self, **kwargs):\n        full_args = BATCH_NOTIFIER_DEFAULTS.copy()\n        full_args.update(kwargs)\n        super(BatchNotifier, self).__init__(**full_args)\n\n\nclass BatchNotifierTest(unittest.TestCase):\n    def setUp(self):\n        self.time_mock = mock.patch(\"luigi.batch_notifier.time.time\")\n        self.time = self.time_mock.start()\n        self.time.return_value = 0.0\n\n        self.send_email_mock = mock.patch(\"luigi.batch_notifier.send_email\")\n        self.send_email = self.send_email_mock.start()\n\n        self.email_mock = mock.patch(\"luigi.batch_notifier.email\")\n        self.email = self.email_mock.start()\n        self.email().sender = \"sender@test.com\"\n        self.email().receiver = \"r@test.com\"\n\n    def tearDown(self):\n        self.time_mock.stop()\n        self.send_email_mock.stop()\n        self.email_mock.stop()\n\n    def incr_time(self, minutes):\n        self.time.return_value += minutes * 60\n\n    def check_email_send(self, subject, message, receiver=\"r@test.com\", sender=\"sender@test.com\"):\n        self.send_email.assert_called_once_with(subject, message, sender, (receiver,))\n\n    def test_send_single_failure(self):\n        bn = BatchNotifier(batch_mode=\"all\")\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n        bn.send_email()\n        self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"- Task(a=5) (1 failure)\")\n\n    def test_do_not_send_single_failure_without_receiver(self):\n        self.email().receiver = None\n        
bn = BatchNotifier(batch_mode=\"all\")\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n        bn.send_email()\n        self.send_email.assert_not_called()\n\n    def test_send_single_failure_to_owner_only(self):\n        self.email().receiver = None\n        bn = BatchNotifier(batch_mode=\"all\")\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [\"owner@test.com\"])\n        bn.send_email()\n        self.check_email_send(\n            \"Luigi: Your tasks have 1 failure in the last 60 minutes\",\n            \"- Task(a=5) (1 failure)\",\n            receiver=\"owner@test.com\",\n        )\n\n    def test_send_single_disable(self):\n        bn = BatchNotifier(batch_mode=\"all\")\n        for _ in range(10):\n            bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n        bn.add_disable(\"Task(a=5)\", \"Task\", {\"a\": 5}, [])\n        bn.send_email()\n        self.check_email_send(\"Luigi: 10 failures, 1 disable in the last 60 minutes\", \"- Task(a=5) (10 failures, 1 disable)\")\n\n    def test_send_multiple_disables(self):\n        bn = BatchNotifier(batch_mode=\"family\")\n        for _ in range(10):\n            bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n            bn.add_failure(\"Task(a=6)\", \"Task\", {\"a\": 6}, \"error\", [])\n        bn.add_disable(\"Task(a=5)\", \"Task\", {\"a\": 5}, [])\n        bn.add_disable(\"Task(a=6)\", \"Task\", {\"a\": 6}, [])\n        bn.send_email()\n        self.check_email_send(\"Luigi: 20 failures, 2 disables in the last 60 minutes\", \"- Task (20 failures, 2 disables)\")\n\n    def test_send_single_scheduling_fail(self):\n        bn = BatchNotifier(batch_mode=\"family\")\n        bn.add_scheduling_fail(\"Task()\", \"Task\", {}, \"error\", [])\n        bn.send_email()\n        self.check_email_send(\n            \"Luigi: 1 scheduling failure in the last 60 minutes\",\n            \"- Task (1 scheduling failure)\",\n        
)\n\n    def test_multiple_failures_of_same_job(self):\n        bn = BatchNotifier(batch_mode=\"all\")\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n        bn.send_email()\n        self.check_email_send(\"Luigi: 3 failures in the last 60 minutes\", \"- Task(a=5) (3 failures)\")\n\n    def test_multiple_failures_of_multiple_jobs(self):\n        bn = BatchNotifier(batch_mode=\"all\")\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n        bn.add_failure(\"Task(a=6)\", \"Task\", {\"a\": 6}, \"error\", [])\n        bn.add_failure(\"Task(a=6)\", \"Task\", {\"a\": 6}, \"error\", [])\n        bn.send_email()\n        self.check_email_send(\"Luigi: 3 failures in the last 60 minutes\", \"- Task(a=6) (2 failures)\\n- Task(a=5) (1 failure)\")\n\n    def test_group_on_family(self):\n        bn = BatchNotifier(batch_mode=\"family\")\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n        bn.add_failure(\"Task(a=6)\", \"Task\", {\"a\": 6}, \"error\", [])\n        bn.add_failure(\"Task(a=6)\", \"Task\", {\"a\": 6}, \"error\", [])\n        bn.add_failure(\"OtherTask(a=6)\", \"OtherTask\", {\"a\": 6}, \"error\", [])\n        bn.send_email()\n        self.check_email_send(\"Luigi: 4 failures in the last 60 minutes\", \"- Task (3 failures)\\n- OtherTask (1 failure)\")\n\n    def test_group_on_unbatched_params(self):\n        bn = BatchNotifier(batch_mode=\"unbatched_params\")\n        bn.add_failure(\"Task(a=5, b=1)\", \"Task\", {\"a\": 5}, \"error\", [])\n        bn.add_failure(\"Task(a=5, b=2)\", \"Task\", {\"a\": 5}, \"error\", [])\n        bn.add_failure(\"Task(a=6, b=1)\", \"Task\", {\"a\": 6}, \"error\", [])\n        bn.add_failure(\"Task(a=6, b=2)\", \"Task\", {\"a\": 6}, \"error\", [])\n        bn.add_failure(\"Task(a=6, b=3)\", \"Task\", 
{\"a\": 6}, \"error\", [])\n        bn.add_failure(\"Task(a=6, b=4)\", \"Task\", {\"a\": 6}, \"error\", [])\n        bn.add_failure(\"OtherTask(a=5, b=1)\", \"OtherTask\", {\"a\": 5}, \"error\", [])\n        bn.add_failure(\"OtherTask(a=6, b=1)\", \"OtherTask\", {\"a\": 6}, \"error\", [])\n        bn.add_failure(\"OtherTask(a=6, b=2)\", \"OtherTask\", {\"a\": 6}, \"error\", [])\n        bn.add_failure(\"OtherTask(a=6, b=3)\", \"OtherTask\", {\"a\": 6}, \"error\", [])\n        bn.send_email()\n        self.check_email_send(\n            \"Luigi: 10 failures in the last 60 minutes\",\n            \"- Task(a=6) (4 failures)\\n- OtherTask(a=6) (3 failures)\\n- Task(a=5) (2 failures)\\n- OtherTask(a=5) (1 failure)\",\n        )\n\n    def test_include_one_expl_includes_latest(self):\n        bn = BatchNotifier(batch_mode=\"family\", error_messages=1)\n        bn.add_failure(\"Task(a=1)\", \"Task\", {\"a\": 1}, \"error 1\", [])\n        bn.add_failure(\"Task(a=2)\", \"Task\", {\"a\": 2}, \"error 2\", [])\n        bn.add_failure(\"TaskB(a=1)\", \"TaskB\", {\"a\": 1}, \"error\", [])\n\n        bn.send_email()\n        self.check_email_send(\"Luigi: 3 failures in the last 60 minutes\", \"- Task (2 failures)\\n\\n      error 2\\n\\n- TaskB (1 failure)\\n\\n      error\")\n\n    def test_include_two_expls(self):\n        bn = BatchNotifier(batch_mode=\"family\", error_messages=2)\n        bn.add_failure(\"Task(a=1)\", \"Task\", {\"a\": 1}, \"error 1\", [])\n        bn.add_failure(\"Task(a=2)\", \"Task\", {\"a\": 2}, \"error 2\", [])\n        bn.add_failure(\"TaskB(a=1)\", \"TaskB\", {\"a\": 1}, \"error\", [])\n\n        bn.send_email()\n        self.check_email_send(\n            \"Luigi: 3 failures in the last 60 minutes\", \"- Task (2 failures)\\n\\n      error 1\\n\\n      error 2\\n\\n- TaskB (1 failure)\\n\\n      error\"\n        )\n\n    def test_limit_expl_length(self):\n        bn = BatchNotifier(batch_mode=\"family\", error_messages=1, error_lines=2)\n        
bn.add_failure(\"Task(a=1)\", \"Task\", {\"a\": \"1\"}, \"line 1\\nline 2\\nline 3\\nline 4\\n\", [])\n        bn.send_email()\n        self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"- Task (1 failure)\\n\\n      line 3\\n      line 4\")\n\n    def test_expl_varies_by_owner(self):\n        bn = BatchNotifier(batch_mode=\"family\", error_messages=1)\n        bn.add_failure(\"Task(a=1)\", \"Task\", {\"a\": \"1\"}, \"msg1\", owners=[\"a@test.com\"])\n        bn.add_failure(\"Task(a=2)\", \"Task\", {\"a\": \"2\"}, \"msg2\", owners=[\"b@test.com\"])\n        bn.send_email()\n        send_calls = [\n            mock.call(\n                \"Luigi: Your tasks have 1 failure in the last 60 minutes\",\n                \"- Task (1 failure)\\n\\n      msg1\",\n                \"sender@test.com\",\n                (\"a@test.com\",),\n            ),\n            mock.call(\n                \"Luigi: Your tasks have 1 failure in the last 60 minutes\",\n                \"- Task (1 failure)\\n\\n      msg2\",\n                \"sender@test.com\",\n                (\"b@test.com\",),\n            ),\n            mock.call(\n                \"Luigi: 2 failures in the last 60 minutes\",\n                \"- Task (2 failures)\\n\\n      msg2\",\n                \"sender@test.com\",\n                (\"r@test.com\",),\n            ),\n        ]\n        self.send_email.assert_has_calls(send_calls, any_order=True)\n\n    def test_include_two_expls_html_format(self):\n        self.email().format = \"html\"\n        bn = BatchNotifier(batch_mode=\"family\", error_messages=2)\n        bn.add_failure(\"Task(a=1)\", \"Task\", {\"a\": 1}, \"error 1\", [])\n        bn.add_failure(\"Task(a=2)\", \"Task\", {\"a\": 2}, \"error 2\", [])\n        bn.add_failure(\"TaskB(a=1)\", \"TaskB\", {\"a\": 1}, \"error\", [])\n\n        bn.send_email()\n        self.check_email_send(\n            \"Luigi: 3 failures in the last 60 minutes\",\n            \"<ul>\\n<li>Task (2 
failures)\\n<pre>error 1</pre>\\n<pre>error 2</pre>\\n<li>TaskB (1 failure)\\n<pre>error</pre>\\n</ul>\",\n        )\n\n    def test_limit_expl_length_html_format(self):\n        self.email().format = \"html\"\n        bn = BatchNotifier(batch_mode=\"family\", error_messages=1, error_lines=2)\n        bn.add_failure(\"Task(a=1)\", \"Task\", {\"a\": 1}, \"line 1\\nline 2\\nline 3\\nline 4\\n\", [])\n        bn.send_email()\n        self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"<ul>\\n<li>Task (1 failure)\\n<pre>line 3\\nline 4</pre>\\n</ul>\")\n\n    def test_send_clears_backlog(self):\n        bn = BatchNotifier(batch_mode=\"all\")\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n        bn.add_disable(\"Task(a=5)\", \"Task\", {\"a\": 5}, [])\n        bn.add_scheduling_fail(\"Task(a=6)\", \"Task\", {\"a\": 6}, \"scheduling error\", [])\n        bn.send_email()\n\n        self.send_email.reset_mock()\n        bn.send_email()\n        self.send_email.assert_not_called()\n\n    def test_email_gets_cleared_on_failure(self):\n        bn = BatchNotifier(batch_mode=\"all\")\n\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 1}, \"\", [])\n        self.send_email.side_effect = SMTPServerDisconnected(\"timeout\")\n        self.assertRaises(SMTPServerDisconnected, bn.send_email)\n\n        self.send_email.reset_mock()\n        bn.send_email()\n        self.send_email.assert_not_called()\n\n    def test_send_clears_all_old_data(self):\n        bn = BatchNotifier(batch_mode=\"all\", error_messages=100)\n\n        for i in range(100):\n            bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error {}\".format(i), [])\n            bn.add_disable(\"Task(a=5)\", \"Task\", {\"a\": 5}, [])\n            bn.add_scheduling_fail(\"Task(a=6)\", \"Task\", {\"a\": 6}, \"scheduling error {}\".format(i), [])\n            bn.send_email()\n            self.check_email_send(\n                \"Luigi: 1 failure, 1 disable, 1 
scheduling failure in the last 60 minutes\",\n                \"- Task(a=5) (1 failure, 1 disable)\\n\\n      error {}\\n\\n- Task(a=6) (1 scheduling failure)\\n\\n      scheduling error {}\".format(i, i),\n            )\n            self.send_email.reset_mock()\n\n    def test_auto_send_on_update_after_time_period(self):\n        bn = BatchNotifier(batch_mode=\"all\")\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n\n        for i in range(60):\n            bn.update()\n            self.send_email.assert_not_called()\n            self.incr_time(minutes=1)\n\n        bn.update()\n        self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"- Task(a=5) (1 failure)\")\n\n    def test_auto_send_on_update_after_time_period_with_disable_only(self):\n        bn = BatchNotifier(batch_mode=\"all\")\n        bn.add_disable(\"Task(a=5)\", \"Task\", {\"a\": 5}, [])\n\n        for i in range(60):\n            bn.update()\n            self.send_email.assert_not_called()\n            self.incr_time(minutes=1)\n\n        bn.update()\n        self.check_email_send(\"Luigi: 1 disable in the last 60 minutes\", \"- Task(a=5) (1 disable)\")\n\n    def test_no_auto_send_until_end_of_interval_with_error(self):\n        bn = BatchNotifier(batch_mode=\"all\")\n\n        for i in range(90):\n            bn.update()\n            self.send_email.assert_not_called()\n            self.incr_time(minutes=1)\n\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n        for i in range(30):\n            bn.update()\n            self.send_email.assert_not_called()\n            self.incr_time(minutes=1)\n\n        bn.update()\n        self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"- Task(a=5) (1 failure)\")\n\n    def test_no_auto_send_for_interval_after_exception(self):\n        bn = BatchNotifier(batch_mode=\"all\")\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n        
self.send_email.side_effect = SMTPServerDisconnected\n\n        self.incr_time(minutes=60)\n        self.assertRaises(SMTPServerDisconnected, bn.update)\n\n        self.send_email.reset_mock()\n        self.send_email.side_effect = None\n        bn.add_failure(\"Task(a=5)\", \"Task\", {\"a\": 5}, \"error\", [])\n        for i in range(60):\n            bn.update()\n            self.send_email.assert_not_called()\n            self.incr_time(minutes=1)\n\n        bn.update()\n        self.assertEqual(1, self.send_email.call_count)\n\n    def test_send_batch_failure_emails_to_owners(self):\n        bn = BatchNotifier(batch_mode=\"all\")\n        bn.add_failure(\"Task(a=1)\", \"Task\", {\"a\": \"1\"}, \"error\", [\"a@test.com\", \"b@test.com\"])\n        bn.add_failure(\"Task(a=1)\", \"Task\", {\"a\": \"1\"}, \"error\", [\"b@test.com\"])\n        bn.add_failure(\"Task(a=2)\", \"Task\", {\"a\": \"2\"}, \"error\", [\"a@test.com\"])\n        bn.send_email()\n\n        send_calls = [\n            mock.call(\n                \"Luigi: 3 failures in the last 60 minutes\",\n                \"- Task(a=1) (2 failures)\\n- Task(a=2) (1 failure)\",\n                \"sender@test.com\",\n                (\"r@test.com\",),\n            ),\n            mock.call(\n                \"Luigi: Your tasks have 2 failures in the last 60 minutes\",\n                \"- Task(a=1) (1 failure)\\n- Task(a=2) (1 failure)\",\n                \"sender@test.com\",\n                (\"a@test.com\",),\n            ),\n            mock.call(\n                \"Luigi: Your tasks have 2 failures in the last 60 minutes\",\n                \"- Task(a=1) (2 failures)\",\n                \"sender@test.com\",\n                (\"b@test.com\",),\n            ),\n        ]\n        self.send_email.assert_has_calls(send_calls, any_order=True)\n\n    def test_send_batch_disable_email_to_owners(self):\n        bn = BatchNotifier(batch_mode=\"all\")\n        bn.add_disable(\"Task(a=1)\", \"Task\", {\"a\": \"1\"}, 
[\"a@test.com\"])\n        bn.send_email()\n\n        send_calls = [\n            mock.call(\n                \"Luigi: 1 disable in the last 60 minutes\",\n                \"- Task(a=1) (1 disable)\",\n                \"sender@test.com\",\n                (\"r@test.com\",),\n            ),\n            mock.call(\n                \"Luigi: Your tasks have 1 disable in the last 60 minutes\",\n                \"- Task(a=1) (1 disable)\",\n                \"sender@test.com\",\n                (\"a@test.com\",),\n            ),\n        ]\n        self.send_email.assert_has_calls(send_calls, any_order=True)\n\n    def test_batch_identical_expls(self):\n        bn = BatchNotifier(error_messages=1, group_by_error_messages=True)\n        bn.add_failure(\"Task(a=1)\", \"Task\", {\"a\": \"1\"}, \"msg1\", [])\n        bn.add_failure(\"Task(a=2)\", \"Task\", {\"a\": \"2\"}, \"msg1\", [])\n        bn.add_failure(\"Task(a=3)\", \"Task\", {\"a\": \"3\"}, \"msg1\", [])\n        bn.add_failure(\"Task(a=4)\", \"Task\", {\"a\": \"4\"}, \"msg2\", [])\n        bn.add_failure(\"Task(a=4)\", \"Task\", {\"a\": \"4\"}, \"msg2\", [])\n        bn.send_email()\n        self.check_email_send(\n            \"Luigi: 5 failures in the last 60 minutes\",\n            \"- Task(a=1) (1 failure)\\n  Task(a=2) (1 failure)\\n  Task(a=3) (1 failure)\\n\\n      msg1\\n\\n- Task(a=4) (2 failures)\\n\\n      msg2\",\n        )\n\n    def test_batch_identical_expls_html(self):\n        self.email().format = \"html\"\n        bn = BatchNotifier(error_messages=1, group_by_error_messages=True)\n        bn.add_failure(\"Task(a=1)\", \"Task\", {\"a\": \"1\"}, \"msg1\", [])\n        bn.add_failure(\"Task(a=2)\", \"Task\", {\"a\": \"2\"}, \"msg1\", [])\n        bn.add_failure(\"Task(a=3)\", \"Task\", {\"a\": \"3\"}, \"msg1\", [])\n        bn.add_failure(\"Task(a=4)\", \"Task\", {\"a\": \"4\"}, \"msg2\", [])\n        bn.add_failure(\"Task(a=4)\", \"Task\", {\"a\": \"4\"}, \"msg2\", [])\n        bn.send_email()\n    
    self.check_email_send(\n            \"Luigi: 5 failures in the last 60 minutes\",\n            \"<ul>\\n\"\n            \"<li>Task(a=1) (1 failure)\\n\"\n            \"<br>Task(a=2) (1 failure)\\n\"\n            \"<br>Task(a=3) (1 failure)\\n\"\n            \"<pre>msg1</pre>\\n\"\n            \"<li>Task(a=4) (2 failures)\\n\"\n            \"<pre>msg2</pre>\\n\"\n            \"</ul>\",\n        )\n\n    def test_unicode_error_message(self):\n        bn = BatchNotifier(error_messages=1)\n        bn.add_failure(\"Task()\", \"Task\", {}, \"Érror\", [])\n        bn.send_email()\n        self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"- Task() (1 failure)\\n\\n      Érror\")\n\n    def test_unicode_error_message_html(self):\n        self.email().format = \"html\"\n        bn = BatchNotifier(error_messages=1)\n        bn.add_failure(\"Task()\", \"Task\", {}, \"Érror\", [])\n        bn.send_email()\n        self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"<ul>\\n<li>Task() (1 failure)\\n<pre>Érror</pre>\\n</ul>\")\n\n    def test_unicode_param_value(self):\n        for batch_mode in (\"all\", \"unbatched_params\"):\n            self.send_email.reset_mock()\n            bn = BatchNotifier(batch_mode=batch_mode)\n            bn.add_failure(\"Task(a=á)\", \"Task\", {\"a\": \"á\"}, \"error\", [])\n            bn.send_email()\n            self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"- Task(a=á) (1 failure)\")\n\n    def test_unicode_param_value_html(self):\n        self.email().format = \"html\"\n        for batch_mode in (\"all\", \"unbatched_params\"):\n            self.send_email.reset_mock()\n            bn = BatchNotifier(batch_mode=batch_mode)\n            bn.add_failure(\"Task(a=á)\", \"Task\", {\"a\": \"á\"}, \"error\", [])\n            bn.send_email()\n            self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"<ul>\\n<li>Task(a=á) (1 failure)\\n</ul>\")\n\n    def 
test_unicode_param_name(self):\n        for batch_mode in (\"all\", \"unbatched_params\"):\n            self.send_email.reset_mock()\n            bn = BatchNotifier(batch_mode=batch_mode)\n            bn.add_failure(\"Task(á=a)\", \"Task\", {\"á\": \"a\"}, \"error\", [])\n            bn.send_email()\n            self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"- Task(á=a) (1 failure)\")\n\n    def test_unicode_param_name_html(self):\n        self.email().format = \"html\"\n        for batch_mode in (\"all\", \"unbatched_params\"):\n            self.send_email.reset_mock()\n            bn = BatchNotifier(batch_mode=batch_mode)\n            bn.add_failure(\"Task(á=a)\", \"Task\", {\"á\": \"a\"}, \"error\", [])\n            bn.send_email()\n            self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"<ul>\\n<li>Task(á=a) (1 failure)\\n</ul>\")\n\n    def test_unicode_class_name(self):\n        bn = BatchNotifier()\n        bn.add_failure(\"Tásk()\", \"Tásk\", {}, \"error\", [])\n        bn.send_email()\n        self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"- Tásk() (1 failure)\")\n\n    def test_unicode_class_name_html(self):\n        self.email().format = \"html\"\n        bn = BatchNotifier()\n        bn.add_failure(\"Tásk()\", \"Tásk\", {}, \"error\", [])\n        bn.send_email()\n        self.check_email_send(\"Luigi: 1 failure in the last 60 minutes\", \"<ul>\\n<li>Tásk() (1 failure)\\n</ul>\")\n"
  },
  {
    "path": "test/choice_parameter_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import unittest\n\nimport luigi\n\n\nclass ChoiceParameterTest(unittest.TestCase):\n    def test_parse_str(self):\n        d = luigi.ChoiceParameter(choices=[\"1\", \"2\", \"3\"])\n        self.assertEqual(\"3\", d.parse(\"3\"))\n\n    def test_parse_int(self):\n        d = luigi.ChoiceParameter(var_type=int, choices=[1, 2, 3])\n        self.assertEqual(3, d.parse(3))\n\n    def test_parse_int_conv(self):\n        d = luigi.ChoiceParameter(var_type=int, choices=[1, 2, 3])\n        self.assertEqual(3, d.parse(\"3\"))\n\n    def test_invalid_choice(self):\n        d = luigi.ChoiceParameter(choices=[\"1\", \"2\", \"3\"])\n        self.assertRaises(ValueError, lambda: d.parse(\"xyz\"))\n\n    def test_invalid_choice_type(self):\n        self.assertRaises(AssertionError, lambda: luigi.ChoiceParameter(var_type=int, choices=[1, 2, \"3\"]))\n\n    def test_choices_parameter_exception(self):\n        self.assertRaises(luigi.parameter.ParameterException, lambda: luigi.ChoiceParameter(var_type=int))\n\n    def test_hash_str(self):\n        class Foo(luigi.Task):\n            args = luigi.ChoiceParameter(var_type=str, choices=[\"1\", \"2\", \"3\"])\n\n        p = luigi.ChoiceParameter(var_type=str, choices=[\"3\", \"2\", \"1\"])\n        self.assertEqual(hash(Foo(args=\"3\").args), hash(p.parse(\"3\")))\n\n    def 
test_serialize_parse(self):\n        a = luigi.ChoiceParameter(var_type=str, choices=[\"1\", \"2\", \"3\"])\n        b = \"3\"\n        self.assertEqual(b, a.parse(a.serialize(b)))\n\n    def test_invalid_choice_task(self):\n        class Foo(luigi.Task):\n            args = luigi.ChoiceParameter(var_type=str, choices=[\"1\", \"2\", \"3\"])\n\n        self.assertRaises(ValueError, lambda: Foo(args=\"4\"))\n"
  },
  {
    "path": "test/clone_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import unittest\n\nimport luigi\nimport luigi.notifications\n\nluigi.notifications.DEBUG = True\n\n\nclass LinearSum(luigi.Task):\n    lo = luigi.IntParameter()\n    hi = luigi.IntParameter()\n\n    def requires(self):\n        if self.hi > self.lo:\n            return self.clone(hi=self.hi - 1)\n\n    def run(self):\n        if self.hi > self.lo:\n            self.s = self.requires().s + self.f(self.hi - 1)\n        else:\n            self.s = 0\n        self.complete = lambda: True  # workaround since we don't write any output\n\n    def complete(self):\n        return False\n\n    def f(self, x):\n        return x\n\n\nclass PowerSum(LinearSum):\n    p = luigi.IntParameter()\n\n    def f(self, x):\n        return x**self.p\n\n\nclass CloneTest(unittest.TestCase):\n    def test_args(self):\n        t = LinearSum(lo=42, hi=45)\n        self.assertEqual(t.param_args, (42, 45))\n        self.assertEqual(t.param_kwargs, {\"lo\": 42, \"hi\": 45})\n\n    def test_recursion(self):\n        t = LinearSum(lo=42, hi=45)\n        luigi.build([t], local_scheduler=True)\n        self.assertEqual(t.s, 42 + 43 + 44)\n\n    def test_inheritance(self):\n        t = PowerSum(lo=42, hi=45, p=2)\n        luigi.build([t], local_scheduler=True)\n        self.assertEqual(t.s, 42**2 + 43**2 + 44**2)\n\n    def 
test_inheritance_from_non_parameter(self):\n        \"\"\"\n        Cloning can pull non-source-parameters from source to target parameter.\n        \"\"\"\n\n        class SubTask(luigi.Task):\n            lo = 1\n\n            @property\n            def hi(self):\n                return 2\n\n        t1 = SubTask()\n        t2 = t1.clone(cls=LinearSum)\n        self.assertEqual(t2.lo, 1)\n        self.assertEqual(t2.hi, 2)\n"
  },
  {
    "path": "test/cmdline_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\nimport subprocess\n\nimport mock\nfrom helpers import unittest\n\nimport luigi\nimport luigi.cmdline\nfrom luigi.configuration import LuigiTomlParser, get_config\nfrom luigi.mock import MockTarget\nfrom luigi.setup_logging import DaemonLogging, InterfaceLogging\n\n\nclass SomeTask(luigi.Task):\n    n = luigi.IntParameter()\n\n    def output(self):\n        return MockTarget(\"/tmp/test_%d\" % self.n)\n\n    def run(self):\n        f = self.output().open(\"w\")\n        f.write(\"done\")\n        f.close()\n\n\nclass AmbiguousClass(luigi.Task):\n    pass\n\n\nclass AmbiguousClass(luigi.Task):  # NOQA\n    pass\n\n\nclass TaskWithSameName(luigi.Task):\n    def run(self):\n        self.x = 42\n\n\nclass TaskWithSameName(luigi.Task):  # NOQA\n    # there should be no ambiguity\n\n    def run(self):\n        self.x = 43\n\n\nclass WriteToFile(luigi.Task):\n    filename = luigi.Parameter()\n\n    def output(self):\n        return luigi.LocalTarget(self.filename)\n\n    def run(self):\n        f = self.output().open(\"w\")\n        print(\"foo\", file=f)\n        f.close()\n\n\nclass FooBaseClass(luigi.Task):\n    x = luigi.Parameter(default=\"foo_base_default\")\n\n\nclass FooSubClass(FooBaseClass):\n    pass\n\n\nclass ATaskThatFails(luigi.Task):\n    def run(self):\n        raise ValueError()\n\n\nclass 
RequiredConfig(luigi.Config):\n    required_test_param = luigi.Parameter()\n\n\nclass TaskThatRequiresConfig(luigi.WrapperTask):\n    def requires(self):\n        if RequiredConfig().required_test_param == \"A\":\n            return SubTaskThatFails()\n\n\nclass SubTaskThatFails(luigi.Task):\n    def complete(self):\n        return False\n\n    def run(self):\n        raise Exception()\n\n\nclass CmdlineTest(unittest.TestCase):\n    def setUp(self):\n        MockTarget.fs.clear()\n        DaemonLogging._configured = False\n\n    def tearDown(self):\n        DaemonLogging._configured = False\n        DaemonLogging.config = get_config()\n        InterfaceLogging.config = get_config()\n\n    def _clean_config(self):\n        DaemonLogging.config = LuigiTomlParser()\n        DaemonLogging.config.data = {}\n\n    def _restore_config(self):\n        DaemonLogging.config = LuigiTomlParser.instance()\n\n    @mock.patch(\"logging.getLogger\")\n    def test_cmdline_main_task_cls(self, logger):\n        luigi.run([\"--local-scheduler\", \"--no-lock\", \"--n\", \"100\"], main_task_cls=SomeTask)\n        self.assertEqual(dict(MockTarget.fs.get_all_data()), {\"/tmp/test_100\": b\"done\"})\n\n    @mock.patch(\"logging.getLogger\")\n    def test_cmdline_local_scheduler(self, logger):\n        luigi.run([\"SomeTask\", \"--no-lock\", \"--n\", \"101\"], local_scheduler=True)\n        self.assertEqual(dict(MockTarget.fs.get_all_data()), {\"/tmp/test_101\": b\"done\"})\n\n    @mock.patch(\"logging.getLogger\")\n    def test_cmdline_other_task(self, logger):\n        luigi.run([\"--local-scheduler\", \"--no-lock\", \"SomeTask\", \"--n\", \"1000\"])\n        self.assertEqual(dict(MockTarget.fs.get_all_data()), {\"/tmp/test_1000\": b\"done\"})\n\n    @mock.patch(\"logging.getLogger\")\n    def test_cmdline_ambiguous_class(self, logger):\n        self.assertRaises(Exception, luigi.run, [\"--local-scheduler\", \"--no-lock\", \"AmbiguousClass\"])\n\n    @mock.patch(\"logging.getLogger\")\n   
 @mock.patch(\"logging.StreamHandler\")\n    def test_setup_interface_logging(self, handler, logger):\n        opts = type(\"opts\", (), {})\n        opts.background = False\n        opts.logdir = False\n        opts.logging_conf_file = None\n        opts.log_level = \"INFO\"\n\n        handler.return_value = mock.Mock(name=\"stream_handler\")\n\n        InterfaceLogging._configured = False\n        InterfaceLogging.config = LuigiTomlParser()\n        InterfaceLogging.config.data = {}\n        InterfaceLogging.setup(opts)\n\n        self.assertEqual([mock.call(handler.return_value)], logger.return_value.addHandler.call_args_list)\n\n        InterfaceLogging._configured = False\n        opts.logging_conf_file = \"/blah\"\n        with self.assertRaises(OSError):\n            InterfaceLogging.setup(opts)\n        InterfaceLogging._configured = False\n\n    @mock.patch(\"argparse.ArgumentParser.print_usage\")\n    def test_non_existent_class(self, print_usage):\n        self.assertRaises(luigi.task_register.TaskClassNotFoundException, luigi.run, [\"--local-scheduler\", \"--no-lock\", \"XYZ\"])\n\n    @mock.patch(\"argparse.ArgumentParser.print_usage\")\n    def test_no_task(self, print_usage):\n        self.assertRaises(SystemExit, luigi.run, [\"--local-scheduler\", \"--no-lock\"])\n\n    def test_luigid_logging_conf(self):\n        with mock.patch(\"luigi.server.run\") as server_run, mock.patch(\"logging.config.fileConfig\") as fileConfig:\n            luigi.cmdline.luigid([])\n            self.assertTrue(server_run.called)\n            # the default test configuration specifies a logging conf file\n            fileConfig.assert_called_with(\"test/testconfig/logging.cfg\")\n\n    def test_luigid_no_logging_conf(self):\n        with mock.patch(\"luigi.server.run\") as server_run, mock.patch(\"logging.basicConfig\") as basicConfig:\n            self._clean_config()\n            DaemonLogging.config.data = {\n                \"core\": {\n                    
\"no_configure_logging\": False,\n                    \"logging_conf_file\": None,\n                }\n            }\n            luigi.cmdline.luigid([])\n            self.assertTrue(server_run.called)\n            self.assertTrue(basicConfig.called)\n\n    def test_luigid_missing_logging_conf(self):\n        with mock.patch(\"luigi.server.run\") as server_run, mock.patch(\"logging.basicConfig\") as basicConfig:\n            self._restore_config()\n            DaemonLogging.config.data = {\n                \"core\": {\n                    \"no_configure_logging\": False,\n                    \"logging_conf_file\": \"nonexistent.cfg\",\n                }\n            }\n            self.assertRaises(Exception, luigi.cmdline.luigid, [])\n            self.assertFalse(server_run.called)\n            self.assertFalse(basicConfig.called)\n\n\nclass InvokeOverCmdlineTest(unittest.TestCase):\n    def _run_cmdline(self, args):\n        env = os.environ.copy()\n        env[\"PYTHONPATH\"] = env.get(\"PYTHONPATH\", \"\") + \":.:test\"\n        print(\"Running: \" + \" \".join(args))  # To simplify rerunning failing tests\n        p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)\n        stdout, stderr = p.communicate()  # Unfortunately subprocess.check_output is 2.7+\n        return p.returncode, stdout, stderr\n\n    def test_bin_luigi(self):\n        t = luigi.LocalTarget(is_tmp=True)\n        args = [\"./bin/luigi\", \"--module\", \"cmdline_test\", \"WriteToFile\", \"--filename\", t.path, \"--local-scheduler\", \"--no-lock\"]\n        self._run_cmdline(args)\n        self.assertTrue(t.exists())\n\n    def test_direct_python(self):\n        t = luigi.LocalTarget(is_tmp=True)\n        args = [\"python\", \"test/cmdline_test.py\", \"WriteToFile\", \"--filename\", t.path, \"--local-scheduler\", \"--no-lock\"]\n        self._run_cmdline(args)\n        self.assertTrue(t.exists())\n\n    def test_python_module(self):\n        t = 
luigi.LocalTarget(is_tmp=True)\n        args = [\"python\", \"-m\", \"luigi\", \"--module\", \"cmdline_test\", \"WriteToFile\", \"--filename\", t.path, \"--local-scheduler\", \"--no-lock\"]\n        self._run_cmdline(args)\n        self.assertTrue(t.exists())\n\n    def test_direct_python_help(self):\n        returncode, stdout, stderr = self._run_cmdline([\"python\", \"test/cmdline_test.py\", \"--help-all\"])\n        self.assertTrue(stdout.find(b\"--FooBaseClass-x\") != -1)\n        self.assertFalse(stdout.find(b\"--x\") != -1)\n\n    def test_direct_python_help_class(self):\n        returncode, stdout, stderr = self._run_cmdline([\"python\", \"test/cmdline_test.py\", \"FooBaseClass\", \"--help\"])\n        self.assertTrue(stdout.find(b\"--FooBaseClass-x\") != -1)\n        self.assertTrue(stdout.find(b\"--x\") != -1)\n\n    def test_bin_luigi_help(self):\n        returncode, stdout, stderr = self._run_cmdline([\"./bin/luigi\", \"--module\", \"cmdline_test\", \"--help-all\"])\n        self.assertTrue(stdout.find(b\"--FooBaseClass-x\") != -1)\n        self.assertFalse(stdout.find(b\"--x\") != -1)\n\n    def test_python_module_luigi_help(self):\n        returncode, stdout, stderr = self._run_cmdline([\"python\", \"-m\", \"luigi\", \"--module\", \"cmdline_test\", \"--help-all\"])\n        self.assertTrue(stdout.find(b\"--FooBaseClass-x\") != -1)\n        self.assertFalse(stdout.find(b\"--x\") != -1)\n\n    def test_bin_luigi_help_no_module(self):\n        returncode, stdout, stderr = self._run_cmdline([\"./bin/luigi\", \"--help\"])\n        self.assertTrue(stdout.find(b\"usage:\") != -1)\n\n    def test_bin_luigi_help_not_spammy(self):\n        \"\"\"\n        Test that `luigi --help` fits on one screen\n        \"\"\"\n        returncode, stdout, stderr = self._run_cmdline([\"./bin/luigi\", \"--help\"])\n        self.assertLessEqual(len(stdout.splitlines()), 15)\n\n    def test_bin_luigi_all_help_spammy(self):\n        \"\"\"\n        Test that `luigi --help-all` 
doesn't fit on a screen\n\n        Naturally, I don't mind this test breaking, but it convinces me that\n        the \"not spammy\" test is actually testing what it claims too.\n        \"\"\"\n        returncode, stdout, stderr = self._run_cmdline([\"./bin/luigi\", \"--help-all\"])\n        self.assertGreater(len(stdout.splitlines()), 15)\n\n    def test_error_mesage_on_misspelled_task(self):\n        returncode, stdout, stderr = self._run_cmdline([\"./bin/luigi\", \"RangeDaili\"])\n        self.assertTrue(stderr.find(b\"RangeDaily\") != -1)\n\n    def test_bin_luigi_no_parameters(self):\n        returncode, stdout, stderr = self._run_cmdline([\"./bin/luigi\"])\n        self.assertTrue(stderr.find(b\"No task specified\") != -1)\n\n    def test_python_module_luigi_no_parameters(self):\n        returncode, stdout, stderr = self._run_cmdline([\"python\", \"-m\", \"luigi\"])\n        self.assertTrue(stderr.find(b\"No task specified\") != -1)\n\n    def test_bin_luigi_help_class(self):\n        returncode, stdout, stderr = self._run_cmdline([\"./bin/luigi\", \"--module\", \"cmdline_test\", \"FooBaseClass\", \"--help\"])\n        self.assertTrue(stdout.find(b\"--FooBaseClass-x\") != -1)\n        self.assertTrue(stdout.find(b\"--x\") != -1)\n\n    def test_python_module_help_class(self):\n        returncode, stdout, stderr = self._run_cmdline([\"python\", \"-m\", \"luigi\", \"--module\", \"cmdline_test\", \"FooBaseClass\", \"--help\"])\n        self.assertTrue(stdout.find(b\"--FooBaseClass-x\") != -1)\n        self.assertTrue(stdout.find(b\"--x\") != -1)\n\n    def test_bin_luigi_options_before_task(self):\n        args = [\"./bin/luigi\", \"--module\", \"cmdline_test\", \"--no-lock\", \"--local-scheduler\", \"--FooBaseClass-x\", \"hello\", \"FooBaseClass\"]\n        returncode, stdout, stderr = self._run_cmdline(args)\n        self.assertEqual(0, returncode)\n\n    def test_bin_fail_on_unrecognized_args(self):\n        returncode, stdout, stderr = 
self._run_cmdline([\"./bin/luigi\", \"--no-lock\", \"--local-scheduler\", \"Task\", \"--unknown-param\", \"hiiii\"])\n        self.assertNotEqual(0, returncode)\n\n    def test_deps_py_script(self):\n        \"\"\"\n        Test the deps.py script.\n        \"\"\"\n        args = \"python luigi/tools/deps.py --module examples.top_artists ArtistToplistToDatabase --date-interval 2015-W10\".split()\n        returncode, stdout, stderr = self._run_cmdline(args)\n        self.assertEqual(0, returncode)\n        self.assertTrue(stdout.find(b\"[FileSystem] data/streams_2015_03_04_faked.tsv\") != -1)\n        self.assertTrue(stdout.find(b\"[DB] localhost\") != -1)\n\n    def test_deps_tree_py_script(self):\n        \"\"\"\n        Test the deps_tree.py script.\n        \"\"\"\n        args = \"python luigi/tools/deps_tree.py --module examples.top_artists AggregateArtists --date-interval 2012-06\".split()\n        returncode, stdout, stderr = self._run_cmdline(args)\n        self.assertEqual(0, returncode)\n        for i in range(1, 30):\n            self.assertTrue(stdout.find((\"-[Streams-{{'date': '2012-06-{0}'}}\".format(str(i).zfill(2))).encode(\"utf-8\")) != -1)\n\n    def test_bin_mentions_misspelled_task(self):\n        \"\"\"\n        Test that the error message is informative when a task is misspelled.\n\n        In particular it should say that the task is misspelled and not that\n        the local parameters do not exist.\n        \"\"\"\n        returncode, stdout, stderr = self._run_cmdline([\"./bin/luigi\", \"--module\", \"cmdline_test\", \"HooBaseClass\", \"--x 5\"])\n        self.assertTrue(stderr.find(b\"FooBaseClass\") != -1)\n        self.assertTrue(stderr.find(b\"--x\") != 0)\n\n    def test_stack_trace_has_no_inner(self):\n        \"\"\"\n        Test that the stack trace for failing tasks are short\n\n        The stack trace shouldn't contain unreasonably much implementation\n        details of luigi In particular it should say that the task is\n       
 misspelled and not that the local parameters do not exist.\n        \"\"\"\n        returncode, stdout, stderr = self._run_cmdline([\"./bin/luigi\", \"--module\", \"cmdline_test\", \"ATaskThatFails\", \"--local-scheduler\", \"--no-lock\"])\n        print(stdout)\n\n        self.assertFalse(stdout.find(b\"run() got an unexpected keyword argument 'tracking_url_callback'\") != -1)\n        self.assertFalse(stdout.find(b\"During handling of the above exception, another exception occurred\") != -1)\n\n    def test_cmd_line_params_are_available_for_execution_summary(self):\n        \"\"\"\n        Test that config parameters specified on the command line are available while generating the execution summary.\n        \"\"\"\n        returncode, stdout, stderr = self._run_cmdline(\n            [\n                \"./bin/luigi\",\n                \"--module\",\n                \"cmdline_test\",\n                \"TaskThatRequiresConfig\",\n                \"--local-scheduler\",\n                \"--no-lock--RequiredConfig-required-test-param\",\n                \"A\",\n            ]\n        )\n        print(stdout)\n        print(stderr)\n\n        self.assertNotEqual(returncode, 1)\n        self.assertFalse(b\"required_test_param\" in stderr)\n\n\nif __name__ == \"__main__\":\n    # Needed for one of the tests\n    luigi.run()\n"
  },
  {
    "path": "test/config_env_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2018 Vote inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport os\n\nfrom helpers import LuigiTestCase, with_config\n\nfrom luigi.configuration import LuigiConfigParser, LuigiTomlParser, get_config\nfrom luigi.configuration.cfg_parser import InterpolationMissingEnvvarError\n\n\nclass ConfigParserTest(LuigiTestCase):\n    environ = {\n        \"TESTVAR\": \"1\",\n    }\n\n    def setUp(self):\n        self.environ_backup = {os.environ[key] for key in self.environ if key in os.environ}\n        for key, value in self.environ.items():\n            os.environ[key] = value\n        LuigiConfigParser._instance = None\n        super(ConfigParserTest, self).setUp()\n\n    def tearDown(self):\n        for key in self.environ:\n            os.environ.pop(key)\n        for key, value in self.environ_backup:\n            os.environ[key] = value\n        if \"LUIGI_CONFIG_PARSER\" in os.environ:\n            del os.environ[\"LUIGI_CONFIG_PARSER\"]\n\n    @with_config(\n        {\n            \"test\": {\n                \"a\": \"testval\",\n                \"b\": \"%(a)s\",\n                \"c\": \"%(a)s%(a)s\",\n            }\n        }\n    )\n    def test_basic_interpolation(self):\n        # Make sure the default ConfigParser behaviour is not broken\n        config = get_config()\n\n        self.assertEqual(config.get(\"test\", \"b\"), config.get(\"test\", \"a\"))\n        
self.assertEqual(config.get(\"test\", \"c\"), 2 * config.get(\"test\", \"a\"))\n\n    @with_config(\n        {\n            \"test\": {\n                \"a\": \"${TESTVAR}\",\n                \"b\": \"${TESTVAR} ${TESTVAR}\",\n                \"c\": \"${TESTVAR} %(a)s\",\n                \"d\": \"${NONEXISTING}\",\n            }\n        }\n    )\n    def test_env_interpolation(self):\n        config = get_config()\n\n        self.assertEqual(config.get(\"test\", \"a\"), \"1\")\n        self.assertEqual(config.getint(\"test\", \"a\"), 1)\n        self.assertEqual(config.getboolean(\"test\", \"a\"), True)\n\n        self.assertEqual(config.get(\"test\", \"b\"), \"1 1\")\n\n        self.assertEqual(config.get(\"test\", \"c\"), \"1 1\")\n\n        with self.assertRaises(InterpolationMissingEnvvarError):\n            config.get(\"test\", \"d\")\n\n    @with_config(\n        {\n            \"test\": {\n                \"foo-bar\": \"fob\",\n                \"baz_qux\": \"bax\",\n            }\n        }\n    )\n    def test_underscore_vs_dash_style(self):\n        config = get_config()\n        self.assertEqual(config.get(\"test\", \"foo-bar\"), \"fob\")\n        self.assertEqual(config.get(\"test\", \"foo_bar\"), \"fob\")\n        self.assertEqual(config.get(\"test\", \"baz-qux\"), \"bax\")\n        self.assertEqual(config.get(\"test\", \"baz_qux\"), \"bax\")\n\n    @with_config(\n        {\n            \"test\": {\n                \"foo-bar\": \"fob\",\n                \"foo_bar\": \"bax\",\n            }\n        }\n    )\n    def test_underscore_vs_dash_style_priority(self):\n        config = get_config()\n        self.assertEqual(config.get(\"test\", \"foo-bar\"), \"bax\")\n        self.assertEqual(config.get(\"test\", \"foo_bar\"), \"bax\")\n\n    def test_default_parser(self):\n        config = get_config()\n        self.assertIsInstance(config, LuigiConfigParser)\n        os.environ[\"LUIGI_CONFIG_PARSER\"] = \"toml\"\n        config = get_config()\n        
self.assertIsInstance(config, LuigiTomlParser)\n"
  },
  {
    "path": "test/config_toml_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2018 Vote inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nfrom helpers import LuigiTestCase\n\nfrom luigi.configuration import LuigiTomlParser, add_config_path, get_config\n\n\nclass TomlConfigParserTest(LuigiTestCase):\n    @classmethod\n    def setUpClass(cls):\n        add_config_path(\"test/testconfig/luigi.toml\")\n        add_config_path(\"test/testconfig/luigi_local.toml\")\n\n    def setUp(self):\n        LuigiTomlParser._instance = None\n        super(TomlConfigParserTest, self).setUp()\n\n    def test_get_config(self):\n        config = get_config(\"toml\")\n        self.assertIsInstance(config, LuigiTomlParser)\n\n    def test_file_reading(self):\n        config = get_config(\"toml\")\n        self.assertIn(\"hdfs\", config.data)\n\n    def test_get(self):\n        config = get_config(\"toml\")\n\n        # test getting\n        self.assertEqual(config.get(\"hdfs\", \"client\"), \"hadoopcli\")\n        self.assertEqual(config.get(\"hdfs\", \"client\", \"test\"), \"hadoopcli\")\n\n        # test default\n        self.assertEqual(config.get(\"hdfs\", \"test\", \"check\"), \"check\")\n        with self.assertRaises(KeyError):\n            config.get(\"hdfs\", \"test\")\n\n        # test override\n        self.assertEqual(config.get(\"hdfs\", \"namenode_host\"), \"localhost\")\n        # test non-string values\n        self.assertEqual(config.get(\"hdfs\", \"namenode_port\"), 50030)\n\n    def 
test_set(self):\n        config = get_config(\"toml\")\n\n        self.assertEqual(config.get(\"hdfs\", \"client\"), \"hadoopcli\")\n        config.set(\"hdfs\", \"client\", \"test\")\n        self.assertEqual(config.get(\"hdfs\", \"client\"), \"test\")\n        config.set(\"hdfs\", \"check\", \"test me\")\n        self.assertEqual(config.get(\"hdfs\", \"check\"), \"test me\")\n\n    def test_has_option(self):\n        config = get_config(\"toml\")\n        self.assertTrue(config.has_option(\"hdfs\", \"client\"))\n        self.assertFalse(config.has_option(\"hdfs\", \"nope\"))\n        self.assertFalse(config.has_option(\"nope\", \"client\"))\n\n\nclass HelpersTest(LuigiTestCase):\n    def test_add_without_install(self):\n        enabled = LuigiTomlParser.enabled\n        LuigiTomlParser.enabled = False\n        with self.assertRaises(ImportError):\n            add_config_path(\"test/testconfig/luigi.toml\")\n        LuigiTomlParser.enabled = enabled\n\n    def test_get_without_install(self):\n        enabled = LuigiTomlParser.enabled\n        LuigiTomlParser.enabled = False\n        with self.assertRaises(ImportError):\n            get_config(\"toml\")\n        LuigiTomlParser.enabled = enabled\n"
  },
  {
    "path": "test/conftest.py",
    "content": "from typing import List\n\nimport pytest\n\nimport luigi.task_register\n\n\n@pytest.fixture(autouse=True)\ndef reset_luigi_registry():\n    \"\"\"Reset the Luigi task registry before and after each test.\n\n    Prevents registry pollution between tests when running with pytest-xdist,\n    where multiple tests execute sequentially within the same worker process.\n    This mirrors the behaviour of LuigiTestCase.setUp/tearDown and applies it\n    to all tests automatically, including those that inherit unittest.TestCase\n    directly without going through LuigiTestCase.\n    \"\"\"\n    original = luigi.task_register.Register._get_reg()\n    luigi.task_register.Register.clear_instance_cache()\n    yield\n    luigi.task_register.Register._set_reg(original)\n    luigi.task_register.Register.clear_instance_cache()\n\n\ndef pytest_collection_modifyitems(items: List[pytest.Item]) -> None:\n    \"\"\"\n    Automatically add the equivalent of pytest.mark.unmarked to any test which has no markers\n\n    For example, enables the ability to target \"contrib + unmarked\" tests (eventually getting rid of the generic \"contrib\" marker):\n      - pytest test/contrib/ -m \"contrib or unmarked\"\n    \"\"\"\n    for item in items:\n        # Check if the item has any markers (custom or builtin)\n        if not any(item.iter_markers()):\n            item.add_marker(pytest.mark.unmarked)\n"
  },
  {
    "path": "test/contrib/__init__.py",
    "content": ""
  },
  {
    "path": "test/contrib/_webhdfs_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\n\nimport pytest\nfrom helpers import unittest\n\nfrom luigi.contrib import webhdfs\n\n\n@pytest.mark.apache\nclass TestWebHdfsTarget(unittest.TestCase):\n    \"\"\"\n    This test requires a running Hadoop cluster with WebHdfs enabled\n    This test requires the luigi.cfg file to have a `hdfs` section\n    with the namenode_host, namenode_port and user settings.\n    \"\"\"\n\n    def setUp(self):\n        self.testDir = \"/tmp/luigi-test\".format()\n        self.path = os.path.join(self.testDir, \"out.txt\")\n        self.client = webhdfs.WebHdfsClient()\n        self.target = webhdfs.WebHdfsTarget(self.path)\n\n    def tearDown(self):\n        if self.client.exists(self.testDir):\n            self.client.remove(self.testDir, recursive=True)\n\n    def test_write(self):\n        self.assertFalse(self.client.exists(self.path))\n        output = self.target.open(\"w\")\n        output.write(\"this is line 1\\n\")\n        output.write(\"this is line #2\\n\")\n        output.close()\n        self.assertTrue(self.client.exists(self.path))\n\n    def test_read(self):\n        self.test_write()\n        input_ = self.target.open(\"r\")\n        all_test = \"this is line 1\\nthis is line #2\\n\"\n        self.assertEqual(all_test, input_.read())\n        input_.close()\n\n    def test_read_lines(self):\n        self.test_write()\n  
      input_ = self.target.open(\"r\")\n        lines = list(input_.readlines())\n        self.assertEqual(lines[0], \"this is line 1\")\n        self.assertEqual(lines[1], \"this is line #2\")\n        input_.close()\n"
  },
  {
    "path": "test/contrib/azureblob_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2018 Microsoft Corporation\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nIntegration tests for azureblob module.\n\"\"\"\n\nimport json\nimport os\nimport unittest\n\nimport pytest\n\nimport luigi\nfrom luigi.contrib.azureblob import AzureBlobClient, AzureBlobTarget\nfrom luigi.target import FileAlreadyExists\n\naccount_name = os.environ.get(\"AZURITE_ACCOUNT_NAME\")\naccount_key = os.environ.get(\"AZURITE_ACCOUNT_KEY\")\nsas_token = os.environ.get(\"AZURITE_SAS_TOKEN\")\ncustom_domain = os.environ.get(\"AZURITE_CUSTOM_DOMAIN\")\nprotocol = os.environ.get(\"AZURITE_PROTOCOL\", \"http\")\nclient = AzureBlobClient(account_name, account_key, sas_token, custom_domain=custom_domain, protocol=protocol)\n\n\n@pytest.mark.azureblob\nclass AzureBlobClientTest(unittest.TestCase):\n    def setUp(self):\n        self.client = client\n\n    def tearDown(self):\n        pass\n\n    def test_splitfilepath_blob_none(self):\n        container, blob = self.client.splitfilepath(\"abc\")\n        self.assertEqual(container, \"abc\")\n        self.assertIsNone(blob)\n\n    def test_splitfilepath_blob_toplevel(self):\n        container, blob = self.client.splitfilepath(\"abc/cde\")\n        self.assertEqual(container, \"abc\")\n        self.assertEqual(blob, \"cde\")\n\n    def test_splitfilepath_blob_nested(self):\n        container, blob = self.client.splitfilepath(\"abc/cde/xyz.txt\")\n        
self.assertEqual(container, \"abc\")\n        self.assertEqual(blob, \"cde/xyz.txt\")\n\n    def test_create_delete_container(self):\n        import datetime\n        import hashlib\n\n        m = hashlib.new(\"md5\", usedforsecurity=False)\n        m.update(datetime.datetime.now().__str__().encode())\n        container_name = m.hexdigest()\n\n        self.assertFalse(self.client.exists(container_name))\n        self.assertTrue(self.client.create_container(container_name))\n        self.assertTrue(self.client.exists(container_name))\n        self.client.delete_container(container_name)\n        self.assertFalse(self.client.exists(container_name))\n\n    def test_upload_copy_move_remove_blob(self):\n        import datetime\n        import hashlib\n        import tempfile\n\n        m = hashlib.new(\"md5\", usedforsecurity=False)\n        m.update(datetime.datetime.now().__str__().encode())\n        container_name = m.hexdigest()\n        m.update(datetime.datetime.now().__str__().encode())\n        from_blob_name = m.hexdigest()\n        from_path = \"{container_name}/{from_blob_name}\".format(container_name=container_name, from_blob_name=from_blob_name)\n        m.update(datetime.datetime.now().__str__().encode())\n        to_blob_name = m.hexdigest()\n        to_path = \"{container_name}/{to_blob_name}\".format(container_name=container_name, to_blob_name=to_blob_name)\n        message = datetime.datetime.now().__str__().encode()\n\n        self.assertTrue(self.client.create_container(container_name))\n        with tempfile.NamedTemporaryFile() as f:\n            f.write(message)\n            f.flush()\n\n            # upload\n            self.client.upload(f.name, container_name, from_blob_name)\n            self.assertTrue(self.client.exists(from_path))\n\n        # mkdir\n        self.assertRaises(FileAlreadyExists, self.client.mkdir, from_path, False, True)\n\n        # mkdir does not actually create anything\n        self.client.mkdir(to_path, True, True)\n    
    self.assertFalse(self.client.exists(to_path))\n\n        # copy\n        self.assertIn(self.client.copy(from_path, to_path)[\"copy_status\"], [\"success\", \"pending\"])\n        self.assertTrue(self.client.exists(to_path))\n\n        # remove\n        self.assertTrue(self.client.remove(from_path))\n        self.assertFalse(self.client.exists(from_path))\n\n        # move back file\n        self.client.move(to_path, from_path)\n        self.assertTrue(self.client.exists(from_path))\n        self.assertFalse(self.client.exists(to_path))\n\n        self.assertTrue(self.client.remove(from_path))\n        self.assertFalse(self.client.exists(from_path))\n\n        # delete container\n        self.client.delete_container(container_name)\n        self.assertFalse(self.client.exists(container_name))\n\n\nclass MovieScriptTask(luigi.Task):\n    def output(self):\n        return AzureBlobTarget(\"luigi-test\", \"movie-cheesy.txt\", client, download_when_reading=False)\n\n    def run(self):\n        client.create_container(\"luigi-test\")\n        with self.output().open(\"w\") as op:\n            op.write(\"I'm going to make him an offer he can't refuse.\\n\")\n            op.write(\"Toto, I've got a feeling we're not in Kansas anymore.\\n\")\n            op.write(\"May the Force be with you.\\n\")\n            op.write(\"Bond. 
James Bond.\\n\")\n            op.write(\"Greed, for lack of a better word, is good.\\n\")\n\n\nclass AzureJsonDumpTask(luigi.Task):\n    def output(self):\n        return AzureBlobTarget(\"luigi-test\", \"stats.json\", client)\n\n    def run(self):\n        with self.output().open(\"w\") as op:\n            json.dump([1, 2, 3], op)\n\n\nclass FinalTask(luigi.Task):\n    def requires(self):\n        return {\"movie\": self.clone(MovieScriptTask), \"np\": self.clone(AzureJsonDumpTask)}\n\n    def run(self):\n        with self.input()[\"movie\"].open(\"r\") as movie, self.input()[\"np\"].open(\"r\") as np, self.output().open(\"w\") as output:\n            movie_lines = movie.read()\n            assert \"Toto, I've got a feeling\" in movie_lines\n            output.write(movie_lines)\n\n            data = json.load(np)\n            assert data == [1, 2, 3]\n            output.write(data.__str__())\n\n    def output(self):\n        return luigi.LocalTarget(\"samefile\")\n\n\n@pytest.mark.azureblob\nclass AzureBlobTargetTest(unittest.TestCase):\n    def setUp(self):\n        self.client = client\n\n    def tearDown(self):\n        pass\n\n    def test_AzureBlobTarget(self):\n        final_task = FinalTask()\n        luigi.build([final_task], local_scheduler=True, log_level=\"NOTSET\")\n        output = final_task.output().open(\"r\").read()\n        assert \"Toto\" in output\n"
  },
  {
    "path": "test/contrib/batch_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2018 Outlier Bio, LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport pytest\nfrom helpers import skipOnTravisAndGithubActions, unittest\n\nimport luigi.contrib.batch as batch\n\ntry:\n    import boto3\n\n    client = boto3.client(\"batch\")\nexcept ImportError:\n    raise unittest.SkipTest(\"boto3 is not installed. BatchTasks require boto3\")\n\n\nclass MockBotoBatchClient:\n    def describe_job_queues(self):\n        return {\"jobQueues\": [{\"jobQueueName\": \"test_queue\", \"state\": \"ENABLED\", \"status\": \"VALID\"}]}\n\n    def list_jobs(self, jobQueue=\"\", jobStatus=\"\"):\n        return {\"jobSummaryList\": [{\"jobName\": \"test_job\", \"jobId\": \"abcd\"}]}\n\n    def describe_jobs(self, jobs=[]):\n        return {\n            \"ResponseMetadata\": {\"HTTPStatusCode\": 200},\n            \"jobs\": [{\"status\": \"SUCCEEDED\", \"attempts\": [{\"container\": {\"logStreamName\": \"test_job_abcd_log_stream\"}}]}],\n        }\n\n    def submit_job(self, jobDefinition=\"\", jobName=\"\", jobQueue=\"\", parameters={}):\n        return {\"jobId\": \"abcd\"}\n\n    def register_job_definition(self, **kwargs):\n        return {\"ResponseMetadata\": {\"HTTPStatusCode\": 200}}\n\n\nclass MockBotoLogsClient:\n    def get_log_events(self, logGroupName=\"\", logStreamName=\"\", startFromHead=True):\n        return {\"events\": [{\"message\": \"log line 1\"}, {\"message\": \"log line 2\"}, 
{\"message\": \"log line 3\"}]}\n\n\n@pytest.mark.aws\n@skipOnTravisAndGithubActions(\"boto3 now importable. These tests need mocked\")\nclass BatchClientTest(unittest.TestCase):\n    def setUp(self):\n        self.bc = batch.BatchClient(poll_time=10)\n        self.bc._client = MockBotoBatchClient()\n        self.bc._log_client = MockBotoLogsClient()\n\n    def test_get_active_queue(self):\n        self.assertEqual(self.bc.get_active_queue(), \"test_queue\")\n\n    def test_get_job_id_from_name(self):\n        self.assertEqual(self.bc.get_job_id_from_name(\"test_job\"), \"abcd\")\n\n    def test_get_job_status(self):\n        self.assertEqual(self.bc.get_job_status(\"abcd\"), \"SUCCEEDED\")\n\n    def test_get_logs(self):\n        log_str = \"log line 1\\nlog line 2\\nlog line 3\"\n        self.assertEqual(self.bc.get_logs(\"test_job_abcd_log_stream\"), log_str)\n\n    def test_submit_job(self):\n        job_id = self.bc.submit_job(\"test_job_def\", {\"param1\": \"foo\", \"param2\": \"bar\"}, job_name=\"test_job\")\n        self.assertEqual(job_id, \"abcd\")\n\n    def test_submit_job_specific_queue(self):\n        job_id = self.bc.submit_job(\"test_job_def\", {\"param1\": \"foo\", \"param2\": \"bar\"}, job_name=\"test_job\", queue=\"test_queue\")\n        self.assertEqual(job_id, \"abcd\")\n\n    def test_submit_job_non_existant_queue(self):\n        with self.assertRaises(Exception):\n            self.bc.submit_job(\"test_job_def\", {\"param1\": \"foo\", \"param2\": \"bar\"}, job_name=\"test_job\", queue=\"non_existant_queue\")\n\n    def test_wait_on_job(self):\n        job_id = self.bc.submit_job(\"test_job_def\", {\"param1\": \"foo\", \"param2\": \"bar\"}, job_name=\"test_job\")\n        self.assertTrue(self.bc.wait_on_job(job_id))\n\n    def test_wait_on_job_failed(self):\n        job_id = self.bc.submit_job(\"test_job_def\", {\"param1\": \"foo\", \"param2\": \"bar\"}, job_name=\"test_job\")\n        self.bc.get_job_status = lambda x: \"FAILED\"\n        with 
self.assertRaises(batch.BatchJobException) as context:\n            self.bc.wait_on_job(job_id)\n            self.assertTrue(\"log line 1\" in context.exception)\n\n\n@pytest.mark.aws\n@skipOnTravisAndGithubActions(\"boto3 now importable. These tests need mocked\")\nclass BatchTaskTest(unittest.TestCase):\n    def setUp(self):\n        self.task = batch.BatchTask(job_definition=\"test_job_def\", job_name=\"test_job\", poll_time=10)\n"
  },
  {
    "path": "test/contrib/beam_dataflow_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2019 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport json\nimport unittest\n\nimport mock\nimport pytest\nfrom mock import MagicMock, patch\n\nimport luigi\nfrom luigi import local_target\nfrom luigi.contrib import beam_dataflow, bigquery, gcs\n\n\nclass TestDataflowParamKeys(beam_dataflow.DataflowParamKeys):\n    runner = \"runner\"\n    project = \"project\"\n    zone = \"zone\"\n    region = \"region\"\n    staging_location = \"stagingLocation\"\n    temp_location = \"tempLocation\"\n    gcp_temp_location = \"gcpTempLocation\"\n    num_workers = \"numWorkers\"\n    autoscaling_algorithm = \"autoscalingAlgorithm\"\n    max_num_workers = \"maxNumWorkers\"\n    disk_size_gb = \"diskSizeGb\"\n    worker_machine_type = \"workerMachineType\"\n    worker_disk_type = \"workerDiskType\"\n    job_name = \"jobName\"\n    service_account = \"serviceAccount\"\n    network = \"network\"\n    subnetwork = \"subnetwork\"\n    labels = \"labels\"\n\n\nclass TestRequires(luigi.ExternalTask):\n    def output(self):\n        return luigi.LocalTarget(path=\"some-input-dir\")\n\n\nclass SimpleTestTask(beam_dataflow.BeamDataflowJobTask):\n    dataflow_params = TestDataflowParamKeys()\n\n    def requires(self):\n        return TestRequires()\n\n    def output(self):\n        return local_target.LocalTarget(path=\"some-output.txt\")\n\n    def dataflow_executable(self):\n        return [\"java\", 
\"com.spotify.luigi.SomeJobClass\"]\n\n\nclass FullTestTask(beam_dataflow.BeamDataflowJobTask):\n    project = \"some-project\"\n    runner = \"DirectRunner\"\n    temp_location = \"some-temp\"\n    staging_location = \"some-staging\"\n    gcp_temp_location = \"some-gcp-temp\"\n    num_workers = 1\n    autoscaling_algorithm = \"THROUGHPUT_BASED\"\n    max_num_workers = 2\n    network = \"some-network\"\n    subnetwork = \"some-subnetwork\"\n    disk_size_gb = 5\n    worker_machine_type = \"n1-standard-4\"\n    job_name = \"SomeJobName\"\n    worker_disk_type = \"compute.googleapis.com/projects//zones//diskTypes/pd-ssd\"\n    service_account = \"some-service-account@google.com\"\n    zone = \"europe-west1-c\"\n    region = \"europe-west1\"\n    labels = {\"k1\": \"v1\"}\n\n    dataflow_params = TestDataflowParamKeys()\n\n    def requires(self):\n        return TestRequires()\n\n    def output(self):\n        return {\"output\": luigi.LocalTarget(path=\"some-output.txt\")}\n\n    def args(self):\n        return [\"--extraArg=present\"]\n\n    def dataflow_executable(self):\n        return [\"java\", \"com.spotify.luigi.SomeJobClass\"]\n\n\nclass FilePatternsTestTask(beam_dataflow.BeamDataflowJobTask):\n    dataflow_params = TestDataflowParamKeys()\n\n    def requires(self):\n        return {\"input1\": TestRequires(), \"input2\": TestRequires()}\n\n    def file_pattern(self):\n        return {\"input2\": \"*.some-ext\"}\n\n    def output(self):\n        return {\"output\": luigi.LocalTarget(path=\"some-output.txt\")}\n\n    def dataflow_executable(self):\n        return [\"java\", \"com.spotify.luigi.SomeJobClass\"]\n\n\nclass DummyCmdLineTestTask(beam_dataflow.BeamDataflowJobTask):\n    dataflow_params = TestDataflowParamKeys()\n\n    def dataflow_executable(self):\n        pass\n\n    def requires(self):\n        return {}\n\n    def output(self):\n        return {}\n\n    def _mk_cmd_line(self):\n        return [\"echo\", '\"hello 
world\"']\n\n\n@pytest.mark.gcloud\nclass BeamDataflowTest(unittest.TestCase):\n    def test_dataflow_simple_cmd_line_args(self):\n        task = SimpleTestTask()\n        task.runner = \"DirectRunner\"\n\n        expected = [\"java\", \"com.spotify.luigi.SomeJobClass\", \"--runner=DirectRunner\", \"--input=some-input-dir/part-*\", \"--output=some-output.txt\"]\n\n        self.assertEqual(task._mk_cmd_line(), expected)\n\n    def test_dataflow_full_cmd_line_args(self):\n        full_test_task = FullTestTask()\n        cmd_line_args = full_test_task._mk_cmd_line()\n\n        expected = [\n            \"java\",\n            \"com.spotify.luigi.SomeJobClass\",\n            \"--runner=DirectRunner\",\n            \"--project=some-project\",\n            \"--zone=europe-west1-c\",\n            \"--region=europe-west1\",\n            \"--stagingLocation=some-staging\",\n            \"--tempLocation=some-temp\",\n            \"--gcpTempLocation=some-gcp-temp\",\n            \"--numWorkers=1\",\n            \"--autoscalingAlgorithm=THROUGHPUT_BASED\",\n            \"--maxNumWorkers=2\",\n            \"--diskSizeGb=5\",\n            \"--workerMachineType=n1-standard-4\",\n            \"--workerDiskType=compute.googleapis.com/projects//zones//diskTypes/pd-ssd\",\n            \"--network=some-network\",\n            \"--subnetwork=some-subnetwork\",\n            \"--jobName=SomeJobName\",\n            \"--serviceAccount=some-service-account@google.com\",\n            '--labels={\"k1\": \"v1\"}',\n            \"--extraArg=present\",\n            \"--input=some-input-dir/part-*\",\n            \"--output=some-output.txt\",\n        ]\n\n        self.assertEqual(json.loads(cmd_line_args[19][9:]), {\"k1\": \"v1\"})\n        self.assertEqual(cmd_line_args, expected)\n\n    def test_dataflow_with_file_patterns(self):\n        cmd_line_args = FilePatternsTestTask()._mk_cmd_line()\n\n        self.assertIn(\"--input1=some-input-dir/part-*\", cmd_line_args)\n        
self.assertIn(\"--input2=some-input-dir/*.some-ext\", cmd_line_args)\n\n    def test_dataflow_with_invalid_file_patterns(self):\n        task = FilePatternsTestTask()\n        task.file_pattern = MagicMock(return_value=\"notadict\")\n        with self.assertRaises(ValueError):\n            task._mk_cmd_line()\n\n    def test_dataflow_input_arg_formatting(self):\n        class TestTaskListOfTargetsInput(SimpleTestTask):\n            class TestRequiresListOfTargets(luigi.ExternalTask):\n                def output(self):\n                    return [luigi.LocalTarget(path=\"some-input-1\"), luigi.LocalTarget(path=\"some-input-2\")]\n\n            def requires(self):\n                return self.TestRequiresListOfTargets()\n\n        task_list_input = TestTaskListOfTargetsInput()\n        self.assertEqual(task_list_input._format_input_args(), [\"--input=some-input-1/part-*,some-input-2/part-*\"])\n\n        class TestTaskListOfTuplesInput(SimpleTestTask):\n            class TestRequiresListOfTuples(luigi.ExternalTask):\n                def output(self):\n                    return [(\"input1\", luigi.LocalTarget(path=\"some-input-1\")), (\"input2\", luigi.LocalTarget(path=\"some-input-2\"))]\n\n            def requires(self):\n                return self.TestRequiresListOfTuples()\n\n        task_list_tuples_input = TestTaskListOfTuplesInput()\n        self.assertEqual(task_list_tuples_input._format_input_args(), [\"--input1=some-input-1/part-*\", \"--input2=some-input-2/part-*\"])\n\n        class TestTaskDictInput(SimpleTestTask):\n            class TestRequiresDict(luigi.ExternalTask):\n                def output(self):\n                    return {\"input1\": luigi.LocalTarget(path=\"some-input-1\"), \"input2\": luigi.LocalTarget(path=\"some-input-2\")}\n\n            def requires(self):\n                return self.TestRequiresDict()\n\n        task_dict_input = TestTaskDictInput()\n        self.assertEqual(task_dict_input._format_input_args(), 
[\"--input1=some-input-1/part-*\", \"--input2=some-input-2/part-*\"])\n\n        class TestTaskTupleInput(SimpleTestTask):\n            class TestRequiresTuple(luigi.ExternalTask):\n                def output(self):\n                    return \"some-key\", luigi.LocalTarget(path=\"some-input\")\n\n            def requires(self):\n                return self.TestRequiresTuple()\n\n        task_tuple_input = TestTaskTupleInput()\n        self.assertEqual(task_tuple_input._format_input_args(), [\"--some-key=some-input/part-*\"])\n\n    def test_task_output_arg_completion(self):\n        class TestCompleteTarget(luigi.Target):\n            def exists(self):\n                return True\n\n        class TestIncompleteTarget(luigi.Target):\n            def exists(self):\n                return False\n\n        class TestTaskDictOfCompleteOutput(SimpleTestTask):\n            def output(self):\n                return {\"output\": TestCompleteTarget()}\n\n        self.assertEqual(TestTaskDictOfCompleteOutput().complete(), True)\n\n        class TestTaskDictOfIncompleteOutput(SimpleTestTask):\n            def output(self):\n                return {\"output\": TestIncompleteTarget()}\n\n        self.assertEqual(TestTaskDictOfIncompleteOutput().complete(), False)\n\n        class TestTaskDictOfMixedCompleteOutput(SimpleTestTask):\n            def output(self):\n                return {\"output1\": TestIncompleteTarget(), \"output2\": TestCompleteTarget()}\n\n        self.assertEqual(TestTaskDictOfMixedCompleteOutput().complete(), False)\n\n    def test_get_target_path(self):\n        bq_target = bigquery.BigQueryTarget(\"p\", \"d\", \"t\", client=\"fake_client\")\n        self.assertEqual(SimpleTestTask.get_target_path(bq_target), \"p:d.t\")\n\n        gcs_target = gcs.GCSTarget(\"gs://foo/bar.txt\", client=\"fake_client\")\n        self.assertEqual(SimpleTestTask.get_target_path(gcs_target), \"gs://foo/bar.txt\")\n\n        with self.assertRaises(ValueError):\n            
SimpleTestTask.get_target_path(\"not_a_target\")\n\n    def test_dataflow_runner_resolution(self):\n        task = SimpleTestTask()\n        # Test that supported runners are passed through\n        for runner in [\"DirectRunner\", \"DataflowRunner\"]:\n            task.runner = runner\n            self.assertEqual(task._get_runner(), runner)\n\n        # Test that unsupported runners throw an error\n        task.runner = \"UnsupportedRunner\"\n        with self.assertRaises(ValueError):\n            task._get_runner()\n\n    def test_dataflow_successful_run_callbacks(self):\n        task = DummyCmdLineTestTask()\n\n        task.before_run = MagicMock()\n        task.validate_output = MagicMock()\n        task.on_successful_run = MagicMock()\n        task.on_successful_output_validation = MagicMock()\n        task.cleanup_on_error = MagicMock()\n\n        task.run()\n\n        task.before_run.assert_called_once_with()\n        task.validate_output.assert_called_once_with()\n        task.cleanup_on_error.assert_not_called()\n        task.on_successful_run.assert_called_once_with()\n        task.on_successful_output_validation.assert_called_once_with()\n\n    def test_dataflow_successful_run_invalid_output_callbacks(self):\n        task = DummyCmdLineTestTask()\n\n        task.before_run = MagicMock()\n        task.validate_output = MagicMock(return_value=False)\n        task.on_successful_run = MagicMock()\n        task.on_successful_output_validation = MagicMock()\n        task.cleanup_on_error = MagicMock()\n\n        with self.assertRaises(ValueError):\n            task.run()\n\n        task.before_run.assert_called_once_with()\n        task.validate_output.assert_called_once_with()\n        task.cleanup_on_error.assert_called_once_with(mock.ANY)\n        task.on_successful_run.assert_called_once_with()\n        task.on_successful_output_validation.assert_not_called()\n\n    @patch(\"luigi.contrib.beam_dataflow.subprocess.Popen.wait\", return_value=1)\n    
@patch(\"luigi.contrib.beam_dataflow.os._exit\", side_effect=OSError)\n    def test_dataflow_failed_run_callbacks(self, popen, os_exit):\n        task = DummyCmdLineTestTask()\n\n        task.before_run = MagicMock()\n        task.validate_output = MagicMock()\n        task.on_successful_run = MagicMock()\n        task.on_successful_output_validation = MagicMock()\n        task.cleanup_on_error = MagicMock()\n\n        with self.assertRaises(OSError):\n            task.run()\n\n        task.before_run.assert_called_once_with()\n        task.validate_output.assert_not_called()\n        task.cleanup_on_error.assert_called_once_with(mock.ANY)\n        task.on_successful_run.assert_not_called()\n        task.on_successful_output_validation.assert_not_called()\n"
  },
  {
    "path": "test/contrib/bigquery_avro_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2019 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nThese are the unit tests for the BigQueryLoadAvro class.\n\"\"\"\n\nimport unittest\n\nimport avro\nimport avro.schema\nimport pytest\n\nfrom luigi.contrib.bigquery_avro import BigQueryLoadAvro\n\n\n@pytest.mark.gcloud\nclass BigQueryAvroTest(unittest.TestCase):\n    def test_writer_schema_method_existence(self):\n        schema_json = \"\"\"\n        {\n            \"namespace\": \"example.avro\",\n            \"type\": \"record\",\n            \"name\": \"User\",\n            \"fields\": [\n                {\"name\": \"name\", \"type\": \"string\"},\n                {\"name\": \"favorite_number\",  \"type\": [\"int\", \"null\"]},\n                {\"name\": \"favorite_color\", \"type\": [\"string\", \"null\"]}\n            ]\n        }\n        \"\"\"\n        avro_schema = avro.schema.Parse(schema_json)\n        reader = avro.io.DatumReader(avro_schema, avro_schema)\n        actual_schema = BigQueryLoadAvro._get_writer_schema(reader)\n        self.assertEqual(actual_schema, avro_schema, \"writer(s) avro_schema attribute not found\")\n        # otherwise AttributeError is thrown\n"
  },
  {
    "path": "test/contrib/bigquery_gcloud_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015 Twitter Inc\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nThis is an integration test for the BigQuery-luigi binding.\n\nThis test requires credentials that can access GCS & access to a bucket below.\nFollow the directions in the gcloud tools to set up local credentials.\n\"\"\"\n\nimport json\nimport os\nimport unittest\n\nimport luigi\n\ntry:\n    import google.auth\n    import googleapiclient.errors\nexcept ImportError:\n    raise unittest.SkipTest(\"Unable to load googleapiclient module\")\nimport avro.schema\nimport pytest\nfrom avro.datafile import DataFileWriter\nfrom avro.io import DatumWriter\nfrom helpers import unittest\n\nfrom luigi.contrib import bigquery, bigquery_avro, gcs\nfrom luigi.contrib.bigquery import BigQueryExecutionError\nfrom luigi.contrib.gcs import GCSTarget\n\n# In order to run this test, you should set your GCS/BigQuery project/bucket.\n# Unfortunately there's no mock\nPROJECT_ID = os.environ.get(\"GCS_TEST_PROJECT_ID\", \"your_project_id_here\")\nBUCKET_NAME = os.environ.get(\"GCS_TEST_BUCKET\", \"your_test_bucket_here\")\nTEST_FOLDER = os.environ.get(\"TRAVIS_BUILD_ID\", \"bigquery_test_folder\")\nDATASET_ID = os.environ.get(\"BQ_TEST_DATASET_ID\", \"luigi_tests\")\nEU_DATASET_ID = os.environ.get(\"BQ_TEST_EU_DATASET_ID\", \"luigi_tests_eu\")\nEU_LOCATION = \"EU\"\nUS_LOCATION = \"US\"\n\nCREDENTIALS, _ = google.auth.default()\n\n\ndef bucket_url(suffix):\n  
  \"\"\"\n    Actually it's bucket + test folder name\n    \"\"\"\n    return \"gs://{}/{}/{}\".format(BUCKET_NAME, TEST_FOLDER, suffix)\n\n\n@pytest.mark.gcloud\nclass TestLoadTask(bigquery.BigQueryLoadTask):\n    source = luigi.Parameter()\n    table = luigi.Parameter()\n    dataset = luigi.Parameter()\n    location = luigi.Parameter(default=None)\n\n    @property\n    def schema(self):\n        return [\n            {\"mode\": \"NULLABLE\", \"name\": \"field1\", \"type\": \"STRING\"},\n            {\"mode\": \"NULLABLE\", \"name\": \"field2\", \"type\": \"INTEGER\"},\n        ]\n\n    def source_uris(self):\n        return [self.source]\n\n    def output(self):\n        return bigquery.BigQueryTarget(PROJECT_ID, self.dataset, self.table, location=self.location)\n\n\n@pytest.mark.gcloud\nclass TestRunQueryTask(bigquery.BigQueryRunQueryTask):\n    query = \"\"\" SELECT 'hello' as field1, 2 as field2 \"\"\"\n    table = luigi.Parameter()\n    dataset = luigi.Parameter()\n\n    def output(self):\n        return bigquery.BigQueryTarget(PROJECT_ID, self.dataset, self.table)\n\n\n@pytest.mark.gcloud\nclass TestExtractTask(bigquery.BigQueryExtractTask):\n    source = luigi.Parameter()\n    table = luigi.Parameter()\n    dataset = luigi.Parameter()\n    location = luigi.Parameter(default=None)\n    extract_gcs_file = luigi.Parameter()\n\n    destination_format = luigi.Parameter(default=bigquery.DestinationFormat.CSV)\n    print_header = luigi.Parameter(default=bigquery.PrintHeader.TRUE)\n    field_delimiter = luigi.Parameter(default=bigquery.FieldDelimiter.COMMA)\n\n    def output(self):\n        return GCSTarget(bucket_url(self.extract_gcs_file))\n\n    def requires(self):\n        return TestLoadTask(source=self.source, dataset=self.dataset, table=self.table)\n\n\n@pytest.mark.gcloud\nclass BigQueryGcloudTest(unittest.TestCase):\n    def setUp(self):\n        self.bq_client = bigquery.BigQueryClient(CREDENTIALS)\n        self.gcs_client = gcs.GCSClient(CREDENTIALS)\n\n 
       # Setup GCS input data\n        try:\n            self.gcs_client.client.buckets().insert(project=PROJECT_ID, body={\"name\": BUCKET_NAME, \"location\": EU_LOCATION}).execute()\n        except googleapiclient.errors.HttpError as ex:\n            # todo verify that existing dataset is not US\n            if ex.resp.status != 409:  # bucket already exists\n                raise\n\n        self.gcs_client.remove(bucket_url(\"\"), recursive=True)\n        self.gcs_client.mkdir(bucket_url(\"\"))\n\n        text = \"\\n\".join(map(json.dumps, [{\"field1\": \"hi\", \"field2\": 1}, {\"field1\": \"bye\", \"field2\": 2}]))\n        self.gcs_file = bucket_url(self.id())\n        self.gcs_client.put_string(text, self.gcs_file)\n\n        # Setup BigQuery datasets\n        self.table = bigquery.BQTable(project_id=PROJECT_ID, dataset_id=DATASET_ID, table_id=self.id().split(\".\")[-1], location=None)\n        self.table_eu = bigquery.BQTable(project_id=PROJECT_ID, dataset_id=EU_DATASET_ID, table_id=self.id().split(\".\")[-1] + \"_eu\", location=EU_LOCATION)\n\n        self.addCleanup(self.gcs_client.remove, bucket_url(\"\"), recursive=True)\n        self.addCleanup(self.bq_client.delete_dataset, self.table.dataset)\n        self.addCleanup(self.bq_client.delete_dataset, self.table_eu.dataset)\n\n        self.bq_client.delete_dataset(self.table.dataset)\n        self.bq_client.delete_dataset(self.table_eu.dataset)\n        self.bq_client.make_dataset(self.table.dataset, body={})\n        self.bq_client.make_dataset(self.table_eu.dataset, body={})\n\n    def test_extract_to_gcs_csv(self):\n        task1 = TestLoadTask(source=self.gcs_file, dataset=self.table.dataset.dataset_id, table=self.table.table_id)\n        task1.run()\n\n        task2 = TestExtractTask(\n            source=self.gcs_file,\n            dataset=self.table.dataset.dataset_id,\n            table=self.table.table_id,\n            extract_gcs_file=self.id() + \"_extract_file\",\n            
destination_format=bigquery.DestinationFormat.CSV,\n        )\n        task2.run()\n\n        self.assertTrue(task2.output().exists)\n\n    def test_extract_to_gcs_csv_alternate(self):\n        task1 = TestLoadTask(source=self.gcs_file, dataset=self.table.dataset.dataset_id, table=self.table.table_id)\n        task1.run()\n\n        task2 = TestExtractTask(\n            source=self.gcs_file,\n            dataset=self.table.dataset.dataset_id,\n            table=self.table.table_id,\n            extract_gcs_file=self.id() + \"_extract_file\",\n            destination_format=bigquery.DestinationFormat.CSV,\n            print_header=bigquery.PrintHeader.FALSE,\n            field_delimiter=bigquery.FieldDelimiter.PIPE,\n        )\n        task2.run()\n\n        self.assertTrue(task2.output().exists)\n\n    def test_extract_to_gcs_json(self):\n        task1 = TestLoadTask(source=self.gcs_file, dataset=self.table.dataset.dataset_id, table=self.table.table_id)\n        task1.run()\n\n        task2 = TestExtractTask(\n            source=self.gcs_file,\n            dataset=self.table.dataset.dataset_id,\n            table=self.table.table_id,\n            extract_gcs_file=self.id() + \"_extract_file\",\n            destination_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON,\n        )\n        task2.run()\n\n        self.assertTrue(task2.output().exists)\n\n    def test_extract_to_gcs_avro(self):\n        task1 = TestLoadTask(source=self.gcs_file, dataset=self.table.dataset.dataset_id, table=self.table.table_id)\n        task1.run()\n\n        task2 = TestExtractTask(\n            source=self.gcs_file,\n            dataset=self.table.dataset.dataset_id,\n            table=self.table.table_id,\n            extract_gcs_file=self.id() + \"_extract_file\",\n            destination_format=bigquery.DestinationFormat.AVRO,\n        )\n        task2.run()\n\n        self.assertTrue(task2.output().exists)\n\n    def test_load_eu_to_undefined(self):\n        task = 
TestLoadTask(source=self.gcs_file, dataset=self.table.dataset.dataset_id, table=self.table.table_id, location=EU_LOCATION)\n        self.assertRaises(Exception, task.run)\n\n    def test_load_us_to_eu(self):\n        task = TestLoadTask(source=self.gcs_file, dataset=self.table_eu.dataset.dataset_id, table=self.table_eu.table_id, location=US_LOCATION)\n        self.assertRaises(Exception, task.run)\n\n    def test_load_eu_to_eu(self):\n        task = TestLoadTask(source=self.gcs_file, dataset=self.table_eu.dataset.dataset_id, table=self.table_eu.table_id, location=EU_LOCATION)\n        task.run()\n\n        self.assertTrue(self.bq_client.dataset_exists(self.table_eu))\n        self.assertTrue(self.bq_client.table_exists(self.table_eu))\n        self.assertIn(self.table_eu.dataset_id, list(self.bq_client.list_datasets(self.table_eu.project_id)))\n        self.assertIn(self.table_eu.table_id, list(self.bq_client.list_tables(self.table_eu.dataset)))\n\n    def test_load_undefined_to_eu(self):\n        task = TestLoadTask(source=self.gcs_file, dataset=self.table_eu.dataset.dataset_id, table=self.table_eu.table_id)\n        task.run()\n\n        self.assertTrue(self.bq_client.dataset_exists(self.table_eu))\n        self.assertTrue(self.bq_client.table_exists(self.table_eu))\n        self.assertIn(self.table_eu.dataset_id, list(self.bq_client.list_datasets(self.table_eu.project_id)))\n        self.assertIn(self.table_eu.table_id, list(self.bq_client.list_tables(self.table_eu.dataset)))\n\n    def test_load_new_eu_dataset(self):\n        self.bq_client.delete_dataset(self.table.dataset)\n        self.bq_client.delete_dataset(self.table_eu.dataset)\n\n        self.assertFalse(self.bq_client.dataset_exists(self.table_eu))\n\n        task = TestLoadTask(source=self.gcs_file, dataset=self.table_eu.dataset.dataset_id, table=self.table_eu.table_id, location=EU_LOCATION)\n        task.run()\n\n        self.assertTrue(self.bq_client.dataset_exists(self.table_eu))\n        
self.assertTrue(self.bq_client.table_exists(self.table_eu))\n        self.assertIn(self.table_eu.dataset_id, list(self.bq_client.list_datasets(self.table_eu.project_id)))\n        self.assertIn(self.table_eu.table_id, list(self.bq_client.list_tables(self.table_eu.dataset)))\n\n    def test_copy(self):\n        task = TestLoadTask(source=self.gcs_file, dataset=self.table.dataset.dataset_id, table=self.table.table_id)\n        task.run()\n\n        self.assertTrue(self.bq_client.dataset_exists(self.table))\n        self.assertTrue(self.bq_client.table_exists(self.table))\n        self.assertIn(self.table.dataset_id, list(self.bq_client.list_datasets(self.table.project_id)))\n        self.assertIn(self.table.table_id, list(self.bq_client.list_tables(self.table.dataset)))\n\n        new_table = self.table._replace(table_id=self.table.table_id + \"_copy\")\n        self.bq_client.copy(source_table=self.table, dest_table=new_table)\n        self.assertTrue(self.bq_client.table_exists(new_table))\n        self.bq_client.delete_table(new_table)\n        self.assertFalse(self.bq_client.table_exists(new_table))\n\n    def test_table_uri(self):\n        intended_uri = \"bq://\" + PROJECT_ID + \"/\" + DATASET_ID + \"/\" + self.table.table_id\n        self.assertTrue(self.table.uri == intended_uri)\n\n    def test_run_query(self):\n        task = TestRunQueryTask(table=self.table.table_id, dataset=self.table.dataset.dataset_id)\n        task._BIGQUERY_CLIENT = self.bq_client\n        task.run()\n\n        self.assertTrue(self.bq_client.table_exists(self.table))\n\n    def test_run_successful_job(self):\n        body = {\"configuration\": {\"query\": {\"query\": \"select count(*) from unnest([1,2,3])\"}}}\n\n        job_id = self.bq_client.run_job(PROJECT_ID, body)\n\n        self.assertIsNotNone(job_id)\n        self.assertNotEqual(\"\", job_id)\n\n    def test_run_failing_job(self):\n        body = {\"configuration\": {\"query\": {\"query\": \"this is not a valid 
query\"}}}\n\n        self.assertRaises(BigQueryExecutionError, lambda: self.bq_client.run_job(PROJECT_ID, body))\n\n\n@pytest.mark.gcloud\nclass BigQueryLoadAvroTest(unittest.TestCase):\n    def _produce_test_input(self):\n        schema = avro.schema.parse(\"\"\"\n        {\n          \"type\":\"record\",\n          \"name\":\"TrackEntity2\",\n          \"namespace\":\"com.spotify.entity.schema\",\n          \"doc\":\"Track entity merged from various sources\",\n          \"fields\":[\n            {\n              \"name\":\"map_record\",\n              \"type\":{\n                \"type\":\"map\",\n                \"values\":{\n                  \"type\":\"record\",\n                  \"name\":\"MapNestedRecordObj\",\n                  \"doc\":\"Nested Record in a map doc\",\n                  \"fields\":[\n                    {\n                      \"name\":\"element1\",\n                      \"type\":\"string\",\n                      \"doc\":\"element 1 doc\"\n                    },\n                    {\n                      \"name\":\"element2\",\n                      \"type\":[\n                        \"null\",\n                        \"string\"\n                      ],\n                      \"doc\":\"element 2 doc\"\n                    }\n                  ]\n                }\n              },\n              \"doc\":\"doc for map\"\n            },\n            {\n              \"name\":\"additional\",\n              \"type\":{\n                \"type\":\"map\",\n                \"values\":\"string\"\n              },\n              \"doc\":\"doc for second map record\"\n            },\n            {\n              \"name\":\"track_gid\",\n              \"type\":\"string\",\n              \"doc\":\"Track GID in hexadecimal string\"\n            },\n            {\n              \"name\":\"track_uri\",\n              \"type\":\"string\",\n              \"doc\":\"Track URI in base62 string\"\n            },\n            {\n              
\"name\":\"Suit\",\n              \"type\":{\n                \"type\":\"enum\",\n                \"name\":\"Suit\",\n                \"doc\":\"enum documentation broz\",\n                \"symbols\":[\n                  \"SPADES\",\n                  \"HEARTS\",\n                  \"DIAMONDS\",\n                  \"CLUBS\"\n                ]\n              }\n            },\n            {\n              \"name\":\"FakeRecord\",\n              \"type\":{\n                \"type\":\"record\",\n                \"name\":\"FakeRecord\",\n                \"namespace\":\"com.spotify.data.types.coolType\",\n                \"doc\":\"My Fake Record doc\",\n                \"fields\":[\n                  {\n                    \"name\":\"coolName\",\n                    \"type\":\"string\",\n                    \"doc\":\"Cool Name doc\"\n                  }\n                ]\n              }\n            },\n            {\n              \"name\":\"master_metadata\",\n              \"type\":[\n                \"null\",\n                {\n                  \"type\":\"record\",\n                  \"name\":\"MasterMetadata\",\n                  \"namespace\":\"com.spotify.data.types.metadata\",\n                  \"doc\":\"metadoc\",\n                  \"fields\":[\n                    {\n                      \"name\":\"track\",\n                      \"type\":[\n                        \"null\",\n                        {\n                          \"type\":\"record\",\n                          \"name\":\"Track\",\n                          \"doc\":\"Sqoop import of track\",\n                          \"fields\":[\n                            {\n                              \"name\":\"id\",\n                              \"type\":[\n                                \"null\",\n                                \"int\"\n                              ],\n                              \"doc\":\"id description field\",\n                              \"default\":null,\n            
                  \"columnName\":\"id\",\n                              \"sqlType\":\"4\"\n                            },\n                            {\n                              \"name\":\"name\",\n                              \"type\":[\n                                \"null\",\n                                \"string\"\n                              ],\n                              \"doc\":\"name description field\",\n                              \"default\":null,\n                              \"columnName\":\"name\",\n                              \"sqlType\":\"12\"\n                            }\n                          ],\n                          \"tableName\":\"track\"\n                        }\n                      ],\n                      \"default\":null\n                    }\n                  ]\n                }\n              ]\n            },\n            {\n              \"name\":\"children\",\n              \"type\":{\n                \"type\":\"array\",\n                \"items\":{\n                  \"type\":\"record\",\n                  \"name\":\"Child\",\n                  \"doc\":\"array of children documentation\",\n                  \"fields\":[\n                    {\n                      \"name\":\"name\",\n                      \"type\":\"string\",\n                      \"doc\":\"my specific child\\'s doc\"\n                    }\n                  ]\n                }\n              }\n            }\n          ]\n        }\"\"\")\n        self.addCleanup(os.remove, \"tmp.avro\")\n        writer = DataFileWriter(open(\"tmp.avro\", \"wb\"), DatumWriter(), schema)\n        writer.append(\n            {\n                \"track_gid\": \"Cool guid\",\n                \"map_record\": {\"Cool key\": {\"element1\": \"element 1 data\", \"element2\": \"element 2 data\"}},\n                \"additional\": {\"key1\": \"value1\"},\n                \"master_metadata\": {\"track\": {\"id\": 1, \"name\": \"Cool Track Name\"}},\n   
             \"track_uri\": \"Totally a url here\",\n                \"FakeRecord\": {\"coolName\": \"Cool Fake Record Name\"},\n                \"Suit\": \"DIAMONDS\",\n                \"children\": [{\"name\": \"Bob\"}, {\"name\": \"Joe\"}],\n            }\n        )\n        writer.close()\n        self.gcs_client.put(\"tmp.avro\", self.gcs_dir_url + \"/tmp.avro\")\n\n    def setUp(self):\n        self.gcs_client = gcs.GCSClient(CREDENTIALS)\n        self.bq_client = bigquery.BigQueryClient(CREDENTIALS)\n\n        self.table_id = \"avro_bq_table\"\n        self.gcs_dir_url = \"gs://\" + BUCKET_NAME + \"/foo\"\n        self.addCleanup(self.gcs_client.remove, self.gcs_dir_url)\n        self.addCleanup(self.bq_client.delete_dataset, bigquery.BQDataset(PROJECT_ID, DATASET_ID, EU_LOCATION))\n        self._produce_test_input()\n\n    def test_load_avro_dir_and_propagate_doc(self):\n        class BigQueryLoadAvroTestInput(luigi.ExternalTask):\n            def output(_):\n                return gcs.GCSTarget(self.gcs_dir_url)\n\n        class BigQueryLoadAvroTestTask(bigquery_avro.BigQueryLoadAvro):\n            def requires(_):\n                return BigQueryLoadAvroTestInput()\n\n            def output(_):\n                return bigquery.BigQueryTarget(PROJECT_ID, DATASET_ID, self.table_id, location=EU_LOCATION)\n\n        task = BigQueryLoadAvroTestTask()\n        self.assertFalse(task.complete())\n        task.run()\n        self.assertTrue(task.complete())\n\n        table = self.bq_client.client.tables().get(projectId=PROJECT_ID, datasetId=DATASET_ID, tableId=self.table_id).execute()\n        self.assertEqual(table[\"description\"], \"Track entity merged from various sources\")\n        # First map\n        self.assertEqual(table[\"schema\"][\"fields\"][0][\"description\"], \"doc for map\")\n        # key\n        self.assertFalse(\"description\" in table[\"schema\"][\"fields\"][0][\"fields\"][0])\n        # Value\n        
self.assertEqual(table[\"schema\"][\"fields\"][0][\"fields\"][1][\"description\"], \"Nested Record in a map doc\")\n        # Value record data\n        self.assertEqual(table[\"schema\"][\"fields\"][0][\"fields\"][1][\"fields\"][0][\"description\"], \"element 1 doc\")\n        self.assertEqual(table[\"schema\"][\"fields\"][0][\"fields\"][1][\"fields\"][1][\"description\"], \"element 2 doc\")\n\n        # Second map\n        self.assertEqual(table[\"schema\"][\"fields\"][1][\"description\"], \"doc for second map record\")\n        # key\n        self.assertFalse(\"description\" in table[\"schema\"][\"fields\"][1][\"fields\"][0])\n        # Value\n        self.assertFalse(\"description\" in table[\"schema\"][\"fields\"][1][\"fields\"][1])\n\n        # Several top level Primitive and Enums\n        self.assertEqual(table[\"schema\"][\"fields\"][2][\"description\"], \"Track GID in hexadecimal string\")\n        self.assertEqual(table[\"schema\"][\"fields\"][3][\"description\"], \"Track URI in base62 string\")\n        self.assertEqual(table[\"schema\"][\"fields\"][4][\"description\"], \"enum documentation broz\")\n\n        # Nested Record containing primitive\n        self.assertEqual(table[\"schema\"][\"fields\"][5][\"description\"], \"My Fake Record doc\")\n        self.assertEqual(table[\"schema\"][\"fields\"][5][\"fields\"][0][\"description\"], \"Cool Name doc\")\n\n        # Union with internal Record\n        self.assertEqual(table[\"schema\"][\"fields\"][6][\"description\"], \"metadoc\")\n        self.assertEqual(table[\"schema\"][\"fields\"][6][\"fields\"][0][\"description\"], \"Sqoop import of track\")\n        self.assertEqual(table[\"schema\"][\"fields\"][6][\"fields\"][0][\"fields\"][0][\"description\"], \"id description field\")\n        self.assertEqual(table[\"schema\"][\"fields\"][6][\"fields\"][0][\"fields\"][1][\"description\"], \"name description field\")\n\n        # Array of Primitive\n        
self.assertEqual(table[\"schema\"][\"fields\"][7][\"description\"], \"array of children documentation\")\n        self.assertEqual(table[\"schema\"][\"fields\"][7][\"fields\"][0][\"description\"], \"my specific child's doc\")\n"
  },
  {
    "path": "test/contrib/bigquery_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2019 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nThese are the unit tests for the BigQueryLoadAvro class.\n\"\"\"\n\nimport unittest\n\nimport mock\nimport pytest\nfrom mock.mock import MagicMock\n\nfrom luigi.contrib import bigquery\nfrom luigi.contrib.bigquery import BigQueryClient, BigQueryExtractTask, BigQueryLoadTask, BigQueryRunQueryTask, BigQueryTarget, BQDataset\nfrom luigi.contrib.gcs import GCSTarget\n\n\n@pytest.mark.gcloud\nclass BigQueryLoadTaskTest(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.bigquery.BigQueryClient.run_job\")\n    def test_configure_job(self, run_job):\n        class MyBigQueryLoadTask(BigQueryLoadTask):\n            def source_uris(self):\n                return [\"gs://_\"]\n\n            def configure_job(self, configuration):\n                configuration[\"load\"][\"destinationTableProperties\"] = {\"description\": \"Nice table\"}\n                return configuration\n\n            def output(self):\n                return BigQueryTarget(project_id=\"proj\", dataset_id=\"ds\", table_id=\"t\")\n\n        job = MyBigQueryLoadTask()\n        job.run()\n\n        expected_body = {\n            \"configuration\": {\n                \"load\": {\n                    \"destinationTable\": {\"projectId\": \"proj\", \"datasetId\": \"ds\", \"tableId\": \"t\"},\n                    \"encoding\": \"UTF-8\",\n                    \"sourceFormat\": 
\"NEWLINE_DELIMITED_JSON\",\n                    \"writeDisposition\": \"WRITE_EMPTY\",\n                    \"sourceUris\": [\"gs://_\"],\n                    \"maxBadRecords\": 0,\n                    \"ignoreUnknownValues\": False,\n                    \"autodetect\": True,\n                    \"destinationTableProperties\": {\"description\": \"Nice table\"},\n                }\n            }\n        }\n        run_job.assert_called_with(\"proj\", expected_body, dataset=BQDataset(\"proj\", \"ds\", None))\n\n\n@pytest.mark.gcloud\nclass BigQueryRunQueryTaskTest(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.bigquery.BigQueryClient.run_job\")\n    def test_configure_job(self, run_job):\n        class MyBigQueryRunQuery(BigQueryRunQueryTask):\n            query = \"SELECT @thing\"\n            use_legacy_sql = False\n\n            def configure_job(self, configuration):\n                configuration[\"query\"][\"parameterMode\"] = \"NAMED\"\n                configuration[\"query\"][\"queryParameters\"] = {\"name\": \"thing\", \"parameterType\": {\"type\": \"STRING\"}, \"parameterValue\": {\"value\": \"Nice Thing\"}}\n                return configuration\n\n            def output(self):\n                return BigQueryTarget(project_id=\"proj\", dataset_id=\"ds\", table_id=\"t\")\n\n        job = MyBigQueryRunQuery()\n        job.run()\n\n        expected_body = {\n            \"configuration\": {\n                \"query\": {\n                    \"query\": \"SELECT @thing\",\n                    \"priority\": \"INTERACTIVE\",\n                    \"destinationTable\": {\"projectId\": \"proj\", \"datasetId\": \"ds\", \"tableId\": \"t\"},\n                    \"allowLargeResults\": True,\n                    \"createDisposition\": \"CREATE_IF_NEEDED\",\n                    \"writeDisposition\": \"WRITE_TRUNCATE\",\n                    \"flattenResults\": True,\n                    \"userDefinedFunctionResources\": [],\n                    \"useLegacySql\": 
False,\n                    \"parameterMode\": \"NAMED\",\n                    \"queryParameters\": {\"name\": \"thing\", \"parameterType\": {\"type\": \"STRING\"}, \"parameterValue\": {\"value\": \"Nice Thing\"}},\n                }\n            }\n        }\n        run_job.assert_called_with(\"proj\", expected_body, dataset=BQDataset(\"proj\", \"ds\", None))\n\n\n@pytest.mark.gcloud\nclass BigQueryExtractTaskTest(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.bigquery.BigQueryClient.run_job\")\n    def test_configure_job(self, run_job):\n        class MyBigQueryExtractTask(BigQueryExtractTask):\n            destination_format = \"AVRO\"\n\n            def configure_job(self, configuration):\n                configuration[\"extract\"][\"useAvroLogicalTypes\"] = True\n                return configuration\n\n            def input(self):\n                return BigQueryTarget(project_id=\"proj\", dataset_id=\"ds\", table_id=\"t\")\n\n            def output(self):\n                return GCSTarget(\"gs://_\")\n\n        job = MyBigQueryExtractTask()\n        job.run()\n\n        expected_body = {\n            \"configuration\": {\n                \"extract\": {\n                    \"sourceTable\": {\"projectId\": \"proj\", \"datasetId\": \"ds\", \"tableId\": \"t\"},\n                    \"destinationUris\": [\"gs://_\"],\n                    \"destinationFormat\": \"AVRO\",\n                    \"compression\": \"NONE\",\n                    \"useAvroLogicalTypes\": True,\n                }\n            }\n        }\n        run_job.assert_called_with(\"proj\", expected_body, dataset=BQDataset(\"proj\", \"ds\", None))\n\n\n@pytest.mark.gcloud\nclass BigQueryClientTest(unittest.TestCase):\n    def test_retry_succeeds_on_second_attempt(self):\n        try:\n            from googleapiclient import errors\n        except ImportError:\n            raise unittest.SkipTest(\"Unable to load googleapiclient module\")\n        client = MagicMock(spec=BigQueryClient)\n   
     attempts = 0\n\n        @bigquery.bq_retry\n        def fail_once(bq_client):\n            nonlocal attempts\n            attempts += 1\n            if attempts == 1:\n                raise errors.HttpError(\n                    resp=MagicMock(status=500),\n                    content=b'{\"error\": {\"message\": \"stub\"}',\n                )\n            else:\n                return MagicMock(status=200)\n\n        response = fail_once(client)\n        client._initialise_client.assert_called_once()\n        self.assertEqual(attempts, 2)\n        self.assertEqual(response.status, 200)\n"
  },
  {
    "path": "test/contrib/cascading_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport pytest\nfrom helpers import unittest\n\nimport luigi.target\nfrom luigi.contrib.target import CascadingClient\n\n\n@pytest.mark.contrib\nclass CascadingClientTest(unittest.TestCase):\n    def setUp(self):\n        class FirstClient:\n            def exists(self, pos_arg, kw_arg=\"first\"):\n                if pos_arg < 10:\n                    return pos_arg\n                elif pos_arg < 20:\n                    return kw_arg\n                elif kw_arg == \"raise_fae\":\n                    raise luigi.target.FileAlreadyExists(\"oh noes!\")\n                else:\n                    raise Exception()\n\n        class SecondClient:\n            def exists(self, pos_arg, other_kw_arg=\"second\", kw_arg=\"for-backwards-compatibility\"):\n                if pos_arg < 30:\n                    return -pos_arg\n                elif pos_arg < 40:\n                    return other_kw_arg\n                else:\n                    raise Exception()\n\n        self.clients = [FirstClient(), SecondClient()]\n        self.client = CascadingClient(self.clients)\n\n    def test_successes(self):\n        self.assertEqual(5, self.client.exists(5))\n        self.assertEqual(\"yay\", self.client.exists(15, kw_arg=\"yay\"))\n\n    def test_fallbacking(self):\n        self.assertEqual(-25, self.client.exists(25))\n        
self.assertEqual(\"lol\", self.client.exists(35, kw_arg=\"yay\", other_kw_arg=\"lol\"))\n        # Note: the first method doesn't accept the other keyword argument\n        self.assertEqual(-15, self.client.exists(15, kw_arg=\"yay\", other_kw_arg=\"lol\"))\n\n    def test_failings(self):\n        self.assertRaises(Exception, lambda: self.client.exists(45))\n        self.assertRaises(AttributeError, lambda: self.client.mkdir())\n\n    def test_FileAlreadyExists_propagation(self):\n        self.assertRaises(luigi.target.FileAlreadyExists, lambda: self.client.exists(25, kw_arg=\"raise_fae\"))\n\n    def test_method_names_kwarg(self):\n        self.client = CascadingClient(self.clients, method_names=[])\n        self.assertRaises(AttributeError, lambda: self.client.exists())\n        self.client = CascadingClient(self.clients, method_names=[\"exists\"])\n        self.assertEqual(5, self.client.exists(5))\n"
  },
  {
    "path": "test/contrib/datadog_metric_test.py",
    "content": "# -*- coding: utf-8 -*-\n\nimport time\n\nimport mock\nfrom helpers import unittest\n\nfrom luigi.contrib.datadog_metric import DatadogMetricsCollector\nfrom luigi.metrics import MetricsCollectors\nfrom luigi.scheduler import Scheduler\n\nWORKER = \"myworker\"\n\n\nclass DatadogMetricTest(unittest.TestCase):\n    def setUp(self):\n        self.mockDatadog()\n        self.time = time.time\n        self.collector = DatadogMetricsCollector()\n        self.s = Scheduler(metrics_collector=MetricsCollectors.datadog)\n\n    def tearDown(self):\n        self.unMockDatadog()\n\n        if time.time != self.time:\n            time.time = self.time\n\n    def startTask(self, scheduler=None):\n        if scheduler:\n            s = scheduler\n        else:\n            s = self.s\n\n        s.add_task(worker=WORKER, task_id=\"DDTaskID\", family=\"DDTaskName\")\n        task = s._state.get_task(\"DDTaskID\")\n\n        task.time_running = 0\n        return task\n\n    def mockDatadog(self):\n        self.create_patcher = mock.patch(\"datadog.api.Event.create\")\n        self.mock_create = self.create_patcher.start()\n\n        self.increment_patcher = mock.patch(\"datadog.statsd.increment\")\n        self.mock_increment = self.increment_patcher.start()\n\n        self.gauge_patcher = mock.patch(\"datadog.statsd.gauge\")\n        self.mock_gauge = self.gauge_patcher.start()\n\n    def unMockDatadog(self):\n        self.create_patcher.stop()\n        self.increment_patcher.stop()\n        self.gauge_patcher.stop()\n\n    def setTime(self, t):\n        time.time = lambda: t\n\n    def test_send_event_on_task_started(self):\n        task = self.startTask()\n        self.collector.handle_task_started(task)\n\n        self.mock_create.assert_called_once_with(\n            alert_type=\"info\",\n            priority=\"low\",\n            tags=[\"task_name:DDTaskName\", \"task_state:STARTED\", \"environment:development\", \"application:luigi\"],\n            text=\"A 
task has been started in the pipeline named: DDTaskName\",\n            title=\"Luigi: A task has been started!\",\n        )\n\n    def test_send_increment_on_task_started(self):\n        task = self.startTask()\n        self.collector.handle_task_started(task)\n\n        self.mock_increment.assert_called_once_with(\"luigi.task.started\", 1, tags=[\"task_name:DDTaskName\", \"environment:development\", \"application:luigi\"])\n\n    def test_send_event_on_task_failed(self):\n        task = self.startTask()\n        self.collector.handle_task_failed(task)\n\n        self.mock_create.assert_called_once_with(\n            alert_type=\"error\",\n            priority=\"normal\",\n            tags=[\"task_name:DDTaskName\", \"task_state:FAILED\", \"environment:development\", \"application:luigi\"],\n            text=\"A task has failed in the pipeline named: DDTaskName\",\n            title=\"Luigi: A task has failed!\",\n        )\n\n    def test_send_increment_on_task_failed(self):\n        task = self.startTask()\n        self.collector.handle_task_failed(task)\n\n        self.mock_increment.assert_called_once_with(\"luigi.task.failed\", 1, tags=[\"task_name:DDTaskName\", \"environment:development\", \"application:luigi\"])\n\n    def test_send_event_on_task_disabled(self):\n        s = Scheduler(metrics_collector=MetricsCollectors.datadog, disable_persist=10, retry_count=2, disable_window=2)\n        task = self.startTask(scheduler=s)\n        self.collector.handle_task_disabled(task, s._config)\n\n        self.mock_create.assert_called_once_with(\n            alert_type=\"error\",\n            priority=\"normal\",\n            tags=[\"task_name:DDTaskName\", \"task_state:DISABLED\", \"environment:development\", \"application:luigi\"],\n            text=\"A task has been disabled in the pipeline named: DDTaskName. 
\"\n            + \"The task has failed 2 times in the last 2 seconds\"\n            + \", so it is being disabled for 10 seconds.\",\n            title=\"Luigi: A task has been disabled!\",\n        )\n\n    def test_send_increment_on_task_disabled(self):\n        task = self.startTask()\n        self.collector.handle_task_disabled(task, self.s._config)\n\n        self.mock_increment.assert_called_once_with(\"luigi.task.disabled\", 1, tags=[\"task_name:DDTaskName\", \"environment:development\", \"application:luigi\"])\n\n    def test_send_event_on_task_done(self):\n        task = self.startTask()\n        self.collector.handle_task_done(task)\n\n        self.mock_create.assert_called_once_with(\n            alert_type=\"info\",\n            priority=\"low\",\n            tags=[\"task_name:DDTaskName\", \"task_state:DONE\", \"environment:development\", \"application:luigi\"],\n            text=\"A task has completed in the pipeline named: DDTaskName\",\n            title=\"Luigi: A task has been completed!\",\n        )\n\n    def test_send_increment_on_task_done(self):\n        task = self.startTask()\n        self.collector.handle_task_done(task)\n\n        self.mock_increment.assert_called_once_with(\"luigi.task.done\", 1, tags=[\"task_name:DDTaskName\", \"environment:development\", \"application:luigi\"])\n\n    def test_send_gauge_on_task_done(self):\n        self.setTime(0)\n        task = self.startTask()\n        self.collector.handle_task_done(task)\n\n        self.mock_gauge.assert_called_once_with(\"luigi.task.execution_time\", 0, tags=[\"task_name:DDTaskName\", \"environment:development\", \"application:luigi\"])\n"
  },
  {
    "path": "test/contrib/dataproc_test.py",
    "content": "\"\"\"This is an integration test for the Dataproc-luigi binding.\n\nThis test requires credentials that can access GCS & access to a bucket below.\nFollow the directions in the gcloud tools to set up local credentials.\n\"\"\"\n\nimport unittest\n\ntry:\n    import google.auth\n    from googleapiclient import discovery\n\n    from luigi.contrib import dataproc\n\n    default_credentials, _ = google.auth.default()\n    default_client = discovery.build(\"dataproc\", \"v1\", cache_discovery=False, credentials=default_credentials)\n    dataproc.set_dataproc_client(default_client)\nexcept ImportError:\n    raise unittest.SkipTest(\"Unable to load google cloud dependencies\")\n\nimport os\nimport time\n\nimport pytest\n\nimport luigi\n\n# In order to run this test, you should set these to your GCS project.\n# Unfortunately there's no mock\nPROJECT_ID = os.environ.get(\"DATAPROC_TEST_PROJECT_ID\", \"your_project_id_here\")\nCLUSTER_NAME = os.environ.get(\"DATAPROC_TEST_CLUSTER\", \"unit-test-cluster\")\nREGION = os.environ.get(\"DATAPROC_REGION\", \"global\")\nIMAGE_VERSION = \"1-0\"\n\n\nclass _DataprocBaseTestCase(unittest.TestCase):\n    def setUp(self):\n        pass\n\n    def tearDown(self):\n        pass\n\n\n@pytest.mark.gcloud\nclass DataprocTaskTest(_DataprocBaseTestCase):\n    def test_1_create_cluster(self):\n        success = luigi.run(\n            [\"--local-scheduler\", \"--no-lock\", \"CreateDataprocClusterTask\", \"--gcloud-project-id=\" + PROJECT_ID, \"--dataproc-cluster-name=\" + CLUSTER_NAME]\n        )\n        self.assertTrue(success)\n\n    def test_2_create_cluster_should_notice_existing_cluster_and_return_immediately(self):\n        job_start = time.time()\n        success = luigi.run(\n            [\"--local-scheduler\", \"--no-lock\", \"CreateDataprocClusterTask\", \"--gcloud-project-id=\" + PROJECT_ID, \"--dataproc-cluster-name=\" + CLUSTER_NAME]\n        )\n        self.assertTrue(success)\n        self.assertLess(time.time() 
- job_start, 3)\n\n    def test_3_submit_minimal_job(self):\n        # The job itself will fail because the job files don't exist\n        # We don't care, because then we would be testing spark\n        # We care the job was submitted correctly, so that's what we test\n\n        luigi.run(\n            [\n                \"--local-scheduler\",\n                \"--no-lock\",\n                \"DataprocSparkTask\",\n                \"--gcloud-project-id=\" + PROJECT_ID,\n                \"--dataproc-cluster-name=\" + CLUSTER_NAME,\n                \"--main-class=my.MinimalMainClass\",\n            ]\n        )\n\n        response = dataproc.get_dataproc_client().projects().regions().jobs().list(projectId=PROJECT_ID, region=REGION, clusterName=CLUSTER_NAME).execute()\n        lastJob = response[\"jobs\"][0][\"sparkJob\"]\n\n        self.assertEqual(lastJob[\"mainClass\"], \"my.MinimalMainClass\")\n\n    def test_4_submit_spark_job(self):\n        # The job itself will fail because the job files don't exist\n        # We don't care, because then we would be testing spark\n        # We care the job was submitted correctly, so that's what we test\n\n        luigi.run(\n            [\n                \"--local-scheduler\",\n                \"--no-lock\",\n                \"DataprocSparkTask\",\n                \"--gcloud-project-id=\" + PROJECT_ID,\n                \"--dataproc-cluster-name=\" + CLUSTER_NAME,\n                \"--main-class=my.MainClass\",\n                \"--jars=one.jar,two.jar\",\n                \"--job-args=foo,bar\",\n            ]\n        )\n\n        response = dataproc.get_dataproc_client().projects().regions().jobs().list(projectId=PROJECT_ID, region=REGION, clusterName=CLUSTER_NAME).execute()\n        lastJob = response[\"jobs\"][0][\"sparkJob\"]\n\n        self.assertEqual(lastJob[\"mainClass\"], \"my.MainClass\")\n        self.assertEqual(lastJob[\"jarFileUris\"], [\"one.jar\", \"two.jar\"])\n        self.assertEqual(lastJob[\"args\"], 
[\"foo\", \"bar\"])\n\n    def test_5_submit_pyspark_job(self):\n        # The job itself will fail because the job files don't exist\n        # We don't care, because then we would be testing pyspark\n        # We care the job was submitted correctly, so that's what we test\n\n        luigi.run(\n            [\n                \"--local-scheduler\",\n                \"--no-lock\",\n                \"DataprocPysparkTask\",\n                \"--gcloud-project-id=\" + PROJECT_ID,\n                \"--dataproc-cluster-name=\" + CLUSTER_NAME,\n                \"--job-file=main_job.py\",\n                \"--extra-files=extra1.py,extra2.py\",\n                \"--job-args=foo,bar\",\n            ]\n        )\n\n        response = dataproc.get_dataproc_client().projects().regions().jobs().list(projectId=PROJECT_ID, region=REGION, clusterName=CLUSTER_NAME).execute()\n        lastJob = response[\"jobs\"][0][\"pysparkJob\"]\n\n        self.assertEqual(lastJob[\"mainPythonFileUri\"], \"main_job.py\")\n        self.assertEqual(lastJob[\"pythonFileUris\"], [\"extra1.py\", \"extra2.py\"])\n        self.assertEqual(lastJob[\"args\"], [\"foo\", \"bar\"])\n\n    def test_6_delete_cluster(self):\n        success = luigi.run(\n            [\"--local-scheduler\", \"--no-lock\", \"DeleteDataprocClusterTask\", \"--gcloud-project-id=\" + PROJECT_ID, \"--dataproc-cluster-name=\" + CLUSTER_NAME]\n        )\n        self.assertTrue(success)\n\n    def test_7_delete_cluster_should_return_immediately_if_no_cluster(self):\n        job_start = time.time()\n        success = luigi.run(\n            [\"--local-scheduler\", \"--no-lock\", \"DeleteDataprocClusterTask\", \"--gcloud-project-id=\" + PROJECT_ID, \"--dataproc-cluster-name=\" + CLUSTER_NAME]\n        )\n        self.assertTrue(success)\n        self.assertLess(time.time() - job_start, 3)\n\n    def test_8_create_cluster_image_version(self):\n        success = luigi.run(\n            [\n                \"--local-scheduler\",\n            
    \"--no-lock\",\n                \"CreateDataprocClusterTask\",\n                \"--gcloud-project-id=\" + PROJECT_ID,\n                \"--dataproc-cluster-name=\" + CLUSTER_NAME + \"-\" + IMAGE_VERSION,\n                \"--image-version=1.0\",\n            ]\n        )\n        self.assertTrue(success)\n\n    def test_9_delete_cluster_image_version(self):\n        success = luigi.run(\n            [\n                \"--local-scheduler\",\n                \"--no-lock\",\n                \"DeleteDataprocClusterTask\",\n                \"--gcloud-project-id=\" + PROJECT_ID,\n                \"--dataproc-cluster-name=\" + CLUSTER_NAME + \"-\" + IMAGE_VERSION,\n            ]\n        )\n        self.assertTrue(success)\n"
  },
  {
    "path": "test/contrib/docker_runner_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2017 Open Targets\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\n\"\"\"\nTests for Docker container wrapper for Luigi.\n\n\nRequires:\n\n- docker: ``pip install docker``\n\nWritten and maintained by Andrea Pierleoni (@apierleoni).\nContributions by Eliseo Papa (@elipapa)\n\"\"\"\n\nimport logging\nimport tempfile\nfrom tempfile import NamedTemporaryFile\n\nimport pytest\nfrom helpers import unittest\n\nimport luigi\nfrom luigi.contrib.docker_runner import DockerTask\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    import docker\n    from docker.errors import ContainerError, ImageNotFound\n\n    client = docker.from_env()\n    client.version()\nexcept ImportError:\n    raise unittest.SkipTest(\"Unable to load docker module\")\nexcept Exception:\n    raise unittest.SkipTest(\"Unable to connect to docker daemon\")\n\ntempfile.tempdir = \"/tmp\"  # set it explicitly to make it work out of the box in mac os\nlocal_file = NamedTemporaryFile()\nlocal_file.write(b\"this is a test file\\n\")\nlocal_file.flush()\n\n\nclass SuccessJob(DockerTask):\n    image = \"busybox:latest\"\n    name = \"SuccessJob\"\n\n\nclass FailJobImageNotFound(DockerTask):\n    image = \"image-does-not-exists\"\n    name = \"FailJobImageNotFound\"\n\n\nclass FailJobContainer(DockerTask):\n    image = \"busybox\"\n    name = \"FailJobContainer\"\n    command = \"cat this-file-does-not-exist\"\n\n\nclass 
WriteToTmpDir(DockerTask):\n    image = \"busybox\"\n    name = \"WriteToTmpDir\"\n    container_tmp_dir = \"/tmp/luigi-test\"\n    command = \"test -d  /tmp/luigi-test\"\n    # command = 'test -d $LUIGI_TMP_DIR'# && echo ok >$LUIGI_TMP_DIR/test'\n\n\nclass MountLocalFileAsVolume(DockerTask):\n    image = \"busybox\"\n    name = \"MountLocalFileAsVolume\"\n    # volumes= {'/tmp/local_file_test': {'bind': local_file.name, 'mode': 'rw'}}\n    binds = [local_file.name + \":/tmp/local_file_test\"]\n    command = \"test -f /tmp/local_file_test\"\n\n\nclass MountLocalFileAsVolumeWithParam(DockerTask):\n    dummyopt = luigi.Parameter()\n    image = \"busybox\"\n    name = \"MountLocalFileAsVolumeWithParam\"\n    binds = [local_file.name + \":/tmp/local_file_test\"]\n    command = \"test -f /tmp/local_file_test\"\n\n\nclass MountLocalFileAsVolumeWithParamRedefProperties(DockerTask):\n    dummyopt = luigi.Parameter()\n    image = \"busybox\"\n    name = \"MountLocalFileAsVolumeWithParamRedef\"\n\n    @property\n    def binds(self):\n        return [local_file.name + \":/tmp/local_file_test\" + self.dummyopt]\n\n    @property\n    def command(self):\n        return \"test -f /tmp/local_file_test\" + self.dummyopt\n\n    def complete(self):\n        return True\n\n\nclass MultipleDockerTask(luigi.WrapperTask):\n    \"\"\"because the volumes property is defined as a list, spinning multiple\n    containers led to conflict in the volume binds definition, with multiple\n    host directories pointing to the same container directory\"\"\"\n\n    def requires(self):\n        return [MountLocalFileAsVolumeWithParam(dummyopt=opt) for opt in [\"one\", \"two\", \"three\"]]\n\n\nclass MultipleDockerTaskRedefProperties(luigi.WrapperTask):\n    def requires(self):\n        return [MountLocalFileAsVolumeWithParamRedefProperties(dummyopt=opt) for opt in [\"one\", \"two\", \"three\"]]\n\n\n@pytest.mark.contrib\nclass TestDockerTask(unittest.TestCase):\n    # def tearDown(self):\n    #     
local_file.close()\n\n    def test_success_job(self):\n        success = SuccessJob()\n        luigi.build([success], local_scheduler=True)\n        self.assertTrue(success)\n\n    def test_temp_dir_creation(self):\n        writedir = WriteToTmpDir()\n        writedir.run()\n\n    def test_local_file_mount(self):\n        localfile = MountLocalFileAsVolume()\n        localfile.run()\n\n    def test_fail_job_image_not_found(self):\n        fail = FailJobImageNotFound()\n        self.assertRaises(ImageNotFound, fail.run)\n\n    def test_fail_job_container(self):\n        fail = FailJobContainer()\n        self.assertRaises(ContainerError, fail.run)\n\n    def test_multiple_jobs(self):\n        worked = MultipleDockerTask()\n        luigi.build([worked], local_scheduler=True)\n        self.assertTrue(worked)\n\n    def test_multiple_jobs2(self):\n        worked = MultipleDockerTaskRedefProperties()\n        luigi.build([worked], local_scheduler=True)\n        self.assertTrue(worked)\n"
  },
  {
    "path": "test/contrib/dropbox_test.py",
    "content": "import bz2\nimport os\nimport tempfile\nimport unittest\nimport uuid\nfrom datetime import datetime\n\nimport pytest\n\nimport luigi\nfrom luigi.format import NopFormat\n\ntry:\n    import dropbox\n    import dropbox.exceptions\n\n    from luigi.contrib.dropbox import DropboxClient\nexcept ImportError:\n    raise unittest.SkipTest(\"DropboxTarget and DropboxClient will not be tested. Dropbox library is not installed\")\n\nDROPBOX_APP_TOKEN = os.environ.get(\"DROPBOX_APP_TOKEN\")\n\nif not DROPBOX_APP_TOKEN:\n    raise ValueError(\n        \"In order to test DropboxTarget and DropboxClient, the DROPBOX_APP_TOKEN environment variable \"\n        \"must contain a valid Dropbox OAuth2 Token. \\n\"\n        \"Get one at https://www.dropbox.com/developers/apps \"\n    )\n\nDROPBOX_TEST_PATH = \"/luigi-tests/luigi-test-\" + datetime.now().strftime(\"%Y.%m.%d-%H.%M.%S\") + str(uuid.uuid4())\n\n# These paths will be created in the test set-up\nDROPBOX_TEST_SIMPLE_DIR = DROPBOX_TEST_PATH + \"/dir2\"\nDROPBOX_TEST_FILE_IN_DIR = DROPBOX_TEST_SIMPLE_DIR + \"/test2.txt\"\nDROPBOX_TEST_SIMPLE_FILE = DROPBOX_TEST_PATH + \"/test.txt\"\nDROPBOX_TEST_DIR_TO_DELETE = DROPBOX_TEST_PATH + \"/dir_to_delete\"\nDROPBOX_TEST_FILE_TO_DELETE_2 = DROPBOX_TEST_DIR_TO_DELETE + \"/test3.2.txt\"\nDROPBOX_TEST_FILE_TO_DELETE_1 = DROPBOX_TEST_DIR_TO_DELETE + \"/test3.1.txt\"\nDROPBOX_TEST_FILE_TO_COPY_ORIG = DROPBOX_TEST_PATH + \"/dir4/test4.txt\"\nDROPBOX_TEST_FILE_TO_MOVE_ORIG = DROPBOX_TEST_PATH + \"/dir3/test3.txt\"\n\n# All the following paths will be used by the tests\nDROPBOX_TEST_SMALL_FILE = DROPBOX_TEST_PATH + \"/dir/small.txt\"\nDROPBOX_TEST_LARGE_FILE = DROPBOX_TEST_PATH + \"/dir/big.bin\"\n\nDROPBOX_TEST_FILE_TO_COPY_DEST = DROPBOX_TEST_PATH + \"/dir_four/test_four.txt\"\n\nDROPBOX_TEST_FILE_TO_MOVE_DEST = DROPBOX_TEST_PATH + \"/dir_three/test_three.txt\"\nDROPBOX_TEST_OUTER_DIR_TO_CREATE = DROPBOX_TEST_PATH + \"/new_folder\"\nDROPBOX_TEST_DIR_TO_CREATE = 
DROPBOX_TEST_OUTER_DIR_TO_CREATE + \"/inner_folder\"\n\nDROPBOX_TEST_SIMPLE_DIR_TO_CREATE_AND_DELETE = DROPBOX_TEST_PATH + \"/another_new_folder\"\n\nDROPBOX_TEST_FILE_TO_UPLOAD_BZIP2 = DROPBOX_TEST_PATH + \"/bin.file\"\nDROPBOX_TEST_FILE_TO_UPLOAD_TEXT = DROPBOX_TEST_PATH + \"/text.txt\"\nDROPBOX_TEST_FILE_TO_UPLOAD_BIN = DROPBOX_TEST_PATH + \"/file.bin\"\nDROPBOX_TEST_FILE_TO_UPLOAD_LARGE = DROPBOX_TEST_PATH + \"/file.blob\"\n\nDROPBOX_TEST_NON_EXISTING_FILE = DROPBOX_TEST_SIMPLE_DIR + \"ajdlkajfal\"\n\n\n@pytest.mark.dropbox\nclass TestClientDropbox(unittest.TestCase):\n    def setUp(self):\n        self.luigiconn = DropboxClient(DROPBOX_APP_TOKEN)\n\n        self.dropbox_api = dropbox.dropbox_client.Dropbox(DROPBOX_APP_TOKEN)\n        self.dropbox_api.files_upload(b\"hello\", DROPBOX_TEST_SIMPLE_FILE)\n        self.dropbox_api.files_upload(b\"hello2\", DROPBOX_TEST_FILE_IN_DIR)\n        self.dropbox_api.files_upload(b\"hello3\", DROPBOX_TEST_FILE_TO_MOVE_ORIG)\n        self.dropbox_api.files_upload(b\"hello4\", DROPBOX_TEST_FILE_TO_COPY_ORIG)\n        self.dropbox_api.files_upload(b\"hello3.1\", DROPBOX_TEST_FILE_TO_DELETE_1)\n        self.dropbox_api.files_upload(b\"hello3.2\", DROPBOX_TEST_FILE_TO_DELETE_2)\n\n    def tearDown(self):\n        self.dropbox_api.files_delete_v2(DROPBOX_TEST_PATH)\n        self.dropbox_api._session.close()\n\n    def test_exists(self):\n        self.assertTrue(self.luigiconn.exists(\"/\"))\n        self.assertTrue(self.luigiconn.exists(DROPBOX_TEST_PATH))\n        self.assertTrue(self.luigiconn.exists(DROPBOX_TEST_SIMPLE_DIR))\n        self.assertTrue(self.luigiconn.exists(DROPBOX_TEST_SIMPLE_DIR + \"/\"))\n        self.assertTrue(self.luigiconn.exists(DROPBOX_TEST_SIMPLE_FILE))\n\n        self.assertFalse(self.luigiconn.exists(DROPBOX_TEST_SIMPLE_FILE + \"/\"))\n        self.assertFalse(self.luigiconn.exists(DROPBOX_TEST_NON_EXISTING_FILE))\n\n    def test_listdir_simple(self):\n        list_of_dirs = 
self.luigiconn.listdir(DROPBOX_TEST_PATH)\n        self.assertTrue(\"/\" not in list_of_dirs)\n        self.assertTrue(DROPBOX_TEST_PATH in list_of_dirs)\n        self.assertTrue(DROPBOX_TEST_SIMPLE_FILE in list_of_dirs)  # we verify recursivity\n\n    def test_listdir_simple_with_one_slash(self):\n        list_of_dirs = self.luigiconn.listdir(DROPBOX_TEST_PATH + \"/\")\n        self.assertTrue(\"/\" not in list_of_dirs)\n        self.assertTrue(DROPBOX_TEST_PATH in list_of_dirs)\n        self.assertTrue(DROPBOX_TEST_SIMPLE_FILE in list_of_dirs)  # we verify recursivity\n\n    def test_listdir_multiple(self):\n        list_of_dirs = self.luigiconn.listdir(DROPBOX_TEST_PATH, limit=2)\n        self.assertTrue(\"/\" not in list_of_dirs)\n        self.assertTrue(DROPBOX_TEST_PATH in list_of_dirs)\n        self.assertTrue(DROPBOX_TEST_SIMPLE_FILE in list_of_dirs)  # we verify recursivity\n\n    def test_listdir_nonexisting(self):\n        with self.assertRaises(dropbox.exceptions.ApiError):\n            self.luigiconn.listdir(DROPBOX_TEST_NON_EXISTING_FILE)\n\n    def test_remove(self):\n        # We remove File_to_delete_1. 
We make sure it is the only file that gets deleted\n        self.assertTrue(self.luigiconn.exists(DROPBOX_TEST_FILE_TO_DELETE_1))\n        self.assertTrue(self.luigiconn.exists(DROPBOX_TEST_FILE_TO_DELETE_2))\n        self.assertTrue(self.luigiconn.remove(DROPBOX_TEST_FILE_TO_DELETE_1))\n        self.assertFalse(self.luigiconn.exists(DROPBOX_TEST_FILE_TO_DELETE_1))\n        self.assertTrue(self.luigiconn.exists(DROPBOX_TEST_FILE_TO_DELETE_2))\n\n        # We remove a directory, we make sure that the files that were in the directory are also deleted\n        self.luigiconn.remove(DROPBOX_TEST_DIR_TO_DELETE)\n        self.assertFalse(self.luigiconn.exists(DROPBOX_TEST_FILE_TO_DELETE_2))\n\n        # We make sure that we return False when we fail to remove a non-existing path\n        self.assertFalse(self.luigiconn.remove(DROPBOX_TEST_NON_EXISTING_FILE))\n        self.assertFalse(self.luigiconn.remove(DROPBOX_TEST_NON_EXISTING_FILE + \"/\"))\n\n    def test_mkdir_new_dir(self):\n        self.assertFalse(self.luigiconn.exists(DROPBOX_TEST_DIR_TO_CREATE))\n        self.assertFalse(self.luigiconn.exists(DROPBOX_TEST_OUTER_DIR_TO_CREATE))\n        self.luigiconn.mkdir(DROPBOX_TEST_DIR_TO_CREATE)\n        self.assertTrue(self.luigiconn.isdir(DROPBOX_TEST_OUTER_DIR_TO_CREATE))\n        self.assertTrue(self.luigiconn.isdir(DROPBOX_TEST_DIR_TO_CREATE))\n        self.assertTrue(self.luigiconn.isdir(DROPBOX_TEST_DIR_TO_CREATE))\n\n    def aux_lifecycle_of_directory(self, path):\n        # Initially, the directory does not exists\n        self.assertFalse(self.luigiconn.exists(path))\n        self.assertFalse(self.luigiconn.isdir(path))\n\n        # Now we create the directory and verify that it exists\n        self.luigiconn.mkdir(path)\n        self.assertTrue(self.luigiconn.exists(path))\n        self.assertTrue(self.luigiconn.isdir(path))\n\n        # Now we remote the directory and verify that it no longer exists\n        self.luigiconn.remove(path)\n        
self.assertFalse(self.luigiconn.exists(path))\n        self.assertFalse(self.luigiconn.isdir(path))\n\n    def test_lifecycle_of_dirpath(self):\n        self.aux_lifecycle_of_directory(DROPBOX_TEST_SIMPLE_DIR_TO_CREATE_AND_DELETE)\n\n    def test_lifecycle_of_dirpath_with_trailing_slash(self):\n        self.aux_lifecycle_of_directory(DROPBOX_TEST_SIMPLE_DIR_TO_CREATE_AND_DELETE + \"/\")\n\n    def test_lifecycle_of_dirpath_with_several_trailing_mixed(self):\n        self.luigiconn.mkdir(DROPBOX_TEST_SIMPLE_DIR_TO_CREATE_AND_DELETE + \"/\")\n        self.assertTrue(self.luigiconn.exists(DROPBOX_TEST_SIMPLE_DIR_TO_CREATE_AND_DELETE))\n        self.luigiconn.remove(DROPBOX_TEST_SIMPLE_DIR_TO_CREATE_AND_DELETE)\n        self.assertFalse(self.luigiconn.exists(DROPBOX_TEST_SIMPLE_DIR_TO_CREATE_AND_DELETE + \"/\"))\n\n    def test_lifecycle_of_dirpath_with_several_trailing_mixed_2(self):\n        self.luigiconn.mkdir(DROPBOX_TEST_SIMPLE_DIR_TO_CREATE_AND_DELETE)\n        self.assertTrue(self.luigiconn.exists(DROPBOX_TEST_SIMPLE_DIR_TO_CREATE_AND_DELETE + \"/\"))\n        self.luigiconn.remove(DROPBOX_TEST_SIMPLE_DIR_TO_CREATE_AND_DELETE + \"/\")\n        self.assertFalse(self.luigiconn.exists(DROPBOX_TEST_SIMPLE_DIR_TO_CREATE_AND_DELETE))\n\n    def test_mkdir_new_dir_two_slashes(self):\n        with self.assertRaises(dropbox.dropbox_client.ApiError):\n            self.luigiconn.mkdir(DROPBOX_TEST_SIMPLE_DIR_TO_CREATE_AND_DELETE + \"//\")\n\n    def test_mkdir_recreate_dir(self):\n        try:\n            self.luigiconn.mkdir(DROPBOX_TEST_SIMPLE_DIR)\n        except Exception as ex:\n            self.fail(\"mkdir with default options raises Exception:\" + str(ex))\n\n        try:\n            self.luigiconn.mkdir(DROPBOX_TEST_SIMPLE_DIR, raise_if_exists=False)\n        except Exception as ex:\n            self.fail(\"mkdir with 'raise_if_exists=False' raises Exception:\" + str(ex))\n\n        with self.assertRaises(luigi.target.FileAlreadyExists):\n            
self.luigiconn.mkdir(DROPBOX_TEST_SIMPLE_DIR, raise_if_exists=True)\n\n    def test_mkdir_recreate_slashed_dir(self):\n        try:\n            self.luigiconn.mkdir(DROPBOX_TEST_SIMPLE_DIR + \"/\")\n        except Exception as ex:\n            self.fail(\"mkdir with default options raises Exception:\" + str(ex))\n\n        try:\n            self.luigiconn.mkdir(DROPBOX_TEST_SIMPLE_DIR + \"/\", raise_if_exists=False)\n        except Exception as ex:\n            self.fail(\"mkdir with 'raise_if_exists=False' raises Exception:\" + str(ex))\n\n        with self.assertRaises(luigi.target.FileAlreadyExists):\n            self.luigiconn.mkdir(DROPBOX_TEST_SIMPLE_DIR + \"/\", raise_if_exists=True)\n\n    def test_mkdir_recreate_file(self):\n        with self.assertRaises(luigi.target.NotADirectory):\n            self.luigiconn.mkdir(DROPBOX_TEST_SIMPLE_FILE)\n\n        with self.assertRaises(luigi.target.NotADirectory):\n            self.luigiconn.mkdir(DROPBOX_TEST_SIMPLE_FILE, raise_if_exists=True)\n\n        with self.assertRaises(luigi.target.NotADirectory):\n            self.luigiconn.mkdir(DROPBOX_TEST_SIMPLE_FILE, raise_if_exists=False)\n\n    def test_isdir(self):\n        self.assertTrue(self.luigiconn.isdir(\"/\"))\n        self.assertTrue(self.luigiconn.isdir(DROPBOX_TEST_PATH))\n        self.assertTrue(self.luigiconn.isdir(DROPBOX_TEST_SIMPLE_DIR))\n        self.assertTrue(self.luigiconn.isdir(DROPBOX_TEST_SIMPLE_DIR + \"/\"))\n\n        self.assertFalse(self.luigiconn.isdir(DROPBOX_TEST_SIMPLE_FILE))\n        self.assertFalse(self.luigiconn.isdir(DROPBOX_TEST_NON_EXISTING_FILE))\n        self.assertFalse(self.luigiconn.isdir(DROPBOX_TEST_NON_EXISTING_FILE + \"/\"))\n\n    def test_move(self):\n        md, res = self.dropbox_api.files_download(DROPBOX_TEST_FILE_TO_MOVE_ORIG)\n        initial_contents = res.content\n\n        self.luigiconn.move(DROPBOX_TEST_FILE_TO_MOVE_ORIG, DROPBOX_TEST_FILE_TO_MOVE_DEST)\n\n        md, res = 
self.dropbox_api.files_download(DROPBOX_TEST_FILE_TO_MOVE_DEST)\n        after_moving_contents = res.content\n\n        self.assertEqual(initial_contents, after_moving_contents)\n        self.assertFalse(self.luigiconn.exists(DROPBOX_TEST_FILE_TO_MOVE_ORIG))\n        self.assertTrue(self.luigiconn.exists(DROPBOX_TEST_FILE_TO_MOVE_DEST))\n\n    def test_copy(self):\n        md, res = self.dropbox_api.files_download(DROPBOX_TEST_FILE_TO_COPY_ORIG)\n        initial_contents = res.content\n\n        self.luigiconn.copy(DROPBOX_TEST_FILE_TO_COPY_ORIG, DROPBOX_TEST_FILE_TO_COPY_DEST)\n\n        md, res = self.dropbox_api.files_download(DROPBOX_TEST_FILE_TO_COPY_DEST)\n        after_copyng_contents = res.content\n\n        self.assertEqual(initial_contents, after_copyng_contents)\n        self.assertTrue(self.luigiconn.exists(DROPBOX_TEST_FILE_TO_COPY_ORIG))\n        self.assertTrue(self.luigiconn.exists(DROPBOX_TEST_FILE_TO_COPY_DEST))\n\n\n@pytest.mark.dropbox\nclass TestDropboxTarget(unittest.TestCase):\n    def setUp(self):\n        self.luigiconn = DropboxClient(DROPBOX_APP_TOKEN)\n        self.dropbox_api = dropbox.dropbox_client.Dropbox(DROPBOX_APP_TOKEN)\n\n        self.initial_contents = b\"\\x00hello\\xff\\x00-\\xe2\\x82\\x28\"  # Binary invalid-utf8 sequence\n        self.dropbox_api.files_upload(self.initial_contents, DROPBOX_TEST_SIMPLE_FILE)\n\n    def tearDown(self):\n        self.dropbox_api.files_delete_v2(DROPBOX_TEST_PATH)\n        self.dropbox_api._session.close()\n\n    def test_download_from_dropboxtarget_to_local(self):\n        class Download(luigi.ExternalTask):\n            dbx_path = luigi.Parameter()\n\n            def output(self):\n                return luigi.contrib.dropbox.DropboxTarget(self.dbx_path, DROPBOX_APP_TOKEN, format=NopFormat())\n\n        class DbxToLocalTask(luigi.Task):\n            local_path = luigi.Parameter()\n            dbx_path = luigi.Parameter()\n\n            def requires(self):\n                return 
Download(dbx_path=self.dbx_path)\n\n            def output(self):\n                return luigi.LocalTarget(path=self.local_path, format=NopFormat())\n\n            def run(self):\n                with self.input().open(\"r\") as dbxfile, self.output().open(\"w\") as localfile:\n                    remote_contents = dbxfile.read()\n                    localfile.write(remote_contents * 3)\n\n        tmp_file = tempfile.mkdtemp() + os.sep + \"tmp.file\"\n        luigi.build([DbxToLocalTask(dbx_path=DROPBOX_TEST_SIMPLE_FILE, local_path=tmp_file)], local_scheduler=True)\n\n        expected_contents = self.initial_contents * 3\n        with open(tmp_file, \"rb\") as f:\n            actual_contents = f.read()\n\n        self.assertEqual(expected_contents, actual_contents)\n\n    def test_write_small_text_file_to_dropbox(self):\n        small_input_text = \"The greatest glory in living lies not in never falling\\nbut in rising every time we fall.\"\n\n        class WriteToDrobopxTest(luigi.Task):\n            def output(self):\n                return luigi.contrib.dropbox.DropboxTarget(DROPBOX_TEST_FILE_TO_UPLOAD_TEXT, DROPBOX_APP_TOKEN)\n\n            def run(self):\n                with self.output().open(\"w\") as dbxfile:\n                    dbxfile.write(small_input_text)\n\n        luigi.build([WriteToDrobopxTest()], local_scheduler=True)\n        actual_content = self.dropbox_api.files_download(DROPBOX_TEST_FILE_TO_UPLOAD_TEXT)[1].content\n        self.assertEqual(actual_content.decode(), small_input_text)\n\n    def aux_write_binary_file_to_dropbox(self, multiplier):\n        large_contents = b\"X\\n\\xe2\\x28\\xa1\" * multiplier\n        output_file = DROPBOX_TEST_FILE_TO_UPLOAD_LARGE + str(multiplier)\n\n        class WriteToDrobopxTest(luigi.Task):\n            def output(self):\n                return luigi.contrib.dropbox.DropboxTarget(output_file, DROPBOX_APP_TOKEN, format=luigi.format.Nop)\n\n            def run(self):\n                with 
self.output().open(\"w\") as dbxfile:\n                    dbxfile.write(large_contents)\n\n        luigi.build([WriteToDrobopxTest()], local_scheduler=True)\n        actual_content = self.dropbox_api.files_download(output_file)[1].content\n        self.assertEqual(actual_content, large_contents)\n\n    def test_write_small_binary_file_to_dropbox(self):\n        self.aux_write_binary_file_to_dropbox(1024)\n\n    def test_write_medium_binary_file_to_dropbox(self):\n        self.aux_write_binary_file_to_dropbox(1024 * 1024)\n\n    def test_write_large_binary_file_to_dropbox(self):\n        self.aux_write_binary_file_to_dropbox(3 * 1024 * 1024)\n\n    def test_write_using_nondefault_format(self):\n        contents = b\"X\\n\\xe2\\x28\\xa1\"\n\n        class WriteToDrobopxTest(luigi.Task):\n            def output(self):\n                return luigi.contrib.dropbox.DropboxTarget(DROPBOX_TEST_FILE_TO_UPLOAD_BZIP2, DROPBOX_APP_TOKEN, format=luigi.format.Bzip2)\n\n            def run(self):\n                with self.output().open(\"w\") as bzip2_dbxfile:\n                    bzip2_dbxfile.write(contents)\n\n        luigi.build([WriteToDrobopxTest()], local_scheduler=True)\n\n        remote_content = self.dropbox_api.files_download(DROPBOX_TEST_FILE_TO_UPLOAD_BZIP2)[1].content\n        self.assertEqual(contents, bz2.decompress(remote_content))\n\n    def test_write_using_a_temporary_path(self):\n        contents = b\"X\\n\\xe2\\x28\\xa1\"\n\n        class WriteToDrobopxTest(luigi.Task):\n            def output(self):\n                return luigi.contrib.dropbox.DropboxTarget(DROPBOX_TEST_FILE_TO_UPLOAD_BIN, DROPBOX_APP_TOKEN)\n\n            def run(self):\n                with self.output().temporary_path() as tmp_path:\n                    open(tmp_path, \"wb\").write(contents)\n\n        luigi.build([WriteToDrobopxTest()], local_scheduler=True)\n        actual_content = self.dropbox_api.files_download(DROPBOX_TEST_FILE_TO_UPLOAD_BIN)[1].content\n        
self.assertEqual(actual_content, contents)\n"
  },
  {
    "path": "test/contrib/ecs_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015 Outlier Bio, LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nIntegration test for the Luigi wrapper of EC2 Container Service (ECSTask)\n\nRequires:\n\n- boto3 package\n- Amazon AWS credentials discoverable by boto3 (e.g., by using ``aws configure``\nfrom awscli_)\n- A running ECS cluster (see `ECS Get Started`_)\n\nWritten and maintained by Jake Feala (@jfeala) for Outlier Bio (@outlierbio)\n\n.. _awscli: https://aws.amazon.com/cli\n.. _`ECS Get Started`: http://docs.aws.amazon.com/AmazonECS/latest/developerguide/ECS_GetStarted.html\n\"\"\"\n\nimport unittest\n\nimport pytest\nfrom moto import mock_ecs\n\nimport luigi\nfrom luigi.contrib.ecs import ECSTask, _get_task_statuses\n\ntry:\n    import boto3\nexcept ImportError:\n    raise unittest.SkipTest(\"boto3 is not installed. 
ECSTasks require boto3\")\n\nTEST_TASK_DEF = {\n    \"family\": \"hello-world\",\n    \"volumes\": [],\n    \"containerDefinitions\": [\n        {\"memory\": 1, \"essential\": True, \"name\": \"hello-world\", \"image\": \"ubuntu\", \"command\": [\"/bin/echo\", \"hello world\"]},\n        {\"memory\": 1, \"essential\": True, \"name\": \"hello-world-2\", \"image\": \"ubuntu\", \"command\": [\"/bin/echo\", \"hello world #2!\"]},\n    ],\n}\n\n\nclass ECSTaskNoOutput(ECSTask):\n    def complete(self):\n        if self.ecs_task_ids:\n            return all([status == \"STOPPED\" for status in _get_task_statuses(self.ecs_task_ids)])\n        return False\n\n\nclass ECSTaskOverrideCommand(ECSTaskNoOutput):\n    @property\n    def command(self):\n        return [{\"name\": \"hello-world\", \"command\": [\"/bin/sleep\", \"10\"]}]\n\n\nclass ECSTaskCustomRunTaskKwargs(ECSTaskNoOutput):\n    @property\n    def run_task_kwargs(self):\n        return {\"overrides\": {\"ephemeralStorage\": {\"sizeInGiB\": 30}}}\n\n\nclass ECSTaskCustomRunTaskKwargsWithCollidingCommand(ECSTaskNoOutput):\n    @property\n    def command(self):\n        return [\n            {\"name\": \"hello-world\", \"command\": [\"/bin/sleep\", \"10\"]},\n            {\"name\": \"hello-world-2\", \"command\": [\"/bin/sleep\", \"10\"]},\n        ]\n\n    @property\n    def run_task_kwargs(self):\n        return {\n            \"launchType\": \"FARGATE\",\n            \"platformVersion\": \"1.4.0\",\n            \"networkConfiguration\": {\n                \"awsvpcConfiguration\": {\n                    \"subnets\": [\"subnet-01234567890abcdef\", \"subnet-abcdef01234567890\"],\n                    \"securityGroups\": [\n                        \"sg-abcdef01234567890\",\n                    ],\n                    \"assignPublicIp\": \"ENABLED\",\n                }\n            },\n            \"overrides\": {\"containerOverrides\": [{\"name\": \"hello-world-2\", \"command\": [\"command-to-be-overwritten\"]}], 
\"ephemeralStorage\": {\"sizeInGiB\": 30}},\n        }\n\n\nclass ECSTaskCustomRunTaskKwargsWithMergedCommands(ECSTaskNoOutput):\n    @property\n    def command(self):\n        return [{\"name\": \"hello-world\", \"command\": [\"/bin/sleep\", \"10\"]}]\n\n    @property\n    def run_task_kwargs(self):\n        return {\n            \"launchType\": \"FARGATE\",\n            \"platformVersion\": \"1.4.0\",\n            \"networkConfiguration\": {\n                \"awsvpcConfiguration\": {\n                    \"subnets\": [\"subnet-01234567890abcdef\", \"subnet-abcdef01234567890\"],\n                    \"securityGroups\": [\n                        \"sg-abcdef01234567890\",\n                    ],\n                    \"assignPublicIp\": \"ENABLED\",\n                }\n            },\n            \"overrides\": {\"containerOverrides\": [{\"name\": \"hello-world-2\", \"command\": [\"/bin/sleep\", \"10\"]}], \"ephemeralStorage\": {\"sizeInGiB\": 30}},\n        }\n\n\n@pytest.mark.aws\nclass TestECSTask(unittest.TestCase):\n    @mock_ecs\n    def setUp(self):\n        # Register the test task definition\n        response = boto3.client(\"ecs\").register_task_definition(**TEST_TASK_DEF)\n        self.arn = response[\"taskDefinition\"][\"taskDefinitionArn\"]\n\n    @mock_ecs\n    def test_unregistered_task(self):\n        t = ECSTaskNoOutput(task_def=TEST_TASK_DEF)\n        luigi.build([t], local_scheduler=True)\n\n    @mock_ecs\n    def test_registered_task(self):\n        t = ECSTaskNoOutput(task_def_arn=self.arn)\n        luigi.build([t], local_scheduler=True)\n\n    @mock_ecs\n    def test_override_command(self):\n        t = ECSTaskOverrideCommand(task_def_arn=self.arn)\n        luigi.build([t], local_scheduler=True)\n\n    @mock_ecs\n    def test_custom_run_task_kwargs(self):\n        t = ECSTaskCustomRunTaskKwargs(task_def_arn=self.arn)\n        self.assertEqual(t.combined_overrides, {\"ephemeralStorage\": {\"sizeInGiB\": 30}})\n        luigi.build([t], 
local_scheduler=True)\n\n    @mock_ecs\n    def test_custom_run_task_kwargs_with_colliding_command(self):\n        t = ECSTaskCustomRunTaskKwargsWithCollidingCommand(task_def_arn=self.arn)\n        combined_overrides = t.combined_overrides\n        self.assertEqual(\n            sorted(combined_overrides[\"containerOverrides\"], key=lambda x: x[\"name\"]),\n            sorted(\n                [\n                    {\"name\": \"hello-world\", \"command\": [\"/bin/sleep\", \"10\"]},\n                    {\"name\": \"hello-world-2\", \"command\": [\"/bin/sleep\", \"10\"]},\n                ],\n                key=lambda x: x[\"name\"],\n            ),\n        )\n        self.assertEqual(combined_overrides[\"ephemeralStorage\"], {\"sizeInGiB\": 30})\n        luigi.build([t], local_scheduler=True)\n\n    @mock_ecs\n    def test_custom_run_task_kwargs_with_merged_commands(self):\n        t = ECSTaskCustomRunTaskKwargsWithMergedCommands(task_def_arn=self.arn)\n        combined_overrides = t.combined_overrides\n        self.assertEqual(\n            sorted(combined_overrides[\"containerOverrides\"], key=lambda x: x[\"name\"]),\n            sorted(\n                [\n                    {\"name\": \"hello-world\", \"command\": [\"/bin/sleep\", \"10\"]},\n                    {\"name\": \"hello-world-2\", \"command\": [\"/bin/sleep\", \"10\"]},\n                ],\n                key=lambda x: x[\"name\"],\n            ),\n        )\n        self.assertEqual(combined_overrides[\"ephemeralStorage\"], {\"sizeInGiB\": 30})\n        luigi.build([t], local_scheduler=True)\n"
  },
  {
    "path": "test/contrib/esindex_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nTests for Elasticsearch index (esindex) target and indexing.\n\nAn Elasticsearch server must be running for these tests.\n\nTo use a non-standard host and port, use `ESINDEX_TEST_HOST`,\n`ESINDEX_TEST_PORT` environment variables to override defaults.\n\nTo test HTTP basic authentication `ESINDEX_TEST_HTTP_AUTH`.\n\nExample running tests against port 9201 with basic auth:\n\n    $ ESINDEX_TEST_PORT=9201 ESINDEX_TEST_HTTP_AUTH='admin:admin' nosetests test/_esindex_test.py\n\n\"\"\"\n\n# pylint: disable=C0103,E1101,F0401\nimport collections\nimport datetime\nimport os\n\nimport elasticsearch\nimport pytest\nfrom elasticsearch.connection import Urllib3HttpConnection\nfrom helpers import unittest\n\nimport luigi\nfrom luigi.contrib.esindex import CopyToIndex, ElasticsearchTarget\n\nHOST = os.getenv(\"ESINDEX_TEST_HOST\", \"localhost\")\nPORT = os.getenv(\"ESINDEX_TEST_PORT\", 9200)\nHTTP_AUTH = os.getenv(\"ESINDEX_TEST_HTTP_AUTH\", None)\nINDEX = \"esindex_luigi_test\"\nDOC_TYPE = \"esindex_test_type\"\nMARKER_INDEX = \"esindex_luigi_test_index_updates\"\nMARKER_DOC_TYPE = \"esindex_test_entry\"\n\n\ndef _create_test_index():\n    \"\"\"Create content index, if if does not exists.\"\"\"\n    es = elasticsearch.Elasticsearch(connection_class=Urllib3HttpConnection, host=HOST, port=PORT, http_auth=HTTP_AUTH)\n    if not 
es.indices.exists(INDEX):\n        es.indices.create(INDEX)\n\n\ntry:\n    _create_test_index()\nexcept Exception:\n    raise unittest.SkipTest(\"Unable to connect to ElasticSearch\")\n\n\n@pytest.mark.aws\nclass ElasticsearchTargetTest(unittest.TestCase):\n    \"\"\"Test touch and exists.\"\"\"\n\n    def test_touch_and_exists(self):\n        \"\"\"Basic test.\"\"\"\n        target = ElasticsearchTarget(HOST, PORT, INDEX, DOC_TYPE, \"update_id\", http_auth=HTTP_AUTH)\n        target.marker_index = MARKER_INDEX\n        target.marker_doc_type = MARKER_DOC_TYPE\n\n        delete()\n        self.assertFalse(target.exists(), \"Target should not exist before touching it\")\n        target.touch()\n        self.assertTrue(target.exists(), \"Target should exist after touching it\")\n        delete()\n\n\ndef delete():\n    \"\"\"Delete marker_index, if it exists.\"\"\"\n    es = elasticsearch.Elasticsearch(connection_class=Urllib3HttpConnection, host=HOST, port=PORT, http_auth=HTTP_AUTH)\n    if es.indices.exists(MARKER_INDEX):\n        es.indices.delete(MARKER_INDEX)\n    es.indices.refresh()\n\n\nclass CopyToTestIndex(CopyToIndex):\n    \"\"\"Override the default `marker_index` table with a test name.\"\"\"\n\n    host = HOST\n    port = PORT\n    http_auth = HTTP_AUTH\n    index = INDEX\n    doc_type = DOC_TYPE\n    marker_index_hist_size = 0\n\n    def output(self):\n        \"\"\"Use a test target with an own marker_index.\"\"\"\n        target = ElasticsearchTarget(\n            host=self.host,\n            port=self.port,\n            http_auth=self.http_auth,\n            index=self.index,\n            doc_type=self.doc_type,\n            update_id=self.update_id(),\n            marker_index_hist_size=self.marker_index_hist_size,\n        )\n        target.marker_index = MARKER_INDEX\n        target.marker_doc_type = MARKER_DOC_TYPE\n        return target\n\n\nclass IndexingTask1(CopyToTestIndex):\n    \"\"\"Test the redundant version, where `_index` and `_type` 
are\n    given in the `docs` as well. A more DRY example is `IndexingTask2`.\"\"\"\n\n    def docs(self):\n        \"\"\"Return a list with a single doc.\"\"\"\n        return [{\"_id\": 123, \"_index\": self.index, \"_type\": self.doc_type, \"name\": \"sample\", \"date\": \"today\"}]\n\n\nclass IndexingTask2(CopyToTestIndex):\n    \"\"\"Just another task.\"\"\"\n\n    def docs(self):\n        \"\"\"Return a list with a single doc.\"\"\"\n        return [{\"_id\": 234, \"_index\": self.index, \"_type\": self.doc_type, \"name\": \"another\", \"date\": \"today\"}]\n\n\nclass IndexingTask3(CopyToTestIndex):\n    \"\"\"This task will request an empty index to start with.\"\"\"\n\n    purge_existing_index = True\n\n    def docs(self):\n        \"\"\"Return a list with a single doc.\"\"\"\n        return [{\"_id\": 234, \"_index\": self.index, \"_type\": self.doc_type, \"name\": \"yet another\", \"date\": \"today\"}]\n\n\ndef _cleanup():\n    \"\"\"Delete both the test marker index and the content index.\"\"\"\n    es = elasticsearch.Elasticsearch(connection_class=Urllib3HttpConnection, host=HOST, port=PORT, http_auth=HTTP_AUTH)\n    if es.indices.exists(MARKER_INDEX):\n        es.indices.delete(MARKER_INDEX)\n    if es.indices.exists(INDEX):\n        es.indices.delete(INDEX)\n\n\n@pytest.mark.aws\nclass CopyToIndexTest(unittest.TestCase):\n    \"\"\"Test indexing tasks.\"\"\"\n\n    @classmethod\n    def setUpClass(cls):\n        cls.es = elasticsearch.Elasticsearch(connection_class=Urllib3HttpConnection, host=HOST, port=PORT, http_auth=HTTP_AUTH)\n\n    def setUp(self):\n        \"\"\"Cleanup before each test.\"\"\"\n        _cleanup()\n\n    def tearDown(self):\n        \"\"\"Remove residues after each test.\"\"\"\n        _cleanup()\n\n    def test_copy_to_index(self):\n        \"\"\"Test a single document upload.\"\"\"\n        task = IndexingTask1()\n        self.assertFalse(self.es.indices.exists(task.index))\n        self.assertFalse(task.complete())\n        
luigi.build([task], local_scheduler=True)\n        self.assertTrue(self.es.indices.exists(task.index))\n        self.assertTrue(task.complete())\n        self.assertEqual(1, self.es.count(index=task.index).get(\"count\"))\n        self.assertEqual({\"date\": \"today\", \"name\": \"sample\"}, self.es.get_source(index=task.index, doc_type=task.doc_type, id=123))\n\n    def test_copy_to_index_incrementally(self):\n        \"\"\"Test two tasks that upload docs into the same index.\"\"\"\n        task1 = IndexingTask1()\n        task2 = IndexingTask2()\n        self.assertFalse(self.es.indices.exists(task1.index))\n        self.assertFalse(self.es.indices.exists(task2.index))\n        self.assertFalse(task1.complete())\n        self.assertFalse(task2.complete())\n        luigi.build([task1, task2], local_scheduler=True)\n        self.assertTrue(self.es.indices.exists(task1.index))\n        self.assertTrue(self.es.indices.exists(task2.index))\n        self.assertTrue(task1.complete())\n        self.assertTrue(task2.complete())\n        self.assertEqual(2, self.es.count(index=task1.index).get(\"count\"))\n        self.assertEqual(2, self.es.count(index=task2.index).get(\"count\"))\n\n        self.assertEqual({\"date\": \"today\", \"name\": \"sample\"}, self.es.get_source(index=task1.index, doc_type=task1.doc_type, id=123))\n\n        self.assertEqual({\"date\": \"today\", \"name\": \"another\"}, self.es.get_source(index=task2.index, doc_type=task2.doc_type, id=234))\n\n    def test_copy_to_index_purge_existing(self):\n        \"\"\"Test purge_existing_index purges index.\"\"\"\n        task1 = IndexingTask1()\n        task2 = IndexingTask2()\n        task3 = IndexingTask3()\n        luigi.build([task1, task2], local_scheduler=True)\n        luigi.build([task3], local_scheduler=True)\n        self.assertTrue(self.es.indices.exists(task3.index))\n        self.assertTrue(task3.complete())\n        self.assertEqual(1, self.es.count(index=task3.index).get(\"count\"))\n\n       
 self.assertEqual({\"date\": \"today\", \"name\": \"yet another\"}, self.es.get_source(index=task3.index, doc_type=task3.doc_type, id=234))\n\n\n@pytest.mark.aws\nclass MarkerIndexTest(unittest.TestCase):\n    @classmethod\n    def setUpClass(cls):\n        cls.es = elasticsearch.Elasticsearch(connection_class=Urllib3HttpConnection, host=HOST, port=PORT, http_auth=HTTP_AUTH)\n\n    def setUp(self):\n        \"\"\"Cleanup before each test.\"\"\"\n        _cleanup()\n\n    def tearDown(self):\n        \"\"\"Remove residues after each test.\"\"\"\n        _cleanup()\n\n    def test_update_marker(self):\n        def will_raise():\n            self.es.count(index=MARKER_INDEX, doc_type=MARKER_DOC_TYPE, body={\"query\": {\"match_all\": {}}})\n\n        self.assertRaises(elasticsearch.NotFoundError, will_raise)\n\n        task1 = IndexingTask1()\n        luigi.build([task1], local_scheduler=True)\n\n        result = self.es.count(index=MARKER_INDEX, doc_type=MARKER_DOC_TYPE, body={\"query\": {\"match_all\": {}}})\n        self.assertEqual(1, result.get(\"count\"))\n\n        result = self.es.search(index=MARKER_INDEX, doc_type=MARKER_DOC_TYPE, body={\"query\": {\"match_all\": {}}})\n        marker_doc = result.get(\"hits\").get(\"hits\")[0].get(\"_source\")\n        self.assertEqual(task1.task_id, marker_doc.get(\"update_id\"))\n        self.assertEqual(INDEX, marker_doc.get(\"target_index\"))\n        self.assertEqual(DOC_TYPE, marker_doc.get(\"target_doc_type\"))\n        self.assertTrue(\"date\" in marker_doc)\n\n        task2 = IndexingTask2()\n        luigi.build([task2], local_scheduler=True)\n\n        result = self.es.count(index=MARKER_INDEX, doc_type=MARKER_DOC_TYPE, body={\"query\": {\"match_all\": {}}})\n        self.assertEqual(2, result.get(\"count\"))\n\n        result = self.es.search(index=MARKER_INDEX, doc_type=MARKER_DOC_TYPE, body={\"query\": {\"match_all\": {}}})\n        hits = result.get(\"hits\").get(\"hits\")\n        Entry = 
collections.namedtuple(\"Entry\", [\"date\", \"update_id\"])\n        dates_update_id = []\n        for hit in hits:\n            source = hit.get(\"_source\")\n            update_id = source.get(\"update_id\")\n            date = source.get(\"date\")\n            dates_update_id.append(Entry(date, update_id))\n\n        it = iter(sorted(dates_update_id))\n        first = next(it)\n        second = next(it)\n        self.assertTrue(first.date < second.date)\n        self.assertEqual(first.update_id, task1.task_id)\n        self.assertEqual(second.update_id, task2.task_id)\n\n\nclass IndexingTask4(CopyToTestIndex):\n    \"\"\"Just another task.\"\"\"\n\n    date = luigi.DateParameter(default=datetime.date(1970, 1, 1))\n    marker_index_hist_size = 1\n\n    def docs(self):\n        \"\"\"Return a list with a single doc.\"\"\"\n        return [{\"_id\": 234, \"_index\": self.index, \"_type\": self.doc_type, \"name\": \"another\", \"date\": \"today\"}]\n\n\n@pytest.mark.aws\nclass IndexHistSizeTest(unittest.TestCase):\n    @classmethod\n    def setUpClass(cls):\n        cls.es = elasticsearch.Elasticsearch(connection_class=Urllib3HttpConnection, host=HOST, port=PORT, http_auth=HTTP_AUTH)\n\n    def setUp(self):\n        \"\"\"Cleanup before each test.\"\"\"\n        _cleanup()\n\n    def tearDown(self):\n        \"\"\"Remove residues after each test.\"\"\"\n        _cleanup()\n\n    def test_limited_history(self):\n\n        task4_1 = IndexingTask4(date=datetime.date(2000, 1, 1))\n        luigi.build([task4_1], local_scheduler=True)\n\n        task4_2 = IndexingTask4(date=datetime.date(2001, 1, 1))\n        luigi.build([task4_2], local_scheduler=True)\n\n        task4_3 = IndexingTask4(date=datetime.date(2002, 1, 1))\n        luigi.build([task4_3], local_scheduler=True)\n\n        result = self.es.count(index=MARKER_INDEX, doc_type=MARKER_DOC_TYPE, body={\"query\": {\"match_all\": {}}})\n        self.assertEqual(1, result.get(\"count\"))\n        
marker_index_document_id = task4_3.output().marker_index_document_id()\n        result = self.es.get(id=marker_index_document_id, index=MARKER_INDEX, doc_type=MARKER_DOC_TYPE)\n        self.assertEqual(task4_3.task_id, result.get(\"_source\").get(\"update_id\"))\n"
  },
  {
    "path": "test/contrib/external_daily_snapshot_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport datetime\nimport unittest\n\nimport luigi\nfrom luigi.contrib.external_daily_snapshot import ExternalDailySnapshot\nfrom luigi.mock import MockTarget\n\n\nclass DataDump(ExternalDailySnapshot):\n    param = luigi.Parameter()\n    a = luigi.Parameter(default=\"zebra\")\n    aa = luigi.Parameter(default=\"Congo\")\n\n    def output(self):\n        return MockTarget(\"data-%s-%s-%s-%s\" % (self.param, self.a, self.aa, self.date))\n\n\nclass ExternalDailySnapshotTest(unittest.TestCase):\n    def test_latest(self):\n        MockTarget(\"data-xyz-zebra-Congo-2012-01-01\").open(\"w\").close()\n        d = DataDump.latest(date=datetime.date(2012, 1, 10), param=\"xyz\")\n        self.assertEqual(d.date, datetime.date(2012, 1, 1))\n\n    def test_latest_not_exists(self):\n        MockTarget(\"data-abc-zebra-Congo-2012-01-01\").open(\"w\").close()\n        d = DataDump.latest(date=datetime.date(2012, 1, 11), param=\"abc\", lookback=5)\n        self.assertEqual(d.date, datetime.date(2012, 1, 7))\n\n    def test_deterministic(self):\n        MockTarget(\"data-pqr-zebra-Congo-2012-01-01\").open(\"w\").close()\n        d = DataDump.latest(date=datetime.date(2012, 1, 10), param=\"pqr\", a=\"zebra\", aa=\"Congo\")\n        self.assertEqual(d.date, datetime.date(2012, 1, 1))\n\n        MockTarget(\"data-pqr-zebra-Congo-2012-01-05\").open(\"w\").close()\n        d = 
DataDump.latest(date=datetime.date(2012, 1, 10), param=\"pqr\", aa=\"Congo\", a=\"zebra\")\n        self.assertEqual(d.date, datetime.date(2012, 1, 1))  # Should still be the same\n"
  },
  {
    "path": "test/contrib/external_program_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2016 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport os\nimport shutil\nimport subprocess\nimport tempfile\nfrom functools import partial\nfrom io import BytesIO\nfrom multiprocessing import Value\nfrom subprocess import Popen\n\nimport mock\nimport pytest\nfrom helpers import unittest\nfrom mock import call, patch\n\nimport luigi\nimport luigi.contrib.hdfs\nfrom luigi.contrib.external_program import ExternalProgramRunError, ExternalProgramTask, ExternalPythonProgramTask\n\n\ndef poll_generator():\n    yield None\n    yield 1\n\n\ndef setup_run_process(proc):\n    poll_gen = poll_generator()\n    proc.return_value.poll = lambda: next(poll_gen)\n    proc.return_value.returncode = 0\n    proc.return_value.stdout = BytesIO()\n    proc.return_value.stderr = BytesIO()\n\n\nclass TestExternalProgramTask(ExternalProgramTask):\n    def program_args(self):\n        return [\"app_path\", \"arg1\", \"arg2\"]\n\n    def output(self):\n        return luigi.LocalTarget(\"output\")\n\n\nclass TestLogStderrOnFailureOnlyTask(TestExternalProgramTask):\n    always_log_stderr = False\n\n\nclass TestTouchTask(ExternalProgramTask):\n    file_path = luigi.Parameter()\n\n    def program_args(self):\n        return [\"touch\", self.output().path]\n\n    def output(self):\n        return luigi.LocalTarget(self.file_path)\n\n\nclass TestEchoTask(ExternalProgramTask):\n    MESSAGE = \"Hello, world!\"\n\n    def 
program_args(self):\n        return [\"echo\", self.MESSAGE]\n\n\n@pytest.mark.contrib\nclass ExternalProgramTaskTest(unittest.TestCase):\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_run(self, proc):\n        setup_run_process(proc)\n        job = TestExternalProgramTask()\n        job.run()\n\n        self.assertEqual(proc.call_args[0][0], [\"app_path\", \"arg1\", \"arg2\"])\n\n    @patch(\"luigi.contrib.external_program.logger\")\n    @patch(\"luigi.contrib.external_program.tempfile.TemporaryFile\")\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_handle_failed_job(self, proc, file, logger):\n        proc.return_value.returncode = 1\n        file.return_value = BytesIO(b\"stderr\")\n        try:\n            job = TestExternalProgramTask()\n            job.run()\n        except ExternalProgramRunError as e:\n            self.assertEqual(e.err, \"stderr\")\n            self.assertIn(\"STDERR: stderr\", str(e))\n            self.assertIn(call.info(\"Program stderr:\\nstderr\"), logger.mock_calls)\n        else:\n            self.fail(\"Should have thrown ExternalProgramRunError\")\n\n    @patch(\"luigi.contrib.external_program.logger\")\n    @patch(\"luigi.contrib.external_program.tempfile.TemporaryFile\")\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_always_log_stderr_on_failure(self, proc, file, logger):\n        proc.return_value.returncode = 1\n        file.return_value = BytesIO(b\"stderr\")\n        with self.assertRaises(ExternalProgramRunError):\n            job = TestLogStderrOnFailureOnlyTask()\n            job.run()\n\n        self.assertIn(call.info(\"Program stderr:\\nstderr\"), logger.mock_calls)\n\n    @patch(\"luigi.contrib.external_program.logger\")\n    @patch(\"luigi.contrib.external_program.tempfile.TemporaryFile\")\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_log_stderr_on_success_by_default(self, proc, file, 
logger):\n        proc.return_value.returncode = 0\n        file.return_value = BytesIO(b\"stderr\")\n        job = TestExternalProgramTask()\n        job.run()\n\n        self.assertIn(call.info(\"Program stderr:\\nstderr\"), logger.mock_calls)\n\n    def test_capture_output_set_to_false_writes_output_to_stdout(self):\n\n        out = tempfile.TemporaryFile()\n\n        def Popen_wrap(args, **kwargs):\n            kwargs.pop(\"stdout\", None)\n            return Popen(args, stdout=out, **kwargs)\n\n        with mock.patch(\"luigi.contrib.external_program.subprocess.Popen\", wraps=Popen_wrap):\n            task = TestEchoTask(capture_output=False)\n            task.run()\n            stdout = task._clean_output_file(out).strip()\n            self.assertEqual(stdout, task.MESSAGE)\n\n    @patch(\"luigi.contrib.external_program.logger\")\n    @patch(\"luigi.contrib.external_program.tempfile.TemporaryFile\")\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_dont_log_stderr_on_success_if_disabled(self, proc, file, logger):\n        proc.return_value.returncode = 0\n        file.return_value = BytesIO(b\"stderr\")\n        job = TestLogStderrOnFailureOnlyTask()\n        job.run()\n\n        self.assertNotIn(call.info(\"Program stderr:\\nstderr\"), logger.mock_calls)\n\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_program_args_must_be_implemented(self, proc):\n        with self.assertRaises(NotImplementedError):\n            job = ExternalProgramTask()\n            job.run()\n\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_app_interruption(self, proc):\n\n        def interrupt():\n            raise KeyboardInterrupt()\n\n        proc.return_value.wait = interrupt\n        try:\n            job = TestExternalProgramTask()\n            job.run()\n        except KeyboardInterrupt:\n            pass\n        proc.return_value.kill.check_called()\n\n    def 
test_non_mocked_task_run(self):\n        # create a tempdir first, to ensure an empty playground for\n        # TestTouchTask to create its file in\n        tempdir = tempfile.mkdtemp()\n        tempfile_path = os.path.join(tempdir, \"testfile\")\n\n        try:\n            job = TestTouchTask(file_path=tempfile_path)\n            job.run()\n\n            self.assertTrue(luigi.LocalTarget(tempfile_path).exists())\n        finally:\n            # clean up temp files even if assertion fails\n            shutil.rmtree(tempdir)\n\n    def test_tracking_url_pattern_works_with_capture_output_disabled(self):\n        test_val = Value(\"i\", 0)\n\n        def fake_set_tracking_url(val, url):\n            if url == \"TEXT\":\n                val.value += 1\n\n        task = TestEchoTask(capture_output=False, stream_for_searching_tracking_url=\"stdout\", tracking_url_pattern=r\"SOME (.*)\")\n        task.MESSAGE = \"SOME TEXT\"\n\n        with mock.patch.object(task, \"set_tracking_url\", new=partial(fake_set_tracking_url, test_val)):\n            task.run()\n            self.assertEqual(test_val.value, 1)\n\n    def test_tracking_url_pattern_works_with_capture_output_enabled(self):\n        test_val = Value(\"i\", 0)\n\n        def fake_set_tracking_url(val, url):\n            if url == \"THING\":\n                val.value += 1\n\n        task = TestEchoTask(capture_output=True, stream_for_searching_tracking_url=\"stdout\", tracking_url_pattern=r\"ANY(.*)\")\n        task.MESSAGE = \"ANYTHING\"\n\n        with mock.patch.object(task, \"set_tracking_url\", new=partial(fake_set_tracking_url, test_val)):\n            task.run()\n            self.assertEqual(test_val.value, 1)\n\n    def test_tracking_url_pattern_works_with_stderr(self):\n        test_val = Value(\"i\", 0)\n\n        def fake_set_tracking_url(val, url):\n            if url == \"THING_ELSE\":\n                val.value += 1\n\n        def Popen_wrap(args, **kwargs):\n            return Popen('>&2 echo 
\"ANYTHING_ELSE\"', shell=True, **kwargs)\n\n        task = TestEchoTask(capture_output=True, stream_for_searching_tracking_url=\"stderr\", tracking_url_pattern=r\"ANY(.*)\")\n\n        with mock.patch(\"luigi.contrib.external_program.subprocess.Popen\", wraps=Popen_wrap):\n            with mock.patch.object(task, \"set_tracking_url\", new=partial(fake_set_tracking_url, test_val)):\n                task.run()\n                self.assertEqual(test_val.value, 1)\n\n    def test_no_url_searching_is_performed_if_pattern_is_not_set(self):\n        def Popen_wrap(args, **kwargs):\n            # stdout should not be replaced with pipe if tracking_url_pattern is not set\n            self.assertNotEqual(kwargs[\"stdout\"], subprocess.PIPE)\n            return Popen(args, **kwargs)\n\n        task = TestEchoTask(capture_output=True, stream_for_searching_tracking_url=\"stdout\")\n\n        with mock.patch(\"luigi.contrib.external_program.subprocess.Popen\", wraps=Popen_wrap):\n            task.run()\n\n    def test_tracking_url_context_works_without_capture_output(self):\n        test_val = Value(\"i\", 0)\n\n        def fake_set_tracking_url(val, url):\n            if url == \"world\":\n                val.value += 1\n\n        task = TestEchoTask(capture_output=False, stream_for_searching_tracking_url=\"stdout\", tracking_url_pattern=r\"Hello, (.*)!\")\n        test_args = list(map(str, task.program_args()))\n        with mock.patch.object(task, \"set_tracking_url\", new=partial(fake_set_tracking_url, test_val)):\n            with task._proc_with_tracking_url_context(proc_args=test_args, proc_kwargs={}) as proc:\n                proc.wait()\n        self.assertEqual(test_val.value, 1)\n\n    def test_tracking_url_context_works_correctly_when_logs_output_pattern_to_url_is_not_default(self):\n\n        class _Task(TestEchoTask):\n            def build_tracking_url(self, logs_output):\n                return \"The {} is mine\".format(logs_output)\n\n        test_val = 
Value(\"i\", 0)\n\n        def fake_set_tracking_url(val, url):\n            if url == \"The world is mine\":\n                val.value += 1\n\n        task = _Task(capture_output=False, stream_for_searching_tracking_url=\"stdout\", tracking_url_pattern=r\"Hello, (.*)!\")\n\n        test_args = list(map(str, task.program_args()))\n\n        with mock.patch.object(task, \"set_tracking_url\", new=partial(fake_set_tracking_url, test_val)):\n            with task._proc_with_tracking_url_context(proc_args=test_args, proc_kwargs={}) as proc:\n                proc.wait()\n        self.assertEqual(test_val.value, 1)\n\n\nclass TestExternalPythonProgramTask(ExternalPythonProgramTask):\n    virtualenv = \"/path/to/venv\"\n    extra_pythonpath = \"/extra/pythonpath\"\n\n    def program_args(self):\n        return [\"app_path\", \"arg1\", \"arg2\"]\n\n    def output(self):\n        return luigi.LocalTarget(\"output\")\n\n\n@pytest.mark.contrib\nclass ExternalPythonProgramTaskTest(unittest.TestCase):\n    @patch.dict(\"os.environ\", {\"OTHERVAR\": \"otherval\"}, clear=True)\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_original_environment_is_kept_intact(self, proc):\n        setup_run_process(proc)\n\n        job = TestExternalPythonProgramTask()\n        job.run()\n\n        proc_env = proc.call_args[1][\"env\"]\n        self.assertIn(\"PYTHONPATH\", proc_env)\n        self.assertIn(\"OTHERVAR\", proc_env)\n\n    @patch.dict(\"os.environ\", {\"PATH\": \"/base/path\"}, clear=True)\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_venv_is_set_and_prepended_to_path(self, proc):\n        setup_run_process(proc)\n\n        job = TestExternalPythonProgramTask()\n        job.run()\n\n        proc_env = proc.call_args[1][\"env\"]\n        self.assertIn(\"PATH\", proc_env)\n        self.assertTrue(proc_env[\"PATH\"].startswith(\"/path/to/venv/bin\"))\n        self.assertTrue(proc_env[\"PATH\"].endswith(\"/base/path\"))\n   
     self.assertIn(\"VIRTUAL_ENV\", proc_env)\n        self.assertEqual(proc_env[\"VIRTUAL_ENV\"], \"/path/to/venv\")\n\n    @patch.dict(\"os.environ\", {}, clear=True)\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_pythonpath_is_set_if_empty(self, proc):\n        setup_run_process(proc)\n\n        job = TestExternalPythonProgramTask()\n        job.run()\n\n        proc_env = proc.call_args[1][\"env\"]\n        self.assertIn(\"PYTHONPATH\", proc_env)\n        self.assertTrue(proc_env[\"PYTHONPATH\"].startswith(\"/extra/pythonpath\"))\n\n    @patch.dict(\"os.environ\", {\"PYTHONPATH\": \"/base/pythonpath\"}, clear=True)\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_pythonpath_is_prepended_if_not_empty(self, proc):\n        setup_run_process(proc)\n\n        job = TestExternalPythonProgramTask()\n        job.run()\n\n        proc_env = proc.call_args[1][\"env\"]\n        self.assertIn(\"PYTHONPATH\", proc_env)\n        self.assertTrue(proc_env[\"PYTHONPATH\"].startswith(\"/extra/pythonpath\"))\n        self.assertTrue(proc_env[\"PYTHONPATH\"].endswith(\"/base/pythonpath\"))\n"
  },
  {
    "path": "test/contrib/gcs_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015 Twitter Inc\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"This is an integration test for the GCS-luigi binding.\n\nThis test requires credentials that can access GCS & access to a bucket below.\nFollow the directions in the gcloud tools to set up local credentials.\n\"\"\"\n\nfrom helpers import unittest\n\ntry:\n    import google.auth\n    import googleapiclient.errors\nexcept ImportError:\n    raise unittest.SkipTest(\"Unable to load googleapiclient module\")\nimport os\nimport tempfile\nimport unittest\nfrom unittest import mock\n\nimport pytest\nfrom target_test import FileSystemTargetTestMixin\n\nfrom luigi.contrib import gcs\n\n# In order to run this test, you should set these to your GCS project/bucket.\n# Unfortunately there's no mock\nPROJECT_ID = os.environ.get(\"GCS_TEST_PROJECT_ID\", \"your_project_id_here\")\nBUCKET_NAME = os.environ.get(\"GCS_TEST_BUCKET\", \"your_test_bucket_here\")\nTEST_FOLDER = os.environ.get(\"TRAVIS_BUILD_ID\", \"gcs_test_folder\")\n\nCREDENTIALS, _ = google.auth.default()\nATTEMPTED_BUCKET_CREATE = False\n\n\ndef bucket_url(suffix):\n    \"\"\"\n    Actually it's bucket + test folder name\n    \"\"\"\n    return \"gs://{}/{}/{}\".format(BUCKET_NAME, TEST_FOLDER, suffix)\n\n\nclass _GCSBaseTestCase(unittest.TestCase):\n    def setUp(self):\n        self.client = gcs.GCSClient(CREDENTIALS)\n\n        global ATTEMPTED_BUCKET_CREATE\n        if not 
ATTEMPTED_BUCKET_CREATE:\n            try:\n                self.client.client.buckets().insert(project=PROJECT_ID, body={\"name\": BUCKET_NAME}).execute()\n            except googleapiclient.errors.HttpError as ex:\n                if ex.resp.status != 409:  # bucket already exists\n                    raise\n\n            ATTEMPTED_BUCKET_CREATE = True\n\n        self.client.remove(bucket_url(\"\"), recursive=True)\n        self.client.mkdir(bucket_url(\"\"))\n\n    def tearDown(self):\n        self.client.remove(bucket_url(\"\"), recursive=True)\n\n\n@pytest.mark.gcloud\nclass GCSClientTest(_GCSBaseTestCase):\n    def test_not_exists(self):\n        self.assertFalse(self.client.exists(bucket_url(\"does_not_exist\")))\n        self.assertFalse(self.client.isdir(bucket_url(\"does_not_exist\")))\n\n    def test_exists(self):\n        self.client.put_string(\"hello\", bucket_url(\"exists_test\"))\n        self.assertTrue(self.client.exists(bucket_url(\"exists_test\")))\n        self.assertFalse(self.client.isdir(bucket_url(\"exists_test\")))\n\n    def test_mkdir(self):\n        self.client.mkdir(bucket_url(\"exists_dir_test\"))\n        self.assertTrue(self.client.exists(bucket_url(\"exists_dir_test\")))\n        self.assertTrue(self.client.isdir(bucket_url(\"exists_dir_test\")))\n\n    def test_mkdir_by_upload(self):\n        self.client.put_string(\"hello\", bucket_url(\"test_dir_recursive/yep/file\"))\n        self.assertTrue(self.client.exists(bucket_url(\"test_dir_recursive\")))\n        self.assertTrue(self.client.isdir(bucket_url(\"test_dir_recursive\")))\n\n    def test_download(self):\n        self.client.put_string(\"hello\", bucket_url(\"test_download\"))\n        fp = self.client.download(bucket_url(\"test_download\"))\n        self.assertEqual(b\"hello\", fp.read())\n\n    def test_rename(self):\n        self.client.put_string(\"hello\", bucket_url(\"test_rename_1\"))\n        self.client.rename(bucket_url(\"test_rename_1\"), 
bucket_url(\"test_rename_2\"))\n        self.assertFalse(self.client.exists(bucket_url(\"test_rename_1\")))\n        self.assertTrue(self.client.exists(bucket_url(\"test_rename_2\")))\n\n    def test_rename_recursive(self):\n        self.client.mkdir(bucket_url(\"test_rename_recursive\"))\n        self.client.put_string(\"hello\", bucket_url(\"test_rename_recursive/1\"))\n        self.client.put_string(\"hello\", bucket_url(\"test_rename_recursive/2\"))\n        self.client.rename(bucket_url(\"test_rename_recursive\"), bucket_url(\"test_rename_recursive_dest\"))\n        self.assertFalse(self.client.exists(bucket_url(\"test_rename_recursive\")))\n        self.assertFalse(self.client.exists(bucket_url(\"test_rename_recursive/1\")))\n        self.assertTrue(self.client.exists(bucket_url(\"test_rename_recursive_dest\")))\n        self.assertTrue(self.client.exists(bucket_url(\"test_rename_recursive_dest/1\")))\n\n    def test_remove(self):\n        self.client.put_string(\"hello\", bucket_url(\"test_remove\"))\n        self.client.remove(bucket_url(\"test_remove\"))\n        self.assertFalse(self.client.exists(bucket_url(\"test_remove\")))\n\n    def test_remove_recursive(self):\n        self.client.mkdir(bucket_url(\"test_remove_recursive\"))\n        self.client.put_string(\"hello\", bucket_url(\"test_remove_recursive/1\"))\n        self.client.put_string(\"hello\", bucket_url(\"test_remove_recursive/2\"))\n        self.client.remove(bucket_url(\"test_remove_recursive\"))\n\n        self.assertFalse(self.client.exists(bucket_url(\"test_remove_recursive\")))\n        self.assertFalse(self.client.exists(bucket_url(\"test_remove_recursive/1\")))\n        self.assertFalse(self.client.exists(bucket_url(\"test_remove_recursive/2\")))\n\n    def test_listdir(self):\n        self.client.put_string(\"hello\", bucket_url(\"test_listdir/1\"))\n        self.client.put_string(\"hello\", bucket_url(\"test_listdir/2\"))\n\n        self.assertEqual([bucket_url(\"test_listdir/1\"), 
bucket_url(\"test_listdir/2\")], list(self.client.listdir(bucket_url(\"test_listdir/\"))))\n        self.assertEqual([bucket_url(\"test_listdir/1\"), bucket_url(\"test_listdir/2\")], list(self.client.listdir(bucket_url(\"test_listdir\"))))\n\n    def test_put_file(self):\n        with tempfile.NamedTemporaryFile() as fp:\n            lorem = b\"Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt\\n\"\n            # Larger file than chunk size, fails with incorrect progress set up\n            big = lorem * 41943\n            fp.write(big)\n            fp.flush()\n\n            self.client.put(fp.name, bucket_url(\"test_put_file\"))\n            self.assertTrue(self.client.exists(bucket_url(\"test_put_file\")))\n            self.assertEqual(big, self.client.download(bucket_url(\"test_put_file\")).read())\n\n    def test_put_file_multiproc(self):\n        temporary_fps = []\n        for _ in range(2):\n            fp = tempfile.NamedTemporaryFile(mode=\"wb\")\n\n            lorem = b\"Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt\\n\"\n            # Larger file than chunk size, fails with incorrect progress set up\n            big = lorem * 41943\n            fp.write(big)\n            fp.flush()\n            temporary_fps.append(fp)\n\n        filepaths = [f.name for f in temporary_fps]\n        self.client.put_multiple(filepaths, bucket_url(\"\"), num_process=2)\n\n        for fp in temporary_fps:\n            basename = os.path.basename(fp.name)\n            self.assertTrue(self.client.exists(bucket_url(basename)))\n            self.assertEqual(big, self.client.download(bucket_url(basename)).read())\n            fp.close()\n\n\n@pytest.mark.gcloud\nclass GCSTargetTest(_GCSBaseTestCase, FileSystemTargetTestMixin):\n    def create_target(self, format=None):\n        return gcs.GCSTarget(bucket_url(self.id()), format=format, client=self.client)\n\n    def 
test_close_twice(self):\n        # Ensure gcs._DeleteOnCloseFile().close() can be called multiple times\n        tgt = self.create_target()\n\n        with tgt.open(\"w\") as dst:\n            dst.write(\"data\")\n        assert dst.closed\n        dst.close()\n        assert dst.closed\n\n        with tgt.open() as src:\n            assert src.read().strip() == \"data\"\n        assert src.closed\n        src.close()\n        assert src.closed\n\n\nclass RetryTest(unittest.TestCase):\n    def test_success_with_retryable_error(self):\n        m = mock.MagicMock(side_effect=[IOError, IOError, \"test_func_output\"])\n\n        @gcs.gcs_retry\n        def mock_func():\n            return m()\n\n        actual = mock_func()\n        expected = \"test_func_output\"\n        self.assertEqual(expected, actual)\n\n    def test_fail_with_retry_limit_exceed(self):\n        m = mock.MagicMock(side_effect=[IOError, IOError, IOError, IOError, IOError])\n\n        @gcs.gcs_retry\n        def mock_func():\n            return m()\n\n        with self.assertRaises(IOError):\n            mock_func()\n"
  },
  {
    "path": "test/contrib/hadoop_jar_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport shlex\nimport tempfile\n\nimport pytest\nfrom helpers import unittest\nfrom mock import Mock, patch\n\nimport luigi\nfrom luigi.contrib.hadoop_jar import HadoopJarJobError, HadoopJarJobTask, fix_paths\n\n\nclass TestHadoopJarJob(HadoopJarJobTask):\n    path = luigi.Parameter()\n\n    def jar(self):\n        return self.path\n\n\nclass TestMissingJarJob(HadoopJarJobTask):\n    pass\n\n\nclass TestRemoteHadoopJarJob(TestHadoopJarJob):\n    def ssh(self):\n        return {\"host\": \"myhost\", \"key_file\": \"file\", \"username\": \"user\"}\n\n\nclass TestRemoteMissingJarJob(TestHadoopJarJob):\n    def ssh(self):\n        return {\"host\": \"myhost\", \"key_file\": \"file\"}\n\n\nclass TestRemoteHadoopJarTwoParamJob(TestRemoteHadoopJarJob):\n    param2 = luigi.Parameter()\n\n\n@pytest.mark.apache\nclass FixPathsTest(unittest.TestCase):\n    def test_fix_paths_non_hdfs_target_path(self):\n        mock_job = Mock()\n        mock_arg = Mock()\n        mock_job.args.return_value = [mock_arg]\n        mock_arg.path = \"right_path\"\n        self.assertEqual(([], [\"right_path\"]), fix_paths(mock_job))\n\n    def test_fix_paths_non_hdfs_target_str(self):\n        mock_job = Mock()\n        mock_arg = Mock(spec=[])\n        mock_job.args.return_value = [mock_arg]\n        self.assertEqual(([], [str(mock_arg)]), 
fix_paths(mock_job))\n\n\nclass HadoopJarJobTaskTest(unittest.TestCase):\n    @patch(\"luigi.contrib.hadoop.run_and_track_hadoop_job\")\n    def test_good(self, mock_job):\n        mock_job.return_value = None\n        with tempfile.NamedTemporaryFile() as temp_file:\n            task = TestHadoopJarJob(temp_file.name)\n            task.run()\n\n    @patch(\"luigi.contrib.hadoop.run_and_track_hadoop_job\")\n    def test_missing_jar(self, mock_job):\n        mock_job.return_value = None\n        task = TestMissingJarJob()\n        self.assertRaises(HadoopJarJobError, task.run)\n\n    @patch(\"luigi.contrib.hadoop.run_and_track_hadoop_job\")\n    def test_remote_job(self, mock_job):\n        mock_job.return_value = None\n        with tempfile.NamedTemporaryFile() as temp_file:\n            task = TestRemoteHadoopJarJob(temp_file.name)\n            task.run()\n\n    @patch(\"luigi.contrib.hadoop.run_and_track_hadoop_job\")\n    def test_remote_job_with_space_in_task_id(self, mock_job):\n        with tempfile.NamedTemporaryFile() as temp_file:\n\n            def check_space(arr, task_id):\n                for a in arr:\n                    if a.startswith(\"hadoop jar\"):\n                        found = False\n                        for x in shlex.split(a):\n                            if task_id in x:\n                                found = True\n                        if not found:\n                            raise AssertionError\n\n            task = TestRemoteHadoopJarTwoParamJob(temp_file.name, \"test\")\n            mock_job.side_effect = lambda x, _: check_space(x, str(task))\n            task.run()\n\n    @patch(\"luigi.contrib.hadoop.run_and_track_hadoop_job\")\n    def test_remote_job_missing_config(self, mock_job):\n        mock_job.return_value = None\n        with tempfile.NamedTemporaryFile() as temp_file:\n            task = TestRemoteMissingJarJob(temp_file.name)\n            self.assertRaises(HadoopJarJobError, task.run)\n"
  },
  {
    "path": "test/contrib/hdfs/webhdfs_client_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015 VNG Corporation\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport unittest\n\nimport pytest\nfrom helpers import with_config\n\nfrom luigi.contrib.hdfs import WebHdfsClient\n\nInsecureClient = pytest.importorskip(\"hdfs.InsecureClient\")\nKerberosClient = pytest.importorskip(\"hdfs.ext.kerberos.KerberosClient\")\n\n\n@pytest.mark.apache\nclass TestWebHdfsClient(unittest.TestCase):\n    @with_config({\"webhdfs\": {\"client_type\": \"insecure\"}})\n    def test_insecure_client_type(self):\n        client = WebHdfsClient(host=\"localhost\").client\n        self.assertIsInstance(client, InsecureClient)\n\n    @with_config({\"webhdfs\": {\"client_type\": \"kerberos\"}})\n    def test_kerberos_client_type(self):\n        client = WebHdfsClient(host=\"localhost\").client\n        self.assertIsInstance(client, KerberosClient)\n"
  },
  {
    "path": "test/contrib/hdfs_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport pickle\nimport random\nimport re\n\nfrom target_test import FileSystemTargetTestMixin\n\nimport luigi\nimport luigi.contrib.hdfs.clients\nimport luigi.format\nfrom luigi.contrib import hdfs\n\n\nclass ComplexOldFormat(luigi.format.Format):\n    \"\"\"Should take unicode but output bytes\"\"\"\n\n    def hdfs_writer(self, output_pipe):\n        return self.pipe_writer(luigi.contrib.hdfs.Plain.hdfs_writer(output_pipe))\n\n    def pipe_writer(self, output_pipe):\n        return luigi.format.UTF8.pipe_writer(output_pipe)\n\n    def pipe_reader(self, output_pipe):\n        return output_pipe\n\n\nclass TestException(Exception):\n    pass\n\n\nclass HdfsTargetTestMixin(FileSystemTargetTestMixin):\n    def create_target(self, format=None):\n        target = hdfs.HdfsTarget(self._test_file(), format=format)\n        if target.exists():\n            target.remove(skip_trash=True)\n        return target\n\n    def test_slow_exists(self):\n        target = hdfs.HdfsTarget(self._test_file())\n        try:\n            target.remove(skip_trash=True)\n        except BaseException:\n            pass\n\n        self.assertFalse(self.fs.exists(target.path))\n        target.open(\"w\").close()\n        self.assertTrue(self.fs.exists(target.path))\n\n        def should_raise():\n            self.fs.exists(\"hdfs://doesnotexist/foo\")\n\n        
self.assertRaises(hdfs.HDFSCliError, should_raise)\n\n        def should_raise_2():\n            self.fs.exists(\"hdfs://_doesnotexist_/foo\")\n\n        self.assertRaises(hdfs.HDFSCliError, should_raise_2)\n\n    def test_create_ancestors(self):\n        parent = self._test_dir()\n        target = hdfs.HdfsTarget(\"%s/foo/bar/baz\" % parent)\n        if self.fs.exists(parent):\n            self.fs.remove(parent, skip_trash=True)\n        self.assertFalse(self.fs.exists(parent))\n        fobj = target.open(\"w\")\n        fobj.write(\"lol\\n\")\n        fobj.close()\n        self.assertTrue(self.fs.exists(parent))\n        self.assertTrue(target.exists())\n\n    def test_tmp_cleanup(self):\n        path = self._test_file()\n        target = hdfs.HdfsTarget(path, is_tmp=True)\n        if target.exists():\n            target.remove(skip_trash=True)\n        with target.open(\"w\") as fobj:\n            fobj.write(\"lol\\n\")\n        self.assertTrue(target.exists())\n        del target\n        import gc\n\n        gc.collect()\n        self.assertFalse(self.fs.exists(path))\n\n    def test_luigi_tmp(self):\n        target = hdfs.HdfsTarget(is_tmp=True)\n        self.assertFalse(target.exists())\n        with target.open(\"w\"):\n            pass\n        self.assertTrue(target.exists())\n\n    def test_tmp_move(self):\n        target = hdfs.HdfsTarget(is_tmp=True)\n        target2 = hdfs.HdfsTarget(self._test_file())\n        if target2.exists():\n            target2.remove(skip_trash=True)\n        with target.open(\"w\"):\n            pass\n        self.assertTrue(target.exists())\n        target.move(target2.path)\n        self.assertFalse(target.exists())\n        self.assertTrue(target2.exists())\n\n    def test_rename_no_parent(self):\n        parent = self._test_dir() + \"/foo\"\n        if self.fs.exists(parent):\n            self.fs.remove(parent, skip_trash=True)\n\n        target1 = hdfs.HdfsTarget(is_tmp=True)\n        target2 = hdfs.HdfsTarget(parent + 
\"/bar\")\n        with target1.open(\"w\"):\n            pass\n        self.assertTrue(target1.exists())\n        target1.move(target2.path)\n        self.assertFalse(target1.exists())\n        self.assertTrue(target2.exists())\n\n    def test_rename_no_grandparent(self):\n        grandparent = self._test_dir() + \"/foo\"\n        if self.fs.exists(grandparent):\n            self.fs.remove(grandparent, skip_trash=True)\n\n        target1 = hdfs.HdfsTarget(is_tmp=True)\n        target2 = hdfs.HdfsTarget(grandparent + \"/bar/baz\")\n        with target1.open(\"w\"):\n            pass\n        self.assertTrue(target1.exists())\n        target1.move(target2.path)\n        self.assertFalse(target1.exists())\n        self.assertTrue(target2.exists())\n\n    def test_glob_exists(self):\n        target_dir = hdfs.HdfsTarget(self._test_dir())\n        if target_dir.exists():\n            target_dir.remove(skip_trash=True)\n        self.fs.mkdir(target_dir.path)\n        t1 = hdfs.HdfsTarget(target_dir.path + \"/part-00001\")\n        t2 = hdfs.HdfsTarget(target_dir.path + \"/part-00002\")\n        t3 = hdfs.HdfsTarget(target_dir.path + \"/another\")\n\n        with t1.open(\"w\") as f:\n            f.write(\"foo\\n\")\n        with t2.open(\"w\") as f:\n            f.write(\"bar\\n\")\n        with t3.open(\"w\") as f:\n            f.write(\"biz\\n\")\n\n        files = hdfs.HdfsTarget(\"%s/part-0000*\" % target_dir.path)\n\n        self.assertTrue(files.glob_exists(2))\n        self.assertFalse(files.glob_exists(3))\n        self.assertFalse(files.glob_exists(1))\n\n    def assertRegexpMatches(self, text, expected_regexp, msg=None):\n        \"\"\"Python 2.7 backport.\"\"\"\n        if isinstance(expected_regexp, str):\n            expected_regexp = re.compile(expected_regexp)\n        if not expected_regexp.search(text):\n            msg = msg or \"Regexp didn't match\"\n            msg = \"%s: %r not found in %r\" % (msg, expected_regexp.pattern, text)\n            
raise self.failureException(msg)\n\n    def test_tmppath_not_configured(self):\n        # Given: several target paths to test\n        path1 = \"/dir1/dir2/file\"\n        path2 = \"hdfs:///dir1/dir2/file\"\n        path3 = \"hdfs://somehost/dir1/dir2/file\"\n        path4 = \"file:///dir1/dir2/file\"\n        path5 = \"/tmp/dir/file\"\n        path6 = \"file:///tmp/dir/file\"\n        path7 = \"hdfs://somehost/tmp/dir/file\"\n        path8 = None\n        path9 = \"/tmpdir/file\"\n\n        # When: I create a temporary path for targets\n        res1 = hdfs.tmppath(path1, include_unix_username=False)\n        res2 = hdfs.tmppath(path2, include_unix_username=False)\n        res3 = hdfs.tmppath(path3, include_unix_username=False)\n        res4 = hdfs.tmppath(path4, include_unix_username=False)\n        res5 = hdfs.tmppath(path5, include_unix_username=False)\n        res6 = hdfs.tmppath(path6, include_unix_username=False)\n        res7 = hdfs.tmppath(path7, include_unix_username=False)\n        res8 = hdfs.tmppath(path8, include_unix_username=False)\n        res9 = hdfs.tmppath(path9, include_unix_username=False)\n\n        # Then: I should get correct results relative to Luigi temporary directory\n        self.assertRegexpMatches(res1, \"^/tmp/dir1/dir2/file-luigitemp-\\\\d+\")\n        # it would be better to see hdfs:///path instead of hdfs:/path, but single slash also works well\n        self.assertRegexpMatches(res2, \"^hdfs:/tmp/dir1/dir2/file-luigitemp-\\\\d+\")\n        self.assertRegexpMatches(res3, \"^hdfs://somehost/tmp/dir1/dir2/file-luigitemp-\\\\d+\")\n        self.assertRegexpMatches(res4, \"^file:///tmp/dir1/dir2/file-luigitemp-\\\\d+\")\n        self.assertRegexpMatches(res5, \"^/tmp/dir/file-luigitemp-\\\\d+\")\n        # known issue with duplicated \"tmp\" if schema is present\n        self.assertRegexpMatches(res6, \"^file:///tmp/tmp/dir/file-luigitemp-\\\\d+\")\n        # known issue with duplicated \"tmp\" if schema is present\n        
self.assertRegexpMatches(res7, \"^hdfs://somehost/tmp/tmp/dir/file-luigitemp-\\\\d+\")\n        self.assertRegexpMatches(res8, \"^/tmp/luigitemp-\\\\d+\")\n        self.assertRegexpMatches(res9, \"/tmp/tmpdir/file\")\n\n    def test_tmppath_username(self):\n        self.assertRegexpMatches(hdfs.tmppath(\"/path/to/stuff\", include_unix_username=True), \"^/tmp/[a-z0-9_]+/path/to/stuff-luigitemp-\\\\d+\")\n\n    def test_pickle(self):\n        t = hdfs.HdfsTarget(\"/tmp/dir\")\n        pickle.dumps(t)\n\n    def test_flag_target(self):\n        target = hdfs.HdfsFlagTarget(\"/some/dir/\", format=format)\n        if target.exists():\n            target.remove(skip_trash=True)\n        self.assertFalse(target.exists())\n\n        t1 = hdfs.HdfsTarget(target.path + \"part-00000\", format=format)\n        with t1.open(\"w\"):\n            pass\n        t2 = hdfs.HdfsTarget(target.path + \"_SUCCESS\", format=format)\n        with t2.open(\"w\"):\n            pass\n        self.assertTrue(target.exists())\n\n    def test_flag_target_fails_if_not_directory(self):\n        with self.assertRaises(ValueError):\n            hdfs.HdfsFlagTarget(\"/home/file.txt\")\n\n\nclass _MiscOperationsMixin:\n    # TODO: chown/chmod/count should really be methods on HdfsTarget rather than the client!\n\n    def get_target(self):\n        fn = \"/tmp/foo-%09d\" % random.randint(0, 999999999)\n        t = luigi.contrib.hdfs.HdfsTarget(fn)\n        with t.open(\"w\") as f:\n            f.write(\"test\")\n        return t\n\n    def test_count(self):\n        t = self.get_target()\n        res = self.get_client().count(t.path)\n        for key in [\"content_size\", \"dir_count\", \"file_count\"]:\n            self.assertTrue(key in res)\n\n    def test_chmod(self):\n        t = self.get_target()\n        self.get_client().chmod(t.path, \"777\")\n\n    def test_chown(self):\n        t = self.get_target()\n        self.get_client().chown(t.path, \"root\", \"root\")\n"
  },
  {
    "path": "test/contrib/hive_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\nimport sys\nimport tempfile\nfrom collections import OrderedDict\n\nimport mock\nimport pytest\nfrom helpers import unittest\n\nimport luigi.contrib.hive\nfrom luigi import LocalTarget\n\n\n@pytest.mark.apache\nclass HiveTest(unittest.TestCase):\n    count = 0\n\n    def mock_hive_cmd(self, args, check_return=True):\n        self.last_hive_cmd = args\n        self.count += 1\n        return \"statement{}\".format(self.count)\n\n    def setUp(self):\n        self.run_hive_cmd_saved = luigi.contrib.hive.run_hive\n        luigi.contrib.hive.run_hive = self.mock_hive_cmd\n\n    def tearDown(self):\n        luigi.contrib.hive.run_hive = self.run_hive_cmd_saved\n\n    def test_run_hive_command(self):\n        pre_count = self.count\n        res = luigi.contrib.hive.run_hive_cmd(\"foo\")\n        self.assertEqual([\"-e\", \"foo\"], self.last_hive_cmd)\n        self.assertEqual(\"statement{0}\".format(pre_count + 1), res)\n\n    def test_run_hive_script_not_exists(self):\n        def test():\n            luigi.contrib.hive.run_hive_script(\"/tmp/some-non-existant-file______\")\n\n        self.assertRaises(RuntimeError, test)\n\n    def test_run_hive_script_exists(self):\n        with tempfile.NamedTemporaryFile(delete=True) as f:\n            pre_count = self.count\n            res = luigi.contrib.hive.run_hive_script(f.name)\n       
     self.assertEqual([\"-f\", f.name], self.last_hive_cmd)\n            self.assertEqual(\"statement{0}\".format(pre_count + 1), res)\n\n    def test_create_parent_dirs(self):\n        dirname = \"/tmp/hive_task_test_dir\"\n\n        class FooHiveTask:\n            def output(self):\n                return LocalTarget(os.path.join(dirname, \"foo\"))\n\n        runner = luigi.contrib.hive.HiveQueryRunner()\n        runner.prepare_outputs(FooHiveTask())\n        self.assertTrue(os.path.exists(dirname))\n\n\n@pytest.mark.apache\nclass HiveCommandClientTest(unittest.TestCase):\n    \"\"\"Note that some of these tests are really for the CDH releases of Hive, to which I do not currently have access.\n    Hopefully there are no significant differences in the expected output\"\"\"\n\n    def setUp(self):\n        self.client = luigi.contrib.hive.HiveCommandClient()\n        self.apacheclient = luigi.contrib.hive.ApacheHiveCommandClient()\n        self.metastoreclient = luigi.contrib.hive.MetastoreClient()\n\n    @mock.patch(\"luigi.contrib.hive.run_hive_cmd\")\n    def test_default_table_location(self, run_command):\n        run_command.return_value = (\n            \"Protect Mode:       \tNone                \t \\n\"\n            \"Retention:          \t0                   \t \\n\"\n            \"Location:           \thdfs://localhost:9000/user/hive/warehouse/mytable\t \\n\"\n            \"Table Type:         \tMANAGED_TABLE       \t \\n\"\n        )\n\n        returned = self.client.table_location(\"mytable\")\n        self.assertEqual(\"hdfs://localhost:9000/user/hive/warehouse/mytable\", returned)\n\n    @mock.patch(\"luigi.contrib.hive.run_hive_cmd\")\n    def test_table_exists(self, run_command):\n        run_command.return_value = \"OK\"\n        returned = self.client.table_exists(\"mytable\")\n        self.assertFalse(returned)\n\n        run_command.return_value = \"OK\\nmytable\"\n        returned = self.client.table_exists(\"mytable\")\n        
self.assertTrue(returned)\n\n        # Issue #896 test case insensitivity\n        returned = self.client.table_exists(\"MyTable\")\n        self.assertTrue(returned)\n\n        run_command.return_value = \"day=2013-06-28/hour=3\\nday=2013-06-28/hour=4\\nday=2013-07-07/hour=2\\n\"\n        self.client.partition_spec = mock.Mock(name=\"partition_spec\")\n        self.client.partition_spec.return_value = \"somepart\"\n        returned = self.client.table_exists(\"mytable\", partition={\"a\": \"b\"})\n        self.assertTrue(returned)\n\n        run_command.return_value = \"\"\n        returned = self.client.table_exists(\"mytable\", partition={\"a\": \"b\"})\n        self.assertFalse(returned)\n\n    @mock.patch(\"luigi.contrib.hive.run_hive_cmd\")\n    def test_table_schema(self, run_command):\n        run_command.return_value = \"FAILED: SemanticException [Error 10001]: blah does not exist\\nSome other stuff\"\n        returned = self.client.table_schema(\"mytable\")\n        self.assertFalse(returned)\n\n        run_command.return_value = (\n            \"OK\\n\"\n            \"col1       \tstring              \tNone                \\n\"\n            \"col2            \tstring              \tNone                \\n\"\n            \"col3         \tstring              \tNone                \\n\"\n            \"day                 \tstring              \tNone                \\n\"\n            \"hour                \tsmallint            \tNone                \\n\\n\"\n            \"# Partition Information\t \t \\n\"\n            \"# col_name            \tdata_type           \tcomment             \\n\\n\"\n            \"day                 \tstring              \tNone                \\n\"\n            \"hour                \tsmallint            \tNone                \\n\"\n            \"Time taken: 2.08 seconds, Fetched: 34 row(s)\\n\"\n        )\n        expected = [\n            (\"OK\",),\n            (\"col1\", \"string\", \"None\"),\n            (\"col2\", 
\"string\", \"None\"),\n            (\"col3\", \"string\", \"None\"),\n            (\"day\", \"string\", \"None\"),\n            (\"hour\", \"smallint\", \"None\"),\n            (\"\",),\n            (\"# Partition Information\",),\n            (\"# col_name\", \"data_type\", \"comment\"),\n            (\"\",),\n            (\"day\", \"string\", \"None\"),\n            (\"hour\", \"smallint\", \"None\"),\n            (\"Time taken: 2.08 seconds, Fetched: 34 row(s)\",),\n        ]\n        returned = self.client.table_schema(\"mytable\")\n        self.assertEqual(expected, returned)\n\n    def test_partition_spec(self):\n        returned = self.client.partition_spec({\"a\": \"b\", \"c\": \"d\"})\n        self.assertEqual(\"`a`='b',`c`='d'\", returned)\n\n    @mock.patch(\"luigi.contrib.hive.run_hive_cmd\")\n    def test_apacheclient_table_exists(self, run_command):\n        run_command.return_value = \"OK\"\n        returned = self.apacheclient.table_exists(\"mytable\")\n        self.assertFalse(returned)\n\n        run_command.return_value = \"OK\\nmytable\"\n        returned = self.apacheclient.table_exists(\"mytable\")\n        self.assertTrue(returned)\n\n        # Issue #896 test case insensitivity\n        returned = self.apacheclient.table_exists(\"MyTable\")\n        self.assertTrue(returned)\n\n        run_command.return_value = \"day=2013-06-28/hour=3\\nday=2013-06-28/hour=4\\nday=2013-07-07/hour=2\\n\"\n        self.apacheclient.partition_spec = mock.Mock(name=\"partition_spec\")\n        self.apacheclient.partition_spec.return_value = \"somepart\"\n        returned = self.apacheclient.table_exists(\"mytable\", partition={\"a\": \"b\"})\n        self.assertTrue(returned)\n\n        run_command.return_value = \"\"\n        returned = self.apacheclient.table_exists(\"mytable\", partition={\"a\": \"b\"})\n        self.assertFalse(returned)\n\n    @mock.patch(\"luigi.contrib.hive.run_hive_cmd\")\n    def test_apacheclient_table_schema(self, run_command):\n    
    run_command.return_value = \"FAILED: SemanticException [Error 10001]: Table not found mytable\\nSome other stuff\"\n        returned = self.apacheclient.table_schema(\"mytable\")\n        self.assertFalse(returned)\n\n        run_command.return_value = (\n            \"OK\\n\"\n            \"col1       \tstring              \tNone                \\n\"\n            \"col2            \tstring              \tNone                \\n\"\n            \"col3         \tstring              \tNone                \\n\"\n            \"day                 \tstring              \tNone                \\n\"\n            \"hour                \tsmallint            \tNone                \\n\\n\"\n            \"# Partition Information\t \t \\n\"\n            \"# col_name            \tdata_type           \tcomment             \\n\\n\"\n            \"day                 \tstring              \tNone                \\n\"\n            \"hour                \tsmallint            \tNone                \\n\"\n            \"Time taken: 2.08 seconds, Fetched: 34 row(s)\\n\"\n        )\n        expected = [\n            (\"OK\",),\n            (\"col1\", \"string\", \"None\"),\n            (\"col2\", \"string\", \"None\"),\n            (\"col3\", \"string\", \"None\"),\n            (\"day\", \"string\", \"None\"),\n            (\"hour\", \"smallint\", \"None\"),\n            (\"\",),\n            (\"# Partition Information\",),\n            (\"# col_name\", \"data_type\", \"comment\"),\n            (\"\",),\n            (\"day\", \"string\", \"None\"),\n            (\"hour\", \"smallint\", \"None\"),\n            (\"Time taken: 2.08 seconds, Fetched: 34 row(s)\",),\n        ]\n        returned = self.apacheclient.table_schema(\"mytable\")\n        self.assertEqual(expected, returned)\n\n    @mock.patch(\"luigi.contrib.hive.HiveThriftContext\")\n    def test_metastoreclient_partition_existence_regardless_of_order(self, thrift_context):\n        thrift_context.return_value = thrift_context\n   
     client_mock = mock.Mock(name=\"clientmock\")\n        client_mock.return_value = client_mock\n        thrift_context.__enter__ = client_mock\n        client_mock.get_partition_names = mock.Mock(return_value=[\"p1=x/p2=y\", \"p1=a/p2=b\"])\n\n        partition_spec = OrderedDict([(\"p1\", \"a\"), (\"p2\", \"b\")])\n        self.assertTrue(self.metastoreclient.table_exists(\"table\", \"default\", partition_spec))\n\n        partition_spec = OrderedDict([(\"p2\", \"b\"), (\"p1\", \"a\")])\n        self.assertTrue(self.metastoreclient.table_exists(\"table\", \"default\", partition_spec))\n\n    def test_metastore_partition_spec_has_the_same_order(self):\n        partition_spec = OrderedDict([(\"p1\", \"a\"), (\"p2\", \"b\")])\n        spec_string = luigi.contrib.hive.MetastoreClient().partition_spec(partition_spec)\n        self.assertEqual(spec_string, \"p1=a/p2=b\")\n\n        partition_spec = OrderedDict([(\"p2\", \"b\"), (\"p1\", \"a\")])\n        spec_string = luigi.contrib.hive.MetastoreClient().partition_spec(partition_spec)\n        self.assertEqual(spec_string, \"p1=a/p2=b\")\n\n    @mock.patch(\"luigi.configuration\")\n    def test_client_def(self, hive_syntax):\n        hive_syntax.get_config.return_value.get.return_value = \"cdh4\"\n        client = luigi.contrib.hive.get_default_client()\n        self.assertEqual(luigi.contrib.hive.HiveCommandClient, type(client))\n\n        hive_syntax.get_config.return_value.get.return_value = \"cdh3\"\n        client = luigi.contrib.hive.get_default_client()\n        self.assertEqual(luigi.contrib.hive.HiveCommandClient, type(client))\n\n        hive_syntax.get_config.return_value.get.return_value = \"apache\"\n        client = luigi.contrib.hive.get_default_client()\n        self.assertEqual(luigi.contrib.hive.ApacheHiveCommandClient, type(client))\n\n        hive_syntax.get_config.return_value.get.return_value = \"metastore\"\n        client = luigi.contrib.hive.get_default_client()\n        
self.assertEqual(luigi.contrib.hive.MetastoreClient, type(client))\n\n        hive_syntax.get_config.return_value.get.return_value = \"warehouse\"\n        client = luigi.contrib.hive.get_default_client()\n        self.assertEqual(luigi.contrib.hive.WarehouseHiveClient, type(client))\n\n    @mock.patch(\"subprocess.Popen\")\n    def test_run_hive_command(self, popen):\n        # I'm testing this again to check the return codes\n        # I didn't want to tear up all the existing tests to change how run_hive is mocked\n        comm = mock.Mock(name=\"communicate_mock\")\n        comm.return_value = b\"some return stuff\", \"\"\n\n        preturn = mock.Mock(name=\"open_mock\")\n        preturn.returncode = 0\n        preturn.communicate = comm\n        popen.return_value = preturn\n\n        returned = luigi.contrib.hive.run_hive([\"blah\", \"blah\"])\n        self.assertEqual(\"some return stuff\", returned)\n\n        preturn.returncode = 17\n        self.assertRaises(luigi.contrib.hive.HiveCommandError, luigi.contrib.hive.run_hive, [\"blah\", \"blah\"])\n\n        comm.return_value = b\"\", \"some stderr stuff\"\n        returned = luigi.contrib.hive.run_hive([\"blah\", \"blah\"], False)\n        self.assertEqual(\"\", returned)\n\n\nclass WarehouseHiveClientTest(unittest.TestCase):\n    def test_table_exists_files_actually_exist(self):\n        # arrange\n        hdfs_client = mock.Mock(name=\"hdfs_client\")\n        hdfs_client.exists.return_value = True\n        hdfs_client.listdir.return_value = [\"00000_0\", \"00000_1\", \"00000_2\", \".tmp/\"]\n\n        warehouse_hive_client = luigi.contrib.hive.WarehouseHiveClient(hdfs_client=hdfs_client, warehouse_location=\"/apps/hive/warehouse\")\n\n        # act\n        exists = warehouse_hive_client.table_exists(database=\"some_db\", table=\"table_name\", partition=OrderedDict(a=1, b=2))\n\n        # assert\n        assert exists\n        
hdfs_client.exists.assert_called_once_with(\"/apps/hive/warehouse/some_db.db/table_name/a=1/b=2\")\n\n    @mock.patch(\"luigi.configuration\")\n    def test_table_exists_without_partition_spec_files_actually_exist(self, warehouse_location):\n        # arrange\n        warehouse_location.get_config.return_value.get.return_value = \"/apps/hive/warehouse\"\n        hdfs_client = mock.Mock(name=\"hdfs_client\")\n        hdfs_client.exists.return_value = True\n        hdfs_client.listdir.return_value = [\"00000_0\", \"00000_1\", \"00000_2\", \".tmp/\"]\n\n        warehouse_hive_client = luigi.contrib.hive.WarehouseHiveClient(\n            hdfs_client=hdfs_client,\n        )\n\n        # act\n        exists = warehouse_hive_client.table_exists(\n            database=\"some_db\",\n            table=\"table_name\",\n        )\n\n        # assert\n        assert exists\n        hdfs_client.exists.assert_called_once_with(\"/apps/hive/warehouse/some_db.db/table_name/\")\n        hdfs_client.listdir.assert_called_once_with(\"/apps/hive/warehouse/some_db.db/table_name/\")\n\n    @mock.patch(\"luigi.configuration\")\n    def test_table_exists_only_tmp_files_exist(self, ignored_file_masks):\n        # arrange\n        ignored_file_masks.get_config.return_value.get.return_value = r\"(\\.tmp.*)\"\n        hdfs_client = mock.Mock(name=\"hdfs_client\")\n        hdfs_client.exists.return_value = True\n        hdfs_client.listdir.return_value = [\".tmp/\"]\n\n        warehouse_hive_client = luigi.contrib.hive.WarehouseHiveClient(hdfs_client=hdfs_client, warehouse_location=\"/apps/hive/warehouse\")\n\n        # act\n        exists = warehouse_hive_client.table_exists(database=\"some_db\", table=\"table_name\", partition={\"a\": 1})\n\n        # assert\n        assert not exists\n        hdfs_client.exists.assert_called_once_with(\"/apps/hive/warehouse/some_db.db/table_name/a=1\")\n        hdfs_client.listdir.assert_called_once_with(\"/apps/hive/warehouse/some_db.db/table_name/a=1\")\n\n 
   @mock.patch(\"luigi.configuration\")\n    def test_table_exists_ambiguous_partition(self, ignored_file_masks):\n        # arrange\n        ignored_file_masks.get_config.return_value.get.return_value = r\"(\\.tmp.*)\"\n        hdfs_client = mock.Mock(name=\"hdfs_client\")\n        hdfs_client.exists.return_value = True\n        hdfs_client.listdir.return_value = [\".tmp/\"]\n        warehouse_hive_client = luigi.contrib.hive.WarehouseHiveClient(hdfs_client=hdfs_client, warehouse_location=\"/apps/hive/warehouse\")\n\n        def _call_exists():\n            return warehouse_hive_client.table_exists(database=\"some_db\", table=\"table_name\", partition={\"a\": 1, \"b\": 2})\n\n        # act & assert\n        if sys.version_info >= (3, 7):\n            exists = _call_exists()\n            assert not exists\n            hdfs_client.exists.assert_called_once_with(\"/apps/hive/warehouse/some_db.db/table_name/a=1/b=2\")\n            hdfs_client.listdir.assert_called_once_with(\"/apps/hive/warehouse/some_db.db/table_name/a=1/b=2\")\n        else:\n            self.assertRaises(ValueError, _call_exists)\n\n\nclass MyHiveTask(luigi.contrib.hive.HiveQueryTask):\n    param = luigi.Parameter()\n\n    def query(self):\n        return \"banana banana %s\" % self.param\n\n\n@pytest.mark.apache\nclass TestHiveTask(unittest.TestCase):\n    task_class = MyHiveTask\n\n    @mock.patch(\"luigi.contrib.hadoop.run_and_track_hadoop_job\")\n    def test_run(self, run_and_track_hadoop_job):\n        success = luigi.run([self.task_class.__name__, \"--param\", \"foo\", \"--local-scheduler\", \"--no-lock\"])\n        self.assertTrue(success)\n        self.assertEqual(\"hive\", run_and_track_hadoop_job.call_args[0][0][0])\n\n\nclass MyHiveTaskArgs(MyHiveTask):\n    def hivevars(self):\n        return {\"my_variable1\": \"value1\", \"my_variable2\": \"value2\"}\n\n    def hiveconfs(self):\n        return {\"hive.additional.conf\": \"conf_value\"}\n\n\nclass TestHiveTaskArgs(TestHiveTask):\n    
task_class = MyHiveTaskArgs\n\n    def test_arglist(self):\n        task = self.task_class(param=\"foo\")\n        f_name = \"my_file\"\n        runner = luigi.contrib.hive.HiveQueryRunner()\n        arglist = runner.get_arglist(f_name, task)\n\n        f_idx = arglist.index(\"-f\")\n        self.assertEqual(arglist[f_idx + 1], f_name)\n\n        hivevars = [\"{}={}\".format(k, v) for k, v in task.hivevars().items()]\n        for var in hivevars:\n            idx = arglist.index(var)\n            self.assertEqual(arglist[idx - 1], \"--hivevar\")\n\n        hiveconfs = [\"{}={}\".format(k, v) for k, v in task.hiveconfs().items()]\n        for conf in hiveconfs:\n            idx = arglist.index(conf)\n            self.assertEqual(arglist[idx - 1], \"--hiveconf\")\n\n\n@pytest.mark.apache\nclass TestHiveTarget(unittest.TestCase):\n    def test_hive_table_target(self):\n        client = mock.Mock()\n        target = luigi.contrib.hive.HiveTableTarget(database=\"db\", table=\"foo\", client=client)\n        target.exists()\n        client.table_exists.assert_called_with(\"foo\", \"db\", None)\n\n    def test_hive_partition_target(self):\n        client = mock.Mock()\n        target = luigi.contrib.hive.HivePartitionTarget(database=\"db\", table=\"foo\", partition=\"bar\", client=client)\n        target.exists()\n        client.table_exists.assert_called_with(\"foo\", \"db\", \"bar\")\n\n\nclass ExternalHiveTaskTest(unittest.TestCase):\n    def test_table(self):\n        # arrange\n        class _Task(luigi.contrib.hive.ExternalHiveTask):\n            database = \"schema1\"\n            table = \"table1\"\n\n        # act\n        output = _Task().output()\n\n        # assert\n        assert isinstance(output, luigi.contrib.hive.HivePartitionTarget)\n        assert output.database == \"schema1\"\n        assert output.table == \"table1\"\n        assert output.partition == {}\n\n    def test_partition_exists(self):\n        # arrange\n        class 
_Task(luigi.contrib.hive.ExternalHiveTask):\n            database = \"schema2\"\n            table = \"table2\"\n            partition = {\"a\": 1}\n\n        # act\n        output = _Task().output()\n\n        # assert\n        assert isinstance(output, luigi.contrib.hive.HivePartitionTarget)\n        assert output.database == \"schema2\"\n        assert output.table == \"table2\"\n        assert output.partition == {\"a\": 1}\n"
  },
  {
    "path": "test/contrib/kubernetes_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015 Outlier Bio, LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nTests for the Kubernetes Job wrapper.\n\nRequires:\n\n- pykube: ``pip install pykube-ng``\n- A local minikube cluster up and running: http://kubernetes.io/docs/getting-started-guides/minikube/\n\n**WARNING**: For Python versions < 3.5 the kubeconfig file must point to a Kubernetes API\nhostname, and NOT to an IP address.\n\nWritten and maintained by Marco Capuccini (@mcapuccini).\n\"\"\"\n\nimport logging\nimport unittest\n\nimport mock\nimport pytest\n\nimport luigi\nfrom luigi.contrib.kubernetes import KubernetesJobTask\n\nlogger = logging.getLogger(\"luigi-interface\")\n\ntry:\n    from pykube.config import KubeConfig\n    from pykube.http import HTTPClient\n    from pykube.objects import Job\nexcept ImportError:\n    raise unittest.SkipTest(\"pykube is not installed. 
This test requires pykube.\")\n\n\nclass SuccessJob(KubernetesJobTask):\n    name = \"success\"\n    spec_schema = {\"containers\": [{\"name\": \"hello\", \"image\": \"alpine:3.4\", \"command\": [\"echo\", \"Hello World!\"]}]}\n\n\nclass FailJob(KubernetesJobTask):\n    name = \"fail\"\n    max_retrials = 3\n    backoff_limit = 3\n    spec_schema = {\"containers\": [{\"name\": \"fail\", \"image\": \"alpine:3.4\", \"command\": [\"You\", \"Shall\", \"Not\", \"Pass\"]}]}\n\n    @property\n    def labels(self):\n        return {\"dummy_label\": \"dummy_value\"}\n\n\n@pytest.mark.contrib\nclass TestK8STask(unittest.TestCase):\n    def test_success_job(self):\n        success = luigi.run([\"SuccessJob\", \"--local-scheduler\"])\n        self.assertTrue(success)\n\n    def test_fail_job(self):\n        fail = FailJob()\n        self.assertRaises(RuntimeError, fail.run)\n        # Check for retrials\n        kube_api = HTTPClient(KubeConfig.from_file(\"~/.kube/config\"))  # assumes minikube\n        jobs = Job.objects(kube_api).filter(selector=\"luigi_task_id=\" + fail.job_uuid)\n        self.assertEqual(len(jobs.response[\"items\"]), 1)\n        job = Job(kube_api, jobs.response[\"items\"][0])\n        self.assertTrue(\"failed\" in job.obj[\"status\"])\n        self.assertTrue(job.obj[\"status\"][\"failed\"] > fail.max_retrials)\n        self.assertTrue(job.obj[\"spec\"][\"template\"][\"metadata\"][\"labels\"] == fail.labels())\n\n    @mock.patch.object(KubernetesJobTask, \"_KubernetesJobTask__get_job_status\")\n    @mock.patch.object(KubernetesJobTask, \"signal_complete\")\n    def test_output(self, mock_signal, mock_job_status):\n        # mock that the job succeeded\n        mock_job_status.return_value = \"succeeded\"\n        # create a kubernetes job\n        kubernetes_job = KubernetesJobTask()\n        # set logger and uu_name due to logging in __track_job()\n        kubernetes_job._KubernetesJobTask__logger = logger\n        kubernetes_job.uu_name = \"test\"\n    
    # track the job (bc included in run method)\n        kubernetes_job._KubernetesJobTask__track_job()\n        # Make sure successful job signals\n        self.assertTrue(mock_signal.called)\n"
  },
  {
    "path": "test/contrib/lsf_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nLSF Unit Test\n=============\n\nTest runner for the LSF wrapper. The test is based on the one used for the SGE\nwrappers\n\"\"\"\n\nimport logging\nimport os\nimport os.path\nimport subprocess\nimport unittest\nfrom glob import glob\n\nimport pytest\nfrom mock import patch\n\nimport luigi\nfrom luigi.contrib.lsf import LSFJobTask\n\nDEFAULT_HOME = \"\"\n\nLOGGER = logging.getLogger(\"luigi-interface\")\n\n\n# BJOBS_OUTPUT = \"\"\"JOBID   USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME\n# 1000001 mcdowal RUN   production sub-node-002 node4-123  /bin/bash  Mar 14 10:10\n# 1000002 mcdowal PEND  production sub-node-002 node5-269  /bin/bash  Mar 14 10:10\n# 1000003 mcdowal EXIT  production sub-node-002            /bin/bash  Mar 14 10:10\n# \"\"\"\n\n\ndef on_lsf_master():\n    try:\n        subprocess.check_call(\"bjobs\", shell=True)\n        return True\n    except subprocess.CalledProcessError:\n        return False\n\n\nclass TestJobTask(LSFJobTask):\n    \"\"\"Simple LSF job: writes a test file to its output path\"\"\"\n\n    i = luigi.Parameter()\n\n    def work(self):\n        LOGGER.info(\"Running test job...\")\n        with open(self.output().path, \"w\") as f:\n            f.write(\"this is a test\\n\")\n\n    def output(self):\n        return 
luigi.LocalTarget(os.path.join(DEFAULT_HOME, \"test_lsf_file_\" + str(self.i)))\n\n\n@pytest.mark.contrib\nclass TestSGEJob(unittest.TestCase):\n    \"\"\"Test from LSF master node\"\"\"\n\n    @patch(\"subprocess.Popen\")\n    @patch(\"subprocess.Popen.communicate\")\n    def test_run_job(self, mock_open, mock_communicate):\n        if on_lsf_master():\n            outfile = os.path.join(DEFAULT_HOME, \"testfile_1\")\n            tasks = [TestJobTask(i=str(i), n_cpu_flag=1) for i in range(3)]\n            luigi.build(tasks, local_scheduler=True, workers=3)\n            self.assertTrue(os.path.exists(outfile))\n\n    @patch(\"subprocess.Popen\")\n    @patch(\"subprocess.Popen.communicate\")\n    def test_run_job_with_dump(self, mock_open, mock_communicate):\n        mock_open.side_effect = [\"Job <1000001> is submitted to queue <queue-name>.\", \"\"]\n        task = TestJobTask(i=str(1), n_cpu_flag=1, shared_tmp_dir=\"/tmp\")\n        luigi.build([task], local_scheduler=True)\n        self.assertEqual(mock_open.call_count, 0)\n\n    def tearDown(self):\n        for fpath in glob(os.path.join(DEFAULT_HOME, \"test_lsf_file_*\")):\n            try:\n                os.remove(fpath)\n            except OSError:\n                pass\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "test/contrib/mongo_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2017 Big Datext Inc\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\nimport pytest\nfrom helpers import unittest\n\nfrom luigi.contrib.mongodb import MongoCellTarget, MongoRangeTarget\n\nHOST = \"localhost\"\nPORT = 27017\nINDEX = \"luigi_test\"\nCOLLECTION = \"luigi_collection\"\n\ntry:\n    import pymongo\n\n    mongo_client = pymongo.MongoClient(HOST, PORT)\n    mongo_client.server_info()\nexcept ImportError:\n    raise unittest.SkipTest(\"Unable to load pymongo module\")\nexcept Exception:\n    raise unittest.SkipTest(\"Unable to connect to local mongoDB instance\")\n\n\n@pytest.mark.contrib\nclass MongoCellTargetTest(unittest.TestCase):\n    \"\"\"MongoCellTarget unittest on local test database\"\"\"\n\n    def setUp(self):\n        \"\"\"\n        Fill test database with fake data\n        \"\"\"\n        self.mongo_client = pymongo.MongoClient(HOST, PORT)\n        self.collection = self.mongo_client[INDEX][COLLECTION]\n\n        self.collection.delete_many({})\n\n        test_docs = [\n            {\"_id\": \"person_1\", \"name\": \"Mike\", \"infos\": {\"family\": \"single\"}},\n            {\"_id\": \"person_2\", \"name\": \"Laura\", \"surname\": \"Gilmore\"},\n            {\"_id\": \"person_3\", \"surname\": \"Specter\"},\n            {\"_id\": \"person_4\", \"surname\": \"\", \"infos\": {\"family\": {\"children\": [\"jack\", \"rose\"]}}},\n        ]\n\n        
self.collection.insert_many(test_docs)\n\n    def tearDown(self):\n        \"\"\"\n        Make sure the test database is in clean state\n        \"\"\"\n        self.collection.drop()\n        self.mongo_client.drop_database(INDEX)\n\n    def test_exists(self):\n        test_values = [\n            (\"person_1\", \"surname\", False),\n            (\"person_2\", \"surname\", True),\n            (\"person_3\", \"surname\", True),\n            (\"unknow_person\", \"surname\", False),\n        ]\n\n        for id_, field, result in test_values:\n            target = MongoCellTarget(self.mongo_client, INDEX, COLLECTION, id_, field)\n            self.assertEqual(result, target.exists())\n\n    def test_exists_nested(self):\n        test_values = [\n            (\"person_1\", \"infos\", True),\n            (\"person_1\", \"infos.family\", True),\n            (\"person_2\", \"family\", False),\n            (\"person_4\", \"infos\", True),\n            (\"person_4\", \"infos.family\", True),\n            (\"person_4\", \"infos.sexe\", False),\n            (\"person_4\", \"infos.family.children\", True),\n            (\"person_4\", \"infos.family.aunt\", False),\n        ]\n\n        for id_, path, result in test_values:\n            target = MongoCellTarget(self.mongo_client, INDEX, COLLECTION, id_, path)\n            self.assertEqual(result, target.exists())\n\n    def test_read(self):\n        test_values = [\n            (\"person_1\", \"surname\", None),\n            (\"person_2\", \"surname\", \"Gilmore\"),\n            (\"person_3\", \"surname\", \"Specter\"),\n            (\"person_4\", \"surname\", \"\"),\n            (\"unknown_person\", \"surname\", None),\n        ]\n\n        for id_, field, result in test_values:\n            target = MongoCellTarget(self.mongo_client, INDEX, COLLECTION, id_, field)\n            self.assertEqual(result, target.read())\n\n    def test_read_nested(self):\n        test_values = [\n            (\"person_1\", \"infos\", 
{\"family\": \"single\"}),\n            (\"person_1\", \"infos.family\", \"single\"),\n            (\"person_2\", \"family\", None),\n            (\"person_4\", \"infos\", {\"family\": {\"children\": [\"jack\", \"rose\"]}}),\n            (\"person_4\", \"infos.family\", {\"children\": [\"jack\", \"rose\"]}),\n            (\"person_4\", \"infos.sexe\", None),\n            (\"person_4\", \"infos.family.children\", [\"jack\", \"rose\"]),\n        ]\n\n        for id_, path, result in test_values:\n            target = MongoCellTarget(self.mongo_client, INDEX, COLLECTION, id_, path)\n            self.assertEqual(result, target.read())\n\n    def test_write(self):\n        ids = [\"person_1\", \"person_2\", \"person_3\", \"person_4\", \"unknow_person\"]\n\n        for id_ in ids:\n            self.setUp()\n            target = MongoCellTarget(self.mongo_client, INDEX, COLLECTION, id_, \"age\")\n            target.write(\"100\")\n            self.assertEqual(target.read(), \"100\")\n\n    def test_write_nested(self):\n        test_values = [\n            (\"person_1\", \"infos\", 12),\n            (\"person_1\", \"infos.family\", [\"ambre\", \"justin\", \"sophia\"]),\n            (\"person_2\", \"hobbies\", {\"soccer\": True}),\n            (\"person_3\", \"infos\", {\"age\": \"100\"}),\n            (\"person_3\", \"infos.hobbies\", {\"soccer\": True}),\n            (\"person_3\", \"infos.hobbies.soccer\", [{\"status\": \"young\"}, \"strong\", \"fast\"]),\n        ]\n\n        for id_, path, new_value in test_values:\n            self.setUp()\n            target = MongoCellTarget(self.mongo_client, INDEX, COLLECTION, id_, path)\n            target.write(new_value)\n            self.assertEqual(target.read(), new_value)\n            self.tearDown()\n\n\n@pytest.mark.contrib\nclass MongoRangerTargetTest(unittest.TestCase):\n    \"\"\"MongoRangeTarget unittest on local test database\"\"\"\n\n    def setUp(self):\n        \"\"\"\n        Fill test database with fake data\n  
      \"\"\"\n        self.mongo_client = pymongo.MongoClient(HOST, PORT)\n        self.collection = self.mongo_client[INDEX][COLLECTION]\n\n        self.collection.delete_many({})\n\n        test_docs = [\n            {\"_id\": \"person_1\", \"age\": 11, \"experience\": 10, \"content\": \"Lorem ipsum, dolor sit amet. Consectetur adipiscing elit.\"},\n            {\"_id\": \"person_2\", \"age\": 12, \"experience\": 22, \"content\": \"Sed purus nisl. Faucibus in, erat eu. Rhoncus mattis velit.\"},\n            {\"_id\": \"person_3\", \"age\": 13, \"content\": \"Nulla malesuada, fringilla lorem at pellentesque.\"},\n            {\"_id\": \"person_4\", \"age\": 14, \"content\": \"Curabitur condimentum. Venenatis fringilla.\"},\n        ]\n\n        self.collection.insert_many(test_docs)\n\n    def tearDown(self):\n        \"\"\"\n        Make sure the test database is in clean state\n        \"\"\"\n        self.collection.drop()\n        self.mongo_client.drop_database(INDEX)\n\n    def test_exists(self):\n        test_values = [\n            (\"age\", [], True),\n            (\"age\", [\"person_1\", \"person_2\", \"person_3\"], True),\n            (\"experience\", [\"person_1\", \"person_2\", \"person_3\", \"person_4\"], False),\n            (\"experience\", [\"person_1\", \"person_2\"], True),\n            (\"unknow_field\", [\"person_1\", \"person_2\"], False),\n            (\"experience\", [\"unknow_person\"], False),\n            (\"experience\", [\"person_1\", \"unknown_person\"], False),\n            (\"experience\", [\"person_3\", \"unknown_person\"], False),\n        ]\n\n        for field, ids, result in test_values:\n            target = MongoRangeTarget(self.mongo_client, INDEX, COLLECTION, ids, field)\n            self.assertEqual(result, target.exists())\n\n    def test_read(self):\n        test_values = [\n            (\"age\", [], {}),\n            (\"age\", [\"unknown_person\"], {}),\n            (\"age\", [\"person_1\", \"person_3\"], {\"person_1\": 
11, \"person_3\": 13}),\n            (\"age\", [\"person_1\", \"person_3\", \"person_5\"], {\"person_1\": 11, \"person_3\": 13}),\n            (\"experience\", [\"person_1\", \"person_3\"], {\"person_1\": 10}),\n            (\"experience\", [\"person_1\", \"person_3\", \"person_5\"], {\"person_1\": 10}),\n        ]\n\n        for field, ids, result in test_values:\n            target = MongoRangeTarget(self.mongo_client, INDEX, COLLECTION, ids, field)\n            self.assertEqual(result, target.read())\n\n    def test_write(self):\n        test_values = [\n            (\n                \"age\",  # feature\n                [\"person_1\"],  # ids\n                {\"person_1\": 31},  # arg of write()\n                ({\"_id\": {\"$in\": [\"person_1\"]}}, {\"age\": True}),  # mongo request to fetch result\n                [{\"_id\": \"person_1\", \"age\": 31}],  # result\n            ),\n            (\n                \"experience\",\n                [\"person_1\", \"person_3\"],\n                {\"person_1\": 31, \"person_3\": 32},\n                ({\"_id\": {\"$in\": [\"person_1\", \"person_3\"]}}, {\"experience\": True}),\n                [{\"_id\": \"person_1\", \"experience\": 31}, {\"_id\": \"person_3\", \"experience\": 32}],\n            ),\n            (\n                \"experience\",\n                [],\n                {\"person_3\": 18},\n                ({\"_id\": {\"$in\": [\"person_1\", \"person_3\"]}}, {\"experience\": True}),\n                [{\"_id\": \"person_1\", \"experience\": 10}, {\"_id\": \"person_3\"}],\n            ),\n            (\n                \"age\",\n                [\"person_1\"],\n                {\"person_1\": [\"young\", \"old\"]},\n                ({\"_id\": \"person_1\"}, {\"age\": True}),\n                [{\"_id\": \"person_1\", \"age\": [\"young\", \"old\"]}],\n            ),\n            (\n                \"age\",\n                [\"person_1\"],\n                {\"person_1\": {\"feeling_like\": 60}},\n           
     ({\"_id\": \"person_1\"}, {\"age\": True}),\n                [{\"_id\": \"person_1\", \"age\": {\"feeling_like\": 60}}],\n            ),\n            (\n                \"age\",\n                [\"person_1\"],\n                {\"person_1\": [{\"feeling_like\": 60}, 24]},\n                ({\"_id\": \"person_1\"}, {\"age\": True}),\n                [{\"_id\": \"person_1\", \"age\": [{\"feeling_like\": 60}, 24]}],\n            ),\n        ]\n\n        for field, ids, docs, req, result in test_values:\n            self.setUp()\n            target = MongoRangeTarget(self.mongo_client, INDEX, COLLECTION, ids, field)\n            target.write(docs)\n            self.assertEqual(result, list(self.collection.find(*req)))\n            self.tearDown()\n"
  },
  {
    "path": "test/contrib/mysqldb_test.py",
    "content": "import datetime\n\nimport mock\nimport pytest\nfrom helpers import unittest\n\nimport luigi.contrib.mysqldb\nfrom luigi.tools.range import RangeDaily\n\n\ndef datetime_to_epoch(dt):\n    td = dt - datetime.datetime(1970, 1, 1)\n    return td.days * 86400 + td.seconds + td.microseconds / 1e6\n\n\nclass MockMysqlCursor(mock.Mock):\n    \"\"\"\n    Keeps state to simulate executing SELECT queries and fetching results.\n    \"\"\"\n\n    def __init__(self, existing_update_ids):\n        super(MockMysqlCursor, self).__init__()\n        self.existing = existing_update_ids\n\n    def execute(self, query, params):\n        if query.startswith(\"SELECT 1 FROM table_updates\"):\n            self.fetchone_result = (1,) if params[0] in self.existing else None\n        else:\n            self.fetchone_result = None\n\n    def fetchone(self):\n        return self.fetchone_result\n\n\nclass DummyMysqlImporter(luigi.contrib.mysqldb.CopyToTable):\n    date = luigi.DateParameter()\n\n    host = \"dummy_host\"\n    database = \"dummy_database\"\n    user = \"dummy_user\"\n    password = \"dummy_password\"\n    table = \"dummy_table\"\n    columns = (\n        (\"some_text\", \"text\"),\n        (\"some_int\", \"int\"),\n    )\n\n\n# Testing that an existing update will not be run in RangeDaily\n@pytest.mark.mysql\nclass DailyCopyToTableTest(unittest.TestCase):\n    @mock.patch(\"mysql.connector.connect\")\n    def test_bulk_complete(self, mock_connect):\n        mock_cursor = MockMysqlCursor(\n            [  # Existing update_ids\n                DummyMysqlImporter(date=datetime.datetime(2015, 1, 3)).task_id\n            ]\n        )\n        mock_connect.return_value.cursor.return_value = mock_cursor\n\n        task = RangeDaily(of=DummyMysqlImporter, start=datetime.date(2015, 1, 2), now=datetime_to_epoch(datetime.datetime(2015, 1, 7)))\n        actual = sorted([t.task_id for t in task.requires()])\n\n        self.assertEqual(\n            actual,\n            
sorted(\n                [\n                    DummyMysqlImporter(date=datetime.datetime(2015, 1, 2)).task_id,\n                    DummyMysqlImporter(date=datetime.datetime(2015, 1, 4)).task_id,\n                    DummyMysqlImporter(date=datetime.datetime(2015, 1, 5)).task_id,\n                    DummyMysqlImporter(date=datetime.datetime(2015, 1, 6)).task_id,\n                ]\n            ),\n        )\n        self.assertFalse(task.complete())\n\n\n@pytest.mark.mysql\nclass TestCopyToTableWithMetaColumns(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.mysqldb.CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\"luigi.contrib.mysqldb.CopyToTable._add_metadata_columns\")\n    @mock.patch(\"luigi.contrib.mysqldb.CopyToTable.post_copy_metacolumns\")\n    @mock.patch(\"luigi.contrib.mysqldb.CopyToTable.rows\", return_value=[\"row1\", \"row2\"])\n    @mock.patch(\"luigi.contrib.mysqldb.MySqlTarget\")\n    @mock.patch(\"mysql.connector.connect\")\n    def test_copy_with_metadata_columns_enabled(\n        self, mock_connect, mock_mysql_target, mock_rows, mock_add_columns, mock_update_columns, mock_metadata_columns_enabled\n    ):\n\n        task = DummyMysqlImporter(date=datetime.datetime(1991, 3, 24))\n\n        mock_cursor = MockMysqlCursor([task.task_id])\n        mock_connect.return_value.cursor.return_value = mock_cursor\n\n        task = DummyMysqlImporter(date=datetime.datetime(1991, 3, 24))\n        task.run()\n\n        self.assertTrue(mock_add_columns.called)\n        self.assertTrue(mock_update_columns.called)\n\n    @mock.patch(\"luigi.contrib.mysqldb.CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=False)\n    @mock.patch(\"luigi.contrib.mysqldb.CopyToTable._add_metadata_columns\")\n    @mock.patch(\"luigi.contrib.mysqldb.CopyToTable.post_copy_metacolumns\")\n    @mock.patch(\"luigi.contrib.mysqldb.CopyToTable.rows\", return_value=[\"row1\", \"row2\"])\n    
@mock.patch(\"luigi.contrib.mysqldb.MySqlTarget\")\n    @mock.patch(\"mysql.connector.connect\")\n    def test_copy_with_metadata_columns_disabled(\n        self, mock_connect, mock_mysql_target, mock_rows, mock_add_columns, mock_update_columns, mock_metadata_columns_enabled\n    ):\n\n        task = DummyMysqlImporter(date=datetime.datetime(1991, 3, 24))\n\n        mock_cursor = MockMysqlCursor([task.task_id])\n        mock_connect.return_value.cursor.return_value = mock_cursor\n\n        task.run()\n\n        self.assertFalse(mock_add_columns.called)\n        self.assertFalse(mock_update_columns.called)\n"
  },
  {
    "path": "test/contrib/opener_test.py",
    "content": "import random\nimport unittest\n\nimport mock\nimport pytest\n\nimport luigi\nfrom luigi.contrib.opener import NoOpenerError, OpenerTarget\nfrom luigi.local_target import LocalTarget\nfrom luigi.mock import MockTarget\n\n\n@pytest.mark.contrib\nclass TestOpenerTarget(unittest.TestCase):\n    def setUp(self):\n        MockTarget.fs.clear()\n\n        self.local_file = \"/tmp/{}/xyz/test.txt\".format(random.randint(0, 999999999))\n\n        if LocalTarget.fs.exists(self.local_file):\n            LocalTarget.fs.remove(self.local_file)\n\n    def tearDown(self):\n        if LocalTarget.fs.exists(self.local_file):\n            LocalTarget.fs.remove(self.local_file)\n\n    def test_invalid_target(self):\n        \"\"\"Verify invalid types raises NoOpenerError\"\"\"\n        self.assertRaises(NoOpenerError, OpenerTarget, \"foo://bar.txt\")\n\n    def test_mock_target(self):\n        \"\"\"Verify mock target url\"\"\"\n        target = OpenerTarget(\"mock://foo/bar.txt\")\n        self.assertEqual(type(target), MockTarget)\n\n        # Write to the target\n        target.open(\"w\").close()\n        self.assertTrue(MockTarget.fs.exists(\"foo/bar.txt\"))\n\n    def test_mock_target_root(self):\n        \"\"\"Verify mock target url\"\"\"\n        target = OpenerTarget(\"mock:///foo/bar.txt\")\n        self.assertEqual(type(target), MockTarget)\n\n        # Write to the target\n        target.open(\"w\").close()\n        self.assertTrue(MockTarget.fs.exists(\"/foo/bar.txt\"))\n\n    def test_default_target(self):\n        \"\"\"Verify default local target url\"\"\"\n        target = OpenerTarget(self.local_file)\n        self.assertEqual(type(target), LocalTarget)\n\n        # Write to the target\n        target.open(\"w\").close()\n        self.assertTrue(LocalTarget.fs.exists(self.local_file))\n\n    def test_local_target(self):\n        \"\"\"Verify basic local target url\"\"\"\n        local_file = \"file://{}\".format(self.local_file)\n        target = 
OpenerTarget(local_file)\n        self.assertEqual(type(target), LocalTarget)\n\n        # Write to the target\n        target.open(\"w\").close()\n        self.assertTrue(LocalTarget.fs.exists(self.local_file))\n\n    @mock.patch(\"luigi.local_target.LocalTarget.__init__\")\n    @mock.patch(\"luigi.local_target.LocalTarget.__del__\")\n    def test_local_tmp_target(self, lt_del_patch, lt_init_patch):\n        \"\"\"Verify local target url with query string\"\"\"\n        lt_init_patch.return_value = None\n        lt_del_patch.return_value = None\n\n        local_file = \"file://{}?is_tmp\".format(self.local_file)\n        OpenerTarget(local_file)\n        lt_init_patch.assert_called_with(self.local_file, is_tmp=True)\n\n    @mock.patch(\"luigi.contrib.s3.S3Target.__init__\")\n    def test_s3_parse(self, s3_init_patch):\n        \"\"\"Verify basic s3 target url\"\"\"\n        s3_init_patch.return_value = None\n\n        local_file = \"s3://zefr/foo/bar.txt\"\n        OpenerTarget(local_file)\n        s3_init_patch.assert_called_with(\"s3://zefr/foo/bar.txt\")\n\n    @mock.patch(\"luigi.contrib.s3.S3Target.__init__\")\n    def test_s3_parse_param(self, s3_init_patch):\n        \"\"\"Verify s3 target url with params\"\"\"\n        s3_init_patch.return_value = None\n\n        local_file = \"s3://zefr/foo/bar.txt?foo=hello&bar=true\"\n        OpenerTarget(local_file)\n        s3_init_patch.assert_called_with(\"s3://zefr/foo/bar.txt\", foo=\"hello\", bar=\"true\")\n\n    def test_binary_support(self):\n        \"\"\"\n        Make sure keyword arguments are preserved through the OpenerTarget\n        \"\"\"\n        # Verify we can't normally write binary data\n        fp = OpenerTarget(\"mock://file.txt\").open(\"w\")\n        self.assertRaises(TypeError, fp.write, b\"\\x07\\x08\\x07\")\n\n        # Verify the format is passed to the target and write binary data\n        fp = OpenerTarget(\"mock://file.txt\", format=luigi.format.MixedUnicodeBytes).open(\"w\")\n        
fp.write(b\"\\x07\\x08\\x07\")\n        fp.close()\n"
  },
  {
    "path": "test/contrib/pai_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2017 Open Targets\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\n\"\"\"\nTests for OpenPAI wrapper for Luigi.\n\n\nWritten and maintained by Liu, Dongqing (@liudongqing).\n\"\"\"\n\nimport logging\nimport time\n\nimport responses\nfrom helpers import unittest\n\nimport luigi\nfrom luigi.contrib.pai import PaiTask, TaskRole\n\nlogging.basicConfig(level=logging.DEBUG)\n\n\"\"\"\nThe following configurations are required to run the test\n[OpenPai]\npai_url:http://host:port/\nusername:admin\npassword:admin-password\nexpiration:3600\n\n\"\"\"\n\n\nclass SklearnJob(PaiTask):\n    image = \"openpai/pai.example.sklearn\"\n    name = \"test_job_sk_{0}\".format(time.time())\n    command = \"cd scikit-learn/benchmarks && python bench_mnist.py\"\n    virtual_cluster = \"spark\"\n    tasks = [TaskRole(\"test\", \"cd scikit-learn/benchmarks && python bench_mnist.py\", memoryMB=4096)]\n\n\nclass TestPaiTask(unittest.TestCase):\n    @responses.activate\n    def test_success(self):\n        \"\"\"\n        Here using the responses lib to mock the PAI rest api call, the following specify the response of the call.\n        \"\"\"\n        responses.add(responses.POST, \"http://127.0.0.1:9186/api/v1/token\", json={\"token\": \"test\", \"user\": \"admin\", \"admin\": True}, status=200)\n        sk_task = SklearnJob()\n\n        responses.add(responses.POST, \"http://127.0.0.1:9186/api/v1/jobs\", json={\"message\": 
\"update job {0} successfully\".format(sk_task.name)}, status=202)\n\n        responses.add(responses.GET, \"http://127.0.0.1:9186/api/v1/jobs/{0}\".format(sk_task.name), json={}, status=404)\n\n        responses.add(responses.GET, \"http://127.0.0.1:9186/api/v1/jobs/{0}\".format(sk_task.name), body='{\"jobStatus\": {\"state\":\"SUCCEED\"}}', status=200)\n\n        success = luigi.build([sk_task], local_scheduler=True)\n        self.assertTrue(success)\n        self.assertTrue(sk_task.complete())\n\n    @responses.activate\n    def test_fail(self):\n        \"\"\"\n        Here using the responses lib to mock the PAI rest api call, the following specify the response of the call.\n        \"\"\"\n        responses.add(responses.POST, \"http://127.0.0.1:9186/api/v1/token\", json={\"token\": \"test\", \"user\": \"admin\", \"admin\": True}, status=200)\n        fail_task = SklearnJob()\n\n        responses.add(responses.POST, \"http://127.0.0.1:9186/api/v1/jobs\", json={\"message\": \"update job {0} successfully\".format(fail_task.name)}, status=202)\n\n        responses.add(responses.GET, \"http://127.0.0.1:9186/api/v1/jobs/{0}\".format(fail_task.name), json={}, status=404)\n\n        responses.add(responses.GET, \"http://127.0.0.1:9186/api/v1/jobs/{0}\".format(fail_task.name), body='{\"jobStatus\": {\"state\":\"FAILED\"}}', status=200)\n\n        success = luigi.build([fail_task], local_scheduler=True)\n        self.assertFalse(success)\n        self.assertFalse(fail_task.complete())\n"
  },
  {
    "path": "test/contrib/pig_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport subprocess\nimport tempfile\n\nimport pytest\nfrom helpers import unittest\nfrom mock import patch\n\nimport luigi\nfrom luigi.contrib.pig import PigJobError, PigJobTask\n\n\nclass SimpleTestJob(PigJobTask):\n    def output(self):\n        return luigi.LocalTarget(\"simple-output\")\n\n    def pig_script_path(self):\n        return \"my_simple_pig_script.pig\"\n\n\nclass ComplexTestJob(PigJobTask):\n    def output(self):\n        return luigi.LocalTarget(\"complex-output\")\n\n    def pig_script_path(self):\n        return \"my_complex_pig_script.pig\"\n\n    def pig_env_vars(self):\n        return {\"PIG_CLASSPATH\": \"/your/path\"}\n\n    def pig_properties(self):\n        return {\"pig.additional.jars\": \"/path/to/your/jar\"}\n\n    def pig_parameters(self):\n        return {\"YOUR_PARAM_NAME\": \"Your param value\"}\n\n    def pig_options(self):\n        return [\"-x\", \"local\"]\n\n\n@pytest.mark.apache\nclass SimplePigTest(unittest.TestCase):\n    def setUp(self):\n        pass\n\n    def tearDown(self):\n        pass\n\n    @patch(\"subprocess.Popen\")\n    def test_run__success(self, mock):\n        arglist_result = []\n        p = subprocess.Popen\n        subprocess.Popen = _get_fake_Popen(arglist_result, 0)\n        try:\n            job = SimpleTestJob()\n            job.run()\n            
self.assertEqual([[\"/usr/share/pig/bin/pig\", \"-f\", \"my_simple_pig_script.pig\"]], arglist_result)\n        finally:\n            subprocess.Popen = p\n\n    @patch(\"subprocess.Popen\")\n    def test_run__fail(self, mock):\n        arglist_result = []\n        p = subprocess.Popen\n        subprocess.Popen = _get_fake_Popen(arglist_result, 1)\n        try:\n            job = SimpleTestJob()\n            job.run()\n            self.assertEqual([[\"/usr/share/pig/bin/pig\", \"-f\", \"my_simple_pig_script.pig\"]], arglist_result)\n        except PigJobError as e:\n            p = e\n            self.assertEqual(\"stderr\", p.err)\n        else:\n            self.fail(\"Should have thrown PigJobError\")\n        finally:\n            subprocess.Popen = p\n\n\n@pytest.mark.apache\nclass ComplexPigTest(unittest.TestCase):\n    def setUp(self):\n        pass\n\n    def tearDown(self):\n        pass\n\n    @patch(\"subprocess.Popen\")\n    def test_run__success(self, mock):\n        arglist_result = []\n        p = subprocess.Popen\n        subprocess.Popen = _get_fake_Popen(arglist_result, 0)\n\n        with (\n            tempfile.NamedTemporaryFile(delete=False) as param_file_mock,\n            tempfile.NamedTemporaryFile(delete=False) as prop_file_mock,\n            patch(\"luigi.contrib.pig.tempfile.NamedTemporaryFile\", side_effect=[param_file_mock, prop_file_mock]),\n        ):\n            try:\n                job = ComplexTestJob()\n                job.run()\n                self.assertEqual(\n                    [\n                        [\n                            \"/usr/share/pig/bin/pig\",\n                            \"-x\",\n                            \"local\",\n                            \"-param_file\",\n                            param_file_mock.name,\n                            \"-propertyFile\",\n                            prop_file_mock.name,\n                            \"-f\",\n                            
\"my_complex_pig_script.pig\",\n                        ]\n                    ],\n                    arglist_result,\n                )\n\n                # Check param file\n                with open(param_file_mock.name) as pparams_file:\n                    pparams = pparams_file.readlines()\n                    self.assertEqual(1, len(pparams))\n                    self.assertEqual(\"YOUR_PARAM_NAME=Your param value\\n\", pparams[0])\n\n                # Check property file\n                with open(prop_file_mock.name) as pprops_file:\n                    pprops = pprops_file.readlines()\n                    self.assertEqual(1, len(pprops))\n                    self.assertEqual(\"pig.additional.jars=/path/to/your/jar\\n\", pprops[0])\n            finally:\n                subprocess.Popen = p\n\n    @patch(\"subprocess.Popen\")\n    def test_run__fail(self, mock):\n        arglist_result = []\n        p = subprocess.Popen\n        subprocess.Popen = _get_fake_Popen(arglist_result, 1)\n\n        with (\n            tempfile.NamedTemporaryFile(delete=False) as param_file_mock,\n            tempfile.NamedTemporaryFile(delete=False) as prop_file_mock,\n            patch(\"luigi.contrib.pig.tempfile.NamedTemporaryFile\", side_effect=[param_file_mock, prop_file_mock]),\n        ):\n            try:\n                job = ComplexTestJob()\n                job.run()\n            except PigJobError as e:\n                p = e\n                self.assertEqual(\"stderr\", p.err)\n                self.assertEqual(\n                    [\n                        [\n                            \"/usr/share/pig/bin/pig\",\n                            \"-x\",\n                            \"local\",\n                            \"-param_file\",\n                            param_file_mock.name,\n                            \"-propertyFile\",\n                            prop_file_mock.name,\n                            \"-f\",\n                            
\"my_complex_pig_script.pig\",\n                        ]\n                    ],\n                    arglist_result,\n                )\n\n                # Check param file\n                with open(param_file_mock.name) as pparams_file:\n                    pparams = pparams_file.readlines()\n                    self.assertEqual(1, len(pparams))\n                    self.assertEqual(\"YOUR_PARAM_NAME=Your param value\\n\", pparams[0])\n\n                # Check property file\n                with open(prop_file_mock.name) as pprops_file:\n                    pprops = pprops_file.readlines()\n                    self.assertEqual(1, len(pprops))\n                    self.assertEqual(\"pig.additional.jars=/path/to/your/jar\\n\", pprops[0])\n            else:\n                self.fail(\"Should have thrown PigJobError\")\n            finally:\n                subprocess.Popen = p\n\n\ndef _get_fake_Popen(arglist_result, return_code, *args, **kwargs):\n    def Popen_fake(arglist, shell=None, stdout=None, stderr=None, env=None, close_fds=True):\n        arglist_result.append(arglist)\n\n        class P:\n            number_of_process_polls = 5\n\n            def __init__(self):\n                self._process_polls_left = self.number_of_process_polls\n\n            def wait(self):\n                pass\n\n            def poll(self):\n                if self._process_polls_left:\n                    self._process_polls_left -= 1\n                    return None\n\n                return 0\n\n            def communicate(self):\n                return \"end\"\n\n            def env(self):\n                return self.env\n\n        p = P()\n        p.returncode = return_code\n\n        p.stderr = tempfile.TemporaryFile()\n        p.stdout = tempfile.TemporaryFile()\n\n        p.stdout.write(b\"stdout\")\n        p.stderr.write(b\"stderr\")\n\n        # Reset temp files so the output can be read.\n        p.stdout.seek(0)\n        p.stderr.seek(0)\n\n        return p\n\n 
   return Popen_fake\n"
  },
  {
    "path": "test/contrib/postgres_test.py",
    "content": "# Copyright (c) 2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport datetime\n\nimport mock\nimport pytest\nfrom helpers import unittest\n\nimport luigi\nimport luigi.contrib.postgres\nfrom luigi.tools.range import RangeDaily\n\n\ndef datetime_to_epoch(dt):\n    td = dt - datetime.datetime(1970, 1, 1)\n    return td.days * 86400 + td.seconds + td.microseconds / 1e6\n\n\nclass MockPostgresCursor(mock.Mock):\n    \"\"\"\n    Keeps state to simulate executing SELECT queries and fetching results.\n    \"\"\"\n\n    def __init__(self, existing_update_ids):\n        super(MockPostgresCursor, self).__init__()\n        self.existing = existing_update_ids\n\n    def execute(self, query, params):\n        if query.startswith(\"SELECT 1 FROM table_updates\"):\n            self.fetchone_result = (1,) if params[0] in self.existing else None\n        else:\n            self.fetchone_result = None\n\n    def fetchone(self):\n        return self.fetchone_result\n\n\nclass DummyPostgresImporter(luigi.contrib.postgres.CopyToTable):\n    date = luigi.DateParameter()\n\n    host = \"dummy_host\"\n    database = \"dummy_database\"\n    user = \"dummy_user\"\n    password = \"dummy_password\"\n    table = \"dummy_table\"\n    columns = (\n        (\"some_text\", \"text\"),\n        (\"some_int\", \"int\"),\n    )\n\n\n@pytest.mark.postgres\nclass DailyCopyToTableTest(unittest.TestCase):\n    maxDiff = None\n\n    @mock.patch(\"psycopg2.connect\")\n  
  def test_bulk_complete(self, mock_connect):\n        mock_cursor = MockPostgresCursor([DummyPostgresImporter(date=datetime.datetime(2015, 1, 3)).task_id])\n        mock_connect.return_value.cursor.return_value = mock_cursor\n\n        task = RangeDaily(of=DummyPostgresImporter, start=datetime.date(2015, 1, 2), now=datetime_to_epoch(datetime.datetime(2015, 1, 7)))\n        actual = sorted([t.task_id for t in task.requires()])\n\n        self.assertEqual(\n            actual,\n            sorted(\n                [\n                    DummyPostgresImporter(date=datetime.datetime(2015, 1, 2)).task_id,\n                    DummyPostgresImporter(date=datetime.datetime(2015, 1, 4)).task_id,\n                    DummyPostgresImporter(date=datetime.datetime(2015, 1, 5)).task_id,\n                    DummyPostgresImporter(date=datetime.datetime(2015, 1, 6)).task_id,\n                ]\n            ),\n        )\n        self.assertFalse(task.complete())\n\n\nclass DummyPostgresQuery(luigi.contrib.postgres.PostgresQuery):\n    date = luigi.DateParameter()\n\n    host = \"dummy_host\"\n    database = \"dummy_database\"\n    user = \"dummy_user\"\n    password = \"dummy_password\"\n    table = \"dummy_table\"\n    columns = (\n        (\"some_text\", \"text\"),\n        (\"some_int\", \"int\"),\n    )\n    query = \"SELECT * FROM foo\"\n\n\nclass DummyPostgresQueryWithPort(DummyPostgresQuery):\n    port = 1234\n\n\nclass DummyPostgresQueryWithPortEncodedInHost(DummyPostgresQuery):\n    host = \"dummy_host:1234\"\n\n\n@pytest.mark.postgres\nclass PostgresQueryTest(unittest.TestCase):\n    maxDiff = None\n\n    @mock.patch(\"psycopg2.connect\")\n    def test_bulk_complete(self, mock_connect):\n        mock_cursor = MockPostgresCursor([\"DummyPostgresQuery_2015_01_03_838e32a989\"])\n        mock_connect.return_value.cursor.return_value = mock_cursor\n\n        task = RangeDaily(of=DummyPostgresQuery, start=datetime.date(2015, 1, 2), 
now=datetime_to_epoch(datetime.datetime(2015, 1, 7)))\n        actual = [t.task_id for t in task.requires()]\n\n        self.assertEqual(\n            actual,\n            [\n                \"DummyPostgresQuery_2015_01_02_3a0ec498ed\",\n                \"DummyPostgresQuery_2015_01_04_9c1d42ff62\",\n                \"DummyPostgresQuery_2015_01_05_0f90e52357\",\n                \"DummyPostgresQuery_2015_01_06_f91a47ec40\",\n            ],\n        )\n        self.assertFalse(task.complete())\n\n    def test_override_port(self):\n        output = DummyPostgresQueryWithPort(date=datetime.datetime(1991, 3, 24)).output()\n        self.assertEqual(output.port, 1234)\n\n    def test_port_encoded_in_host(self):\n        output = DummyPostgresQueryWithPortEncodedInHost(date=datetime.datetime(1991, 3, 24)).output()\n        self.assertEqual(output.port, \"1234\")\n\n\n@pytest.mark.postgres\nclass TestCopyToTableWithMetaColumns(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.postgres.CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\"luigi.contrib.postgres.CopyToTable._add_metadata_columns\")\n    @mock.patch(\"luigi.contrib.postgres.CopyToTable.post_copy_metacolumns\")\n    @mock.patch(\"luigi.contrib.postgres.CopyToTable.rows\", return_value=[\"row1\", \"row2\"])\n    @mock.patch(\"luigi.contrib.postgres.PostgresTarget\")\n    @mock.patch(\"psycopg2.connect\")\n    def test_copy_with_metadata_columns_enabled(\n        self, mock_connect, mock_redshift_target, mock_rows, mock_add_columns, mock_update_columns, mock_metadata_columns_enabled\n    ):\n\n        task = DummyPostgresImporter(date=datetime.datetime(1991, 3, 24))\n\n        mock_cursor = MockPostgresCursor([task.task_id])\n        mock_connect.return_value.cursor.return_value = mock_cursor\n\n        task = DummyPostgresImporter(date=datetime.datetime(1991, 3, 24))\n        task.run()\n\n        self.assertTrue(mock_add_columns.called)\n        
self.assertTrue(mock_update_columns.called)\n\n    @mock.patch(\"luigi.contrib.postgres.CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=False)\n    @mock.patch(\"luigi.contrib.postgres.CopyToTable._add_metadata_columns\")\n    @mock.patch(\"luigi.contrib.postgres.CopyToTable.post_copy_metacolumns\")\n    @mock.patch(\"luigi.contrib.postgres.CopyToTable.rows\", return_value=[\"row1\", \"row2\"])\n    @mock.patch(\"luigi.contrib.postgres.PostgresTarget\")\n    @mock.patch(\"psycopg2.connect\")\n    def test_copy_with_metadata_columns_disabled(\n        self, mock_connect, mock_redshift_target, mock_rows, mock_add_columns, mock_update_columns, mock_metadata_columns_enabled\n    ):\n\n        task = DummyPostgresImporter(date=datetime.datetime(1991, 3, 24))\n\n        mock_cursor = MockPostgresCursor([task.task_id])\n        mock_connect.return_value.cursor.return_value = mock_cursor\n\n        task.run()\n\n        self.assertFalse(mock_add_columns.called)\n        self.assertFalse(mock_update_columns.called)\n"
  },
  {
    "path": "test/contrib/postgres_with_server_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport os\n\nimport pytest\nfrom helpers import unittest\n\nimport luigi\nimport luigi.notifications\nfrom luigi.contrib import postgres\n\n\"\"\"\nTypical use cases that should be tested:\n\n* Daily overwrite of all data in table\n* Daily inserts of new segment in table\n* (Daily insertion/creation of new table)\n* Daily insertion of multiple (different) new segments into table\n\n\n\"\"\"\n\nhost = \"localhost\"\ndatabase = \"spotify\"\nuser = os.getenv(\"POSTGRES_USER\", \"spotify\")\npassword = \"guest\"\n\n\ntry:\n    import psycopg2\n\n    conn = psycopg2.connect(\n        user=user,\n        host=host,\n        database=database,\n        password=password,\n    )\n    conn.close()\n    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)\n    psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY)\nexcept Exception:\n    raise unittest.SkipTest(\"Unable to connect to postgres\")\n\n\n# to avoid copying:\n\n\nclass CopyToTestDB(postgres.CopyToTable):\n    host = host\n    database = database\n    user = user\n    password = password\n\n\nclass TestPostgresTask(CopyToTestDB):\n    table = \"test_table\"\n    columns = ((\"test_text\", \"text\"), (\"test_int\", \"int\"), (\"test_float\", \"float\"))\n\n    def create_table(self, connection):\n        connection.cursor().execute(\"CREATE TABLE {table} (id SERIAL 
PRIMARY KEY, test_text TEXT, test_int INT, test_float FLOAT)\".format(table=self.table))\n\n    def rows(self):\n        yield \"foo\", 123, 123.45\n        yield None, \"-100\", \"5143.213\"\n        yield \"\\t\\n\\r\\\\N\", 0, 0\n        yield \"éцү我\", 0, 0\n        yield \"\", 0, r\"\\N\"  # Test working default null character\n\n\nclass MetricBase(CopyToTestDB):\n    table = \"metrics\"\n    columns = [(\"metric\", \"text\"), (\"value\", \"int\")]\n\n\nclass Metric1(MetricBase):\n    param = luigi.Parameter()\n\n    def rows(self):\n        yield \"metric1\", 1\n        yield \"metric1\", 2\n        yield \"metric1\", 3\n\n\nclass Metric2(MetricBase):\n    param = luigi.Parameter()\n\n    def rows(self):\n        yield \"metric2\", 1\n        yield \"metric2\", 4\n        yield \"metric2\", 3\n\n\n@pytest.mark.postgres\nclass TestPostgresImportTask(unittest.TestCase):\n    def test_default_escape(self):\n        self.assertEqual(postgres.default_escape(\"foo\"), \"foo\")\n        self.assertEqual(postgres.default_escape(\"\\n\"), \"\\\\n\")\n        self.assertEqual(postgres.default_escape(\"\\\\\\n\"), \"\\\\\\\\\\\\n\")\n        self.assertEqual(postgres.default_escape(\"\\n\\r\\\\\\t\\\\N\\\\\"), \"\\\\n\\\\r\\\\\\\\\\\\t\\\\\\\\N\\\\\\\\\")\n\n    def test_repeat(self):\n        task = TestPostgresTask()\n        conn = task.output().connect()\n        conn.autocommit = True\n        cursor = conn.cursor()\n        cursor.execute(\"DROP TABLE IF EXISTS {table}\".format(table=task.table))\n        cursor.execute(\"DROP TABLE IF EXISTS {marker_table}\".format(marker_table=postgres.PostgresTarget.marker_table))\n\n        luigi.build([task], local_scheduler=True)\n        luigi.build([task], local_scheduler=True)  # try to schedule twice\n\n        cursor.execute(\"\"\"SELECT test_text, test_int, test_float\n                          FROM test_table\n                          ORDER BY id ASC\"\"\")\n\n        rows = tuple(cursor)\n\n        
self.assertEqual(\n            rows,\n            (\n                (\"foo\", 123, 123.45),\n                (None, -100, 5143.213),\n                (\"\\t\\n\\r\\\\N\", 0.0, 0),\n                (\"éцү我\", 0, 0),\n                (\"\", 0, None),  # Test working default null character\n            ),\n        )\n\n    def test_multimetric(self):\n        metrics = MetricBase()\n        conn = metrics.output().connect()\n        conn.autocommit = True\n        conn.cursor().execute(\"DROP TABLE IF EXISTS {table}\".format(table=metrics.table))\n        conn.cursor().execute(\"DROP TABLE IF EXISTS {marker_table}\".format(marker_table=postgres.PostgresTarget.marker_table))\n        luigi.build([Metric1(20), Metric1(21), Metric2(\"foo\")], local_scheduler=True)\n\n        cursor = conn.cursor()\n        cursor.execute(\"select count(*) from {table}\".format(table=metrics.table))\n        self.assertEqual(tuple(cursor), ((9,),))\n\n    def test_clear(self):\n        class Metric2Copy(Metric2):\n            def init_copy(self, connection):\n                query = \"TRUNCATE {0}\".format(self.table)\n                connection.cursor().execute(query)\n\n        clearer = Metric2Copy(21)\n        conn = clearer.output().connect()\n        conn.autocommit = True\n        conn.cursor().execute(\"DROP TABLE IF EXISTS {table}\".format(table=clearer.table))\n        conn.cursor().execute(\"DROP TABLE IF EXISTS {marker_table}\".format(marker_table=postgres.PostgresTarget.marker_table))\n\n        luigi.build([Metric1(0), Metric1(1)], local_scheduler=True)\n        luigi.build([clearer], local_scheduler=True)\n        cursor = conn.cursor()\n        cursor.execute(\"select count(*) from {table}\".format(table=clearer.table))\n        self.assertEqual(tuple(cursor), ((3,),))\n"
  },
  {
    "path": "test/contrib/presto_test.py",
    "content": "import unittest\n\nimport mock\nfrom pyhive.exc import DatabaseError\nfrom pyhive.presto import Connection, Cursor\n\nfrom luigi.contrib.presto import PrestoClient, PrestoTarget, PrestoTask\n\n\nclass WithPrestoClientTest(unittest.TestCase):\n    def test_creates_client_with_expected_params(self):\n        # arrange\n        class _Task(PrestoTask):\n            host = \"127.0.0.1\"\n            port = 8089\n            user = \"user_123\"\n            database = \"db1\"\n            table = \"tbl1\"\n\n        expected_connection_kwargs = {\n            \"host\": \"127.0.0.1\",\n            \"port\": 8089,\n            \"username\": \"user_123\",\n            \"catalog\": \"hive\",\n            \"protocol\": \"https\",\n            \"source\": \"pyhive\",\n            \"poll_interval\": 1.0,\n            \"schema\": \"db1\",\n            \"requests_kwargs\": {\"verify\": False},\n        }\n\n        # act\n        task = _Task()\n\n        # assert\n        client = task._client\n        assert isinstance(client, PrestoClient)\n        connection = client._connection\n        assert not connection._args\n        assert connection._kwargs == expected_connection_kwargs\n\n\nclass PrestoClientTest(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.presto.sleep\", return_value=None)\n    def test_watch(self, sleep):\n        # arrange\n        status = {\"stats\": {\"progressPercentage\": 1.2}, \"infoUri\": \"http://127.0.0.1:8080/ui/query.html?query=123\"}\n        cursor = mock.MagicMock(spec=Cursor)\n        cursor.poll.side_effect = [status, None]\n\n        connection = mock.MagicMock(spec=Connection)\n        connection.cursor.return_value = cursor\n\n        client = PrestoClient(connection)\n        query = \"select 1\"\n\n        # act\n        statuses = list(client.execute(query))\n\n        # assert\n        assert client.percentage_progress == 1.2\n        assert client.info_uri == \"http://127.0.0.1:8080/ui/query.html?query=123\"\n     
   assert statuses == [status]\n        cursor.execute.assert_called_once_with(query, None)\n        cursor.close.assert_called_once_with()\n\n    @mock.patch(\"luigi.contrib.presto.sleep\", return_value=None)\n    def test_fetch(self, sleep):\n        # arrange\n        status = {\"infoUri\": \"http://127.0.0.1:8080/ui/query.html?query=123\"}\n        cursor = mock.MagicMock(spec=Cursor)\n        cursor.poll.side_effect = [status, None]\n        cursor.fetchall.return_value = [(1,), (2,)]\n\n        connection = mock.MagicMock(spec=Connection)\n        connection.cursor.return_value = cursor\n\n        client = PrestoClient(connection)\n        query = \"select 1\"\n\n        # act\n        result = list(client.execute(query, mode=\"fetch\"))\n\n        # assert\n        assert client.percentage_progress == 0.1\n        assert client.info_uri == \"http://127.0.0.1:8080/ui/query.html?query=123\"\n        cursor.execute.assert_called_once_with(query, None)\n        cursor.close.assert_called_once_with()\n        assert result == [(1,), (2,)]\n\n\nclass PrestoTargetTest(unittest.TestCase):\n    def test_non_partitioned(self):\n        # arrange\n        client = mock.MagicMock(spec=PrestoClient)\n        client.execute.return_value = iter(\n            [\n                (7, None),\n            ]\n        )\n\n        catalog = \"hive\"\n        database = \"schm1\"\n        table = \"tbl1\"\n\n        # act\n        target = PrestoTarget(client, catalog, database, table)\n        count = target.count()\n        exists = target.exists()\n\n        # assert\n        client.execute.assert_called_once_with(\n            \"SELECT COUNT(*) AS cnt FROM hive.schm1.tbl1 WHERE 1 = %s LIMIT 1\",\n            [\n                1,\n            ],\n            mode=\"fetch\",\n        )\n        assert count == 7\n        assert exists\n\n    def test_partitioned(self):\n        # arrange\n        client = mock.MagicMock(spec=PrestoClient)\n        client.execute.return_value = 
iter(\n            [\n                (7, None),\n            ]\n        )\n\n        catalog = \"hive\"\n        database = \"schm1\"\n        table = \"tbl1\"\n        partition = {\"a\": 2, \"b\": \"x\"}\n\n        # act\n        target = PrestoTarget(client, catalog, database, table, partition)\n        count = target.count()\n        exists = target.exists()\n\n        # assert\n        client.execute.assert_called_once_with(\"SELECT COUNT(*) AS cnt FROM hive.schm1.tbl1 WHERE a = %s AND b = %s LIMIT 1\", [2, \"x\"], mode=\"fetch\")\n        assert count == 7\n        assert exists\n\n    def test_table_doesnot_exist(self):\n        # arrange\n        e = DatabaseError()\n        setattr(e, \"message\", {\"message\": \"line 1:15: Table hive.schm1.tbl1 does not exist\"})\n\n        client = mock.MagicMock(spec=PrestoClient)\n        client.execute.side_effect = e\n\n        catalog = \"hive\"\n        database = \"schm1\"\n        table = \"tbl1\"\n\n        # act\n        target = PrestoTarget(client, catalog, database, table)\n        exists = target.exists()\n\n        # assert\n        client.execute.assert_called_once_with(\"SELECT COUNT(*) AS cnt FROM hive.schm1.tbl1 WHERE 1 = %s LIMIT 1\", [1], mode=\"fetch\")\n        assert not exists\n\n\nclass PrestoTest(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.presto.sleep\", return_value=None)\n    def test_run(self, sleep):\n        # arrange\n        client = mock.MagicMock(spec=PrestoClient)\n        client.execute.return_value = [(), (), ()]\n        client.info_uri = \"http://127.0.0.1:8080/ui/query.html?query=123\"\n        client.percentage_progress = 2.3\n\n        class _Task(PrestoTask):\n            host = \"127.0.0.1\"\n            port = 8089\n            user = \"user_123\"\n            password = \"123\"\n            database = \"db1\"\n            table = \"tbl1\"\n            query = \"select 1\"\n\n        # act\n        with mock.patch(\"luigi.contrib.presto.PrestoClient\", 
return_value=client):\n            task = _Task()\n            task.set_progress_percentage = mock.MagicMock()\n            task.set_tracking_url = mock.MagicMock()\n            task.run()\n\n        # assert\n        assert task.protocol == \"https\"\n        assert task.output().catalog == \"hive\"\n        assert task.output().database == \"db1\"\n        assert task.output().table == \"tbl1\"\n        assert task.output().partition is None\n\n        client.execute.assert_called_once_with(\"select 1\")\n        task.set_tracking_url.assert_called_once_with(\"http://127.0.0.1:8080/ui/query.html?query=123\")\n        assert task.set_progress_percentage.mock_calls == [\n            mock.call(2.3),\n            mock.call(2.3),\n            mock.call(2.3),\n        ]\n"
  },
  {
    "path": "test/contrib/prometheus_metric_test.py",
    "content": "import pytest\nfrom helpers import unittest\nfrom prometheus_client import CONTENT_TYPE_LATEST\n\nfrom luigi.contrib.prometheus_metric import PrometheusMetricsCollector\nfrom luigi.metrics import MetricsCollectors\nfrom luigi.scheduler import Scheduler\n\ntry:\n    from unittest import mock\nexcept ImportError:\n    import mock\n\n\nWORKER = \"myworker\"\nTASK_ID = \"TaskID\"\nTASK_FAMILY = \"TaskFamily\"\nA_PARAM_VALUE = \"1\"\nB_PARAM_VALUE = \"2\"\nC_PARAM_VALUE = \"3\"\n\n\n@pytest.mark.contrib\nclass PrometheusMetricBaseTest(unittest.TestCase):\n    COLLECTOR_KWARGS = {}\n    EXPECTED_LABELS = {\"family\": TASK_FAMILY}\n\n    def setUp(self):\n        self.collector = PrometheusMetricsCollector(**self.COLLECTOR_KWARGS)\n        self.s = Scheduler(metrics_collector=MetricsCollectors.prometheus)\n        self.gauge_name = \"luigi_task_execution_time_seconds\"\n\n    def startTask(self):\n        self.s.add_task(\n            worker=WORKER,\n            task_id=TASK_ID,\n            family=TASK_FAMILY,\n            params={\"a\": A_PARAM_VALUE, \"b\": B_PARAM_VALUE, \"c\": C_PARAM_VALUE},\n        )\n        task = self.s._state.get_task(TASK_ID)\n        task.time_running = 0\n        task.updated = 5\n        return task\n\n    def test_handle_task_started(self):\n        task = self.startTask()\n        self.collector.handle_task_started(task)\n\n        counter_name = \"luigi_task_started_total\"\n        gauge_name = self.gauge_name\n        labels = self.EXPECTED_LABELS\n\n        assert self.collector.registry.get_sample_value(counter_name, labels=labels) == 1\n        assert self.collector.registry.get_sample_value(gauge_name, labels=labels) == 0\n\n    def test_handle_task_failed(self):\n        task = self.startTask()\n        self.collector.handle_task_failed(task)\n\n        counter_name = \"luigi_task_failed_total\"\n        gauge_name = self.gauge_name\n        labels = self.EXPECTED_LABELS\n\n        assert 
self.collector.registry.get_sample_value(counter_name, labels=labels) == 1\n        assert self.collector.registry.get_sample_value(gauge_name, labels=labels) == task.updated - task.time_running\n\n    def test_handle_task_disabled(self):\n        task = self.startTask()\n        self.collector.handle_task_disabled(task, self.s._config)\n\n        counter_name = \"luigi_task_disabled_total\"\n        gauge_name = self.gauge_name\n        labels = self.EXPECTED_LABELS\n\n        assert self.collector.registry.get_sample_value(counter_name, labels=labels) == 1\n        assert self.collector.registry.get_sample_value(gauge_name, labels=labels) == task.updated - task.time_running\n\n    def test_handle_task_done(self):\n        task = self.startTask()\n        self.collector.handle_task_done(task)\n\n        counter_name = \"luigi_task_done_total\"\n        gauge_name = self.gauge_name\n        labels = self.EXPECTED_LABELS\n\n        assert self.collector.registry.get_sample_value(counter_name, labels=labels) == 1\n        assert self.collector.registry.get_sample_value(gauge_name, labels=labels) == task.updated - task.time_running\n\n    def test_configure_http_handler(self):\n        mock_http_handler = mock.MagicMock()\n        self.collector.configure_http_handler(mock_http_handler)\n        mock_http_handler.set_header.assert_called_once_with(\"Content-Type\", CONTENT_TYPE_LATEST)\n\n\n@pytest.mark.contrib\nclass PrometheusMetricTaskParamsOnlyTest(PrometheusMetricBaseTest):\n    COLLECTOR_KWARGS = {\n        \"use_task_family_in_labels\": False,\n        \"task_parameters_to_use_in_labels\": [\"a\", \"c\"],\n    }\n    EXPECTED_LABELS = {\"a\": A_PARAM_VALUE, \"c\": C_PARAM_VALUE}\n\n\n@pytest.mark.contrib\nclass PrometheusMetricTaskFamilyAndTaskParamsTest(PrometheusMetricBaseTest):\n    COLLECTOR_KWARGS = {\n        \"use_task_family_in_labels\": True,\n        \"task_parameters_to_use_in_labels\": [\"b\"],\n    }\n    EXPECTED_LABELS = {\"family\": TASK_FAMILY, 
\"b\": B_PARAM_VALUE}\n"
  },
  {
    "path": "test/contrib/rdbms_test.py",
    "content": "# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n\"\"\"\nWe're using Redshift as the test bed since Redshift implements RDBMS. We could\nhave opted for PSQL but we're less familiar with that contrib and there are\nless examples on how to test it.\n\"\"\"\n\nimport unittest\n\nimport mock\nimport pytest\n\nimport luigi\nimport luigi.contrib.redshift\n\n# Fake AWS and S3 credentials taken from `../redshift_test.py`.\nAWS_ACCESS_KEY = \"key\"\nAWS_SECRET_KEY = \"secret\"\n\nAWS_ACCOUNT_ID = \"0123456789012\"\nAWS_ROLE_NAME = \"MyRedshiftRole\"\n\nBUCKET = \"bucket\"\nKEY = \"key\"\n\n\nclass DummyS3CopyToTableBase(luigi.contrib.redshift.S3CopyToTable):\n    # Class attributes taken from `DummyPostgresImporter` in\n    # `../postgres_test.py`.\n    host = \"dummy_host\"\n    database = \"dummy_database\"\n    user = \"dummy_user\"\n    password = \"dummy_password\"\n    table = luigi.Parameter(default=\"dummy_table\")\n    columns = luigi.TupleParameter(\n        default=(\n            (\"some_text\", \"varchar(255)\"),\n            (\"some_int\", \"int\"),\n        )\n    )\n\n    copy_options = \"\"\n    prune_table = \"\"\n    prune_column = \"\"\n    prune_date = \"\"\n\n    def s3_load_path(self):\n        return \"s3://%s/%s\" % (BUCKET, KEY)\n\n\nclass DummyS3CopyToTableKey(DummyS3CopyToTableBase):\n    aws_access_key_id = AWS_ACCESS_KEY\n    aws_secret_access_key = AWS_SECRET_KEY\n\n\n@pytest.mark.aws\nclass 
TestS3CopyToTableWithMetaColumns(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.metadata_columns\", new_callable=mock.PropertyMock, return_value=[(\"created_tz\", \"TIMESTAMP\")])\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_copy_check_meta_columns_to_table_if_exists(self, mock_redshift_target, mock_metadata_columns, mock_metadata_columns_enabled):\n        task = DummyS3CopyToTableKey(table=\"my_test_table\")\n        task.run()\n\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        executed_query = mock_cursor.execute.call_args_list[1][0][0]\n\n        expected_output = (\n            \"SELECT 1 AS column_exists FROM information_schema.columns WHERE table_name = LOWER('{table}') AND column_name = LOWER('{column}') LIMIT 1;\".format(\n                table=\"my_test_table\", column=\"created_tz\"\n            )\n        )\n\n        self.assertEqual(executed_query, expected_output)\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.metadata_columns\", new_callable=mock.PropertyMock, return_value=[(\"created_tz\", \"TIMESTAMP\")])\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_copy_check_meta_columns_to_schematable_if_exists(self, mock_redshift_target, mock_metadata_columns, mock_metadata_columns_enabled):\n        task = DummyS3CopyToTableKey(table=\"test.my_test_table\")\n        task.run()\n\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        executed_query = mock_cursor.execute.call_args_list[2][0][0]\n\n        expected_output = (\n            \"SELECT 1 AS column_exists FROM 
information_schema.columns \"\n            \"WHERE table_schema = LOWER('{schema}') \"\n            \"AND table_name = LOWER('{table}') \"\n            \"AND column_name = LOWER('{column}') \"\n            \"LIMIT 1;\".format(schema=\"test\", table=\"my_test_table\", column=\"created_tz\")\n        )\n\n        self.assertEqual(executed_query, expected_output)\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.metadata_columns\", new_callable=mock.PropertyMock, return_value=[(\"created_tz\", \"TIMESTAMP\")])\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable._column_exists\", return_value=True)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable._add_column_to_table\")\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_copy_not_add_if_meta_columns_already_exists(\n        self, mock_redshift_target, mock_add_to_table, mock_columns_exists, mock_metadata_columns, mock_metadata_columns_enabled\n    ):\n        task = DummyS3CopyToTableKey()\n        task.run()\n\n        self.assertFalse(mock_add_to_table.called)\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.metadata_columns\", new_callable=mock.PropertyMock, return_value=[(\"created_tz\", \"TIMESTAMP\")])\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable._column_exists\", return_value=False)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable._add_column_to_table\")\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_copy_add_if_meta_columns_not_already_exists(\n        self, mock_redshift_target, mock_add_to_table, mock_columns_exists, mock_metadata_columns, mock_metadata_columns_enabled\n    ):\n        task = DummyS3CopyToTableKey()\n        task.run()\n\n        
self.assertTrue(mock_add_to_table.called)\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.metadata_columns\", new_callable=mock.PropertyMock, return_value=[(\"created_tz\", \"TIMESTAMP\")])\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable._column_exists\", return_value=False)\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_copy_add_regular_column(self, mock_redshift_target, mock_columns_exists, mock_metadata_columns, mock_metadata_columns_enabled):\n        task = DummyS3CopyToTableKey(table=\"my_test_table\")\n        task.run()\n\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        executed_query = mock_cursor.execute.call_args_list[1][0][0]\n\n        expected_output = \"ALTER TABLE {table} ADD COLUMN {column} {type};\".format(table=\"my_test_table\", column=\"created_tz\", type=\"TIMESTAMP\")\n\n        self.assertEqual(executed_query, expected_output)\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.metadata_columns\", new_callable=mock.PropertyMock, return_value=[(\"created_tz\", \"TIMESTAMP\", \"bytedict\")])\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable._column_exists\", return_value=False)\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_copy_add_encoded_column(self, mock_redshift_target, mock_columns_exists, mock_metadata_columns, mock_metadata_columns_enabled):\n        task = DummyS3CopyToTableKey(table=\"my_test_table\")\n        task.run()\n\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        executed_query = mock_cursor.execute.call_args_list[1][0][0]\n\n        expected_output = \"ALTER 
TABLE {table} ADD COLUMN {column} {type} ENCODE {encoding};\".format(\n            table=\"my_test_table\", column=\"created_tz\", type=\"TIMESTAMP\", encoding=\"bytedict\"\n        )\n\n        self.assertEqual(executed_query, expected_output)\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.metadata_columns\", new_callable=mock.PropertyMock, return_value=[(\"created_tz\")])\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable._column_exists\", return_value=False)\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_copy_raise_error_on_no_column_type(self, mock_redshift_target, mock_columns_exists, mock_metadata_columns, mock_metadata_columns_enabled):\n        task = DummyS3CopyToTableKey()\n\n        with self.assertRaises(ValueError):\n            task.run()\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\n        \"luigi.contrib.redshift.S3CopyToTable.metadata_columns\", new_callable=mock.PropertyMock, return_value=[(\"created_tz\", \"TIMESTAMP\", \"bytedict\", \"42\")]\n    )\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable._column_exists\", return_value=False)\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_copy_raise_error_on_invalid_column(self, mock_redshift_target, mock_columns_exists, mock_metadata_columns, mock_metadata_columns_enabled):\n        task = DummyS3CopyToTableKey()\n\n        with self.assertRaises(ValueError):\n            task.run()\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.metadata_queries\", new_callable=mock.PropertyMock, return_value=[\"SELECT 1 FROM X\", \"SELECT 2 FROM Y\"])\n    
@mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_post_copy_metacolumns(self, mock_redshift_target, mock_metadata_queries, mock_metadata_columns_enabled):\n        task = DummyS3CopyToTableKey()\n        task.run()\n\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        executed_query = mock_cursor.execute.call_args_list[2][0][0]\n        expected_output = \"SELECT 1 FROM X\"\n        self.assertEqual(executed_query, expected_output)\n\n        executed_query = mock_cursor.execute.call_args_list[3][0][0]\n        expected_output = \"SELECT 2 FROM Y\"\n        self.assertEqual(executed_query, expected_output)\n"
  },
  {
    "path": "test/contrib/redis_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n# pylint: disable=F0401\nfrom time import sleep\n\nimport pytest\nfrom helpers import unittest\n\ntry:\n    import redis\nexcept ImportError:\n    raise unittest.SkipTest(\"Unable to load redis module\")\n\nfrom luigi.contrib.redis_store import RedisTarget\n\nHOST = \"localhost\"\nPORT = 6379\nDB = 15\nPASSWORD = None\nSOCKET_TIMEOUT = None\nMARKER_PREFIX = \"luigi_test\"\nEXPIRE = 5\n\n\n@pytest.mark.contrib\nclass RedisTargetTest(unittest.TestCase):\n    \"\"\"Test touch, exists and target expiration\"\"\"\n\n    def test_touch_and_exists(self):\n        target = RedisTarget(HOST, PORT, DB, \"update_id\", PASSWORD)\n        target.marker_prefix = MARKER_PREFIX\n        flush()\n        self.assertFalse(target.exists(), \"Target should not exist before touching it\")\n        target.touch()\n        self.assertTrue(target.exists(), \"Target should exist after touching it\")\n        flush()\n\n    def test_expiration(self):\n        target = RedisTarget(HOST, PORT, DB, \"update_id\", PASSWORD, None, EXPIRE)\n        target.marker_prefix = MARKER_PREFIX\n        flush()\n        target.touch()\n        self.assertTrue(target.exists(), \"Target should exist after touching it and before expiring\")\n        sleep(EXPIRE)\n        self.assertFalse(target.exists(), \"Target should not exist after expiring\")\n        flush()\n\n\ndef 
flush():\n    \"\"\"Flush test DB\"\"\"\n    redis_client = redis.StrictRedis(host=HOST, port=PORT, db=DB, socket_timeout=SOCKET_TIMEOUT)\n    redis_client.flushdb()\n"
  },
  {
    "path": "test/contrib/redshift_test.py",
    "content": "# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport json\nimport os\nimport sys\n\nimport mock\nimport pytest\nfrom helpers import unittest, with_config\nfrom moto import mock_s3\n\nimport luigi\nimport luigi.contrib.redshift\nimport luigi.notifications\nfrom luigi.contrib import redshift\nfrom luigi.contrib.s3 import S3Client\n\nif (3, 4, 0) <= sys.version_info[:3] < (3, 4, 3):\n    # spulec/moto#308\n    mock_s3 = unittest.skip(\"moto mock doesn't work with python3.4\")  # NOQA\n\n\n# Fake AWS and S3 credentials taken from `../redshift_test.py`.\nAWS_ACCESS_KEY = \"key\"\nAWS_SECRET_KEY = \"secret\"\n\nAWS_ACCOUNT_ID = \"0123456789012\"\nAWS_ROLE_NAME = \"MyRedshiftRole\"\n\nBUCKET = \"bucket\"\nKEY = \"key\"\nKEY_2 = \"key2\"\nFILES = [\"file1\", \"file2\", \"file3\"]\n\n\ndef generate_manifest_json(path_to_folders, file_names):\n    entries = []\n    for path_to_folder in path_to_folders:\n        for file_name in file_names:\n            entries.append({\"url\": \"%s/%s\" % (path_to_folder, file_name), \"mandatory\": True})\n    return {\"entries\": entries}\n\n\nclass DummyS3CopyToTableBase(luigi.contrib.redshift.S3CopyToTable):\n    # Class attributes taken from `DummyPostgresImporter` in\n    # `../postgres_test.py`.\n    host = \"dummy_host\"\n    database = \"dummy_database\"\n    user = \"dummy_user\"\n    password = \"dummy_password\"\n    table = luigi.Parameter(default=\"dummy_table\")\n    
columns = luigi.TupleParameter(\n        default=(\n            (\"some_text\", \"varchar(255)\"),\n            (\"some_int\", \"int\"),\n        )\n    )\n    table_constraints = luigi.Parameter(default=\"\")\n\n    copy_options = \"\"\n    prune_table = \"\"\n    prune_column = \"\"\n    prune_date = \"\"\n\n    def s3_load_path(self):\n        return \"s3://%s/%s\" % (BUCKET, KEY)\n\n\nclass DummyS3CopyJSONToTableBase(luigi.contrib.redshift.S3CopyJSONToTable):\n    # Class attributes taken from `DummyPostgresImporter` in\n    # `../postgres_test.py`.\n    aws_access_key_id = AWS_ACCESS_KEY\n    aws_secret_access_key = AWS_SECRET_KEY\n\n    host = \"dummy_host\"\n    database = \"dummy_database\"\n    user = \"dummy_user\"\n    password = \"dummy_password\"\n    table = luigi.Parameter(default=\"dummy_table\")\n    columns = luigi.TupleParameter(\n        default=(\n            (\"some_text\", \"varchar(255)\"),\n            (\"some_int\", \"int\"),\n        )\n    )\n\n    copy_options = \"\"\n    prune_table = \"\"\n    prune_column = \"\"\n    prune_date = \"\"\n\n    jsonpath = \"\"\n    copy_json_options = \"\"\n\n    def s3_load_path(self):\n        return \"s3://%s/%s\" % (BUCKET, KEY)\n\n\nclass DummyS3CopyToTableKey(DummyS3CopyToTableBase):\n    aws_access_key_id = AWS_ACCESS_KEY\n    aws_secret_access_key = AWS_SECRET_KEY\n\n\nclass DummyS3CopyToTableWithCompressionEncodings(DummyS3CopyToTableKey):\n    columns = (\n        (\"some_text\", \"varchar(255)\", \"LZO\"),\n        (\"some_int\", \"int\", \"DELTA\"),\n    )\n\n\nclass DummyS3CopyToTableRole(DummyS3CopyToTableBase):\n    aws_account_id = AWS_ACCESS_KEY\n    aws_arn_role_name = AWS_SECRET_KEY\n\n\nclass DummyS3CopyToTempTable(DummyS3CopyToTableKey):\n    # Extend/alter DummyS3CopyToTable for temp table copying\n    table = luigi.Parameter(default=\"stage_dummy_table\")\n\n    table_type = \"TEMP\"\n\n    prune_date = \"current_date - 30\"\n    prune_column = \"dumb_date\"\n    prune_table = 
\"stage_dummy_table\"\n\n    queries = [\"insert into dummy_table select * from stage_dummy_table;\"]\n\n\n@pytest.mark.aws\nclass TestInternalCredentials(unittest.TestCase, DummyS3CopyToTableKey):\n    def test_from_property(self):\n        self.assertEqual(self.aws_access_key_id, AWS_ACCESS_KEY)\n        self.assertEqual(self.aws_secret_access_key, AWS_SECRET_KEY)\n\n\n@pytest.mark.aws\nclass TestExternalCredentials(unittest.TestCase, DummyS3CopyToTableBase):\n    @mock.patch.dict(os.environ, {\"AWS_ACCESS_KEY_ID\": \"env_key\", \"AWS_SECRET_ACCESS_KEY\": \"env_secret\"})\n    def test_from_env(self):\n        self.assertEqual(self.aws_access_key_id, \"env_key\")\n        self.assertEqual(self.aws_secret_access_key, \"env_secret\")\n\n    @with_config({\"redshift\": {\"aws_access_key_id\": \"config_key\", \"aws_secret_access_key\": \"config_secret\"}})\n    def test_from_config(self):\n        self.assertEqual(self.aws_access_key_id, \"config_key\")\n        self.assertEqual(self.aws_secret_access_key, \"config_secret\")\n\n\n@pytest.mark.aws\nclass TestS3CopyToTableWithMetaColumns(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable._add_metadata_columns\")\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.post_copy_metacolumns\")\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_copy_with_metadata_columns_enabled(self, mock_redshift_target, mock_add_columns, mock_update_columns, mock_metadata_columns_enabled):\n        task = DummyS3CopyToTableKey()\n        task.run()\n\n        self.assertTrue(mock_add_columns.called)\n        self.assertTrue(mock_update_columns.called)\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=False)\n    
@mock.patch(\"luigi.contrib.redshift.S3CopyToTable._add_metadata_columns\")\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.post_copy_metacolumns\")\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_copy_with_metadata_columns_disabled(self, mock_redshift_target, mock_add_columns, mock_update_columns, mock_metadata_columns_enabled):\n        task = DummyS3CopyToTableKey()\n        task.run()\n\n        self.assertFalse(mock_add_columns.called)\n        self.assertFalse(mock_update_columns.called)\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=True)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable._add_metadata_columns\")\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.post_copy_metacolumns\")\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_json_copy_with_metadata_columns_enabled(self, mock_redshift_target, mock_add_columns, mock_update_columns, mock_metadata_columns_enabled):\n        task = DummyS3CopyJSONToTableBase()\n        task.run()\n\n        self.assertTrue(mock_add_columns.called)\n        self.assertTrue(mock_update_columns.called)\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.enable_metadata_columns\", new_callable=mock.PropertyMock, return_value=False)\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable._add_metadata_columns\")\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.post_copy_metacolumns\")\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_json_copy_with_metadata_columns_disabled(self, mock_redshift_target, mock_add_columns, mock_update_columns, mock_metadata_columns_enabled):\n        task = DummyS3CopyJSONToTableBase()\n        task.run()\n\n        self.assertFalse(mock_add_columns.called)\n        self.assertFalse(mock_update_columns.called)\n\n\n@pytest.mark.aws\nclass TestS3CopyToTable(unittest.TestCase):\n    
@mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_copy_missing_creds(self, mock_redshift_target):\n\n        # Make sure credentials are not set as env vars\n        try:\n            del os.environ[\"AWS_ACCESS_KEY_ID\"]\n            del os.environ[\"AWS_SECRET_ACCESS_KEY\"]\n        except KeyError:\n            pass\n\n        task = DummyS3CopyToTableBase()\n\n        # The mocked connection cursor passed to\n        # S3CopyToTable.copy(self, cursor, f).\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        with self.assertRaises(NotImplementedError):\n            task.copy(mock_cursor, task.s3_load_path())\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.copy\")\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_s3_copy_to_table(self, mock_redshift_target, mock_copy):\n        task = DummyS3CopyToTableKey()\n        task.run()\n\n        # The mocked connection cursor passed to\n        # S3CopyToTable.copy(self, cursor, f).\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        # `mock_redshift_target` is the mocked `RedshiftTarget` object\n        # returned by S3CopyToTable.output(self).\n        mock_redshift_target.assert_called_with(\n            database=task.database, host=task.host, update_id=task.task_id, user=task.user, table=task.table, password=task.password\n        )\n\n        # Check if the `S3CopyToTable.s3_load_path` class attribute was\n        # successfully referenced in the `S3CopyToTable.run` method, which is\n        # in-turn passed to `S3CopyToTable.copy` and other functions in `run`\n        # (see issue #995).\n        mock_copy.assert_called_with(mock_cursor, task.s3_load_path())\n\n        # Check the SQL query in `S3CopyToTable.does_table_exist`.\n        mock_cursor.execute.assert_called_with(\"select 1 as table_exists from pg_table_def where tablename = lower(%s) limit 
1\", (task.table,))\n\n        return\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.does_table_exist\", return_value=False)\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_s3_copy_to_missing_table(self, mock_redshift_target, mock_does_exist):\n        \"\"\"\n        Test missing table creation\n        \"\"\"\n        # Ensure `S3CopyToTable.create_table` does not throw an error.\n        task = DummyS3CopyToTableKey()\n        task.run()\n\n        # Make sure the cursor was successfully used to create the table in\n        # `create_table` as expected.\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n        assert mock_cursor.execute.call_args_list[0][0][0].startswith(\"CREATE  TABLE %s\" % task.table)\n\n        return\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.does_schema_exist\", return_value=False)\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_s3_copy_to_missing_schema(self, mock_redshift_target, mock_does_exist):\n        task = DummyS3CopyToTableKey(table=\"schema.table_with_schema\")\n        task.run()\n\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n        executed_query = mock_cursor.execute.call_args_list[0][0][0]\n        assert executed_query.startswith(\"CREATE SCHEMA IF NOT EXISTS schema\")\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.does_schema_exist\", return_value=False)\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_s3_copy_to_missing_schema_with_no_schema(self, mock_redshift_target, mock_does_exist):\n        task = DummyS3CopyToTableKey(table=\"table_with_no_schema\")\n        task.run()\n\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n        executed_query = mock_cursor.execute.call_args_list[0][0][0]\n        assert not executed_query.startswith(\"CREATE SCHEMA IF 
NOT EXISTS\")\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.does_schema_exist\", return_value=True)\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_s3_copy_to_existing_schema_with_schema(self, mock_redshift_target, mock_does_exist):\n        task = DummyS3CopyToTableKey(table=\"schema.table_with_schema\")\n        task.run()\n\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n        executed_query = mock_cursor.execute.call_args_list[0][0][0]\n        assert not executed_query.startswith(\"CREATE SCHEMA IF NOT EXISTS\")\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.does_table_exist\", return_value=False)\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_s3_copy_to_missing_table_with_compression_encodings(self, mock_redshift_target, mock_does_exist):\n        \"\"\"\n        Test missing table creation with compression encodings\n        \"\"\"\n        # Ensure `S3CopyToTable.create_table` does not throw an error.\n        task = DummyS3CopyToTableWithCompressionEncodings()\n        task.run()\n\n        # Make sure the cursor was successfully used to create the table in\n        # `create_table` as expected.\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n        encode_string = \",\".join(\"{name} {type} ENCODE {encoding}\".format(name=name, type=type, encoding=encoding) for name, type, encoding in task.columns)\n\n        assert mock_cursor.execute.call_args_list[0][0][0].startswith(\"CREATE  TABLE %s (%s )\" % (task.table, encode_string))\n\n        return\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.does_table_exist\", return_value=False)\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_s3_copy_to_missing_table_with_table_constraints(self, mock_redshift_target, mock_does_exist):\n        table_constraints = \"PRIMARY KEY (COL1, COL2)\"\n\n        task 
= DummyS3CopyToTableKey(table_constraints=table_constraints)\n\n        task.run()\n\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n        columns_string = \",\".join(\"{name} {type}\".format(name=name, type=type) for name, type in task.columns)\n\n        executed_query = mock_cursor.execute.call_args_list[0][0][0]\n        expectation = \"CREATE  TABLE %s (%s , PRIMARY KEY (COL1, COL2))\" % (task.table, columns_string)\n\n        assert executed_query.startswith(expectation)\n\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.copy\")\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_s3_copy_to_temp_table(self, mock_redshift_target, mock_copy):\n        task = DummyS3CopyToTempTable()\n        task.run()\n\n        # The mocked connection cursor passed to\n        # S3CopyToTable.copy(self, cursor, f).\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        # `mock_redshift_target` is the mocked `RedshiftTarget` object\n        # returned by S3CopyToTable.output(self).\n        mock_redshift_target.assert_called_once_with(\n            database=task.database,\n            host=task.host,\n            update_id=task.task_id,\n            user=task.user,\n            table=task.table,\n            password=task.password,\n        )\n\n        # Check if the `S3CopyToTable.s3_load_path` class attribute was\n        # successfully referenced in the `S3CopyToTable.run` method, which is\n        # in-turn passed to `S3CopyToTable.copy` and other functions in `run`\n        # (see issue #995).\n        mock_copy.assert_called_once_with(mock_cursor, task.s3_load_path())\n\n        # Check the SQL query in `S3CopyToTable.does_table_exist`. 
# temp table\n        mock_cursor.execute.assert_any_call(\n            \"select 1 as table_exists from pg_table_def where tablename = lower(%s) limit 1\",\n            (task.table,),\n        )\n\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_s3_copy_with_valid_columns(self, mock_redshift_target):\n        task = DummyS3CopyToTableKey()\n        task.run()\n\n        # The mocked connection cursor passed to\n        # S3CopyToTable.copy(self, cursor, f).\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        # `mock_redshift_target` is the mocked `RedshiftTarget` object\n        # returned by S3CopyToTable.output(self).\n        mock_redshift_target.assert_called_once_with(\n            database=task.database,\n            host=task.host,\n            update_id=task.task_id,\n            user=task.user,\n            table=task.table,\n            password=task.password,\n        )\n\n        # To get the proper intendation in the multiline `COPY` statement the\n        # SQL string was copied from redshift.py.\n        mock_cursor.execute.assert_called_with(\n            \"\"\"\n         COPY {table} {colnames} from '{source}'\n         CREDENTIALS '{creds}'\n         {options}\n         ;\"\"\".format(\n                table=\"dummy_table\",\n                colnames=\"(some_text,some_int)\",\n                source=\"s3://bucket/key\",\n                creds=\"aws_access_key_id=key;aws_secret_access_key=secret\",\n                options=\"\",\n            )\n        )\n\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_s3_copy_with_default_columns(self, mock_redshift_target):\n        task = DummyS3CopyToTableKey(columns=[])\n        task.run()\n\n        # The mocked connection cursor passed to\n        # S3CopyToTable.copy(self, cursor, f).\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        # 
`mock_redshift_target` is the mocked `RedshiftTarget` object\n        # returned by S3CopyToTable.output(self).\n        mock_redshift_target.assert_called_once_with(\n            database=task.database,\n            host=task.host,\n            update_id=task.task_id,\n            user=task.user,\n            table=task.table,\n            password=task.password,\n        )\n\n        # To get the proper intendation in the multiline `COPY` statement the\n        # SQL string was copied from redshift.py.\n        mock_cursor.execute.assert_called_with(\n            \"\"\"\n         COPY {table} {colnames} from '{source}'\n         CREDENTIALS '{creds}'\n         {options}\n         ;\"\"\".format(table=\"dummy_table\", colnames=\"\", source=\"s3://bucket/key\", creds=\"aws_access_key_id=key;aws_secret_access_key=secret\", options=\"\")\n        )\n\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_s3_copy_with_nonetype_columns(self, mock_redshift_target):\n        task = DummyS3CopyToTableKey(columns=None)\n        task.run()\n\n        # The mocked connection cursor passed to\n        # S3CopyToTable.copy(self, cursor, f).\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        # `mock_redshift_target` is the mocked `RedshiftTarget` object\n        # returned by S3CopyToTable.output(self).\n        mock_redshift_target.assert_called_once_with(\n            database=task.database,\n            host=task.host,\n            update_id=task.task_id,\n            user=task.user,\n            table=task.table,\n            password=task.password,\n        )\n\n        # To get the proper intendation in the multiline `COPY` statement the\n        # SQL string was copied from redshift.py.\n        mock_cursor.execute.assert_called_with(\n            \"\"\"\n         COPY {table} {colnames} from '{source}'\n         CREDENTIALS '{creds}'\n         {options}\n         
;\"\"\".format(table=\"dummy_table\", colnames=\"\", source=\"s3://bucket/key\", creds=\"aws_access_key_id=key;aws_secret_access_key=secret\", options=\"\")\n        )\n\n\n@pytest.mark.aws\nclass TestS3CopyToSchemaTable(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.redshift.S3CopyToTable.copy\")\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_s3_copy_to_table(self, mock_redshift_target, mock_copy):\n        task = DummyS3CopyToTableKey(table=\"dummy_schema.dummy_table\")\n        task.run()\n\n        # The mocked connection cursor passed to\n        # S3CopyToTable.copy(self, cursor, f).\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        # Check the SQL query in `S3CopyToTable.does_table_exist`.\n        mock_cursor.execute.assert_called_with(\n            \"select 1 as table_exists from information_schema.tables where table_schema = lower(%s) and table_name = lower(%s) limit 1\",\n            tuple(task.table.split(\".\")),\n        )\n\n\nclass DummyRedshiftUnloadTask(luigi.contrib.redshift.RedshiftUnloadTask):\n    # Class attributes taken from `DummyPostgresImporter` in\n    # `../postgres_test.py`.\n    host = \"dummy_host\"\n    database = \"dummy_database\"\n    user = \"dummy_user\"\n    password = \"dummy_password\"\n    table = luigi.Parameter(default=\"dummy_table\")\n    columns = (\n        (\"some_text\", \"varchar(255)\"),\n        (\"some_int\", \"int\"),\n    )\n\n    aws_access_key_id = \"AWS_ACCESS_KEY\"\n    aws_secret_access_key = \"AWS_SECRET_KEY\"\n\n    s3_unload_path = \"s3://%s/%s\" % (BUCKET, KEY)\n    unload_options = \"DELIMITER ',' ADDQUOTES GZIP ALLOWOVERWRITE PARALLEL OFF\"\n\n    def query(self):\n        return \"SELECT 'a' as col_a, current_date as col_b\"\n\n\n@pytest.mark.aws\nclass TestRedshiftUnloadTask(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_redshift_unload_command(self, 
mock_redshift_target):\n\n        task = DummyRedshiftUnloadTask()\n        task.run()\n\n        # The mocked connection cursor passed to\n        # RedshiftUnloadTask.\n        mock_cursor = mock_redshift_target.return_value.connect.return_value.cursor.return_value\n\n        # Check the Unload query.\n        mock_cursor.execute.assert_called_with(\n            \"UNLOAD ( 'SELECT \\\\'a\\\\' as col_a, current_date as col_b' ) TO 's3://bucket/key' \"\n            \"credentials 'aws_access_key_id=AWS_ACCESS_KEY;aws_secret_access_key=AWS_SECRET_KEY' \"\n            \"DELIMITER ',' ADDQUOTES GZIP ALLOWOVERWRITE PARALLEL OFF;\"\n        )\n\n\nclass DummyRedshiftAutocommitQuery(luigi.contrib.redshift.RedshiftQuery):\n    # Class attributes taken from `DummyPostgresImporter` in\n    # `../postgres_test.py`.\n    host = \"dummy_host\"\n    database = \"dummy_database\"\n    user = \"dummy_user\"\n    password = \"dummy_password\"\n    table = luigi.Parameter(default=\"dummy_table\")\n    autocommit = True\n\n    def query(self):\n        return \"SELECT 'a' as col_a, current_date as col_b\"\n\n\n@pytest.mark.aws\nclass TestRedshiftAutocommitQuery(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.redshift.RedshiftTarget\")\n    def test_redshift_autocommit_query(self, mock_redshift_target):\n\n        task = DummyRedshiftAutocommitQuery()\n        task.run()\n\n        # The mocked connection cursor passed to\n        # RedshiftUnloadTask.\n        mock_connect = mock_redshift_target.return_value.connect.return_value\n\n        # Check the Unload query.\n        self.assertTrue(mock_connect.autocommit)\n\n\n@pytest.mark.aws\nclass TestRedshiftManifestTask(unittest.TestCase):\n    def test_run(self):\n        with mock_s3():\n            client = S3Client()\n            client.s3.meta.client.create_bucket(Bucket=BUCKET)\n            for key in FILES:\n                k = \"%s/%s\" % (KEY, key)\n                client.put_string(\"\", \"s3://%s/%s\" % (BUCKET, k))\n    
        folder_path = \"s3://%s/%s\" % (BUCKET, KEY)\n            path = \"s3://%s/%s/%s\" % (BUCKET, \"manifest\", \"test.manifest\")\n            folder_paths = [folder_path]\n\n            m = mock.mock_open()\n            with mock.patch(\"luigi.contrib.s3.S3Target.open\", m, create=True):\n                t = redshift.RedshiftManifestTask(path, folder_paths)\n                luigi.build([t], local_scheduler=True)\n\n            expected_manifest_output = json.dumps(generate_manifest_json(folder_paths, FILES))\n\n            handle = m()\n            handle.write.assert_called_with(expected_manifest_output)\n\n    def test_run_multiple_paths(self):\n        with mock_s3():\n            client = S3Client()\n            client.s3.meta.client.create_bucket(Bucket=BUCKET)\n            for parent in [KEY, KEY_2]:\n                for key in FILES:\n                    k = \"%s/%s\" % (parent, key)\n                    client.put_string(\"\", \"s3://%s/%s\" % (BUCKET, k))\n            folder_path_1 = \"s3://%s/%s\" % (BUCKET, KEY)\n            folder_path_2 = \"s3://%s/%s\" % (BUCKET, KEY_2)\n            folder_paths = [folder_path_1, folder_path_2]\n            path = \"s3://%s/%s/%s\" % (BUCKET, \"manifest\", \"test.manifest\")\n\n            m = mock.mock_open()\n            with mock.patch(\"luigi.contrib.s3.S3Target.open\", m, create=True):\n                t = redshift.RedshiftManifestTask(path, folder_paths)\n                luigi.build([t], local_scheduler=True)\n\n            expected_manifest_output = json.dumps(generate_manifest_json(folder_paths, FILES))\n            handle = m()\n            handle.write.assert_called_with(expected_manifest_output)\n"
  },
  {
    "path": "test/contrib/s3_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2013 Mortar Data\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n#\n\nimport os\nimport sys\nimport tempfile\n\nimport boto3\n\nif sys.version_info[:2] <= (3, 11):\n    from boto.s3 import key\nimport pytest\nfrom botocore.exceptions import ClientError\nfrom helpers import skipOnTravisAndGithubActions, unittest, with_config\nfrom mock import patch\nfrom moto import mock_s3, mock_sts\nfrom target_test import FileSystemTargetTestMixin\n\nfrom luigi.contrib.s3 import DeprecatedBotoClientException, FileNotFoundException, InvalidDeleteException, S3Client, S3Target\nfrom luigi.target import MissingParentDirectory\n\nif (3, 4, 0) <= sys.version_info[:3] < (3, 4, 3):\n    # spulec/moto#308\n    raise unittest.SkipTest(\"moto mock doesn't work with python3.4\")\n\n\nAWS_ACCESS_KEY = \"XXXXXXXXXXXXXXXXXXXX\"\nAWS_SECRET_KEY = \"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\"\nAWS_SESSION_TOKEN = \"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\"\n\n\ndef create_bucket():\n    conn = boto3.resource(\"s3\", region_name=\"us-east-1\")\n    # We need to create the bucket since this is all in Moto's 'virtual' AWS account\n    conn.create_bucket(Bucket=\"mybucket\")\n    return conn\n\n\n@pytest.mark.aws\nclass TestS3Target(unittest.TestCase, FileSystemTargetTestMixin):\n    def setUp(self):\n        f = tempfile.NamedTemporaryFile(mode=\"wb\", delete=False)\n        self.tempFileContents = b\"I'm a temporary file for 
testing\\nAnd this is the second line\\nThis is the third.\"\n        self.tempFilePath = f.name\n        f.write(self.tempFileContents)\n        f.close()\n        self.addCleanup(os.remove, self.tempFilePath)\n\n        self.mock_s3 = mock_s3()\n        self.mock_s3.start()\n        self.addCleanup(self.mock_s3.stop)\n\n    def create_target(self, format=None, **kwargs):\n        client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        create_bucket()\n        return S3Target(\"s3://mybucket/test_file\", client=client, format=format, **kwargs)\n\n    def create_target_with_session(self, format=None, **kwargs):\n        client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_SESSION_TOKEN)\n        create_bucket()\n        return S3Target(\"s3://mybucket/test_file\", client=client, format=format, **kwargs)\n\n    def test_read(self):\n        client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        create_bucket()\n        client.put(self.tempFilePath, \"s3://mybucket/tempfile\")\n        t = S3Target(\"s3://mybucket/tempfile\", client=client)\n        read_file = t.open()\n        file_str = read_file.read()\n        self.assertEqual(self.tempFileContents, file_str.encode(\"utf-8\"))\n\n    def test_read_with_session(self):\n        client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_SESSION_TOKEN)\n        create_bucket()\n        client.put(self.tempFilePath, \"s3://mybucket/tempfile-with-session\")\n        t = S3Target(\"s3://mybucket/tempfile-with-session\", client=client)\n        read_file = t.open()\n        file_str = read_file.read()\n        self.assertEqual(self.tempFileContents, file_str.encode(\"utf-8\"))\n\n    def test_read_no_file(self):\n        t = self.create_target()\n        self.assertRaises(FileNotFoundException, t.open)\n\n    def test_read_no_file_with_session(self):\n        t = self.create_target_with_session()\n        self.assertRaises(FileNotFoundException, t.open)\n\n    def test_read_no_file_sse(self):\n        t = 
self.create_target(encrypt_key=True)\n        self.assertRaises(FileNotFoundException, t.open)\n\n    @unittest.skipIf(tuple(sys.version_info) >= (3, 12), \"boto is not supported on Python 3.12+\")\n    def test_read_iterator_long(self):\n        # write a file that is 5X the boto buffersize\n        # to test line buffering\n        old_buffer = key.Key.BufferSize\n        key.Key.BufferSize = 2\n        try:\n            tempf = tempfile.NamedTemporaryFile(mode=\"wb\", delete=False)\n            temppath = tempf.name\n            firstline = \"\".zfill(key.Key.BufferSize * 5) + os.linesep\n            secondline = \"line two\" + os.linesep\n            thirdline = \"line three\" + os.linesep\n            contents = firstline + secondline + thirdline\n            tempf.write(contents.encode(\"utf-8\"))\n            tempf.close()\n\n            client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n            create_bucket()\n            remote_path = \"s3://mybucket/largetempfile\"\n            client.put(temppath, remote_path)\n            t = S3Target(remote_path, client=client)\n            with t.open() as read_file:\n                lines = [line for line in read_file]\n        finally:\n            key.Key.BufferSize = old_buffer\n\n        self.assertEqual(3, len(lines))\n        self.assertEqual(firstline, lines[0])\n        self.assertEqual(secondline, lines[1])\n        self.assertEqual(thirdline, lines[2])\n\n    def test_get_path(self):\n        t = self.create_target()\n        path = t.path\n        self.assertEqual(\"s3://mybucket/test_file\", path)\n\n    def test_get_path_sse(self):\n        t = self.create_target(encrypt_key=True)\n        path = t.path\n        self.assertEqual(\"s3://mybucket/test_file\", path)\n\n\n@pytest.mark.aws\nclass TestS3Client(unittest.TestCase):\n    def setUp(self):\n        f = tempfile.NamedTemporaryFile(mode=\"wb\", delete=False)\n        self.tempFilePath = f.name\n        self.tempFileContents = b\"I'm a temporary 
file for testing\\n\"\n        f.write(self.tempFileContents)\n        f.close()\n        self.addCleanup(os.remove, self.tempFilePath)\n\n        self.mock_s3 = mock_s3()\n        self.mock_s3.start()\n        self.mock_sts = mock_sts()\n        self.mock_sts.start()\n        self.addCleanup(self.mock_s3.stop)\n        self.addCleanup(self.mock_sts.stop)\n\n    @patch(\"boto3.resource\")\n    def test_init_without_init_or_config(self, mock):\n        \"\"\"If no config or arn provided, boto3 client\n        should be called with default parameters.\n        Delegating ENV or Task Role credential handling\n        to boto3 itself.\n        \"\"\"\n        S3Client().s3\n        mock.assert_called_with(\"s3\", aws_access_key_id=None, aws_secret_access_key=None, aws_session_token=None)\n\n    @with_config({\"s3\": {\"aws_access_key_id\": \"foo\", \"aws_secret_access_key\": \"bar\"}})\n    @patch(\"boto3.resource\")\n    def test_init_with_config(self, mock):\n        S3Client().s3\n        mock.assert_called_with(\"s3\", aws_access_key_id=\"foo\", aws_secret_access_key=\"bar\", aws_session_token=None)\n\n    @patch(\"boto3.resource\")\n    @patch(\"boto3.client\")\n    @with_config({\"s3\": {\"aws_role_arn\": \"role\", \"aws_role_session_name\": \"name\"}})\n    def test_init_with_config_and_roles(self, sts_mock, s3_mock):\n        S3Client().s3\n        sts_mock.client.assume_role.called_with(RoleArn=\"role\", RoleSessionName=\"name\")\n\n    @patch(\"boto3.client\")\n    def test_init_with_host_deprecated(self, mock):\n        with self.assertRaises(DeprecatedBotoClientException):\n            S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY, host=\"us-east-1\").s3\n\n    def test_put(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        s3_client.put(self.tempFilePath, \"s3://mybucket/putMe\")\n        self.assertTrue(s3_client.exists(\"s3://mybucket/putMe\"))\n\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY, 
AWS_SESSION_TOKEN)\n        s3_client.put(self.tempFilePath, \"s3://mybucket/putMe\")\n        self.assertTrue(s3_client.exists(\"s3://mybucket/putMe\"))\n\n    def test_put_no_such_bucket(self):\n        # intentionally don't create bucket\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        with self.assertRaises(s3_client.s3.meta.client.exceptions.NoSuchBucket):\n            s3_client.put(self.tempFilePath, \"s3://mybucket/putMe\")\n\n    def test_put_sse_deprecated(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        with self.assertRaises(DeprecatedBotoClientException):\n            s3_client.put(self.tempFilePath, \"s3://mybucket/putMe\", encrypt_key=True)\n\n    def test_put_host_deprecated(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        with self.assertRaises(DeprecatedBotoClientException):\n            s3_client.put(self.tempFilePath, \"s3://mybucket/putMe\", host=\"us-east-1\")\n\n    def test_put_string(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        s3_client.put_string(\"SOMESTRING\", \"s3://mybucket/putString\")\n        self.assertTrue(s3_client.exists(\"s3://mybucket/putString\"))\n\n    def test_put_string_no_such_bucket(self):\n        # intentionally don't create bucket\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        with self.assertRaises(s3_client.s3.meta.client.exceptions.NoSuchBucket):\n            s3_client.put_string(\"SOMESTRING\", \"s3://mybucket/putString\")\n\n    def test_put_string_sse_deprecated(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        with self.assertRaises(DeprecatedBotoClientException):\n            s3_client.put(\"SOMESTRING\", \"s3://mybucket/putMe\", encrypt_key=True)\n\n    def test_put_string_host_deprecated(self):\n        create_bucket()\n        s3_client = 
S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        with self.assertRaises(DeprecatedBotoClientException):\n            s3_client.put(\"SOMESTRING\", \"s3://mybucket/putMe\", host=\"us-east-1\")\n\n    @skipOnTravisAndGithubActions(\"passes and fails intermitantly, suspecting it's a race condition not handled by moto\")\n    def test_put_multipart_multiple_parts_non_exact_fit(self):\n        \"\"\"\n        Test a multipart put with two parts, where the parts are not exactly the split size.\n        \"\"\"\n        # 5MB is minimum part size\n        part_size = 8388608\n        file_size = (part_size * 2) - 1000\n        return self._run_multipart_test(part_size, file_size)\n\n    @skipOnTravisAndGithubActions(\"passes and fails intermitantly, suspecting it's a race condition not handled by moto\")\n    def test_put_multipart_multiple_parts_exact_fit(self):\n        \"\"\"\n        Test a multipart put with multiple parts, where the parts are exactly the split size.\n        \"\"\"\n        # 5MB is minimum part size\n        part_size = 8388608\n        file_size = part_size * 2\n        return self._run_multipart_test(part_size, file_size)\n\n    def test_put_multipart_multiple_parts_with_sse_deprecated(self):\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        with self.assertRaises(DeprecatedBotoClientException):\n            s3_client.put_multipart(\"path\", \"path\", encrypt_key=True)\n\n    def test_put_multipart_multiple_parts_with_host_deprecated(self):\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        with self.assertRaises(DeprecatedBotoClientException):\n            s3_client.put_multipart(\"path\", \"path\", host=\"us-east-1\")\n\n    def test_put_multipart_empty_file(self):\n        \"\"\"\n        Test a multipart put with an empty file.\n        \"\"\"\n        # 5MB is minimum part size\n        part_size = 8388608\n        file_size = 0\n        return self._run_multipart_test(part_size, file_size)\n\n    def 
test_put_multipart_less_than_split_size(self):\n        \"\"\"\n        Test a multipart put with a file smaller than split size; should revert to regular put.\n        \"\"\"\n        # 5MB is minimum part size\n        part_size = 8388608\n        file_size = 5000\n        return self._run_multipart_test(part_size, file_size)\n\n    def test_put_multipart_no_such_bucket(self):\n        # intentionally don't create bucket\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        with self.assertRaises(s3_client.s3.meta.client.exceptions.NoSuchBucket):\n            s3_client.put_multipart(self.tempFilePath, \"s3://mybucket/putMe\")\n\n    def test_exists(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        self.assertTrue(s3_client.exists(\"s3://mybucket/\"))\n        self.assertTrue(s3_client.exists(\"s3://mybucket\"))\n        self.assertFalse(s3_client.exists(\"s3://mybucket/nope\"))\n        self.assertFalse(s3_client.exists(\"s3://mybucket/nope/\"))\n\n        s3_client.put(self.tempFilePath, \"s3://mybucket/tempfile\")\n        self.assertTrue(s3_client.exists(\"s3://mybucket/tempfile\"))\n        self.assertFalse(s3_client.exists(\"s3://mybucket/temp\"))\n\n        s3_client.put(self.tempFilePath, \"s3://mybucket/tempdir0_$folder$\")\n        self.assertTrue(s3_client.exists(\"s3://mybucket/tempdir0\"))\n\n        s3_client.put(self.tempFilePath, \"s3://mybucket/tempdir1/\")\n        self.assertTrue(s3_client.exists(\"s3://mybucket/tempdir1\"))\n\n        s3_client.put(self.tempFilePath, \"s3://mybucket/tempdir2/subdir\")\n        self.assertTrue(s3_client.exists(\"s3://mybucket/tempdir2\"))\n        self.assertFalse(s3_client.exists(\"s3://mybucket/tempdir\"))\n\n    def test_get(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        s3_client.put(self.tempFilePath, \"s3://mybucket/putMe\")\n\n        tmp_file = 
tempfile.NamedTemporaryFile(delete=True)\n        tmp_file_path = tmp_file.name\n\n        s3_client.get(\"s3://mybucket/putMe\", tmp_file_path)\n        with open(tmp_file_path, \"r\") as f:\n            content = f.read()\n        self.assertEqual(content, self.tempFileContents.decode(\"utf-8\"))\n        tmp_file.close()\n\n    def test_get_as_bytes(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        s3_client.put(self.tempFilePath, \"s3://mybucket/putMe\")\n\n        contents = s3_client.get_as_bytes(\"s3://mybucket/putMe\")\n\n        self.assertEqual(contents, self.tempFileContents)\n\n    def test_get_as_string(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        s3_client.put(self.tempFilePath, \"s3://mybucket/putMe2\")\n\n        contents = s3_client.get_as_string(\"s3://mybucket/putMe2\")\n\n        self.assertEqual(contents, self.tempFileContents.decode(\"utf-8\"))\n\n    def test_get_as_string_latin1(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        s3_client.put(self.tempFilePath, \"s3://mybucket/putMe3\")\n\n        contents = s3_client.get_as_string(\"s3://mybucket/putMe3\", encoding=\"ISO-8859-1\")\n\n        self.assertEqual(contents, self.tempFileContents.decode(\"ISO-8859-1\"))\n\n    def test_get_key(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        s3_client.put(self.tempFilePath, \"s3://mybucket/key_to_find\")\n        self.assertTrue(s3_client.get_key(\"s3://mybucket/key_to_find\").key)\n        self.assertFalse(s3_client.get_key(\"s3://mybucket/does_not_exist\"))\n\n    def test_isdir(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        self.assertTrue(s3_client.isdir(\"s3://mybucket\"))\n\n        s3_client.put(self.tempFilePath, \"s3://mybucket/tempdir0_$folder$\")\n        
self.assertTrue(s3_client.isdir(\"s3://mybucket/tempdir0\"))\n\n        s3_client.put(self.tempFilePath, \"s3://mybucket/tempdir1/\")\n        self.assertTrue(s3_client.isdir(\"s3://mybucket/tempdir1\"))\n\n        s3_client.put(self.tempFilePath, \"s3://mybucket/key\")\n        self.assertFalse(s3_client.isdir(\"s3://mybucket/key\"))\n\n    def test_mkdir(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        self.assertTrue(s3_client.isdir(\"s3://mybucket\"))\n        s3_client.mkdir(\"s3://mybucket\")\n\n        s3_client.mkdir(\"s3://mybucket/dir\")\n        self.assertTrue(s3_client.isdir(\"s3://mybucket/dir\"))\n\n        self.assertRaises(MissingParentDirectory, s3_client.mkdir, \"s3://mybucket/dir/foo/bar\", parents=False)\n\n        self.assertFalse(s3_client.isdir(\"s3://mybucket/dir/foo/bar\"))\n\n    def test_listdir(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        s3_client.put_string(\"\", \"s3://mybucket/hello/frank\")\n        s3_client.put_string(\"\", \"s3://mybucket/hello/world\")\n\n        self.assertEqual([\"s3://mybucket/hello/frank\", \"s3://mybucket/hello/world\"], list(s3_client.listdir(\"s3://mybucket/hello\")))\n\n    def test_list(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        s3_client.put_string(\"\", \"s3://mybucket/hello/frank\")\n        s3_client.put_string(\"\", \"s3://mybucket/hello/world\")\n\n        self.assertEqual([\"frank\", \"world\"], list(s3_client.list(\"s3://mybucket/hello\")))\n\n    def test_listdir_key(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        s3_client.put_string(\"\", \"s3://mybucket/hello/frank\")\n        s3_client.put_string(\"\", \"s3://mybucket/hello/world\")\n\n        self.assertEqual(\n            [True, True], [s3_client.exists(\"s3://\" + x.bucket_name + \"/\" + x.key) for x in 
s3_client.listdir(\"s3://mybucket/hello\", return_key=True)]\n        )\n\n    def test_list_key(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        s3_client.put_string(\"\", \"s3://mybucket/hello/frank\")\n        s3_client.put_string(\"\", \"s3://mybucket/hello/world\")\n\n        self.assertEqual(\n            [True, True], [s3_client.exists(\"s3://\" + x.bucket_name + \"/\" + x.key) for x in s3_client.listdir(\"s3://mybucket/hello\", return_key=True)]\n        )\n\n    def test_remove_bucket_dne(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        self.assertRaises(ClientError, lambda: s3_client.remove(\"s3://bucketdoesnotexist/file\"))\n\n    def test_remove_file_dne(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        self.assertFalse(s3_client.remove(\"s3://mybucket/doesNotExist\"))\n\n    def test_remove_file(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        s3_client.put(self.tempFilePath, \"s3://mybucket/existingFile0\")\n        self.assertTrue(s3_client.remove(\"s3://mybucket/existingFile0\"))\n        self.assertFalse(s3_client.exists(\"s3://mybucket/existingFile0\"))\n\n    def test_remove_invalid(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        self.assertRaises(InvalidDeleteException, lambda: s3_client.remove(\"s3://mybucket/\"))\n\n    def test_remove_invalid_no_slash(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        self.assertRaises(InvalidDeleteException, lambda: s3_client.remove(\"s3://mybucket\"))\n\n    def test_remove_dir_not_recursive(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        s3_client.put(self.tempFilePath, \"s3://mybucket/removemedir/file\")\n        
self.assertRaises(InvalidDeleteException, lambda: s3_client.remove(\"s3://mybucket/removemedir\", recursive=False))\n\n    def test_remove_dir(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        # test that the marker file created by Hadoop S3 Native FileSystem is removed\n        s3_client.put(self.tempFilePath, \"s3://mybucket/removemedir/file\")\n        s3_client.put_string(\"\", \"s3://mybucket/removemedir_$folder$\")\n        self.assertTrue(s3_client.remove(\"s3://mybucket/removemedir\"))\n        self.assertFalse(s3_client.exists(\"s3://mybucket/removemedir_$folder$\"))\n\n    def test_remove_dir_batch(self):\n        create_bucket()\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        for i in range(0, 2000):\n            s3_client.put(self.tempFilePath, \"s3://mybucket/removemedir/file{i}\".format(i=i))\n        self.assertTrue(s3_client.remove(\"s3://mybucket/removemedir/\"))\n        self.assertFalse(s3_client.exists(\"s3://mybucket/removedir/\"))\n\n    @skipOnTravisAndGithubActions(\"passes and fails intermitantly, suspecting it's a race condition not handled by moto\")\n    def test_copy_multiple_parts_non_exact_fit(self):\n        \"\"\"\n        Test a multipart put with two parts, where the parts are not exactly the split size.\n        \"\"\"\n        # First, put a file into S3\n        self._run_copy_test(self.test_put_multipart_multiple_parts_non_exact_fit)\n\n    @skipOnTravisAndGithubActions(\"passes and fails intermitantly, suspecting it's a race condition not handled by moto\")\n    def test_copy_multiple_parts_exact_fit(self):\n        \"\"\"\n        Test a copy multiple parts, where the parts are exactly the split size.\n        \"\"\"\n        self._run_copy_test(self.test_put_multipart_multiple_parts_exact_fit)\n\n    def test_copy_less_than_split_size(self):\n        \"\"\"\n        Test a copy with a file smaller than split size; should revert to regular put.\n 
       \"\"\"\n        self._run_copy_test(self.test_put_multipart_less_than_split_size)\n\n    def test_copy_empty_file(self):\n        \"\"\"\n        Test a copy with an empty file.\n        \"\"\"\n        self._run_copy_test(self.test_put_multipart_empty_file)\n\n    @mock_s3\n    def test_copy_empty_dir(self):\n        \"\"\"\n        Test copying an empty dir\n        \"\"\"\n        create_bucket()\n\n        s3_dir = \"s3://mybucket/copydir/\"\n\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        s3_client.mkdir(s3_dir)\n        self.assertTrue(s3_client.exists(s3_dir))\n\n        s3_dest = \"s3://mybucket/copydir_new/\"\n        response = s3_client.copy(s3_dir, s3_dest)\n\n        self._run_copy_response_test(response, expected_num=0, expected_size=0)\n\n    @mock_s3\n    @skipOnTravisAndGithubActions(\"https://travis-ci.org/spotify/luigi/jobs/145895385\")\n    def test_copy_dir(self):\n        \"\"\"\n        Test copying 20 files from one folder to another\n        \"\"\"\n        create_bucket()\n        n = 20\n        copy_part_size = (1024**2) * 5\n\n        # Note we can't test the multipart copy due to moto issue #526\n        # so here I have to keep the file size smaller than the copy_part_size\n        file_size = 5000\n\n        s3_dir = \"s3://mybucket/copydir/\"\n        file_contents = b\"a\" * file_size\n        tmp_file = tempfile.NamedTemporaryFile(mode=\"wb\", delete=True)\n        tmp_file_path = tmp_file.name\n        tmp_file.write(file_contents)\n        tmp_file.flush()\n\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        for i in range(n):\n            file_path = s3_dir + str(i)\n            s3_client.put_multipart(tmp_file_path, file_path)\n            self.assertTrue(s3_client.exists(file_path))\n\n        s3_dest = \"s3://mybucket/copydir_new/\"\n        response = s3_client.copy(s3_dir, s3_dest, threads=10, part_size=copy_part_size)\n\n        self._run_copy_response_test(response, 
expected_num=n, expected_size=(n * file_size))\n\n        for i in range(n):\n            original_size = s3_client.get_key(s3_dir + str(i)).size\n            copy_size = s3_client.get_key(s3_dest + str(i)).size\n            self.assertEqual(original_size, copy_size)\n\n    def test__path_to_bucket_and_key(self):\n        self.assertEqual((\"bucket\", \"key\"), S3Client._path_to_bucket_and_key(\"s3://bucket/key\"))\n\n    def test__path_to_bucket_and_key_with_question_mark(self):\n        self.assertEqual((\"bucket\", \"key?blade\"), S3Client._path_to_bucket_and_key(\"s3://bucket/key?blade\"))\n\n    @mock_s3\n    def _run_copy_test(self, put_method, is_multipart=False):\n        create_bucket()\n        # Run the method to put the file into s3 into the first place\n        expected_num, expected_size = put_method()\n\n        # As all the multipart put methods use `self._run_multipart_test`\n        # we can just use this key\n        original = \"s3://mybucket/putMe\"\n        copy = \"s3://mybucket/putMe_copy\"\n\n        # Copy the file from old location to new\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n        if is_multipart:\n            # 5MB is minimum part size, use it here so we don't have to generate huge files to test\n            # the multipart upload in moto\n            part_size = (1024**2) * 5\n            response = s3_client.copy(original, copy, part_size=part_size, threads=4)\n        else:\n            response = s3_client.copy(original, copy, threads=4)\n\n        self._run_copy_response_test(response, expected_num=expected_num, expected_size=expected_size)\n\n        # We can't use etags to compare between multipart and normal keys,\n        # so we fall back to using the file size\n        original_size = s3_client.get_key(original).size\n        copy_size = s3_client.get_key(copy).size\n        self.assertEqual(original_size, copy_size)\n\n    @mock_s3\n    def _run_multipart_test(self, part_size, file_size, 
**kwargs):\n        create_bucket()\n        file_contents = b\"a\" * file_size\n\n        s3_path = \"s3://mybucket/putMe\"\n        tmp_file = tempfile.NamedTemporaryFile(mode=\"wb\", delete=True)\n        tmp_file_path = tmp_file.name\n        tmp_file.write(file_contents)\n        tmp_file.flush()\n\n        s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)\n\n        s3_client.put_multipart(tmp_file_path, s3_path, part_size=part_size, **kwargs)\n        self.assertTrue(s3_client.exists(s3_path))\n        file_size = os.path.getsize(tmp_file.name)\n        key_size = s3_client.get_key(s3_path).size\n        self.assertEqual(file_size, key_size)\n        tmp_file.close()\n\n        return 1, key_size\n\n    def _run_copy_response_test(self, response, expected_num=None, expected_size=None):\n        num, size = response\n        self.assertIsInstance(response, tuple)\n\n        # only check >= minimum possible value if not provided expected value\n        self.assertEqual(num, expected_num) if expected_num is not None else self.assertGreaterEqual(num, 1)\n        self.assertEqual(size, expected_size) if expected_size is not None else self.assertGreaterEqual(size, 0)\n"
  },
  {
    "path": "test/contrib/salesforce_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2016 Simply Measured\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n#\n# This method will be used by the mock to replace requests.get\n\n\"\"\"\nUnit test for the Salesforce contrib package\n\"\"\"\n\nimport re\n\nimport mock\nimport pytest\nfrom helpers import unittest\n\nfrom luigi.contrib.salesforce import QuerySalesforce, SalesforceAPI\nfrom luigi.mock import MockTarget\n\n\ndef mocked_requests_get(*args, **kwargs):\n    class MockResponse:\n        def __init__(self, body, status_code):\n            self.body = body\n            self.status_code = status_code\n\n        @property\n        def text(self):\n            return self.body\n\n        def raise_for_status(self):\n            return None\n\n    result_list = (\n        '<result-list xmlns=\"http://www.force.com/2009/06/asyncapi/dataload\"><result>1234</result><result>1235</result><result>1236</result></result-list>'\n    )\n    return MockResponse(result_list, 200)\n\n\n# Keep open around so we can use it in the mock responses\nold__open = open\n\n\ndef mocked_open(*args, **kwargs):\n    if re.match(\"job_data\", str(args[0])):\n        return MockTarget(args[0]).open(args[1])\n    else:\n        return old__open(*args)\n\n\n@pytest.mark.contrib\nclass TestSalesforceAPI(unittest.TestCase):\n    # We patch 'requests.get' with our own method. 
The mock object is passed in to our test case method.\n    @mock.patch(\"requests.get\", side_effect=mocked_requests_get)\n    def test_deprecated_results_warning(self, mock_get):\n        sf = SalesforceAPI(\"xx\", \"xx\", \"xx\")\n        with self.assertWarnsRegex(UserWarning, r\"get_batch_results is deprecated\"):\n            result_id = sf.get_batch_results(\"job_id\", \"batch_id\")\n            self.assertEqual(\"1234\", result_id)\n\n    @mock.patch(\"requests.get\", side_effect=mocked_requests_get)\n    def test_result_ids(self, mock_get):\n        sf = SalesforceAPI(\"xx\", \"xx\", \"xx\")\n        result_ids = sf.get_batch_result_ids(\"job_id\", \"batch_id\")\n        self.assertEqual([\"1234\", \"1235\", \"1236\"], result_ids)\n\n\nclass TestQuerySalesforce(QuerySalesforce):\n    def output(self):\n        return MockTarget(\"job_data.csv\")\n\n    @property\n    def object_name(self):\n        return \"dual\"\n\n    @property\n    def soql(self):\n        return \"SELECT * FROM %s\" % self.object_name\n\n\n@pytest.mark.contrib\nclass TestSalesforceQuery(unittest.TestCase):\n    @mock.patch(\"builtins.open\", side_effect=mocked_open)\n    def setUp(self, mock_open):\n        MockTarget.fs.clear()\n        self.result_ids = [\"a\", \"b\", \"c\"]\n\n        counter = 1\n        self.all_lines = \"Lines\\n\"\n        self.header = \"Lines\"\n        for i, id in enumerate(self.result_ids):\n            filename = \"%s.%d\" % (\"job_data.csv\", i)\n            with MockTarget(filename).open(\"w\") as f:\n                line = \"%d line\\n%d line\" % ((counter), (counter + 1))\n                f.write(self.header + \"\\n\" + line + \"\\n\")\n                self.all_lines += line + \"\\n\"\n                counter += 2\n\n    @mock.patch(\"builtins.open\", side_effect=mocked_open)\n    def test_multi_csv_download(self, mock_open):\n        qsf = TestQuerySalesforce()\n\n        qsf.merge_batch_results(self.result_ids)\n        
self.assertEqual(MockTarget(qsf.output().path).open(\"r\").read(), self.all_lines)\n"
  },
  {
    "path": "test/contrib/scalding_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\nimport random\nimport shutil\nimport tempfile\nimport unittest\n\nimport mock\nimport pytest\n\nimport luigi\nfrom luigi.contrib import scalding\n\n\nclass MyScaldingTask(scalding.ScaldingJobTask):\n    scala_source = luigi.Parameter()\n\n    def source(self):\n        return self.scala_source\n\n\n@pytest.mark.contrib\nclass ScaldingTest(unittest.TestCase):\n    def setUp(self):\n        self.scalding_home = os.path.join(tempfile.gettempdir(), \"scalding-%09d\" % random.randint(0, 999999999))\n        os.mkdir(self.scalding_home)\n        self.lib_dir = os.path.join(self.scalding_home, \"lib\")\n        os.mkdir(self.lib_dir)\n        os.mkdir(os.path.join(self.scalding_home, \"provided\"))\n        os.mkdir(os.path.join(self.scalding_home, \"libjars\"))\n        f = open(os.path.join(self.lib_dir, \"scalding-core-foo\"), \"w\")\n        f.close()\n\n        self.scala_source = os.path.join(self.scalding_home, \"my_source.scala\")\n        f = open(self.scala_source, \"w\")\n        f.write(\"class foo extends Job\")\n        f.close()\n\n        os.environ[\"SCALDING_HOME\"] = self.scalding_home\n\n    def tearDown(self):\n        shutil.rmtree(self.scalding_home)\n\n    @mock.patch(\"subprocess.check_call\")\n    @mock.patch(\"luigi.contrib.hadoop.run_and_track_hadoop_job\")\n    def test_scalding(self, check_call, 
track_job):\n        success = luigi.run([\"MyScaldingTask\", \"--scala-source\", self.scala_source, \"--local-scheduler\", \"--no-lock\"])\n        self.assertTrue(success)\n        # TODO: check more stuff\n"
  },
  {
    "path": "test/contrib/sge_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport logging\nimport os\nimport os.path\nimport subprocess\nimport unittest\nfrom glob import glob\n\nimport pytest\nfrom mock import patch\n\nimport luigi\nfrom luigi.contrib.sge import SGEJobTask, _parse_qstat_state\n\nDEFAULT_HOME = \"/home\"\n\nlogger = logging.getLogger(\"luigi-interface\")\n\n\nQSTAT_OUTPUT = \"\"\"job-ID  prior   name       user         state submit/start at     queue                          slots ja-task-ID\n-----------------------------------------------------------------------------------------------------------------\n     1 0.55500 job1 root         r     07/09/2015 16:56:45 all.q@node001                      1\n     2 0.55500 job2 root         qw    07/09/2015 16:56:42                                    1\n     3 0.00000 job3 root         t    07/09/2015 16:56:45                                    1\n\"\"\"\n\n\ndef on_sge_master():\n    try:\n        subprocess.check_output(\"qstat\", shell=True)\n        return True\n    except subprocess.CalledProcessError:\n        return False\n\n\n@pytest.mark.contrib\nclass TestSGEWrappers(unittest.TestCase):\n    def test_track_job(self):\n        \"\"\"`track_job` returns the state using qstat\"\"\"\n        self.assertEqual(_parse_qstat_state(QSTAT_OUTPUT, 1), \"r\")\n        self.assertEqual(_parse_qstat_state(QSTAT_OUTPUT, 2), \"qw\")\n        
self.assertEqual(_parse_qstat_state(QSTAT_OUTPUT, 3), \"t\")\n        self.assertEqual(_parse_qstat_state(\"\", 1), \"u\")\n        self.assertEqual(_parse_qstat_state(\"\", 4), \"u\")\n\n\nclass TestJobTask(SGEJobTask):\n    \"\"\"Simple SGE job: write a test file to NFS shared drive and waits a minute\"\"\"\n\n    i = luigi.Parameter()\n\n    def work(self):\n        logger.info(\"Running test job...\")\n        with open(self.output().path, \"w\") as f:\n            f.write(\"this is a test\\n\")\n\n    def output(self):\n        return luigi.LocalTarget(os.path.join(DEFAULT_HOME, \"testfile_\" + str(self.i)))\n\n\n@pytest.mark.contrib\nclass TestSGEJob(unittest.TestCase):\n    \"\"\"Test from SGE master node\"\"\"\n\n    def test_run_job(self):\n        if on_sge_master():\n            outfile = os.path.join(DEFAULT_HOME, \"testfile_1\")\n            tasks = [TestJobTask(i=str(i), n_cpu=1) for i in range(3)]\n            luigi.build(tasks, local_scheduler=True, workers=3)\n            self.assertTrue(os.path.exists(outfile))\n\n    @patch(\"subprocess.check_output\")\n    def test_run_job_with_dump(self, mock_check_output):\n        mock_check_output.side_effect = ['Your job 12345 (\"test_job\") has been submitted', \"\"]\n        task = TestJobTask(i=\"1\", n_cpu=1, shared_tmp_dir=\"/tmp\")\n        luigi.build([task], local_scheduler=True)\n        self.assertEqual(mock_check_output.call_count, 2)\n\n    def tearDown(self):\n        for fpath in glob(os.path.join(DEFAULT_HOME, \"test_file_*\")):\n            try:\n                os.remove(fpath)\n            except OSError:\n                pass\n"
  },
  {
    "path": "test/contrib/spark_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\nimport pickle\nimport sys\nimport unittest\nfrom functools import partial\nfrom io import BytesIO\nfrom multiprocessing import Value\nfrom subprocess import Popen\n\nimport pytest\nfrom helpers import temporary_unloaded_module, with_config\nfrom mock import MagicMock, call, mock, patch\n\nimport luigi\nimport luigi.contrib.hdfs\nfrom luigi.contrib.external_program import ExternalProgramRunError\nfrom luigi.contrib.spark import PySparkTask, SparkSubmitTask\nfrom luigi.mock import MockTarget\n\n\ndef poll_generator():\n    yield None\n    yield 1\n\n\ndef setup_run_process(proc):\n    poll_gen = poll_generator()\n    proc.return_value.poll = lambda: next(poll_gen)\n    proc.return_value.returncode = 0\n    proc.return_value.stdout = BytesIO()\n    proc.return_value.stderr = BytesIO()\n\n\nclass TestSparkSubmitTask(SparkSubmitTask):\n    name = \"AppName\"\n    entry_class = \"org.test.MyClass\"\n    jars = [\"jars/my.jar\"]\n    py_files = [\"file1.py\", \"file2.py\"]\n    files = [\"file1\", \"file2\"]\n    conf = {\"Prop\": \"Value\"}\n    properties_file = \"conf/spark-defaults.conf\"\n    driver_memory = \"4G\"\n    driver_java_options = \"-Xopt\"\n    driver_library_path = \"library/path\"\n    driver_class_path = \"class/path\"\n    executor_memory = \"8G\"\n    driver_cores = 8\n    supervise = True\n    
total_executor_cores = 150\n    executor_cores = 10\n    queue = \"queue\"\n    num_executors = 2\n    archives = [\"archive1\", \"archive2\"]\n    app = \"file\"\n    pyspark_python = \"/a/b/c\"\n    pyspark_driver_python = \"/b/c/d\"\n    hadoop_user_name = \"luigiuser\"\n\n    def app_options(self):\n        return [\"arg1\", \"arg2\"]\n\n    def output(self):\n        return luigi.LocalTarget(\"output\")\n\n\nclass TestDefaultSparkSubmitTask(SparkSubmitTask):\n    app = \"test.py\"\n\n    def output(self):\n        return luigi.LocalTarget(\"output\")\n\n\nclass TestPySparkTask(PySparkTask):\n    def input(self):\n        return MockTarget(\"input\")\n\n    def output(self):\n        return MockTarget(\"output\")\n\n    def main(self, sc, *args):\n        sc.textFile(self.input().path).saveAsTextFile(self.output().path)\n\n\nclass TestPySparkSessionTask(PySparkTask):\n    def input(self):\n        return MockTarget(\"input\")\n\n    def output(self):\n        return MockTarget(\"output\")\n\n    def main(self, spark, *args):\n        spark.sql(self.input().path).write.saveAsTable(self.output().path)\n\n\nclass MessyNamePySparkTask(TestPySparkTask):\n    name = \"AppName(a,b,c,1:2,3/4)\"\n\n\n@pytest.mark.apache\nclass SparkSubmitTaskTest(unittest.TestCase):\n    ss = \"ss-stub\"\n\n    @with_config({\"spark\": {\"spark-submit\": ss, \"master\": \"yarn-client\", \"hadoop-conf-dir\": \"path\", \"deploy-mode\": \"client\"}})\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_run(self, proc):\n        setup_run_process(proc)\n        job = TestSparkSubmitTask()\n        job.run()\n\n        self.assertEqual(\n            proc.call_args[0][0],\n            [\n                \"ss-stub\",\n                \"--master\",\n                \"yarn-client\",\n                \"--deploy-mode\",\n                \"client\",\n                \"--name\",\n                \"AppName\",\n                \"--class\",\n                
\"org.test.MyClass\",\n                \"--jars\",\n                \"jars/my.jar\",\n                \"--py-files\",\n                \"file1.py,file2.py\",\n                \"--files\",\n                \"file1,file2\",\n                \"--archives\",\n                \"archive1,archive2\",\n                \"--conf\",\n                \"Prop=Value\",\n                \"--conf\",\n                \"spark.pyspark.python=/a/b/c\",\n                \"--conf\",\n                \"spark.pyspark.driver.python=/b/c/d\",\n                \"--properties-file\",\n                \"conf/spark-defaults.conf\",\n                \"--driver-memory\",\n                \"4G\",\n                \"--driver-java-options\",\n                \"-Xopt\",\n                \"--driver-library-path\",\n                \"library/path\",\n                \"--driver-class-path\",\n                \"class/path\",\n                \"--executor-memory\",\n                \"8G\",\n                \"--driver-cores\",\n                \"8\",\n                \"--supervise\",\n                \"--total-executor-cores\",\n                \"150\",\n                \"--executor-cores\",\n                \"10\",\n                \"--queue\",\n                \"queue\",\n                \"--num-executors\",\n                \"2\",\n                \"file\",\n                \"arg1\",\n                \"arg2\",\n            ],\n        )\n\n    @with_config({\"spark\": {\"hadoop-conf-dir\": \"path\"}})\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_environment_is_set_correctly(self, proc):\n        setup_run_process(proc)\n        job = TestSparkSubmitTask()\n        job.run()\n\n        assert job._conf == {\"Prop\": \"Value\", \"spark.pyspark.python\": \"/a/b/c\", \"spark.pyspark.driver.python\": \"/b/c/d\"}\n        assert job.program_environment()[\"HADOOP_USER_NAME\"] == \"luigiuser\"\n        self.assertIn(\"HADOOP_CONF_DIR\", proc.call_args[1][\"env\"])\n        
self.assertEqual(proc.call_args[1][\"env\"][\"HADOOP_CONF_DIR\"], \"path\")\n\n    @with_config(\n        {\n            \"spark\": {\n                \"spark-submit\": ss,\n                \"master\": \"spark://host:7077\",\n                \"conf\": \"prop1=val1\",\n                \"jars\": \"jar1.jar,jar2.jar\",\n                \"files\": \"file1,file2\",\n                \"py-files\": \"file1.py,file2.py\",\n                \"archives\": \"archive1\",\n            }\n        }\n    )\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_defaults(self, proc):\n        proc.return_value.returncode = 0\n        job = TestDefaultSparkSubmitTask()\n        job.run()\n        self.assertEqual(\n            proc.call_args[0][0],\n            [\n                \"ss-stub\",\n                \"--master\",\n                \"spark://host:7077\",\n                \"--jars\",\n                \"jar1.jar,jar2.jar\",\n                \"--py-files\",\n                \"file1.py,file2.py\",\n                \"--files\",\n                \"file1,file2\",\n                \"--archives\",\n                \"archive1\",\n                \"--conf\",\n                \"prop1=val1\",\n                \"test.py\",\n            ],\n        )\n\n    @patch(\"luigi.contrib.external_program.logger\")\n    @patch(\"luigi.contrib.external_program.tempfile.TemporaryFile\")\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_handle_failed_job(self, proc, file, logger):\n        proc.return_value.returncode = 1\n        file.return_value = BytesIO(b\"spark test error\")\n        try:\n            job = TestSparkSubmitTask()\n            job.run()\n        except ExternalProgramRunError as e:\n            self.assertEqual(e.err, \"spark test error\")\n            self.assertIn(\"spark test error\", str(e))\n            self.assertIn(call.info(\"Program stderr:\\nspark test error\"), logger.mock_calls)\n        else:\n            
self.fail(\"Should have thrown ExternalProgramRunError\")\n\n    @patch(\"luigi.contrib.external_program.logger\")\n    @patch(\"luigi.contrib.external_program.tempfile.TemporaryFile\")\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_dont_log_stderr_on_success(self, proc, file, logger):\n        proc.return_value.returncode = 0\n        file.return_value = BytesIO(b\"spark normal error output\")\n        job = TestSparkSubmitTask()\n        job.run()\n\n        self.assertNotIn(call.info(\"Program stderr:\\nspark normal error output\"), logger.mock_calls)\n\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_app_must_be_set(self, proc):\n        with self.assertRaises(NotImplementedError):\n            job = SparkSubmitTask()\n            job.run()\n\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_app_interruption(self, proc):\n\n        def interrupt():\n            raise KeyboardInterrupt()\n\n        proc.return_value.wait = interrupt\n        try:\n            job = TestSparkSubmitTask()\n            job.run()\n        except KeyboardInterrupt:\n            pass\n        proc.return_value.kill.check_called()\n\n    @with_config({\"spark\": {\"deploy-mode\": \"client\"}})\n    def test_tracking_url_is_found_in_stderr_client_mode(self):\n        test_val = Value(\"i\", 0)\n\n        def fake_set_tracking_url(val, url):\n            if url == \"http://10.66.76.155:4040\":\n                val.value += 1\n\n        def Popen_wrap(args, **kwargs):\n            return Popen('>&2 echo \"INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://10.66.76.155:4040\"', shell=True, **kwargs)\n\n        task = TestSparkSubmitTask()\n        with mock.patch(\"luigi.contrib.external_program.subprocess.Popen\", wraps=Popen_wrap):\n            with mock.patch.object(task, \"set_tracking_url\", new=partial(fake_set_tracking_url, test_val)):\n                task.run()\n                
self.assertEqual(test_val.value, 1)\n\n    @with_config({\"spark\": {\"deploy-mode\": \"cluster\"}})\n    def test_tracking_url_is_found_in_stderr_cluster_mode(self):\n        test_val = Value(\"i\", 0)\n\n        def fake_set_tracking_url(val, url):\n            if url == \"https://127.0.0.1:4040\":\n                val.value += 1\n\n        def Popen_wrap(args, **kwargs):\n            return Popen('>&2 echo \"tracking URL: https://127.0.0.1:4040\"', shell=True, **kwargs)\n\n        task = TestSparkSubmitTask()\n        with mock.patch(\"luigi.contrib.external_program.subprocess.Popen\", wraps=Popen_wrap):\n            with mock.patch.object(task, \"set_tracking_url\", new=partial(fake_set_tracking_url, test_val)):\n                task.run()\n                self.assertEqual(test_val.value, 1)\n\n\n@pytest.mark.apache\nclass PySparkTaskTest(unittest.TestCase):\n    ss = \"ss-stub\"\n\n    @with_config({\"spark\": {\"spark-submit\": ss, \"master\": \"spark://host:7077\", \"deploy-mode\": \"client\"}})\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_run(self, proc):\n        setup_run_process(proc)\n        job = TestPySparkTask()\n        job.run()\n        proc_arg_list = proc.call_args[0][0]\n        self.assertEqual(proc_arg_list[0:7], [\"ss-stub\", \"--master\", \"spark://host:7077\", \"--deploy-mode\", \"client\", \"--name\", \"TestPySparkTask\"])\n        self.assertTrue(os.path.exists(proc_arg_list[7]))\n        self.assertTrue(proc_arg_list[8].endswith(\"TestPySparkTask.pickle\"))\n\n    @with_config({\"spark\": {\"spark-submit\": ss, \"master\": \"spark://host:7077\", \"deploy-mode\": \"client\"}})\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_run_with_pickle_dump(self, proc):\n        setup_run_process(proc)\n        job = TestPySparkTask()\n        luigi.build([job], local_scheduler=True)\n        self.assertEqual(proc.call_count, 1)\n        proc_arg_list = proc.call_args[0][0]\n        
self.assertEqual(proc_arg_list[0:7], [\"ss-stub\", \"--master\", \"spark://host:7077\", \"--deploy-mode\", \"client\", \"--name\", \"TestPySparkTask\"])\n        self.assertTrue(os.path.exists(proc_arg_list[7]))\n        self.assertTrue(proc_arg_list[8].endswith(\"TestPySparkTask.pickle\"))\n\n    @with_config({\"spark\": {\"spark-submit\": ss, \"master\": \"spark://host:7077\", \"deploy-mode\": \"cluster\"}})\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_run_with_cluster(self, proc):\n        setup_run_process(proc)\n        job = TestPySparkTask()\n        job.run()\n        proc_arg_list = proc.call_args[0][0]\n        self.assertEqual(proc_arg_list[0:8], [\"ss-stub\", \"--master\", \"spark://host:7077\", \"--deploy-mode\", \"cluster\", \"--name\", \"TestPySparkTask\", \"--files\"])\n        self.assertTrue(proc_arg_list[8].endswith(\"TestPySparkTask.pickle\"))\n        self.assertTrue(os.path.exists(proc_arg_list[9]))\n        self.assertEqual(\"TestPySparkTask.pickle\", proc_arg_list[10])\n\n    @patch.dict(\"sys.modules\", {\"pyspark\": MagicMock()})\n    @patch(\"pyspark.SparkContext\")\n    def test_pyspark_runner(self, spark_context):\n        sc = spark_context.return_value\n\n        def mock_spark_submit(task):\n            from luigi.contrib.pyspark_runner import PySparkRunner\n\n            PySparkRunner(*task.app_command()[1:]).run()\n            # Check py-package exists\n            self.assertTrue(os.path.exists(sc.addPyFile.call_args[0][0]))\n            # Check that main module containing the task exists.\n            run_path = os.path.dirname(task.app_command()[1])\n            self.assertTrue(os.path.exists(os.path.join(run_path, os.path.basename(__file__))))\n            # Check that the python path contains the run_path\n            self.assertTrue(run_path in sys.path)\n            # Check if find_class finds the class for the correct module name.\n            with open(task.app_command()[1], \"rb\") as 
fp:\n                self.assertTrue(pickle.Unpickler(fp).find_class(\"spark_test\", \"TestPySparkTask\"))\n\n        with patch.object(SparkSubmitTask, \"run\", mock_spark_submit):\n            job = TestPySparkTask()\n            with temporary_unloaded_module(b\"\") as task_module:\n                with_config({\"spark\": {\"py-packages\": task_module}})(job.run)()\n\n        sc.textFile.assert_called_with(\"input\")\n        sc.textFile.return_value.saveAsTextFile.assert_called_with(\"output\")\n        sc.stop.assert_called_once_with()\n\n    def test_pyspark_session_runner_use_spark_session_true(self):\n        pyspark = MagicMock()\n        pyspark.__version__ = \"2.1.0\"\n        pyspark_sql = MagicMock()\n        with patch.dict(sys.modules, {\"pyspark\": pyspark, \"pyspark.sql\": pyspark_sql}):\n            spark = pyspark_sql.SparkSession.builder.config.return_value.enableHiveSupport.return_value.getOrCreate.return_value\n            sc = spark.sparkContext\n\n            def mock_spark_submit(task):\n                from luigi.contrib.pyspark_runner import PySparkSessionRunner\n\n                PySparkSessionRunner(*task.app_command()[1:]).run()\n                # Check py-package exists\n                self.assertTrue(os.path.exists(sc.addPyFile.call_args[0][0]))\n                # Check that main module containing the task exists.\n                run_path = os.path.dirname(task.app_command()[1])\n                self.assertTrue(os.path.exists(os.path.join(run_path, os.path.basename(__file__))))\n                # Check that the python path contains the run_path\n                self.assertTrue(run_path in sys.path)\n                # Check if find_class finds the class for the correct module name.\n                with open(task.app_command()[1], \"rb\") as fp:\n                    self.assertTrue(pickle.Unpickler(fp).find_class(\"spark_test\", \"TestPySparkSessionTask\"))\n\n            with patch.object(SparkSubmitTask, \"run\", 
mock_spark_submit):\n                job = TestPySparkSessionTask()\n                with temporary_unloaded_module(b\"\") as task_module:\n                    with_config({\"spark\": {\"py-packages\": task_module}})(job.run)()\n\n            spark.sql.assert_called_with(\"input\")\n            spark.sql.return_value.write.saveAsTable.assert_called_with(\"output\")\n            spark.stop.assert_called_once_with()\n\n    def test_pyspark_session_runner_use_spark_session_true_spark1(self):\n        pyspark = MagicMock()\n        pyspark.__version__ = \"1.6.3\"\n        pyspark_sql = MagicMock()\n        with patch.dict(sys.modules, {\"pyspark\": pyspark, \"pyspark.sql\": pyspark_sql}):\n\n            def mock_spark_submit(task):\n                from luigi.contrib.pyspark_runner import PySparkSessionRunner\n\n                self.assertRaises(RuntimeError, PySparkSessionRunner(*task.app_command()[1:]).run)\n\n            with patch.object(SparkSubmitTask, \"run\", mock_spark_submit):\n                job = TestPySparkSessionTask()\n                with temporary_unloaded_module(b\"\") as task_module:\n                    with_config({\"spark\": {\"py-packages\": task_module}})(job.run)()\n\n    @patch(\"luigi.contrib.external_program.subprocess.Popen\")\n    def test_name_cleanup(self, proc):\n        setup_run_process(proc)\n        job = MessyNamePySparkTask()\n        job.run()\n        assert \"AppName_a_b_c_1_2_3_4_\" in job.run_path\n"
  },
  {
    "path": "test/contrib/sqla_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2015 Gouthaman Balaraman\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n#\n\"\"\"\nThis file implements unit test cases for luigi/contrib/sqla.py\nAuthor: Gouthaman Balaraman\nDate: 01/02/2015\n\"\"\"\n\nimport os\nimport shutil\nimport tempfile\nimport unittest\n\nimport pytest\nimport sqlalchemy\nfrom helpers import skipOnTravisAndGithubActions\n\nimport luigi\nfrom luigi.contrib import sqla\nfrom luigi.mock import MockTarget\n\n\nclass BaseTask(luigi.Task):\n    TASK_LIST = [\"item%d\\tproperty%d\\n\" % (i, i) for i in range(10)]\n\n    def output(self):\n        return MockTarget(\"BaseTask\", mirror_on_stderr=True)\n\n    def run(self):\n        out = self.output().open(\"w\")\n        for task in self.TASK_LIST:\n            out.write(task)\n        out.close()\n\n\n@pytest.mark.contrib\nclass TestSQLA(unittest.TestCase):\n    NUM_WORKERS = 1\n\n    def _clear_tables(self):\n        meta = sqlalchemy.MetaData()\n        meta.reflect(bind=self.engine)\n        for table in reversed(meta.sorted_tables):\n            self.engine.execute(table.delete())\n\n    def setUp(self):\n        self.tempdir = tempfile.mkdtemp()\n        self.connection_string = self.get_connection_string()\n        self.connect_args = {\"timeout\": 5.0}\n        self.engine = sqlalchemy.create_engine(self.connection_string, connect_args=self.connect_args)\n\n        # Create SQLATask and store in self\n        class 
SQLATask(sqla.CopyToTable):\n            columns = [([\"item\", sqlalchemy.String(64)], {}), ([\"property\", sqlalchemy.String(64)], {})]\n            connection_string = self.connection_string\n            connect_args = self.connect_args\n            table = \"item_property\"\n            chunk_size = 1\n\n            def requires(self):\n                return BaseTask()\n\n        self.SQLATask = SQLATask\n\n    def tearDown(self):\n        self._clear_tables()\n        if os.path.exists(self.tempdir):\n            shutil.rmtree(self.tempdir)\n\n    def get_connection_string(self, db=\"sqlatest.db\"):\n        return \"sqlite:///{path}\".format(path=os.path.join(self.tempdir, db))\n\n    def test_create_table(self):\n        \"\"\"\n        Test that this method creates table that we require\n        :return:\n        \"\"\"\n\n        class TestSQLData(sqla.CopyToTable):\n            connection_string = self.connection_string\n            connect_args = self.connect_args\n            table = \"test_table\"\n            columns = [([\"id\", sqlalchemy.Integer], dict(primary_key=True)), ([\"name\", sqlalchemy.String(64)], {}), ([\"value\", sqlalchemy.String(64)], {})]\n            chunk_size = 1\n\n            def output(self):\n                pass\n\n        sql_copy = TestSQLData()\n        eng = sqlalchemy.create_engine(TestSQLData.connection_string)\n        self.assertFalse(eng.dialect.has_table(eng.connect(), TestSQLData.table))\n        sql_copy.create_table(eng)\n        self.assertTrue(eng.dialect.has_table(eng.connect(), TestSQLData.table))\n        # repeat and ensure it just binds to existing table\n        sql_copy.create_table(eng)\n\n    def test_create_table_raises_no_columns(self):\n        \"\"\"\n        Check that the test fails when the columns are not set\n        :return:\n        \"\"\"\n\n        class TestSQLData(sqla.CopyToTable):\n            connection_string = self.connection_string\n            table = \"test_table\"\n            
columns = []\n            chunk_size = 1\n\n        def output(self):\n            pass\n\n        sql_copy = TestSQLData()\n        eng = sqlalchemy.create_engine(TestSQLData.connection_string)\n        self.assertRaises(NotImplementedError, sql_copy.create_table, eng)\n\n    def _check_entries(self, engine):\n        with engine.begin() as conn:\n            meta = sqlalchemy.MetaData()\n            meta.reflect(bind=engine)\n            self.assertEqual({\"table_updates\", \"item_property\"}, set(meta.tables.keys()))\n            table = meta.tables[self.SQLATask.table]\n            s = sqlalchemy.select([sqlalchemy.func.count(table.c.item)])\n            result = conn.execute(s).fetchone()\n            self.assertEqual(len(BaseTask.TASK_LIST), result[0])\n            s = sqlalchemy.select([table]).order_by(table.c.item)\n            result = conn.execute(s).fetchall()\n            for i in range(len(BaseTask.TASK_LIST)):\n                given = BaseTask.TASK_LIST[i].strip(\"\\n\").split(\"\\t\")\n                given = (str(given[0]), str(given[1]))\n                self.assertEqual(given, tuple(result[i]))\n\n    def test_rows(self):\n        task, task0 = self.SQLATask(), BaseTask()\n        luigi.build([task, task0], local_scheduler=True, workers=self.NUM_WORKERS)\n\n        for i, row in enumerate(task.rows()):\n            given = BaseTask.TASK_LIST[i].strip(\"\\n\").split(\"\\t\")\n            self.assertEqual(row, given)\n\n    def test_run(self):\n        \"\"\"\n        Checking that the runs go as expected. 
Rerunning the same shouldn't end up\n        inserting more rows into the db.\n        :return:\n        \"\"\"\n        task, task0 = self.SQLATask(), BaseTask()\n        self.engine = sqlalchemy.create_engine(task.connection_string)\n        luigi.build([task0, task], local_scheduler=True)\n        self._check_entries(self.engine)\n\n        # rerun and the num entries should be the same\n        luigi.build([task0, task], local_scheduler=True, workers=self.NUM_WORKERS)\n        self._check_entries(self.engine)\n\n    def test_run_with_chunk_size(self):\n        \"\"\"\n        The chunk_size can be specified in order to control the batch size for inserts.\n        :return:\n        \"\"\"\n        task, task0 = self.SQLATask(), BaseTask()\n        self.engine = sqlalchemy.create_engine(task.connection_string)\n        task.chunk_size = 2  # change chunk size and check it runs ok\n        luigi.build([task, task0], local_scheduler=True, workers=self.NUM_WORKERS)\n        self._check_entries(self.engine)\n\n    def test_reflect(self):\n        \"\"\"\n        If the table is setup already, then one can set reflect to True, and\n        completely skip the columns part. 
It is not even required at that point.\n        :return:\n        \"\"\"\n        SQLATask = self.SQLATask\n\n        class AnotherSQLATask(sqla.CopyToTable):\n            connection_string = self.connection_string\n            table = \"item_property\"\n            reflect = True\n            chunk_size = 1\n\n            def requires(self):\n                return SQLATask()\n\n            def copy(self, conn, ins_rows, table_bound):\n                ins = (\n                    table_bound.update()\n                    .where(table_bound.c.property == sqlalchemy.bindparam(\"_property\"))\n                    .values({table_bound.c.item: sqlalchemy.bindparam(\"_item\")})\n                )\n                conn.execute(ins, ins_rows)\n\n            def rows(self):\n                for line in BaseTask.TASK_LIST:\n                    yield line.strip(\"\\n\").split(\"\\t\")\n\n        task0, task1, task2 = AnotherSQLATask(), self.SQLATask(), BaseTask()\n        luigi.build([task0, task1, task2], local_scheduler=True, workers=self.NUM_WORKERS)\n        self._check_entries(self.engine)\n\n    def test_create_marker_table(self):\n        \"\"\"\n        Is the marker table created as expected for the SQLAlchemyTarget\n        :return:\n        \"\"\"\n        target = sqla.SQLAlchemyTarget(self.connection_string, \"test_table\", \"12312123\")\n        target.create_marker_table()\n        self.assertTrue(target.engine.dialect.has_table(target.engine.connect(), target.marker_table))\n\n    def test_touch(self):\n        \"\"\"\n        Touch takes care of creating a checkpoint for task completion\n        :return:\n        \"\"\"\n        target = sqla.SQLAlchemyTarget(self.connection_string, \"test_table\", \"12312123\")\n        target.create_marker_table()\n        self.assertFalse(target.exists())\n        target.touch()\n        self.assertTrue(target.exists())\n\n    def test_row_overload(self):\n        \"\"\"Overload the rows method and we should be able to 
insert data into database\"\"\"\n\n        class SQLARowOverloadTest(sqla.CopyToTable):\n            columns = [([\"item\", sqlalchemy.String(64)], {}), ([\"property\", sqlalchemy.String(64)], {})]\n            connection_string = self.connection_string\n            table = \"item_property\"\n            chunk_size = 1\n\n            def rows(self):\n                tasks = [\n                    (\"item0\", \"property0\"),\n                    (\"item1\", \"property1\"),\n                    (\"item2\", \"property2\"),\n                    (\"item3\", \"property3\"),\n                    (\"item4\", \"property4\"),\n                    (\"item5\", \"property5\"),\n                    (\"item6\", \"property6\"),\n                    (\"item7\", \"property7\"),\n                    (\"item8\", \"property8\"),\n                    (\"item9\", \"property9\"),\n                ]\n                for row in tasks:\n                    yield row\n\n        task = SQLARowOverloadTest()\n        luigi.build([task], local_scheduler=True, workers=self.NUM_WORKERS)\n        self._check_entries(self.engine)\n\n    def test_column_row_separator(self):\n        \"\"\"\n        Test alternate column row separator works\n        :return:\n        \"\"\"\n\n        class ModBaseTask(luigi.Task):\n            def output(self):\n                return MockTarget(\"ModBaseTask\", mirror_on_stderr=True)\n\n            def run(self):\n                out = self.output().open(\"w\")\n                tasks = [\"item%d,property%d\\n\" % (i, i) for i in range(10)]\n                for task in tasks:\n                    out.write(task)\n                out.close()\n\n        class ModSQLATask(sqla.CopyToTable):\n            columns = [([\"item\", sqlalchemy.String(64)], {}), ([\"property\", sqlalchemy.String(64)], {})]\n            connection_string = self.connection_string\n            table = \"item_property\"\n            column_separator = \",\"\n            chunk_size = 1\n\n           
 def requires(self):\n                return ModBaseTask()\n\n        task1, task2 = ModBaseTask(), ModSQLATask()\n        luigi.build([task1, task2], local_scheduler=True, workers=self.NUM_WORKERS)\n        self._check_entries(self.engine)\n\n    def test_update_rows_test(self):\n        \"\"\"\n        Overload the copy() method and implement an update action.\n        :return:\n        \"\"\"\n\n        class ModBaseTask(luigi.Task):\n            def output(self):\n                return MockTarget(\"BaseTask\", mirror_on_stderr=True)\n\n            def run(self):\n                out = self.output().open(\"w\")\n                for task in self.TASK_LIST:\n                    out.write(\"dummy_\" + task)\n                out.close()\n\n        class ModSQLATask(sqla.CopyToTable):\n            connection_string = self.connection_string\n            table = \"item_property\"\n            columns = [([\"item\", sqlalchemy.String(64)], {}), ([\"property\", sqlalchemy.String(64)], {})]\n            chunk_size = 1\n\n            def requires(self):\n                return ModBaseTask()\n\n        class UpdateSQLATask(sqla.CopyToTable):\n            connection_string = self.connection_string\n            table = \"item_property\"\n            reflect = True\n            chunk_size = 1\n\n            def requires(self):\n                return ModSQLATask()\n\n            def copy(self, conn, ins_rows, table_bound):\n                ins = (\n                    table_bound.update()\n                    .where(table_bound.c.property == sqlalchemy.bindparam(\"_property\"))\n                    .values({table_bound.c.item: sqlalchemy.bindparam(\"_item\")})\n                )\n                conn.execute(ins, ins_rows)\n\n            def rows(self):\n                for task in self.TASK_LIST:\n                    yield task.strip(\"\\n\").split(\"\\t\")\n\n        # Running only task1, and task2 should fail\n        task1, task2, task3 = ModBaseTask(), ModSQLATask(), 
UpdateSQLATask()\n        luigi.build([task1, task2, task3], local_scheduler=True, workers=self.NUM_WORKERS)\n        self._check_entries(self.engine)\n\n    @skipOnTravisAndGithubActions(\"AssertionError: 10 != 7; https://travis-ci.org/spotify/luigi/jobs/156732446\")\n    def test_multiple_tasks(self):\n        \"\"\"\n        Test a case where there are multiple tasks\n        :return:\n        \"\"\"\n\n        class SmallSQLATask(sqla.CopyToTable):\n            item = luigi.Parameter()\n            property = luigi.Parameter()\n            columns = [([\"item\", sqlalchemy.String(64)], {}), ([\"property\", sqlalchemy.String(64)], {})]\n            connection_string = self.connection_string\n            table = \"item_property\"\n            chunk_size = 1\n\n            def rows(self):\n                yield (self.item, self.property)\n\n        class ManyBaseTask(luigi.Task):\n            def requires(self):\n                for t in BaseTask.TASK_LIST:\n                    item, property = t.strip().split(\"\\t\")\n                    yield SmallSQLATask(item=item, property=property)\n\n        task2 = ManyBaseTask()\n        luigi.build([task2], local_scheduler=True, workers=self.NUM_WORKERS)\n        self._check_entries(self.engine)\n\n    def test_multiple_engines(self):\n        \"\"\"\n        Test case where different tasks require different SQL engines.\n        \"\"\"\n        alt_db = self.get_connection_string(\"sqlatest2.db\")\n\n        class MultiEngineTask(self.SQLATask):\n            connection_string = alt_db\n\n        task0, task1, task2 = BaseTask(), self.SQLATask(), MultiEngineTask()\n        self.assertTrue(task1.output().engine != task2.output().engine)\n        luigi.build([task2, task1, task0], local_scheduler=True, workers=self.NUM_WORKERS)\n        self._check_entries(task1.output().engine)\n        self._check_entries(task2.output().engine)\n\n\n@pytest.mark.contrib\nclass TestSQLA2(TestSQLA):\n    \"\"\"2 workers version\"\"\"\n\n  
  NUM_WORKERS = 2\n"
  },
  {
    "path": "test/contrib/streaming_test.py",
    "content": "import os\nimport unittest\n\nimport mock\nimport pytest\n\nfrom luigi import Parameter\nfrom luigi.contrib import mrrunner\nfrom luigi.contrib.hadoop import HadoopJobRunner, JobTask\nfrom luigi.contrib.hdfs import HdfsTarget\n\n\nclass MockStreamingJob(JobTask):\n    package_binary = Parameter(default=None)\n\n    def output(self):\n        rv = mock.MagicMock(HdfsTarget)\n        rv.path = \"test_path\"\n        return rv\n\n\nclass MockStreamingJobWithExtraArguments(JobTask):\n    package_binary = Parameter(default=None)\n\n    def extra_streaming_arguments(self):\n        return [(\"myargument\", \"/path/to/coolvalue\")]\n\n    def extra_archives(self):\n        return [\"/path/to/myarchive.zip\", \"/path/to/other_archive.zip\"]\n\n    def output(self):\n        rv = mock.MagicMock(HdfsTarget)\n        rv.path = \"test_path\"\n        return rv\n\n\n@pytest.mark.apache\nclass StreamingRunTest(unittest.TestCase):\n    @mock.patch(\"luigi.contrib.hadoop.shutil\")\n    @mock.patch(\"luigi.contrib.hadoop.run_and_track_hadoop_job\")\n    def test_package_binary_run(self, rath_job, shutil):\n        job_runner = HadoopJobRunner(\"jar_path\", end_job_with_atomic_move_dir=False)\n        job_runner.run_job(MockStreamingJob(package_binary=\"test_bin.pex\"))\n\n        self.assertEqual(1, shutil.copy.call_count)\n        pex_src, pex_dest = shutil.copy.call_args[0]\n        runner_fname = os.path.basename(pex_dest)\n        self.assertEqual(\"test_bin.pex\", pex_src)\n        self.assertEqual(\"mrrunner.pex\", runner_fname)\n\n        self.assertEqual(1, rath_job.call_count)\n        mr_args = rath_job.call_args[0][0]\n        mr_args_pairs = zip(mr_args, mr_args[1:])\n        self.assertIn((\"-mapper\", \"python mrrunner.pex map\"), mr_args_pairs)\n        self.assertIn((\"-file\", pex_dest), mr_args_pairs)\n\n    @mock.patch(\"luigi.contrib.hadoop.create_packages_archive\")\n    @mock.patch(\"luigi.contrib.hadoop.run_and_track_hadoop_job\")\n    def 
test_standard_run(self, rath_job, cpa):\n        job_runner = HadoopJobRunner(\"jar_path\", end_job_with_atomic_move_dir=False)\n        job_runner.run_job(MockStreamingJob())\n\n        self.assertEqual(1, cpa.call_count)\n\n        self.assertEqual(1, rath_job.call_count)\n        mr_args = rath_job.call_args[0][0]\n        mr_args_pairs = zip(mr_args, mr_args[1:])\n        self.assertIn((\"-mapper\", \"python mrrunner.py map\"), mr_args_pairs)\n        self.assertIn((\"-file\", mrrunner.__file__.rstrip(\"c\")), mr_args_pairs)\n\n    @mock.patch(\"luigi.contrib.hadoop.create_packages_archive\")\n    @mock.patch(\"luigi.contrib.hadoop.run_and_track_hadoop_job\")\n    def test_run_with_extra_arguments(self, rath_job, cpa):\n        job_runner = HadoopJobRunner(\"jar_path\", end_job_with_atomic_move_dir=False)\n        job_runner.run_job(MockStreamingJobWithExtraArguments())\n\n        self.assertEqual(1, cpa.call_count)\n\n        self.assertEqual(1, rath_job.call_count)\n        mr_args = rath_job.call_args[0][0]\n        mr_args_pairs = list(zip(mr_args, mr_args[1:]))\n        self.assertIn((\"-myargument\", \"/path/to/coolvalue\"), mr_args_pairs)\n        self.assertIn((\"-archives\", \"/path/to/myarchive.zip,/path/to/other_archive.zip\"), mr_args_pairs)\n"
  },
  {
    "path": "test/contrib/test_ssh.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nIntegration tests for ssh module.\n\"\"\"\n\nimport os\nimport random\nimport socket\nimport subprocess\n\nimport pytest\nimport target_test\nfrom helpers import unittest\n\nfrom luigi.contrib.ssh import RemoteCalledProcessError, RemoteContext, RemoteFileSystem, RemoteTarget\nfrom luigi.target import FileAlreadyExists, MissingParentDirectory\n\nworking_ssh_host = os.environ.get(\"SSH_TEST_HOST\", \"localhost\")\n# set this to a working ssh host string (e.g. 
\"localhost\") to activate integration tests\n# The following tests require a working ssh server at `working_ssh_host`\n# the test runner can ssh into using password-less authentication\n\n# since `nc` has different syntax on different platforms\n# we use a short python command to start\n# a 'hello'-server on the remote machine\nHELLO_SERVER_CMD = \"\"\"\nimport socket, sys\nlistener = socket.socket()\nlistener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\nlistener.bind(('localhost', 2134))\nlistener.listen(1)\nsys.stdout.write('ready')\nsys.stdout.flush()\nconn = listener.accept()[0]\nconn.sendall(b'hello')\n\"\"\"\n\ntry:\n    x = subprocess.check_output(\"ssh %s -S none -o BatchMode=yes 'echo 1'\" % working_ssh_host, shell=True)\n    if x != b\"1\\n\":\n        raise unittest.SkipTest(\"Not able to connect to ssh server\")\nexcept Exception:\n    raise unittest.SkipTest(\"Not able to connect to ssh server\")\n\n\n@pytest.mark.contrib\nclass TestRemoteContext(unittest.TestCase):\n    def setUp(self):\n        self.context = RemoteContext(working_ssh_host)\n\n    def tearDown(self):\n        try:\n            self.remote_server_handle.terminate()\n        except Exception:\n            pass\n\n    def test_check_output(self):\n        \"\"\"Test check_output ssh\n\n        Assumes the running user can ssh to working_ssh_host\n        \"\"\"\n        output = self.context.check_output([\"echo\", \"-n\", \"luigi\"])\n        self.assertEqual(output, b\"luigi\")\n\n    def test_tunnel(self):\n        print(\"Setting up remote listener...\")\n\n        self.remote_server_handle = self.context.Popen([\"python\", \"-c\", '\"{0}\"'.format(HELLO_SERVER_CMD)], stdout=subprocess.PIPE)\n\n        print(\"Setting up tunnel\")\n        with self.context.tunnel(2135, 2134):\n            print(\"Tunnel up!\")\n            # hack to make sure the listener process is up\n            # and running before we write to it\n            server_output = 
self.remote_server_handle.stdout.read(5)\n            self.assertEqual(server_output, b\"ready\")\n            print(\"Connecting to server via tunnel\")\n            s = socket.socket()\n            s.connect((\"localhost\", 2135))\n            print(\n                \"Receiving...\",\n            )\n            response = s.recv(5)\n            self.assertEqual(response, b\"hello\")\n            print(\"Closing connection\")\n            s.close()\n            print(\"Waiting for listener...\")\n            output, _ = self.remote_server_handle.communicate()\n            self.assertEqual(self.remote_server_handle.returncode, 0)\n            print(\"Closing tunnel\")\n\n\n@pytest.mark.contrib\nclass TestRemoteTarget(unittest.TestCase):\n    \"\"\"These tests assume RemoteContext working\n    in order for setUp and tearDown to work\n    \"\"\"\n\n    def setUp(self):\n        self.ctx = RemoteContext(working_ssh_host)\n        self.filepath = \"/tmp/luigi_remote_test.dat\"\n        self.target = RemoteTarget(\n            self.filepath,\n            working_ssh_host,\n        )\n        self.ctx.check_output([\"rm\", \"-rf\", self.filepath])\n        self.ctx.check_output([\"echo -n 'hello' >\", self.filepath])\n\n    def tearDown(self):\n        self.ctx.check_output([\"rm\", \"-rf\", self.filepath])\n\n    def test_exists(self):\n        self.assertTrue(self.target.exists())\n        no_file = RemoteTarget(\n            \"/tmp/_file_that_doesnt_exist_\",\n            working_ssh_host,\n        )\n        self.assertFalse(no_file.exists())\n\n    def test_remove(self):\n        self.target.remove()\n        self.assertRaises(subprocess.CalledProcessError, self.ctx.check_output, [\"cat\", self.filepath])\n\n    def test_open(self):\n        f = self.target.open(\"r\")\n        file_content = f.read()\n        f.close()\n        self.assertEqual(file_content, \"hello\")\n\n        self.assertTrue(self.target.fs.exists(self.filepath))\n        
self.assertFalse(self.target.fs.isdir(self.filepath))\n\n    def test_context_manager(self):\n        with self.target.open(\"r\") as f:\n            file_content = f.read()\n\n        self.assertEqual(file_content, \"hello\")\n\n\n@pytest.mark.contrib\nclass TestRemoteFilesystem(unittest.TestCase):\n    def setUp(self):\n        self.fs = RemoteFileSystem(working_ssh_host)\n        self.root = \"/tmp/luigi-remote-test\"\n        self.directory = self.root + \"/dir\"\n        self.filepath = self.directory + \"/file\"\n        self.target = RemoteTarget(\n            self.filepath,\n            working_ssh_host,\n        )\n\n        self.fs.remote_context.check_output([\"rm\", \"-rf\", self.root])\n        self.addCleanup(self.fs.remote_context.check_output, [\"rm\", \"-rf\", self.root])\n\n    def test_mkdir(self):\n        self.assertFalse(self.fs.isdir(self.directory))\n\n        self.assertRaises(MissingParentDirectory, self.fs.mkdir, self.directory, parents=False)\n        self.fs.mkdir(self.directory)\n        self.assertTrue(self.fs.isdir(self.directory))\n\n        # Shouldn't throw\n        self.fs.mkdir(self.directory)\n\n        self.assertRaises(FileAlreadyExists, self.fs.mkdir, self.directory, raise_if_exists=True)\n\n    def test_list(self):\n        with self.target.open(\"w\"):\n            pass\n\n        self.assertEqual([self.target.path], list(self.fs.listdir(self.directory)))\n\n\n@pytest.mark.contrib\nclass TestGetAttrRecursion(unittest.TestCase):\n    def test_recursion_on_delete(self):\n        target = RemoteTarget(\"/etc/this/does/not/exist\", working_ssh_host)\n        with self.assertRaises(RemoteCalledProcessError):\n            with target.open(\"w\") as fh:\n                fh.write(\"test\")\n\n\n@pytest.mark.contrib\nclass TestRemoteTargetAtomicity(unittest.TestCase, target_test.FileSystemTargetTestMixin):\n    path = \"/tmp/luigi_remote_atomic_test.txt\"\n    ctx = RemoteContext(working_ssh_host)\n\n    def create_target(self, 
format=None):\n        return RemoteTarget(self.path, working_ssh_host, format=format)\n\n    def _exists(self, path):\n        try:\n            self.ctx.check_output([\"test\", \"-e\", path])\n        except subprocess.CalledProcessError as e:\n            if e.returncode == 1:\n                return False\n            else:\n                raise\n        return True\n\n    def assertCleanUp(self, tp):\n        self.assertFalse(self._exists(tp))\n\n    def setUp(self):\n        self.ctx.check_output([\"rm\", \"-rf\", self.path])\n        self.local_file = \"/tmp/local_luigi_remote_atomic_test.txt\"\n        if os.path.exists(self.local_file):\n            os.remove(self.local_file)\n\n    def tearDown(self):\n        self.ctx.check_output([\"rm\", \"-rf\", self.path])\n        if os.path.exists(self.local_file):\n            os.remove(self.local_file)\n\n    def test_put(self):\n        f = open(self.local_file, \"w\")\n        f.write(\"hello\")\n        f.close()\n        t = RemoteTarget(self.path, working_ssh_host)\n        t.put(self.local_file)\n        self.assertTrue(self._exists(self.path))\n\n    def test_get(self):\n        self.ctx.check_output([\"echo -n 'hello' >\", self.path])\n        t = RemoteTarget(self.path, working_ssh_host)\n        t.get(self.local_file)\n        f = open(self.local_file, \"r\")\n        file_content = f.read()\n        self.assertEqual(file_content, \"hello\")\n\n    test_move_on_fs = None  # ssh doesn't have move (yet?)\n    test_rename_dont_move_on_fs = None  # ssh doesn't have move (yet?)\n\n\nclass TestRemoteTargetCreateDirectories(TestRemoteTargetAtomicity):\n    path = \"/tmp/%s/xyz/luigi_remote_atomic_test.txt\" % random.randint(0, 999999999)\n\n\nclass TestRemoteTargetRelative(TestRemoteTargetAtomicity):\n    path = \"luigi_remote_atomic_test.txt\"\n"
  },
  {
    "path": "test/create_packages_archive_root/module.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n"
  },
  {
    "path": "test/create_packages_archive_root/package/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n"
  },
  {
    "path": "test/create_packages_archive_root/package/submodule.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os  # NOQA\n"
  },
  {
    "path": "test/create_packages_archive_root/package/submodule_with_absolute_import.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os  # NOQA\n"
  },
  {
    "path": "test/create_packages_archive_root/package/submodule_without_imports.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n"
  },
  {
    "path": "test/create_packages_archive_root/package/subpackage/__init__.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n"
  },
  {
    "path": "test/create_packages_archive_root/package/subpackage/submodule.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os  # NOQA\n"
  },
  {
    "path": "test/custom_metrics_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2017 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\nimport tempfile\nimport time\n\nfrom helpers import LuigiTestCase, temporary_unloaded_module\n\nimport luigi\nfrom luigi.metrics import MetricsCollectors\nfrom luigi.scheduler import Scheduler\nfrom luigi.worker import Worker\n\n\nclass CustomMetricsTestMyTask(luigi.Task):\n    root_path = luigi.PathParameter()\n\n    n = luigi.IntParameter()\n\n    def output(self):\n        basename = \"%s_%s.txt\" % (self.__class__.__name__, self.n)\n        return luigi.LocalTarget(os.path.join(self.root_path, basename))\n\n    def run(self):\n        time.sleep(self.n)\n        with self.output().open(\"w\") as f:\n            f.write(\"content\\n\")\n\n\nclass CustomMetricsTestWrapper(CustomMetricsTestMyTask):\n    def requires(self):\n        return [self.clone(CustomMetricsTestMyTask, n=n) for n in range(self.n)]\n\n\nMETRICS_COLLECTOR_MODULE = b\"\"\"\nfrom luigi.metrics import NoMetricsCollector\n\nclass CustomMetricsCollector(NoMetricsCollector):\n    def __init__(self, *args, **kwargs):\n        super(CustomMetricsCollector, self).__init__(*args, **kwargs)\n        self.elapsed = {}\n\n    def handle_task_statistics(self, task, statistics):\n        if \"elapsed\" in statistics:\n            self.elapsed[(task.family, task.params.get(\"n\"))] = statistics[\"elapsed\"]\n\"\"\"\n\n\nTASK_CONTEXT_MODULE = b\"\"\"\nimport 
time\n\nclass CustomTaskContext:\n    def __init__(self, task_process):\n        self._task_process = task_process\n        self._start = None\n\n    def __enter__(self):\n        self._start = time.perf_counter()\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        assert self._start is not None\n        elapsed = time.perf_counter() - self._start\n        self._task_process.status_reporter.report_task_statistics({\"elapsed\": elapsed})\n\"\"\"\n\n\nclass CustomMetricsTest(LuigiTestCase):\n    \"\"\"\n    Test showcasing collection of custom metrics\n    \"\"\"\n\n    def _run_task_on_worker(self, worker):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            task = CustomMetricsTestWrapper(n=3, root_path=tmpdir)\n            self.assertTrue(worker.add(task))\n            worker.run()\n            self.assertTrue(task.complete())\n\n    def _create_worker_and_run_task(self, scheduler):\n        with temporary_unloaded_module(TASK_CONTEXT_MODULE) as task_context_module:\n            with Worker(scheduler=scheduler, worker_id=\"X\", task_process_context=task_context_module + \".CustomTaskContext\") as worker:\n                self._run_task_on_worker(worker)\n\n    def test_custom_metrics(self):\n        with temporary_unloaded_module(METRICS_COLLECTOR_MODULE) as metrics_collector_module:\n            scheduler = Scheduler(metrics_collector=MetricsCollectors.custom, metrics_custom_import=metrics_collector_module + \".CustomMetricsCollector\")\n            self._create_worker_and_run_task(scheduler)\n            for (family, n), elapsed in scheduler._state._metrics_collector.elapsed.items():\n                self.assertTrue(family in {\"CustomMetricsTestMyTask\", \"CustomMetricsTestWrapper\"})\n                self.assertTrue(elapsed >= float(n))\n"
  },
  {
    "path": "test/customized_run_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport logging\nimport time\n\nfrom helpers import unittest\n\nimport luigi\nimport luigi.contrib.hadoop\nimport luigi.rpc\nimport luigi.scheduler\nimport luigi.worker\n\n\nclass DummyTask(luigi.Task):\n    task_namespace = \"customized_run\"  # to prevent task name conflict between tests\n    n = luigi.Parameter()\n\n    def __init__(self, *args, **kwargs):\n        super(DummyTask, self).__init__(*args, **kwargs)\n        self.has_run = False\n\n    def complete(self):\n        return self.has_run\n\n    def run(self):\n        logging.debug(\"%s - setting has_run\", self)\n        self.has_run = True\n\n\nclass CustomizedLocalScheduler(luigi.scheduler.Scheduler):\n    def __init__(self, *args, **kwargs):\n        super(CustomizedLocalScheduler, self).__init__(*args, **kwargs)\n        self.has_run = False\n\n    def get_work(self, worker, host=None, **kwargs):\n        r = super(CustomizedLocalScheduler, self).get_work(worker=worker, host=host)\n        self.has_run = True\n        return r\n\n    def complete(self):\n        return self.has_run\n\n\nclass CustomizedRemoteScheduler(luigi.rpc.RemoteScheduler):\n    def __init__(self, *args, **kwargs):\n        super(CustomizedRemoteScheduler, self).__init__(*args, **kwargs)\n        self.has_run = False\n\n    def get_work(self, worker, host=None):\n        r = 
super(CustomizedRemoteScheduler, self).get_work(worker=worker, host=host)\n        self.has_run = True\n        return r\n\n    def complete(self):\n        return self.has_run\n\n\nclass CustomizedWorker(luigi.worker.Worker):\n    def __init__(self, *args, **kwargs):\n        super(CustomizedWorker, self).__init__(*args, **kwargs)\n        self.has_run = False\n\n    def _run_task(self, task_id):\n        super(CustomizedWorker, self)._run_task(task_id)\n        self.has_run = True\n\n    def complete(self):\n        return self.has_run\n\n\nclass CustomizedWorkerSchedulerFactory:\n    def __init__(self, *args, **kwargs):\n        self.scheduler = CustomizedLocalScheduler()\n        self.worker = CustomizedWorker(self.scheduler)\n\n    def create_local_scheduler(self):\n        return self.scheduler\n\n    def create_remote_scheduler(self, url):\n        return CustomizedRemoteScheduler(url)\n\n    def create_worker(self, scheduler, worker_processes=None, assistant=False):\n        return self.worker\n\n\nclass CustomizedWorkerTest(unittest.TestCase):\n    \"\"\"Test that luigi's build method (and ultimately the run method) can accept a customized worker and scheduler\"\"\"\n\n    def setUp(self):\n        self.worker_scheduler_factory = CustomizedWorkerSchedulerFactory()\n        self.time = time.time\n\n    def tearDown(self):\n        if time.time != self.time:\n            time.time = self.time\n\n    def setTime(self, t):\n        time.time = lambda: t\n\n    def test_customized_worker(self):\n        a = DummyTask(3)\n        self.assertFalse(a.complete())\n        self.assertFalse(self.worker_scheduler_factory.worker.complete())\n        luigi.build([a], worker_scheduler_factory=self.worker_scheduler_factory)\n        self.assertTrue(a.complete())\n        self.assertTrue(self.worker_scheduler_factory.worker.complete())\n\n    def test_cmdline_custom_worker(self):\n        self.assertFalse(self.worker_scheduler_factory.worker.complete())\n        
luigi.run([\"customized_run.DummyTask\", \"--n\", \"4\"], worker_scheduler_factory=self.worker_scheduler_factory)\n        self.assertTrue(self.worker_scheduler_factory.worker.complete())\n"
  },
  {
    "path": "test/date_interval_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport datetime\n\nfrom helpers import LuigiTestCase, in_parse\n\nimport luigi\nfrom luigi.parameter import DateIntervalParameter as DI\n\n\nclass DateIntervalTest(LuigiTestCase):\n    def test_date(self):\n        di = DI().parse(\"2012-01-01\")\n        self.assertEqual(di.dates(), [datetime.date(2012, 1, 1)])\n        self.assertEqual(di.next().dates(), [datetime.date(2012, 1, 2)])\n        self.assertEqual(di.prev().dates(), [datetime.date(2011, 12, 31)])\n        self.assertEqual(str(di), \"2012-01-01\")\n\n    def test_month(self):\n        di = DI().parse(\"2012-01\")\n        self.assertEqual(di.dates(), [datetime.date(2012, 1, 1) + datetime.timedelta(i) for i in range(31)])\n        self.assertEqual(di.next().dates(), [datetime.date(2012, 2, 1) + datetime.timedelta(i) for i in range(29)])\n        self.assertEqual(di.prev().dates(), [datetime.date(2011, 12, 1) + datetime.timedelta(i) for i in range(31)])\n        self.assertEqual(str(di), \"2012-01\")\n\n    def test_year(self):\n        di = DI().parse(\"2012\")\n        self.assertEqual(di.dates(), [datetime.date(2012, 1, 1) + datetime.timedelta(i) for i in range(366)])\n        self.assertEqual(di.next().dates(), [datetime.date(2013, 1, 1) + datetime.timedelta(i) for i in range(365)])\n        self.assertEqual(di.prev().dates(), [datetime.date(2011, 1, 1) + 
datetime.timedelta(i) for i in range(365)])\n        self.assertEqual(str(di), \"2012\")\n\n    def test_week(self):\n        # >>> datetime.date(2012, 1, 1).isocalendar()\n        # (2011, 52, 7)\n        # >>> datetime.date(2012, 12, 31).isocalendar()\n        # (2013, 1, 1)\n\n        di = DI().parse(\"2011-W52\")\n        self.assertEqual(di.dates(), [datetime.date(2011, 12, 26) + datetime.timedelta(i) for i in range(7)])\n        self.assertEqual(di.next().dates(), [datetime.date(2012, 1, 2) + datetime.timedelta(i) for i in range(7)])\n        self.assertEqual(str(di), \"2011-W52\")\n\n        di = DI().parse(\"2013-W01\")\n        self.assertEqual(di.dates(), [datetime.date(2012, 12, 31) + datetime.timedelta(i) for i in range(7)])\n        self.assertEqual(di.prev().dates(), [datetime.date(2012, 12, 24) + datetime.timedelta(i) for i in range(7)])\n        self.assertEqual(str(di), \"2013-W01\")\n\n    def test_interval(self):\n        di = DI().parse(\"2012-01-01-2012-02-01\")\n        self.assertEqual(di.dates(), [datetime.date(2012, 1, 1) + datetime.timedelta(i) for i in range(31)])\n        self.assertRaises(NotImplementedError, di.next)\n        self.assertRaises(NotImplementedError, di.prev)\n        self.assertEqual(di.to_string(), \"2012-01-01-2012-02-01\")\n\n    def test_exception(self):\n        self.assertRaises(ValueError, DI().parse, \"xyz\")\n\n    def test_comparison(self):\n        a = DI().parse(\"2011\")\n        b = DI().parse(\"2013\")\n        c = DI().parse(\"2012\")\n        self.assertTrue(a < b)\n        self.assertTrue(a < c)\n        self.assertTrue(b > c)\n        d = DI().parse(\"2012\")\n        self.assertTrue(d == c)\n        self.assertEqual(d, min(c, b))\n        self.assertEqual(3, len({a, b, c, d}))\n\n    def test_comparison_different_types(self):\n        x = DI().parse(\"2012\")\n        y = DI().parse(\"2012-01-01-2013-01-01\")\n        self.assertRaises(TypeError, lambda: x == y)\n\n    def 
test_parameter_parse_and_default(self):\n        month = luigi.date_interval.Month(2012, 11)\n        other = luigi.date_interval.Month(2012, 10)\n\n        class MyTask(luigi.Task):\n            di = DI(default=month)\n\n        class MyTaskNoDefault(luigi.Task):\n            di = DI()\n\n        self.assertEqual(MyTask().di, month)\n        in_parse([\"MyTask\", \"--di\", \"2012-10\"], lambda task: self.assertEqual(task.di, other))\n        task = MyTask(month)\n        self.assertEqual(task.di, month)\n        task = MyTask(di=month)\n        self.assertEqual(task.di, month)\n        task = MyTask(other)\n        self.assertNotEqual(task.di, month)\n\n        def fail1():\n            return MyTaskNoDefault()\n\n        self.assertRaises(luigi.parameter.MissingParameterException, fail1)\n\n        in_parse([\"MyTaskNoDefault\", \"--di\", \"2012-10\"], lambda task: self.assertEqual(task.di, other))\n\n    def test_hours(self):\n        d = DI().parse(\"2015\")\n        self.assertEqual(len(list(d.hours())), 24 * 365)\n\n    def test_cmp(self):\n        operators = [lambda x, y: x == y, lambda x, y: x != y, lambda x, y: x < y, lambda x, y: x > y, lambda x, y: x <= y, lambda x, y: x >= y]\n\n        dates = [\n            (1, 30, DI().parse(\"2015-01-01-2015-01-30\")),\n            (1, 15, DI().parse(\"2015-01-01-2015-01-15\")),\n            (10, 20, DI().parse(\"2015-01-10-2015-01-20\")),\n            (20, 30, DI().parse(\"2015-01-20-2015-01-30\")),\n        ]\n\n        for from_a, to_a, di_a in dates:\n            for from_b, to_b, di_b in dates:\n                for op in operators:\n                    self.assertEqual(op((from_a, to_a), (from_b, to_b)), op(di_a, di_b))\n"
  },
  {
    "path": "test/date_parameter_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport datetime\n\nfrom helpers import in_parse, unittest\n\nimport luigi\nimport luigi.interface\n\n\nclass DateTask(luigi.Task):\n    day = luigi.DateParameter()\n\n\nclass DateHourTask(luigi.Task):\n    dh = luigi.DateHourParameter()\n\n\nclass DateMinuteTask(luigi.Task):\n    dm = luigi.DateMinuteParameter()\n\n\nclass DateSecondTask(luigi.Task):\n    ds = luigi.DateSecondParameter()\n\n\nclass MonthTask(luigi.Task):\n    month = luigi.MonthParameter()\n\n\nclass YearTask(luigi.Task):\n    year = luigi.YearParameter()\n\n\nclass DateParameterTest(unittest.TestCase):\n    def test_parse(self):\n        d = luigi.DateParameter().parse(\"2015-04-03\")\n        self.assertEqual(d, datetime.date(2015, 4, 3))\n\n    def test_serialize(self):\n        d = luigi.DateParameter().serialize(datetime.date(2015, 4, 3))\n        self.assertEqual(d, \"2015-04-03\")\n\n    def test_parse_interface(self):\n        in_parse([\"DateTask\", \"--day\", \"2015-04-03\"], lambda task: self.assertEqual(task.day, datetime.date(2015, 4, 3)))\n\n    def test_serialize_task(self):\n        t = DateTask(datetime.date(2015, 4, 3))\n        self.assertEqual(str(t), \"DateTask(day=2015-04-03)\")\n\n\nclass DateHourParameterTest(unittest.TestCase):\n    def test_parse(self):\n        dh = luigi.DateHourParameter().parse(\"2013-02-01T18\")\n        
self.assertEqual(dh, datetime.datetime(2013, 2, 1, 18, 0, 0))\n\n    def test_date_to_dh(self):\n        date = luigi.DateHourParameter().normalize(datetime.date(2000, 1, 1))\n        self.assertEqual(date, datetime.datetime(2000, 1, 1, 0))\n\n    def test_serialize(self):\n        dh = luigi.DateHourParameter().serialize(datetime.datetime(2013, 2, 1, 18, 0, 0))\n        self.assertEqual(dh, \"2013-02-01T18\")\n\n    def test_parse_interface(self):\n        in_parse([\"DateHourTask\", \"--dh\", \"2013-02-01T18\"], lambda task: self.assertEqual(task.dh, datetime.datetime(2013, 2, 1, 18, 0, 0)))\n\n    def test_serialize_task(self):\n        t = DateHourTask(datetime.datetime(2013, 2, 1, 18, 0, 0))\n        self.assertEqual(str(t), \"DateHourTask(dh=2013-02-01T18)\")\n\n\nclass DateMinuteParameterTest(unittest.TestCase):\n    def test_parse(self):\n        dm = luigi.DateMinuteParameter().parse(\"2013-02-01T1842\")\n        self.assertEqual(dm, datetime.datetime(2013, 2, 1, 18, 42, 0))\n\n    def test_parse_padding_zero(self):\n        dm = luigi.DateMinuteParameter().parse(\"2013-02-01T1807\")\n        self.assertEqual(dm, datetime.datetime(2013, 2, 1, 18, 7, 0))\n\n    def test_parse_deprecated(self):\n        with self.assertWarnsRegex(DeprecationWarning, 'Using \"H\" between hours and minutes is deprecated, omit it instead.'):\n            dm = luigi.DateMinuteParameter().parse(\"2013-02-01T18H42\")\n        self.assertEqual(dm, datetime.datetime(2013, 2, 1, 18, 42, 0))\n\n    def test_serialize(self):\n        dm = luigi.DateMinuteParameter().serialize(datetime.datetime(2013, 2, 1, 18, 42, 0))\n        self.assertEqual(dm, \"2013-02-01T1842\")\n\n    def test_serialize_padding_zero(self):\n        dm = luigi.DateMinuteParameter().serialize(datetime.datetime(2013, 2, 1, 18, 7, 0))\n        self.assertEqual(dm, \"2013-02-01T1807\")\n\n    def test_parse_interface(self):\n        in_parse([\"DateMinuteTask\", \"--dm\", \"2013-02-01T1842\"], lambda task: 
self.assertEqual(task.dm, datetime.datetime(2013, 2, 1, 18, 42, 0)))\n\n    def test_serialize_task(self):\n        t = DateMinuteTask(datetime.datetime(2013, 2, 1, 18, 42, 0))\n        self.assertEqual(str(t), \"DateMinuteTask(dm=2013-02-01T1842)\")\n\n\nclass DateSecondParameterTest(unittest.TestCase):\n    def test_parse(self):\n        ds = luigi.DateSecondParameter().parse(\"2013-02-01T184227\")\n        self.assertEqual(ds, datetime.datetime(2013, 2, 1, 18, 42, 27))\n\n    def test_serialize(self):\n        ds = luigi.DateSecondParameter().serialize(datetime.datetime(2013, 2, 1, 18, 42, 27))\n        self.assertEqual(ds, \"2013-02-01T184227\")\n\n    def test_parse_interface(self):\n        in_parse([\"DateSecondTask\", \"--ds\", \"2013-02-01T184227\"], lambda task: self.assertEqual(task.ds, datetime.datetime(2013, 2, 1, 18, 42, 27)))\n\n    def test_serialize_task(self):\n        t = DateSecondTask(datetime.datetime(2013, 2, 1, 18, 42, 27))\n        self.assertEqual(str(t), \"DateSecondTask(ds=2013-02-01T184227)\")\n\n\nclass MonthParameterTest(unittest.TestCase):\n    def test_parse(self):\n        m = luigi.MonthParameter().parse(\"2015-04\")\n        self.assertEqual(m, datetime.date(2015, 4, 1))\n\n    def test_construct_month_interval(self):\n        m = MonthTask(luigi.date_interval.Month(2015, 4))\n        self.assertEqual(m.month, datetime.date(2015, 4, 1))\n\n    def test_month_interval_default(self):\n        class MonthDefaultTask(luigi.task.Task):\n            month = luigi.MonthParameter(default=luigi.date_interval.Month(2015, 4))\n\n        m = MonthDefaultTask()\n        self.assertEqual(m.month, datetime.date(2015, 4, 1))\n\n    def test_serialize(self):\n        m = luigi.MonthParameter().serialize(datetime.date(2015, 4, 3))\n        self.assertEqual(m, \"2015-04\")\n\n    def test_parse_interface(self):\n        in_parse([\"MonthTask\", \"--month\", \"2015-04\"], lambda task: self.assertEqual(task.month, datetime.date(2015, 4, 1)))\n\n    
def test_serialize_task(self):\n        task = MonthTask(datetime.date(2015, 4, 3))\n        self.assertEqual(str(task), \"MonthTask(month=2015-04)\")\n\n\nclass YearParameterTest(unittest.TestCase):\n    def test_parse(self):\n        year = luigi.YearParameter().parse(\"2015\")\n        self.assertEqual(year, datetime.date(2015, 1, 1))\n\n    def test_construct_year_interval(self):\n        y = YearTask(luigi.date_interval.Year(2015))\n        self.assertEqual(y.year, datetime.date(2015, 1, 1))\n\n    def test_year_interval_default(self):\n        class YearDefaultTask(luigi.task.Task):\n            year = luigi.YearParameter(default=luigi.date_interval.Year(2015))\n\n        m = YearDefaultTask()\n        self.assertEqual(m.year, datetime.date(2015, 1, 1))\n\n    def test_serialize(self):\n        year = luigi.YearParameter().serialize(datetime.date(2015, 4, 3))\n        self.assertEqual(year, \"2015\")\n\n    def test_parse_interface(self):\n        in_parse([\"YearTask\", \"--year\", \"2015\"], lambda task: self.assertEqual(task.year, datetime.date(2015, 1, 1)))\n\n    def test_serialize_task(self):\n        task = YearTask(datetime.date(2015, 4, 3))\n        self.assertEqual(str(task), \"YearTask(year=2015)\")\n"
  },
  {
    "path": "test/db_task_history_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import unittest, with_config\n\nimport luigi\nimport luigi.scheduler\nfrom luigi.db_task_history import DbTaskHistory\nfrom luigi.parameter import ParameterVisibility\nfrom luigi.task_status import DONE, PENDING, RUNNING\n\n\nclass DummyTask(luigi.Task):\n    foo = luigi.Parameter(default=\"foo\")\n\n\nclass ParamTask(luigi.Task):\n    param1 = luigi.Parameter()\n    param2 = luigi.IntParameter(visibility=ParameterVisibility.HIDDEN)\n    param3 = luigi.Parameter(default=\"empty\", visibility=ParameterVisibility.PRIVATE)\n\n\nclass DbTaskHistoryTest(unittest.TestCase):\n    @with_config(dict(task_history=dict(db_connection=\"sqlite:///:memory:\")))\n    def setUp(self):\n        self.history = DbTaskHistory()\n\n    def test_task_list(self):\n        self.run_task(DummyTask())\n        self.run_task(DummyTask(foo=\"bar\"))\n\n        with self.history._session() as session:\n            tasks = list(self.history.find_all_by_name(\"DummyTask\", session))\n\n            self.assertEqual(len(tasks), 2)\n            for task in tasks:\n                self.assertEqual(task.name, \"DummyTask\")\n                self.assertEqual(task.host, \"hostname\")\n\n    def test_task_events(self):\n        self.run_task(DummyTask())\n\n        with self.history._session() as session:\n            tasks = 
list(self.history.find_all_by_name(\"DummyTask\", session))\n            self.assertEqual(len(tasks), 1)\n            [task] = tasks\n            self.assertEqual(task.name, \"DummyTask\")\n            self.assertEqual(len(task.events), 3)\n            for event, name in zip(task.events, [DONE, RUNNING, PENDING]):\n                self.assertEqual(event.event_name, name)\n\n    def test_task_by_params(self):\n        task1 = ParamTask(\"foo\", \"bar\")\n        task2 = ParamTask(\"bar\", \"foo\")\n\n        with self.history._session() as session:\n            self.run_task(task1)\n            self.run_task(task2)\n            task1_record = self.history.find_all_by_parameters(task_name=\"ParamTask\", session=session, param1=\"foo\", param2=\"bar\")\n            task2_record = self.history.find_all_by_parameters(task_name=\"ParamTask\", session=session, param1=\"bar\", param2=\"foo\")\n            for task, records in zip((task1, task2), (task1_record, task2_record)):\n                records = list(records)\n                self.assertEqual(len(records), 1)\n                [record] = records\n                self.assertEqual(task.task_family, record.name)\n                for param_name, param_value in task.param_kwargs.items():\n                    self.assertTrue(param_name in record.parameters)\n                    self.assertEqual(str(param_value), record.parameters[param_name].value)\n\n    def test_task_blank_param(self):\n        self.run_task(DummyTask(foo=\"\"))\n\n        with self.history._session() as session:\n            tasks = list(self.history.find_all_by_name(\"DummyTask\", session))\n\n            self.assertEqual(len(tasks), 1)\n            task_record = tasks[0]\n            self.assertEqual(task_record.name, \"DummyTask\")\n            self.assertEqual(task_record.host, \"hostname\")\n            self.assertIn(\"foo\", task_record.parameters)\n            self.assertEqual(task_record.parameters[\"foo\"].value, \"\")\n\n    def run_task(self, 
task):\n        task2 = luigi.scheduler.Task(\n            task.task_id, PENDING, [], family=task.task_family, params=task.param_kwargs, retry_policy=luigi.scheduler._get_empty_retry_policy()\n        )\n\n        self.history.task_scheduled(task2)\n        self.history.task_started(task2, \"hostname\")\n        self.history.task_finished(task2, successful=True)\n\n\nclass MySQLDbTaskHistoryTest(unittest.TestCase):\n    @with_config(dict(task_history=dict(db_connection=\"mysql+mysqlconnector://travis@localhost/luigi_test\")))\n    def setUp(self):\n        try:\n            self.history = DbTaskHistory()\n        except Exception:\n            raise unittest.SkipTest(\"DBTaskHistory cannot be created: probably no MySQL available\")\n\n    def test_subsecond_timestamp(self):\n        with self.history._session() as session:\n            # Add 2 events in <1s\n            task = DummyTask()\n            self.run_task(task)\n\n            task_record = next(self.history.find_all_by_name(\"DummyTask\", session))\n            print(task_record.events)\n            self.assertEqual(task_record.events[0].event_name, DONE)\n\n    def test_utc_conversion(self):\n        from luigi.server import from_utc\n\n        with self.history._session() as session:\n            task = DummyTask()\n            self.run_task(task)\n\n            task_record = next(self.history.find_all_by_name(\"DummyTask\", session))\n            last_event = task_record.events[0]\n            try:\n                print(from_utc(str(last_event.ts)))\n            except ValueError:\n                self.fail(\"Failed to convert timestamp {} to UTC\".format(last_event.ts))\n\n    def run_task(self, task):\n        task2 = luigi.scheduler.Task(\n            task.task_id, PENDING, [], family=task.task_family, params=task.param_kwargs, retry_policy=luigi.scheduler._get_empty_retry_policy()\n        )\n\n        self.history.task_scheduled(task2)\n        self.history.task_started(task2, \"hostname\")\n     
   self.history.task_finished(task2, successful=True)\n"
  },
  {
    "path": "test/decorator_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport datetime\nimport pickle\n\nfrom helpers import unittest\n\nimport luigi\nimport luigi.notifications\nfrom luigi.mock import MockTarget\nfrom luigi.parameter import MissingParameterException\nfrom luigi.util import common_params, copies, delegates, inherits, requires\n\nluigi.notifications.DEBUG = True\n\n\nclass A(luigi.Task):\n    task_namespace = \"decorator\"  # to prevent task name conflict between tests\n    param1 = luigi.Parameter(\"class A-specific default\")\n\n\n@inherits(A)\nclass B(luigi.Task):\n    param2 = luigi.Parameter(\"class B-specific default\")\n\n\n@inherits(B)\nclass C(luigi.Task):\n    param3 = luigi.Parameter(\"class C-specific default\")\n\n\n@inherits(B)\nclass D(luigi.Task):\n    param1 = luigi.Parameter(\"class D overwriting class A's default\")\n\n\n@inherits(B)\nclass D_null(luigi.Task):\n    param1 = None\n\n\n@inherits(A, B)\nclass E(luigi.Task):\n    param4 = luigi.Parameter(\"class E-specific default\")\n\n\n@inherits(A)\n@inherits(B)\nclass E_stacked(luigi.Task):\n    param4 = luigi.Parameter(\"class E-specific default\")\n\n\nclass InheritTest(unittest.TestCase):\n    def setUp(self):\n        self.a = A()\n        self.a_changed = A(param1=34)\n        self.b = B()\n        self.c = C()\n        self.d = D()\n        self.d_null = D_null()\n        self.e = E()\n        self.e_stacked = 
E_stacked()\n\n    def test_has_param(self):\n        b_params = dict(self.b.get_params()).keys()\n        self.assertTrue(\"param1\" in b_params)\n\n    def test_default_param(self):\n        self.assertEqual(self.b.param1, self.a.param1)\n\n    def test_change_of_defaults_not_equal(self):\n        self.assertNotEqual(self.b.param1, self.a_changed.param1)\n\n    def tested_chained_inheritance(self):\n        self.assertEqual(self.c.param2, self.b.param2)\n        self.assertEqual(self.c.param1, self.a.param1)\n        self.assertEqual(self.c.param1, self.b.param1)\n\n    def test_overwriting_defaults(self):\n        self.assertEqual(self.d.param2, self.b.param2)\n        self.assertNotEqual(self.d.param1, self.b.param1)\n        self.assertNotEqual(self.d.param1, self.a.param1)\n        self.assertEqual(self.d.param1, \"class D overwriting class A's default\")\n\n    def test_multiple_inheritance(self):\n        self.assertEqual(self.e.param1, self.a.param1)\n        self.assertEqual(self.e.param1, self.b.param1)\n        self.assertEqual(self.e.param2, self.b.param2)\n\n    def test_stacked_inheritance(self):\n        self.assertEqual(self.e_stacked.param1, self.a.param1)\n        self.assertEqual(self.e_stacked.param1, self.b.param1)\n        self.assertEqual(self.e_stacked.param2, self.b.param2)\n\n    def test_empty_inheritance(self):\n        with self.assertRaises(TypeError):\n\n            @inherits()\n            class shouldfail(luigi.Task):\n                pass\n\n    def test_removing_parameter(self):\n        self.assertFalse(\"param1\" in dict(self.d_null.get_params()).keys())\n\n    def test_wrapper_preserve_attributes(self):\n        self.assertEqual(B.__name__, \"B\")\n\n\nclass F(luigi.Task):\n    param1 = luigi.Parameter(\"A parameter on a base task, that will be required later.\")\n\n\n@inherits(F)\nclass G(luigi.Task):\n    param2 = luigi.Parameter(\"A separate parameter that doesn't affect 'F'\")\n\n    def requires(self):\n        return 
F(**common_params(self, F))\n\n\n@inherits(G)\nclass H(luigi.Task):\n    param2 = luigi.Parameter(\"OVERWRITING\")\n\n    def requires(self):\n        return G(**common_params(self, G))\n\n\n@inherits(G)\nclass H_null(luigi.Task):\n    param2 = None\n\n    def requires(self):\n        special_param2 = str(datetime.datetime.now())\n        return G(param2=special_param2, **common_params(self, G))\n\n\n@inherits(G)\nclass I_task(luigi.Task):\n    def requires(self):\n        return F(**common_params(self, F))\n\n\nclass J(luigi.Task):\n    param1 = luigi.Parameter()  # something required, with no default\n\n\n@inherits(J)\nclass K_shouldnotinstantiate(luigi.Task):\n    param2 = luigi.Parameter(\"A K-specific parameter\")\n\n\n@inherits(J)\nclass K_shouldfail(luigi.Task):\n    param1 = None\n    param2 = luigi.Parameter(\"A K-specific parameter\")\n\n    def requires(self):\n        return J(**common_params(self, J))\n\n\n@inherits(J)\nclass K_shouldsucceed(luigi.Task):\n    param1 = None\n    param2 = luigi.Parameter(\"A K-specific parameter\")\n\n    def requires(self):\n        return J(param1=\"Required parameter\", **common_params(self, J))\n\n\n@inherits(J)\nclass K_wrongparamsorder(luigi.Task):\n    param1 = None\n    param2 = luigi.Parameter(\"A K-specific parameter\")\n\n    def requires(self):\n        return J(param1=\"Required parameter\", **common_params(J, self))\n\n\nclass RequiresTest(unittest.TestCase):\n    def setUp(self):\n        self.f = F()\n        self.g = G()\n        self.g_changed = G(param1=\"changing the default\")\n        self.h = H()\n        self.h_null = H_null()\n        self.i = I_task()\n        self.k_shouldfail = K_shouldfail()\n        self.k_shouldsucceed = K_shouldsucceed()\n        self.k_wrongparamsorder = K_wrongparamsorder()\n\n    def test_inherits(self):\n        self.assertEqual(self.f.param1, self.g.param1)\n        self.assertEqual(self.f.param1, self.g.requires().param1)\n\n    def test_change_of_defaults(self):\n   
     self.assertNotEqual(self.f.param1, self.g_changed.param1)\n        self.assertNotEqual(self.g.param1, self.g_changed.param1)\n        self.assertNotEqual(self.f.param1, self.g_changed.requires().param1)\n\n    def test_overwriting_parameter(self):\n        self.h.requires()\n        self.assertNotEqual(self.h.param2, self.g.param2)\n        self.assertEqual(self.h.param2, self.h.requires().param2)\n        self.assertEqual(self.h.param2, \"OVERWRITING\")\n\n    def test_skipping_one_inheritance(self):\n        self.assertEqual(self.i.requires().param1, self.f.param1)\n\n    def test_removing_parameter(self):\n        self.assertNotEqual(self.h_null.requires().param2, self.g.param2)\n\n    def test_not_setting_required_parameter(self):\n        self.assertRaises(MissingParameterException, self.k_shouldfail.requires)\n\n    def test_setting_required_parameters(self):\n        self.k_shouldsucceed.requires()\n\n    def test_should_not_instantiate(self):\n        self.assertRaises(MissingParameterException, K_shouldnotinstantiate)\n\n    def test_resuscitation(self):\n        k = K_shouldnotinstantiate(param1=\"hello\")\n        k.requires()\n\n    def test_wrong_common_params_order(self):\n        self.assertRaises(TypeError, self.k_wrongparamsorder.requires)\n\n\nclass V(luigi.Task):\n    n = luigi.IntParameter(default=42)\n\n\n@inherits(V)\nclass W(luigi.Task):\n    def requires(self):\n        return self.clone_parent()\n\n\n@requires(V)\nclass W2(luigi.Task):\n    pass\n\n\n@requires(V)\nclass W3(luigi.Task):\n    n = luigi.IntParameter(default=43)\n\n\nclass X(luigi.Task):\n    m = luigi.IntParameter(default=56)\n\n\n@requires(V, X)\nclass Y(luigi.Task):\n    pass\n\n\nclass CloneParentTest(unittest.TestCase):\n    def test_clone_parent(self):\n        w = W()\n        v = V()\n        self.assertEqual(w.requires(), v)\n        self.assertEqual(w.n, 42)\n\n    def test_requires(self):\n        w2 = W2()\n        v = V()\n        
self.assertEqual(w2.requires(), v)\n        self.assertEqual(w2.n, 42)\n\n    def test_requires_override_default(self):\n        w3 = W3()\n        v = V()\n        self.assertNotEqual(w3.requires(), v)\n        self.assertEqual(w3.n, 43)\n        self.assertEqual(w3.requires().n, 43)\n\n    def test_multiple_requires(self):\n        y = Y()\n        v = V()\n        x = X()\n        self.assertEqual(y.requires()[0], v)\n        self.assertEqual(y.requires()[1], x)\n\n    def test_empty_requires(self):\n        with self.assertRaises(TypeError):\n\n            @requires()\n            class shouldfail(luigi.Task):\n                pass\n\n    def test_names(self):\n        # Just make sure the decorators retain the original class names\n        v = V()\n        self.assertEqual(str(v), \"V(n=42)\")\n        self.assertEqual(v.__class__.__name__, \"V\")\n\n\nclass P(luigi.Task):\n    date = luigi.DateParameter()\n\n    def output(self):\n        return MockTarget(self.date.strftime(\"/tmp/data-%Y-%m-%d.txt\"))\n\n    def run(self):\n        f = self.output().open(\"w\")\n        print(\"hello, world\", file=f)\n        f.close()\n\n\n@copies(P)\nclass PCopy(luigi.Task):\n    def output(self):\n        return MockTarget(self.date.strftime(\"/tmp/copy-data-%Y-%m-%d.txt\"))\n\n\nclass CopyTest(unittest.TestCase):\n    def test_copy(self):\n        luigi.build([PCopy(date=datetime.date(2012, 1, 1))], local_scheduler=True)\n        self.assertEqual(MockTarget.fs.get_data(\"/tmp/data-2012-01-01.txt\"), b\"hello, world\\n\")\n        self.assertEqual(MockTarget.fs.get_data(\"/tmp/copy-data-2012-01-01.txt\"), b\"hello, world\\n\")\n\n\nclass PickleTest(unittest.TestCase):\n    def test_pickle(self):\n        # similar to CopyTest.test_copy\n        p = PCopy(date=datetime.date(2013, 1, 1))\n        p_pickled = pickle.dumps(p)\n        p = pickle.loads(p_pickled)\n\n        luigi.build([p], local_scheduler=True)\n        
self.assertEqual(MockTarget.fs.get_data(\"/tmp/data-2013-01-01.txt\"), b\"hello, world\\n\")\n        self.assertEqual(MockTarget.fs.get_data(\"/tmp/copy-data-2013-01-01.txt\"), b\"hello, world\\n\")\n\n\nclass Subtask(luigi.Task):\n    k = luigi.IntParameter()\n\n    def f(self, x):\n        return x**self.k\n\n\n@delegates\nclass SubtaskDelegator(luigi.Task):\n    def subtasks(self):\n        return [Subtask(1), Subtask(2)]\n\n    def run(self):\n        self.s = 0\n        for t in self.subtasks():\n            self.s += t.f(42)\n\n\nclass SubtaskTest(unittest.TestCase):\n    def test_subtasks(self):\n        sd = SubtaskDelegator()\n        luigi.build([sd], local_scheduler=True)\n        self.assertEqual(sd.s, 42 * (1 + 42))\n\n    def test_forgot_subtasks(self):\n        def trigger_failure():\n            @delegates\n            class SubtaskDelegatorBroken(luigi.Task):\n                pass\n\n        self.assertRaises(AttributeError, trigger_failure)\n\n    def test_cmdline(self):\n        # Exposes issue where wrapped tasks are registered twice under\n        # the same name\n        from luigi.task import Register\n\n        self.assertEqual(Register.get_task_cls(\"SubtaskDelegator\"), SubtaskDelegator)\n"
  },
  {
    "path": "test/dict_parameter_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport collections\nimport json\n\nimport mock\nimport pytest\nfrom helpers import in_parse, unittest\nfrom jsonschema import Draft4Validator\nfrom jsonschema.exceptions import ValidationError\n\nimport luigi\nimport luigi.interface\n\n\nclass DictParameterTask(luigi.Task):\n    param = luigi.DictParameter()\n\n\nclass DictParameterTest(unittest.TestCase):\n    _dict = collections.OrderedDict([(\"username\", \"me\"), (\"password\", \"secret\")])\n\n    def test_parse(self):\n        d = luigi.DictParameter().parse(json.dumps(DictParameterTest._dict))\n        self.assertEqual(d, DictParameterTest._dict)\n\n    def test_serialize(self):\n        d = luigi.DictParameter().serialize(DictParameterTest._dict)\n        self.assertEqual(d, '{\"username\": \"me\", \"password\": \"secret\"}')\n\n    def test_parse_and_serialize(self):\n        inputs = ['{\"username\": \"me\", \"password\": \"secret\"}', '{\"password\": \"secret\", \"username\": \"me\"}']\n        for json_input in inputs:\n            _dict = luigi.DictParameter().parse(json_input)\n            self.assertEqual(json_input, luigi.DictParameter().serialize(_dict))\n\n    def test_parse_interface(self):\n        in_parse(\n            [\"DictParameterTask\", \"--param\", '{\"username\": \"me\", \"password\": \"secret\"}'], lambda task: self.assertEqual(task.param, 
DictParameterTest._dict)\n        )\n\n    def test_serialize_task(self):\n        t = DictParameterTask(DictParameterTest._dict)\n        self.assertEqual(str(t), 'DictParameterTask(param={\"username\": \"me\", \"password\": \"secret\"})')\n\n    def test_parse_invalid_input(self):\n        self.assertRaises(ValueError, lambda: luigi.DictParameter().parse('{\"invalid\"}'))\n\n    def test_hash_normalize(self):\n        self.assertRaises(TypeError, lambda: hash(luigi.DictParameter().parse('{\"a\": {\"b\": []}}')))\n        a = luigi.DictParameter().normalize({\"a\": [{\"b\": []}]})\n        b = luigi.DictParameter().normalize({\"a\": [{\"b\": []}]})\n        self.assertEqual(hash(a), hash(b))\n\n    def test_schema(self):\n        a = luigi.parameter.DictParameter(\n            schema={\n                \"type\": \"object\",\n                \"properties\": {\n                    \"an_int\": {\"type\": \"integer\"},\n                    \"an_optional_str\": {\"type\": \"string\"},\n                },\n                \"additionalProperties\": False,\n                \"required\": [\"an_int\"],\n            },\n        )\n\n        # Check that the default value is validated\n        with pytest.raises(\n            ValidationError,\n            match=r\"Additional properties are not allowed \\('INVALID_ATTRIBUTE' was unexpected\\)\",\n        ):\n            a.normalize({\"INVALID_ATTRIBUTE\": 0})\n\n        # Check that empty dict is not valid\n        with pytest.raises(ValidationError, match=\"'an_int' is a required property\"):\n            a.normalize({})\n\n        # Check that valid dicts work\n        a.normalize({\"an_int\": 1})\n        a.normalize({\"an_int\": 1, \"an_optional_str\": \"hello\"})\n\n        # Check that invalid dicts raise correct errors\n        with pytest.raises(ValidationError, match=\"'999' is not of type 'integer'\"):\n            a.normalize({\"an_int\": \"999\"})\n\n        with pytest.raises(ValidationError, match=\"999 is not of 
type 'string'\"):\n            a.normalize({\"an_int\": 1, \"an_optional_str\": 999})\n\n        # Test the example given in docstring\n        b = luigi.DictParameter(\n            schema={\n                \"type\": \"object\",\n                \"patternProperties\": {\n                    \".*\": {\"type\": \"string\", \"enum\": [\"web\", \"staging\"]},\n                },\n            }\n        )\n        b.normalize({\"role\": \"web\", \"env\": \"staging\"})\n        with pytest.raises(ValidationError, match=r\"'UNKNOWN_VALUE' is not one of \\['web', 'staging'\\]\"):\n            b.normalize({\"role\": \"UNKNOWN_VALUE\", \"env\": \"staging\"})\n\n        # Check that warnings are properly emitted\n        with mock.patch(\"luigi.parameter._JSONSCHEMA_ENABLED\", False):\n            with pytest.warns(\n                UserWarning, match=(\"The 'jsonschema' package is not installed so the parameter can not be validated even though a schema is given.\")\n            ):\n                luigi.ListParameter(schema={\"type\": \"object\"})\n\n        # Test with a custom validator\n        validator = Draft4Validator(\n            schema={\n                \"type\": \"object\",\n                \"patternProperties\": {\n                    \".*\": {\"type\": \"string\", \"enum\": [\"web\", \"staging\"]},\n                },\n            }\n        )\n        c = luigi.DictParameter(schema=validator)\n        c.normalize({\"role\": \"web\", \"env\": \"staging\"})\n        with pytest.raises(ValidationError, match=r\"'UNKNOWN_VALUE' is not one of \\['web', 'staging'\\]\"):\n            c.normalize({\"role\": \"UNKNOWN_VALUE\", \"env\": \"staging\"})\n\n        # Test with frozen data\n        frozen_data = luigi.freezing.recursively_freeze({\"role\": \"web\", \"env\": \"staging\"})\n        c.normalize(frozen_data)\n"
  },
  {
    "path": "test/dynamic_import_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import LuigiTestCase, temporary_unloaded_module\n\nimport luigi\nimport luigi.interface\n\nCONTENTS = b\"\"\"\nimport luigi\n\nclass FooTask(luigi.Task):\n    x = luigi.IntParameter()\n\n    def run(self):\n        luigi._testing_glob_var = self.x\n\"\"\"\n\n\nclass CmdlineTest(LuigiTestCase):\n    def test_dynamic_loading(self):\n        with temporary_unloaded_module(CONTENTS) as temp_module_name:\n            luigi.interface.run([\"--module\", temp_module_name, \"FooTask\", \"--x\", \"123\", \"--local-scheduler\", \"--no-lock\"])\n            self.assertEqual(luigi._testing_glob_var, 123)\n"
  },
  {
    "path": "test/event_callbacks_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import unittest\nfrom mock import patch\n\nimport luigi\nfrom luigi import Event, Task, build\nfrom luigi.mock import MockFileSystem, MockTarget\nfrom luigi.task import flatten\n\n\nclass DummyException(Exception):\n    pass\n\n\nclass EmptyTask(Task):\n    fail = luigi.BoolParameter()\n\n    def run(self):\n        self.trigger_event(Event.PROGRESS, self, {\"foo\": \"bar\"})\n        if self.fail:\n            raise DummyException()\n\n\nclass TaskWithBrokenDependency(Task):\n    def requires(self):\n        raise DummyException()\n\n    def run(self):\n        pass\n\n\nclass TaskWithCallback(Task):\n    def run(self):\n        print(\"Triggering event\")\n        self.trigger_event(\"foo event\")\n\n\nclass TestEventCallbacks(unittest.TestCase):\n    def test_start_handler(self):\n        saved_tasks = []\n\n        @EmptyTask.event_handler(Event.START)\n        def save_task(task):\n            print(\"Saving task...\")\n            saved_tasks.append(task)\n\n        t = EmptyTask(True)\n        build([t], local_scheduler=True)\n        self.assertEqual(saved_tasks, [t])\n\n    def _run_empty_task(self, fail):\n        progresses = []\n        progresses_data = []\n        successes = []\n        failures = []\n        exceptions = []\n\n        @EmptyTask.event_handler(Event.SUCCESS)\n        def success(task):\n      
      successes.append(task)\n\n        @EmptyTask.event_handler(Event.FAILURE)\n        def failure(task, exception):\n            failures.append(task)\n            exceptions.append(exception)\n\n        @EmptyTask.event_handler(Event.PROGRESS)\n        def progress(task, data):\n            progresses.append(task)\n            progresses_data.append(data)\n\n        t = EmptyTask(fail)\n        build([t], local_scheduler=True)\n        return t, progresses, progresses_data, successes, failures, exceptions\n\n    def test_success(self):\n        t, progresses, progresses_data, successes, failures, exceptions = self._run_empty_task(False)\n        self.assertEqual(progresses, [t])\n        self.assertEqual(progresses_data, [{\"foo\": \"bar\"}])\n        self.assertEqual(successes, [t])\n        self.assertEqual(failures, [])\n        self.assertEqual(exceptions, [])\n\n    def test_failure(self):\n        t, progresses, progresses_data, successes, failures, exceptions = self._run_empty_task(True)\n        self.assertEqual(progresses, [t])\n        self.assertEqual(progresses_data, [{\"foo\": \"bar\"}])\n        self.assertEqual(successes, [])\n        self.assertEqual(failures, [t])\n        self.assertEqual(len(exceptions), 1)\n        self.assertTrue(isinstance(exceptions[0], DummyException))\n\n    def test_broken_dependency(self):\n        failures = []\n        exceptions = []\n\n        @TaskWithBrokenDependency.event_handler(Event.BROKEN_TASK)\n        def failure(task, exception):\n            failures.append(task)\n            exceptions.append(exception)\n\n        t = TaskWithBrokenDependency()\n        build([t], local_scheduler=True)\n\n        self.assertEqual(failures, [t])\n        self.assertEqual(len(exceptions), 1)\n        self.assertTrue(isinstance(exceptions[0], DummyException))\n\n    def test_custom_handler(self):\n        dummies = []\n\n        @TaskWithCallback.event_handler(\"foo event\")\n        def story_dummy():\n            
dummies.append(\"foo\")\n\n        t = TaskWithCallback()\n        build([t], local_scheduler=True)\n        self.assertEqual(dummies[0], \"foo\")\n\n    def _run_processing_time_handler(self, fail):\n        result = []\n\n        @EmptyTask.event_handler(Event.PROCESSING_TIME)\n        def save_task(task, processing_time):\n            result.append((task, processing_time))\n\n        times = [43.0, 1.0]\n        t = EmptyTask(fail)\n        with patch(\"luigi.worker.time\") as mock:\n            mock.time = times.pop\n            build([t], local_scheduler=True)\n\n        return t, result\n\n    def test_processing_time_handler_success(self):\n        t, result = self._run_processing_time_handler(False)\n        self.assertEqual(len(result), 1)\n        task, time = result[0]\n        self.assertTrue(task is t)\n        self.assertEqual(time, 42.0)\n\n    def test_processing_time_handler_failure(self):\n        t, result = self._run_processing_time_handler(True)\n        self.assertEqual(result, [])\n\n    def test_remove_event_handler(self):\n        run_cnt = 0\n\n        @EmptyTask.event_handler(luigi.Event.START)\n        def handler(task):\n            nonlocal run_cnt\n            run_cnt += 1\n\n        task = EmptyTask()\n        build([task], local_scheduler=True)\n        assert run_cnt == 1\n        EmptyTask.remove_event_handler(luigi.Event.START, handler)\n        build([task], local_scheduler=True)\n        assert run_cnt == 1\n\n\n#        A\n#      /   \\\n#    B(1)  B(2)\n#     |     |\n#    C(1)  C(2)\n#     |  \\  |  \\\n#    D(1)  D(2)  D(3)\n\n\ndef eval_contents(f):\n    with f.open(\"r\") as i:\n        return eval(i.read())\n\n\nclass ConsistentMockOutput:\n    \"\"\"\n    Computes output location and contents from the task and its parameters. 
Rids us of writing ad-hoc boilerplate output() et al.\n    \"\"\"\n\n    param = luigi.IntParameter(default=1)\n\n    def output(self):\n        return MockTarget(\"/%s/%u\" % (self.__class__.__name__, self.param))\n\n    def produce_output(self):\n        with self.output().open(\"w\") as o:\n            o.write(repr([self.task_id] + sorted([eval_contents(i) for i in flatten(self.input())])))\n\n\nclass HappyTestFriend(ConsistentMockOutput, luigi.Task):\n    \"\"\"\n    Does trivial \"work\", outputting the list of inputs. Results in a convenient lispy comparable.\n    \"\"\"\n\n    def run(self):\n        self.produce_output()\n\n\nclass D(ConsistentMockOutput, luigi.ExternalTask):\n    pass\n\n\nclass C(HappyTestFriend):\n    def requires(self):\n        return [D(self.param), D(self.param + 1)]\n\n\nclass B(HappyTestFriend):\n    def requires(self):\n        return C(self.param)\n\n\nclass A(HappyTestFriend):\n    task_namespace = \"event_callbacks\"  # to prevent task name conflict between tests\n\n    def requires(self):\n        return [B(1), B(2)]\n\n\nclass TestDependencyEvents(unittest.TestCase):\n    def tearDown(self):\n        MockFileSystem().remove(\"\")\n\n    def _run_test(self, task, expected_events):\n        actual_events = {}\n\n        # yucky to create separate callbacks; would be nicer if the callback\n        # received an instance of a subclass of Event, so one callback could\n        # accumulate all types\n        @luigi.Task.event_handler(Event.DEPENDENCY_DISCOVERED)\n        def callback_dependency_discovered(*args):\n            actual_events.setdefault(Event.DEPENDENCY_DISCOVERED, set()).add(tuple(map(lambda t: t.task_id, args)))\n\n        @luigi.Task.event_handler(Event.DEPENDENCY_MISSING)\n        def callback_dependency_missing(*args):\n            actual_events.setdefault(Event.DEPENDENCY_MISSING, set()).add(tuple(map(lambda t: t.task_id, args)))\n\n        @luigi.Task.event_handler(Event.DEPENDENCY_PRESENT)\n        def 
callback_dependency_present(*args):\n            actual_events.setdefault(Event.DEPENDENCY_PRESENT, set()).add(tuple(map(lambda t: t.task_id, args)))\n\n        build([task], local_scheduler=True)\n        self.assertEqual(actual_events, expected_events)\n\n    def test_incomplete_dag(self):\n        for param in range(1, 3):\n            D(param).produce_output()\n        self._run_test(\n            A(),\n            {\n                \"event.core.dependency.discovered\": {\n                    (A(param=1).task_id, B(param=1).task_id),\n                    (A(param=1).task_id, B(param=2).task_id),\n                    (B(param=1).task_id, C(param=1).task_id),\n                    (B(param=2).task_id, C(param=2).task_id),\n                    (C(param=1).task_id, D(param=1).task_id),\n                    (C(param=1).task_id, D(param=2).task_id),\n                    (C(param=2).task_id, D(param=2).task_id),\n                    (C(param=2).task_id, D(param=3).task_id),\n                },\n                \"event.core.dependency.missing\": {\n                    (D(param=3).task_id,),\n                },\n                \"event.core.dependency.present\": {\n                    (D(param=1).task_id,),\n                    (D(param=2).task_id,),\n                },\n            },\n        )\n        self.assertFalse(A().output().exists())\n\n    def test_complete_dag(self):\n        for param in range(1, 4):\n            D(param).produce_output()\n        self._run_test(\n            A(),\n            {\n                \"event.core.dependency.discovered\": {\n                    (A(param=1).task_id, B(param=1).task_id),\n                    (A(param=1).task_id, B(param=2).task_id),\n                    (B(param=1).task_id, C(param=1).task_id),\n                    (B(param=2).task_id, C(param=2).task_id),\n                    (C(param=1).task_id, D(param=1).task_id),\n                    (C(param=1).task_id, D(param=2).task_id),\n                    
(C(param=2).task_id, D(param=2).task_id),\n                    (C(param=2).task_id, D(param=3).task_id),\n                },\n                \"event.core.dependency.present\": {\n                    (D(param=1).task_id,),\n                    (D(param=2).task_id,),\n                    (D(param=3).task_id,),\n                },\n            },\n        )\n        self.assertEqual(\n            eval_contents(A().output()),\n            [\n                A(param=1).task_id,\n                [B(param=1).task_id, [C(param=1).task_id, [D(param=1).task_id], [D(param=2).task_id]]],\n                [B(param=2).task_id, [C(param=2).task_id, [D(param=2).task_id], [D(param=3).task_id]]],\n            ],\n        )\n"
  },
  {
    "path": "test/execution_summary_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport datetime\nimport threading\nfrom enum import Enum\n\nimport mock\nfrom helpers import LuigiTestCase, RunOnceTask, with_config\n\nimport luigi\nimport luigi.execution_summary\nimport luigi.worker\n\n\nclass ExecutionSummaryTest(LuigiTestCase):\n    def setUp(self):\n        super(ExecutionSummaryTest, self).setUp()\n        self.scheduler = luigi.scheduler.Scheduler(prune_on_get_work=False)\n        self.worker = luigi.worker.Worker(scheduler=self.scheduler)\n\n    def run_task(self, task):\n        self.worker.add(task)  # schedule\n        self.worker.run()  # run\n\n    def summary_dict(self):\n        return luigi.execution_summary._summary_dict(self.worker)\n\n    def summary(self):\n        return luigi.execution_summary.summary(self.worker)\n\n    def test_all_statuses(self):\n        class Bar(luigi.Task):\n            num = luigi.IntParameter()\n\n            def run(self):\n                if self.num == 0:\n                    raise ValueError()\n\n            def complete(self):\n                if self.num == 1:\n                    return True\n                return False\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(5):\n                    yield Bar(i)\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        self.assertEqual({Bar(num=1)}, 
d[\"already_done\"])\n        self.assertEqual({Bar(num=2), Bar(num=3), Bar(num=4)}, d[\"completed\"])\n        self.assertEqual({Bar(num=0)}, d[\"failed\"])\n        self.assertEqual({Foo()}, d[\"upstream_failure\"])\n        self.assertFalse(d[\"upstream_missing_dependency\"])\n        self.assertFalse(d[\"run_by_other_worker\"])\n        self.assertFalse(d[\"still_pending_ext\"])\n        summary = self.summary()\n\n        expected = [\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n            \"Scheduled 6 tasks of which:\",\n            \"* 1 complete ones were encountered:\",\n            \"    - 1 Bar(num=1)\",\n            \"* 3 ran successfully:\",\n            \"    - 3 Bar(num=2,3,4)\",\n            \"* 1 failed:\",\n            \"    - 1 Bar(num=0)\",\n            \"* 1 were left pending, among these:\",\n            \"    * 1 had failed dependencies:\",\n            \"        - 1 Foo()\",\n            \"\",\n            \"This progress looks :( because there were failed tasks\",\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n        ]\n        result = summary.split(\"\\n\")\n        self.assertEqual(len(result), len(expected))\n        for i, line in enumerate(result):\n            self.assertEqual(line, expected[i])\n\n    def test_batch_complete(self):\n        ran_tasks = set()\n\n        class MaxBatchTask(luigi.Task):\n            param = luigi.IntParameter(batch_method=max)\n\n            def run(self):\n                ran_tasks.add(self.param)\n\n            def complete(self):\n                return any(self.param <= ran_param for ran_param in ran_tasks)\n\n        class MaxBatches(luigi.WrapperTask):\n            def requires(self):\n                return map(MaxBatchTask, range(5))\n\n        self.run_task(MaxBatches())\n        d = self.summary_dict()\n        expected_completed = {\n            MaxBatchTask(0),\n            MaxBatchTask(1),\n    
        MaxBatchTask(2),\n            MaxBatchTask(3),\n            MaxBatchTask(4),\n            MaxBatches(),\n        }\n        self.assertEqual(expected_completed, d[\"completed\"])\n\n    def test_batch_fail(self):\n        class MaxBatchFailTask(luigi.Task):\n            param = luigi.IntParameter(batch_method=max)\n\n            def run(self):\n                assert self.param < 4\n\n            def complete(self):\n                return False\n\n        class MaxBatches(luigi.WrapperTask):\n            def requires(self):\n                return map(MaxBatchFailTask, range(5))\n\n        self.run_task(MaxBatches())\n        d = self.summary_dict()\n        expected_failed = {\n            MaxBatchFailTask(0),\n            MaxBatchFailTask(1),\n            MaxBatchFailTask(2),\n            MaxBatchFailTask(3),\n            MaxBatchFailTask(4),\n        }\n        self.assertEqual(expected_failed, d[\"failed\"])\n\n    def test_check_complete_error(self):\n        class Bar(luigi.Task):\n            def run(self):\n                pass\n\n            def complete(self):\n                raise Exception\n                return True\n\n        class Foo(luigi.Task):\n            def requires(self):\n                yield Bar()\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        self.assertEqual({Foo()}, d[\"still_pending_not_ext\"])\n        self.assertEqual({Foo()}, d[\"upstream_scheduling_error\"])\n        self.assertEqual({Bar()}, d[\"scheduling_error\"])\n        self.assertFalse(d[\"not_run\"])\n        self.assertFalse(d[\"already_done\"])\n        self.assertFalse(d[\"completed\"])\n        self.assertFalse(d[\"failed\"])\n        self.assertFalse(d[\"upstream_failure\"])\n        self.assertFalse(d[\"upstream_missing_dependency\"])\n        self.assertFalse(d[\"run_by_other_worker\"])\n        self.assertFalse(d[\"still_pending_ext\"])\n        summary = self.summary()\n        expected = [\n            \"\",\n            
\"===== Luigi Execution Summary =====\",\n            \"\",\n            \"Scheduled 2 tasks of which:\",\n            \"* 1 failed scheduling:\",\n            \"    - 1 Bar()\",\n            \"* 1 were left pending, among these:\",\n            \"    * 1 had dependencies whose scheduling failed:\",\n            \"        - 1 Foo()\",\n            \"\",\n            \"Did not run any tasks\",\n            \"This progress looks :( because there were tasks whose scheduling failed\",\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n        ]\n        result = summary.split(\"\\n\")\n        self.assertEqual(len(result), len(expected))\n        for i, line in enumerate(result):\n            self.assertEqual(line, expected[i])\n\n    def test_not_run_error(self):\n        class Bar(luigi.Task):\n            def complete(self):\n                return True\n\n        class Foo(luigi.Task):\n            def requires(self):\n                yield Bar()\n\n        def new_func(*args, **kwargs):\n            return None\n\n        with mock.patch(\"luigi.scheduler.Scheduler.add_task\", new_func):\n            self.run_task(Foo())\n\n        d = self.summary_dict()\n        self.assertEqual({Foo()}, d[\"still_pending_not_ext\"])\n        self.assertEqual({Foo()}, d[\"not_run\"])\n        self.assertEqual({Bar()}, d[\"already_done\"])\n        self.assertFalse(d[\"upstream_scheduling_error\"])\n        self.assertFalse(d[\"scheduling_error\"])\n        self.assertFalse(d[\"completed\"])\n        self.assertFalse(d[\"failed\"])\n        self.assertFalse(d[\"upstream_failure\"])\n        self.assertFalse(d[\"upstream_missing_dependency\"])\n        self.assertFalse(d[\"run_by_other_worker\"])\n        self.assertFalse(d[\"still_pending_ext\"])\n        summary = self.summary()\n        expected = [\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n            \"Scheduled 2 tasks of which:\",\n 
           \"* 1 complete ones were encountered:\",\n            \"    - 1 Bar()\",\n            \"* 1 were left pending, among these:\",\n            \"    * 1 was not granted run permission by the scheduler:\",\n            \"        - 1 Foo()\",\n            \"\",\n            \"Did not run any tasks\",\n            \"This progress looks :| because there were tasks that were not granted run permission by the scheduler\",\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n        ]\n        result = summary.split(\"\\n\")\n        self.assertEqual(len(result), len(expected))\n        for i, line in enumerate(result):\n            self.assertEqual(line, expected[i])\n\n    def test_deps_error(self):\n        class Bar(luigi.Task):\n            def run(self):\n                pass\n\n            def complete(self):\n                return True\n\n        class Foo(luigi.Task):\n            def requires(self):\n                raise Exception\n                yield Bar()\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        self.assertEqual({Foo()}, d[\"scheduling_error\"])\n        self.assertFalse(d[\"upstream_scheduling_error\"])\n        self.assertFalse(d[\"not_run\"])\n        self.assertFalse(d[\"already_done\"])\n        self.assertFalse(d[\"completed\"])\n        self.assertFalse(d[\"failed\"])\n        self.assertFalse(d[\"upstream_failure\"])\n        self.assertFalse(d[\"upstream_missing_dependency\"])\n        self.assertFalse(d[\"run_by_other_worker\"])\n        self.assertFalse(d[\"still_pending_ext\"])\n        summary = self.summary()\n        expected = [\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n            \"Scheduled 1 tasks of which:\",\n            \"* 1 failed scheduling:\",\n            \"    - 1 Foo()\",\n            \"\",\n            \"Did not run any tasks\",\n            \"This progress looks :( because there were tasks whose 
scheduling failed\",\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n        ]\n        result = summary.split(\"\\n\")\n        self.assertEqual(len(result), len(expected))\n        for i, line in enumerate(result):\n            self.assertEqual(line, expected[i])\n\n    @with_config({\"execution_summary\": {\"summary_length\": \"1\"}})\n    def test_config_summary_limit(self):\n        class Bar(luigi.Task):\n            num = luigi.IntParameter()\n\n            def run(self):\n                pass\n\n            def complete(self):\n                return True\n\n        class Biz(Bar):\n            pass\n\n        class Bat(Bar):\n            pass\n\n        class Wut(Bar):\n            pass\n\n        class Foo(luigi.Task):\n            def requires(self):\n                yield Bat(1)\n                yield Wut(1)\n                yield Biz(1)\n                for i in range(4):\n                    yield Bar(i)\n\n            def complete(self):\n                return False\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        self.assertEqual({Bat(1), Wut(1), Biz(1), Bar(0), Bar(1), Bar(2), Bar(3)}, d[\"already_done\"])\n        self.assertEqual({Foo()}, d[\"completed\"])\n        self.assertFalse(d[\"failed\"])\n        self.assertFalse(d[\"upstream_failure\"])\n        self.assertFalse(d[\"upstream_missing_dependency\"])\n        self.assertFalse(d[\"run_by_other_worker\"])\n        self.assertFalse(d[\"still_pending_ext\"])\n        summary = self.summary()\n        expected = [\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n            \"Scheduled 8 tasks of which:\",\n            \"* 7 complete ones were encountered:\",\n            \"    - 4 Bar(num=0...3)\",\n            \"    ...\",\n            \"* 1 ran successfully:\",\n            \"    - 1 Foo()\",\n            \"\",\n            \"This progress looks :) because there were no failed 
tasks or missing dependencies\",\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n        ]\n        result = summary.split(\"\\n\")\n        self.assertEqual(len(result), len(expected))\n        for i, line in enumerate(result):\n            self.assertEqual(line, expected[i])\n\n    def test_upstream_not_running(self):\n        class ExternalBar(luigi.ExternalTask):\n            num = luigi.IntParameter()\n\n            def complete(self):\n                if self.num == 1:\n                    return True\n                return False\n\n        class Bar(luigi.Task):\n            num = luigi.IntParameter()\n\n            def run(self):\n                if self.num == 0:\n                    raise ValueError()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(5):\n                    yield ExternalBar(i)\n                    yield Bar(i)\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        self.assertEqual({ExternalBar(num=1)}, d[\"already_done\"])\n        self.assertEqual({Bar(num=1), Bar(num=2), Bar(num=3), Bar(num=4)}, d[\"completed\"])\n        self.assertEqual({Bar(num=0)}, d[\"failed\"])\n        self.assertEqual({Foo()}, d[\"upstream_failure\"])\n        self.assertEqual({Foo()}, d[\"upstream_missing_dependency\"])\n        self.assertFalse(d[\"run_by_other_worker\"])\n        self.assertEqual({ExternalBar(num=0), ExternalBar(num=2), ExternalBar(num=3), ExternalBar(num=4)}, d[\"still_pending_ext\"])\n        s = self.summary()\n        self.assertIn(\"\\n* 1 complete ones were encountered:\\n    - 1 ExternalBar(num=1)\\n\", s)\n        self.assertIn(\"\\n* 4 ran successfully:\\n    - 4 Bar(num=1...4)\\n\", s)\n        self.assertIn(\"\\n* 1 failed:\\n    - 1 Bar(num=0)\\n\", s)\n        self.assertIn(\"\\n* 5 were left pending, among these:\\n    * 4 were missing external dependencies:\\n        - 4 ExternalBar(num=\", s)\n        
self.assertIn(\n            \"\\n    * 1 had failed dependencies:\\n\"\n            \"        - 1 Foo()\\n\"\n            \"    * 1 had missing dependencies:\\n\"\n            \"        - 1 Foo()\\n\\n\"\n            \"This progress looks :( because there were failed tasks\\n\",\n            s,\n        )\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_already_running(self):\n        lock1 = threading.Lock()\n        lock2 = threading.Lock()\n\n        class ParentTask(RunOnceTask):\n            def requires(self):\n                yield LockTask()\n\n        class LockTask(RunOnceTask):\n            def run(self):\n                lock2.release()\n                lock1.acquire()\n                self.comp = True\n\n        lock1.acquire()\n        lock2.acquire()\n        other_worker = luigi.worker.Worker(scheduler=self.scheduler, worker_id=\"other_worker\")\n        other_worker.add(ParentTask())\n        t1 = threading.Thread(target=other_worker.run)\n        t1.start()\n        lock2.acquire()\n        self.run_task(ParentTask())\n        lock1.release()\n        t1.join()\n        d = self.summary_dict()\n        self.assertEqual({LockTask()}, d[\"run_by_other_worker\"])\n        self.assertEqual({ParentTask()}, d[\"upstream_run_by_other_worker\"])\n        s = self.summary()\n        self.assertIn(\n            \"\\nScheduled 2 tasks of which:\\n\"\n            \"* 2 were left pending, among these:\\n\"\n            \"    * 1 were being run by another worker:\\n\"\n            \"        - 1 LockTask()\\n\"\n            \"    * 1 had dependencies that were being run by other worker:\\n\"\n            \"        - 1 ParentTask()\\n\",\n            s,\n        )\n        self.assertIn(\n            \"\\n\\nThe other workers were:\\n\"\n            \"    - other_worker ran 1 tasks\\n\\n\"\n            \"Did not run any tasks\\n\"\n            \"This progress looks :) because there were no failed \"\n            \"tasks or missing dependencies\\n\",\n  
          s,\n        )\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_already_running_2(self):\n        class AlreadyRunningTask(luigi.Task):\n            def run(self):\n                pass\n\n        other_worker = luigi.worker.Worker(scheduler=self.scheduler, worker_id=\"other_worker\")\n        other_worker.add(AlreadyRunningTask())  # This also registers this worker\n        old_func = luigi.scheduler.Scheduler.get_work\n\n        def new_func(*args, **kwargs):\n            new_kwargs = kwargs.copy()\n            new_kwargs[\"worker\"] = \"other_worker\"\n            old_func(*args, **new_kwargs)\n            return old_func(*args, **kwargs)\n\n        with mock.patch(\"luigi.scheduler.Scheduler.get_work\", new_func):\n            self.run_task(AlreadyRunningTask())\n\n        d = self.summary_dict()\n        self.assertFalse(d[\"already_done\"])\n        self.assertFalse(d[\"completed\"])\n        self.assertFalse(d[\"not_run\"])\n        self.assertEqual({AlreadyRunningTask()}, d[\"run_by_other_worker\"])\n\n    def test_not_run(self):\n        class AlreadyRunningTask(luigi.Task):\n            def run(self):\n                pass\n\n        other_worker = luigi.worker.Worker(scheduler=self.scheduler, worker_id=\"other_worker\")\n        other_worker.add(AlreadyRunningTask())  # This also registers this worker\n        old_func = luigi.scheduler.Scheduler.get_work\n\n        def new_func(*args, **kwargs):\n            kwargs[\"current_tasks\"] = None\n            old_func(*args, **kwargs)\n            return old_func(*args, **kwargs)\n\n        with mock.patch(\"luigi.scheduler.Scheduler.get_work\", new_func):\n            self.run_task(AlreadyRunningTask())\n\n        d = self.summary_dict()\n        self.assertFalse(d[\"already_done\"])\n        self.assertFalse(d[\"completed\"])\n        self.assertFalse(d[\"run_by_other_worker\"])\n        self.assertEqual({AlreadyRunningTask()}, d[\"not_run\"])\n\n        s = self.summary()\n        
self.assertIn(\n            \"\\nScheduled 1 tasks of which:\\n\"\n            \"* 1 were left pending, among these:\\n\"\n            \"    * 1 was not granted run permission by the scheduler:\\n\"\n            \"        - 1 AlreadyRunningTask()\\n\",\n            s,\n        )\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_somebody_else_finish_task(self):\n        class SomeTask(RunOnceTask):\n            pass\n\n        other_worker = luigi.worker.Worker(scheduler=self.scheduler, worker_id=\"other_worker\")\n\n        self.worker.add(SomeTask())\n        other_worker.add(SomeTask())\n        other_worker.run()\n        self.worker.run()\n\n        d = self.summary_dict()\n        self.assertFalse(d[\"already_done\"])\n        self.assertFalse(d[\"completed\"])\n        self.assertFalse(d[\"run_by_other_worker\"])\n        self.assertEqual({SomeTask()}, d[\"not_run\"])\n\n    def test_somebody_else_disables_task(self):\n        class SomeTask(luigi.Task):\n            def complete(self):\n                return False\n\n            def run(self):\n                raise ValueError()\n\n        other_worker = luigi.worker.Worker(scheduler=self.scheduler, worker_id=\"other_worker\")\n\n        self.worker.add(SomeTask())\n        other_worker.add(SomeTask())\n        other_worker.run()  # Assuming it is disabled for a while after this\n        self.worker.run()\n\n        d = self.summary_dict()\n        self.assertFalse(d[\"already_done\"])\n        self.assertFalse(d[\"completed\"])\n        self.assertFalse(d[\"run_by_other_worker\"])\n        self.assertEqual({SomeTask()}, d[\"not_run\"])\n\n    def test_larger_tree(self):\n\n        class Dog(RunOnceTask):\n            def requires(self):\n                yield Cat(2)\n\n        class Cat(luigi.Task):\n            num = luigi.IntParameter()\n\n            def __init__(self, *args, **kwargs):\n                super(Cat, self).__init__(*args, **kwargs)\n                self.comp = False\n\n           
 def run(self):\n                if self.num == 2:\n                    raise ValueError()\n                self.comp = True\n\n            def complete(self):\n                if self.num == 1:\n                    return True\n                else:\n                    return self.comp\n\n        class Bar(RunOnceTask):\n            num = luigi.IntParameter()\n\n            def requires(self):\n                if self.num == 0:\n                    yield ExternalBar()\n                    yield Cat(0)\n                if self.num == 1:\n                    yield Cat(0)\n                    yield Cat(1)\n                if self.num == 2:\n                    yield Dog()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(3):\n                    yield Bar(i)\n\n        class ExternalBar(luigi.ExternalTask):\n            def complete(self):\n                return False\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n\n        self.assertEqual({Cat(num=1)}, d[\"already_done\"])\n        self.assertEqual({Cat(num=0), Bar(num=1)}, d[\"completed\"])\n        self.assertEqual({Cat(num=2)}, d[\"failed\"])\n        self.assertEqual({Dog(), Bar(num=2), Foo()}, d[\"upstream_failure\"])\n        self.assertEqual({Bar(num=0), Foo()}, d[\"upstream_missing_dependency\"])\n        self.assertFalse(d[\"run_by_other_worker\"])\n        self.assertEqual({ExternalBar()}, d[\"still_pending_ext\"])\n        s = self.summary()\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_with_dates(self):\n        \"\"\"Just test that it doesn't crash with date params\"\"\"\n\n        start = datetime.date(1998, 3, 23)\n\n        class Bar(RunOnceTask):\n            date = luigi.DateParameter()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(10):\n                    new_date = start + datetime.timedelta(days=i)\n                    yield Bar(date=new_date)\n\n        
self.run_task(Foo())\n        d = self.summary_dict()\n        exp_set = {Bar(start + datetime.timedelta(days=i)) for i in range(10)}\n        exp_set.add(Foo())\n        self.assertEqual(exp_set, d[\"completed\"])\n        s = self.summary()\n        self.assertIn(\"date=1998-0\", s)\n        self.assertIn(\"Scheduled 11 tasks\", s)\n        self.assertIn(\"Luigi Execution Summary\", s)\n        self.assertNotIn(\"00:00:00\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_with_ranges_minutes(self):\n\n        start = datetime.datetime(1998, 3, 23, 1, 50)\n\n        class Bar(RunOnceTask):\n            time = luigi.DateMinuteParameter()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(300):\n                    new_time = start + datetime.timedelta(minutes=i)\n                    yield Bar(time=new_time)\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        exp_set = {Bar(start + datetime.timedelta(minutes=i)) for i in range(300)}\n        exp_set.add(Foo())\n        self.assertEqual(exp_set, d[\"completed\"])\n        s = self.summary()\n        self.assertIn(\"Bar(time=1998-03-23T0150...1998-03-23T0649)\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_with_ranges_one_param(self):\n\n        class Bar(RunOnceTask):\n            num = luigi.IntParameter()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(11):\n                    yield Bar(i)\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        exp_set = {Bar(i) for i in range(11)}\n        exp_set.add(Foo())\n        self.assertEqual(exp_set, d[\"completed\"])\n        s = self.summary()\n        self.assertIn(\"Bar(num=0...10)\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_with_ranges_multiple_params(self):\n\n        class Bar(RunOnceTask):\n            num1 = luigi.IntParameter()\n            num2 = luigi.IntParameter()\n  
          num3 = luigi.IntParameter()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(5):\n                    yield Bar(5, i, 25)\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        exp_set = {Bar(5, i, 25) for i in range(5)}\n        exp_set.add(Foo())\n        self.assertEqual(exp_set, d[\"completed\"])\n        s = self.summary()\n        self.assertIn(\"- 5 Bar(num1=5, num2=0...4, num3=25)\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_with_two_tasks(self):\n\n        class Bar(RunOnceTask):\n            num = luigi.IntParameter()\n            num2 = luigi.IntParameter()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(2):\n                    yield Bar(i, 2 * i)\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        self.assertEqual({Foo(), Bar(num=0, num2=0), Bar(num=1, num2=2)}, d[\"completed\"])\n\n        summary = self.summary()\n        result = summary.split(\"\\n\")\n        expected = [\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n            \"Scheduled 3 tasks of which:\",\n            \"* 3 ran successfully:\",\n            \"    - 2 Bar(num=0, num2=0) and Bar(num=1, num2=2)\",\n            \"    - 1 Foo()\",\n            \"\",\n            \"This progress looks :) because there were no failed tasks or missing dependencies\",\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n        ]\n\n        self.assertEqual(len(result), len(expected))\n        for i, line in enumerate(result):\n            self.assertEqual(line, expected[i])\n\n    def test_really_long_param_name(self):\n\n        class Bar(RunOnceTask):\n            This_is_a_really_long_parameter_that_we_should_not_print_out_because_people_will_get_annoyed = luigi.IntParameter()\n\n        class Foo(luigi.Task):\n            def 
requires(self):\n                yield Bar(0)\n\n        self.run_task(Foo())\n        s = self.summary()\n        self.assertIn(\"Bar(...)\", s)\n        self.assertNotIn(\"Did not run any tasks\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_multiple_params_multiple_same_task_family(self):\n\n        class Bar(RunOnceTask):\n            num = luigi.IntParameter()\n            num2 = luigi.IntParameter()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(4):\n                    yield Bar(i, 2 * i)\n\n        self.run_task(Foo())\n        summary = self.summary()\n\n        result = summary.split(\"\\n\")\n        expected = [\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n            \"Scheduled 5 tasks of which:\",\n            \"* 5 ran successfully:\",\n            \"    - 4 Bar(num=0, num2=0) ...\",\n            \"    - 1 Foo()\",\n            \"\",\n            \"This progress looks :) because there were no failed tasks or missing dependencies\",\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n        ]\n\n        self.assertEqual(len(result), len(expected))\n        for i, line in enumerate(result):\n            self.assertEqual(line, expected[i])\n\n    def test_happy_smiley_face_normal(self):\n\n        class Bar(RunOnceTask):\n            num = luigi.IntParameter()\n            num2 = luigi.IntParameter()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(4):\n                    yield Bar(i, 2 * i)\n\n        self.run_task(Foo())\n        s = self.summary()\n        self.assertIn(\"\\nThis progress looks :) because there were no failed tasks or missing dependencies\", s)\n        self.assertNotIn(\"Did not run any tasks\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_happy_smiley_face_other_workers(self):\n        lock1 = 
threading.Lock()\n        lock2 = threading.Lock()\n\n        class ParentTask(RunOnceTask):\n            def requires(self):\n                yield LockTask()\n\n        class LockTask(RunOnceTask):\n            def run(self):\n                lock2.release()\n                lock1.acquire()\n                self.comp = True\n\n        lock1.acquire()\n        lock2.acquire()\n        other_worker = luigi.worker.Worker(scheduler=self.scheduler, worker_id=\"other_worker\")\n        other_worker.add(ParentTask())\n        t1 = threading.Thread(target=other_worker.run)\n        t1.start()\n        lock2.acquire()\n        self.run_task(ParentTask())\n        lock1.release()\n        t1.join()\n        s = self.summary()\n        self.assertIn(\"\\nThis progress looks :) because there were no failed tasks or missing dependencies\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_sad_smiley_face(self):\n\n        class ExternalBar(luigi.ExternalTask):\n            def complete(self):\n                return False\n\n        class Bar(luigi.Task):\n            num = luigi.IntParameter()\n\n            def run(self):\n                if self.num == 0:\n                    raise ValueError()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(5):\n                    yield Bar(i)\n                yield ExternalBar()\n\n        self.run_task(Foo())\n        s = self.summary()\n        self.assertIn(\"\\nThis progress looks :( because there were failed tasks\", s)\n        self.assertNotIn(\"Did not run any tasks\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_neutral_smiley_face(self):\n\n        class ExternalBar(luigi.ExternalTask):\n            def complete(self):\n                return False\n\n        class Foo(luigi.Task):\n            def requires(self):\n                yield ExternalBar()\n\n        self.run_task(Foo())\n        s = self.summary()\n        self.assertIn(\"\\nThis 
progress looks :| because there were missing external dependencies\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_did_not_run_any_tasks(self):\n\n        class ExternalBar(luigi.ExternalTask):\n            num = luigi.IntParameter()\n\n            def complete(self):\n                if self.num == 5:\n                    return True\n                return False\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(10):\n                    yield ExternalBar(i)\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        self.assertEqual({ExternalBar(5)}, d[\"already_done\"])\n        self.assertEqual({ExternalBar(i) for i in range(10) if i != 5}, d[\"still_pending_ext\"])\n        self.assertEqual({Foo()}, d[\"upstream_missing_dependency\"])\n        s = self.summary()\n        self.assertIn(\"\\n\\nDid not run any tasks\\nThis progress looks :| because there were missing external dependencies\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_example(self):\n\n        class MyExternal(luigi.ExternalTask):\n            def complete(self):\n                return False\n\n        class Boom(luigi.Task):\n            this_is_a_really_long_I_mean_way_too_long_and_annoying_parameter = luigi.IntParameter()\n\n            def requires(self):\n                for i in range(5, 200):\n                    yield Bar(i)\n\n        class Foo(luigi.Task):\n            num = luigi.IntParameter()\n            num2 = luigi.IntParameter()\n\n            def requires(self):\n                yield MyExternal()\n                yield Boom(0)\n\n        class Bar(luigi.Task):\n            num = luigi.IntParameter()\n\n            def complete(self):\n                return True\n\n        class DateTask(luigi.Task):\n            date = luigi.DateParameter()\n            num = luigi.IntParameter()\n\n            def requires(self):\n                yield MyExternal()\n                yield 
Boom(0)\n\n        class EntryPoint(luigi.Task):\n            def requires(self):\n                for i in range(10):\n                    yield Foo(100, 2 * i)\n                for i in range(10):\n                    yield DateTask(datetime.date(1998, 3, 23) + datetime.timedelta(days=i), 5)\n\n        self.run_task(EntryPoint())\n        summary = self.summary()\n\n        expected = [\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n            \"Scheduled 218 tasks of which:\",\n            \"* 195 complete ones were encountered:\",\n            \"    - 195 Bar(num=5...199)\",\n            \"* 1 ran successfully:\",\n            \"    - 1 Boom(...)\",\n            \"* 22 were left pending, among these:\",\n            \"    * 1 were missing external dependencies:\",\n            \"        - 1 MyExternal()\",\n            \"    * 21 had missing dependencies:\",\n            \"        - 10 DateTask(date=1998-03-23...1998-04-01, num=5)\",\n            \"        - 1 EntryPoint()\",\n            \"        - 10 Foo(num=100, num2=0) ...\",\n            \"\",\n            \"This progress looks :| because there were missing external dependencies\",\n            \"\",\n            \"===== Luigi Execution Summary =====\",\n            \"\",\n        ]\n        result = summary.split(\"\\n\")\n\n        self.assertEqual(len(result), len(expected))\n        for i, line in enumerate(result):\n            self.assertEqual(line, expected[i])\n\n    def test_with_datehours(self):\n        \"\"\"Just test that it doesn't crash with datehour params\"\"\"\n\n        start = datetime.datetime(1998, 3, 23, 5)\n\n        class Bar(RunOnceTask):\n            datehour = luigi.DateHourParameter()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(10):\n                    new_date = start + datetime.timedelta(hours=i)\n                    yield Bar(datehour=new_date)\n\n        
self.run_task(Foo())\n        d = self.summary_dict()\n        exp_set = {Bar(start + datetime.timedelta(hours=i)) for i in range(10)}\n        exp_set.add(Foo())\n        self.assertEqual(exp_set, d[\"completed\"])\n        s = self.summary()\n        self.assertIn(\"datehour=1998-03-23T0\", s)\n        self.assertIn(\"Scheduled 11 tasks\", s)\n        self.assertIn(\"Luigi Execution Summary\", s)\n        self.assertNotIn(\"00:00:00\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_with_months(self):\n        \"\"\"Just test that it doesn't crash with month params\"\"\"\n\n        start = datetime.datetime(1998, 3, 23)\n\n        class Bar(RunOnceTask):\n            month = luigi.MonthParameter()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(3):\n                    new_date = start + datetime.timedelta(days=30 * i)\n                    yield Bar(month=new_date)\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        exp_set = {Bar(start + datetime.timedelta(days=30 * i)) for i in range(3)}\n        exp_set.add(Foo())\n        self.assertEqual(exp_set, d[\"completed\"])\n        s = self.summary()\n        self.assertIn(\"month=1998-0\", s)\n        self.assertIn(\"Scheduled 4 tasks\", s)\n        self.assertIn(\"Luigi Execution Summary\", s)\n        self.assertNotIn(\"00:00:00\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_multiple_dash_dash_workers(self):\n        \"\"\"\n        Don't print own worker with ``--workers 2`` setting.\n        \"\"\"\n        self.worker = luigi.worker.Worker(scheduler=self.scheduler, worker_processes=2)\n\n        class Foo(RunOnceTask):\n            pass\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        self.assertEqual(set(), d[\"run_by_other_worker\"])\n        s = self.summary()\n        self.assertNotIn(\"The other workers were\", s)\n        self.assertIn(\"This progress looks :) because there 
were no failed \", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_with_uncomparable_parameters(self):\n        \"\"\"\n        Don't rely on parameters being sortable\n        \"\"\"\n\n        class Color(Enum):\n            red = 1\n            yellow = 2\n\n        class Bar(RunOnceTask):\n            eparam = luigi.EnumParameter(enum=Color)\n\n        class Baz(RunOnceTask):\n            eparam = luigi.EnumParameter(enum=Color)\n            another_param = luigi.IntParameter()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                yield Bar(Color.red)\n                yield Bar(Color.yellow)\n                yield Baz(Color.red, 5)\n                yield Baz(Color.yellow, 5)\n\n        self.run_task(Foo())\n        s = self.summary()\n        self.assertIn(\"yellow\", s)\n\n    def test_with_dict_dependency(self):\n        \"\"\"Just test that it doesn't crash with dict params in dependencies\"\"\"\n\n        args = dict(start=datetime.date(1998, 3, 23), num=3)\n\n        class Bar(RunOnceTask):\n            args = luigi.DictParameter()\n\n        class Foo(luigi.Task):\n            def requires(self):\n                for i in range(10):\n                    new_dict = args.copy()\n                    new_dict[\"start\"] = str(new_dict[\"start\"] + datetime.timedelta(days=i))\n                    yield Bar(args=new_dict)\n\n        self.run_task(Foo())\n        d = self.summary_dict()\n        exp_set = set()\n        for i in range(10):\n            new_dict = args.copy()\n            new_dict[\"start\"] = str(new_dict[\"start\"] + datetime.timedelta(days=i))\n            exp_set.add(Bar(new_dict))\n        exp_set.add(Foo())\n        self.assertEqual(exp_set, d[\"completed\"])\n        s = self.summary()\n        self.assertIn('\"num\": 3', s)\n        self.assertIn('\"start\": \"1998-0', s)\n        self.assertIn(\"Scheduled 11 tasks\", s)\n        self.assertIn(\"Luigi Execution Summary\", s)\n        
self.assertNotIn(\"00:00:00\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    def test_with_dict_argument(self):\n        \"\"\"Just test that it doesn't crash with dict params\"\"\"\n\n        args = dict(start=str(datetime.date(1998, 3, 23)), num=3)\n\n        class Bar(RunOnceTask):\n            args = luigi.DictParameter()\n\n        self.run_task(Bar(args=args))\n        d = self.summary_dict()\n        exp_set = set()\n        exp_set.add(Bar(args=args))\n        self.assertEqual(exp_set, d[\"completed\"])\n        s = self.summary()\n        self.assertIn('\"num\": 3', s)\n        self.assertIn('\"start\": \"1998-0', s)\n        self.assertIn(\"Scheduled 1 task\", s)\n        self.assertIn(\"Luigi Execution Summary\", s)\n        self.assertNotIn(\"00:00:00\", s)\n        self.assertNotIn(\"\\n\\n\\n\", s)\n\n    \"\"\"\n    Test that a task once crashing and then succeeding should be counted as no failure.\n    \"\"\"\n\n    def test_status_with_task_retry(self):\n        class Foo(luigi.Task):\n            run_count = 0\n\n            def run(self):\n                self.run_count += 1\n                if self.run_count == 1:\n                    raise ValueError()\n\n            def complete(self):\n                return self.run_count > 0\n\n        self.run_task(Foo())\n        self.run_task(Foo())\n        d = self.summary_dict()\n        self.assertEqual({Foo()}, d[\"completed\"])\n        self.assertEqual({Foo()}, d[\"ever_failed\"])\n        self.assertFalse(d[\"failed\"])\n        self.assertFalse(d[\"upstream_failure\"])\n        self.assertFalse(d[\"upstream_missing_dependency\"])\n        self.assertFalse(d[\"run_by_other_worker\"])\n        self.assertFalse(d[\"still_pending_ext\"])\n        s = self.summary()\n        self.assertIn(\"Scheduled 1 task\", s)\n        self.assertIn(\"Luigi Execution Summary\", s)\n        self.assertNotIn(\"ever failed\", s)\n        self.assertIn(\"\\n\\nThis progress looks :) because there were failed 
tasks but they all succeeded in a retry\", s)\n"
  },
  {
    "path": "test/factorial_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import unittest\n\nimport luigi\n\n\nclass Factorial(luigi.Task):\n    \"\"\"This calculates factorials *online* and does not write its results anywhere\n\n    Demonstrates the ability for dependencies between Tasks and not just between their output.\n    \"\"\"\n\n    n = luigi.IntParameter(default=100)\n\n    def requires(self):\n        if self.n > 1:\n            return Factorial(self.n - 1)\n\n    def run(self):\n        if self.n > 1:\n            self.value = self.n * self.requires().value\n        else:\n            self.value = 1\n        self.complete = lambda: True\n\n    def complete(self):\n        return False\n\n\nclass FactorialTest(unittest.TestCase):\n    def test_invoke(self):\n        luigi.build([Factorial(100)], local_scheduler=True)\n        self.assertEqual(Factorial(42).value, 1405006117752879898543142606244511569936384000000000)\n"
  },
  {
    "path": "test/fib_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import unittest\n\nimport luigi\nimport luigi.interface\nfrom luigi.mock import MockTarget\n\n# Calculates Fibonacci numbers :)\n\n\nclass Fib(luigi.Task):\n    n = luigi.IntParameter(default=100)\n\n    def requires(self):\n        if self.n >= 2:\n            return [Fib(self.n - 1), Fib(self.n - 2)]\n        else:\n            return []\n\n    def output(self):\n        return MockTarget(\"/tmp/fib_%d\" % self.n)\n\n    def run(self):\n        if self.n == 0:\n            s = 0\n        elif self.n == 1:\n            s = 1\n        else:\n            s = 0\n            for input in self.input():\n                for line in input.open(\"r\"):\n                    s += int(line.strip())\n\n        f = self.output().open(\"w\")\n        f.write(\"%d\\n\" % s)\n        f.close()\n\n\nclass FibTestBase(unittest.TestCase):\n    def setUp(self):\n        MockTarget.fs.clear()\n\n\nclass FibTest(FibTestBase):\n    def test_invoke(self):\n        luigi.build([Fib(100)], local_scheduler=True)\n        self.assertEqual(MockTarget.fs.get_data(\"/tmp/fib_10\"), b\"55\\n\")\n        self.assertEqual(MockTarget.fs.get_data(\"/tmp/fib_100\"), b\"354224848179261915075\\n\")\n\n    def test_cmdline(self):\n        luigi.run([\"--local-scheduler\", \"--no-lock\", \"Fib\", \"--n\", \"100\"])\n\n        
self.assertEqual(MockTarget.fs.get_data(\"/tmp/fib_10\"), b\"55\\n\")\n        self.assertEqual(MockTarget.fs.get_data(\"/tmp/fib_100\"), b\"354224848179261915075\\n\")\n\n    def test_build_internal(self):\n        luigi.build([Fib(100)], local_scheduler=True)\n\n        self.assertEqual(MockTarget.fs.get_data(\"/tmp/fib_10\"), b\"55\\n\")\n        self.assertEqual(MockTarget.fs.get_data(\"/tmp/fib_100\"), b\"354224848179261915075\\n\")\n"
  },
  {
    "path": "test/hdfs_client_test.py",
    "content": "import itertools\nimport threading\nimport unittest\n\nfrom luigi.contrib.hdfs import get_autoconfig_client\n\n\nclass HdfsClientTest(unittest.TestCase):\n    def test_get_autoconfig_client_cached(self):\n        original_client = get_autoconfig_client()\n        for _ in range(100):\n            self.assertIs(original_client, get_autoconfig_client())\n\n    def test_threaded_clients_different(self):\n        clients = []\n\n        def add_client():\n            clients.append(get_autoconfig_client())\n\n        # run a bunch of threads to get new clients in them\n        threads = [threading.Thread(target=add_client) for _ in range(10)]\n        for thread in threads:\n            thread.start()\n        for thread in threads:\n            thread.join()\n\n        for client1, client2 in itertools.combinations(clients, 2):\n            self.assertIsNot(client1, client2)\n"
  },
  {
    "path": "test/helpers.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport functools\nimport itertools\nimport os\nimport re\nimport tempfile\nimport unittest\nfrom contextlib import contextmanager\n\nimport luigi\nimport luigi.cmdline_parser\nimport luigi.task_register\nfrom luigi.cmdline_parser import CmdlineParser\n\n\ndef skipOnTravisAndGithubActions(reason):\n    if _override_skip_CI_tests():\n        # Do not skip the CI tests\n        return unittest.skipIf(False, \"\")\n    # run the skip CI tests logic\n    return unittest.skipIf(_running_on_travis() or _running_on_github_actions(), reason)\n\n\ndef skipOnGithubActions(reason):\n    return unittest.skipIf(_running_on_github_actions(), reason)\n\n\ndef _running_on_travis():\n    return os.getenv(\"TRAVIS\") == \"true\"\n\n\ndef _running_on_github_actions():\n    return os.getenv(\"GITHUB_ACTIONS\") == \"true\"\n\n\ndef _override_skip_CI_tests():\n    return os.getenv(\"OVERRIDE_SKIP_CI_TESTS\") == \"true\"\n\n\nclass with_config:\n    \"\"\"\n    Decorator to override config settings for the length of a function.\n\n    Usage:\n\n    .. code-block: python\n\n        >>> import luigi.configuration\n        >>> @with_config({'foo': {'bar': 'baz'}})\n        ... def my_test():\n        ...     
print(luigi.configuration.get_config().get(\"foo\", \"bar\"))\n        ...\n        >>> my_test()\n        baz\n        >>> @with_config({'hoo': {'bar': 'buz'}})\n        ... @with_config({'foo': {'bar': 'baz'}})\n        ... def my_test():\n        ...     print(luigi.configuration.get_config().get(\"foo\", \"bar\"))\n        ...     print(luigi.configuration.get_config().get(\"hoo\", \"bar\"))\n        ...\n        >>> my_test()\n        baz\n        buz\n        >>> @with_config({'foo': {'bar': 'buz'}})\n        ... @with_config({'foo': {'bar': 'baz'}})\n        ... def my_test():\n        ...     print(luigi.configuration.get_config().get(\"foo\", \"bar\"))\n        ...\n        >>> my_test()\n        baz\n        >>> @with_config({'foo': {'bur': 'buz'}})\n        ... @with_config({'foo': {'bar': 'baz'}})\n        ... def my_test():\n        ...     print(luigi.configuration.get_config().get(\"foo\", \"bar\"))\n        ...     print(luigi.configuration.get_config().get(\"foo\", \"bur\"))\n        ...\n        >>> my_test()\n        baz\n        buz\n        >>> @with_config({'foo': {'bur': 'buz'}})\n        ... @with_config({'foo': {'bar': 'baz'}}, replace_sections=True)\n        ... def my_test():\n        ...     print(luigi.configuration.get_config().get(\"foo\", \"bar\"))\n        ...     
print(luigi.configuration.get_config().get(\"foo\", \"bur\", \"no_bur\"))\n        ...\n        >>> my_test()\n        baz\n        no_bur\n\n    \"\"\"\n\n    def __init__(self, config, replace_sections=False):\n        self.config = config\n        self.replace_sections = replace_sections\n\n    def _make_dict(self, old_dict):\n        if self.replace_sections:\n            old_dict.update(self.config)\n            return old_dict\n\n        def get_section(sec):\n            old_sec = old_dict.get(sec, {})\n            new_sec = self.config.get(sec, {})\n            old_sec.update(new_sec)\n            return old_sec\n\n        all_sections = itertools.chain(old_dict.keys(), self.config.keys())\n        return {sec: get_section(sec) for sec in all_sections}\n\n    def __call__(self, fun):\n        @functools.wraps(fun)\n        def wrapper(*args, **kwargs):\n            import luigi.configuration\n\n            orig_conf = luigi.configuration.LuigiConfigParser.instance()\n            new_conf = luigi.configuration.LuigiConfigParser()\n            luigi.configuration.LuigiConfigParser._instance = new_conf\n            orig_dict = {k: dict(orig_conf.items(k)) for k in orig_conf.sections()}\n            new_dict = self._make_dict(orig_dict)\n            for section, settings in new_dict.items():\n                new_conf.add_section(section)\n                for name, value in settings.items():\n                    new_conf.set(section, name, value)\n            try:\n                return fun(*args, **kwargs)\n            finally:\n                luigi.configuration.LuigiConfigParser._instance = orig_conf\n\n        return wrapper\n\n\nclass RunOnceTask(luigi.Task):\n    def __init__(self, *args, **kwargs):\n        super(RunOnceTask, self).__init__(*args, **kwargs)\n        self.comp = False\n\n    def complete(self):\n        return self.comp\n\n    def run(self):\n        self.comp = True\n\n\n# string subclass that matches arguments containing the specified 
substring\n# for use in mock 'called_with' assertions\nclass StringContaining(str):\n    def __eq__(self, other_str):\n        return self in other_str\n\n\nclass LuigiTestCase(unittest.TestCase):\n    \"\"\"\n    Tasks registered within a test case will get unregistered in a finalizer\n\n    Instance caches are cleared before and after all runs\n    \"\"\"\n\n    def setUp(self):\n        super(LuigiTestCase, self).setUp()\n        self._stashed_reg = luigi.task_register.Register._get_reg()\n        luigi.task_register.Register.clear_instance_cache()\n\n    def tearDown(self):\n        luigi.task_register.Register._set_reg(self._stashed_reg)\n        super(LuigiTestCase, self).tearDown()\n        luigi.task_register.Register.clear_instance_cache()\n\n    def run_locally(self, args):\n        \"\"\"Helper for running tests testing more of the stack, the command\n        line parsing and task from name instantiation parts in particular.\"\"\"\n        temp = CmdlineParser._instance\n        try:\n            CmdlineParser._instance = None\n            run_exit_status = luigi.run([\"--local-scheduler\", \"--no-lock\"] + args)\n        finally:\n            CmdlineParser._instance = temp\n        return run_exit_status\n\n    def run_locally_split(self, space_seperated_args):\n        \"\"\"Helper for running tests testing more of the stack, the command\n        line parsing and task from name instantiation parts in particular.\"\"\"\n        return self.run_locally(space_seperated_args.split(\" \"))\n\n\nclass parsing:\n    \"\"\"\n    Convenient decorator for test cases to set the parsing environment.\n    \"\"\"\n\n    def __init__(self, cmds):\n        self.cmds = cmds\n\n    def __call__(self, fun):\n        @functools.wraps(fun)\n        def wrapper(*args, **kwargs):\n            with CmdlineParser.global_instance(self.cmds, allow_override=True):\n                return fun(*args, **kwargs)\n\n        return wrapper\n\n\ndef in_parse(cmds, 
deferred_computation):\n    with CmdlineParser.global_instance(cmds) as cp:\n        deferred_computation(cp.get_task_obj())\n\n\n@contextmanager\ndef temporary_unloaded_module(python_file_contents):\n    \"\"\"Create an importable module\n\n    Return the name of importable module name given its file contents (source\n    code)\"\"\"\n    with tempfile.NamedTemporaryFile(dir=\"test/\", prefix=\"_test_time_generated_module\", suffix=\".py\") as temp_module_file:\n        temp_module_file.file.write(python_file_contents)\n        temp_module_file.file.flush()\n        temp_module_path = temp_module_file.name\n        temp_module_name = re.search(r\"/(_test_time_generated_module.*).py\", temp_module_path).group(1)\n        yield temp_module_name\n"
  },
  {
    "path": "test/helpers_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2016 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nfrom helpers import LuigiTestCase, RunOnceTask\n\nimport luigi\nimport luigi.date_interval\nimport luigi.interface\nimport luigi.notifications\n\n\nclass LuigiTestCaseTest(LuigiTestCase):\n    def test_1(self):\n        class MyClass(luigi.Task):\n            pass\n\n        self.assertTrue(self.run_locally([\"MyClass\"]))\n\n    def test_2(self):\n        class MyClass(luigi.Task):\n            pass\n\n        self.assertTrue(self.run_locally([\"MyClass\"]))\n\n\nclass RunOnceTaskTest(LuigiTestCase):\n    def test_complete_behavior(self):\n        \"\"\"\n        Verify that RunOnceTask works as expected.\n\n        This task will fail if it is a normal ``luigi.Task``, because\n        RequiringTask will not run (missing dependency at runtime).\n        \"\"\"\n\n        class MyTask(RunOnceTask):\n            pass\n\n        class RequiringTask(luigi.Task):\n            counter = 0\n\n            def requires(self):\n                yield MyTask()\n\n            def run(self):\n                RequiringTask.counter += 1\n\n        self.run_locally([\"RequiringTask\"])\n        self.assertEqual(1, RequiringTask.counter)\n"
  },
  {
    "path": "test/import_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\n\nfrom helpers import unittest\n\n\nclass ImportTest(unittest.TestCase):\n    def import_test(self):\n        \"\"\"Test that all module can be imported\"\"\"\n\n        luigidir = os.path.join(os.path.dirname(os.path.abspath(__file__)), \"..\")\n\n        packagedir = os.path.join(luigidir, \"luigi\")\n\n        for root, subdirs, files in os.walk(packagedir):\n            package = os.path.relpath(root, luigidir).replace(\"/\", \".\")\n\n            if \"__init__.py\" in files:\n                __import__(package)\n\n            for f in files:\n                if f.endswith(\".py\") and not f.startswith(\"_\"):\n                    __import__(package + \".\" + f[:-3])\n\n    def import_luigi_test(self):\n        \"\"\"\n        Test that the top luigi package can be imported and contains the usual suspects.\n        \"\"\"\n        import luigi\n\n        # These should exist (if not, this will cause AttributeErrors)\n        expected = [\n            luigi.Event,\n            luigi.Config,\n            luigi.Task,\n            luigi.ExternalTask,\n            luigi.WrapperTask,\n            luigi.Target,\n            luigi.LocalTarget,\n            luigi.namespace,\n            luigi.RemoteScheduler,\n            luigi.RPCError,\n            luigi.run,\n            luigi.build,\n            luigi.Parameter,\n            
luigi.DateHourParameter,\n            luigi.DateMinuteParameter,\n            luigi.DateSecondParameter,\n            luigi.DateParameter,\n            luigi.MonthParameter,\n            luigi.YearParameter,\n            luigi.DateIntervalParameter,\n            luigi.TimeDeltaParameter,\n            luigi.IntParameter,\n            luigi.FloatParameter,\n            luigi.BoolParameter,\n        ]\n        self.assertGreater(len(expected), 0)\n"
  },
  {
    "path": "test/instance_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import unittest\n\nimport luigi\nimport luigi.date_interval\nimport luigi.notifications\nimport luigi.worker\n\nluigi.notifications.DEBUG = True\n\n\nclass InstanceTest(unittest.TestCase):\n    def test_simple(self):\n        class DummyTask(luigi.Task):\n            x = luigi.Parameter()\n\n        dummy_1 = DummyTask(1)\n        dummy_2 = DummyTask(2)\n        dummy_1b = DummyTask(1)\n\n        self.assertNotEqual(dummy_1, dummy_2)\n        self.assertEqual(dummy_1, dummy_1b)\n\n    def test_dep(self):\n        test = self\n\n        class A(luigi.Task):\n            task_namespace = \"instance\"  # to prevent task name conflict between tests\n\n            def __init__(self):\n                self.has_run = False\n                super(A, self).__init__()\n\n            def run(self):\n                self.has_run = True\n\n        class B(luigi.Task):\n            x = luigi.Parameter()\n\n            def requires(self):\n                return A()  # This will end up referring to the same object\n\n            def run(self):\n                test.assertTrue(self.requires().has_run)\n\n        luigi.build([B(1), B(2)], local_scheduler=True)\n\n    def test_external_instance_cache(self):\n        class A(luigi.Task):\n            task_namespace = \"instance\"  # to prevent task name conflict between tests\n            
pass\n\n        class OtherA(luigi.ExternalTask):\n            task_family = \"A\"\n\n        oa = OtherA()\n        a = A()\n        self.assertNotEqual(oa, a)\n\n    def test_date(self):\n        \"\"\"Adding unit test because we had a problem with this\"\"\"\n\n        class DummyTask(luigi.Task):\n            x = luigi.DateIntervalParameter()\n\n        dummy_1 = DummyTask(luigi.date_interval.Year(2012))\n        dummy_2 = DummyTask(luigi.date_interval.Year(2013))\n        dummy_1b = DummyTask(luigi.date_interval.Year(2012))\n\n        self.assertNotEqual(dummy_1, dummy_2)\n        self.assertEqual(dummy_1, dummy_1b)\n\n    def test_unhashable_type(self):\n        # See #857\n        class DummyTask(luigi.Task):\n            x = luigi.Parameter()\n\n        dummy = DummyTask(x={})  # NOQA\n"
  },
  {
    "path": "test/instance_wrap_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport datetime\nimport decimal\n\nfrom helpers import unittest\n\nimport luigi\nimport luigi.notifications\nfrom luigi.mock import MockTarget\n\nluigi.notifications.DEBUG = True\n\n\nclass Report(luigi.Task):\n    date = luigi.DateParameter()\n\n    def run(self):\n        f = self.output().open(\"w\")\n        f.write(\"10.0 USD\\n\")\n        f.write(\"4.0 EUR\\n\")\n        f.write(\"3.0 USD\\n\")\n        f.close()\n\n    def output(self):\n        return MockTarget(self.date.strftime(\"/tmp/report-%Y-%m-%d\"))\n\n\nclass ReportReader(luigi.Task):\n    date = luigi.DateParameter()\n\n    def requires(self):\n        return Report(self.date)\n\n    def run(self):\n        self.lines = list(self.input().open(\"r\").readlines())\n\n    def get_line(self, line):\n        amount, currency = self.lines[line].strip().split()\n        return decimal.Decimal(amount), currency\n\n    def complete(self):\n        return False\n\n\nclass CurrencyExchanger(luigi.Task):\n    task = luigi.Parameter()\n    currency_to = luigi.Parameter()\n\n    exchange_rates = {(\"USD\", \"USD\"): decimal.Decimal(1), (\"EUR\", \"USD\"): decimal.Decimal(\"1.25\")}\n\n    def requires(self):\n        return self.task  # Note that you still need to state this explicitly\n\n    def get_line(self, line):\n        amount, currency_from = self.task.get_line(line)\n      
  return amount * self.exchange_rates[(currency_from, self.currency_to)], self.currency_to\n\n    def complete(self):\n        return False\n\n\nclass InstanceWrapperTest(unittest.TestCase):\n    \"\"\"This test illustrates that tasks can have tasks as parameters\n\n    This is a more complicated variant of factorial_test.py which is an example of\n    tasks communicating directly with other tasks. In this case, a task takes another\n    task as a parameter and wraps it.\n\n    Also see wrap_test.py for an example of a task class wrapping another task class.\n\n    Not the most useful pattern, but there's actually been a few cases where it was\n    pretty handy to be able to do that. I'm adding it as a unit test to make sure that\n    new code doesn't break the expected behavior.\n    \"\"\"\n\n    def test(self):\n        d = datetime.date(2012, 1, 1)\n        r = ReportReader(d)\n        ex = CurrencyExchanger(r, \"USD\")\n\n        luigi.build([ex], local_scheduler=True)\n        self.assertEqual(ex.get_line(0), (decimal.Decimal(\"10.0\"), \"USD\"))\n        self.assertEqual(ex.get_line(1), (decimal.Decimal(\"5.0\"), \"USD\"))\n"
  },
  {
    "path": "test/interface_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport sys\n\nfrom helpers import LuigiTestCase, with_config\nfrom mock import MagicMock, Mock, patch\n\nimport luigi\nimport luigi.date_interval\nimport luigi.notifications\nfrom luigi.execution_summary import LuigiStatusCode\nfrom luigi.interface import _WorkerSchedulerFactory, core\nfrom luigi.worker import Worker\n\nluigi.notifications.DEBUG = True\n\n\nclass InterfaceTest(LuigiTestCase):\n    def setUp(self):\n        self.worker = Worker()\n\n        self.worker_scheduler_factory = _WorkerSchedulerFactory()\n        self.worker_scheduler_factory.create_worker = Mock(return_value=self.worker)\n        self.worker_scheduler_factory.create_local_scheduler = Mock()\n        super(InterfaceTest, self).setUp()\n\n        class NoOpTask(luigi.Task):\n            param = luigi.Parameter()\n\n        self.task_a = NoOpTask(\"a\")\n        self.task_b = NoOpTask(\"b\")\n\n    def _create_summary_dict_with(self, updates={}):\n        summary_dict = {\n            \"completed\": set(),\n            \"already_done\": set(),\n            \"ever_failed\": set(),\n            \"failed\": set(),\n            \"scheduling_error\": set(),\n            \"still_pending_ext\": set(),\n            \"still_pending_not_ext\": set(),\n            \"run_by_other_worker\": set(),\n            \"upstream_failure\": set(),\n            
\"upstream_missing_dependency\": set(),\n            \"upstream_run_by_other_worker\": set(),\n            \"upstream_scheduling_error\": set(),\n            \"not_run\": set(),\n        }\n        summary_dict.update(updates)\n        return summary_dict\n\n    def _summary_dict_module_path():\n        return \"luigi.execution_summary._summary_dict\"\n\n    def test_interface_run_positive_path(self):\n        self.worker.add = Mock(side_effect=[True, True])\n        self.worker.run = Mock(return_value=True)\n        self.assertTrue(self._run_interface())\n\n    def test_interface_run_positive_path_with_detailed_summary_enabled(self):\n        self.worker.add = Mock(side_effect=[True, True])\n        self.worker.run = Mock(return_value=True)\n        self.assertTrue(self._run_interface(detailed_summary=True).scheduling_succeeded)\n\n    def test_interface_run_with_add_failure(self):\n        self.worker.add = Mock(side_effect=[True, False])\n        self.worker.run = Mock(return_value=True)\n        self.assertFalse(self._run_interface())\n\n    def test_interface_run_with_add_failure_with_detailed_summary_enabled(self):\n        self.worker.add = Mock(side_effect=[True, False])\n        self.worker.run = Mock(return_value=True)\n        self.assertFalse(self._run_interface(detailed_summary=True).scheduling_succeeded)\n\n    def test_interface_run_with_run_failure(self):\n        self.worker.add = Mock(side_effect=[True, True])\n        self.worker.run = Mock(return_value=False)\n        self.assertFalse(self._run_interface())\n\n    def test_interface_run_with_run_failure_with_detailed_summary_enabled(self):\n        self.worker.add = Mock(side_effect=[True, True])\n        self.worker.run = Mock(return_value=False)\n        self.assertFalse(self._run_interface(detailed_summary=True).scheduling_succeeded)\n\n    @patch(_summary_dict_module_path())\n    def test_that_status_is_success(self, fake_summary_dict):\n        # Nothing in failed tasks so, should succeed\n 
       fake_summary_dict.return_value = self._create_summary_dict_with()\n        luigi_run_result = self._run_interface(detailed_summary=True)\n        self.assertEqual(luigi_run_result.status, LuigiStatusCode.SUCCESS)\n\n    @patch(_summary_dict_module_path())\n    def test_that_status_is_success_with_retry(self, fake_summary_dict):\n        # Nothing in failed tasks (only an entry in ever_failed) so, should succeed with retry\n        fake_summary_dict.return_value = self._create_summary_dict_with({\"ever_failed\": [self.task_a]})\n        luigi_run_result = self._run_interface(detailed_summary=True)\n        self.assertEqual(luigi_run_result.status, LuigiStatusCode.SUCCESS_WITH_RETRY)\n\n    @patch(_summary_dict_module_path())\n    def test_that_status_is_failed_when_there_is_one_failed_task(self, fake_summary_dict):\n        # Should fail because a task failed\n        fake_summary_dict.return_value = self._create_summary_dict_with({\"ever_failed\": [self.task_a], \"failed\": [self.task_a]})\n        luigi_run_result = self._run_interface(detailed_summary=True)\n        self.assertEqual(luigi_run_result.status, LuigiStatusCode.FAILED)\n\n    @patch(_summary_dict_module_path())\n    def test_that_status_is_failed_with_scheduling_failure(self, fake_summary_dict):\n        # Failed task and also a scheduling error\n        fake_summary_dict.return_value = self._create_summary_dict_with(\n            {\"ever_failed\": [self.task_a], \"failed\": [self.task_a], \"scheduling_error\": [self.task_b]}\n        )\n        luigi_run_result = self._run_interface(detailed_summary=True)\n        self.assertEqual(luigi_run_result.status, LuigiStatusCode.FAILED_AND_SCHEDULING_FAILED)\n\n    @patch(_summary_dict_module_path())\n    def test_that_status_is_scheduling_failed_with_one_scheduling_error(self, fake_summary_dict):\n        # Scheduling error for at least one task\n        fake_summary_dict.return_value = self._create_summary_dict_with({\"scheduling_error\": 
[self.task_b]})\n        luigi_run_result = self._run_interface(detailed_summary=True)\n        self.assertEqual(luigi_run_result.status, LuigiStatusCode.SCHEDULING_FAILED)\n\n    @patch(_summary_dict_module_path())\n    def test_that_status_is_not_run_with_one_task_not_run(self, fake_summary_dict):\n        # At least one of the tasks was not run\n        fake_summary_dict.return_value = self._create_summary_dict_with({\"not_run\": [self.task_a]})\n        luigi_run_result = self._run_interface(detailed_summary=True)\n        self.assertEqual(luigi_run_result.status, LuigiStatusCode.NOT_RUN)\n\n    @patch(_summary_dict_module_path())\n    def test_that_status_is_missing_ext_with_one_task_with_missing_external_dependency(self, fake_summary_dict):\n        # Missing external dependency for at least one task\n        fake_summary_dict.return_value = self._create_summary_dict_with({\"still_pending_ext\": [self.task_a]})\n        luigi_run_result = self._run_interface(detailed_summary=True)\n        self.assertEqual(luigi_run_result.status, LuigiStatusCode.MISSING_EXT)\n\n    def test_stops_worker_on_add_exception(self):\n        worker = MagicMock()\n        self.worker_scheduler_factory.create_worker = Mock(return_value=worker)\n        worker.add = Mock(side_effect=AttributeError)\n\n        self.assertRaises(AttributeError, self._run_interface)\n        self.assertTrue(worker.__exit__.called)\n\n    def test_stops_worker_on_run_exception(self):\n        worker = MagicMock()\n        self.worker_scheduler_factory.create_worker = Mock(return_value=worker)\n        worker.add = Mock(side_effect=[True, True])\n        worker.run = Mock(side_effect=AttributeError)\n\n        self.assertRaises(AttributeError, self._run_interface)\n        self.assertTrue(worker.__exit__.called)\n\n    def test_just_run_main_task_cls(self):\n        class MyTestTask(luigi.Task):\n            pass\n\n        class MyOtherTestTask(luigi.Task):\n            my_param = luigi.Parameter()\n\n   
     with patch.object(sys, \"argv\", [\"my_module.py\", \"--no-lock\", \"--local-scheduler\"]):\n            luigi.run(main_task_cls=MyTestTask)\n\n        with patch.object(sys, \"argv\", [\"my_module.py\", \"--no-lock\", \"--my-param\", \"my_value\", \"--local-scheduler\"]):\n            luigi.run(main_task_cls=MyOtherTestTask)\n\n    def _run_interface(self, **env_params):\n        return luigi.interface.build([self.task_a, self.task_b], worker_scheduler_factory=self.worker_scheduler_factory, **env_params)\n\n\nclass CoreConfigTest(LuigiTestCase):\n    @with_config({})\n    def test_parallel_scheduling_processes_default(self):\n        self.assertEqual(0, core().parallel_scheduling_processes)\n\n    @with_config({\"core\": {\"parallel-scheduling-processes\": \"1234\"}})\n    def test_parallel_scheduling_processes(self):\n        from luigi.interface import core\n\n        self.assertEqual(1234, core().parallel_scheduling_processes)\n"
  },
  {
    "path": "test/list_parameter_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport json\n\nimport mock\nimport pytest\nfrom helpers import in_parse, unittest\nfrom jsonschema import Draft4Validator\nfrom jsonschema.exceptions import ValidationError\n\nimport luigi\n\n\nclass ListParameterTask(luigi.Task):\n    param = luigi.ListParameter()\n\n\nclass ListParameterTest(unittest.TestCase):\n    _list = [1, \"one\", True]\n\n    def test_parse(self):\n        d = luigi.ListParameter().parse(json.dumps(ListParameterTest._list))\n        self.assertEqual(d, ListParameterTest._list)\n\n    def test_serialize(self):\n        d = luigi.ListParameter().serialize(ListParameterTest._list)\n        self.assertEqual(d, '[1, \"one\", true]')\n\n    def test_list_serialize_parse(self):\n        a = luigi.ListParameter()\n        b_list = [1, 2, 3]\n        self.assertEqual(b_list, a.parse(a.serialize(b_list)))\n\n    def test_parse_interface(self):\n        in_parse([\"ListParameterTask\", \"--param\", '[1, \"one\", true]'], lambda task: self.assertEqual(task.param, tuple(ListParameterTest._list)))\n\n    def test_serialize_task(self):\n        t = ListParameterTask(ListParameterTest._list)\n        self.assertEqual(str(t), 'ListParameterTask(param=[1, \"one\", true])')\n\n    def test_parse_invalid_input(self):\n        self.assertRaises(ValueError, lambda: luigi.ListParameter().parse('{\"invalid\"}'))\n\n    def 
test_hash_normalize(self):\n        self.assertRaises(TypeError, lambda: hash(luigi.ListParameter().parse('\"NOT A LIST\"')))\n        a = luigi.ListParameter().normalize([0])\n        b = luigi.ListParameter().normalize([0])\n        self.assertEqual(hash(a), hash(b))\n\n    def test_schema(self):\n        a = luigi.ListParameter(\n            schema={\n                \"type\": \"array\",\n                \"items\": {\n                    \"type\": \"number\",\n                    \"minimum\": 0,\n                    \"maximum\": 10,\n                },\n                \"minItems\": 1,\n            }\n        )\n\n        # Check that the default value is validated\n        with pytest.raises(ValidationError, match=r\"'INVALID_ATTRIBUTE' is not of type 'number'\"):\n            a.normalize([\"INVALID_ATTRIBUTE\"])\n\n        # Check that empty list is not valid\n        with pytest.raises(ValidationError):\n            a.normalize([])\n\n        # Check that valid lists work\n        valid_list = [1, 2, 3]\n        a.normalize(valid_list)\n\n        # Check that invalid lists raise correct errors\n        invalid_list_type = [\"NOT AN INT\"]\n        invalid_list_value = [-999, 4]\n\n        with pytest.raises(ValidationError, match=\"'NOT AN INT' is not of type 'number'\"):\n            a.normalize(invalid_list_type)\n\n        with pytest.raises(ValidationError, match=\"-999 is less than the minimum of 0\"):\n            a.normalize(invalid_list_value)\n\n        # Check that warnings are properly emitted\n        with mock.patch(\"luigi.parameter._JSONSCHEMA_ENABLED\", False):\n            with pytest.warns(\n                UserWarning, match=(\"The 'jsonschema' package is not installed so the parameter can not be validated even though a schema is given.\")\n            ):\n                luigi.ListParameter(schema={\"type\": \"array\", \"items\": {\"type\": \"number\"}})\n\n        # Test with a custom validator\n        validator = Draft4Validator(\n      
      schema={\n                \"type\": \"array\",\n                \"items\": {\n                    \"type\": \"number\",\n                    \"minimum\": 0,\n                    \"maximum\": 10,\n                },\n                \"minItems\": 1,\n            }\n        )\n        c = luigi.DictParameter(schema=validator)\n        c.normalize(valid_list)\n        with pytest.raises(\n            ValidationError,\n            match=r\"'INVALID_ATTRIBUTE' is not of type 'number'\",\n        ):\n            c.normalize([\"INVALID_ATTRIBUTE\"])\n\n        # Test with frozen data\n        frozen_data = luigi.freezing.recursively_freeze(valid_list)\n        c.normalize(frozen_data)\n"
  },
  {
    "path": "test/local_target_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport bz2\nimport gzip\nimport io\nimport itertools\nimport os\nimport random\nimport shutil\nimport sys\nfrom errno import EEXIST, EXDEV\n\nimport mock\nfrom helpers import unittest\nfrom target_test import FileSystemTargetTestMixin\n\nimport luigi.format\nfrom luigi import LocalTarget\nfrom luigi.local_target import LocalFileSystem\nfrom luigi.target import FileAlreadyExists, MissingParentDirectory\n\n\nclass LocalTargetTest(unittest.TestCase, FileSystemTargetTestMixin):\n    PATH_PREFIX = \"/tmp/test.txt\"\n\n    def setUp(self):\n        self.path = self.PATH_PREFIX + \"-\" + str(self.id())\n        self.copy = self.PATH_PREFIX + \"-copy-\" + str(self.id())\n        if os.path.exists(self.path):\n            os.remove(self.path)\n        if os.path.exists(self.copy):\n            os.remove(self.copy)\n\n    def tearDown(self):\n        if os.path.exists(self.path):\n            os.remove(self.path)\n        if os.path.exists(self.copy):\n            os.remove(self.copy)\n\n    def create_target(self, format=None):\n        return LocalTarget(self.path, format=format)\n\n    def assertCleanUp(self, tmp_path=\"\"):\n        self.assertFalse(os.path.exists(tmp_path))\n\n    def test_exists(self):\n        t = self.create_target()\n        p = t.open(\"w\")\n        self.assertEqual(t.exists(), os.path.exists(self.path))\n        
p.close()\n        self.assertEqual(t.exists(), os.path.exists(self.path))\n\n    @unittest.skipIf(tuple(sys.version_info) < (3, 4), \"only for Python>=3.4\")\n    def test_pathlib(self):\n        \"\"\"Test work with pathlib.Path\"\"\"\n        import pathlib\n\n        path = pathlib.Path(self.path)\n        self.assertFalse(path.exists())\n        target = LocalTarget(path)\n        self.assertFalse(target.exists())\n        with path.open(\"w\") as stream:\n            stream.write(\"test me\")\n        self.assertTrue(target.exists())\n\n    def test_gzip_with_module(self):\n        t = LocalTarget(self.path, luigi.format.Gzip)\n        p = t.open(\"w\")\n        test_data = b\"test\"\n        p.write(test_data)\n        print(self.path)\n        self.assertFalse(os.path.exists(self.path))\n        p.close()\n        self.assertTrue(os.path.exists(self.path))\n\n        # Using gzip module as validation\n        f = gzip.open(self.path, \"r\")\n        self.assertTrue(test_data == f.read())\n        f.close()\n\n        # Verifying our own gzip reader\n        f = LocalTarget(self.path, luigi.format.Gzip).open(\"r\")\n        self.assertTrue(test_data == f.read())\n        f.close()\n\n    def test_bzip2(self):\n        t = LocalTarget(self.path, luigi.format.Bzip2)\n        p = t.open(\"w\")\n        test_data = b\"test\"\n        p.write(test_data)\n        print(self.path)\n        self.assertFalse(os.path.exists(self.path))\n        p.close()\n        self.assertTrue(os.path.exists(self.path))\n\n        # Using bzip module as validation\n        f = bz2.BZ2File(self.path, \"r\")\n        self.assertTrue(test_data == f.read())\n        f.close()\n\n        # Verifying our own bzip2 reader\n        f = LocalTarget(self.path, luigi.format.Bzip2).open(\"r\")\n        self.assertTrue(test_data == f.read())\n        f.close()\n\n    def test_copy(self):\n        t = LocalTarget(self.path)\n        f = t.open(\"w\")\n        test_data = \"test\"\n        
f.write(test_data)\n        f.close()\n        self.assertTrue(os.path.exists(self.path))\n        self.assertFalse(os.path.exists(self.copy))\n        t.copy(self.copy)\n        self.assertTrue(os.path.exists(self.path))\n        self.assertTrue(os.path.exists(self.copy))\n        self.assertEqual(t.open(\"r\").read(), LocalTarget(self.copy).open(\"r\").read())\n\n    def test_move(self):\n        t = LocalTarget(self.path)\n        f = t.open(\"w\")\n        test_data = \"test\"\n        f.write(test_data)\n        f.close()\n        self.assertTrue(os.path.exists(self.path))\n        self.assertFalse(os.path.exists(self.copy))\n        t.move(self.copy)\n        self.assertFalse(os.path.exists(self.path))\n        self.assertTrue(os.path.exists(self.copy))\n\n    def test_move_across_filesystems(self):\n        t = LocalTarget(self.path)\n        with t.open(\"w\") as f:\n            f.write(\"test_data\")\n\n        def rename_across_filesystems(src, dst):\n            err = OSError()\n            err.errno = EXDEV\n            raise err\n\n        real_rename = os.replace\n\n        def mockreplace(src, dst):\n            if \"-across-fs\" in src:\n                real_rename(src, dst)\n            else:\n                rename_across_filesystems(src, dst)\n\n        copy = \"%s-across-fs\" % self.copy\n        with mock.patch(\"os.replace\", mockreplace):\n            t.move(copy)\n\n        self.assertFalse(os.path.exists(self.path))\n        self.assertTrue(os.path.exists(copy))\n        self.assertEqual(\"test_data\", LocalTarget(copy).open(\"r\").read())\n\n    def test_format_chain(self):\n        UTF8WIN = luigi.format.TextFormat(encoding=\"utf8\", newline=\"\\r\\n\")\n        t = LocalTarget(self.path, UTF8WIN >> luigi.format.Gzip)\n        a = \"我é\\nçф\"\n\n        with t.open(\"w\") as f:\n            f.write(a)\n\n        f = gzip.open(self.path, \"rb\")\n        b = f.read()\n        f.close()\n\n        
self.assertEqual(b\"\\xe6\\x88\\x91\\xc3\\xa9\\r\\n\\xc3\\xa7\\xd1\\x84\", b)\n\n    def test_format_chain_reverse(self):\n        t = LocalTarget(self.path, luigi.format.UTF8 >> luigi.format.Gzip)\n\n        f = gzip.open(self.path, \"wb\")\n        f.write(b\"\\xe6\\x88\\x91\\xc3\\xa9\\r\\n\\xc3\\xa7\\xd1\\x84\")\n        f.close()\n\n        with t.open(\"r\") as f:\n            b = f.read()\n\n        self.assertEqual(\"我é\\nçф\", b)\n\n    @mock.patch(\"os.linesep\", \"\\r\\n\")\n    def test_format_newline(self):\n        t = LocalTarget(self.path, luigi.format.SysNewLine)\n\n        with t.open(\"w\") as f:\n            f.write(b\"a\\rb\\nc\\r\\nd\")\n\n        with t.open(\"r\") as f:\n            b = f.read()\n\n        with open(self.path, \"rb\") as f:\n            c = f.read()\n\n        self.assertEqual(b\"a\\nb\\nc\\nd\", b)\n        self.assertEqual(b\"a\\r\\nb\\r\\nc\\r\\nd\", c)\n\n    def theoretical_io_modes(self, rwax=\"rwax\", bt=[\"\", \"b\", \"t\"], plus=[\"\", \"+\"]):\n        p = itertools.product(rwax, plus, bt)\n        return {\"\".join(c) for c in list(itertools.chain.from_iterable([itertools.permutations(m) for m in p]))}\n\n    def valid_io_modes(self, *a, **kw):\n        modes = set()\n        t = LocalTarget(is_tmp=True)\n        t.open(\"w\").close()\n        for mode in self.theoretical_io_modes(*a, **kw):\n            try:\n                io.FileIO(t.path, mode).close()\n            except ValueError:\n                pass\n            except IOError as err:\n                if err.errno == EEXIST:\n                    modes.add(mode)\n                else:\n                    raise\n            else:\n                modes.add(mode)\n        return modes\n\n    def valid_write_io_modes_for_luigi(self):\n        return self.valid_io_modes(\"w\", plus=[\"\"])\n\n    def valid_read_io_modes_for_luigi(self):\n        return self.valid_io_modes(\"r\", plus=[\"\"])\n\n    def invalid_io_modes_for_luigi(self):\n        return 
self.valid_io_modes().difference(self.valid_write_io_modes_for_luigi(), self.valid_read_io_modes_for_luigi())\n\n    def test_open_modes(self):\n        t = LocalTarget(is_tmp=True)\n        print(\"Valid write mode:\", end=\" \")\n        for mode in self.valid_write_io_modes_for_luigi():\n            print(mode, end=\" \")\n            p = t.open(mode)\n            p.close()\n        print()\n        print(\"Valid read mode:\", end=\" \")\n        for mode in self.valid_read_io_modes_for_luigi():\n            print(mode, end=\" \")\n            p = t.open(mode)\n            p.close()\n        print()\n        print(\"Invalid mode:\", end=\" \")\n        for mode in self.invalid_io_modes_for_luigi():\n            print(mode, end=\" \")\n            self.assertRaises(Exception, t.open, mode)\n        print()\n\n\nclass LocalTargetCreateDirectoriesTest(LocalTargetTest):\n    path = \"/tmp/%s/xyz/test.txt\" % random.randint(0, 999999999)\n    copy = \"/tmp/%s/xyz_2/copy.txt\" % random.randint(0, 999999999)\n\n\nclass LocalTargetRelativeTest(LocalTargetTest):\n    # We had a bug that caused relative file paths to fail, adding test for it\n    path = \"test.txt\"\n    copy = \"copy.txt\"\n\n\nclass TmpFileTest(unittest.TestCase):\n    def test_tmp(self):\n        t = LocalTarget(is_tmp=True)\n        self.assertFalse(t.exists())\n        self.assertFalse(os.path.exists(t.path))\n        p = t.open(\"w\")\n        print(\"test\", file=p)\n        self.assertFalse(t.exists())\n        self.assertFalse(os.path.exists(t.path))\n        p.close()\n        self.assertTrue(t.exists())\n        self.assertTrue(os.path.exists(t.path))\n\n        q = t.open(\"r\")\n        self.assertEqual(q.readline(), \"test\\n\")\n        q.close()\n        path = t.path\n        del t  # should remove the underlying file\n        self.assertFalse(os.path.exists(path))\n\n\nclass FileSystemTest(unittest.TestCase):\n    path = \"/tmp/luigi-test-dir\"\n    fs = LocalFileSystem()\n\n    def 
setUp(self):\n        if os.path.exists(self.path):\n            shutil.rmtree(self.path)\n\n    def tearDown(self):\n        self.setUp()\n\n    def test_copy(self):\n        src = os.path.join(self.path, \"src.txt\")\n        dest = os.path.join(self.path, \"newdir\", \"dest.txt\")\n\n        LocalTarget(src).open(\"w\").close()\n        self.fs.copy(src, dest)\n        self.assertTrue(os.path.exists(src))\n        self.assertTrue(os.path.exists(dest))\n\n    def test_mkdir(self):\n        testpath = os.path.join(self.path, \"foo/bar\")\n\n        self.assertRaises(MissingParentDirectory, self.fs.mkdir, testpath, parents=False)\n\n        self.fs.mkdir(testpath)\n        self.assertTrue(os.path.exists(testpath))\n        self.assertTrue(self.fs.isdir(testpath))\n\n        self.assertRaises(FileAlreadyExists, self.fs.mkdir, testpath, raise_if_exists=True)\n\n    def test_exists(self):\n        self.assertFalse(self.fs.exists(self.path))\n        os.mkdir(self.path)\n        self.assertTrue(self.fs.exists(self.path))\n        self.assertTrue(self.fs.isdir(self.path))\n\n    def test_listdir(self):\n        os.mkdir(self.path)\n        with open(self.path + \"/file\", \"w\"):\n            pass\n        self.assertTrue([self.path + \"/file\"], list(self.fs.listdir(self.path + \"/\")))\n\n    def test_move_to_new_dir(self):\n        # Regression test for a bug in LocalFileSystem.move\n        src = os.path.join(self.path, \"src.txt\")\n        dest = os.path.join(self.path, \"newdir\", \"dest.txt\")\n\n        LocalTarget(src).open(\"w\").close()\n        self.fs.move(src, dest)\n        self.assertTrue(os.path.exists(dest))\n\n\nclass DestructorTest(unittest.TestCase):\n    def test_destructor(self):\n        # LocalTarget might not be fully initialised if an exception is thrown in the constructor of LocalTarget or a\n        # subclass. 
The destructor can't expect attributes to be initialised.\n        t = LocalTarget(is_tmp=True)\n        del t.is_tmp\n        t.__del__()\n"
  },
  {
    "path": "test/lock_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\nimport subprocess\nimport tempfile\n\nimport mock\nfrom helpers import unittest\nfrom tenacity import retry, retry_if_result, stop_after_attempt, wait_exponential\n\nimport luigi\nimport luigi.lock\nimport luigi.notifications\n\nluigi.notifications.DEBUG = True\n\n\nclass TestCmd(unittest.TestCase):\n    def test_getpcmd(self):\n        def _is_empty(cmd):\n            return cmd == \"\"\n\n        # for CI stability, add retring\n        @retry(retry=retry_if_result(_is_empty), wait=wait_exponential(multiplier=0.2, min=0.1, max=3), stop=stop_after_attempt(3))\n        def _getpcmd(pid):\n            return luigi.lock.getpcmd(pid)\n\n        if os.name == \"nt\":\n            command = [\"ping\", \"1.1.1.1\", \"-w\", \"1000\"]\n        else:\n            command = [\"sleep\", \"1\"]\n\n        external_process = subprocess.Popen(command)\n        result = _getpcmd(external_process.pid)\n\n        self.assertTrue(result.strip() in [\"sleep 1\", \"[sleep]\", \"ping 1.1.1.1 -w 1000\"])\n        external_process.kill()\n\n\nclass LockTest(unittest.TestCase):\n    def setUp(self):\n        self.pid_dir = tempfile.mkdtemp()\n        self.pid, self.cmd, self.pid_file = luigi.lock.get_info(self.pid_dir)\n\n    def tearDown(self):\n        if os.path.exists(self.pid_file):\n            os.remove(self.pid_file)\n        
os.rmdir(self.pid_dir)\n\n    def test_get_info(self):\n        def _is_empty(result):\n            return result[1] == \"\"  # cmd is empty\n\n        # for CI stability, add retring\n        @retry(retry=retry_if_result(_is_empty), wait=wait_exponential(multiplier=0.2, min=0.1, max=3), stop=stop_after_attempt(3))\n        def _get_info(pid_dir, pid):\n            return luigi.lock.get_info(pid_dir, pid)\n\n        try:\n            p = subprocess.Popen([\"yes\", \"à我ф\"], stdout=subprocess.PIPE)\n            pid, cmd, pid_file = _get_info(self.pid_dir, p.pid)\n        finally:\n            p.kill()\n        self.assertEqual(cmd, \"yes à我ф\")\n\n    def test_acquiring_free_lock(self):\n        acquired = luigi.lock.acquire_for(self.pid_dir)\n        self.assertTrue(acquired)\n\n    def test_acquiring_taken_lock(self):\n        with open(self.pid_file, \"w\") as f:\n            f.write(\"%d\\n\" % (self.pid,))\n\n        acquired = luigi.lock.acquire_for(self.pid_dir)\n        self.assertFalse(acquired)\n\n    def test_acquiring_partially_taken_lock(self):\n        with open(self.pid_file, \"w\") as f:\n            f.write(\"%d\\n\" % (self.pid,))\n\n        acquired = luigi.lock.acquire_for(self.pid_dir, 2)\n        self.assertTrue(acquired)\n\n        s = os.stat(self.pid_file)\n        self.assertEqual(s.st_mode & 0o700, 0o700)\n\n    def test_acquiring_lock_from_missing_process(self):\n        fake_pid = 99999\n        with open(self.pid_file, \"w\") as f:\n            f.write(\"%d\\n\" % (fake_pid,))\n\n        acquired = luigi.lock.acquire_for(self.pid_dir)\n        self.assertTrue(acquired)\n\n        s = os.stat(self.pid_file)\n        self.assertEqual(s.st_mode & 0o700, 0o700)\n\n    @mock.patch(\"os.kill\")\n    def test_take_lock_with_kill(self, kill_fn):\n        with open(self.pid_file, \"w\") as f:\n            f.write(\"%d\\n\" % (self.pid,))\n\n        kill_signal = 77777\n        acquired = luigi.lock.acquire_for(self.pid_dir, 
kill_signal=kill_signal)\n        self.assertTrue(acquired)\n        kill_fn.assert_called_once_with(self.pid, kill_signal)\n\n    @mock.patch(\"os.kill\")\n    @mock.patch(\"luigi.lock.getpcmd\")\n    def test_take_lock_has_only_one_extra_life(self, getpcmd, kill_fn):\n        def side_effect(pid):\n            if pid in [self.pid, self.pid + 1, self.pid + 2]:\n                return self.cmd  # We could return something else too, actually\n            else:\n                return \"echo something_else\"\n\n        getpcmd.side_effect = side_effect\n        with open(self.pid_file, \"w\") as f:\n            f.write(\"{}\\n{}\\n\".format(self.pid + 1, self.pid + 2))\n\n        kill_signal = 77777\n        acquired = luigi.lock.acquire_for(self.pid_dir, kill_signal=kill_signal)\n        self.assertFalse(acquired)  # So imagine +2 was runnig first, then +1 was run with --take-lock\n        kill_fn.assert_any_call(self.pid + 1, kill_signal)\n        kill_fn.assert_any_call(self.pid + 2, kill_signal)\n\n    @mock.patch(\"luigi.lock.getpcmd\")\n    def test_cleans_old_pid_entries(self, getpcmd):\n        assert self.pid > 10  # I've never seen so low pids so\n        SAME_ENTRIES = {1, 2, 3, 4, 5, self.pid}\n        ALL_ENTRIES = SAME_ENTRIES | {6, 7, 8, 9, 10}\n\n        def side_effect(pid):\n            if pid in SAME_ENTRIES:\n                return self.cmd  # We could return something else too, actually\n            elif pid == 8:\n                return None\n            else:\n                return \"echo something_else\"\n\n        getpcmd.side_effect = side_effect\n        with open(self.pid_file, \"w\") as f:\n            f.writelines(\"{}\\n\".format(pid) for pid in ALL_ENTRIES)\n\n        acquired = luigi.lock.acquire_for(self.pid_dir, num_available=100)\n        self.assertTrue(acquired)\n\n        with open(self.pid_file, \"r\") as f:\n            self.assertEqual({int(pid_str.strip()) for pid_str in f}, SAME_ENTRIES)\n"
  },
  {
    "path": "test/metrics_test.py",
    "content": "import unittest\n\nimport luigi.metrics as metrics\nfrom luigi.contrib.datadog_metric import DatadogMetricsCollector\nfrom luigi.contrib.prometheus_metric import PrometheusMetricsCollector\n\n\nclass TestMetricsCollectors(unittest.TestCase):\n    def test_default_value(self):\n        collector = metrics.MetricsCollectors.default\n        output = metrics.MetricsCollectors.get(collector)\n\n        assert type(output) is metrics.NoMetricsCollector\n\n    def test_datadog_value(self):\n        collector = metrics.MetricsCollectors.datadog\n        output = metrics.MetricsCollectors.get(collector)\n\n        assert type(output) is DatadogMetricsCollector\n\n    def test_prometheus_value(self):\n        collector = metrics.MetricsCollectors.prometheus\n        output = metrics.MetricsCollectors.get(collector)\n\n        assert type(output) is PrometheusMetricsCollector\n\n    def test_none_value(self):\n        collector = metrics.MetricsCollectors.none\n        output = metrics.MetricsCollectors.get(collector)\n\n        assert type(output) is metrics.NoMetricsCollector\n\n    def test_other_value(self):\n        collector = \"junk\"\n\n        with self.assertRaises(ValueError) as context:\n            metrics.MetricsCollectors.get(collector)\n            assert (\"MetricsCollectors value ' junk ' isn't supported\") in str(context.exception)\n"
  },
  {
    "path": "test/mock_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import unittest\n\nfrom luigi.format import Nop\nfrom luigi.mock import MockFileSystem, MockTarget\n\n\nclass MockFileTest(unittest.TestCase):\n    def test_1(self):\n        t = MockTarget(\"test\")\n        p = t.open(\"w\")\n        print(\"test\", file=p)\n        p.close()\n\n        q = t.open(\"r\")\n        self.assertEqual(list(q), [\"test\\n\"])\n        q.close()\n\n    def test_with(self):\n        t = MockTarget(\"foo\")\n        with t.open(\"w\") as b:\n            b.write(\"bar\")\n\n        with t.open(\"r\") as b:\n            self.assertEqual(list(b), [\"bar\"])\n\n    def test_bytes(self):\n        t = MockTarget(\"foo\", format=Nop)\n        with t.open(\"wb\") as b:\n            b.write(b\"bar\")\n\n        with t.open(\"rb\") as b:\n            self.assertEqual(list(b), [b\"bar\"])\n\n    def test_default_mode_value(self):\n        t = MockTarget(\"foo\")\n        with t.open(\"w\") as b:\n            b.write(\"bar\")\n\n        with t.open() as b:\n            self.assertEqual(list(b), [\"bar\"])\n\n    def test_mode_none_error(self):\n        t = MockTarget(\"foo\")\n        with self.assertRaises(TypeError):\n            with t.open(None) as b:\n                b.write(\"bar\")\n\n    # That should work in python2 because of the autocast\n    # That should work in python3 because the default format 
is Text\n    def test_unicode(self):\n        t = MockTarget(\"foo\")\n        with t.open(\"w\") as b:\n            b.write(\"bar\")\n\n        with t.open(\"r\") as b:\n            self.assertEqual(b.read(), \"bar\")\n\n\nclass MockFileSystemTest(unittest.TestCase):\n    fs = MockFileSystem()\n\n    def _touch(self, path):\n        t = MockTarget(path)\n        with t.open(\"w\"):\n            pass\n\n    def setUp(self):\n        self.fs.clear()\n        self.path = \"/tmp/foo\"\n        self.path2 = \"/tmp/bar\"\n        self.path3 = \"/tmp/foobar\"\n        self._touch(self.path)\n        self._touch(self.path2)\n\n    def test_copy(self):\n        self.fs.copy(self.path, self.path3)\n        self.assertTrue(self.fs.exists(self.path))\n        self.assertTrue(self.fs.exists(self.path3))\n\n    def test_exists(self):\n        self.assertTrue(self.fs.exists(self.path))\n\n    def test_remove(self):\n        self.fs.remove(self.path)\n        self.assertFalse(self.fs.exists(self.path))\n\n    def test_remove_recursive(self):\n        self.fs.remove(\"/tmp\", recursive=True)\n        self.assertFalse(self.fs.exists(self.path))\n        self.assertFalse(self.fs.exists(self.path2))\n\n    def test_rename(self):\n        self.fs.rename(self.path, self.path3)\n        self.assertFalse(self.fs.exists(self.path))\n        self.assertTrue(self.fs.exists(self.path3))\n\n    def test_listdir(self):\n        self.assertEqual(sorted([self.path, self.path2]), sorted(self.fs.listdir(\"/tmp\")))\n"
  },
  {
    "path": "test/most_common_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import unittest\n\nfrom luigi.tools.range import most_common\n\n\nclass MostCommonTest(unittest.TestCase):\n    def setUp(self):\n        self.runs = [([1], (1, 1)), ([1, 1], (1, 2)), ([1, 1, 2], (1, 2)), ([1, 1, 2, 2, 2], (2, 3))]\n\n    def test_runs(self):\n        for args, result in self.runs:\n            actual = most_common(args)\n            expected = result\n            self.assertEqual(expected, actual)\n"
  },
  {
    "path": "test/mypy_test.py",
    "content": "import sys\nimport tempfile\nimport unittest\n\nfrom mypy import api\n\n\ndef _run_mypy(test_code: str):\n    with tempfile.NamedTemporaryFile(suffix=\".py\") as test_file:\n        test_file.write(test_code.encode(\"utf-8\"))\n        test_file.flush()\n        return api.run(\n            [\n                \"--no-incremental\",\n                \"--cache-dir=/dev/null\",\n                \"--show-traceback\",\n                \"--config-file\",\n                \"test/testconfig/pyproject.toml\",\n                test_file.name,\n            ]\n        )\n\n\nclass TestMyMypyPlugin(unittest.TestCase):\n    def test_plugin_no_issue(self):\n        if sys.version_info[:2] < (3, 8):\n            return\n\n        test_code = \"\"\"\nfrom datetime import date, datetime, timedelta\nfrom enum import Enum\nimport luigi\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple, Type\nfrom uuid import UUID\n\nclass MyEnum(Enum):\n    A = 1\n    B = 2\n    C = 3\n\nclass UUIDParameter(luigi.Parameter[UUID]):\n    def parse(self, s):\n        return UUID(s)\n\nclass OtherTask(luigi.Task):\n    pass\n\nclass MyTask(luigi.Task):\n    bool_p: bool = luigi.BoolParameter()\n    choice_int_p: int = luigi.parameter.ChoiceParameter(choices=[1, 2, 3])\n    choice_list_int_p: Tuple[int, ...] = luigi.parameter.ChoiceListParameter(choices=[1, 2, 3])\n    choice_list_str_p: Tuple[str, ...] = luigi.parameter.ChoiceListParameter(choices=[\"foo\", \"bar\", \"baz\"])\n    choice_str_p: str = luigi.parameter.ChoiceParameter(choices=[\"foo\", \"bar\", \"baz\"])\n    date_p: date = luigi.DateParameter()\n    datetime_p: datetime = luigi.DateSecondParameter()\n    dict_p: Dict[str, str] = luigi.DictParameter()\n    enum_p: MyEnum = luigi.parameter.EnumParameter(enum=MyEnum)\n    enums_p: Tuple[MyEnum, ...] = luigi.parameter.EnumListParameter(enum=MyEnum)\n    int_p: int = luigi.IntParameter()\n    list_float_p: Tuple[Any, ...] 
= luigi.ListParameter()\n    numeric_p: float = luigi.NumericalParameter(var_type=float, min_value=-3.0, max_value=7.0)\n    opt_p: Optional[str] = luigi.OptionalParameter()\n    path_p: Path = luigi.PathParameter()\n    str_p: str = luigi.Parameter()\n    str_p_default: str = luigi.Parameter(default=\"baz\")\n    task_p: Type[luigi.Task] = luigi.TaskParameter()\n    timedelta_p: timedelta = luigi.TimeDeltaParameter()\n    tuple_int_p: Tuple[Any, ...] = luigi.TupleParameter()\n    uuid_p: UUID = UUIDParameter()\n\nMyTask(\n    bool_p=True,\n    choice_int_p=3,\n    choice_list_int_p=(2, 3),\n    choice_list_str_p=(\"foo\", \"baz\"),\n    choice_str_p=\"foo\",\n    date_p=date.today(),\n    datetime_p=datetime.now(),\n    dict_p={\"foo\": \"bar\"},\n    enum_p=MyEnum.B,\n    enums_p=(MyEnum.A, MyEnum.C),\n    int_p=1,\n    list_float_p=(0.1, 0.2),\n    numeric_p=4.0,\n    opt_p=None,\n    path_p=Path(\"/tmp\"),\n    str_p='bar',\n    task_p=OtherTask,\n    timedelta_p=timedelta(hours=1),\n    tuple_int_p=(1, 2),\n    uuid_p=UUID(\"9b0591d7-a167-4978-bc6d-41f7d84a288c\"),\n)\n\"\"\"\n\n        stdout, stderr, exitcode = _run_mypy(test_code)\n        self.assertEqual(\n            exitcode,\n            0,\n            f\"mypy plugin error occurred:\\nstdout: {stdout}\\nstderr: {stderr}\",\n        )\n        self.assertIn(\"Success: no issues found\", stdout)\n\n    def test_plugin_invalid_arg(self):\n        if sys.version_info[:2] < (3, 8):\n            return\n\n        test_code = \"\"\"\nimport luigi\n\n\nclass MyTask(luigi.Task):\n    foo: int = luigi.IntParameter()\n    bar: str = luigi.Parameter()\n    baz: str = luigi.Parameter(default=1) # invalid assignment to str with default value int\n\n# issue:\n#   - foo is int\n#   - unknown is unknown parameter\n#   - baz is invalid assignment to str with default value int\nMyTask(foo='1', bar=\"bar\", unknown=\"unknown\")\n        \"\"\"\n\n        stdout, stderr, exitcode = _run_mypy(test_code)\n\n        
self.assertEqual(\n            exitcode,\n            1,\n            f\"mypy plugin error occurred:\\nstdout: {stdout}\\nstderr: {stderr}\",\n        )\n        self.assertIn(\n            'error: Incompatible types in assignment (expression has type \"int\", variable has type \"str\")  [assignment]',\n            stdout,\n        )  # check baz assignment\n        self.assertIn(\n            'error: Argument \"foo\" to \"MyTask\" has incompatible type \"str\"; expected \"int\"  [arg-type]',\n            stdout,\n        )  # check foo argument\n        self.assertIn(\n            'error: Unexpected keyword argument \"unknown\" for \"MyTask\"  [call-arg]',\n            stdout,\n        )  # check unknown argument\n        self.assertIn(\"Found 3 errors in 1 file (checked 1 source file)\", stdout)\n\n    def test_plugin_custom_parameter_subclass_without_default_arg(self):\n        \"\"\"Test for issue #3376: Custom Parameter subclass without 'default' in __init__\"\"\"\n        if sys.version_info[:2] < (3, 8):\n            return\n\n        test_code = \"\"\"\nimport luigi\n\n\nclass CustomPathParameter(luigi.PathParameter):\n    \\\"\\\"\\\"A PathParameter subclass that doesn't expose 'default' in its signature.\\\"\\\"\\\"\n    def __init__(self, **kwargs):\n        super().__init__(**kwargs)\n\n\nclass MyTask(luigi.Task):\n    path = CustomPathParameter()\n\"\"\"\n\n        stdout, stderr, exitcode = _run_mypy(test_code)\n        self.assertEqual(\n            exitcode,\n            0,\n            f\"mypy plugin error occurred:\\nstdout: {stdout}\\nstderr: {stderr}\",\n        )\n        self.assertIn(\"Success: no issues found\", stdout)\n\n    def test_plugin_parameter_type_annotation(self):\n        \"\"\"Test that Parameter types can be used as type annotations.\n\n        Users should be able to write:\n            foo: luigi.IntParameter = luigi.IntParameter()\n            bar: luigi.Parameter[str] = luigi.Parameter()\n        \"\"\"\n        if 
sys.version_info[:2] < (3, 8):\n            return\n\n        test_code = \"\"\"\nimport luigi\n\n\nclass MyTask(luigi.Task):\n    foo: luigi.IntParameter = luigi.IntParameter()\n    bar: luigi.StrParameter = luigi.StrParameter()\n\nMyTask(foo=1, bar='2')\n\"\"\"\n\n        stdout, stderr, exitcode = _run_mypy(test_code)\n        self.assertEqual(\n            exitcode,\n            0,\n            f\"mypy plugin error occurred:\\nstdout: {stdout}\\nstderr: {stderr}\",\n        )\n        self.assertIn(\"Success: no issues found\", stdout)\n\n    def test_plugin_parameter_type_annotation_invalid_arg(self):\n        \"\"\"Test that Parameter type annotations catch type errors in __init__ args.\n\n        MyTask(foo='1', bar='2') should error because foo expects int, not str.\n        \"\"\"\n        if sys.version_info[:2] < (3, 8):\n            return\n\n        test_code = \"\"\"\nimport luigi\n\n\nclass MyTask(luigi.Task):\n    foo: luigi.IntParameter = luigi.IntParameter()\n    bar: luigi.StrParameter = luigi.StrParameter()\n\nMyTask(foo='1', bar='2')\n\"\"\"\n\n        stdout, stderr, exitcode = _run_mypy(test_code)\n        self.assertEqual(\n            exitcode,\n            1,\n            f\"Expected mypy error but got:\\nstdout: {stdout}\\nstderr: {stderr}\",\n        )\n        self.assertIn(\n            'error: Argument \"foo\" to \"MyTask\" has incompatible type \"str\"; expected \"int\"',\n            stdout,\n        )\n        self.assertIn(\"Found 1 error in 1 file (checked 1 source file)\", stdout)\n"
  },
  {
    "path": "test/notifications_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport socket\nimport sys\n\nimport mock\nfrom helpers import unittest, with_config\n\nimport luigi\nfrom luigi import notifications\nfrom luigi.notifications import generate_email\nfrom luigi.scheduler import Scheduler\nfrom luigi.worker import Worker\n\n\nclass TestEmail(unittest.TestCase):\n    def testEmailNoPrefix(self):\n        self.assertEqual(\"subject\", notifications._prefix(\"subject\"))\n\n    @with_config({\"email\": {\"prefix\": \"[prefix]\"}})\n    def testEmailPrefix(self):\n        self.assertEqual(\"[prefix] subject\", notifications._prefix(\"subject\"))\n\n\nclass TestException(Exception):\n    pass\n\n\nclass TestTask(luigi.Task):\n    foo = luigi.Parameter()\n    bar = luigi.Parameter()\n\n\nclass FailSchedulingTask(TestTask):\n    def requires(self):\n        raise TestException(\"Oops!\")\n\n    def run(self):\n        pass\n\n    def complete(self):\n        return False\n\n\nclass FailRunTask(TestTask):\n    def run(self):\n        raise TestException(\"Oops!\")\n\n    def complete(self):\n        return False\n\n\nclass ExceptionFormatTest(unittest.TestCase):\n    def setUp(self):\n        self.sch = Scheduler()\n\n    def test_fail_run(self):\n        task = FailRunTask(foo=\"foo\", bar=\"bar\")\n        self._run_task(task)\n\n    def test_fail_run_html(self):\n        task = FailRunTask(foo=\"foo\", 
bar=\"bar\")\n        self._run_task_html(task)\n\n    def test_fail_schedule(self):\n        task = FailSchedulingTask(foo=\"foo\", bar=\"bar\")\n        self._run_task(task)\n\n    def test_fail_schedule_html(self):\n        task = FailSchedulingTask(foo=\"foo\", bar=\"bar\")\n        self._run_task_html(task)\n\n    @with_config({\"email\": {\"receiver\": \"nowhere@example.com\", \"prefix\": \"[TEST] \"}})\n    @mock.patch(\"luigi.notifications.send_error_email\")\n    def _run_task(self, task, mock_send):\n        with Worker(scheduler=self.sch) as w:\n            w.add(task)\n            w.run()\n\n        self.assertEqual(mock_send.call_count, 1)\n        args, kwargs = mock_send.call_args\n        self._check_subject(args[0], task)\n        self._check_body(args[1], task, html=False)\n\n    @with_config({\"email\": {\"receiver\": \"nowhere@axample.com\", \"prefix\": \"[TEST] \", \"format\": \"html\"}})\n    @mock.patch(\"luigi.notifications.send_error_email\")\n    def _run_task_html(self, task, mock_send):\n        with Worker(scheduler=self.sch) as w:\n            w.add(task)\n            w.run()\n\n        self.assertEqual(mock_send.call_count, 1)\n        args, kwargs = mock_send.call_args\n        self._check_subject(args[0], task)\n        self._check_body(args[1], task, html=True)\n\n    def _check_subject(self, subject, task):\n        self.assertIn(str(task), subject)\n\n    def _check_body(self, body, task, html=False):\n        if html:\n            self.assertIn(\"<th>name</th><td>{}</td>\".format(task.task_family), body)\n            self.assertIn('<div class=\"highlight\"', body)\n            self.assertIn(\"Oops!\", body)\n\n            for param, value in task.param_kwargs.items():\n                self.assertIn(\"<th>{}</th><td>{}</td>\".format(param, value), body)\n        else:\n            self.assertIn(\"Name: {}\\n\".format(task.task_family), body)\n            self.assertIn(\"Parameters:\\n\", body)\n            
self.assertIn(\"TestException: Oops!\", body)\n\n            for param, value in task.param_kwargs.items():\n                self.assertIn(\"{}: {}\\n\".format(param, value), body)\n\n    @with_config({\"email\": {\"receiver\": \"a@a.a\"}})\n    def testEmailRecipients(self):\n        self.assertCountEqual(notifications._email_recipients(), [\"a@a.a\"])\n        self.assertCountEqual(notifications._email_recipients(\"b@b.b\"), [\"a@a.a\", \"b@b.b\"])\n        self.assertCountEqual(notifications._email_recipients([\"b@b.b\", \"c@c.c\"]), [\"a@a.a\", \"b@b.b\", \"c@c.c\"])\n\n    @with_config({\"email\": {}}, replace_sections=True)\n    def testEmailRecipientsNoConfig(self):\n        self.assertCountEqual(notifications._email_recipients(), [])\n        self.assertCountEqual(notifications._email_recipients(\"a@a.a\"), [\"a@a.a\"])\n        self.assertCountEqual(notifications._email_recipients([\"a@a.a\", \"b@b.b\"]), [\"a@a.a\", \"b@b.b\"])\n\n    def test_generate_unicode_email(self):\n        generate_email(\n            sender=\"test@example.com\",\n            subject=\"sübjéct\",\n            message=\"你好\",\n            recipients=[\"receiver@example.com\"],\n            image_png=None,\n        )\n\n\nclass NotificationFixture:\n    \"\"\"\n    Defines API and message fixture.\n\n    config, sender, subject, message, recipients, image_png\n    \"\"\"\n\n    sender = \"luigi@unittest\"\n    subject = \"Oops!\"\n    message = \"\"\"A multiline\n                 message.\"\"\"\n    recipients = [\"noone@nowhere.no\", \"phantom@opera.fr\"]\n    image_png = None\n\n    notification_args = [sender, subject, message, recipients, image_png]\n    mocked_email_msg = \"\"\"Content-Type: multipart/related; boundary=\"===============0998157881==\"\nMIME-Version: 1.0\nSubject: Oops!\nFrom: luigi@unittest\nTo: noone@nowhere.no,phantom@opera.fr\n\n--===============0998157881==\nMIME-Version: 1.0\nContent-Transfer-Encoding: 7bit\nContent-Type: text/plain; charset=\"utf-8\"\n\nA 
multiline\nmessage.\n--===============0998157881==--\"\"\"\n\n\nclass TestSMTPEmail(unittest.TestCase, NotificationFixture):\n    \"\"\"\n    Tests sending SMTP email.\n    \"\"\"\n\n    def setUp(self):\n        sys.modules[\"smtplib\"] = mock.MagicMock()\n        import smtplib  # noqa: F401\n\n    def tearDown(self):\n        del sys.modules[\"smtplib\"]\n\n    @with_config(\n        {\n            \"smtp\": {\n                \"ssl\": \"False\",\n                \"host\": \"my.smtp.local\",\n                \"port\": \"999\",\n                \"local_hostname\": \"ptms\",\n                \"timeout\": \"1200\",\n                \"username\": \"Robin\",\n                \"password\": \"dooH\",\n                \"no_tls\": \"False\",\n            }\n        }\n    )\n    def test_sends_smtp_email(self):\n        \"\"\"\n        Call notifications.send_email_smtp with fixture parameters with smtp_without_tls  set to False\n        and check that sendmail is properly called.\n        \"\"\"\n\n        smtp_kws = {\"host\": \"my.smtp.local\", \"port\": 999, \"local_hostname\": \"ptms\", \"timeout\": 1200}\n\n        with mock.patch(\"smtplib.SMTP\") as SMTP:\n            with mock.patch(\"luigi.notifications.generate_email\") as generate_email:\n                generate_email.return_value.as_string.return_value = self.mocked_email_msg\n\n                notifications.send_email_smtp(*self.notification_args)\n\n                SMTP.assert_called_once_with(**smtp_kws)\n                SMTP.return_value.login.assert_called_once_with(\"Robin\", \"dooH\")\n                SMTP.return_value.starttls.assert_called_once_with()\n                SMTP.return_value.sendmail.assert_called_once_with(self.sender, self.recipients, self.mocked_email_msg)\n\n    @with_config(\n        {\n            \"smtp\": {\n                \"ssl\": \"False\",\n                \"host\": \"my.smtp.local\",\n                \"port\": \"999\",\n                \"local_hostname\": \"ptms\",\n         
       \"timeout\": \"1200\",\n                \"username\": \"Robin\",\n                \"password\": \"dooH\",\n                \"no_tls\": \"True\",\n            }\n        }\n    )\n    def test_sends_smtp_email_without_tls(self):\n        \"\"\"\n        Call notifications.send_email_smtp with fixture parameters with no_tls  set to True\n        and check that sendmail is properly called without also calling\n        starttls.\n        \"\"\"\n        smtp_kws = {\"host\": \"my.smtp.local\", \"port\": 999, \"local_hostname\": \"ptms\", \"timeout\": 1200}\n\n        with mock.patch(\"smtplib.SMTP\") as SMTP:\n            with mock.patch(\"luigi.notifications.generate_email\") as generate_email:\n                generate_email.return_value.as_string.return_value = self.mocked_email_msg\n\n                notifications.send_email_smtp(*self.notification_args)\n\n                SMTP.assert_called_once_with(**smtp_kws)\n                self.assertEqual(SMTP.return_value.starttls.called, False)\n                SMTP.return_value.login.assert_called_once_with(\"Robin\", \"dooH\")\n                SMTP.return_value.sendmail.assert_called_once_with(self.sender, self.recipients, self.mocked_email_msg)\n\n    @with_config(\n        {\n            \"smtp\": {\n                \"ssl\": \"False\",\n                \"host\": \"my.smtp.local\",\n                \"port\": \"999\",\n                \"local_hostname\": \"ptms\",\n                \"timeout\": \"1200\",\n                \"username\": \"Robin\",\n                \"password\": \"dooH\",\n                \"no_tls\": \"True\",\n            }\n        }\n    )\n    def test_sends_smtp_email_exceptions(self):\n        \"\"\"\n        Call notifications.send_email_smtp when it cannot connect to smtp server (socket.error)\n        starttls.\n        \"\"\"\n        smtp_kws = {\"host\": \"my.smtp.local\", \"port\": 999, \"local_hostname\": \"ptms\", \"timeout\": 1200}\n\n        with mock.patch(\"smtplib.SMTP\") as 
SMTP:\n            with mock.patch(\"luigi.notifications.generate_email\") as generate_email:\n                SMTP.side_effect = socket.error()\n                generate_email.return_value.as_string.return_value = self.mocked_email_msg\n\n                try:\n                    notifications.send_email_smtp(*self.notification_args)\n                except socket.error:\n                    self.fail(\"send_email_smtp() raised expection unexpectedly\")\n\n                SMTP.assert_called_once_with(**smtp_kws)\n                self.assertEqual(notifications.generate_email.called, False)\n                self.assertEqual(SMTP.sendemail.called, False)\n\n\nclass TestSendgridEmail(unittest.TestCase, NotificationFixture):\n    \"\"\"\n    Tests sending Sendgrid email.\n    \"\"\"\n\n    def setUp(self):\n        sys.modules[\"sendgrid\"] = mock.MagicMock()\n        import sendgrid  # noqa: F401\n\n    def tearDown(self):\n        del sys.modules[\"sendgrid\"]\n\n    @with_config({\"sendgrid\": {\"apikey\": \"456abcdef123\"}})\n    def test_sends_sendgrid_email(self):\n        \"\"\"\n        Call notifications.send_email_sendgrid with fixture parameters\n        and check that SendGridAPIClient is properly called.\n        \"\"\"\n\n        with mock.patch(\"sendgrid.SendGridAPIClient\") as SendGridAPIClient:\n            notifications.send_email_sendgrid(*self.notification_args)\n\n            SendGridAPIClient.assert_called_once_with(\"456abcdef123\")\n            self.assertTrue(SendGridAPIClient.return_value.send.called)\n\n\nclass TestSESEmail(unittest.TestCase, NotificationFixture):\n    \"\"\"\n    Tests sending email through AWS SES.\n    \"\"\"\n\n    def setUp(self):\n        sys.modules[\"boto3\"] = mock.MagicMock()\n        import boto3  # noqa: F401\n\n    def tearDown(self):\n        del sys.modules[\"boto3\"]\n\n    @with_config({})\n    def test_sends_ses_email(self):\n        \"\"\"\n        Call notifications.send_email_ses with fixture 
parameters\n        and check that boto is properly called.\n        \"\"\"\n\n        with mock.patch(\"boto3.client\") as boto_client:\n            with mock.patch(\"luigi.notifications.generate_email\") as generate_email:\n                generate_email.return_value.as_string.return_value = self.mocked_email_msg\n\n                notifications.send_email_ses(*self.notification_args)\n\n                SES = boto_client.return_value\n                SES.send_raw_email.assert_called_once_with(Source=self.sender, Destinations=self.recipients, RawMessage={\"Data\": self.mocked_email_msg})\n\n\nclass TestSNSNotification(unittest.TestCase, NotificationFixture):\n    \"\"\"\n    Tests sending email through AWS SNS.\n    \"\"\"\n\n    def setUp(self):\n        sys.modules[\"boto3\"] = mock.MagicMock()\n        import boto3  # noqa: F401\n\n    def tearDown(self):\n        del sys.modules[\"boto3\"]\n\n    @with_config({})\n    def test_sends_sns_email(self):\n        \"\"\"\n        Call notifications.send_email_sns with fixture parameters\n        and check that boto3 is properly called.\n        \"\"\"\n\n        with mock.patch(\"boto3.resource\") as res:\n            notifications.send_email_sns(*self.notification_args)\n\n            SNS = res.return_value\n            SNS.Topic.assert_called_once_with(self.recipients[0])\n            SNS.Topic.return_value.publish.assert_called_once_with(Subject=self.subject, Message=self.message)\n\n    @with_config({})\n    def test_sns_subject_is_shortened(self):\n        \"\"\"\n        Call notifications.send_email_sns with too long Subject (more than 100 chars)\n        and check that it is cut to length of 100 chars.\n        \"\"\"\n\n        long_subject = (\n            \"Luigi: SanityCheck(regexPattern=aligned-source\\\\|data-not-older\\\\|source-chunks-compl,mailFailure=False, mongodb=mongodb://localhost/stats) FAILED\"\n        )\n\n        with mock.patch(\"boto3.resource\") as res:\n            
notifications.send_email_sns(self.sender, long_subject, self.message, self.recipients, self.image_png)\n\n            SNS = res.return_value\n            SNS.Topic.assert_called_once_with(self.recipients[0])\n            called_subj = SNS.Topic.return_value.publish.call_args[1][\"Subject\"]\n            self.assertTrue(len(called_subj) <= 100, \"Subject can be max 100 chars long! Found {}.\".format(len(called_subj)))\n\n\nclass TestNotificationDispatcher(unittest.TestCase, NotificationFixture):\n    \"\"\"\n    Test dispatching of notifications on configuration values.\n    \"\"\"\n\n    def check_dispatcher(self, target):\n        \"\"\"\n        Call notifications.send_email and test that the proper\n        function was called.\n        \"\"\"\n\n        expected_args = self.notification_args\n\n        with mock.patch(\"luigi.notifications.{}\".format(target)) as sender:\n            notifications.send_email(self.subject, self.message, self.sender, self.recipients, image_png=self.image_png)\n\n            self.assertTrue(sender.called)\n\n            call_args = sender.call_args[0]\n\n            self.assertEqual(tuple(expected_args), call_args)\n\n    @with_config({\"email\": {\"force_send\": \"True\", \"method\": \"smtp\"}})\n    def test_smtp(self):\n        return self.check_dispatcher(\"send_email_smtp\")\n\n    @with_config({\"email\": {\"force_send\": \"True\", \"method\": \"ses\"}})\n    def test_ses(self):\n        return self.check_dispatcher(\"send_email_ses\")\n\n    @with_config({\"email\": {\"force_send\": \"True\", \"method\": \"sendgrid\"}})\n    def test_sendgrid(self):\n        return self.check_dispatcher(\"send_email_sendgrid\")\n\n    @with_config({\"email\": {\"force_send\": \"True\", \"method\": \"sns\"}})\n    def test_sns(self):\n        return self.check_dispatcher(\"send_email_sns\")\n"
  },
  {
    "path": "test/numerical_parameter_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom operator import le, lt\n\nfrom helpers import unittest\n\nimport luigi\n\n\nclass NumericalParameterTest(unittest.TestCase):\n    def test_int_min_value_inclusive(self):\n        d = luigi.NumericalParameter(var_type=int, min_value=-3, max_value=7, left_op=le, right_op=lt)\n        self.assertEqual(-3, d.parse(-3))\n\n    def test_float_min_value_inclusive(self):\n        d = luigi.NumericalParameter(var_type=float, min_value=-3, max_value=7, left_op=le, right_op=lt)\n        self.assertEqual(-3.0, d.parse(-3))\n\n    def test_int_min_value_exclusive(self):\n        d = luigi.NumericalParameter(var_type=int, min_value=-3, max_value=7, left_op=lt, right_op=lt)\n        self.assertRaises(ValueError, lambda: d.parse(-3))\n\n    def test_float_min_value_exclusive(self):\n        d = luigi.NumericalParameter(var_type=int, min_value=-3, max_value=7, left_op=lt, right_op=lt)\n        self.assertRaises(ValueError, lambda: d.parse(-3))\n\n    def test_int_max_value_inclusive(self):\n        d = luigi.NumericalParameter(var_type=int, min_value=-3, max_value=7, left_op=le, right_op=le)\n        self.assertEqual(7, d.parse(7))\n\n    def test_float_max_value_inclusive(self):\n        d = luigi.NumericalParameter(var_type=float, min_value=-3, max_value=7, left_op=le, right_op=le)\n        self.assertEqual(7, d.parse(7))\n\n    def 
test_int_max_value_exclusive(self):\n        d = luigi.NumericalParameter(var_type=int, min_value=-3, max_value=7, left_op=le, right_op=lt)\n        self.assertRaises(ValueError, lambda: d.parse(7))\n\n    def test_float_max_value_exclusive(self):\n        d = luigi.NumericalParameter(var_type=float, min_value=-3, max_value=7, left_op=le, right_op=lt)\n        self.assertRaises(ValueError, lambda: d.parse(7))\n\n    def test_defaults_start_range(self):\n        d = luigi.NumericalParameter(var_type=int, min_value=-3, max_value=7)\n        self.assertEqual(-3, d.parse(-3))\n\n    def test_endpoint_default_exclusive(self):\n        d = luigi.NumericalParameter(var_type=int, min_value=-3, max_value=7)\n        self.assertRaises(ValueError, lambda: d.parse(7))\n\n    def test_var_type_parameter_exception(self):\n        self.assertRaises(luigi.parameter.ParameterException, lambda: luigi.NumericalParameter(min_value=-3, max_value=7))\n\n    def test_min_value_parameter_exception(self):\n        self.assertRaises(luigi.parameter.ParameterException, lambda: luigi.NumericalParameter(var_type=int, max_value=7))\n\n    def test_max_value_parameter_exception(self):\n        self.assertRaises(luigi.parameter.ParameterException, lambda: luigi.NumericalParameter(var_type=int, min_value=-3))\n\n    def test_hash_int(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.NumericalParameter(var_type=int, min_value=-3, max_value=7)\n\n        p = luigi.parameter.NumericalParameter(var_type=int, min_value=-3, max_value=7)\n        self.assertEqual(hash(Foo(args=-3).args), hash(p.parse(\"-3\")))\n\n    def test_hash_float(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.NumericalParameter(var_type=float, min_value=-3, max_value=7)\n\n        p = luigi.parameter.NumericalParameter(var_type=float, min_value=-3, max_value=7)\n        self.assertEqual(hash(Foo(args=-3.0).args), hash(p.parse(\"-3.0\")))\n\n    def 
test_int_serialize_parse(self):\n        a = luigi.parameter.NumericalParameter(var_type=int, min_value=-3, max_value=7)\n        b = -3\n        self.assertEqual(b, a.parse(a.serialize(b)))\n\n    def test_float_serialize_parse(self):\n        a = luigi.parameter.NumericalParameter(var_type=float, min_value=-3, max_value=7)\n        b = -3.0\n        self.assertEqual(b, a.parse(a.serialize(b)))\n"
  },
  {
    "path": "test/optional_parameter_test.py",
    "content": "import warnings\n\nimport mock\nfrom helpers import LuigiTestCase, with_config\n\nimport luigi\n\n\nclass OptionalParameterTest(LuigiTestCase):\n    def actual_test(self, cls, default, expected_value, expected_type, bad_data, **kwargs):\n\n        class TestConfig(luigi.Config):\n            param = cls(default=default, **kwargs)\n            empty_param = cls(default=default, **kwargs)\n\n            def run(self):\n                assert self.param == expected_value\n                assert self.empty_param is None\n\n        # Test parsing empty string (should be None)\n        self.assertIsNone(cls(**kwargs).parse(\"\"))\n\n        # Test next_in_enumeration always returns None for summary\n        self.assertIsNone(TestConfig.param.next_in_enumeration(expected_value))\n        self.assertIsNone(TestConfig.param.next_in_enumeration(None))\n\n        # Test that warning is raised only with bad type\n        with mock.patch(\"luigi.parameter.warnings\") as warnings:\n            TestConfig()\n            warnings.warn.assert_not_called()\n\n        if cls != luigi.OptionalChoiceParameter:\n            with mock.patch(\"luigi.parameter.warnings\") as warnings:\n                TestConfig(param=None)\n                warnings.warn.assert_not_called()\n\n            with mock.patch(\"luigi.parameter.warnings\") as warnings:\n                TestConfig(param=bad_data)\n                if cls == luigi.OptionalBoolParameter:\n                    warnings.warn.assert_not_called()\n                else:\n                    warnings.warn.assert_called_with(\n                        '{} \"param\" with value \"{}\" is not of type \"{}\" or None.'.format(cls.__name__, bad_data, expected_type),\n                        luigi.parameter.OptionalParameterTypeWarning,\n                    )\n\n        # Test with value from config\n        self.assertTrue(luigi.build([TestConfig()], local_scheduler=True))\n\n    @with_config({\"TestConfig\": {\"param\": \"expected 
value\", \"empty_param\": \"\"}})\n    def test_optional_parameter(self):\n        self.actual_test(luigi.OptionalParameter, None, \"expected value\", \"str\", 0)\n        self.actual_test(luigi.OptionalParameter, \"default value\", \"expected value\", \"str\", 0)\n\n    @with_config({\"TestConfig\": {\"param\": \"10\", \"empty_param\": \"\"}})\n    def test_optional_int_parameter(self):\n        self.actual_test(luigi.OptionalIntParameter, None, 10, \"int\", \"bad data\")\n        self.actual_test(luigi.OptionalIntParameter, 1, 10, \"int\", \"bad data\")\n\n    @with_config({\"TestConfig\": {\"param\": \"true\", \"empty_param\": \"\"}})\n    def test_optional_bool_parameter(self):\n        self.actual_test(luigi.OptionalBoolParameter, None, True, \"bool\", \"bad data\")\n        self.actual_test(luigi.OptionalBoolParameter, False, True, \"bool\", \"bad data\")\n\n    @with_config({\"TestConfig\": {\"param\": \"10.5\", \"empty_param\": \"\"}})\n    def test_optional_float_parameter(self):\n        self.actual_test(luigi.OptionalFloatParameter, None, 10.5, \"float\", \"bad data\")\n        self.actual_test(luigi.OptionalFloatParameter, 1.5, 10.5, \"float\", \"bad data\")\n\n    @with_config({\"TestConfig\": {\"param\": '{\"a\": 10}', \"empty_param\": \"\"}})\n    def test_optional_dict_parameter(self):\n        self.actual_test(luigi.OptionalDictParameter, None, {\"a\": 10}, \"FrozenOrderedDict\", \"bad data\")\n        self.actual_test(luigi.OptionalDictParameter, {\"a\": 1}, {\"a\": 10}, \"FrozenOrderedDict\", \"bad data\")\n\n    @with_config({\"TestConfig\": {\"param\": \"[10.5]\", \"empty_param\": \"\"}})\n    def test_optional_list_parameter(self):\n        self.actual_test(luigi.OptionalListParameter, None, (10.5,), \"tuple\", \"bad data\")\n        self.actual_test(luigi.OptionalListParameter, (1.5,), (10.5,), \"tuple\", \"bad data\")\n\n    @with_config({\"TestConfig\": {\"param\": \"[10.5]\", \"empty_param\": \"\"}})\n    def 
test_optional_tuple_parameter(self):\n        self.actual_test(luigi.OptionalTupleParameter, None, (10.5,), \"tuple\", \"bad data\")\n        self.actual_test(luigi.OptionalTupleParameter, (1.5,), (10.5,), \"tuple\", \"bad data\")\n\n    @with_config({\"TestConfig\": {\"param\": \"10.5\", \"empty_param\": \"\"}})\n    def test_optional_numerical_parameter_float(self):\n        self.actual_test(luigi.OptionalNumericalParameter, None, 10.5, \"float\", \"bad data\", var_type=float, min_value=0, max_value=100)\n        self.actual_test(luigi.OptionalNumericalParameter, 1.5, 10.5, \"float\", \"bad data\", var_type=float, min_value=0, max_value=100)\n\n    @with_config({\"TestConfig\": {\"param\": \"10\", \"empty_param\": \"\"}})\n    def test_optional_numerical_parameter_int(self):\n        self.actual_test(luigi.OptionalNumericalParameter, None, 10, \"int\", \"bad data\", var_type=int, min_value=0, max_value=100)\n        self.actual_test(luigi.OptionalNumericalParameter, 1, 10, \"int\", \"bad data\", var_type=int, min_value=0, max_value=100)\n\n    @with_config({\"TestConfig\": {\"param\": \"expected value\", \"empty_param\": \"\"}})\n    def test_optional_choice_parameter(self):\n        choices = [\"default value\", \"expected value\"]\n        self.actual_test(luigi.OptionalChoiceParameter, None, \"expected value\", \"str\", \"bad data\", choices=choices)\n        self.actual_test(luigi.OptionalChoiceParameter, \"default value\", \"expected value\", \"str\", \"bad data\", choices=choices)\n\n    @with_config({\"TestConfig\": {\"param\": \"1\", \"empty_param\": \"\"}})\n    def test_optional_choice_parameter_int(self):\n        choices = [0, 1, 2]\n        self.actual_test(luigi.OptionalChoiceParameter, None, 1, \"int\", \"bad data\", var_type=int, choices=choices)\n        self.actual_test(luigi.OptionalChoiceParameter, \"default value\", 1, \"int\", \"bad data\", var_type=int, choices=choices)\n\n    def test_warning(self):\n        class 
TestOptionalFloatParameterSingleType(luigi.parameter.OptionalParameter, luigi.FloatParameter):\n            expected_type = float\n\n        class TestOptionalFloatParameterMultiTypes(luigi.parameter.OptionalParameter, luigi.FloatParameter):\n            expected_type = (int, float)\n\n        class TestConfig(luigi.Config):\n            param_single = TestOptionalFloatParameterSingleType()\n            param_multi = TestOptionalFloatParameterMultiTypes()\n\n        with warnings.catch_warnings(record=True) as record:\n            TestConfig(param_single=0.0, param_multi=1.0)\n\n        assert len(record) == 0\n\n        with warnings.catch_warnings(record=True) as record:\n            warnings.filterwarnings(\n                action=\"ignore\",\n                category=Warning,\n            )\n            warnings.simplefilter(\n                action=\"always\",\n                category=luigi.parameter.OptionalParameterTypeWarning,\n            )\n            assert luigi.build([TestConfig(param_single=\"0\", param_multi=\"1\")], local_scheduler=True)\n\n        assert len(record) == 2\n        assert issubclass(record[0].category, luigi.parameter.OptionalParameterTypeWarning)\n        assert issubclass(record[1].category, luigi.parameter.OptionalParameterTypeWarning)\n        assert str(record[0].message) == ('TestOptionalFloatParameterSingleType \"param_single\" with value \"0\" is not of type \"float\" or None.')\n        assert str(record[1].message) == ('TestOptionalFloatParameterMultiTypes \"param_multi\" with value \"1\" is not of any type in [\"int\", \"float\"] or None.')\n"
  },
  {
    "path": "test/other_module.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport luigi\n\n\nclass OtherModuleTask(luigi.Task):\n    p = luigi.Parameter()\n\n    def output(self):\n        return luigi.LocalTarget(self.p)\n\n    def run(self):\n        with self.output().open(\"w\") as f:\n            f.write(\"Done!\")\n"
  },
  {
    "path": "test/parameter_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport datetime\nimport enum\nfrom datetime import timedelta\n\nimport mock\nimport pytest\nfrom helpers import LuigiTestCase, RunOnceTask, in_parse, parsing, with_config\nfrom worker_test import email_patch\n\nimport luigi\nimport luigi.date_interval\nimport luigi.interface\nimport luigi.notifications\nfrom luigi.mock import MockTarget\nfrom luigi.parameter import ParameterException\n\nluigi.notifications.DEBUG = True\n\n\nclass A(luigi.Task):\n    _visible_in_registry = False  # test fixture: invisible to registry to prevent name conflicts\n    p = luigi.IntParameter()\n\n\nclass WithDefault(luigi.Task):\n    x = luigi.Parameter(default=\"xyz\")\n\n\nclass WithDefaultTrue(luigi.Task):\n    x = luigi.BoolParameter(default=True)\n\n\nclass WithDefaultFalse(luigi.Task):\n    x = luigi.BoolParameter(default=False)\n\n\nclass Foo(luigi.Task):\n    _visible_in_registry = False  # test fixture: invisible to registry to prevent name conflicts\n    bar = luigi.Parameter()\n    p2 = luigi.IntParameter()\n    not_a_param = \"lol\"\n\n\nclass Baz(luigi.Task):\n    bool = luigi.BoolParameter()\n    bool_true = luigi.BoolParameter(default=True)\n    bool_explicit = luigi.BoolParameter(parsing=luigi.BoolParameter.EXPLICIT_PARSING)\n\n    def run(self):\n        Baz._val = self.bool\n        Baz._val_true = self.bool_true\n        Baz._val_explicit = 
self.bool_explicit\n\n\nclass ListFoo(luigi.Task):\n    my_list = luigi.ListParameter()\n\n    def run(self):\n        ListFoo._val = self.my_list\n\n\nclass TupleFoo(luigi.Task):\n    my_tuple = luigi.TupleParameter()\n\n    def run(self):\n        TupleFoo._val = self.my_tuple\n\n\nclass ForgotParam(luigi.Task):\n    param = luigi.Parameter()\n\n    def run(self):\n        pass\n\n\nclass ForgotParamDep(luigi.Task):\n    def requires(self):\n        return ForgotParam()\n\n    def run(self):\n        pass\n\n\nclass BananaDep(luigi.Task):\n    x = luigi.Parameter()\n    y = luigi.Parameter(default=\"def\")\n\n    def output(self):\n        return MockTarget(\"banana-dep-%s-%s\" % (self.x, self.y))\n\n    def run(self):\n        self.output().open(\"w\").close()\n\n\nclass Banana(luigi.Task):\n    x = luigi.Parameter()\n    y = luigi.Parameter()\n    style = luigi.Parameter(default=None)\n\n    def requires(self):\n        if self.style is None:\n            return BananaDep()  # will fail\n        elif self.style == \"x-arg\":\n            return BananaDep(self.x)\n        elif self.style == \"y-kwarg\":\n            return BananaDep(y=self.y)\n        elif self.style == \"x-arg-y-arg\":\n            return BananaDep(self.x, self.y)\n        else:\n            raise Exception(\"unknown style\")\n\n    def output(self):\n        return MockTarget(\"banana-%s-%s\" % (self.x, self.y))\n\n    def run(self):\n        self.output().open(\"w\").close()\n\n\nclass MyConfig(luigi.Config):\n    mc_p = luigi.IntParameter()\n    mc_q = luigi.IntParameter(default=73)\n\n\nclass MyConfigWithoutSection(luigi.Config):\n    use_cmdline_section = False\n    mc_r = luigi.IntParameter()\n    mc_s = luigi.IntParameter(default=99)\n\n\nclass NoopTask(luigi.Task):\n    pass\n\n\nclass MyEnum(enum.Enum):\n    A = 1\n    C = 3\n\n\ndef _value(parameter):\n    \"\"\"\n    A hackish way to get the \"value\" of a parameter.\n\n    Previously Parameter exposed ``param_obj._value``. 
This is replacement for\n    that so I don't need to rewrite all test cases.\n    \"\"\"\n\n    class DummyLuigiTask(luigi.Task):\n        param = parameter\n\n    return DummyLuigiTask().param\n\n\nclass ParameterTest(LuigiTestCase):\n    def test_default_param(self):\n        self.assertEqual(WithDefault().x, \"xyz\")\n\n    def test_missing_param(self):\n        def create_a():\n            return A()\n\n        self.assertRaises(luigi.parameter.MissingParameterException, create_a)\n\n    def test_unknown_param(self):\n        def create_a():\n            return A(p=5, q=4)\n\n        self.assertRaises(luigi.parameter.UnknownParameterException, create_a)\n\n    def test_unknown_param_2(self):\n        def create_a():\n            return A(1, 2, 3)\n\n        self.assertRaises(luigi.parameter.UnknownParameterException, create_a)\n\n    def test_duplicated_param(self):\n        def create_a():\n            return A(5, p=7)\n\n        self.assertRaises(luigi.parameter.DuplicateParameterException, create_a)\n\n    def test_parameter_registration(self):\n        self.assertEqual(len(Foo.get_params()), 2)\n\n    def test_task_creation(self):\n        f = Foo(\"barval\", p2=5)\n        self.assertEqual(len(f.get_params()), 2)\n        self.assertEqual(f.bar, \"barval\")\n        self.assertEqual(f.p2, 5)\n        self.assertEqual(f.not_a_param, \"lol\")\n\n    def test_bool_parsing(self):\n        self.run_locally([\"Baz\"])\n        self.assertFalse(Baz._val)\n        self.assertTrue(Baz._val_true)\n        self.assertFalse(Baz._val_explicit)\n\n        self.run_locally([\"Baz\", \"--bool\", \"--bool-true\"])\n        self.assertTrue(Baz._val)\n        self.assertTrue(Baz._val_true)\n\n        self.run_locally([\"Baz\", \"--bool-explicit\", \"true\"])\n        self.assertTrue(Baz._val_explicit)\n\n        self.run_locally([\"Baz\", \"--bool-explicit\", \"false\"])\n        self.assertFalse(Baz._val_explicit)\n\n    def test_bool_default(self):\n        
self.assertTrue(WithDefaultTrue().x)\n        self.assertFalse(WithDefaultFalse().x)\n\n    def test_bool_coerce(self):\n        self.assertTrue(WithDefaultTrue(x=\"true\").x)\n        self.assertFalse(WithDefaultTrue(x=\"false\").x)\n\n    def test_bool_no_coerce_none(self):\n        self.assertIsNone(WithDefaultTrue(x=None).x)\n\n    def test_forgot_param(self):\n        self.assertRaises(\n            luigi.parameter.MissingParameterException,\n            self.run_locally,\n            [\"ForgotParam\"],\n        )\n\n    @email_patch\n    def test_forgot_param_in_dep(self, emails):\n        # A programmatic missing parameter will cause an error email to be sent\n        self.run_locally([\"ForgotParamDep\"])\n        self.assertNotEqual(emails, [])\n\n    def test_default_param_cmdline(self):\n        self.assertEqual(WithDefault().x, \"xyz\")\n\n    def test_default_param_cmdline_2(self):\n        self.assertEqual(WithDefault().x, \"xyz\")\n\n    def test_insignificant_parameter(self):\n        class InsignificantParameterTask(luigi.Task):\n            foo = luigi.Parameter(significant=False, default=\"foo_default\")\n            bar = luigi.Parameter()\n\n        t1 = InsignificantParameterTask(foo=\"x\", bar=\"y\")\n        self.assertEqual(str(t1), \"InsignificantParameterTask(bar=y)\")\n\n        t2 = InsignificantParameterTask(\"u\", \"z\")\n        self.assertEqual(t2.foo, \"u\")\n        self.assertEqual(t2.bar, \"z\")\n        self.assertEqual(str(t2), \"InsignificantParameterTask(bar=z)\")\n\n    def test_local_significant_param(self):\n        \"\"\"Obviously, if anything should be positional, so should local\n        significant parameters\"\"\"\n\n        class MyTask(luigi.Task):\n            # This could typically be \"--label-company=disney\"\n            x = luigi.Parameter(significant=True)\n\n        MyTask(\"arg\")\n        self.assertRaises(luigi.parameter.MissingParameterException, lambda: MyTask())\n\n    def 
test_local_insignificant_param(self):\n        \"\"\"Ensure we have the same behavior as in before a78338c\"\"\"\n\n        class MyTask(luigi.Task):\n            # This could typically be \"--num-threads=True\"\n            x = luigi.Parameter(significant=False)\n\n        MyTask(\"arg\")\n        self.assertRaises(luigi.parameter.MissingParameterException, lambda: MyTask())\n\n    def test_nonpositional_param(self):\n        \"\"\"Ensure we have the same behavior as in before a78338c\"\"\"\n\n        class MyTask(luigi.Task):\n            # This could typically be \"--num-threads=10\"\n            x = luigi.Parameter(significant=False, positional=False)\n\n        MyTask(x=\"arg\")\n        self.assertRaises(luigi.parameter.UnknownParameterException, lambda: MyTask(\"arg\"))\n\n    def test_enum_param_valid(self):\n        p = luigi.parameter.EnumParameter(enum=MyEnum)\n        self.assertEqual(MyEnum.A, p.parse(\"A\"))\n\n    def test_enum_param_invalid(self):\n        p = luigi.parameter.EnumParameter(enum=MyEnum)\n        self.assertRaises(ValueError, lambda: p.parse(\"B\"))\n\n    def test_enum_param_missing(self):\n        self.assertRaises(ParameterException, lambda: luigi.parameter.EnumParameter())\n\n    def test_enum_list_param_valid(self):\n        p = luigi.parameter.EnumListParameter(enum=MyEnum)\n        self.assertEqual((), p.parse(\"\"))\n        self.assertEqual((MyEnum.A,), p.parse(\"A\"))\n        self.assertEqual((MyEnum.A, MyEnum.C), p.parse(\"A,C\"))\n\n    def test_enum_list_param_invalid(self):\n        p = luigi.parameter.EnumListParameter(enum=MyEnum)\n        self.assertRaises(ValueError, lambda: p.parse(\"A,B\"))\n\n    def test_enum_list_param_missing(self):\n        self.assertRaises(ParameterException, lambda: luigi.parameter.EnumListParameter())\n\n    def test_choice_list_param_valid(self):\n        p = luigi.parameter.ChoiceListParameter(choices=[\"1\", \"2\", \"3\"])\n        self.assertEqual((), p.parse(\"\"))\n        
self.assertEqual((\"1\",), p.parse(\"1\"))\n        self.assertEqual((\"1\", \"3\"), p.parse(\"1,3\"))\n\n    def test_choice_list_param_invalid(self):\n        p = luigi.parameter.ChoiceListParameter(choices=[\"1\", \"2\", \"3\"])\n        self.assertRaises(ValueError, lambda: p.parse(\"1,4\"))\n\n    def test_invalid_choice_type(self):\n        self.assertRaises(\n            AssertionError,\n            lambda: luigi.ChoiceListParameter(var_type=int, choices=[1, 2, \"3\"]),\n        )\n\n    def test_choice_list_param_missing(self):\n        self.assertRaises(ParameterException, lambda: luigi.parameter.ChoiceListParameter())\n\n    def test_tuple_serialize_parse(self):\n        a = luigi.TupleParameter()\n        b_tuple = ((1, 2), (3, 4))\n        self.assertEqual(b_tuple, a.parse(a.serialize(b_tuple)))\n\n    def test_parse_list_without_batch_method(self):\n        param = luigi.Parameter()\n        for xs in [], [\"x\"], [\"x\", \"y\"]:\n            self.assertRaises(NotImplementedError, param._parse_list, xs)\n\n    def test_parse_empty_list_raises_value_error(self):\n        for batch_method in (max, min, tuple, \",\".join):\n            param = luigi.Parameter(batch_method=batch_method)\n            self.assertRaises(ValueError, param._parse_list, [])\n\n    def test_parse_int_list_max(self):\n        param = luigi.IntParameter(batch_method=max)\n        self.assertEqual(17, param._parse_list([\"7\", \"17\", \"5\"]))\n\n    def test_parse_string_list_max(self):\n        param = luigi.Parameter(batch_method=max)\n        self.assertEqual(\"7\", param._parse_list([\"7\", \"17\", \"5\"]))\n\n    def test_parse_list_as_tuple(self):\n        param = luigi.IntParameter(batch_method=tuple)\n        self.assertEqual((7, 17, 5), param._parse_list([\"7\", \"17\", \"5\"]))\n\n    @mock.patch(\"luigi.parameter.warnings\")\n    def test_warn_on_default_none(self, warnings):\n        class TestConfig(luigi.Config):\n            param = luigi.Parameter(default=None)\n\n  
      TestConfig()\n        warnings.warn.assert_called_once_with('Parameter \"param\" with value \"None\" is not of type string.')\n\n    @mock.patch(\"luigi.parameter.warnings\")\n    def test_no_warn_on_string(self, warnings):\n        class TestConfig(luigi.Config):\n            param = luigi.Parameter(default=None)\n\n        TestConfig(param=\"str\")\n        warnings.warn.assert_not_called()\n\n    def test_no_warn_on_none_in_optional(self):\n        class TestConfig(luigi.Config):\n            param = luigi.OptionalParameter(default=None)\n\n        with mock.patch(\"luigi.parameter.warnings\") as warnings:\n            TestConfig()\n            warnings.warn.assert_not_called()\n\n        with mock.patch(\"luigi.parameter.warnings\") as warnings:\n            TestConfig(param=None)\n            warnings.warn.assert_not_called()\n\n        with mock.patch(\"luigi.parameter.warnings\") as warnings:\n            TestConfig(param=\"\")\n            warnings.warn.assert_not_called()\n\n    @mock.patch(\"luigi.parameter.warnings\")\n    def test_no_warn_on_string_in_optional(self, warnings):\n        class TestConfig(luigi.Config):\n            param = luigi.OptionalParameter(default=None)\n\n        TestConfig(param=\"value\")\n        warnings.warn.assert_not_called()\n\n    @mock.patch(\"luigi.parameter.warnings\")\n    def test_warn_on_bad_type_in_optional(self, warnings):\n        class TestConfig(luigi.Config):\n            param = luigi.OptionalParameter()\n\n        TestConfig(param=1)\n        warnings.warn.assert_called_once_with(\n            'OptionalParameter \"param\" with value \"1\" is not of type \"str\" or None.', luigi.parameter.OptionalParameterTypeWarning\n        )\n\n    def test_optional_parameter_parse_none(self):\n        self.assertIsNone(luigi.OptionalParameter().parse(\"\"))\n\n    def test_optional_parameter_parse_string(self):\n        self.assertEqual(\"test\", luigi.OptionalParameter().parse(\"test\"))\n\n    def 
test_optional_parameter_serialize_none(self):\n        self.assertEqual(\"\", luigi.OptionalParameter().serialize(None))\n\n    def test_optional_parameter_serialize_string(self):\n        self.assertEqual(\"test\", luigi.OptionalParameter().serialize(\"test\"))\n\n\nclass TestParametersHashability(LuigiTestCase):\n    def test_date(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.DateParameter()\n\n        p = luigi.parameter.DateParameter()\n        self.assertEqual(hash(Foo(args=datetime.date(2000, 1, 1)).args), hash(p.parse(\"2000-1-1\")))\n\n    def test_dateminute(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.DateMinuteParameter()\n\n        p = luigi.parameter.DateMinuteParameter()\n        self.assertEqual(hash(Foo(args=datetime.datetime(2000, 1, 1, 12, 0)).args), hash(p.parse(\"2000-1-1T1200\")))\n\n    def test_dateinterval(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.DateIntervalParameter()\n\n        p = luigi.parameter.DateIntervalParameter()\n        di = luigi.date_interval.Custom(datetime.date(2000, 1, 1), datetime.date(2000, 2, 12))\n        self.assertEqual(hash(Foo(args=di).args), hash(p.parse(\"2000-01-01-2000-02-12\")))\n\n    def test_timedelta(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.TimeDeltaParameter()\n\n        p = luigi.parameter.TimeDeltaParameter()\n        self.assertEqual(hash(Foo(args=datetime.timedelta(days=2, hours=3, minutes=2)).args), hash(p.parse(\"P2DT3H2M\")))\n\n    def test_boolean(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.BoolParameter()\n\n        p = luigi.parameter.BoolParameter()\n\n        self.assertEqual(hash(Foo(args=True).args), hash(p.parse(\"true\")))\n        self.assertEqual(hash(Foo(args=False).args), hash(p.parse(\"false\")))\n\n    def test_int(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.IntParameter()\n\n        p = 
luigi.parameter.IntParameter()\n        self.assertEqual(hash(Foo(args=1).args), hash(p.parse(\"1\")))\n\n    def test_float(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.FloatParameter()\n\n        p = luigi.parameter.FloatParameter()\n        self.assertEqual(hash(Foo(args=1.0).args), hash(p.parse(\"1\")))\n\n    def test_enum(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.EnumParameter(enum=MyEnum)\n\n        p = luigi.parameter.EnumParameter(enum=MyEnum)\n        self.assertEqual(hash(Foo(args=MyEnum.A).args), hash(p.parse(\"A\")))\n\n    def test_enum_list(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.EnumListParameter(enum=MyEnum)\n\n        p = luigi.parameter.EnumListParameter(enum=MyEnum)\n        self.assertEqual(hash(Foo(args=(MyEnum.A, MyEnum.C)).args), hash(p.parse(\"A,C\")))\n\n        class FooWithDefault(luigi.Task):\n            args = luigi.parameter.EnumListParameter(enum=MyEnum, default=[MyEnum.C])\n\n        self.assertEqual(FooWithDefault().args, p.parse(\"C\"))\n\n    def test_choice_list(self):\n        class Foo(luigi.Task):\n            args = luigi.ChoiceListParameter(var_type=str, choices=[\"1\", \"2\", \"3\"])\n\n        p = luigi.ChoiceListParameter(var_type=str, choices=[\"3\", \"2\", \"1\"])\n        self.assertEqual(hash(Foo(args=(\"3\",)).args), hash(p.parse(\"3\")))\n\n    def test_dict(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.DictParameter()\n\n        p = luigi.parameter.DictParameter()\n        self.assertEqual(hash(Foo(args=dict(foo=1, bar=\"hello\")).args), hash(p.parse('{\"foo\":1,\"bar\":\"hello\"}')))\n\n    def test_list(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.ListParameter()\n\n        p = luigi.parameter.ListParameter()\n        self.assertEqual(hash(Foo(args=[1, \"hello\"]).args), hash(p.normalize(p.parse('[1,\"hello\"]'))))\n\n    def 
test_list_param_with_default_none_in_dynamic_req_task(self):\n        class TaskWithDefaultNoneParameter(RunOnceTask):\n            args = luigi.parameter.ListParameter(default=None)\n\n        class DynamicTaskCallsDefaultNoneParameter(RunOnceTask):\n            def run(self):\n                yield [TaskWithDefaultNoneParameter()]\n                self.comp = True\n\n        self.assertTrue(self.run_locally([\"DynamicTaskCallsDefaultNoneParameter\"]))\n\n    def test_list_dict(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.ListParameter()\n\n        p = luigi.parameter.ListParameter()\n        self.assertEqual(hash(Foo(args=[{\"foo\": \"bar\"}, {\"doge\": \"wow\"}]).args), hash(p.normalize(p.parse('[{\"foo\": \"bar\"}, {\"doge\": \"wow\"}]'))))\n\n    def test_list_nested(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.ListParameter()\n\n        p = luigi.parameter.ListParameter()\n        self.assertEqual(hash(Foo(args=[[\"foo\", \"bar\"], [\"doge\", \"wow\"]]).args), hash(p.normalize(p.parse('[[\"foo\", \"bar\"], [\"doge\", \"wow\"]]'))))\n\n    def test_tuple(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.TupleParameter()\n\n        p = luigi.parameter.TupleParameter()\n        self.assertEqual(hash(Foo(args=(1, \"hello\")).args), hash(p.parse('(1,\"hello\")')))\n\n    def test_tuple_dict(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.TupleParameter()\n\n        p = luigi.parameter.TupleParameter()\n        self.assertEqual(hash(Foo(args=({\"foo\": \"bar\"}, {\"doge\": \"wow\"})).args), hash(p.normalize(p.parse('({\"foo\": \"bar\"}, {\"doge\": \"wow\"})'))))\n\n    def test_tuple_nested(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.TupleParameter()\n\n        p = luigi.parameter.TupleParameter()\n        self.assertEqual(hash(Foo(args=((\"foo\", \"bar\"), (\"doge\", \"wow\"))).args), hash(p.normalize(p.parse('((\"foo\", 
\"bar\"), (\"doge\", \"wow\"))'))))\n\n    def test_tuple_string_with_json(self):\n        class Foo(luigi.Task):\n            args = luigi.parameter.TupleParameter()\n\n        p = luigi.parameter.TupleParameter()\n        self.assertEqual(hash(Foo(args=(\"foo\", \"bar\")).args), hash(p.normalize(p.parse('[\"foo\", \"bar\"]'))))\n\n    def test_tuple_invalid_string(self):\n        param = luigi.TupleParameter()\n        self.assertRaises(ValueError, lambda: param.parse('(\"abcd\")'))\n\n    def test_tuple_invalid_string_in_tuple(self):\n        param = luigi.TupleParameter()\n        self.assertRaises(ValueError, lambda: param.parse('((\"abcd\"))'))\n\n    def test_parse_invalid_format(self):\n        param = luigi.TupleParameter()\n        self.assertRaises(SyntaxError, lambda: param.parse(\"((1,2),(3,4\"))\n\n    def test_task(self):\n        class Bar(luigi.Task):\n            pass\n\n        class Foo(luigi.Task):\n            args = luigi.parameter.TaskParameter()\n\n        p = luigi.parameter.TaskParameter()\n        self.assertEqual(hash(Foo(args=Bar).args), hash(p.parse(\"Bar\")))\n\n\nclass TestNewStyleGlobalParameters(LuigiTestCase):\n    def setUp(self):\n        super(TestNewStyleGlobalParameters, self).setUp()\n        MockTarget.fs.clear()\n\n    def expect_keys(self, expected):\n        self.assertEqual(set(MockTarget.fs.get_all_data().keys()), set(expected))\n\n    def test_x_arg(self):\n        self.run_locally([\"Banana\", \"--x\", \"foo\", \"--y\", \"bar\", \"--style\", \"x-arg\"])\n        self.expect_keys([\"banana-foo-bar\", \"banana-dep-foo-def\"])\n\n    def test_x_arg_override(self):\n        self.run_locally([\"Banana\", \"--x\", \"foo\", \"--y\", \"bar\", \"--style\", \"x-arg\", \"--BananaDep-y\", \"xyz\"])\n        self.expect_keys([\"banana-foo-bar\", \"banana-dep-foo-xyz\"])\n\n    def test_x_arg_override_stupid(self):\n        self.run_locally([\"Banana\", \"--x\", \"foo\", \"--y\", \"bar\", \"--style\", \"x-arg\", 
\"--BananaDep-x\", \"blabla\"])\n        self.expect_keys([\"banana-foo-bar\", \"banana-dep-foo-def\"])\n\n    def test_x_arg_y_arg(self):\n        self.run_locally([\"Banana\", \"--x\", \"foo\", \"--y\", \"bar\", \"--style\", \"x-arg-y-arg\"])\n        self.expect_keys([\"banana-foo-bar\", \"banana-dep-foo-bar\"])\n\n    def test_x_arg_y_arg_override(self):\n        self.run_locally([\"Banana\", \"--x\", \"foo\", \"--y\", \"bar\", \"--style\", \"x-arg-y-arg\", \"--BananaDep-y\", \"xyz\"])\n        self.expect_keys([\"banana-foo-bar\", \"banana-dep-foo-bar\"])\n\n    def test_x_arg_y_arg_override_all(self):\n        self.run_locally([\"Banana\", \"--x\", \"foo\", \"--y\", \"bar\", \"--style\", \"x-arg-y-arg\", \"--BananaDep-y\", \"xyz\", \"--BananaDep-x\", \"blabla\"])\n        self.expect_keys([\"banana-foo-bar\", \"banana-dep-foo-bar\"])\n\n    def test_y_arg_override(self):\n        self.run_locally([\"Banana\", \"--x\", \"foo\", \"--y\", \"bar\", \"--style\", \"y-kwarg\", \"--BananaDep-x\", \"xyz\"])\n        self.expect_keys([\"banana-foo-bar\", \"banana-dep-xyz-bar\"])\n\n    def test_y_arg_override_both(self):\n        self.run_locally([\"Banana\", \"--x\", \"foo\", \"--y\", \"bar\", \"--style\", \"y-kwarg\", \"--BananaDep-x\", \"xyz\", \"--BananaDep-y\", \"blah\"])\n        self.expect_keys([\"banana-foo-bar\", \"banana-dep-xyz-bar\"])\n\n    def test_y_arg_override_banana(self):\n        self.run_locally([\"Banana\", \"--y\", \"bar\", \"--style\", \"y-kwarg\", \"--BananaDep-x\", \"xyz\", \"--Banana-x\", \"baz\"])\n        self.expect_keys([\"banana-baz-bar\", \"banana-dep-xyz-bar\"])\n\n\nclass TestRemoveGlobalParameters(LuigiTestCase):\n    def run_and_check(self, args):\n        run_exit_status = self.run_locally(args)\n        self.assertTrue(run_exit_status)\n        return run_exit_status\n\n    @parsing([\"--MyConfig-mc-p\", \"99\", \"--mc-r\", \"55\", \"NoopTask\"])\n    def test_use_config_class_1(self):\n        self.assertEqual(MyConfig().mc_p, 
99)\n        self.assertEqual(MyConfig().mc_q, 73)\n        self.assertEqual(MyConfigWithoutSection().mc_r, 55)\n        self.assertEqual(MyConfigWithoutSection().mc_s, 99)\n\n    @parsing([\"NoopTask\", \"--MyConfig-mc-p\", \"99\", \"--mc-r\", \"55\"])\n    def test_use_config_class_2(self):\n        self.assertEqual(MyConfig().mc_p, 99)\n        self.assertEqual(MyConfig().mc_q, 73)\n        self.assertEqual(MyConfigWithoutSection().mc_r, 55)\n        self.assertEqual(MyConfigWithoutSection().mc_s, 99)\n\n    @parsing([\"--MyConfig-mc-p\", \"99\", \"--mc-r\", \"55\", \"NoopTask\", \"--mc-s\", \"123\", \"--MyConfig-mc-q\", \"42\"])\n    def test_use_config_class_more_args(self):\n        self.assertEqual(MyConfig().mc_p, 99)\n        self.assertEqual(MyConfig().mc_q, 42)\n        self.assertEqual(MyConfigWithoutSection().mc_r, 55)\n        self.assertEqual(MyConfigWithoutSection().mc_s, 123)\n\n    @with_config({\"MyConfig\": {\"mc_p\": \"666\", \"mc_q\": \"777\"}})\n    @parsing([\"--mc-r\", \"555\", \"NoopTask\"])\n    def test_use_config_class_with_configuration(self):\n        self.assertEqual(MyConfig().mc_p, 666)\n        self.assertEqual(MyConfig().mc_q, 777)\n        self.assertEqual(MyConfigWithoutSection().mc_r, 555)\n        self.assertEqual(MyConfigWithoutSection().mc_s, 99)\n\n    @with_config({\"MyConfigWithoutSection\": {\"mc_r\": \"999\", \"mc_s\": \"888\"}})\n    @parsing([\"NoopTask\", \"--MyConfig-mc-p\", \"222\", \"--mc-r\", \"555\"])\n    def test_use_config_class_with_configuration_2(self):\n        self.assertEqual(MyConfig().mc_p, 222)\n        self.assertEqual(MyConfig().mc_q, 73)\n        self.assertEqual(MyConfigWithoutSection().mc_r, 555)\n        self.assertEqual(MyConfigWithoutSection().mc_s, 888)\n\n    @with_config({\"MyConfig\": {\"mc_p\": \"555\", \"mc-p\": \"666\", \"mc-q\": \"777\"}})\n    def test_configuration_style(self):\n        self.assertEqual(MyConfig().mc_p, 555)\n        self.assertEqual(MyConfig().mc_q, 777)\n\n    
def test_misc_1(self):\n        class Dogs(luigi.Config):\n            n_dogs = luigi.IntParameter()\n\n        class CatsWithoutSection(luigi.Config):\n            use_cmdline_section = False\n            n_cats = luigi.IntParameter()\n\n        with luigi.cmdline_parser.CmdlineParser.global_instance([\"--n-cats\", \"123\", \"--Dogs-n-dogs\", \"456\", \"WithDefault\"], allow_override=True):\n            self.assertEqual(Dogs().n_dogs, 456)\n            self.assertEqual(CatsWithoutSection().n_cats, 123)\n\n        with luigi.cmdline_parser.CmdlineParser.global_instance([\"WithDefault\", \"--n-cats\", \"321\", \"--Dogs-n-dogs\", \"654\"], allow_override=True):\n            self.assertEqual(Dogs().n_dogs, 654)\n            self.assertEqual(CatsWithoutSection().n_cats, 321)\n\n    def test_global_significant_param_warning(self):\n        \"\"\"We don't want any kind of global param to be positional\"\"\"\n        with self.assertWarnsRegex(DeprecationWarning, \"is_global support is removed. Assuming positional=False\"):\n\n            class MyTask(luigi.Task):\n                # This could typically be called \"--test-dry-run\"\n                x_g1 = luigi.Parameter(default=\"y\", is_global=True, significant=True)\n\n        self.assertRaises(luigi.parameter.UnknownParameterException, lambda: MyTask(\"arg\"))\n\n        def test_global_insignificant_param_warning(self):\n            \"\"\"We don't want any kind of global param to be positional\"\"\"\n            with self.assertWarnsRegex(DeprecationWarning, \"is_global support is removed. 
Assuming positional=False\"):\n\n                class MyTask(luigi.Task):\n                    # This could typically be \"--yarn-pool=development\"\n                    x_g2 = luigi.Parameter(default=\"y\", is_global=True, significant=False)\n\n            self.assertRaises(luigi.parameter.UnknownParameterException, lambda: MyTask(\"arg\"))\n\n\nclass TestParamWithDefaultFromConfig(LuigiTestCase):\n    def testNoSection(self):\n        self.assertRaises(ParameterException, lambda: _value(luigi.Parameter(config_path=dict(section=\"foo\", name=\"bar\"))))\n\n    @with_config({\"foo\": {}})\n    def testNoValue(self):\n        self.assertRaises(ParameterException, lambda: _value(luigi.Parameter(config_path=dict(section=\"foo\", name=\"bar\"))))\n\n    @with_config({\"foo\": {\"bar\": \"baz\"}})\n    def testDefault(self):\n        class LocalA(luigi.Task):\n            p = luigi.Parameter(config_path=dict(section=\"foo\", name=\"bar\"))\n\n        self.assertEqual(\"baz\", LocalA().p)\n        self.assertEqual(\"boo\", LocalA(p=\"boo\").p)\n\n    @with_config({\"foo\": {\"bar\": \"2001-02-03T04\"}})\n    def testDateHour(self):\n        p = luigi.DateHourParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(datetime.datetime(2001, 2, 3, 4, 0, 0), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2001-02-03T05\"}})\n    def testDateHourWithInterval(self):\n        p = luigi.DateHourParameter(config_path=dict(section=\"foo\", name=\"bar\"), interval=2)\n        self.assertEqual(datetime.datetime(2001, 2, 3, 4, 0, 0), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2001-02-03T0430\"}})\n    def testDateMinute(self):\n        p = luigi.DateMinuteParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(datetime.datetime(2001, 2, 3, 4, 30, 0), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2001-02-03T0431\"}})\n    def testDateWithMinuteInterval(self):\n        p = 
luigi.DateMinuteParameter(config_path=dict(section=\"foo\", name=\"bar\"), interval=2)\n        self.assertEqual(datetime.datetime(2001, 2, 3, 4, 30, 0), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2001-02-03T04H30\"}})\n    def testDateMinuteDeprecated(self):\n        p = luigi.DateMinuteParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        with self.assertWarnsRegex(DeprecationWarning, 'Using \"H\" between hours and minutes is deprecated, omit it instead.'):\n            self.assertEqual(datetime.datetime(2001, 2, 3, 4, 30, 0), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2001-02-03T040506\"}})\n    def testDateSecond(self):\n        p = luigi.DateSecondParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(datetime.datetime(2001, 2, 3, 4, 5, 6), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2001-02-03T040507\"}})\n    def testDateSecondWithInterval(self):\n        p = luigi.DateSecondParameter(config_path=dict(section=\"foo\", name=\"bar\"), interval=2)\n        self.assertEqual(datetime.datetime(2001, 2, 3, 4, 5, 6), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2001-02-03\"}})\n    def testDate(self):\n        p = luigi.DateParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(datetime.date(2001, 2, 3), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2001-02-03\"}})\n    def testDateWithInterval(self):\n        p = luigi.DateParameter(config_path=dict(section=\"foo\", name=\"bar\"), interval=3, start=datetime.date(2001, 2, 1))\n        self.assertEqual(datetime.date(2001, 2, 1), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2015-07\"}})\n    def testMonthParameter(self):\n        p = luigi.MonthParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(datetime.date(2015, 7, 1), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2015-07\"}})\n    def testMonthWithIntervalParameter(self):\n        p = 
luigi.MonthParameter(config_path=dict(section=\"foo\", name=\"bar\"), interval=13, start=datetime.date(2014, 1, 1))\n        self.assertEqual(datetime.date(2015, 2, 1), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2015\"}})\n    def testYearParameter(self):\n        p = luigi.YearParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(datetime.date(2015, 1, 1), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2015\"}})\n    def testYearWithIntervalParameter(self):\n        p = luigi.YearParameter(config_path=dict(section=\"foo\", name=\"bar\"), start=datetime.date(2011, 1, 1), interval=5)\n        self.assertEqual(datetime.date(2011, 1, 1), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"123\"}})\n    def testInt(self):\n        p = luigi.IntParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(123, _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"true\"}})\n    def testBool(self):\n        p = luigi.BoolParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(True, _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"false\"}})\n    def testBoolConfigOutranksDefault(self):\n        p = luigi.BoolParameter(default=True, config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(False, _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2001-02-03-2001-02-28\"}})\n    def testDateInterval(self):\n        p = luigi.DateIntervalParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        expected = luigi.date_interval.Custom.parse(\"2001-02-03-2001-02-28\")\n        self.assertEqual(expected, _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"0 seconds\"}})\n    def testTimeDeltaNoSeconds(self):\n        p = luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(timedelta(seconds=0), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"0 d\"}})\n    def testTimeDeltaNoDays(self):\n        p 
= luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(timedelta(days=0), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"1 day\"}})\n    def testTimeDelta(self):\n        p = luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(timedelta(days=1), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"2 seconds\"}})\n    def testTimeDeltaPlural(self):\n        p = luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(timedelta(seconds=2), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"3w 4h 5m\"}})\n    def testTimeDeltaMultiple(self):\n        p = luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(timedelta(weeks=3, hours=4, minutes=5), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"P4DT12H30M5S\"}})\n    def testTimeDelta8601(self):\n        p = luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(timedelta(days=4, hours=12, minutes=30, seconds=5), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"P5D\"}})\n    def testTimeDelta8601NoTimeComponent(self):\n        p = luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(timedelta(days=5), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"P5W\"}})\n    def testTimeDelta8601Weeks(self):\n        p = luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(timedelta(weeks=5), _value(p))\n\n    @mock.patch(\"luigi.parameter.ParameterException\")\n    @with_config({\"foo\": {\"bar\": \"P3Y6M4DT12H30M5S\"}})\n    def testTimeDelta8601YearMonthNotSupported(self, exc):\n        def f():\n            return _value(luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\")))\n\n        self.assertRaises(ValueError, f)  # ISO 8601 durations with years or months are not 
supported\n        exc.assert_called_once_with(\"Invalid time delta - could not parse P3Y6M4DT12H30M5S\")\n\n    @with_config({\"foo\": {\"bar\": \"PT6M\"}})\n    def testTimeDelta8601MAfterT(self):\n        p = luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(timedelta(minutes=6), _value(p))\n\n    @mock.patch(\"luigi.parameter.ParameterException\")\n    @with_config({\"foo\": {\"bar\": \"P6M\"}})\n    def testTimeDelta8601MBeforeT(self, exc):\n        def f():\n            return _value(luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\")))\n\n        self.assertRaises(ValueError, f)  # ISO 8601 durations with months are not supported\n        exc.assert_called_once_with(\"Invalid time delta - could not parse P6M\")\n\n    @with_config({\"foo\": {\"bar\": \"12.34\"}})\n    def testTimeDeltaFloat(self):\n        p = luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(timedelta(seconds=12.34), _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"56789\"}})\n    def testTimeDeltaInt(self):\n        p = luigi.TimeDeltaParameter(config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(timedelta(seconds=56789), _value(p))\n\n    def testHasDefaultNoSection(self):\n        self.assertRaises(luigi.parameter.MissingParameterException, lambda: _value(luigi.Parameter(config_path=dict(section=\"foo\", name=\"bar\"))))\n\n    @with_config({\"foo\": {}})\n    def testHasDefaultNoValue(self):\n        self.assertRaises(luigi.parameter.MissingParameterException, lambda: _value(luigi.Parameter(config_path=dict(section=\"foo\", name=\"bar\"))))\n\n    @with_config({\"foo\": {\"bar\": \"baz\"}})\n    def testHasDefaultWithBoth(self):\n        self.assertTrue(_value(luigi.Parameter(config_path=dict(section=\"foo\", name=\"bar\"))))\n\n    @with_config({\"foo\": {\"bar\": \"baz\"}})\n    def testWithDefault(self):\n        p = 
luigi.Parameter(config_path=dict(section=\"foo\", name=\"bar\"), default=\"blah\")\n        self.assertEqual(\"baz\", _value(p))  # config overrides default\n\n    def testWithDefaultAndMissing(self):\n        p = luigi.Parameter(config_path=dict(section=\"foo\", name=\"bar\"), default=\"blah\")\n        self.assertEqual(\"blah\", _value(p))\n\n    @with_config({\"LocalA\": {\"p\": \"p_default\"}})\n    def testDefaultFromTaskName(self):\n        class LocalA(luigi.Task):\n            p = luigi.Parameter()\n\n        self.assertEqual(\"p_default\", LocalA().p)\n        self.assertEqual(\"boo\", LocalA(p=\"boo\").p)\n\n    @with_config({\"LocalA\": {\"p\": \"999\"}})\n    def testDefaultFromTaskNameInt(self):\n        class LocalA(luigi.Task):\n            p = luigi.IntParameter()\n\n        self.assertEqual(999, LocalA().p)\n        self.assertEqual(777, LocalA(p=777).p)\n\n    @with_config({\"LocalA\": {\"p\": \"p_default\"}, \"foo\": {\"bar\": \"baz\"}})\n    def testDefaultFromConfigWithTaskNameToo(self):\n        class LocalA(luigi.Task):\n            p = luigi.Parameter(config_path=dict(section=\"foo\", name=\"bar\"))\n\n        self.assertEqual(\"p_default\", LocalA().p)\n        self.assertEqual(\"boo\", LocalA(p=\"boo\").p)\n\n    @with_config({\"LocalA\": {\"p\": \"p_default_2\"}})\n    def testDefaultFromTaskNameWithDefault(self):\n        class LocalA(luigi.Task):\n            p = luigi.Parameter(default=\"banana\")\n\n        self.assertEqual(\"p_default_2\", LocalA().p)\n        self.assertEqual(\"boo_2\", LocalA(p=\"boo_2\").p)\n\n    @with_config({\"MyClass\": {\"p_wohoo\": \"p_default_3\"}})\n    def testWithLongParameterName(self):\n        class MyClass(luigi.Task):\n            p_wohoo = luigi.Parameter(default=\"banana\")\n\n        self.assertEqual(\"p_default_3\", MyClass().p_wohoo)\n        self.assertEqual(\"boo_2\", MyClass(p_wohoo=\"boo_2\").p_wohoo)\n\n    @with_config({\"RangeDaily\": {\"days_back\": \"123\"}})\n    def 
testSettingOtherMember(self):\n        class LocalA(luigi.Task):\n            pass\n\n        self.assertEqual(123, luigi.tools.range.RangeDaily(of=LocalA).days_back)\n        self.assertEqual(70, luigi.tools.range.RangeDaily(of=LocalA, days_back=70).days_back)\n\n    @with_config({\"MyClass\": {\"p_not_global\": \"123\"}})\n    def testCommandLineWithDefault(self):\n        \"\"\"\n        Verify that we also read from the config when we build tasks from the\n        command line parsers.\n        \"\"\"\n\n        class MyClass(luigi.Task):\n            p_not_global = luigi.Parameter(default=\"banana\")\n\n            def complete(self):\n                import sys\n\n                luigi.configuration.get_config().write(sys.stdout)\n                if self.p_not_global != \"123\":\n                    raise ValueError(\"The parameter didn't get set!!\")\n                return True\n\n            def run(self):\n                pass\n\n        self.assertTrue(self.run_locally([\"MyClass\"]))\n        self.assertFalse(self.run_locally([\"MyClass\", \"--p-not-global\", \"124\"]))\n        self.assertFalse(self.run_locally([\"MyClass\", \"--MyClass-p-not-global\", \"124\"]))\n\n    @with_config({\"MyClass2\": {\"p_not_global_no_default\": \"123\"}})\n    def testCommandLineNoDefault(self):\n        \"\"\"\n        Verify that we also read from the config when we build tasks from the\n        command line parsers.\n        \"\"\"\n\n        class MyClass2(luigi.Task):\n            \"\"\"TODO: Make luigi clean it's register for tests. 
Hate this 2 dance.\"\"\"\n\n            p_not_global_no_default = luigi.Parameter()\n\n            def complete(self):\n                import sys\n\n                luigi.configuration.get_config().write(sys.stdout)\n                luigi.configuration.get_config().write(sys.stdout)\n                if self.p_not_global_no_default != \"123\":\n                    raise ValueError(\"The parameter didn't get set!!\")\n                return True\n\n            def run(self):\n                pass\n\n        self.assertTrue(self.run_locally([\"MyClass2\"]))\n        self.assertFalse(self.run_locally([\"MyClass2\", \"--p-not-global-no-default\", \"124\"]))\n        self.assertFalse(self.run_locally([\"MyClass2\", \"--MyClass2-p-not-global-no-default\", \"124\"]))\n\n    @with_config({\"mynamespace.A\": {\"p\": \"999\"}})\n    def testWithNamespaceConfig(self):\n        class A(luigi.Task):\n            task_namespace = \"mynamespace\"\n            p = luigi.IntParameter()\n\n        self.assertEqual(999, A().p)\n        self.assertEqual(777, A(p=777).p)\n\n    def testWithNamespaceCli(self):\n        class A(luigi.Task):\n            task_namespace = \"mynamespace\"\n            p = luigi.IntParameter(default=100)\n            expected = luigi.IntParameter()\n\n            def complete(self):\n                if self.p != self.expected:\n                    raise ValueError\n                return True\n\n        self.assertTrue(self.run_locally_split(\"mynamespace.A --expected 100\"))\n        # TODO(arash): Why is `--p 200` hanging with multiprocessing stuff?\n        # self.assertTrue(self.run_locally_split('mynamespace.A --p 200 --expected 200'))\n        self.assertTrue(self.run_locally_split(\"mynamespace.A --mynamespace.A-p 200 --expected 200\"))\n        # --A-p is unrecognized since module-level A is _visible_in_registry=False (no CLI flag)\n        self.assertRaises(SystemExit, self.run_locally_split, \"mynamespace.A --A-p 200 --expected 200\")\n\n    def 
testListWithNamespaceCli(self):\n        class A(luigi.Task):\n            task_namespace = \"mynamespace\"\n            l_param = luigi.ListParameter(default=[1, 2, 3])\n            expected = luigi.ListParameter()\n\n            def complete(self):\n                if self.l_param != self.expected:\n                    raise ValueError\n                return True\n\n        self.assertTrue(self.run_locally_split(\"mynamespace.A --expected [1,2,3]\"))\n        self.assertTrue(self.run_locally_split(\"mynamespace.A --mynamespace.A-l [1,2,3] --expected [1,2,3]\"))\n\n    def testTupleWithNamespaceCli(self):\n        class A(luigi.Task):\n            task_namespace = \"mynamespace\"\n            t = luigi.TupleParameter(default=((1, 2), (3, 4)))\n            expected = luigi.TupleParameter()\n\n            def complete(self):\n                if self.t != self.expected:\n                    raise ValueError\n                return True\n\n        self.assertTrue(self.run_locally_split(\"mynamespace.A --expected ((1,2),(3,4))\"))\n        self.assertTrue(self.run_locally_split(\"mynamespace.A --mynamespace.A-t ((1,2),(3,4)) --expected ((1,2),(3,4))\"))\n\n    @with_config({\"foo\": {\"bar\": \"[1,2,3]\"}})\n    def testListConfig(self):\n        self.assertTrue(_value(luigi.ListParameter(config_path=dict(section=\"foo\", name=\"bar\"))))\n\n    @with_config({\"foo\": {\"bar\": \"((1,2),(3,4))\"}})\n    def testTupleConfig(self):\n        self.assertTrue(_value(luigi.TupleParameter(config_path=dict(section=\"foo\", name=\"bar\"))))\n\n    @with_config({\"foo\": {\"bar\": \"-3\"}})\n    def testNumericalParameter(self):\n        p = luigi.NumericalParameter(min_value=-3, max_value=7, var_type=int, config_path=dict(section=\"foo\", name=\"bar\"))\n        self.assertEqual(-3, _value(p))\n\n    @with_config({\"foo\": {\"bar\": \"3\"}})\n    def testChoiceParameter(self):\n        p = luigi.ChoiceParameter(var_type=int, choices=[1, 2, 3], config_path=dict(section=\"foo\", 
name=\"bar\"))\n        self.assertEqual(3, _value(p))\n\n\nclass OverrideEnvStuff(LuigiTestCase):\n    @with_config({\"core\": {\"default-scheduler-port\": \"6543\"}})\n    def testOverrideSchedulerPort(self):\n        with self.assertWarnsRegex(DeprecationWarning, r\"default-scheduler-port is deprecated\"):\n            env_params = luigi.interface.core()\n            self.assertEqual(env_params.scheduler_port, 6543)\n\n    @with_config({\"core\": {\"scheduler-port\": \"6544\"}})\n    def testOverrideSchedulerPort2(self):\n        with self.assertWarnsRegex(DeprecationWarning, r\"scheduler-port \\(with dashes\\) should be avoided\"):\n            env_params = luigi.interface.core()\n        self.assertEqual(env_params.scheduler_port, 6544)\n\n    @with_config({\"core\": {\"scheduler_port\": \"6545\"}})\n    def testOverrideSchedulerPort3(self):\n        env_params = luigi.interface.core()\n        self.assertEqual(env_params.scheduler_port, 6545)\n\n\nclass TestSerializeDateParameters(LuigiTestCase):\n    def testSerialize(self):\n        date = datetime.date(2013, 2, 3)\n        self.assertEqual(luigi.DateParameter().serialize(date), \"2013-02-03\")\n        self.assertEqual(luigi.YearParameter().serialize(date), \"2013\")\n        self.assertEqual(luigi.MonthParameter().serialize(date), \"2013-02\")\n        dt = datetime.datetime(2013, 2, 3, 4, 5)\n        self.assertEqual(luigi.DateHourParameter().serialize(dt), \"2013-02-03T04\")\n\n\nclass TestSerializeTimeDeltaParameters(LuigiTestCase):\n    def testSerialize(self):\n        tdelta = timedelta(weeks=5, days=4, hours=3, minutes=2, seconds=1)\n        self.assertEqual(luigi.TimeDeltaParameter().serialize(tdelta), \"5 w 4 d 3 h 2 m 1 s\")\n        tdelta = timedelta(seconds=0)\n        self.assertEqual(luigi.TimeDeltaParameter().serialize(tdelta), \"0 w 0 d 0 h 0 m 0 s\")\n\n\nclass TestTaskParameter(LuigiTestCase):\n    def testUsage(self):\n\n        class MetaTask(luigi.Task):\n            task_namespace = 
\"mynamespace\"\n            a = luigi.TaskParameter()\n\n            def run(self):\n                self.__class__.saved_value = self.a\n\n        class OtherTask(luigi.Task):\n            task_namespace = \"other_namespace\"\n\n        self.assertEqual(MetaTask(a=MetaTask).a, MetaTask)\n        self.assertEqual(MetaTask(a=OtherTask).a, OtherTask)\n\n        # So I first thought this \"should\" work, but actually it should not,\n        # because it should not need to parse values known at run-time\n        self.assertRaises(AttributeError, lambda: MetaTask(a=\"mynamespace.MetaTask\"))\n\n        # But is should be able to parse command line arguments\n        self.assertRaises(luigi.task_register.TaskClassNotFoundException, lambda: self.run_locally_split(\"mynamespace.MetaTask --a blah\"))\n        self.assertRaises(luigi.task_register.TaskClassNotFoundException, lambda: self.run_locally_split(\"mynamespace.MetaTask --a Taskk\"))\n        self.assertTrue(self.run_locally_split(\"mynamespace.MetaTask --a mynamespace.MetaTask\"))\n        self.assertEqual(MetaTask.saved_value, MetaTask)\n        self.assertTrue(self.run_locally_split(\"mynamespace.MetaTask --a other_namespace.OtherTask\"))\n        self.assertEqual(MetaTask.saved_value, OtherTask)\n\n    def testSerialize(self):\n\n        class OtherTask(luigi.Task):\n            def complete(self):\n                return True\n\n        class DepTask(luigi.Task):\n            dep = luigi.TaskParameter()\n            ran = False\n\n            def complete(self):\n                return self.__class__.ran\n\n            def requires(self):\n                return self.dep()\n\n            def run(self):\n                self.__class__.ran = True\n\n        class MainTask(luigi.Task):\n            def run(self):\n                yield DepTask(dep=OtherTask)\n\n        # OtherTask is serialized because it is used as an argument for DepTask.\n        self.assertTrue(self.run_locally([\"MainTask\"]))\n\n\nclass 
TestSerializeTupleParameter(LuigiTestCase):\n    def testSerialize(self):\n        the_tuple = (1, 2, 3)\n\n        self.assertEqual(luigi.TupleParameter().parse(luigi.TupleParameter().serialize(the_tuple)), the_tuple)\n\n\nclass NewStyleParameters822Test(LuigiTestCase):\n    \"\"\"\n    I bet these tests created at 2015-03-08 are redundant by now (Oct 2015).\n    But maintaining them anyway, just in case I have overlooked something.\n    \"\"\"\n\n    # See https://github.com/spotify/luigi/issues/822\n\n    def test_subclasses(self):\n        class BarBaseClass(luigi.Task):\n            x = luigi.Parameter(default=\"bar_base_default\")\n\n        class BarSubClass(BarBaseClass):\n            pass\n\n        in_parse([\"BarSubClass\", \"--x\", \"xyz\", \"--BarBaseClass-x\", \"xyz\"], lambda task: self.assertEqual(task.x, \"xyz\"))\n\n        # https://github.com/spotify/luigi/issues/822#issuecomment-77782714\n        in_parse([\"BarBaseClass\", \"--BarBaseClass-x\", \"xyz\"], lambda task: self.assertEqual(task.x, \"xyz\"))\n\n\nclass LocalParameters1304Test(LuigiTestCase):\n    \"\"\"\n    It was discussed and decided that local parameters (--x) should be\n    semantically different from global parameters (--MyTask-x).\n\n    The former sets only the parsed root task, and the latter sets the parameter\n    for all the tasks.\n\n    https://github.com/spotify/luigi/issues/1304#issuecomment-148402284\n    \"\"\"\n\n    def test_local_params(self):\n\n        class MyTask(RunOnceTask):\n            param1 = luigi.IntParameter()\n            param2 = luigi.BoolParameter(default=False)\n\n            def requires(self):\n                if self.param1 > 0:\n                    yield MyTask(param1=(self.param1 - 1))\n\n            def run(self):\n                assert self.param1 == 1 or not self.param2\n                self.comp = True\n\n        self.assertTrue(self.run_locally_split(\"MyTask --param1 1 --param2\"))\n\n    def test_local_takes_precedence(self):\n\n     
   class MyTask(luigi.Task):\n            param = luigi.IntParameter()\n\n            def complete(self):\n                return False\n\n            def run(self):\n                assert self.param == 5\n\n        self.assertTrue(self.run_locally_split(\"MyTask --param 5 --MyTask-param 6\"))\n\n    def test_local_only_affects_root(self):\n\n        class MyTask(RunOnceTask):\n            param = luigi.IntParameter(default=3)\n\n            def requires(self):\n                assert self.param != 3\n                if self.param == 5:\n                    yield MyTask()\n\n        # It would be a cyclic dependency if local took precedence\n        self.assertTrue(self.run_locally_split(\"MyTask --param 5 --MyTask-param 6\"))\n\n    def test_range_doesnt_propagate_args(self):\n        \"\"\"\n        Ensure that ``--task Range --of Blah --blah-arg 123`` doesn't work.\n\n        This will of course not work unless support is explicitly added for it.\n        But being a bit paranoid here and adding this test case so that if\n        somebody decides to add it in the future, they'll be redirected to the\n        discussion in #1304\n        \"\"\"\n\n        class Blah(RunOnceTask):\n            date = luigi.DateParameter()\n            blah_arg = luigi.IntParameter()\n\n        # The SystemExit is assumed to be thrown by argparse\n        self.assertRaises(SystemExit, self.run_locally_split, \"RangeDailyBase --of Blah --start 2015-01-01 --task-limit 1 --blah-arg 123\")\n        self.assertTrue(self.run_locally_split(\"RangeDailyBase --of Blah --start 2015-01-01 --task-limit 1 --Blah-blah-arg 123\"))\n\n\nclass TaskAsParameterName1335Test(LuigiTestCase):\n    def test_parameter_can_be_named_task(self):\n\n        class MyTask(luigi.Task):\n            # Indeed, this is not the most realistic example, but still ...\n            task = luigi.IntParameter()\n\n        self.assertTrue(self.run_locally_split(\"MyTask --task 5\"))\n\n\nclass TestPathParameter:\n    
@pytest.fixture(params=[None, \"not_existing_dir\"])\n    def default(self, request):\n        return request.param\n\n    @pytest.fixture(params=[True, False])\n    def absolute(self, request):\n        return request.param\n\n    @pytest.fixture(params=[True, False])\n    def exists(self, request):\n        return request.param\n\n    @pytest.fixture()\n    def path_parameter(self, tmpdir, default, absolute, exists):\n        class TaskPathParameter(luigi.Task):\n            a = luigi.PathParameter(\n                default=str(tmpdir / default) if default is not None else str(tmpdir),\n                absolute=absolute,\n                exists=exists,\n            )\n            b = luigi.OptionalPathParameter(\n                default=str(tmpdir / default) if default is not None else str(tmpdir),\n                absolute=absolute,\n                exists=exists,\n            )\n            c = luigi.OptionalPathParameter(default=None)\n            d = luigi.OptionalPathParameter(default=\"not empty default\")\n\n            def run(self):\n                # Use the parameter as a Path object\n                new_file = self.a / \"test.file\"\n                new_optional_file = self.b / \"test_optional.file\"\n                if default is not None:\n                    new_file.parent.mkdir(parents=True)\n                new_file.touch()\n                new_optional_file.touch()\n                assert new_file.exists()\n                assert new_optional_file.exists()\n                assert self.c is None\n                assert self.d is None\n\n            def output(self):\n                return luigi.LocalTarget(\"not_existing_file\")\n\n        return {\n            \"tmpdir\": tmpdir,\n            \"default\": default,\n            \"absolute\": absolute,\n            \"exists\": exists,\n            \"cls\": TaskPathParameter,\n        }\n\n    @with_config({\"TaskPathParameter\": {\"d\": \"\"}})\n    def test_exists(self, path_parameter):\n       
 if path_parameter[\"default\"] is not None and path_parameter[\"exists\"]:\n            with pytest.raises(ValueError, match=\"The path .* does not exist\"):\n                luigi.build([path_parameter[\"cls\"]()], local_scheduler=True)\n        else:\n            assert luigi.build([path_parameter[\"cls\"]()], local_scheduler=True)\n"
  },
  {
    "path": "test/priority_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import unittest\n\nimport luigi\nimport luigi.notifications\n\nluigi.notifications.DEBUG = True\n\n\nclass PrioTask(luigi.Task):\n    prio = luigi.Parameter()\n    run_counter = 0\n\n    @property\n    def priority(self):\n        return self.prio\n\n    def requires(self):\n        if self.prio > 10:\n            return PrioTask(self.prio - 10)\n\n    def run(self):\n        self.t = PrioTask.run_counter\n        PrioTask.run_counter += 1\n\n    def complete(self):\n        return hasattr(self, \"t\")\n\n\nclass PriorityTest(unittest.TestCase):\n    def test_priority(self):\n        p, q, r = PrioTask(1), PrioTask(2), PrioTask(3)\n        luigi.build([p, q, r], local_scheduler=True)\n        self.assertTrue(r.t < q.t < p.t)\n\n    def test_priority_w_dep(self):\n        x, y, z = PrioTask(25), PrioTask(15), PrioTask(5)\n        a, b, c = PrioTask(24), PrioTask(14), PrioTask(4)\n        luigi.build([a, b, c, x, y, z], local_scheduler=True)\n        self.assertTrue(z.t < y.t < x.t < c.t < b.t < a.t)\n"
  },
  {
    "path": "test/range_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport datetime\nimport fnmatch\n\nimport mock\nfrom helpers import LuigiTestCase, unittest\n\nimport luigi\nfrom luigi.mock import MockFileSystem, MockTarget\nfrom luigi.tools.range import (\n    RangeByMinutes,\n    RangeByMinutesBase,\n    RangeDaily,\n    RangeDailyBase,\n    RangeEvent,\n    RangeHourly,\n    RangeHourlyBase,\n    RangeMonthly,\n    _constrain_glob,\n    _get_filesystems_and_globs,\n)\n\n\nclass CommonDateMinuteTask(luigi.Task):\n    dh = luigi.DateMinuteParameter()\n\n    def output(self):\n        return MockTarget(self.dh.strftime(\"/n2000y01a05n/%Y_%m-_-%daww/21mm%H%Mdara21/ooo\"))\n\n\nclass CommonDateHourTask(luigi.Task):\n    dh = luigi.DateHourParameter()\n\n    def output(self):\n        return MockTarget(self.dh.strftime(\"/n2000y01a05n/%Y_%m-_-%daww/21mm%Hdara21/ooo\"))\n\n\nclass CommonDateTask(luigi.Task):\n    d = luigi.DateParameter()\n\n    def output(self):\n        return MockTarget(self.d.strftime(\"/n2000y01a05n/%Y_%m-_-%daww/21mm01dara21/ooo\"))\n\n\nclass CommonMonthTask(luigi.Task):\n    m = luigi.MonthParameter()\n\n    def output(self):\n        return MockTarget(self.m.strftime(\"/n2000y01a05n/%Y_%maww/21mm01dara21/ooo\"))\n\n\ntask_a_paths = [\n    \"TaskA/2014-03-20/18\",\n    \"TaskA/2014-03-20/21\",\n    \"TaskA/2014-03-20/23\",\n    \"TaskA/2014-03-21/00\",\n    
\"TaskA/2014-03-21/00.attempt.1\",\n    \"TaskA/2014-03-21/00.attempt.2\",\n    \"TaskA/2014-03-21/01\",\n    \"TaskA/2014-03-21/02\",\n    \"TaskA/2014-03-21/03.attempt-temp-2014-03-21T13-22-58.165969\",\n    \"TaskA/2014-03-21/03.attempt.1\",\n    \"TaskA/2014-03-21/03.attempt.2\",\n    \"TaskA/2014-03-21/03.attempt.3\",\n    \"TaskA/2014-03-21/03.attempt.latest\",\n    \"TaskA/2014-03-21/04.attempt-temp-2014-03-21T13-23-09.078249\",\n    \"TaskA/2014-03-21/12\",\n    \"TaskA/2014-03-23/12\",\n]\n\ntask_b_paths = [\n    \"TaskB/no/worries2014-03-20/23\",\n    \"TaskB/no/worries2014-03-21/01\",\n    \"TaskB/no/worries2014-03-21/03\",\n    \"TaskB/no/worries2014-03-21/04.attempt-yadayada\",\n    \"TaskB/no/worries2014-03-21/05\",\n]\n\nmock_contents = task_a_paths + task_b_paths\n\n\nexpected_a = [\n    \"TaskA(dh=2014-03-20T17)\",\n    \"TaskA(dh=2014-03-20T19)\",\n    \"TaskA(dh=2014-03-20T20)\",\n]\n\n# expected_reverse = [\n# ]\n\nexpected_wrapper = [\n    \"CommonWrapperTask(dh=2014-03-21T00)\",\n    \"CommonWrapperTask(dh=2014-03-21T02)\",\n    \"CommonWrapperTask(dh=2014-03-21T03)\",\n    \"CommonWrapperTask(dh=2014-03-21T04)\",\n    \"CommonWrapperTask(dh=2014-03-21T05)\",\n]\n\n\nclass TaskA(luigi.Task):\n    dh = luigi.DateHourParameter()\n\n    def output(self):\n        return MockTarget(self.dh.strftime(\"TaskA/%Y-%m-%d/%H\"))\n\n\nclass TaskB(luigi.Task):\n    dh = luigi.DateHourParameter()\n    complicator = luigi.Parameter()\n\n    def output(self):\n        return MockTarget(self.dh.strftime(\"TaskB/%%s%Y-%m-%d/%H\") % self.complicator)\n\n\nclass TaskC(luigi.Task):\n    dh = luigi.DateHourParameter()\n\n    def output(self):\n        return MockTarget(self.dh.strftime(\"not/a/real/path/%Y-%m-%d/%H\"))\n\n\nclass CommonWrapperTask(luigi.WrapperTask):\n    dh = luigi.DateHourParameter()\n\n    def requires(self):\n        yield TaskA(dh=self.dh)\n        yield TaskB(dh=self.dh, complicator=\"no/worries\")  # str(self.dh) would complicate beyond 
working\n\n\nclass TaskMinutesA(luigi.Task):\n    dm = luigi.DateMinuteParameter()\n\n    def output(self):\n        return MockTarget(self.dm.strftime(\"TaskA/%Y-%m-%d/%H%M\"))\n\n\nclass TaskMinutesB(luigi.Task):\n    dm = luigi.DateMinuteParameter()\n    complicator = luigi.Parameter()\n\n    def output(self):\n        return MockTarget(self.dm.strftime(\"TaskB/%%s%Y-%m-%d/%H%M\") % self.complicator)\n\n\nclass TaskMinutesC(luigi.Task):\n    dm = luigi.DateMinuteParameter()\n\n    def output(self):\n        return MockTarget(self.dm.strftime(\"not/a/real/path/%Y-%m-%d/%H%M\"))\n\n\nclass CommonWrapperTaskMinutes(luigi.WrapperTask):\n    dm = luigi.DateMinuteParameter()\n\n    def requires(self):\n        yield TaskMinutesA(dm=self.dm)\n        yield TaskMinutesB(dm=self.dm, complicator=\"no/worries\")  # str(self.dh) would complicate beyond working\n\n\ndef mock_listdir(contents):\n    def contents_listdir(_, glob):\n        for path in fnmatch.filter(contents, glob + \"*\"):\n            yield path\n\n    return contents_listdir\n\n\ndef mock_exists_always_true(_, _2):\n    yield True\n\n\ndef mock_exists_always_false(_, _2):\n    yield False\n\n\nclass ConstrainGlobTest(unittest.TestCase):\n    def test_limit(self):\n        glob = \"/[0-9][0-9][0-9][0-9]/[0-9][0-9]/[0-9][0-9]/[0-9][0-9]\"\n        paths = [(datetime.datetime(2013, 12, 31, 5) + datetime.timedelta(hours=h)).strftime(\"/%Y/%m/%d/%H\") for h in range(40)]\n        self.assertEqual(\n            sorted(_constrain_glob(glob, paths)),\n            [\n                \"/2013/12/31/[0-2][0-9]\",\n                \"/2014/01/01/[0-2][0-9]\",\n            ],\n        )\n        paths.pop(26)\n        self.assertEqual(\n            sorted(_constrain_glob(glob, paths, 6)),\n            [\n                \"/2013/12/31/0[5-9]\",\n                \"/2013/12/31/1[0-9]\",\n                \"/2013/12/31/2[0-3]\",\n                \"/2014/01/01/0[012345689]\",\n                \"/2014/01/01/1[0-9]\",\n           
     \"/2014/01/01/2[0]\",\n            ],\n        )\n        self.assertEqual(\n            sorted(_constrain_glob(glob, paths[:7], 10)),\n            [\n                \"/2013/12/31/05\",\n                \"/2013/12/31/06\",\n                \"/2013/12/31/07\",\n                \"/2013/12/31/08\",\n                \"/2013/12/31/09\",\n                \"/2013/12/31/10\",\n                \"/2013/12/31/11\",\n            ],\n        )\n\n    def test_no_wildcards(self):\n        glob = \"/2014/01\"\n        paths = \"/2014/01\"\n        self.assertEqual(\n            _constrain_glob(glob, paths),\n            [\n                \"/2014/01\",\n            ],\n        )\n\n\ndef datetime_to_epoch(dt):\n    td = dt - datetime.datetime(1970, 1, 1)\n    return td.days * 86400 + td.seconds + td.microseconds / 1e6\n\n\nclass RangeDailyBaseTest(unittest.TestCase):\n    maxDiff = None\n\n    def setUp(self):\n        # yucky to create separate callbacks; would be nicer if the callback\n        # received an instance of a subclass of Event, so one callback could\n        # accumulate all types\n        @RangeDailyBase.event_handler(RangeEvent.DELAY)\n        def callback_delay(*args):\n            self.events.setdefault(RangeEvent.DELAY, []).append(args)\n\n        @RangeDailyBase.event_handler(RangeEvent.COMPLETE_COUNT)\n        def callback_complete_count(*args):\n            self.events.setdefault(RangeEvent.COMPLETE_COUNT, []).append(args)\n\n        @RangeDailyBase.event_handler(RangeEvent.COMPLETE_FRACTION)\n        def callback_complete_fraction(*args):\n            self.events.setdefault(RangeEvent.COMPLETE_FRACTION, []).append(args)\n\n        self.events = {}\n\n    def test_consistent_formatting(self):\n        task = RangeDailyBase(of=CommonDateTask, start=datetime.date(2016, 1, 1))\n        self.assertEqual(task._format_range([datetime.datetime(2016, 1, 2, 13), datetime.datetime(2016, 2, 29, 23)]), \"[2016-01-02, 2016-02-29]\")\n\n    def _empty_subcase(self, 
kwargs, expected_events):\n        calls = []\n\n        class RangeDailyDerived(RangeDailyBase):\n            def missing_datetimes(self, task_cls, finite_datetimes):\n                args = [self, task_cls, finite_datetimes]\n                calls.append(args)\n                return args[-1][:5]\n\n        task = RangeDailyDerived(of=CommonDateTask, **kwargs)\n        self.assertEqual(task.requires(), [])\n        self.assertEqual(calls, [])\n        self.assertEqual(task.requires(), [])\n        self.assertEqual(calls, [])  # subsequent requires() should return the cached result, never call missing_datetimes\n        self.assertEqual(self.events, expected_events)\n        self.assertTrue(task.complete())\n\n    def test_stop_before_days_back(self):\n        # nothing to do because stop is earlier\n        self._empty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2015, 1, 1, 4)),\n                \"stop\": datetime.date(2014, 3, 20),\n                \"days_back\": 4,\n                \"days_forward\": 20,\n                \"reverse\": True,\n            },\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonDateTask\", 0),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonDateTask\", 0),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonDateTask\", 1.0),\n                ],\n            },\n        )\n\n    def _nonempty_subcase(self, kwargs, expected_finite_datetimes_range, expected_requires, expected_events):\n        calls = []\n\n        class RangeDailyDerived(RangeDailyBase):\n            def missing_datetimes(self, finite_datetimes):\n                # I only changed tests for number of arguments at this one\n                # place to test both old and new behavior\n                calls.append((self, finite_datetimes))\n                return 
finite_datetimes[:7]\n\n        task = RangeDailyDerived(of=CommonDateTask, **kwargs)\n        self.assertEqual(list(map(str, task.requires())), expected_requires)\n        self.assertEqual((min(calls[0][1]), max(calls[0][1])), expected_finite_datetimes_range)\n        self.assertEqual(list(map(str, task.requires())), expected_requires)\n        self.assertEqual(len(calls), 1)  # subsequent requires() should return the cached result, not call missing_datetimes again\n        self.assertEqual(self.events, expected_events)\n        self.assertFalse(task.complete())\n\n    def test_start_long_before_long_days_back_and_with_long_days_forward(self):\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2017, 10, 22, 12, 4, 29)),\n                \"start\": datetime.date(2011, 3, 20),\n                \"stop\": datetime.date(2025, 1, 29),\n                \"task_limit\": 4,\n                \"days_back\": 3 * 365,\n                \"days_forward\": 3 * 365,\n            },\n            (datetime.datetime(2014, 10, 24), datetime.datetime(2020, 10, 21)),\n            [\n                \"CommonDateTask(d=2014-10-24)\",\n                \"CommonDateTask(d=2014-10-25)\",\n                \"CommonDateTask(d=2014-10-26)\",\n                \"CommonDateTask(d=2014-10-27)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonDateTask\", 3750),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonDateTask\", 5057),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonDateTask\", 5057.0 / (5057 + 7)),\n                ],\n            },\n        )\n\n\nclass RangeHourlyBaseTest(unittest.TestCase):\n    maxDiff = None\n\n    def setUp(self):\n        # yucky to create separate callbacks; would be nicer if the callback\n        # received an instance of a 
subclass of Event, so one callback could\n        # accumulate all types\n        @RangeHourlyBase.event_handler(RangeEvent.DELAY)\n        def callback_delay(*args):\n            self.events.setdefault(RangeEvent.DELAY, []).append(args)\n\n        @RangeHourlyBase.event_handler(RangeEvent.COMPLETE_COUNT)\n        def callback_complete_count(*args):\n            self.events.setdefault(RangeEvent.COMPLETE_COUNT, []).append(args)\n\n        @RangeHourlyBase.event_handler(RangeEvent.COMPLETE_FRACTION)\n        def callback_complete_fraction(*args):\n            self.events.setdefault(RangeEvent.COMPLETE_FRACTION, []).append(args)\n\n        self.events = {}\n\n    def test_consistent_formatting(self):\n        task = RangeHourlyBase(of=CommonDateHourTask, start=datetime.datetime(2016, 1, 1))\n        self.assertEqual(task._format_range([datetime.datetime(2016, 1, 2, 13), datetime.datetime(2016, 2, 29, 23)]), \"[2016-01-02T13, 2016-02-29T23]\")\n\n    def _empty_subcase(self, kwargs, expected_events):\n        calls = []\n\n        class RangeHourlyDerived(RangeHourlyBase):\n            def missing_datetimes(a, b, c):\n                args = [a, b, c]\n                calls.append(args)\n                return args[-1][:5]\n\n        task = RangeHourlyDerived(of=CommonDateHourTask, **kwargs)\n        self.assertEqual(task.requires(), [])\n        self.assertEqual(calls, [])\n        self.assertEqual(task.requires(), [])\n        self.assertEqual(calls, [])  # subsequent requires() should return the cached result, never call missing_datetimes\n        self.assertEqual(self.events, expected_events)\n        self.assertTrue(task.complete())\n\n    def test_start_after_hours_forward(self):\n        # nothing to do because start is later\n        self._empty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2000, 1, 1, 4)),\n                \"start\": datetime.datetime(2014, 3, 20, 17),\n                \"hours_back\": 4,\n                
\"hours_forward\": 20,\n            },\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonDateHourTask\", 0),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonDateHourTask\", 0),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonDateHourTask\", 1.0),\n                ],\n            },\n        )\n\n    def _nonempty_subcase(self, kwargs, expected_finite_datetimes_range, expected_requires, expected_events):\n        calls = []\n\n        class RangeHourlyDerived(RangeHourlyBase):\n            def missing_datetimes(a, b, c):\n                args = [a, b, c]\n                calls.append(args)\n                return args[-1][:7]\n\n        task = RangeHourlyDerived(of=CommonDateHourTask, **kwargs)\n        self.assertEqual(list(map(str, task.requires())), expected_requires)\n        self.assertEqual(calls[0][1], CommonDateHourTask)\n        self.assertEqual((min(calls[0][2]), max(calls[0][2])), expected_finite_datetimes_range)\n        self.assertEqual(list(map(str, task.requires())), expected_requires)\n        self.assertEqual(len(calls), 1)  # subsequent requires() should return the cached result, not call missing_datetimes again\n        self.assertEqual(self.events, expected_events)\n        self.assertFalse(task.complete())\n\n    def test_start_long_before_hours_back(self):\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2000, 1, 1, 4)),\n                \"start\": datetime.datetime(1960, 3, 2, 1),\n                \"hours_back\": 5,\n                \"hours_forward\": 20,\n            },\n            (datetime.datetime(1999, 12, 31, 23), datetime.datetime(2000, 1, 1, 23)),\n            [\n                \"CommonDateHourTask(dh=1999-12-31T23)\",\n                \"CommonDateHourTask(dh=2000-01-01T00)\",\n                
\"CommonDateHourTask(dh=2000-01-01T01)\",\n                \"CommonDateHourTask(dh=2000-01-01T02)\",\n                \"CommonDateHourTask(dh=2000-01-01T03)\",\n                \"CommonDateHourTask(dh=2000-01-01T04)\",\n                \"CommonDateHourTask(dh=2000-01-01T05)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonDateHourTask\", 25),  # because of short hours_back we're oblivious to those 40 preceding years\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonDateHourTask\", 349192),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonDateHourTask\", 349192.0 / (349192 + 7)),\n                ],\n            },\n        )\n\n    def test_start_after_long_hours_back(self):\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2014, 10, 22, 12, 4, 29)),\n                \"start\": datetime.datetime(2014, 3, 20, 17),\n                \"task_limit\": 4,\n                \"hours_back\": 365 * 24,\n            },\n            (datetime.datetime(2014, 3, 20, 17), datetime.datetime(2014, 10, 22, 12)),\n            [\n                \"CommonDateHourTask(dh=2014-03-20T17)\",\n                \"CommonDateHourTask(dh=2014-03-20T18)\",\n                \"CommonDateHourTask(dh=2014-03-20T19)\",\n                \"CommonDateHourTask(dh=2014-03-20T20)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonDateHourTask\", 5180),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonDateHourTask\", 5173),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonDateHourTask\", 5173.0 / (5173 + 7)),\n                ],\n            },\n        )\n\n    def 
test_start_long_before_long_hours_back_and_with_long_hours_forward(self):\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2017, 10, 22, 12, 4, 29)),\n                \"start\": datetime.datetime(2011, 3, 20, 17),\n                \"task_limit\": 4,\n                \"hours_back\": 3 * 365 * 24,\n                \"hours_forward\": 3 * 365 * 24,\n            },\n            (datetime.datetime(2014, 10, 23, 13), datetime.datetime(2020, 10, 21, 12)),\n            [\n                \"CommonDateHourTask(dh=2014-10-23T13)\",\n                \"CommonDateHourTask(dh=2014-10-23T14)\",\n                \"CommonDateHourTask(dh=2014-10-23T15)\",\n                \"CommonDateHourTask(dh=2014-10-23T16)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonDateHourTask\", 52560),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonDateHourTask\", 84061),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonDateHourTask\", 84061.0 / (84061 + 7)),\n                ],\n            },\n        )\n\n\nclass RangeByMinutesBaseTest(unittest.TestCase):\n    maxDiff = None\n\n    def setUp(self):\n        # yucky to create separate callbacks; would be nicer if the callback\n        # received an instance of a subclass of Event, so one callback could\n        # accumulate all types\n        @RangeByMinutesBase.event_handler(RangeEvent.DELAY)\n        def callback_delay(*args):\n            self.events.setdefault(RangeEvent.DELAY, []).append(args)\n\n        @RangeByMinutesBase.event_handler(RangeEvent.COMPLETE_COUNT)\n        def callback_complete_count(*args):\n            self.events.setdefault(RangeEvent.COMPLETE_COUNT, []).append(args)\n\n        @RangeByMinutesBase.event_handler(RangeEvent.COMPLETE_FRACTION)\n        def 
callback_complete_fraction(*args):\n            self.events.setdefault(RangeEvent.COMPLETE_FRACTION, []).append(args)\n\n        self.events = {}\n\n    def test_consistent_formatting(self):\n        task = RangeByMinutesBase(of=CommonDateMinuteTask, start=datetime.datetime(2016, 1, 1, 13), minutes_interval=5)\n        self.assertEqual(\n            task._format_range([datetime.datetime(2016, 1, 2, 13, 10), datetime.datetime(2016, 2, 29, 23, 20)]), \"[2016-01-02T1310, 2016-02-29T2320]\"\n        )\n\n    def _empty_subcase(self, kwargs, expected_events):\n        calls = []\n\n        class RangeByMinutesDerived(RangeByMinutesBase):\n            def missing_datetimes(a, b, c):\n                args = [a, b, c]\n                calls.append(args)\n                return args[-1][:5]\n\n        task = RangeByMinutesDerived(of=CommonDateMinuteTask, **kwargs)\n        self.assertEqual(task.requires(), [])\n        self.assertEqual(calls, [])\n        self.assertEqual(task.requires(), [])\n        self.assertEqual(calls, [])  # subsequent requires() should return the cached result, never call missing_datetimes\n        self.assertEqual(self.events, expected_events)\n        self.assertTrue(task.complete())\n\n    def test_start_after_minutes_forward(self):\n        # nothing to do because start is later\n        self._empty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2000, 1, 1, 4)),\n                \"start\": datetime.datetime(2014, 3, 20, 17, 10),\n                \"minutes_back\": 4,\n                \"minutes_forward\": 20,\n                \"minutes_interval\": 5,\n            },\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonDateMinuteTask\", 0),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonDateMinuteTask\", 0),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    
(\"CommonDateMinuteTask\", 1.0),\n                ],\n            },\n        )\n\n    def _nonempty_subcase(self, kwargs, expected_finite_datetimes_range, expected_requires, expected_events):\n        calls = []\n\n        class RangeByMinutesDerived(RangeByMinutesBase):\n            def missing_datetimes(a, b, c):\n                args = [a, b, c]\n                calls.append(args)\n                return args[-1][:7]\n\n        task = RangeByMinutesDerived(of=CommonDateMinuteTask, **kwargs)\n        self.assertEqual(list(map(str, task.requires())), expected_requires)\n        self.assertEqual(calls[0][1], CommonDateMinuteTask)\n        self.assertEqual((min(calls[0][2]), max(calls[0][2])), expected_finite_datetimes_range)\n        self.assertEqual(list(map(str, task.requires())), expected_requires)\n        self.assertEqual(len(calls), 1)  # subsequent requires() should return the cached result, not call missing_datetimes again\n        self.assertEqual(self.events, expected_events)\n        self.assertFalse(task.complete())\n\n    def test_negative_interval(self):\n        class SomeByMinutesTask(luigi.Task):\n            d = luigi.DateMinuteParameter()\n\n            def output(self):\n                return MockTarget(self.d.strftime(\"/data/2014/p/v/z/%Y_/_%m-_-%doctor/20/%HZ%MOOO\"))\n\n        task = RangeByMinutes(\n            now=datetime_to_epoch(datetime.datetime(2016, 4, 1)), of=SomeByMinutesTask, start=datetime.datetime(2014, 3, 20, 17), minutes_interval=-1\n        )\n        self.assertRaises(luigi.parameter.ParameterException, task.requires)\n\n    def test_non_dividing_interval(self):\n        class SomeByMinutesTask(luigi.Task):\n            d = luigi.DateMinuteParameter()\n\n            def output(self):\n                return MockTarget(self.d.strftime(\"/data/2014/p/v/z/%Y_/_%m-_-%doctor/20/%HZ%MOOO\"))\n\n        task = RangeByMinutes(\n            now=datetime_to_epoch(datetime.datetime(2016, 4, 1)), of=SomeByMinutesTask, 
start=datetime.datetime(2014, 3, 20, 17), minutes_interval=8\n        )\n        self.assertRaises(luigi.parameter.ParameterException, task.requires)\n\n    def test_start_and_minutes_period(self):\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2016, 9, 1, 12, 0, 0)),\n                \"start\": datetime.datetime(2016, 9, 1, 11, 0, 0),\n                \"minutes_back\": 24 * 60,\n                \"minutes_forward\": 0,\n                \"minutes_interval\": 3,\n            },\n            (datetime.datetime(2016, 9, 1, 11, 0), datetime.datetime(2016, 9, 1, 11, 57, 0)),\n            [\n                \"CommonDateMinuteTask(dh=2016-09-01T1100)\",\n                \"CommonDateMinuteTask(dh=2016-09-01T1103)\",\n                \"CommonDateMinuteTask(dh=2016-09-01T1106)\",\n                \"CommonDateMinuteTask(dh=2016-09-01T1109)\",\n                \"CommonDateMinuteTask(dh=2016-09-01T1112)\",\n                \"CommonDateMinuteTask(dh=2016-09-01T1115)\",\n                \"CommonDateMinuteTask(dh=2016-09-01T1118)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonDateMinuteTask\", 20),  # First missing is the 20th\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonDateMinuteTask\", 13),  # 20 intervals - 7 missing\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonDateMinuteTask\", 13.0 / (13 + 7)),  # (expected - missing) / expected\n                ],\n            },\n        )\n\n    def test_start_long_before_minutes_back(self):\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2000, 1, 1, 0, 3, 0)),\n                \"start\": datetime.datetime(1960, 1, 1, 0, 0, 0),\n                \"minutes_back\": 5,\n                \"minutes_forward\": 20,\n                
\"minutes_interval\": 5,\n            },\n            (datetime.datetime(2000, 1, 1, 0, 0), datetime.datetime(2000, 1, 1, 0, 20, 0)),\n            [\n                \"CommonDateMinuteTask(dh=2000-01-01T0000)\",\n                \"CommonDateMinuteTask(dh=2000-01-01T0005)\",\n                \"CommonDateMinuteTask(dh=2000-01-01T0010)\",\n                \"CommonDateMinuteTask(dh=2000-01-01T0015)\",\n                \"CommonDateMinuteTask(dh=2000-01-01T0020)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonDateMinuteTask\", 5),  # because of short minutes_back we're oblivious to those 40 preceding years\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonDateMinuteTask\", 4207680),  # expected intervals - missing.\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonDateMinuteTask\", 4207680.0 / 4207685),  # (expected - missing) / expected\n                ],\n            },\n        )\n\n    def test_start_after_long_minutes_back(self):\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2014, 3, 20, 18, 4, 29)),\n                \"start\": datetime.datetime(2014, 3, 20, 17, 10),\n                \"task_limit\": 4,\n                \"minutes_back\": 365 * 24 * 60,\n                \"minutes_interval\": 5,\n            },\n            (datetime.datetime(2014, 3, 20, 17, 10, 0), datetime.datetime(2014, 3, 20, 18, 0, 0)),\n            [\n                \"CommonDateMinuteTask(dh=2014-03-20T1710)\",\n                \"CommonDateMinuteTask(dh=2014-03-20T1715)\",\n                \"CommonDateMinuteTask(dh=2014-03-20T1720)\",\n                \"CommonDateMinuteTask(dh=2014-03-20T1725)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonDateMinuteTask\", 11),\n                ],\n       
         \"event.tools.range.complete.count\": [\n                    (\"CommonDateMinuteTask\", 4),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonDateMinuteTask\", 4.0 / 11),\n                ],\n            },\n        )\n\n    def test_start_long_before_long_minutes_back_and_with_long_minutes_forward(self):\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2017, 3, 22, 20, 4, 29)),\n                \"start\": datetime.datetime(2011, 3, 20, 17, 10, 0),\n                \"task_limit\": 4,\n                \"minutes_back\": 365 * 24 * 60,\n                \"minutes_forward\": 365 * 24 * 60,\n                \"minutes_interval\": 5,\n            },\n            (datetime.datetime(2016, 3, 22, 20, 5), datetime.datetime(2018, 3, 22, 20, 0)),\n            [\n                \"CommonDateMinuteTask(dh=2016-03-22T2005)\",\n                \"CommonDateMinuteTask(dh=2016-03-22T2010)\",\n                \"CommonDateMinuteTask(dh=2016-03-22T2015)\",\n                \"CommonDateMinuteTask(dh=2016-03-22T2020)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonDateMinuteTask\", 210240),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonDateMinuteTask\", 737020),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonDateMinuteTask\", 737020.0 / (737020 + 7)),\n                ],\n            },\n        )\n\n\nclass FilesystemInferenceTest(unittest.TestCase):\n    def _test_filesystems_and_globs(self, datetime_to_task, datetime_to_re, expected):\n        actual = list(_get_filesystems_and_globs(datetime_to_task, datetime_to_re))\n        self.assertEqual(len(actual), len(expected))\n        for (actual_filesystem, actual_glob), (expected_filesystem, expected_glob) in zip(actual, 
expected):\n            self.assertTrue(isinstance(actual_filesystem, expected_filesystem))\n            self.assertEqual(actual_glob, expected_glob)\n\n    def test_date_glob_successfully_inferred(self):\n        self._test_filesystems_and_globs(\n            lambda d: CommonDateTask(d),\n            lambda d: d.strftime(\"(%Y).*(%m).*(%d)\"),\n            [\n                (MockFileSystem, \"/n2000y01a05n/[0-9][0-9][0-9][0-9]_[0-9][0-9]-_-[0-9][0-9]aww/21mm01dara21\"),\n            ],\n        )\n\n    def test_datehour_glob_successfully_inferred(self):\n        self._test_filesystems_and_globs(\n            lambda d: CommonDateHourTask(d),\n            lambda d: d.strftime(\"(%Y).*(%m).*(%d).*(%H)\"),\n            [\n                (MockFileSystem, \"/n2000y01a05n/[0-9][0-9][0-9][0-9]_[0-9][0-9]-_-[0-9][0-9]aww/21mm[0-9][0-9]dara21\"),\n            ],\n        )\n\n    def test_dateminute_glob_successfully_inferred(self):\n        self._test_filesystems_and_globs(\n            lambda d: CommonDateMinuteTask(d),\n            lambda d: d.strftime(\"(%Y).*(%m).*(%d).*(%H).*(%M)\"),\n            [\n                (MockFileSystem, \"/n2000y01a05n/[0-9][0-9][0-9][0-9]_[0-9][0-9]-_-[0-9][0-9]aww/21mm[0-9][0-9][0-9][0-9]dara21\"),\n            ],\n        )\n\n    def test_wrapped_datehour_globs_successfully_inferred(self):\n        self._test_filesystems_and_globs(\n            lambda d: CommonWrapperTask(d),\n            lambda d: d.strftime(\"(%Y).*(%m).*(%d).*(%H)\"),\n            [\n                (MockFileSystem, \"TaskA/[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]\"),\n                (MockFileSystem, \"TaskB/no/worries[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]\"),\n            ],\n        )\n\n    def test_inconsistent_output_datehour_glob_not_inferred(self):\n        class InconsistentlyOutputtingDateHourTask(luigi.Task):\n            dh = luigi.DateHourParameter()\n\n            def output(self):\n                base = self.dh.strftime(\"/even/%Y%m%d%H\")\n 
               if self.dh.hour % 2 == 0:\n                    return MockTarget(base)\n                else:\n                    return {\n                        \"spi\": MockTarget(base + \"/something.spi\"),\n                        \"spl\": MockTarget(base + \"/something.spl\"),\n                    }\n\n        def test_raise_not_implemented():\n            list(_get_filesystems_and_globs(lambda d: InconsistentlyOutputtingDateHourTask(d), lambda d: d.strftime(\"(%Y).*(%m).*(%d).*(%H)\")))\n\n        self.assertRaises(NotImplementedError, test_raise_not_implemented)\n\n    def test_wrapped_inconsistent_datehour_globs_not_inferred(self):\n        class InconsistentlyParameterizedWrapperTask(luigi.WrapperTask):\n            dh = luigi.DateHourParameter()\n\n            def requires(self):\n                yield TaskA(dh=self.dh - datetime.timedelta(days=1))\n                yield TaskB(dh=self.dh, complicator=\"no/worries\")\n\n        def test_raise_not_implemented():\n            list(_get_filesystems_and_globs(lambda d: InconsistentlyParameterizedWrapperTask(d), lambda d: d.strftime(\"(%Y).*(%m).*(%d).*(%H)\")))\n\n        self.assertRaises(NotImplementedError, test_raise_not_implemented)\n\n\nclass RangeMonthlyTest(unittest.TestCase):\n    def setUp(self):\n        # yucky to create separate callbacks; would be nicer if the callback\n        # received an instance of a subclass of Event, so one callback could\n        # accumulate all types\n        @RangeMonthly.event_handler(RangeEvent.DELAY)\n        def callback_delay(*args):\n            self.events.setdefault(RangeEvent.DELAY, []).append(args)\n\n        @RangeMonthly.event_handler(RangeEvent.COMPLETE_COUNT)\n        def callback_complete_count(*args):\n            self.events.setdefault(RangeEvent.COMPLETE_COUNT, []).append(args)\n\n        @RangeMonthly.event_handler(RangeEvent.COMPLETE_FRACTION)\n        def callback_complete_fraction(*args):\n            
self.events.setdefault(RangeEvent.COMPLETE_FRACTION, []).append(args)\n\n        self.events = {}\n\n    def _empty_subcase(self, kwargs, expected_events):\n        calls = []\n\n        class RangeMonthlyDerived(RangeMonthly):\n            def missing_datetimes(self, task_cls, finite_datetimes):\n                args = [self, task_cls, finite_datetimes]\n                calls.append(args)\n                return args[-1][:5]\n\n        task = RangeMonthlyDerived(of=CommonMonthTask, **kwargs)\n        self.assertEqual(task.requires(), [])\n        self.assertEqual(calls, [])\n        self.assertEqual(task.requires(), [])\n        self.assertEqual(calls, [])  # subsequent requires() should return the cached result, never call missing_datetimes\n        self.assertEqual(self.events, expected_events)\n        self.assertTrue(task.complete())\n\n    def test_stop_before_months_back(self):\n        # nothing to do because stop is earlier\n        self._empty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2017, 1, 3)),\n                \"stop\": datetime.date(2016, 3, 20),\n                \"months_back\": 4,\n                \"months_forward\": 20,\n                \"reverse\": True,\n            },\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonMonthTask\", 0),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonMonthTask\", 0),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonMonthTask\", 1.0),\n                ],\n            },\n        )\n\n    def test_start_after_months_forward(self):\n        # nothing to do because start is later\n        self._empty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2000, 1, 1)),\n                \"start\": datetime.datetime(2014, 3, 20),\n                \"months_back\": 4,\n                
\"months_forward\": 20,\n            },\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonMonthTask\", 0),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonMonthTask\", 0),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonMonthTask\", 1.0),\n                ],\n            },\n        )\n\n    def _nonempty_subcase(self, kwargs, expected_finite_datetimes_range, expected_requires, expected_events):\n        calls = []\n\n        class RangeDailyDerived(RangeMonthly):\n            def missing_datetimes(self, finite_datetimes):\n                calls.append((self, finite_datetimes))\n                return finite_datetimes[:7]\n\n        task = RangeDailyDerived(of=CommonMonthTask, **kwargs)\n        self.assertEqual(list(map(str, task.requires())), expected_requires)\n        self.assertEqual((min(calls[0][1]), max(calls[0][1])), expected_finite_datetimes_range)\n        self.assertEqual(list(map(str, task.requires())), expected_requires)\n        self.assertEqual(len(calls), 1)  # subsequent requires() should return the cached result, not call missing_datetimes again\n        self.assertEqual(self.events, expected_events)\n        self.assertFalse(task.complete())\n\n    def test_start_long_before_months_back(self):\n        total = (2000 - 1960) * 12 + 20 - 2\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2000, 1, 1)),\n                \"start\": datetime.datetime(1960, 3, 2, 1),\n                \"months_back\": 5,\n                \"months_forward\": 20,\n            },\n            (datetime.datetime(1999, 8, 1), datetime.datetime(2001, 8, 1)),\n            [\n                \"CommonMonthTask(m=1999-08)\",\n                \"CommonMonthTask(m=1999-09)\",\n                \"CommonMonthTask(m=1999-10)\",\n                
\"CommonMonthTask(m=1999-11)\",\n                \"CommonMonthTask(m=1999-12)\",\n                \"CommonMonthTask(m=2000-01)\",\n                \"CommonMonthTask(m=2000-02)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonMonthTask\", 25),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonMonthTask\", total - 7),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonMonthTask\", (total - 7.0) / total),\n                ],\n            },\n        )\n\n    def test_start_after_long_months_back(self):\n        total = 12 - 4\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2014, 11, 22)),\n                \"start\": datetime.datetime(2014, 3, 1),\n                \"task_limit\": 4,\n                \"months_back\": 12 * 24,\n            },\n            (datetime.datetime(2014, 3, 1), datetime.datetime(2014, 10, 1)),\n            [\n                \"CommonMonthTask(m=2014-03)\",\n                \"CommonMonthTask(m=2014-04)\",\n                \"CommonMonthTask(m=2014-05)\",\n                \"CommonMonthTask(m=2014-06)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonMonthTask\", total),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonMonthTask\", total - 7),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonMonthTask\", (total - 7.0) / total),\n                ],\n            },\n        )\n\n    def test_start_long_before_long_months_back_and_with_long_months_forward(self):\n        total = (2025 - 2011) * 12 - 2\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2017, 10, 22, 12, 4, 29)),\n  
              \"start\": datetime.date(2011, 3, 20),\n                \"stop\": datetime.date(2025, 1, 29),\n                \"task_limit\": 4,\n                \"months_back\": 3 * 12,\n                \"months_forward\": 3 * 12,\n            },\n            (datetime.datetime(2014, 10, 1), datetime.datetime(2020, 9, 1)),\n            [\n                \"CommonMonthTask(m=2014-10)\",\n                \"CommonMonthTask(m=2014-11)\",\n                \"CommonMonthTask(m=2014-12)\",\n                \"CommonMonthTask(m=2015-01)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonMonthTask\", (2025 - (2017 - 3)) * 12 - 9),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonMonthTask\", total - 7),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonMonthTask\", (total - 7.0) / total),\n                ],\n            },\n        )\n\n    def test_zero_months_forward(self):\n        total = (2017 - 2011) * 12\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2017, 10, 31, 12, 4, 29)),\n                \"start\": datetime.date(2011, 10, 1),\n                \"task_limit\": 10,\n                \"months_back\": 4,\n            },\n            (datetime.datetime(2017, 6, 1), datetime.datetime(2017, 9, 1)),\n            [\n                \"CommonMonthTask(m=2017-06)\",\n                \"CommonMonthTask(m=2017-07)\",\n                \"CommonMonthTask(m=2017-08)\",\n                \"CommonMonthTask(m=2017-09)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonMonthTask\", 4),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonMonthTask\", total - 4),\n                ],\n                
\"event.tools.range.complete.fraction\": [\n                    (\"CommonMonthTask\", (total - 4.0) / total),\n                ],\n            },\n        )\n\n    def test_months_forward_on_first_of_month(self):\n        total = (2017 - 2011) * 12 + 2\n        self._nonempty_subcase(\n            {\n                \"now\": datetime_to_epoch(datetime.datetime(2017, 10, 1, 12, 4, 29)),\n                \"start\": datetime.date(2011, 10, 1),\n                \"task_limit\": 10,\n                \"months_back\": 4,\n                \"months_forward\": 2,\n            },\n            (datetime.datetime(2017, 6, 1), datetime.datetime(2017, 11, 1)),\n            [\n                \"CommonMonthTask(m=2017-06)\",\n                \"CommonMonthTask(m=2017-07)\",\n                \"CommonMonthTask(m=2017-08)\",\n                \"CommonMonthTask(m=2017-09)\",\n                \"CommonMonthTask(m=2017-10)\",\n                \"CommonMonthTask(m=2017-11)\",\n            ],\n            {\n                \"event.tools.range.delay\": [\n                    (\"CommonMonthTask\", 6),\n                ],\n                \"event.tools.range.complete.count\": [\n                    (\"CommonMonthTask\", total - 6),\n                ],\n                \"event.tools.range.complete.fraction\": [\n                    (\"CommonMonthTask\", (total - 6.0) / total),\n                ],\n            },\n        )\n\n    def test_consistent_formatting(self):\n        task = RangeMonthly(of=CommonMonthTask, start=datetime.date(2018, 1, 4))\n        self.assertEqual(task._format_range([datetime.datetime(2018, 2, 3, 14), datetime.datetime(2018, 4, 5, 21)]), \"[2018-02, 2018-04]\")\n\n\nclass MonthInstantiationTest(LuigiTestCase):\n    def test_old_month_instantiation(self):\n        \"\"\"\n        Verify that you can still programmatically set of param as string\n        \"\"\"\n\n        class MyTask(luigi.Task):\n            month_param = luigi.MonthParameter()\n\n            def 
complete(self):\n                return False\n\n        range_task = RangeMonthly(\n            now=datetime_to_epoch(datetime.datetime(2016, 1, 1)), of=MyTask, start=datetime.date(2015, 12, 1), stop=datetime.date(2016, 1, 1)\n        )\n        expected_task = MyTask(month_param=datetime.date(2015, 12, 1))\n        self.assertEqual(expected_task, list(range_task._requires())[0])\n\n    def test_month_cli_instantiation(self):\n        \"\"\"\n        Verify that you can still use Range through CLI\n        \"\"\"\n\n        class MyTask(luigi.Task):\n            task_namespace = \"wohoo\"\n            month_param = luigi.MonthParameter()\n            secret = \"some-value-to-sooth-python-linters\"\n            comp = False\n\n            def complete(self):\n                return self.comp\n\n            def run(self):\n                self.comp = True\n                MyTask.secret = \"yay\"\n\n        now = str(int(datetime_to_epoch(datetime.datetime(2016, 1, 1))))\n        self.run_locally_split(\"RangeMonthly --of wohoo.MyTask --now {now} --start 2015-12 --stop 2016-01\".format(now=now))\n        self.assertEqual(MyTask(month_param=datetime.date(1934, 12, 1)).secret, \"yay\")\n\n    def test_param_name(self):\n        class MyTask(luigi.Task):\n            some_non_range_param = luigi.Parameter(default=\"woo\")\n            month_param = luigi.MonthParameter()\n\n            def complete(self):\n                return False\n\n        range_task = RangeMonthly(\n            now=datetime_to_epoch(datetime.datetime(2016, 1, 1)),\n            of=MyTask,\n            start=datetime.date(2015, 12, 1),\n            stop=datetime.date(2016, 1, 1),\n            param_name=\"month_param\",\n        )\n        expected_task = MyTask(\"woo\", datetime.date(2015, 12, 1))\n        self.assertEqual(expected_task, list(range_task._requires())[0])\n\n    def test_param_name_with_inferred_fs(self):\n        class MyTask(luigi.Task):\n            some_non_range_param = 
luigi.Parameter(default=\"woo\")\n            month_param = luigi.MonthParameter()\n\n            def output(self):\n                return MockTarget(self.month_param.strftime(\"/n2000y01a05n/%Y_%m-aww/21mm%Hdara21/ooo\"))\n\n        range_task = RangeMonthly(\n            now=datetime_to_epoch(datetime.datetime(2016, 1, 1)),\n            of=MyTask,\n            start=datetime.date(2015, 12, 1),\n            stop=datetime.date(2016, 1, 1),\n            param_name=\"month_param\",\n        )\n        expected_task = MyTask(\"woo\", datetime.date(2015, 12, 1))\n        self.assertEqual(expected_task, list(range_task._requires())[0])\n\n    def test_of_param_distinction(self):\n        class MyTask(luigi.Task):\n            arbitrary_param = luigi.Parameter(default=\"foo\")\n            arbitrary_integer_param = luigi.IntParameter(default=10)\n            month_param = luigi.MonthParameter()\n\n            def complete(self):\n                return False\n\n        range_task_1 = RangeMonthly(\n            now=datetime_to_epoch(datetime.datetime(2015, 12, 2)), of=MyTask, start=datetime.date(2015, 12, 1), stop=datetime.date(2016, 1, 1)\n        )\n        range_task_2 = RangeMonthly(\n            now=datetime_to_epoch(datetime.datetime(2015, 12, 2)),\n            of=MyTask,\n            of_params=dict(arbitrary_param=\"bar\", abitrary_integer_param=2),\n            start=datetime.date(2015, 12, 1),\n            stop=datetime.date(2016, 1, 1),\n        )\n        self.assertNotEqual(range_task_1.task_id, range_task_2.task_id)\n\n    def test_of_param_commandline(self):\n        class MyTask(luigi.Task):\n            task_namespace = \"wohoo\"\n            month_param = luigi.MonthParameter()\n            arbitrary_param = luigi.Parameter(default=\"foo\")\n            arbitrary_integer_param = luigi.IntParameter(default=10)\n            state = (None, None)\n            comp = False\n\n            def complete(self):\n                return self.comp\n\n            def 
run(self):\n                self.comp = True\n                MyTask.state = (self.arbitrary_param, self.arbitrary_integer_param)\n\n        now = str(int(datetime_to_epoch(datetime.datetime(2016, 1, 1))))\n        self.run_locally(\n            [\n                \"RangeMonthly\",\n                \"--of\",\n                \"wohoo.MyTask\",\n                \"--of-params\",\n                '{\"arbitrary_param\":\"bar\",\"arbitrary_integer_param\":5}',\n                \"--now\",\n                \"{0}\".format(now),\n                \"--start\",\n                \"2015-12\",\n                \"--stop\",\n                \"2016-01\",\n            ]\n        )\n        self.assertEqual(MyTask.state, (\"bar\", 5))\n\n\nclass RangeDailyTest(unittest.TestCase):\n    def test_bulk_complete_correctly_interfaced(self):\n        class BulkCompleteDailyTask(luigi.Task):\n            d = luigi.DateParameter()\n\n            @classmethod\n            def bulk_complete(self, parameter_tuples):\n                return list(parameter_tuples)[:-2]\n\n            def output(self):\n                raise RuntimeError(\"Shouldn't get called while resolving deps via bulk_complete\")\n\n        task = RangeDaily(\n            now=datetime_to_epoch(datetime.datetime(2015, 12, 1)), of=BulkCompleteDailyTask, start=datetime.date(2015, 11, 1), stop=datetime.date(2015, 12, 1)\n        )\n\n        expected = [\n            \"BulkCompleteDailyTask(d=2015-11-29)\",\n            \"BulkCompleteDailyTask(d=2015-11-30)\",\n        ]\n\n        actual = [str(t) for t in task.requires()]\n        self.assertEqual(actual, expected)\n\n    def test_bulk_complete_of_params(self):\n        class BulkCompleteDailyTask(luigi.Task):\n            non_positional_arbitrary_argument = luigi.Parameter(default=\"whatever\", positional=False, significant=False)\n            d = luigi.DateParameter()\n            arbitrary_argument = luigi.BoolParameter()\n\n            @classmethod\n            def 
bulk_complete(cls, parameter_tuples):\n                ptuples = list(parameter_tuples)\n                for t in map(cls, ptuples):\n                    assert t.arbitrary_argument\n                return ptuples[:-2]\n\n            def output(self):\n                raise RuntimeError(\"Shouldn't get called while resolving deps via bulk_complete\")\n\n        task = RangeDaily(\n            now=datetime_to_epoch(datetime.datetime(2015, 12, 1)),\n            of=BulkCompleteDailyTask,\n            of_params=dict(arbitrary_argument=True),\n            start=datetime.date(2015, 11, 1),\n            stop=datetime.date(2015, 12, 1),\n        )\n        expected = [\n            \"BulkCompleteDailyTask(d=2015-11-29, arbitrary_argument=True)\",\n            \"BulkCompleteDailyTask(d=2015-11-30, arbitrary_argument=True)\",\n        ]\n\n        actual = [str(t) for t in task.requires()]\n        self.assertEqual(actual, expected)\n\n    @mock.patch(\n        \"luigi.mock.MockFileSystem.listdir\",\n        new=mock_listdir(\n            [\n                \"/data/2014/p/v/z/2014_/_03-_-21octor/20/ZOOO\",\n                \"/data/2014/p/v/z/2014_/_03-_-23octor/20/ZOOO\",\n                \"/data/2014/p/v/z/2014_/_03-_-24octor/20/ZOOO\",\n            ]\n        ),\n    )\n    @mock.patch(\"luigi.mock.MockFileSystem.exists\", new=mock_exists_always_true)\n    def test_missing_tasks_correctly_required(self):\n        class SomeDailyTask(luigi.Task):\n            d = luigi.DateParameter()\n\n            def output(self):\n                return MockTarget(self.d.strftime(\"/data/2014/p/v/z/%Y_/_%m-_-%doctor/20/ZOOO\"))\n\n        task = RangeDaily(\n            now=datetime_to_epoch(datetime.datetime(2016, 4, 1)), of=SomeDailyTask, start=datetime.date(2014, 3, 20), task_limit=3, days_back=3 * 365\n        )\n        expected = [\n            \"SomeDailyTask(d=2014-03-20)\",\n            \"SomeDailyTask(d=2014-03-22)\",\n            \"SomeDailyTask(d=2014-03-25)\",\n        ]\n  
      actual = [str(t) for t in task.requires()]\n        self.assertEqual(actual, expected)\n\n\nclass RangeHourlyTest(unittest.TestCase):\n    # fishy to mock the mock, but MockFileSystem doesn't support globs yet\n    @mock.patch(\"luigi.mock.MockFileSystem.listdir\", new=mock_listdir(mock_contents))\n    @mock.patch(\"luigi.mock.MockFileSystem.exists\", new=mock_exists_always_true)\n    def test_missing_tasks_correctly_required(self):\n        for task_path in task_a_paths:\n            MockTarget(task_path)\n        # this test takes a few seconds. Since stop is not defined,\n        # finite_datetimes constitute many years to consider\n        task = RangeHourly(\n            now=datetime_to_epoch(datetime.datetime(2016, 4, 1)), of=TaskA, start=datetime.datetime(2014, 3, 20, 17), task_limit=3, hours_back=3 * 365 * 24\n        )\n        actual = [str(t) for t in task.requires()]\n        self.assertEqual(actual, expected_a)\n\n    @mock.patch(\"luigi.mock.MockFileSystem.listdir\", new=mock_listdir(mock_contents))\n    @mock.patch(\"luigi.mock.MockFileSystem.exists\", new=mock_exists_always_true)\n    def test_missing_wrapper_tasks_correctly_required(self):\n        task = RangeHourly(\n            now=datetime_to_epoch(datetime.datetime(2040, 4, 1)),\n            of=CommonWrapperTask,\n            start=datetime.datetime(2014, 3, 20, 23),\n            stop=datetime.datetime(2014, 3, 21, 6),\n            hours_back=30 * 365 * 24,\n        )\n        actual = [str(t) for t in task.requires()]\n        self.assertEqual(actual, expected_wrapper)\n\n    def test_bulk_complete_correctly_interfaced(self):\n        class BulkCompleteHourlyTask(luigi.Task):\n            dh = luigi.DateHourParameter()\n\n            @classmethod\n            def bulk_complete(cls, parameter_tuples):\n                return parameter_tuples[:-2]\n\n            def output(self):\n                raise RuntimeError(\"Shouldn't get called while resolving deps via bulk_complete\")\n\n       
 task = RangeHourly(\n            now=datetime_to_epoch(datetime.datetime(2015, 12, 1)),\n            of=BulkCompleteHourlyTask,\n            start=datetime.datetime(2015, 11, 1),\n            stop=datetime.datetime(2015, 12, 1),\n        )\n\n        expected = [\n            \"BulkCompleteHourlyTask(dh=2015-11-30T22)\",\n            \"BulkCompleteHourlyTask(dh=2015-11-30T23)\",\n        ]\n\n        actual = [str(t) for t in task.requires()]\n        self.assertEqual(actual, expected)\n\n    def test_bulk_complete_of_params(self):\n        class BulkCompleteHourlyTask(luigi.Task):\n            non_positional_arbitrary_argument = luigi.Parameter(default=\"whatever\", positional=False, significant=False)\n            dh = luigi.DateHourParameter()\n            arbitrary_argument = luigi.BoolParameter()\n\n            @classmethod\n            def bulk_complete(cls, parameter_tuples):\n                for t in map(cls, parameter_tuples):\n                    assert t.arbitrary_argument\n                return parameter_tuples[:-2]\n\n            def output(self):\n                raise RuntimeError(\"Shouldn't get called while resolving deps via bulk_complete\")\n\n        task = RangeHourly(\n            now=datetime_to_epoch(datetime.datetime(2015, 12, 1)),\n            of=BulkCompleteHourlyTask,\n            of_params=dict(arbitrary_argument=True),\n            start=datetime.datetime(2015, 11, 1),\n            stop=datetime.datetime(2015, 12, 1),\n        )\n\n        expected = [\n            \"BulkCompleteHourlyTask(dh=2015-11-30T22, arbitrary_argument=True)\",\n            \"BulkCompleteHourlyTask(dh=2015-11-30T23, arbitrary_argument=True)\",\n        ]\n\n        actual = [str(t) for t in task.requires()]\n        self.assertEqual(actual, expected)\n\n    @mock.patch(\"luigi.mock.MockFileSystem.exists\", new=mock_exists_always_false)\n    def test_missing_directory(self):\n        task = RangeHourly(\n            now=datetime_to_epoch(datetime.datetime(2014, 
4, 1)), of=TaskC, start=datetime.datetime(2014, 3, 20, 23), stop=datetime.datetime(2014, 3, 21, 1)\n        )\n        self.assertFalse(task.complete())\n        expected = [\"TaskC(dh=2014-03-20T23)\", \"TaskC(dh=2014-03-21T00)\"]\n        self.assertEqual([str(t) for t in task.requires()], expected)\n\n\nclass RangeByMinutesTest(unittest.TestCase):\n    # fishy to mock the mock, but MockFileSystem doesn't support globs yet\n    @mock.patch(\"luigi.mock.MockFileSystem.listdir\", new=mock_listdir(mock_contents))\n    @mock.patch(\"luigi.mock.MockFileSystem.exists\", new=mock_exists_always_true)\n    def test_missing_tasks_correctly_required(self):\n        expected_tasks = [\"SomeByMinutesTask(d=2016-03-31T0000)\", \"SomeByMinutesTask(d=2016-03-31T0005)\", \"SomeByMinutesTask(d=2016-03-31T0010)\"]\n\n        class SomeByMinutesTask(luigi.Task):\n            d = luigi.DateMinuteParameter()\n\n            def output(self):\n                return MockTarget(self.d.strftime(\"/data/2014/p/v/z/%Y_/_%m-_-%doctor/20/%HZ%MOOO\"))\n\n        for task_path in task_a_paths:\n            MockTarget(task_path)\n        # this test takes a few seconds. 
Since stop is not defined,\n        # finite_datetimes constitute many years to consider\n        task = RangeByMinutes(\n            now=datetime_to_epoch(datetime.datetime(2016, 4, 1)),\n            of=SomeByMinutesTask,\n            start=datetime.datetime(2014, 3, 20, 17),\n            task_limit=3,\n            minutes_back=24 * 60,\n            minutes_interval=5,\n        )\n        actual = [str(t) for t in task.requires()]\n        self.assertEqual(actual, expected_tasks)\n\n    @mock.patch(\"luigi.mock.MockFileSystem.listdir\", new=mock_listdir(mock_contents))\n    @mock.patch(\"luigi.mock.MockFileSystem.exists\", new=mock_exists_always_true)\n    def test_missing_wrapper_tasks_correctly_required(self):\n        expected_wrapper = [\n            \"CommonWrapperTaskMinutes(dm=2014-03-20T2300)\",\n            \"CommonWrapperTaskMinutes(dm=2014-03-20T2305)\",\n            \"CommonWrapperTaskMinutes(dm=2014-03-20T2310)\",\n            \"CommonWrapperTaskMinutes(dm=2014-03-20T2315)\",\n        ]\n        task = RangeByMinutes(\n            now=datetime_to_epoch(datetime.datetime(2040, 4, 1, 0, 0, 0)),\n            of=CommonWrapperTaskMinutes,\n            start=datetime.datetime(2014, 3, 20, 23, 0, 0),\n            stop=datetime.datetime(2014, 3, 20, 23, 20, 0),\n            minutes_back=30 * 365 * 24 * 60,\n            minutes_interval=5,\n        )\n        actual = [str(t) for t in task.requires()]\n        self.assertEqual(actual, expected_wrapper)\n\n    def test_bulk_complete_correctly_interfaced(self):\n        class BulkCompleteByMinutesTask(luigi.Task):\n            dh = luigi.DateMinuteParameter()\n\n            @classmethod\n            def bulk_complete(cls, parameter_tuples):\n                return list(parameter_tuples)[:-2]\n\n            def output(self):\n                raise RuntimeError(\"Shouldn't get called while resolving deps via bulk_complete\")\n\n        task = RangeByMinutes(\n            
now=datetime_to_epoch(datetime.datetime(2015, 12, 1)),\n            of=BulkCompleteByMinutesTask,\n            start=datetime.datetime(2015, 11, 1),\n            stop=datetime.datetime(2015, 12, 1),\n            minutes_interval=5,\n        )\n\n        expected = [\n            \"BulkCompleteByMinutesTask(dh=2015-11-30T2350)\",\n            \"BulkCompleteByMinutesTask(dh=2015-11-30T2355)\",\n        ]\n\n        actual = [str(t) for t in task.requires()]\n        self.assertEqual(actual, expected)\n\n    def test_bulk_complete_of_params(self):\n        class BulkCompleteByMinutesTask(luigi.Task):\n            non_positional_arbitrary_argument = luigi.Parameter(default=\"whatever\", positional=False, significant=False)\n            dh = luigi.DateMinuteParameter()\n            arbitrary_argument = luigi.BoolParameter()\n\n            @classmethod\n            def bulk_complete(cls, parameter_tuples):\n                ptuples = list(parameter_tuples)\n                for t in map(cls, parameter_tuples):\n                    assert t.arbitrary_argument\n                return ptuples[:-2]\n\n            def output(self):\n                raise RuntimeError(\"Shouldn't get called while resolving deps via bulk_complete\")\n\n        task = RangeByMinutes(\n            now=datetime_to_epoch(datetime.datetime(2015, 12, 1)),\n            of=BulkCompleteByMinutesTask,\n            of_params=dict(arbitrary_argument=True),\n            start=datetime.datetime(2015, 11, 1),\n            stop=datetime.datetime(2015, 12, 1),\n            minutes_interval=5,\n        )\n\n        expected = [\n            \"BulkCompleteByMinutesTask(dh=2015-11-30T2350, arbitrary_argument=True)\",\n            \"BulkCompleteByMinutesTask(dh=2015-11-30T2355, arbitrary_argument=True)\",\n        ]\n\n        actual = [str(t) for t in task.requires()]\n        self.assertEqual(actual, expected)\n\n    @mock.patch(\"luigi.mock.MockFileSystem.exists\", new=mock_exists_always_false)\n    def 
test_missing_directory(self):\n        task = RangeByMinutes(\n            now=datetime_to_epoch(datetime.datetime(2014, 3, 21, 0, 0)),\n            of=TaskMinutesC,\n            start=datetime.datetime(2014, 3, 20, 23, 11),\n            stop=datetime.datetime(2014, 3, 20, 23, 21),\n            minutes_interval=5,\n        )\n        self.assertFalse(task.complete())\n        expected = [\"TaskMinutesC(dm=2014-03-20T2315)\", \"TaskMinutesC(dm=2014-03-20T2320)\"]\n        self.assertEqual([str(t) for t in task.requires()], expected)\n\n\nclass RangeInstantiationTest(LuigiTestCase):\n    def test_old_instantiation(self):\n        \"\"\"\n        Verify that you can still programmatically set of param as string\n        \"\"\"\n\n        class MyTask(luigi.Task):\n            date_param = luigi.DateParameter()\n\n            def complete(self):\n                return False\n\n        range_task = RangeDailyBase(\n            now=datetime_to_epoch(datetime.datetime(2015, 12, 2)), of=MyTask, start=datetime.date(2015, 12, 1), stop=datetime.date(2015, 12, 2)\n        )\n        expected_task = MyTask(date_param=datetime.date(2015, 12, 1))\n        self.assertEqual(expected_task, list(range_task._requires())[0])\n\n    def test_cli_instantiation(self):\n        \"\"\"\n        Verify that you can still use Range through CLI\n        \"\"\"\n\n        class MyTask(luigi.Task):\n            task_namespace = \"wohoo\"\n            date_param = luigi.DateParameter()\n            secret = \"some-value-to-sooth-python-linters\"\n            comp = False\n\n            def complete(self):\n                return self.comp\n\n            def run(self):\n                self.comp = True\n                MyTask.secret = \"yay\"\n\n        now = str(int(datetime_to_epoch(datetime.datetime(2015, 12, 2))))\n        self.run_locally_split(\"RangeDailyBase --of wohoo.MyTask --now {now} --start 2015-12-01 --stop 2015-12-02\".format(now=now))\n        
self.assertEqual(MyTask(date_param=datetime.date(1934, 12, 1)).secret, \"yay\")\n\n    def test_param_name(self):\n        class MyTask(luigi.Task):\n            some_non_range_param = luigi.Parameter(default=\"woo\")\n            date_param = luigi.DateParameter()\n\n            def complete(self):\n                return False\n\n        range_task = RangeDailyBase(\n            now=datetime_to_epoch(datetime.datetime(2015, 12, 2)),\n            of=MyTask,\n            start=datetime.date(2015, 12, 1),\n            stop=datetime.date(2015, 12, 2),\n            param_name=\"date_param\",\n        )\n        expected_task = MyTask(\"woo\", datetime.date(2015, 12, 1))\n        self.assertEqual(expected_task, list(range_task._requires())[0])\n\n    def test_param_name_with_inferred_fs(self):\n        class MyTask(luigi.Task):\n            some_non_range_param = luigi.Parameter(default=\"woo\")\n            date_param = luigi.DateParameter()\n\n            def output(self):\n                return MockTarget(self.date_param.strftime(\"/n2000y01a05n/%Y_%m-_-%daww/21mm%Hdara21/ooo\"))\n\n        range_task = RangeDaily(\n            now=datetime_to_epoch(datetime.datetime(2015, 12, 2)),\n            of=MyTask,\n            start=datetime.date(2015, 12, 1),\n            stop=datetime.date(2015, 12, 2),\n            param_name=\"date_param\",\n        )\n        expected_task = MyTask(\"woo\", datetime.date(2015, 12, 1))\n        self.assertEqual(expected_task, list(range_task._requires())[0])\n\n    def test_of_param_distinction(self):\n        class MyTask(luigi.Task):\n            arbitrary_param = luigi.Parameter(default=\"foo\")\n            arbitrary_integer_param = luigi.IntParameter(default=10)\n            date_param = luigi.DateParameter()\n\n            def complete(self):\n                return False\n\n        range_task_1 = RangeDaily(\n            now=datetime_to_epoch(datetime.datetime(2015, 12, 2)), of=MyTask, start=datetime.date(2015, 12, 1), 
stop=datetime.date(2015, 12, 2)\n        )\n        range_task_2 = RangeDaily(\n            now=datetime_to_epoch(datetime.datetime(2015, 12, 2)),\n            of=MyTask,\n            of_params=dict(arbitrary_param=\"bar\", abitrary_integer_param=2),\n            start=datetime.date(2015, 12, 1),\n            stop=datetime.date(2015, 12, 2),\n        )\n        self.assertNotEqual(range_task_1.task_id, range_task_2.task_id)\n\n    def test_of_param_commandline(self):\n        class MyTask(luigi.Task):\n            task_namespace = \"wohoo\"\n            date_param = luigi.DateParameter()\n            arbitrary_param = luigi.Parameter(default=\"foo\")\n            arbitrary_integer_param = luigi.IntParameter(default=10)\n            state = (None, None)\n            comp = False\n\n            def complete(self):\n                return self.comp\n\n            def run(self):\n                self.comp = True\n                MyTask.state = (self.arbitrary_param, self.arbitrary_integer_param)\n\n        now = str(int(datetime_to_epoch(datetime.datetime(2015, 12, 2))))\n        self.run_locally(\n            [\n                \"RangeDailyBase\",\n                \"--of\",\n                \"wohoo.MyTask\",\n                \"--of-params\",\n                '{\"arbitrary_param\":\"bar\",\"arbitrary_integer_param\":5}',\n                \"--now\",\n                \"{0}\".format(now),\n                \"--start\",\n                \"2015-12-01\",\n                \"--stop\",\n                \"2015-12-02\",\n            ]\n        )\n        self.assertEqual(MyTask.state, (\"bar\", 5))\n"
  },
  {
    "path": "test/recursion_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport datetime\n\nfrom helpers import unittest\n\nimport luigi\nimport luigi.interface\nfrom luigi.mock import MockTarget\n\n\nclass Popularity(luigi.Task):\n    date = luigi.DateParameter(default=datetime.date.today() - datetime.timedelta(1))\n\n    def output(self):\n        return MockTarget(\"/tmp/popularity/%s.txt\" % self.date.strftime(\"%Y-%m-%d\"))\n\n    def requires(self):\n        return Popularity(self.date - datetime.timedelta(1))\n\n    def run(self):\n        f = self.output().open(\"w\")\n        for line in self.input().open(\"r\"):\n            print(int(line.strip()) + 1, file=f)\n\n        f.close()\n\n\nclass RecursionTest(unittest.TestCase):\n    def setUp(self):\n        MockTarget.fs.get_all_data()[\"/tmp/popularity/2009-01-01.txt\"] = b\"0\\n\"\n\n    def test_invoke(self):\n        luigi.build([Popularity(datetime.date(2009, 1, 5))], local_scheduler=True)\n\n        self.assertEqual(MockTarget.fs.get_data(\"/tmp/popularity/2009-01-05.txt\"), b\"4\\n\")\n"
  },
  {
    "path": "test/remote_scheduler_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\nimport tempfile\n\nimport server_test\n\nimport luigi.server\n\ntempdir = tempfile.mkdtemp()\n\n\nclass DummyTask(luigi.Task):\n    id = luigi.IntParameter()\n\n    def run(self):\n        f = self.output().open(\"w\")\n        f.close()\n\n    def output(self):\n        return luigi.LocalTarget(os.path.join(tempdir, str(self.id)))\n\n\nclass RemoteSchedulerTest(server_test.ServerTestBase):\n    def _test_run(self, workers):\n        tasks = [DummyTask(id) for id in range(20)]\n        luigi.build(tasks, workers=workers, scheduler_port=self.get_http_port())\n\n        for t in tasks:\n            self.assertEqual(t.complete(), True)\n            self.assertTrue(os.path.exists(t.output().path))\n\n    def test_single_worker(self):\n        self._test_run(workers=1)\n\n    def test_multiple_workers(self):\n        self._test_run(workers=10)\n"
  },
  {
    "path": "test/retcodes_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2015-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport mock\nfrom helpers import LuigiTestCase, with_config\n\nimport luigi\nimport luigi.scheduler\nfrom luigi.cmdline import luigi_run\n\n\nclass RetcodesTest(LuigiTestCase):\n    def run_and_expect(self, joined_params, retcode, extra_args=[\"--local-scheduler\", \"--no-lock\"]):\n        with self.assertRaises(SystemExit) as cm:\n            luigi_run((joined_params.split(\" \") + extra_args))\n        self.assertEqual(cm.exception.code, retcode)\n\n    def run_with_config(self, retcode_config, *args, **kwargs):\n        with_config(dict(retcode=retcode_config))(self.run_and_expect)(*args, **kwargs)\n\n    def test_task_failed(self):\n        class FailingTask(luigi.Task):\n            def run(self):\n                raise ValueError()\n\n        self.run_and_expect(\"FailingTask\", 0)  # Test default value to be 0\n        self.run_and_expect(\"FailingTask --retcode-task-failed 5\", 5)\n        self.run_with_config(dict(task_failed=\"3\"), \"FailingTask\", 3)\n\n    def test_missing_data(self):\n        class MissingDataTask(luigi.ExternalTask):\n            def complete(self):\n                return False\n\n        self.run_and_expect(\"MissingDataTask\", 0)  # Test default value to be 0\n        self.run_and_expect(\"MissingDataTask --retcode-missing-data 5\", 5)\n        self.run_with_config(dict(missing_data=\"3\"), 
\"MissingDataTask\", 3)\n\n    def test_already_running(self):\n        class AlreadyRunningTask(luigi.Task):\n            def run(self):\n                pass\n\n        old_func = luigi.scheduler.Scheduler.get_work\n\n        def new_func(*args, **kwargs):\n            kwargs[\"current_tasks\"] = None\n            old_func(*args, **kwargs)\n            res = old_func(*args, **kwargs)\n            res[\"running_tasks\"][0][\"worker\"] = \"not me :)\"  # Otherwise it will be filtered\n            return res\n\n        with mock.patch(\"luigi.scheduler.Scheduler.get_work\", new_func):\n            self.run_and_expect(\"AlreadyRunningTask\", 0)  # Test default value to be 0\n            self.run_and_expect(\"AlreadyRunningTask --retcode-already-running 5\", 5)\n            self.run_with_config(dict(already_running=\"3\"), \"AlreadyRunningTask\", 3)\n\n    def test_when_locked(self):\n        def new_func(*args, **kwargs):\n            return False\n\n        with mock.patch(\"luigi.lock.acquire_for\", new_func):\n            self.run_and_expect(\"Task\", 0, extra_args=[\"--local-scheduler\"])\n            self.run_and_expect(\"Task --retcode-already-running 5\", 5, extra_args=[\"--local-scheduler\"])\n            self.run_with_config(dict(already_running=\"3\"), \"Task\", 3, extra_args=[\"--local-scheduler\"])\n\n    def test_failure_in_complete(self):\n        class FailingComplete(luigi.Task):\n            def complete(self):\n                raise Exception\n\n        class RequiringTask(luigi.Task):\n            def requires(self):\n                yield FailingComplete()\n\n        self.run_and_expect(\"RequiringTask\", 0)\n\n    def test_failure_in_requires(self):\n        class FailingRequires(luigi.Task):\n            def requires(self):\n                raise Exception\n\n        self.run_and_expect(\"FailingRequires\", 0)\n\n    def test_validate_dependency_error(self):\n        # requires() from RequiringTask expects a Task object\n        class 
DependencyTask:\n            pass\n\n        class RequiringTask(luigi.Task):\n            def requires(self):\n                yield DependencyTask()\n\n        self.run_and_expect(\"RequiringTask\", 4)\n\n    def test_task_limit(self):\n        class TaskB(luigi.Task):\n            def complete(self):\n                return False\n\n        class TaskA(luigi.Task):\n            def requires(sefl):\n                yield TaskB()\n\n        class TaskLimitTest(luigi.Task):\n            def requires(self):\n                yield TaskA()\n\n        self.run_and_expect(\"TaskLimitTest --worker-task-limit 2\", 0)\n        self.run_and_expect(\"TaskLimitTest --worker-task-limit 2 --retcode-scheduling-error 3\", 3)\n\n    def test_unhandled_exception(self):\n        def new_func(*args, **kwargs):\n            raise Exception()\n\n        with mock.patch(\"luigi.worker.Worker.add\", new_func):\n            self.run_and_expect(\"Task\", 4)\n            self.run_and_expect(\"Task --retcode-unhandled-exception 2\", 2)\n\n        class TaskWithRequiredParam(luigi.Task):\n            param = luigi.Parameter()\n\n        self.run_and_expect(\"TaskWithRequiredParam --param hello\", 0)\n        self.run_and_expect(\"TaskWithRequiredParam\", 4)\n\n    def test_when_mixed_errors(self):\n\n        class FailingTask(luigi.Task):\n            def run(self):\n                raise ValueError()\n\n        class MissingDataTask(luigi.ExternalTask):\n            def complete(self):\n                return False\n\n        class RequiringTask(luigi.Task):\n            def requires(self):\n                yield FailingTask()\n                yield MissingDataTask()\n\n        self.run_and_expect(\"RequiringTask --retcode-task-failed 4 --retcode-missing-data 5\", 5)\n        self.run_and_expect(\"RequiringTask --retcode-task-failed 7 --retcode-missing-data 6\", 7)\n\n    def test_unknown_reason(self):\n\n        class TaskA(luigi.Task):\n            def complete(self):\n                
return True\n\n        class RequiringTask(luigi.Task):\n            def requires(self):\n                yield TaskA()\n\n        def new_func(*args, **kwargs):\n            return None\n\n        with mock.patch(\"luigi.scheduler.Scheduler.add_task\", new_func):\n            self.run_and_expect(\"RequiringTask\", 0)\n            self.run_and_expect(\"RequiringTask --retcode-not-run 5\", 5)\n\n    \"\"\"\n    Test that a task once crashing and then succeeding should be counted as no failure.\n    \"\"\"\n\n    def test_retry_sucess_task(self):\n        class Foo(luigi.Task):\n            run_count = 0\n\n            def run(self):\n                self.run_count += 1\n                if self.run_count == 1:\n                    raise ValueError()\n\n            def complete(self):\n                return self.run_count > 0\n\n        self.run_and_expect(\"Foo --scheduler-retry-delay=0\", 0)\n        self.run_and_expect(\"Foo --scheduler-retry-delay=0 --retcode-task-failed=5\", 0)\n        self.run_with_config(dict(task_failed=\"3\"), \"Foo\", 0)\n"
  },
  {
    "path": "test/rpc_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nfrom helpers import unittest, with_config\n\ntry:\n    from unittest import mock\nexcept ImportError:\n    import mock\n\nimport socket\nfrom multiprocessing import Process, Queue\n\nimport scheduler_api_test\nfrom server_test import ServerTestBase\n\nimport luigi.rpc\nimport luigi.server\nfrom luigi.scheduler import Scheduler\n\n\nclass RemoteSchedulerTest(unittest.TestCase):\n    def testUrlArgumentVariations(self):\n        for url in [\"http://zorg.com\", \"http://zorg.com/\"]:\n            for suffix in [\"api/123\", \"/api/123\"]:\n                s = luigi.rpc.RemoteScheduler(url, 42)\n                with mock.patch.object(s, \"_fetcher\") as fetcher:\n                    s._fetch(suffix, \"{}\")\n                    fetcher.fetch.assert_called_once_with(\"http://zorg.com/api/123\", \"{}\", 42)\n\n    def testUrlArgumentVariationsNotRoot(self):\n        for url in [\"http://zorg.com/subpath\", \"http://zorg.com/subpath/\"]:\n            for suffix in [\"api/123\", \"/api/123\"]:\n                s = luigi.rpc.RemoteScheduler(url, 42)\n                with mock.patch.object(s, \"_fetcher\") as fetcher:\n                    s._fetch(suffix, \"{}\")\n                    fetcher.fetch.assert_called_once_with(\"http://zorg.com/subpath/api/123\", \"{}\", 42)\n\n    def get_work(self, fetcher_side_effect):\n        scheduler = 
luigi.rpc.RemoteScheduler(\"http://zorg.com\", 42)\n        scheduler._rpc_retry_wait = 1  # shorten wait time to speed up tests\n\n        with mock.patch.object(scheduler, \"_fetcher\") as fetcher:\n            fetcher.raises = socket.timeout, socket.gaierror\n            fetcher.fetch.side_effect = fetcher_side_effect\n            return scheduler.get_work(\"fake_worker\")\n\n    def test_retry_rpc_method(self):\n        \"\"\"\n        Tests that a call to a RPC method is re-tried 3 times.\n        \"\"\"\n\n        fetch_results = [socket.timeout, socket.timeout, '{\"response\":{}}']\n        self.assertEqual({}, self.get_work(fetch_results))\n\n    def test_retry_rpc_limited(self):\n        \"\"\"\n        Tests that a call to an RPC method fails after the third attempt\n        \"\"\"\n\n        fetch_results = [socket.timeout, socket.timeout, socket.timeout]\n        self.assertRaises(luigi.rpc.RPCError, self.get_work, fetch_results)\n\n    @mock.patch(\"luigi.rpc.logger\")\n    def test_log_rpc_retries_enabled(self, mock_logger):\n        \"\"\"\n        Tests that each retry of an RPC method is logged\n        \"\"\"\n\n        fetch_results = [socket.timeout, socket.timeout, '{\"response\":{}}']\n        self.get_work(fetch_results)\n        self.assertEqual(\n            [\n                mock.call.warning(\"Failed connecting to remote scheduler %r\", \"http://zorg.com\", exc_info=True),\n                mock.call.info(\"Retrying attempt 2 of 3 (max)\"),\n                mock.call.info(\"Wait for 1 seconds\"),\n                mock.call.warning(\"Failed connecting to remote scheduler %r\", \"http://zorg.com\", exc_info=True),\n                mock.call.info(\"Retrying attempt 3 of 3 (max)\"),\n                mock.call.info(\"Wait for 1 seconds\"),\n            ],\n            mock_logger.mock_calls,\n        )\n\n    @with_config({\"core\": {\"rpc-log-retries\": \"false\"}})\n    @mock.patch(\"luigi.rpc.logger\")\n    def 
test_log_rpc_retries_disabled(self, mock_logger):\n        \"\"\"\n        Tests that retries of an RPC method are not logged\n        \"\"\"\n\n        fetch_results = [socket.timeout, socket.timeout, socket.gaierror]\n        try:\n            self.get_work(fetch_results)\n            self.fail(\"get_work should have thrown RPCError\")\n        except luigi.rpc.RPCError as e:\n            self.assertTrue(isinstance(e.sub_exception, socket.gaierror))\n        self.assertEqual([], mock_logger.mock_calls)\n\n    def test_get_work_retries_on_null(self):\n        \"\"\"\n        Tests that get_work will retry if the response is null\n        \"\"\"\n\n        fetch_results = ['{\"response\": null}', '{\"response\": {\"pass\": true}}']\n        self.assertEqual({\"pass\": True}, self.get_work(fetch_results))\n\n    def test_get_work_retries_on_null_limited(self):\n        \"\"\"\n        Tests that get_work will give up after the third null response\n        \"\"\"\n\n        fetch_results = ['{\"response\": null}'] * 3 + ['{\"response\": {}}']\n        self.assertRaises(luigi.rpc.RPCError, self.get_work, fetch_results)\n\n\nclass RPCTest(scheduler_api_test.SchedulerApiTest, ServerTestBase):\n    def get_app(self):\n        conf = self.get_scheduler_config()\n        sch = Scheduler(**conf)\n        return luigi.server.app(sch)\n\n    def setUp(self):\n        super(RPCTest, self).setUp()\n        self.sch = luigi.rpc.RemoteScheduler(self.get_url(\"\"))\n        self.sch._wait = lambda: None\n\n    # disable test that doesn't work with remote scheduler\n\n    def test_task_first_failure_time(self):\n        pass\n\n    def test_task_first_failure_time_remains_constant(self):\n        pass\n\n    def test_task_has_excessive_failures(self):\n        pass\n\n    def test_quadratic_behavior(self):\n        \"\"\"This would be too slow to run through network\"\"\"\n        pass\n\n    def test_get_work_speed(self):\n        \"\"\"This would be too slow to run through 
network\"\"\"\n        pass\n\n\nclass RequestsFetcherTest(ServerTestBase):\n    def test_fork_changes_session(self):\n        fetcher = luigi.rpc.RequestsFetcher()\n        session = fetcher.session\n\n        q = Queue()\n\n        def check_session(q):\n            fetcher.check_pid()\n            # make sure that check_pid has changed out the session\n            q.put(fetcher.session != session)\n\n        p = Process(target=check_session, args=(q,))\n        p.start()\n        p.join()\n\n        self.assertTrue(q.get(), \"the requests.Session should have changed in the new process\")\n\n\nclass URLLibFetcherTest(ServerTestBase):\n    def test_url_with_basic_auth(self):\n        fetcher = luigi.rpc.URLLibFetcher()\n\n        # without password\n        req = fetcher._create_request(\"http://user@localhost\")\n        self.assertTrue(req.has_header(\"Authorization\"))\n        self.assertEqual(req.get_header(\"Authorization\"), \"Basic dXNlcjo=\")\n        self.assertEqual(req.get_full_url(), \"http://localhost\")\n\n        # empty password (same as above)\n        req = fetcher._create_request(\"http://user:@localhost\")\n        self.assertTrue(req.has_header(\"Authorization\"))\n        self.assertEqual(req.get_header(\"Authorization\"), \"Basic dXNlcjo=\")\n        self.assertEqual(req.get_full_url(), \"http://localhost\")\n\n        # with password\n        req = fetcher._create_request(\"http://user:pass@localhost\")\n        self.assertTrue(req.has_header(\"Authorization\"))\n        self.assertEqual(req.get_header(\"Authorization\"), \"Basic dXNlcjpwYXNz\")\n        self.assertEqual(req.get_full_url(), \"http://localhost\")\n\n    def test_url_without_basic_auth(self):\n        fetcher = luigi.rpc.URLLibFetcher()\n        req = fetcher._create_request(\"http://localhost\")\n\n        self.assertFalse(req.has_header(\"Authorization\"))\n        self.assertEqual(req.get_full_url(), \"http://localhost\")\n\n    def test_body_encoding(self):\n        
fetcher = luigi.rpc.URLLibFetcher()\n\n        # with body\n        req = fetcher._create_request(\"http://localhost\", body={\"foo\": \"bar baz/test\"})\n        self.assertEqual(req.data, b\"foo=bar+baz%2Ftest\")\n\n        # without body\n        req = fetcher._create_request(\"http://localhost\")\n        self.assertIsNone(req.data)\n"
  },
  {
    "path": "test/runtests.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport sys\nimport warnings\n\nimport pytest\n\nif __name__ == \"__main__\":\n    with warnings.catch_warnings():\n        warnings.simplefilter(\"default\")\n        warnings.filterwarnings(\"ignore\", message=\"(.*)outputs has no custom(.*)\", category=UserWarning)\n        sys.exit(pytest.main(sys.argv[1:]))\n"
  },
  {
    "path": "test/safe_extractor_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"\nSafe Extractor Test\n=============\n\nTests for the Safe Extractor class in luigi.safe_extractor module.\n\"\"\"\n\nimport os\nimport shutil\nimport tarfile\nimport tempfile\nimport unittest\n\nfrom luigi.safe_extractor import SafeExtractor\n\n\nclass TestSafeExtract(unittest.TestCase):\n    \"\"\"\n    Unit test class for testing the SafeExtractor module.\n    \"\"\"\n\n    def setUp(self):\n        \"\"\"Set up a temporary directory for test files.\"\"\"\n        self.temp_dir = tempfile.mkdtemp()\n        self.test_file_template = \"test_file_{}.txt\"\n        self.tar_file_name = \"test.tar\"\n        self.tar_file_name_with_traversal = f\"traversal_{self.tar_file_name}\"\n\n    def tearDown(self):\n        \"\"\"Clean up the temporary directory after each test.\"\"\"\n        shutil.rmtree(self.temp_dir)\n\n    def create_test_tar(self, tar_path, file_count=1, with_traversal=False):\n        \"\"\"\n        Create a tar file containing test files.\n\n        Args:\n            tar_path (str): Path where the tar file will be created.\n            file_count (int): Number of test files to include.\n            with_traversal (bool): If True, creates a tar file with path traversal vulnerability.\n        \"\"\"\n        # Default content for the test files\n        file_contents = [f\"This is 
{self.test_file_template.format(i)}\" for i in range(file_count)]\n\n        with tarfile.open(tar_path, \"w\") as tar:\n            for i in range(file_count):\n                file_name = self.test_file_template.format(i)\n                file_path = os.path.join(self.temp_dir, file_name)\n\n                # Write content to each test file\n                with open(file_path, \"w\") as f:\n                    f.write(file_contents[i])\n\n                # If path traversal is enabled, create malicious paths\n                archive_name = f\"../../{file_name}\" if with_traversal else file_name\n\n                # Add the file to the tar archive\n                tar.add(file_path, arcname=archive_name)\n\n    def verify_extracted_files(self, file_count):\n        \"\"\"\n        Verify that the correct files were extracted and their contents match expectations.\n\n        Args:\n            file_count (int): Number of files to verify.\n        \"\"\"\n        for i in range(file_count):\n            file_name = self.test_file_template.format(i)\n            file_path = os.path.join(self.temp_dir, file_name)\n\n            # Check if the file exists\n            self.assertTrue(os.path.exists(file_path), f\"File {file_name} does not exist.\")\n\n            # Check if the file content is correct\n            with open(file_path, \"r\") as f:\n                content = f.read()\n                expected_content = f\"This is {file_name}\"\n                self.assertEqual(content, expected_content, f\"Content mismatch in {file_name}.\")\n\n    def test_safe_extract(self):\n        \"\"\"Test normal safe extraction of tar files.\"\"\"\n        tar_path = os.path.join(self.temp_dir, self.tar_file_name)\n\n        # Create a tar file with 3 files\n        self.create_test_tar(tar_path, file_count=3)\n\n        # Initialize SafeExtractor and perform extraction\n        extractor = SafeExtractor(self.temp_dir)\n        extractor.safe_extract(tar_path)\n\n        # 
Verify that all 3 files were extracted correctly\n        self.verify_extracted_files(3)\n\n    def test_safe_extract_with_traversal(self):\n        \"\"\"Test safe extraction for tar files with path traversal (should raise an error).\"\"\"\n        tar_path = os.path.join(self.temp_dir, self.tar_file_name_with_traversal)\n\n        # Create a tar file with a path traversal file\n        self.create_test_tar(tar_path, file_count=1, with_traversal=True)\n\n        # Initialize SafeExtractor and expect RuntimeError due to path traversal\n        extractor = SafeExtractor(self.temp_dir)\n        with self.assertRaises(RuntimeError):\n            extractor.safe_extract(tar_path)\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "test/scheduler_api_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport itertools\nimport time\n\nimport mock\nimport pytest\nfrom helpers import unittest\n\nimport luigi.notifications\nfrom luigi.scheduler import BATCH_RUNNING, DISABLED, DONE, FAILED, PENDING, RUNNING, UNKNOWN, UPSTREAM_RUNNING, Scheduler\n\nluigi.notifications.DEBUG = True\nWORKER = \"myworker\"\n\n\n@pytest.mark.scheduler\nclass SchedulerApiTest(unittest.TestCase):\n    def setUp(self):\n        super(SchedulerApiTest, self).setUp()\n        conf = self.get_scheduler_config()\n        self.sch = Scheduler(**conf)\n        self.time = time.time\n\n    def get_scheduler_config(self):\n        return {\n            \"retry_delay\": 100,\n            \"remove_delay\": 1000,\n            \"worker_disconnect_delay\": 10,\n            \"disable_persist\": 10,\n            \"disable_window\": 10,\n            \"retry_count\": 3,\n            \"disable_hard_timeout\": 60 * 60,\n            \"stable_done_cooldown_secs\": 0,\n        }\n\n    def tearDown(self):\n        super(SchedulerApiTest, self).tearDown()\n        if time.time != self.time:\n            time.time = self.time\n\n    def setTime(self, t):\n        time.time = lambda: t\n\n    def test_dep(self):\n        self.sch.add_task(worker=WORKER, task_id=\"B\", deps=(\"A\",))\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        
self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=DONE)\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"B\")\n        self.sch.add_task(worker=WORKER, task_id=\"B\", status=DONE)\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], None)\n\n    def test_failed_dep(self):\n        self.sch.add_task(worker=WORKER, task_id=\"B\", deps=(\"A\",))\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], None)  # can still wait and retry: TODO: do we want this?\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=DONE)\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"B\")\n        self.sch.add_task(worker=WORKER, task_id=\"B\", status=DONE)\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], None)\n\n    def test_broken_dep(self):\n        self.sch.add_task(worker=WORKER, task_id=\"B\", deps=(\"A\",))\n        self.sch.add_task(worker=WORKER, task_id=\"A\", runnable=False)\n\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], None)  # can still wait and retry: TODO: do we want this?\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=DONE)\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"B\")\n        self.sch.add_task(worker=WORKER, task_id=\"B\", status=DONE)\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], None)\n\n    def test_two_workers(self):\n        # Worker X wants to build A -> B\n        # Worker Y wants to build A -> C\n        self.sch.add_task(worker=\"X\", task_id=\"A\")\n        self.sch.add_task(worker=\"Y\", task_id=\"A\")\n        
self.sch.add_task(task_id=\"B\", deps=(\"A\",), worker=\"X\")\n        self.sch.add_task(task_id=\"C\", deps=(\"A\",), worker=\"Y\")\n\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], \"A\")\n        self.assertEqual(self.sch.get_work(worker=\"Y\")[\"task_id\"], None)  # Worker Y is pending on A to be done\n        self.sch.add_task(worker=\"X\", task_id=\"A\", status=DONE)\n        self.assertEqual(self.sch.get_work(worker=\"Y\")[\"task_id\"], \"C\")\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], \"B\")\n\n    def test_status_wont_override(self):\n        # Worker X is running A\n        # Worker Y wants to override the status to UNKNOWN (e.g. complete is throwing an exception)\n        self.sch.add_task(worker=\"X\", task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], \"A\")\n        self.sch.add_task(worker=\"Y\", task_id=\"A\", status=UNKNOWN)\n        self.assertEqual({\"A\"}, set(self.sch.task_list(RUNNING, \"\").keys()))\n\n    def test_retry(self):\n        # Try to build A but fails, will retry after 100s\n        self.setTime(0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        for t in range(100):\n            self.setTime(t)\n            self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], None)\n            self.sch.ping(worker=WORKER)\n            if t % 10 == 0:\n                self.sch.prune()\n\n        self.setTime(101)\n        self.sch.prune()\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n\n    def test_resend_task(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"B\")\n        for _ in range(10):\n            self.assertEqual(\"A\", self.sch.get_work(worker=WORKER, 
current_tasks=[])[\"task_id\"])\n        self.assertEqual(\"B\", self.sch.get_work(worker=WORKER, current_tasks=[\"A\"])[\"task_id\"])\n\n    def test_resend_multiple_tasks(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"B\")\n        self.sch.add_task(worker=WORKER, task_id=\"C\")\n\n        # get A and B running\n        self.assertEqual(\"A\", self.sch.get_work(worker=WORKER)[\"task_id\"])\n        self.assertEqual(\"B\", self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n        for _ in range(10):\n            self.assertEqual(\"A\", self.sch.get_work(worker=WORKER, current_tasks=[])[\"task_id\"])\n            self.assertEqual(\"A\", self.sch.get_work(worker=WORKER, current_tasks=[\"B\"])[\"task_id\"])\n            self.assertEqual(\"B\", self.sch.get_work(worker=WORKER, current_tasks=[\"A\"])[\"task_id\"])\n            self.assertEqual(\"C\", self.sch.get_work(worker=WORKER, current_tasks=[\"A\", \"B\"])[\"task_id\"])\n\n    def test_disconnect_running(self):\n        # X and Y wants to run A.\n        # X starts but does not report back. 
Y does.\n        # After some timeout, Y will build it instead\n        self.setTime(0)\n        self.sch.add_task(task_id=\"A\", worker=\"X\")\n        self.sch.add_task(task_id=\"A\", worker=\"Y\")\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], \"A\")\n        for t in range(200):\n            self.setTime(t)\n            self.sch.ping(worker=\"Y\")\n            if t % 10 == 0:\n                self.sch.prune()\n\n        self.assertEqual(self.sch.get_work(worker=\"Y\")[\"task_id\"], \"A\")\n\n    def test_get_work_single_batch_item(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True)\n\n        response = self.sch.get_work(worker=WORKER)\n        self.assertEqual(\"A_a_1\", response[\"task_id\"])\n\n        param_values = response[\"task_params\"].values()\n        self.assertTrue(not any(isinstance(param, list)) for param in param_values)\n\n    def test_get_work_multiple_batch_items(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_3\", family=\"A\", params={\"a\": \"3\"}, batchable=True)\n\n        response = self.sch.get_work(worker=WORKER)\n        self.assertIsNone(response[\"task_id\"])\n        self.assertEqual({\"a\": [\"1\", \"2\", \"3\"]}, response[\"task_params\"])\n        self.assertEqual(\"A\", response[\"task_family\"])\n\n    def test_batch_time_running(self):\n        self.setTime(1234)\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_1\", 
family=\"A\", params={\"a\": \"1\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_3\", family=\"A\", params={\"a\": \"3\"}, batchable=True)\n\n        self.sch.get_work(worker=WORKER)\n        for task in self.sch.task_list().values():\n            self.assertEqual(1234, task[\"time_running\"])\n\n    def test_batch_ignore_items_not_ready(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_2\", family=\"A\", params={\"a\": \"2\"}, deps=[\"NOT_DONE\"], batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_3\", family=\"A\", params={\"a\": \"3\"}, deps=[\"DONE\"], batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_4\", family=\"A\", params={\"a\": \"4\"}, deps=[\"DONE\"], batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_5\", family=\"A\", params={\"a\": \"5\"}, deps=[\"NOT_DONE\"], batchable=True)\n\n        self.sch.add_task(worker=WORKER, task_id=\"NOT_DONE\", runnable=False)\n        self.sch.add_task(worker=WORKER, task_id=\"DONE\", status=DONE)\n\n        response = self.sch.get_work(worker=WORKER)\n        self.assertIsNone(response[\"task_id\"])\n        self.assertEqual({\"a\": [\"1\", \"3\", \"4\"]}, response[\"task_params\"])\n        self.assertEqual(\"A\", response[\"task_family\"])\n\n    def test_batch_ignore_first_item_not_ready(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_1\", family=\"A\", params={\"a\": \"1\"}, deps=[\"NOT_DONE\"], batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_2\", family=\"A\", 
params={\"a\": \"2\"}, deps=[\"DONE\"], batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_3\", family=\"A\", params={\"a\": \"3\"}, deps=[\"DONE\"], batchable=True)\n\n        self.sch.add_task(worker=WORKER, task_id=\"NOT_DONE\", runnable=False)\n        self.sch.add_task(worker=WORKER, task_id=\"DONE\", status=DONE)\n\n        response = self.sch.get_work(worker=WORKER)\n        self.assertIsNone(response[\"task_id\"])\n        self.assertEqual({\"a\": [\"2\", \"3\"]}, response[\"task_params\"])\n        self.assertEqual(\"A\", response[\"task_family\"])\n\n    def test_get_work_with_batch_items_with_resources(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True, resources={\"r1\": 1})\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True, resources={\"r1\": 1})\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_3\", family=\"A\", params={\"a\": \"3\"}, batchable=True, resources={\"r1\": 1})\n\n        response = self.sch.get_work(worker=WORKER)\n        self.assertIsNone(response[\"task_id\"])\n        self.assertEqual({\"a\": [\"1\", \"2\", \"3\"]}, response[\"task_params\"])\n        self.assertEqual(\"A\", response[\"task_family\"])\n\n    def test_get_work_limited_batch_size(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"], max_batch_size=2)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True, priority=1)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_3\", family=\"A\", params={\"a\": \"3\"}, batchable=True, priority=2)\n\n        response = self.sch.get_work(worker=WORKER)\n        
self.assertIsNone(response[\"task_id\"])\n        self.assertEqual({\"a\": [\"3\", \"1\"]}, response[\"task_params\"])\n        self.assertEqual(\"A\", response[\"task_family\"])\n\n        response2 = self.sch.get_work(worker=WORKER)\n        self.assertEqual(\"A_a_2\", response2[\"task_id\"])\n\n    def test_get_work_do_not_batch_non_batchable_item(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True, priority=1)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_a_3\", family=\"A\", params={\"a\": \"3\"}, batchable=False, priority=2)\n\n        response = self.sch.get_work(worker=WORKER)\n        self.assertEqual(\"A_a_3\", response[\"task_id\"])\n\n        response2 = self.sch.get_work(worker=WORKER)\n        self.assertIsNone(response2[\"task_id\"])\n        self.assertEqual({\"a\": [\"1\", \"2\"]}, response2[\"task_params\"])\n        self.assertEqual(\"A\", response2[\"task_family\"])\n\n    def test_get_work_group_on_non_batch_params(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"b\"])\n        for a, b, c in itertools.product((1, 2), repeat=3):\n            self.sch.add_task(\n                worker=WORKER,\n                task_id=\"A_%i_%i_%i\" % (a, b, c),\n                family=\"A\",\n                params={\"a\": str(a), \"b\": str(b), \"c\": str(c)},\n                batchable=True,\n                priority=9 * a + 3 * c + b,\n            )\n\n        for a, c in [(\"2\", \"2\"), (\"2\", \"1\"), (\"1\", \"2\"), (\"1\", \"1\")]:\n            response = self.sch.get_work(worker=WORKER)\n            self.assertIsNone(response[\"task_id\"])\n            self.assertEqual({\"a\": a, \"b\": [\"2\", \"1\"], \"c\": c}, 
response[\"task_params\"])\n            self.assertEqual(\"A\", response[\"task_family\"])\n\n    def test_get_work_multiple_batched_params(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\", \"b\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_1_1\", family=\"A\", params={\"a\": \"1\", \"b\": \"1\"}, priority=1, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_1_2\", family=\"A\", params={\"a\": \"1\", \"b\": \"2\"}, priority=2, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2_1\", family=\"A\", params={\"a\": \"2\", \"b\": \"1\"}, priority=3, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2_2\", family=\"A\", params={\"a\": \"2\", \"b\": \"2\"}, priority=4, batchable=True)\n\n        response = self.sch.get_work(worker=WORKER)\n        self.assertIsNone(response[\"task_id\"])\n\n        expected_params = {\n            \"a\": [\"2\", \"2\", \"1\", \"1\"],\n            \"b\": [\"2\", \"1\", \"2\", \"1\"],\n        }\n        self.assertEqual(expected_params, response[\"task_params\"])\n\n    def test_get_work_with_unbatched_worker_on_batched_task(self):\n        self.sch.add_task_batcher(worker=\"batcher\", task_family=\"A\", batched_args=[\"a\"])\n        for i in range(5):\n            self.sch.add_task(worker=WORKER, task_id=\"A_%i\" % i, family=\"A\", params={\"a\": str(i)}, priority=i, batchable=False)\n            self.sch.add_task(worker=\"batcher\", task_id=\"A_%i\" % i, family=\"A\", params={\"a\": str(i)}, priority=i, batchable=True)\n        self.assertEqual(\"A_4\", self.sch.get_work(worker=WORKER)[\"task_id\"])\n        batch_response = self.sch.get_work(worker=\"batcher\")\n        self.assertIsNone(batch_response[\"task_id\"])\n        self.assertEqual({\"a\": [\"3\", \"2\", \"1\", \"0\"]}, batch_response[\"task_params\"])\n\n    def test_batched_tasks_become_batch_running(self):\n        self.sch.add_task_batcher(worker=WORKER, 
task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": 1}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": 2}, batchable=True)\n        self.sch.get_work(worker=WORKER)\n        self.assertEqual({\"A_1\", \"A_2\"}, set(self.sch.task_list(\"BATCH_RUNNING\", \"\").keys()))\n\n    def test_downstream_jobs_from_batch_running_have_upstream_running_status(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": 1}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": 2}, batchable=True)\n        self.sch.get_work(worker=WORKER)\n        self.assertEqual({\"A_1\", \"A_2\"}, set(self.sch.task_list(\"BATCH_RUNNING\", \"\").keys()))\n\n        self.sch.add_task(worker=WORKER, task_id=\"B\", deps=[\"A_1\"])\n        self.assertEqual({\"B\"}, set(self.sch.task_list(PENDING, UPSTREAM_RUNNING).keys()))\n\n    def test_set_batch_runner_new_task(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        response = self.sch.get_work(worker=WORKER)\n        batch_id = response[\"batch_id\"]\n        self.sch.add_task(worker=WORKER, task_id=\"A_1_2\", task_family=\"A\", params={\"a\": \"1,2\"}, batch_id=batch_id, status=\"RUNNING\")\n        self.assertEqual({\"A_1\", \"A_2\"}, set(self.sch.task_list(\"BATCH_RUNNING\", \"\").keys()))\n        self.assertEqual({\"A_1_2\"}, set(self.sch.task_list(\"RUNNING\", \"\").keys()))\n\n        self.sch.add_task(worker=WORKER, task_id=\"A_1_2\", status=DONE)\n        
self.assertEqual({\"A_1\", \"A_2\", \"A_1_2\"}, set(self.sch.task_list(DONE, \"\").keys()))\n\n    def test_set_batch_runner_max(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        response = self.sch.get_work(worker=WORKER)\n        batch_id = response[\"batch_id\"]\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", task_family=\"A\", params={\"a\": \"2\"}, batch_id=batch_id, status=\"RUNNING\")\n        self.assertEqual({\"A_1\"}, set(self.sch.task_list(\"BATCH_RUNNING\", \"\").keys()))\n        self.assertEqual({\"A_2\"}, set(self.sch.task_list(\"RUNNING\", \"\").keys()))\n\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", status=DONE)\n        self.assertEqual({\"A_1\", \"A_2\"}, set(self.sch.task_list(DONE, \"\").keys()))\n\n    def _start_simple_batch(self, use_max=False, mark_running=True, resources=None):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True, resources=resources)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True, resources=resources)\n        response = self.sch.get_work(worker=WORKER)\n        if mark_running:\n            batch_id = response[\"batch_id\"]\n            task_id, params = (\"A_2\", {\"a\": \"2\"}) if use_max else (\"A_1_2\", {\"a\": \"1,2\"})\n\n            self.sch.add_task(worker=WORKER, task_id=task_id, task_family=\"A\", params=params, batch_id=batch_id, status=\"RUNNING\")\n            return batch_id, task_id, params\n\n    def test_set_batch_runner_retry(self):\n        batch_id, task_id, params = 
self._start_simple_batch()\n        self.sch.add_task(worker=WORKER, task_id=task_id, task_family=\"A\", params=params, batch_id=batch_id, status=\"RUNNING\")\n        self.assertEqual({task_id}, set(self.sch.task_list(\"RUNNING\", \"\").keys()))\n        self.assertEqual({\"A_1\", \"A_2\"}, set(self.sch.task_list(BATCH_RUNNING, \"\").keys()))\n\n    def test_set_batch_runner_multiple_retries(self):\n        batch_id, task_id, params = self._start_simple_batch()\n        for _ in range(3):\n            self.sch.add_task(worker=WORKER, task_id=task_id, task_family=\"A\", params=params, batch_id=batch_id, status=\"RUNNING\")\n        self.assertEqual({task_id}, set(self.sch.task_list(\"RUNNING\", \"\").keys()))\n        self.assertEqual({\"A_1\", \"A_2\"}, set(self.sch.task_list(BATCH_RUNNING, \"\").keys()))\n\n    def test_batch_fail(self):\n        self._start_simple_batch()\n        self.sch.add_task(worker=WORKER, task_id=\"A_1_2\", status=FAILED, expl=\"bad failure\")\n\n        task_ids = {\"A_1\", \"A_2\"}\n        self.assertEqual(task_ids, set(self.sch.task_list(FAILED, \"\").keys()))\n        for task_id in task_ids:\n            expl = self.sch.fetch_error(task_id)[\"error\"]\n            self.assertEqual(\"bad failure\", expl)\n\n    def test_batch_fail_max(self):\n        self._start_simple_batch(use_max=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", status=FAILED, expl=\"bad max failure\")\n\n        task_ids = {\"A_1\", \"A_2\"}\n        self.assertEqual(task_ids, set(self.sch.task_list(FAILED, \"\").keys()))\n        for task_id in task_ids:\n            response = self.sch.fetch_error(task_id)\n            self.assertEqual(\"bad max failure\", response[\"error\"])\n\n    def test_batch_fail_from_dead_worker(self):\n        self.setTime(1)\n        self._start_simple_batch()\n        self.setTime(601)\n        self.sch.prune()\n        self.assertEqual({\"A_1\", \"A_2\"}, set(self.sch.task_list(FAILED, \"\").keys()))\n\n    def 
test_batch_fail_max_from_dead_worker(self):\n        self.setTime(1)\n        self._start_simple_batch(use_max=True)\n        self.setTime(601)\n        self.sch.prune()\n        self.assertEqual({\"A_1\", \"A_2\"}, set(self.sch.task_list(FAILED, \"\").keys()))\n\n    def test_batch_fail_from_dead_worker_without_running(self):\n        self.setTime(1)\n        self._start_simple_batch(mark_running=False)\n        self.setTime(601)\n        self.sch.prune()\n        self.assertEqual({\"A_1\", \"A_2\"}, set(self.sch.task_list(FAILED, \"\").keys()))\n\n    def test_batch_update_status(self):\n        self._start_simple_batch()\n        self.sch.set_task_status_message(\"A_1_2\", \"test message\")\n        for task_id in (\"A_1\", \"A_2\", \"A_1_2\"):\n            self.assertEqual(\"test message\", self.sch.get_task_status_message(task_id)[\"statusMessage\"])\n\n    def test_batch_update_progress(self):\n        self._start_simple_batch()\n        self.sch.set_task_progress_percentage(\"A_1_2\", 30)\n        for task_id in (\"A_1\", \"A_2\", \"A_1_2\"):\n            self.assertEqual(30, self.sch.get_task_progress_percentage(task_id)[\"progressPercentage\"])\n\n    def test_batch_decrease_resources(self):\n        self.sch.update_resources(x=3)\n        self._start_simple_batch(resources={\"x\": 3})\n        self.sch.decrease_running_task_resources(\"A_1_2\", {\"x\": 1})\n        for task_id in (\"A_1\", \"A_2\", \"A_1_2\"):\n            self.assertEqual(2, self.sch.get_running_task_resources(task_id)[\"resources\"][\"x\"])\n\n    def test_batch_tracking_url(self):\n        self._start_simple_batch()\n        self.sch.add_task(worker=WORKER, task_id=\"A_1_2\", tracking_url=\"http://test.tracking.url/\")\n\n        tasks = self.sch.task_list(\"\", \"\")\n        for task_id in (\"A_1\", \"A_2\", \"A_1_2\"):\n            self.assertEqual(\"http://test.tracking.url/\", tasks[task_id][\"tracking_url\"])\n\n    def test_finish_batch(self):\n        
self._start_simple_batch()\n        self.sch.add_task(worker=WORKER, task_id=\"A_1_2\", status=DONE)\n        self.assertEqual({\"A_1\", \"A_2\", \"A_1_2\"}, set(self.sch.task_list(DONE, \"\").keys()))\n\n    def test_reschedule_max_batch(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        response = self.sch.get_work(worker=WORKER)\n        batch_id = response[\"batch_id\"]\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", task_family=\"A\", params={\"a\": \"2\"}, batch_id=batch_id, status=\"RUNNING\")\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", status=DONE)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", task_family=\"A\", params={\"a\": \"2\"}, batchable=True)\n\n        self.assertEqual({\"A_2\"}, set(self.sch.task_list(PENDING, \"\").keys()))\n        self.assertEqual({\"A_1\"}, set(self.sch.task_list(DONE, \"\").keys()))\n\n    def test_resend_batch_on_get_work_retry(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        response = self.sch.get_work(worker=WORKER)\n        response2 = self.sch.get_work(worker=WORKER, current_tasks=())\n        self.assertEqual(response[\"task_id\"], response2[\"task_id\"])\n        self.assertEqual(response[\"task_family\"], response2.get(\"task_family\"))\n        self.assertEqual(response[\"task_params\"], response2.get(\"task_params\"))\n\n    def test_resend_batch_runner_on_get_work_retry(self):\n        self._start_simple_batch()\n        
get_work = self.sch.get_work(worker=WORKER, current_tasks=())\n        self.assertEqual(\"A_1_2\", get_work[\"task_id\"])\n\n    def test_resend_max_batch_runner_on_get_work_retry(self):\n        self._start_simple_batch(use_max=True)\n        get_work = self.sch.get_work(worker=WORKER, current_tasks=())\n        self.assertEqual(\"A_2\", get_work[\"task_id\"])\n\n    def test_do_not_resend_batch_runner_on_get_work(self):\n        self._start_simple_batch()\n        get_work = self.sch.get_work(worker=WORKER, current_tasks=(\"A_1_2\",))\n        self.assertIsNone(get_work[\"task_id\"])\n\n    def test_do_not_resend_max_batch_runner_on_get_work(self):\n        self._start_simple_batch(use_max=True)\n        get_work = self.sch.get_work(worker=WORKER, current_tasks=(\"A_2\",))\n        self.assertIsNone(get_work[\"task_id\"])\n\n    def test_rescheduled_batch_running_tasks_stay_batch_running_before_runner(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        self.sch.get_work(worker=WORKER)\n\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        self.assertEqual({\"A_1\", \"A_2\"}, set(self.sch.task_list(BATCH_RUNNING, \"\").keys()))\n\n    def test_rescheduled_batch_running_tasks_stay_batch_running_after_runner(self):\n        self._start_simple_batch()\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        self.assertEqual({\"A_1\", \"A_2\"}, 
set(self.sch.task_list(BATCH_RUNNING, \"\").keys()))\n\n    def test_disabled_batch_running_tasks_stay_batch_running_before_runner(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        self.sch.get_work(worker=WORKER)\n\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True, status=DISABLED)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True, status=DISABLED)\n        self.assertEqual({\"A_1\", \"A_2\"}, set(self.sch.task_list(BATCH_RUNNING, \"\").keys()))\n\n    def test_get_work_returns_batch_task_id_list(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        response = self.sch.get_work(worker=WORKER)\n        self.assertEqual({\"A_1\", \"A_2\"}, set(response[\"batch_task_ids\"]))\n\n    def test_disabled_batch_running_tasks_stay_batch_running_after_runner(self):\n        self._start_simple_batch()\n        self.sch.add_task(worker=WORKER, task_id=\"A_1\", family=\"A\", params={\"a\": \"1\"}, batchable=True, status=DISABLED)\n        self.sch.add_task(worker=WORKER, task_id=\"A_2\", family=\"A\", params={\"a\": \"2\"}, batchable=True, status=DISABLED)\n        self.assertEqual({\"A_1\", \"A_2\"}, set(self.sch.task_list(BATCH_RUNNING, \"\").keys()))\n\n    def test_do_not_overwrite_tracking_url_while_running(self):\n        self.sch.add_task(task_id=\"A\", worker=\"X\", status=\"RUNNING\", 
tracking_url=\"trackme\")\n        self.assertEqual(\"trackme\", self.sch.task_list(\"RUNNING\", \"\")[\"A\"][\"tracking_url\"])\n\n        # not wiped out by another working scheduling as pending\n        self.sch.add_task(task_id=\"A\", worker=\"Y\", status=\"PENDING\")\n        self.assertEqual(\"trackme\", self.sch.task_list(\"RUNNING\", \"\")[\"A\"][\"tracking_url\"])\n\n    def test_do_update_tracking_url_while_running(self):\n        self.sch.add_task(task_id=\"A\", worker=\"X\", status=\"RUNNING\", tracking_url=\"trackme\")\n        self.assertEqual(\"trackme\", self.sch.task_list(\"RUNNING\", \"\")[\"A\"][\"tracking_url\"])\n\n        self.sch.add_task(task_id=\"A\", worker=\"X\", status=\"RUNNING\", tracking_url=\"stage_2\")\n        self.assertEqual(\"stage_2\", self.sch.task_list(\"RUNNING\", \"\")[\"A\"][\"tracking_url\"])\n\n    def test_keep_tracking_url_on_done_and_fail(self):\n        for status in (\"DONE\", \"FAILED\"):\n            self.sch.add_task(task_id=\"A\", worker=\"X\", status=\"RUNNING\", tracking_url=\"trackme\")\n            self.assertEqual(\"trackme\", self.sch.task_list(\"RUNNING\", \"\")[\"A\"][\"tracking_url\"])\n\n            self.sch.add_task(task_id=\"A\", worker=\"X\", status=status)\n            self.assertEqual(\"trackme\", self.sch.task_list(status, \"\")[\"A\"][\"tracking_url\"])\n\n    def test_drop_tracking_url_when_rescheduled_while_not_running(self):\n        for status in (\"DONE\", \"FAILED\", \"PENDING\"):\n            self.sch.add_task(task_id=\"A\", worker=\"X\", status=status, tracking_url=\"trackme\")\n            self.assertEqual(\"trackme\", self.sch.task_list(status, \"\")[\"A\"][\"tracking_url\"])\n\n            self.sch.add_task(task_id=\"A\", worker=\"Y\", status=\"PENDING\")\n            self.assertIsNone(self.sch.task_list(\"PENDING\", \"\")[\"A\"][\"tracking_url\"])\n\n    def test_reset_tracking_url_on_new_run(self):\n        self.sch.add_task(task_id=\"A\", worker=\"X\", status=\"PENDING\", 
tracking_url=\"trackme\")\n        self.assertEqual(\"trackme\", self.sch.task_list(\"PENDING\", \"\")[\"A\"][\"tracking_url\"])\n\n        self.sch.add_task(task_id=\"A\", worker=\"Y\", status=\"RUNNING\")\n        self.assertIsNone(self.sch.task_list(\"RUNNING\", \"\")[\"A\"][\"tracking_url\"])\n\n    def test_remove_dep(self):\n        # X schedules A -> B, A is broken\n        # Y schedules C -> B: this should remove A as a dep of B\n        self.sch.add_task(task_id=\"A\", worker=\"X\", runnable=False)\n        self.sch.add_task(task_id=\"B\", deps=(\"A\",), worker=\"X\")\n\n        # X can't build anything\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], None)\n\n        self.sch.add_task(task_id=\"B\", deps=(\"C\",), worker=\"Y\")  # should reset dependencies for A\n        self.sch.add_task(task_id=\"C\", worker=\"Y\", status=DONE)\n\n        self.assertEqual(self.sch.get_work(worker=\"Y\")[\"task_id\"], \"B\")\n\n    def test_start_time(self):\n        self.setTime(100)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.setTime(200)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=DONE)\n        self.assertEqual(100, self.sch.task_list(DONE, \"\")[\"A\"][\"start_time\"])\n\n    def test_last_updated_does_not_change_with_same_status_update(self):\n        for t, status in ((100, PENDING), (300, DONE), (500, DISABLED)):\n            self.setTime(t)\n            self.sch.add_task(worker=WORKER, task_id=\"A\", status=status)\n            self.assertEqual(t, self.sch.task_list(status, \"\")[\"A\"][\"last_updated\"])\n\n            self.setTime(t + 100)\n            self.sch.add_task(worker=WORKER, task_id=\"A\", status=status)\n            self.assertEqual(t, self.sch.task_list(status, \"\")[\"A\"][\"last_updated\"])\n\n    def test_last_updated_shows_running_start(self):\n        self.setTime(100)\n        self.sch.add_task(worker=WORKER, 
task_id=\"A\", status=PENDING)\n        self.assertEqual(100, self.sch.task_list(PENDING, \"\")[\"A\"][\"last_updated\"])\n\n        self.setTime(200)\n        self.assertEqual(\"A\", self.sch.get_work(worker=WORKER)[\"task_id\"])\n        self.assertEqual(200, self.sch.task_list(\"RUNNING\", \"\")[\"A\"][\"last_updated\"])\n\n        self.setTime(300)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=PENDING)\n        self.assertEqual(200, self.sch.task_list(\"RUNNING\", \"\")[\"A\"][\"last_updated\"])\n\n    def test_last_updated_with_failure_and_recovery(self):\n        self.setTime(100)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(\"A\", self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n        self.setTime(200)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.assertEqual(200, self.sch.task_list(FAILED, \"\")[\"A\"][\"last_updated\"])\n\n        self.setTime(1000)\n        self.sch.prune()\n        self.assertEqual(1000, self.sch.task_list(PENDING, \"\")[\"A\"][\"last_updated\"])\n\n    def test_timeout(self):\n        # A bug that was earlier present when restarting the same flow\n        self.setTime(0)\n        self.sch.add_task(task_id=\"A\", worker=\"X\")\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], \"A\")\n        self.setTime(10000)\n        self.sch.add_task(task_id=\"A\", worker=\"Y\")  # Will timeout X but not schedule A for removal\n        for i in range(2000):\n            self.setTime(10000 + i)\n            self.sch.ping(worker=\"Y\")\n        self.sch.add_task(task_id=\"A\", status=DONE, worker=\"Y\")  # This used to raise an exception since A was removed\n\n    def test_disallowed_state_changes(self):\n        # Test that we can not schedule an already running task\n        t = \"A\"\n        self.sch.add_task(task_id=t, worker=\"X\")\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], t)\n        
self.sch.add_task(task_id=t, worker=\"Y\")\n        self.assertEqual(self.sch.get_work(worker=\"Y\")[\"task_id\"], None)\n\n    def test_two_worker_info(self):\n        # Make sure the scheduler returns info that some other worker is running task A\n        self.sch.add_task(worker=\"X\", task_id=\"A\")\n        self.sch.add_task(worker=\"Y\", task_id=\"A\")\n\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], \"A\")\n        r = self.sch.get_work(worker=\"Y\")\n        self.assertEqual(r[\"task_id\"], None)  # Worker Y is pending on A to be done\n        s = r[\"running_tasks\"][0]\n        self.assertEqual(s[\"task_id\"], \"A\")\n        self.assertEqual(s[\"worker\"], \"X\")\n\n    def test_assistant_get_work(self):\n        self.sch.add_task(worker=\"X\", task_id=\"A\")\n        self.sch.add_worker(\"Y\", [])\n\n        self.assertEqual(self.sch.get_work(worker=\"Y\", assistant=True)[\"task_id\"], \"A\")\n\n        # check that the scheduler recognizes tasks as running\n        running_tasks = self.sch.task_list(\"RUNNING\", \"\")\n        self.assertEqual(len(running_tasks), 1)\n        self.assertEqual(list(running_tasks.keys()), [\"A\"])\n        self.assertEqual(running_tasks[\"A\"][\"worker_running\"], \"Y\")\n\n    def test_assistant_get_work_external_task(self):\n        self.sch.add_task(worker=\"X\", task_id=\"A\", runnable=False)\n        self.assertTrue(self.sch.get_work(worker=\"Y\", assistant=True)[\"task_id\"] is None)\n\n    def test_task_fails_when_assistant_dies(self):\n        self.setTime(0)\n        self.sch.add_task(worker=\"X\", task_id=\"A\")\n        self.sch.add_worker(\"Y\", [])\n\n        self.assertEqual(self.sch.get_work(worker=\"Y\", assistant=True)[\"task_id\"], \"A\")\n        self.assertEqual(list(self.sch.task_list(\"RUNNING\", \"\").keys()), [\"A\"])\n\n        # Y dies for 50 seconds, X stays alive\n        self.setTime(50)\n        self.sch.ping(worker=\"X\")\n        
self.assertEqual(list(self.sch.task_list(\"FAILED\", \"\").keys()), [\"A\"])\n\n    def test_prune_with_live_assistant(self):\n        self.setTime(0)\n        self.sch.add_task(worker=\"X\", task_id=\"A\")\n        self.sch.get_work(worker=\"Y\", assistant=True)\n        self.sch.add_task(worker=\"Y\", task_id=\"A\", status=DONE, assistant=True)\n\n        # worker X stops communicating, A should be marked for removal\n        self.setTime(600)\n        self.sch.ping(worker=\"Y\")\n        self.sch.prune()\n\n        # A will now be pruned\n        self.setTime(2000)\n        self.sch.prune()\n        self.assertFalse(list(self.sch.task_list(\"\", \"\")))\n\n    def test_re_enable_failed_task_assistant(self):\n        self.setTime(0)\n        self.sch.add_worker(\"X\", [(\"assistant\", True)])\n        self.sch.add_task(worker=\"X\", task_id=\"A\", status=FAILED, assistant=True)\n\n        # should be failed now\n        self.assertEqual(FAILED, self.sch.task_list(\"\", \"\")[\"A\"][\"status\"])\n\n        # resets to PENDING after 100 seconds\n        self.setTime(101)\n        self.sch.ping(worker=\"X\")  # worker still alive\n        self.assertEqual(\"PENDING\", self.sch.task_list(\"\", \"\")[\"A\"][\"status\"])\n\n    def test_assistant_doesnt_keep_alive_task(self):\n        self.setTime(0)\n        self.sch.add_task(worker=\"X\", task_id=\"A\")\n        self.assertEqual(\"A\", self.sch.get_work(worker=\"X\")[\"task_id\"])\n        self.sch.add_worker(\"Y\", {\"assistant\": True})\n\n        remove_delay = self.get_scheduler_config()[\"remove_delay\"] + 1.0\n        self.setTime(remove_delay)\n        self.sch.ping(worker=\"Y\")\n        self.sch.prune()\n        self.assertEqual([\"A\"], list(self.sch.task_list(status=\"FAILED\", upstream_status=\"\").keys()))\n        self.assertEqual([\"A\"], list(self.sch.task_list(status=\"\", upstream_status=\"\").keys()))\n\n        self.setTime(2 * remove_delay)\n        self.sch.ping(worker=\"Y\")\n        
self.sch.prune()\n        self.assertEqual([], list(self.sch.task_list(status=\"\", upstream_status=\"\").keys()))\n\n    def test_assistant_request_runnable_task(self):\n        \"\"\"\n        Test that an assistant gets a task even though it hasn't registered for it\n        \"\"\"\n        self.setTime(0)\n        self.sch.add_task(worker=\"X\", task_id=\"A\", runnable=True)\n        self.setTime(600)\n        self.sch.prune()\n\n        self.assertEqual(\"A\", self.sch.get_work(worker=\"Y\", assistant=True)[\"task_id\"])\n\n    def test_assistant_request_external_task(self):\n        self.sch.add_task(worker=\"X\", task_id=\"A\", runnable=False)\n        self.assertIsNone(self.sch.get_work(worker=\"Y\", assistant=True)[\"task_id\"])\n\n    def _test_prune_done_tasks(self, expected=None):\n        self.setTime(0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=DONE)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", deps=[\"A\"], status=DONE)\n        self.sch.add_task(worker=WORKER, task_id=\"C\", deps=[\"B\"])\n\n        self.setTime(600)\n        self.sch.ping(worker=\"MAYBE_ASSITANT\")\n        self.sch.prune()\n        self.setTime(2000)\n        self.sch.ping(worker=\"MAYBE_ASSITANT\")\n        self.sch.prune()\n\n        self.assertEqual(set(expected), set(self.sch.task_list(\"\", \"\").keys()))\n\n    def test_prune_done_tasks_not_assistant(self, expected=None):\n        # Here, MAYBE_ASSISTANT isn't an assistant\n        self._test_prune_done_tasks(expected=[])\n\n    def test_keep_tasks_for_assistant(self):\n        self.sch.get_work(worker=\"MAYBE_ASSITANT\", assistant=True)  # tell the scheduler this is an assistant\n        self._test_prune_done_tasks([])\n\n    def test_keep_scheduler_disabled_tasks_for_assistant(self):\n        self.sch.get_work(worker=\"MAYBE_ASSITANT\", assistant=True)  # tell the scheduler this is an assistant\n\n        # create a scheduler disabled task and a worker disabled task\n        for i in 
range(10):\n            self.sch.add_task(worker=WORKER, task_id=\"D\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"E\", status=DISABLED)\n\n        # scheduler prunes the worker disabled task\n        self.assertEqual({\"D\", \"E\"}, set(self.sch.task_list(DISABLED, \"\")))\n        self._test_prune_done_tasks([])\n\n    def test_keep_failed_tasks_for_assistant(self):\n        self.sch.get_work(worker=\"MAYBE_ASSITANT\", assistant=True)  # tell the scheduler this is an assistant\n        self.sch.add_task(worker=WORKER, task_id=\"D\", status=FAILED, deps=[\"A\"])\n        self._test_prune_done_tasks([])\n\n    def test_count_pending(self):\n        for num_tasks in range(1, 20):\n            self.sch.add_task(worker=WORKER, task_id=str(num_tasks), status=PENDING)\n            expected = {\n                \"n_pending_tasks\": num_tasks,\n                \"n_unique_pending\": num_tasks,\n                \"n_pending_last_scheduled\": num_tasks,\n                \"running_tasks\": [],\n                \"worker_state\": \"active\",\n            }\n            self.assertEqual(expected, self.sch.count_pending(WORKER))\n\n    def test_count_pending_include_failures(self):\n        for num_tasks in range(1, 20):\n            # must be scheduled as pending before failed to ensure WORKER is in the task's workers\n            self.sch.add_task(worker=WORKER, task_id=str(num_tasks), status=PENDING)\n            self.sch.add_task(worker=WORKER, task_id=str(num_tasks), status=FAILED)\n            expected = {\n                \"n_pending_tasks\": num_tasks,\n                \"n_unique_pending\": num_tasks,\n                \"n_pending_last_scheduled\": num_tasks,\n                \"running_tasks\": [],\n                \"worker_state\": \"active\",\n            }\n            self.assertEqual(expected, self.sch.count_pending(WORKER))\n\n    def test_count_pending_do_not_include_done_or_disabled(self):\n        for num_tasks in range(1, 20, 2):\n          
  self.sch.add_task(worker=WORKER, task_id=str(num_tasks), status=PENDING)\n            self.sch.add_task(worker=WORKER, task_id=str(num_tasks + 1), status=PENDING)\n            self.sch.add_task(worker=WORKER, task_id=str(num_tasks), status=DONE)\n            self.sch.add_task(worker=WORKER, task_id=str(num_tasks + 1), status=DISABLED)\n        expected = {\n            \"n_pending_tasks\": 0,\n            \"n_unique_pending\": 0,\n            \"n_pending_last_scheduled\": 0,\n            \"running_tasks\": [],\n            \"worker_state\": \"active\",\n        }\n        self.assertEqual(expected, self.sch.count_pending(WORKER))\n\n    def test_count_pending_on_disabled_worker(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.add_task(worker=\"other\", task_id=\"B\")  # needed to trigger right get_tasks code path\n        self.assertEqual(1, self.sch.count_pending(WORKER)[\"n_pending_tasks\"])\n        self.sch.disable_worker(WORKER)\n        self.assertEqual(0, self.sch.count_pending(WORKER)[\"n_pending_tasks\"])\n\n    def test_count_pending_do_not_count_upstream_disabled(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=PENDING)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", status=DISABLED)\n        self.sch.add_task(worker=WORKER, task_id=\"C\", status=PENDING, deps=[\"A\", \"B\"])\n        expected = {\n            \"n_pending_tasks\": 1,\n            \"n_unique_pending\": 1,\n            \"n_pending_last_scheduled\": 1,\n            \"running_tasks\": [],\n            \"worker_state\": \"active\",\n        }\n        self.assertEqual(expected, self.sch.count_pending(WORKER))\n\n    def test_count_pending_count_upstream_failed(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=PENDING)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", status=PENDING, deps=[\"A\"])\n        expected = {\n           
 \"n_pending_tasks\": 2,\n            \"n_unique_pending\": 2,\n            \"n_pending_last_scheduled\": 2,\n            \"running_tasks\": [],\n            \"worker_state\": \"active\",\n        }\n        self.assertEqual(expected, self.sch.count_pending(WORKER))\n\n    def test_count_pending_missing_worker(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=PENDING)\n        expected = {\n            \"n_pending_tasks\": 0,\n            \"n_unique_pending\": 0,\n            \"n_pending_last_scheduled\": 0,\n            \"running_tasks\": [],\n            \"worker_state\": \"active\",\n        }\n        self.assertEqual(expected, self.sch.count_pending(\"other_worker\"))\n\n    def test_count_pending_uniques(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=PENDING)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", status=PENDING)\n        self.sch.add_task(worker=WORKER, task_id=\"C\", status=PENDING)\n\n        self.sch.add_task(worker=\"other_worker\", task_id=\"A\", status=PENDING)\n\n        expected = {\n            \"n_pending_tasks\": 3,\n            \"n_unique_pending\": 2,\n            \"n_pending_last_scheduled\": 2,\n            \"running_tasks\": [],\n            \"worker_state\": \"active\",\n        }\n        self.assertEqual(expected, self.sch.count_pending(WORKER))\n\n    def test_count_pending_last_scheduled(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=PENDING)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", status=PENDING)\n        self.sch.add_task(worker=WORKER, task_id=\"C\", status=PENDING)\n\n        self.sch.add_task(worker=\"other_worker\", task_id=\"A\", status=PENDING)\n        self.sch.add_task(worker=\"other_worker\", task_id=\"B\", status=PENDING)\n        self.sch.add_task(worker=\"other_worker\", task_id=\"C\", status=PENDING)\n\n        expected = {\n            \"n_pending_tasks\": 3,\n            \"n_unique_pending\": 0,\n            
\"n_pending_last_scheduled\": 0,\n            \"running_tasks\": [],\n            \"worker_state\": \"active\",\n        }\n        self.assertEqual(expected, self.sch.count_pending(WORKER))\n\n        expected_other_worker = {\n            \"n_pending_tasks\": 3,\n            \"n_unique_pending\": 0,\n            \"n_pending_last_scheduled\": 3,\n            \"running_tasks\": [],\n            \"worker_state\": \"active\",\n        }\n        self.assertEqual(expected_other_worker, self.sch.count_pending(\"other_worker\"))\n\n    def test_count_pending_disabled_worker(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=PENDING)\n\n        expected_active_state = {\n            \"n_pending_tasks\": 1,\n            \"n_unique_pending\": 1,\n            \"n_pending_last_scheduled\": 1,\n            \"running_tasks\": [],\n            \"worker_state\": \"active\",\n        }\n        self.assertEqual(expected_active_state, self.sch.count_pending(worker=WORKER))\n\n        expected_disabled_state = {\n            \"n_pending_tasks\": 0,\n            \"n_unique_pending\": 0,\n            \"n_pending_last_scheduled\": 0,\n            \"running_tasks\": [],\n            \"worker_state\": \"disabled\",\n        }\n        self.sch.disable_worker(worker=WORKER)\n        self.assertEqual(expected_disabled_state, self.sch.count_pending(worker=WORKER))\n\n    def test_count_pending_running_tasks(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=PENDING)\n        self.assertEqual(\"A\", self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n        expected_active_state = {\n            \"n_pending_tasks\": 0,\n            \"n_unique_pending\": 0,\n            \"n_pending_last_scheduled\": 0,\n            \"running_tasks\": [{\"task_id\": \"A\", \"worker\": \"myworker\"}],\n            \"worker_state\": \"active\",\n        }\n        self.assertEqual(expected_active_state, self.sch.count_pending(worker=WORKER))\n\n    def 
test_scheduler_resources_none_allow_one(self):\n        self.sch.add_task(worker=\"X\", task_id=\"A\", resources={\"R1\": 1})\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], \"A\")\n\n    def test_scheduler_resources_none_disallow_two(self):\n        self.sch.add_task(worker=\"X\", task_id=\"A\", resources={\"R1\": 2})\n        self.assertFalse(self.sch.get_work(worker=\"X\")[\"task_id\"], \"A\")\n\n    def test_scheduler_with_insufficient_resources(self):\n        self.sch.add_task(worker=\"X\", task_id=\"A\", resources={\"R1\": 3})\n        self.sch.update_resources(R1=2)\n        self.assertFalse(self.sch.get_work(worker=\"X\")[\"task_id\"])\n\n    def test_scheduler_with_sufficient_resources(self):\n        self.sch.add_task(worker=\"X\", task_id=\"A\", resources={\"R1\": 3})\n        self.sch.update_resources(R1=3)\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], \"A\")\n\n    def test_scheduler_with_resources_used(self):\n        self.sch.add_task(worker=\"X\", task_id=\"A\", resources={\"R1\": 1})\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], \"A\")\n\n        self.sch.add_task(worker=\"Y\", task_id=\"B\", resources={\"R1\": 1})\n        self.sch.update_resources(R1=1)\n        self.assertFalse(self.sch.get_work(worker=\"Y\")[\"task_id\"])\n\n    def test_scheduler_overprovisioned_on_other_resource(self):\n        self.sch.add_task(worker=\"X\", task_id=\"A\", resources={\"R1\": 2})\n        self.sch.update_resources(R1=2)\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], \"A\")\n\n        self.sch.add_task(worker=\"Y\", task_id=\"B\", resources={\"R2\": 2})\n        self.sch.update_resources(R1=1, R2=2)\n        self.assertEqual(self.sch.get_work(worker=\"Y\")[\"task_id\"], \"B\")\n\n    def test_scheduler_with_priority_and_competing_resources(self):\n        self.sch.add_task(worker=\"X\", task_id=\"A\")\n        
self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], \"A\")\n\n        self.sch.add_task(worker=\"X\", task_id=\"B\", resources={\"R\": 1}, priority=10)\n        self.sch.add_task(worker=\"Y\", task_id=\"C\", resources={\"R\": 1}, priority=1)\n        self.sch.update_resources(R=1)\n        self.assertFalse(self.sch.get_work(worker=\"Y\")[\"task_id\"])\n\n        self.sch.add_task(worker=\"Y\", task_id=\"D\", priority=0)\n        self.assertEqual(self.sch.get_work(worker=\"Y\")[\"task_id\"], \"D\")\n\n    def test_do_not_lock_resources_when_not_ready(self):\n        \"\"\"Test to make sure that resources won't go unused waiting on workers\"\"\"\n        self.sch.add_task(worker=\"X\", task_id=\"A\", priority=10)\n        self.sch.add_task(worker=\"X\", task_id=\"B\", resources={\"R\": 1}, priority=5)\n        self.sch.add_task(worker=\"Y\", task_id=\"C\", resources={\"R\": 1}, priority=1)\n\n        self.sch.update_resources(R=1)\n        self.sch.add_worker(\"X\", [(\"workers\", 1)])\n        self.assertEqual(\"C\", self.sch.get_work(worker=\"Y\")[\"task_id\"])\n\n    def test_lock_resources_when_one_of_multiple_workers_is_ready(self):\n        self.sch.get_work(worker=\"X\")  # indicate to the scheduler that X is active\n        self.sch.add_task(worker=\"X\", task_id=\"A\", priority=10)\n        self.sch.add_task(worker=\"X\", task_id=\"B\", resources={\"R\": 1}, priority=5)\n        self.sch.add_task(worker=\"Y\", task_id=\"C\", resources={\"R\": 1}, priority=1)\n\n        self.sch.update_resources(R=1)\n        self.sch.add_worker(\"X\", [(\"workers\", 2)])\n        self.sch.add_worker(\"Y\", [])\n        self.assertFalse(self.sch.get_work(worker=\"Y\")[\"task_id\"])\n\n    def test_do_not_lock_resources_while_running_higher_priority(self):\n        \"\"\"Test to make sure that resources won't go unused waiting on workers\"\"\"\n        self.sch.add_task(worker=\"X\", task_id=\"A\", priority=10)\n        self.sch.add_task(worker=\"X\", task_id=\"B\", 
resources={\"R\": 1}, priority=5)\n        self.sch.add_task(worker=\"Y\", task_id=\"C\", resources={\"R\": 1}, priority=1)\n\n        self.sch.update_resources(R=1)\n        self.sch.add_worker(\"X\", [(\"workers\", 1)])\n        self.assertEqual(\"A\", self.sch.get_work(worker=\"X\")[\"task_id\"])\n        self.assertEqual(\"C\", self.sch.get_work(worker=\"Y\")[\"task_id\"])\n\n    def test_lock_resources_while_running_lower_priority(self):\n        \"\"\"Make sure resources will be made available while working on lower priority tasks\"\"\"\n        self.sch.add_task(worker=\"X\", task_id=\"A\", priority=4)\n        self.assertEqual(\"A\", self.sch.get_work(worker=\"X\")[\"task_id\"])\n        self.sch.add_task(worker=\"X\", task_id=\"B\", resources={\"R\": 1}, priority=5)\n        self.sch.add_task(worker=\"Y\", task_id=\"C\", resources={\"R\": 1}, priority=1)\n\n        self.sch.update_resources(R=1)\n        self.sch.add_worker(\"X\", [(\"workers\", 1)])\n        self.assertFalse(self.sch.get_work(worker=\"Y\")[\"task_id\"])\n\n    def test_lock_resources_for_second_worker(self):\n        self.sch.get_work(worker=\"Y\")  # indicate to the scheduler that Y is active\n        self.sch.add_task(worker=\"X\", task_id=\"A\", resources={\"R\": 1})\n        self.sch.add_task(worker=\"X\", task_id=\"B\", resources={\"R\": 1})\n        self.sch.add_task(worker=\"Y\", task_id=\"C\", resources={\"R\": 1}, priority=10)\n\n        self.sch.add_worker(\"X\", {\"workers\": 2})\n        self.sch.add_worker(\"Y\", {\"workers\": 1})\n        self.sch.update_resources(R=2)\n\n        self.assertEqual(\"A\", self.sch.get_work(worker=\"X\")[\"task_id\"])\n        self.assertFalse(self.sch.get_work(worker=\"X\")[\"task_id\"])\n\n    def test_can_work_on_lower_priority_while_waiting_for_resources(self):\n        self.sch.add_task(worker=\"X\", task_id=\"A\", resources={\"R\": 1}, priority=0)\n        self.assertEqual(\"A\", self.sch.get_work(worker=\"X\")[\"task_id\"])\n\n        
self.sch.add_task(worker=\"Y\", task_id=\"B\", resources={\"R\": 1}, priority=10)\n        self.sch.add_task(worker=\"Y\", task_id=\"C\", priority=0)\n        self.sch.update_resources(R=1)\n\n        self.assertEqual(\"C\", self.sch.get_work(worker=\"Y\")[\"task_id\"])\n\n    def validate_resource_count(self, name, count):\n        counts = {resource[\"name\"]: resource[\"num_total\"] for resource in self.sch.resource_list()}\n        self.assertEqual(count, counts.get(name))\n\n    def test_update_new_resource(self):\n        self.validate_resource_count(\"new_resource\", None)  # new_resource is not in the scheduler\n        self.sch.update_resource(\"new_resource\", 1)\n        self.validate_resource_count(\"new_resource\", 1)\n\n    def test_update_existing_resource(self):\n        self.sch.update_resource(\"new_resource\", 1)\n        self.sch.update_resource(\"new_resource\", 2)\n        self.validate_resource_count(\"new_resource\", 2)\n\n    def test_disable_existing_resource(self):\n        self.sch.update_resource(\"new_resource\", 1)\n        self.sch.update_resource(\"new_resource\", 0)\n        self.validate_resource_count(\"new_resource\", 0)\n\n    def test_attempt_to_set_resource_to_negative_value(self):\n        self.sch.update_resource(\"new_resource\", 1)\n        self.assertFalse(self.sch.update_resource(\"new_resource\", -1))\n        self.validate_resource_count(\"new_resource\", 1)\n\n    def test_attempt_to_set_resource_to_non_integer(self):\n        self.sch.update_resource(\"new_resource\", 1)\n        self.assertFalse(self.sch.update_resource(\"new_resource\", 1.3))\n        self.assertFalse(self.sch.update_resource(\"new_resource\", \"1\"))\n        self.assertFalse(self.sch.update_resource(\"new_resource\", None))\n        self.validate_resource_count(\"new_resource\", 1)\n\n    def test_priority_update_with_pruning(self):\n        self.setTime(0)\n        self.sch.add_task(task_id=\"A\", worker=\"X\")\n\n        self.setTime(50)  # 
after worker disconnects\n        self.sch.prune()\n        self.sch.add_task(task_id=\"B\", deps=[\"A\"], worker=\"X\")\n\n        self.setTime(2000)  # after remove for task A\n        self.sch.prune()\n\n        # Here task A that B depends on is missing\n        self.sch.add_task(worker=WORKER, task_id=\"C\", deps=[\"B\"], priority=100)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", deps=[\"A\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"D\", priority=10)\n\n        self.check_task_order(\"ABCD\")\n\n    def test_update_resources(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", deps=[\"B\"])\n        self.sch.add_task(worker=WORKER, task_id=\"B\", resources={\"r\": 2})\n        self.sch.update_resources(r=1)\n\n        # B requires too many resources, we can't schedule\n        self.check_task_order([])\n\n        self.sch.add_task(worker=WORKER, task_id=\"B\", resources={\"r\": 1})\n\n        # now we have enough resources\n        self.check_task_order([\"B\", \"A\"])\n\n    def test_handle_multiple_resources(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", resources={\"r1\": 1, \"r2\": 1})\n        self.sch.add_task(worker=WORKER, task_id=\"B\", resources={\"r1\": 1, \"r2\": 1})\n        self.sch.add_task(worker=WORKER, task_id=\"C\", resources={\"r1\": 1})\n        self.sch.update_resources(r1=2, r2=1)\n\n        self.assertEqual(\"A\", self.sch.get_work(worker=WORKER)[\"task_id\"])\n        self.check_task_order(\"C\")\n\n    def test_single_resource_lock(self):\n        self.sch.add_task(worker=\"X\", task_id=\"A\", resources={\"r\": 1})\n        self.assertEqual(\"A\", self.sch.get_work(worker=\"X\")[\"task_id\"])\n\n        self.sch.add_task(worker=WORKER, task_id=\"B\", resources={\"r\": 2}, priority=10)\n        self.sch.add_task(worker=WORKER, task_id=\"C\", resources={\"r\": 1})\n        self.sch.update_resources(r=2)\n\n        # Should wait 
for 2 units of r to be available for B before scheduling C\n        self.check_task_order([])\n\n    def test_no_lock_if_too_many_resources_required(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", resources={\"r\": 2}, priority=10)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", resources={\"r\": 1})\n        self.sch.update_resources(r=1)\n        self.check_task_order(\"B\")\n\n    def test_multiple_resources_lock(self):\n        self.sch.get_work(worker=\"X\")  # indicate to the scheduler that X is active\n        self.sch.add_task(worker=\"X\", task_id=\"A\", resources={\"r1\": 1, \"r2\": 1}, priority=10)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", resources={\"r2\": 1})\n        self.sch.add_task(worker=WORKER, task_id=\"C\", resources={\"r1\": 1})\n        self.sch.update_resources(r1=1, r2=1)\n\n        # should preserve both resources for worker 'X'\n        self.check_task_order([])\n\n    def test_multiple_resources_no_lock(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", resources={\"r1\": 1}, priority=10)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", resources={\"r1\": 1, \"r2\": 1}, priority=10)\n        self.sch.add_task(worker=WORKER, task_id=\"C\", resources={\"r2\": 1})\n        self.sch.update_resources(r1=1, r2=2)\n\n        self.assertEqual(\"A\", self.sch.get_work(worker=WORKER)[\"task_id\"])\n        # C doesn't block B, so it can go first\n        self.check_task_order(\"C\")\n\n    def test_do_not_allow_stowaway_resources(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A1\", resources={\"r1\": 1}, family=\"A\", params={\"a\": \"1\"}, batchable=True, priority=1)\n        self.sch.add_task(worker=WORKER, task_id=\"A2\", resources={\"r1\": 2}, family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A3\", resources={\"r2\": 1}, 
family=\"A\", params={\"a\": \"3\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A4\", resources={\"r1\": 1}, family=\"A\", params={\"a\": \"4\"}, batchable=True)\n        self.assertEqual({\"A1\", \"A4\"}, set(self.sch.get_work(worker=WORKER)[\"batch_task_ids\"]))\n\n    def test_do_not_allow_same_resources(self):\n        self.sch.add_task_batcher(worker=WORKER, task_family=\"A\", batched_args=[\"a\"])\n        self.sch.add_task(worker=WORKER, task_id=\"A1\", resources={\"r1\": 1}, family=\"A\", params={\"a\": \"1\"}, batchable=True, priority=1)\n        self.sch.add_task(worker=WORKER, task_id=\"A2\", resources={\"r1\": 1}, family=\"A\", params={\"a\": \"2\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A3\", resources={\"r1\": 1}, family=\"A\", params={\"a\": \"3\"}, batchable=True)\n        self.sch.add_task(worker=WORKER, task_id=\"A4\", resources={\"r1\": 1}, family=\"A\", params={\"a\": \"4\"}, batchable=True)\n        self.assertEqual({\"A1\", \"A2\", \"A3\", \"A4\"}, set(self.sch.get_work(worker=WORKER)[\"batch_task_ids\"]))\n\n    def test_change_resources_on_running_task(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A1\", resources={\"a\": 1}, priority=10)\n        self.sch.add_task(worker=WORKER, task_id=\"A2\", resources={\"a\": 1}, priority=1)\n\n        self.assertEqual(\"A1\", self.sch.get_work(worker=WORKER)[\"task_id\"])\n        self.assertIsNone(self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n        # switch the resource of the running task\n        self.sch.add_task(worker=\"other\", task_id=\"A1\", resources={\"b\": 1}, priority=1)\n\n        # the running task should be using the resource it had when it started running\n        self.assertIsNone(self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n    def test_interleave_resource_change_and_get_work(self):\n        for i in range(100):\n            self.sch.add_task(worker=WORKER, task_id=\"A{}\".format(i), resources={\"a\": 1}, 
priority=100 - i)\n\n        for i in range(100):\n            self.sch.get_work(worker=WORKER)\n            self.sch.add_task(worker=\"other\", task_id=\"A{}\".format(i), resources={\"b\": 1}, priority=100 - i)\n\n        # we should only see 1 task  per resource rather than all 100 tasks running\n        self.assertEqual(2, len(self.sch.task_list(RUNNING, \"\")))\n\n    def test_assistant_has_different_resources_than_scheduled_max_task_id(self):\n        self.sch.add_task_batcher(worker=\"assistant\", task_family=\"A\", batched_args=[\"a\"], max_batch_size=2)\n        self.sch.add_task(worker=WORKER, task_id=\"A1\", resources={\"a\": 1}, family=\"A\", params={\"a\": \"1\"}, batchable=True, priority=1)\n        self.sch.add_task(worker=WORKER, task_id=\"A2\", resources={\"a\": 1}, family=\"A\", params={\"a\": \"2\"}, batchable=True, priority=2)\n        self.sch.add_task(worker=WORKER, task_id=\"A3\", resources={\"a\": 1}, family=\"A\", params={\"a\": \"3\"}, batchable=True, priority=3)\n\n        result = self.sch.get_work(worker=\"assistant\", assistant=True)\n        self.assertEqual({\"A3\", \"A2\"}, set(result[\"batch_task_ids\"]))\n        self.sch.add_task(worker=\"assistant\", task_id=\"A3\", status=RUNNING, batch_id=result[\"batch_id\"], resources={\"b\": 1})\n\n        # the assistant changed the status, but only after it was batch running\n        self.assertIsNone(self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n    def test_assistant_has_different_resources_than_scheduled_new_task_id(self):\n        self.sch.add_task_batcher(worker=\"assistant\", task_family=\"A\", batched_args=[\"a\"], max_batch_size=2)\n        self.sch.add_task(worker=WORKER, task_id=\"A1\", resources={\"a\": 1}, family=\"A\", params={\"a\": \"1\"}, batchable=True, priority=1)\n        self.sch.add_task(worker=WORKER, task_id=\"A2\", resources={\"a\": 1}, family=\"A\", params={\"a\": \"2\"}, batchable=True, priority=2)\n        self.sch.add_task(worker=WORKER, task_id=\"A3\", 
resources={\"a\": 1}, family=\"A\", params={\"a\": \"3\"}, batchable=True, priority=3)\n\n        result = self.sch.get_work(worker=\"assistant\", assistant=True)\n        self.assertEqual({\"A3\", \"A2\"}, set(result[\"batch_task_ids\"]))\n        self.sch.add_task(worker=\"assistant\", task_id=\"A_2_3\", status=RUNNING, batch_id=result[\"batch_id\"], resources={\"b\": 1})\n\n        # the assistant changed the status, but only after it was batch running\n        self.assertIsNone(self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n    def test_assistant_has_different_resources_than_scheduled_max_task_id_during_scheduling(self):\n        self.sch.add_task_batcher(worker=\"assistant\", task_family=\"A\", batched_args=[\"a\"], max_batch_size=2)\n        self.sch.add_task(worker=WORKER, task_id=\"A1\", resources={\"a\": 1}, family=\"A\", params={\"a\": \"1\"}, batchable=True, priority=1)\n        self.sch.add_task(worker=WORKER, task_id=\"A2\", resources={\"a\": 1}, family=\"A\", params={\"a\": \"2\"}, batchable=True, priority=2)\n        self.sch.add_task(worker=WORKER, task_id=\"A3\", resources={\"a\": 1}, family=\"A\", params={\"a\": \"3\"}, batchable=True, priority=3)\n\n        result = self.sch.get_work(worker=\"assistant\", assistant=True)\n        self.assertEqual({\"A3\", \"A2\"}, set(result[\"batch_task_ids\"]))\n        self.sch.add_task(worker=WORKER, task_id=\"A2\", resources={\"b\": 1}, family=\"A\", params={\"a\": \"2\"}, batchable=True, priority=2)\n        self.sch.add_task(worker=WORKER, task_id=\"A3\", resources={\"b\": 1}, family=\"A\", params={\"a\": \"3\"}, batchable=True, priority=3)\n        self.sch.add_task(worker=\"assistant\", task_id=\"A3\", status=RUNNING, batch_id=result[\"batch_id\"], resources={\"b\": 1})\n\n        # the statuses changed, but only after they were batch running\n        self.assertIsNone(self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n    def 
test_assistant_has_different_resources_than_scheduled_new_task_id_during_scheduling(self):\n        self.sch.add_task_batcher(worker=\"assistant\", task_family=\"A\", batched_args=[\"a\"], max_batch_size=2)\n        self.sch.add_task(worker=WORKER, task_id=\"A1\", resources={\"a\": 1}, family=\"A\", params={\"a\": \"1\"}, batchable=True, priority=1)\n        self.sch.add_task(worker=WORKER, task_id=\"A2\", resources={\"a\": 1}, family=\"A\", params={\"a\": \"2\"}, batchable=True, priority=2)\n        self.sch.add_task(worker=WORKER, task_id=\"A3\", resources={\"a\": 1}, family=\"A\", params={\"a\": \"3\"}, batchable=True, priority=3)\n\n        result = self.sch.get_work(worker=\"assistant\", assistant=True)\n        self.assertEqual({\"A3\", \"A2\"}, set(result[\"batch_task_ids\"]))\n        self.sch.add_task(worker=WORKER, task_id=\"A2\", resources={\"b\": 1}, family=\"A\", params={\"a\": \"2\"}, batchable=True, priority=2)\n        self.sch.add_task(worker=WORKER, task_id=\"A3\", resources={\"b\": 1}, family=\"A\", params={\"a\": \"3\"}, batchable=True, priority=3)\n        self.sch.add_task(worker=\"assistant\", task_id=\"A_2_3\", status=RUNNING, batch_id=result[\"batch_id\"], resources={\"b\": 1})\n\n        # the statuses changed, but only after they were batch running\n        self.assertIsNone(self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n    def test_allow_resource_use_while_scheduling(self):\n        self.sch.update_resources(r1=1)\n        self.sch.add_task(worker=\"SCHEDULING\", task_id=\"A\", resources={\"r1\": 1}, priority=10)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", resources={\"r1\": 1}, priority=1)\n        self.assertEqual(\"B\", self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n    def test_stop_locking_resource_for_uninterested_worker(self):\n        self.setTime(0)\n        self.sch.update_resources(r1=1)\n        self.assertIsNone(self.sch.get_work(worker=WORKER)[\"task_id\"])\n        self.sch.add_task(worker=WORKER, 
task_id=\"A\", resources={\"r1\": 1}, priority=10)\n        self.sch.add_task(worker=\"LOW_PRIO\", task_id=\"B\", resources={\"r1\": 1}, priority=1)\n        self.assertIsNone(self.sch.get_work(worker=\"LOW_PRIO\")[\"task_id\"])\n\n        self.setTime(120)\n        self.assertEqual(\"B\", self.sch.get_work(worker=\"LOW_PRIO\")[\"task_id\"])\n\n    def check_task_order(self, order):\n        for expected_id in order:\n            self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], expected_id)\n            self.sch.add_task(worker=WORKER, task_id=expected_id, status=DONE)\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], None)\n\n    def test_priorities(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", priority=10)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", priority=5)\n        self.sch.add_task(worker=WORKER, task_id=\"C\", priority=15)\n        self.sch.add_task(worker=WORKER, task_id=\"D\", priority=9)\n        self.check_task_order([\"C\", \"A\", \"D\", \"B\"])\n\n    def test_priorities_default_and_negative(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", priority=10)\n        self.sch.add_task(worker=WORKER, task_id=\"B\")\n        self.sch.add_task(worker=WORKER, task_id=\"C\", priority=15)\n        self.sch.add_task(worker=WORKER, task_id=\"D\", priority=-20)\n        self.sch.add_task(worker=WORKER, task_id=\"E\", priority=1)\n        self.check_task_order([\"C\", \"A\", \"E\", \"B\", \"D\"])\n\n    def test_priorities_and_dependencies(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", deps=[\"Z\"], priority=10)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", priority=5)\n        self.sch.add_task(worker=WORKER, task_id=\"C\", deps=[\"Z\"], priority=3)\n        self.sch.add_task(worker=WORKER, task_id=\"D\", priority=2)\n        self.sch.add_task(worker=WORKER, task_id=\"Z\", priority=1)\n        self.check_task_order([\"Z\", \"A\", \"B\", \"C\", \"D\"])\n\n 
   def test_priority_update_dependency_after_scheduling(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", priority=1)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", priority=5, deps=[\"A\"])\n        self.sch.add_task(worker=WORKER, task_id=\"C\", priority=10, deps=[\"B\"])\n        self.sch.add_task(worker=WORKER, task_id=\"D\", priority=6)\n        self.check_task_order([\"A\", \"B\", \"C\", \"D\"])\n\n    def test_disable(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        # should be disabled at this point\n        self.assertEqual(len(self.sch.task_list(\"DISABLED\", \"\")), 1)\n        self.assertEqual(len(self.sch.task_list(\"FAILED\", \"\")), 0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], None)\n\n    def test_disable_and_reenable(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        # should be disabled at this point\n        self.assertEqual(len(self.sch.task_list(\"DISABLED\", \"\")), 1)\n        self.assertEqual(len(self.sch.task_list(\"FAILED\", \"\")), 0)\n\n        self.sch.re_enable_task(\"A\")\n\n        # should be enabled at this point\n        self.assertEqual(len(self.sch.task_list(\"DISABLED\", \"\")), 0)\n        self.assertEqual(len(self.sch.task_list(\"FAILED\", \"\")), 1)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n\n    def 
test_disable_and_reenable_and_disable_again(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        # should be disabled at this point\n        self.assertEqual(len(self.sch.task_list(\"DISABLED\", \"\")), 1)\n        self.assertEqual(len(self.sch.task_list(\"FAILED\", \"\")), 0)\n\n        self.sch.re_enable_task(\"A\")\n\n        # should be enabled at this point\n        self.assertEqual(len(self.sch.task_list(\"DISABLED\", \"\")), 0)\n        self.assertEqual(len(self.sch.task_list(\"FAILED\", \"\")), 1)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        # should be still enabled\n        self.assertEqual(len(self.sch.task_list(\"DISABLED\", \"\")), 0)\n        self.assertEqual(len(self.sch.task_list(\"FAILED\", \"\")), 1)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        # should be disabled now\n        self.assertEqual(len(self.sch.task_list(\"DISABLED\", \"\")), 1)\n        self.assertEqual(len(self.sch.task_list(\"FAILED\", \"\")), 0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], None)\n\n    def test_disable_and_done(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        
self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        # should be disabled at this point\n        self.assertEqual(len(self.sch.task_list(\"DISABLED\", \"\")), 1)\n        self.assertEqual(len(self.sch.task_list(\"FAILED\", \"\")), 0)\n\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=DONE)\n\n        # should be enabled at this point\n        self.assertEqual(len(self.sch.task_list(\"DISABLED\", \"\")), 0)\n        self.assertEqual(len(self.sch.task_list(\"DONE\", \"\")), 1)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n\n    def test_automatic_re_enable(self):\n        self.sch = Scheduler(retry_count=2, disable_persist=100)\n        self.setTime(0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        # should be disabled now\n        self.assertEqual(DISABLED, self.sch.task_list(\"\", \"\")[\"A\"][\"status\"])\n\n        # re-enables after 100 seconds\n        self.setTime(101)\n        self.assertEqual(FAILED, self.sch.task_list(\"\", \"\")[\"A\"][\"status\"])\n\n    def test_automatic_re_enable_with_one_failure_allowed(self):\n        self.sch = Scheduler(retry_count=1, disable_persist=100)\n        self.setTime(0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        # should be disabled now\n        self.assertEqual(DISABLED, self.sch.task_list(\"\", \"\")[\"A\"][\"status\"])\n\n        # re-enables after 100 seconds\n        self.setTime(101)\n        self.assertEqual(FAILED, self.sch.task_list(\"\", \"\")[\"A\"][\"status\"])\n\n    def test_no_automatic_re_enable_after_manual_disable(self):\n        self.sch = Scheduler(disable_persist=100)\n        self.setTime(0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=DISABLED)\n\n        # should be disabled now\n        
self.assertEqual(DISABLED, self.sch.task_list(\"\", \"\")[\"A\"][\"status\"])\n\n        # should not re-enable after 100 seconds\n        self.setTime(101)\n        self.assertEqual(DISABLED, self.sch.task_list(\"\", \"\")[\"A\"][\"status\"])\n\n    def test_no_automatic_re_enable_after_auto_then_manual_disable(self):\n        self.sch = Scheduler(retry_count=2, disable_persist=100)\n        self.setTime(0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        # should be disabled now\n        self.assertEqual(DISABLED, self.sch.task_list(\"\", \"\")[\"A\"][\"status\"])\n\n        # should remain disabled once set\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=DISABLED)\n        self.assertEqual(DISABLED, self.sch.task_list(\"\", \"\")[\"A\"][\"status\"])\n\n        # should not re-enable after 100 seconds\n        self.setTime(101)\n        self.assertEqual(DISABLED, self.sch.task_list(\"\", \"\")[\"A\"][\"status\"])\n\n    def test_disable_by_worker(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=DISABLED)\n        self.assertEqual(len(self.sch.task_list(\"DISABLED\", \"\")), 1)\n\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n\n        # should be enabled at this point\n        self.assertEqual(len(self.sch.task_list(\"DISABLED\", \"\")), 0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n\n    def test_disable_worker(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.disable_worker(worker=WORKER)\n        work = self.sch.get_work(worker=WORKER)\n        self.assertEqual(0, work[\"n_unique_pending\"])\n        self.assertEqual(0, work[\"n_pending_tasks\"])\n        self.assertIsNone(work[\"task_id\"])\n\n    def test_pause_work(self):\n        self.sch.add_task(worker=WORKER, 
task_id=\"A\")\n\n        self.sch.pause()\n        self.assertEqual(\n            {\n                \"n_pending_last_scheduled\": 1,\n                \"n_unique_pending\": 1,\n                \"n_pending_tasks\": 1,\n                \"running_tasks\": [],\n                \"task_id\": None,\n                \"worker_state\": \"active\",\n            },\n            self.sch.get_work(worker=WORKER),\n        )\n\n        self.sch.unpause()\n        self.assertEqual(\"A\", self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n    def test_is_paused(self):\n        self.assertFalse(self.sch.is_paused()[\"paused\"])\n        self.sch.pause()\n        self.assertTrue(self.sch.is_paused()[\"paused\"])\n        self.sch.unpause()\n        self.assertFalse(self.sch.is_paused()[\"paused\"])\n\n    def test_disable_worker_leaves_jobs_running(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.get_work(worker=WORKER)\n\n        self.sch.disable_worker(worker=WORKER)\n        self.assertEqual([\"A\"], list(self.sch.task_list(\"RUNNING\", \"\").keys()))\n        self.assertEqual([\"A\"], list(self.sch.worker_list()[0][\"running\"].keys()))\n\n    def test_disable_worker_cannot_pick_up_failed_jobs(self):\n        self.setTime(0)\n\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.get_work(worker=WORKER)\n        self.sch.disable_worker(worker=WORKER)\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        # increase time and prune to make the job pending again\n        self.setTime(1000)\n        self.sch.ping(worker=WORKER)\n        self.sch.prune()\n\n        # we won't try the job again\n        self.assertIsNone(self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n        # not even if other stuff is pending, changing the pending tasks code path\n        self.sch.add_task(worker=\"other_worker\", task_id=\"B\")\n        self.assertIsNone(self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n    def 
test_disable_worker_cannot_continue_scheduling(self):\n        self.sch.disable_worker(worker=WORKER)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertIsNone(self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n    def test_disable_worker_cannot_add_tasks(self):\n        \"\"\"\n        Verify that a disabled worker cannot add tasks\n        \"\"\"\n        self.sch.disable_worker(worker=WORKER)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertIsNone(self.sch.get_work(worker=\"assistant\", assistant=True)[\"task_id\"])\n        self.sch.add_task(worker=\"third_enabled_worker\", task_id=\"A\")\n        self.assertIsNotNone(self.sch.get_work(worker=\"assistant\", assistant=True)[\"task_id\"])\n\n    def _test_disable_worker_helper(self, new_status, new_deps):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(\"A\", self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n        self.sch.disable_worker(worker=WORKER)\n        self.assertEqual([\"A\"], list(self.sch.task_list(\"RUNNING\", \"\").keys()))\n\n        for dep in new_deps:\n            self.sch.add_task(worker=WORKER, task_id=dep, status=\"PENDING\")\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=new_status, new_deps=new_deps)\n        self.assertFalse(self.sch.task_list(\"RUNNING\", \"\").keys())\n        self.assertEqual([\"A\"], list(self.sch.task_list(new_status, \"\").keys()))\n\n        self.assertIsNone(self.sch.get_work(worker=WORKER)[\"task_id\"])\n        for task in self.sch.task_list(\"\", \"\").values():\n            self.assertFalse(task[\"workers\"])\n\n    def test_disable_worker_can_finish_task(self):\n        self._test_disable_worker_helper(new_status=DONE, new_deps=[])\n\n    def test_disable_worker_can_fail_task(self):\n        self._test_disable_worker_helper(new_status=FAILED, new_deps=[])\n\n    def test_disable_worker_stays_disabled_on_new_deps(self):\n        
self._test_disable_worker_helper(new_status=\"PENDING\", new_deps=[\"B\", \"C\"])\n\n    def test_disable_worker_assistant_gets_no_task(self):\n        self.setTime(0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.add_worker(\"assistant\", [(\"assistant\", True)])\n        self.sch.ping(worker=\"assistant\")\n        self.sch.disable_worker(\"assistant\")\n        self.assertIsNone(self.sch.get_work(worker=\"assistant\", assistant=True)[\"task_id\"])\n        self.assertIsNotNone(self.sch.get_work(worker=WORKER)[\"task_id\"])\n\n    def test_prune_worker(self):\n        self.setTime(1)\n        self.sch.add_worker(worker=WORKER, info={})\n        self.setTime(10000)\n        self.sch.prune()\n        self.setTime(20000)\n        self.sch.prune()\n        self.assertFalse(self.sch.worker_list())\n\n    def test_task_list_beyond_limit(self):\n        sch = Scheduler(max_shown_tasks=3)\n        for c in \"ABCD\":\n            sch.add_task(worker=WORKER, task_id=c)\n        self.assertEqual(set(\"ABCD\"), set(sch.task_list(\"PENDING\", \"\", False).keys()))\n        self.assertEqual({\"num_tasks\": 4}, sch.task_list(\"PENDING\", \"\"))\n\n    def test_task_list_within_limit(self):\n        sch = Scheduler(max_shown_tasks=4)\n        for c in \"ABCD\":\n            sch.add_task(worker=WORKER, task_id=c)\n        self.assertEqual(set(\"ABCD\"), set(sch.task_list(\"PENDING\", \"\").keys()))\n\n    def test_task_lists_some_beyond_limit(self):\n        sch = Scheduler(max_shown_tasks=3)\n        for c in \"ABCD\":\n            sch.add_task(worker=WORKER, task_id=c, status=DONE)\n        for c in \"EFG\":\n            sch.add_task(worker=WORKER, task_id=c)\n        self.assertEqual(set(\"EFG\"), set(sch.task_list(\"PENDING\", \"\").keys()))\n        self.assertEqual({\"num_tasks\": 4}, sch.task_list(\"DONE\", \"\"))\n\n    def test_dynamic_shown_tasks_in_task_list(self):\n        sch = Scheduler(max_shown_tasks=3)\n        for task_id in 
\"ABCD\":\n            sch.add_task(worker=WORKER, task_id=task_id, status=DONE)\n        for task_id in \"EFG\":\n            sch.add_task(worker=WORKER, task_id=task_id)\n\n        self.assertEqual(set(\"EFG\"), set(sch.task_list(\"PENDING\", \"\").keys()))\n        self.assertEqual({\"num_tasks\": 3}, sch.task_list(\"PENDING\", \"\", max_shown_tasks=2))\n\n        self.assertEqual({\"num_tasks\": 4}, sch.task_list(\"DONE\", \"\"))\n        self.assertEqual(set(\"ABCD\"), set(sch.task_list(\"DONE\", \"\", max_shown_tasks=4).keys()))\n\n    def add_task(self, family, **params):\n        task_id = str(hash((family, str(params))))  # use an unhelpful task id\n        self.sch.add_task(worker=WORKER, family=family, params=params, task_id=task_id)\n        return task_id\n\n    def search_pending(self, term, expected_keys):\n        actual_keys = set(self.sch.task_list(\"PENDING\", \"\", search=term).keys())\n        self.assertEqual(expected_keys, actual_keys)\n\n    def test_task_list_filter_by_search_family_name(self):\n        task1 = self.add_task(\"MySpecialTask\")\n        task2 = self.add_task(\"OtherSpecialTask\")\n\n        self.search_pending(\"Special\", {task1, task2})\n        self.search_pending(\"Task\", {task1, task2})\n        self.search_pending(\"My\", {task1})\n        self.search_pending(\"Other\", {task2})\n\n    def test_task_list_filter_by_search_long_family_name(self):\n        task = self.add_task(\"TaskClassWithAVeryLongNameAndDistinctEndingUUDDLRLRAB\")\n        self.search_pending(\"UUDDLRLRAB\", {task})\n\n    def test_task_list_filter_by_param_name(self):\n        task1 = self.add_task(\"ClassA\", day=\"2016-02-01\")\n        task2 = self.add_task(\"ClassB\", hour=\"2016-02-01T12\")\n\n        self.search_pending(\"day\", {task1})\n        self.search_pending(\"hour\", {task2})\n\n    def test_task_list_filter_by_long_param_name(self):\n        task = self.add_task(\"ClassA\", 
a_very_long_param_name_ending_with_uuddlrlrab=\"2016-02-01\")\n\n        self.search_pending(\"uuddlrlrab\", {task})\n\n    def test_task_list_filter_by_param_value(self):\n        task1 = self.add_task(\"ClassA\", day=\"2016-02-01\")\n        task2 = self.add_task(\"ClassB\", hour=\"2016-02-01T12\")\n\n        self.search_pending(\"2016-02-01\", {task1, task2})\n        self.search_pending(\"T12\", {task2})\n\n    def test_task_list_filter_by_long_param_value(self):\n        task = self.add_task(\"ClassA\", param=\"a_very_long_param_value_ending_with_uuddlrlrab\")\n        self.search_pending(\"uuddlrlrab\", {task})\n\n    def test_task_list_filter_by_param_name_value_pair(self):\n        task = self.add_task(\"ClassA\", param=\"value\")\n        self.search_pending(\"param=value\", {task})\n\n    def test_task_list_does_not_filter_by_task_id(self):\n        task = self.add_task(\"Class\")\n        self.search_pending(task, set())\n\n    def test_task_list_filter_by_multiple_search_terms(self):\n        expected = self.add_task(\"ClassA\", day=\"2016-02-01\", num=\"5\")\n        self.add_task(\"ClassA\", day=\"2016-03-01\", num=\"5\")\n        self.add_task(\"ClassB\", day=\"2016-02-01\", num=\"5\")\n        self.add_task(\"ClassA\", day=\"2016-02-01\", val=\"5\")\n\n        self.search_pending(\"ClassA 2016-02-01 num\", {expected})\n        # ensure that the task search is case insensitive\n        self.search_pending(\"classa 2016-02-01 num\", {expected})\n\n    def test_upstream_beyond_limit(self):\n        sch = Scheduler(max_shown_tasks=3)\n        for i in range(4):\n            sch.add_task(worker=WORKER, family=\"Test\", params={\"p\": str(i)}, task_id=\"Test_%i\" % i)\n        self.assertEqual({\"num_tasks\": -1}, sch.task_list(\"PENDING\", \"FAILED\"))\n        self.assertEqual({\"num_tasks\": 4}, sch.task_list(\"PENDING\", \"\"))\n\n    def test_do_not_prune_on_beyond_limit_check(self):\n        sch = Scheduler(max_shown_tasks=3)\n        sch.prune = 
mock.Mock()\n        for i in range(4):\n            sch.add_task(worker=WORKER, family=\"Test\", params={\"p\": str(i)}, task_id=\"Test_%i\" % i)\n        self.assertEqual({\"num_tasks\": 4}, sch.task_list(\"PENDING\", \"\"))\n        sch.prune.assert_not_called()\n\n    def test_search_results_beyond_limit(self):\n        sch = Scheduler(max_shown_tasks=3)\n        for i in range(4):\n            sch.add_task(worker=WORKER, family=\"Test\", params={\"p\": str(i)}, task_id=\"Test_%i\" % i)\n        self.assertEqual({\"num_tasks\": 4}, sch.task_list(\"PENDING\", \"\", search=\"Test\"))\n        self.assertEqual([\"Test_0\"], list(sch.task_list(\"PENDING\", \"\", search=\"0\").keys()))\n\n    def test_priority_update_dependency_chain(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", priority=10, deps=[\"B\"])\n        self.sch.add_task(worker=WORKER, task_id=\"B\", priority=5, deps=[\"C\"])\n        self.sch.add_task(worker=WORKER, task_id=\"C\", priority=1)\n        self.sch.add_task(worker=WORKER, task_id=\"D\", priority=6)\n        self.check_task_order([\"C\", \"B\", \"A\", \"D\"])\n\n    def test_priority_no_decrease_with_multiple_updates(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", priority=1)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", priority=10, deps=[\"A\"])\n        self.sch.add_task(worker=WORKER, task_id=\"C\", priority=5, deps=[\"A\"])\n        self.sch.add_task(worker=WORKER, task_id=\"D\", priority=6)\n        self.check_task_order([\"A\", \"B\", \"D\", \"C\"])\n\n    def test_unique_tasks(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"B\")\n        self.sch.add_task(worker=WORKER, task_id=\"C\")\n        self.sch.add_task(worker=WORKER + \"_2\", task_id=\"B\")\n\n        response = self.sch.get_work(worker=WORKER)\n        self.assertEqual(3, response[\"n_pending_tasks\"])\n        self.assertEqual(2, 
response[\"n_unique_pending\"])\n\n    def test_pending_downstream_disable(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=DISABLED)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", deps=(\"A\",))\n        self.sch.add_task(worker=WORKER, task_id=\"C\", deps=(\"B\",))\n\n        response = self.sch.get_work(worker=WORKER)\n        self.assertTrue(response[\"task_id\"] is None)\n        self.assertEqual(0, response[\"n_pending_tasks\"])\n        self.assertEqual(0, response[\"n_unique_pending\"])\n\n    def test_pending_downstream_failure(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.sch.add_task(worker=WORKER, task_id=\"B\", deps=(\"A\",))\n        self.sch.add_task(worker=WORKER, task_id=\"C\", deps=(\"B\",))\n\n        response = self.sch.get_work(worker=WORKER)\n        self.assertTrue(response[\"task_id\"] is None)\n        self.assertEqual(2, response[\"n_pending_tasks\"])\n        self.assertEqual(2, response[\"n_unique_pending\"])\n\n    def test_task_list_no_deps(self):\n        self.sch.add_task(worker=WORKER, task_id=\"B\", deps=(\"A\",))\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        task_list = self.sch.task_list(\"PENDING\", \"\")\n        self.assertFalse(\"deps\" in task_list[\"A\"])\n\n    def test_task_first_failure_time(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        test_task = self.sch._state.get_task(\"A\")\n        self.assertIsNone(test_task.first_failure_time)\n\n        time_before_failure = time.time()\n        test_task.add_failure()\n        time_after_failure = time.time()\n\n        self.assertLessEqual(time_before_failure, test_task.first_failure_time)\n        self.assertGreaterEqual(time_after_failure, test_task.first_failure_time)\n\n    def test_task_first_failure_time_remains_constant(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        test_task = self.sch._state.get_task(\"A\")\n        
self.assertIsNone(test_task.first_failure_time)\n\n        test_task.add_failure()\n        first_failure_time = test_task.first_failure_time\n\n        test_task.add_failure()\n        self.assertEqual(first_failure_time, test_task.first_failure_time)\n\n    def test_task_has_excessive_failures(self):\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        test_task = self.sch._state.get_task(\"A\")\n        self.assertIsNone(test_task.first_failure_time)\n\n        self.assertFalse(test_task.has_excessive_failures())\n\n        test_task.add_failure()\n        self.assertFalse(test_task.has_excessive_failures())\n\n        fake_failure_time = test_task.first_failure_time - 2 * 60 * 60\n\n        test_task.first_failure_time = fake_failure_time\n        self.assertTrue(test_task.has_excessive_failures())\n\n    def test_quadratic_behavior(self):\n        \"\"\"Test that get_work does not take a linear amount of time.\n\n        This is of course impossible to test exactly; however, making reasonable\n        assumptions about hardware, this test should finish in a timely\n        manner.\n        \"\"\"\n        # For 10000 it takes almost 1 second on my laptop.  
Prior to these\n        # changes it was being slow already at NUM_TASKS=300\n        NUM_TASKS = 10000\n        for i in range(NUM_TASKS):\n            self.sch.add_task(worker=str(i), task_id=str(i), resources={})\n\n        for i in range(NUM_TASKS):\n            self.assertEqual(self.sch.get_work(worker=str(i))[\"task_id\"], str(i))\n            self.sch.add_task(worker=str(i), task_id=str(i), status=DONE)\n\n    def test_get_work_speed(self):\n        \"\"\"Test that get_work is fast for few workers and many DONEs.\n\n        In #986, @daveFNbuck reported that he got a slowdown.\n        \"\"\"\n        # This took almost 4 minutes without optimization.\n        # Now it takes 10 seconds on my machine.\n        NUM_PENDING = 1000\n        NUM_DONE = 200000\n        assert NUM_DONE >= NUM_PENDING\n        for i in range(NUM_PENDING):\n            self.sch.add_task(worker=WORKER, task_id=str(i), resources={})\n\n        for i in range(NUM_PENDING, NUM_DONE):\n            self.sch.add_task(worker=WORKER, task_id=str(i), status=DONE)\n\n        for i in range(NUM_PENDING):\n            res = int(self.sch.get_work(worker=WORKER)[\"task_id\"])\n            self.assertTrue(0 <= res < NUM_PENDING)\n            self.sch.add_task(worker=WORKER, task_id=str(res), status=DONE)\n\n    def test_assistants_dont_nurture_finished_statuses(self):\n        \"\"\"\n        Test how assistants affect longevity of tasks\n\n        Assistants should not affect longevity except for the tasks that it is\n        running, par the one it's actually running.\n        \"\"\"\n        self.sch = Scheduler(retry_delay=100000000000)  # Never pendify failed tasks\n        self.setTime(1)\n        self.sch.add_worker(\"assistant\", [(\"assistant\", True)])\n        self.sch.ping(worker=\"assistant\")\n        self.sch.add_task(worker=\"uploader\", task_id=\"running\", status=PENDING)\n        self.assertEqual(self.sch.get_work(worker=\"assistant\", assistant=True)[\"task_id\"], 
\"running\")\n\n        self.setTime(2)\n        self.sch.add_task(worker=\"uploader\", task_id=\"done\", status=DONE)\n        self.sch.add_task(worker=\"uploader\", task_id=\"disabled\", status=DISABLED)\n        self.sch.add_task(worker=\"uploader\", task_id=\"pending\", status=PENDING)\n        self.sch.add_task(worker=\"uploader\", task_id=\"failed\", status=FAILED)\n        self.sch.add_task(worker=\"uploader\", task_id=\"unknown\", status=UNKNOWN)\n\n        self.setTime(100000)\n        self.sch.ping(worker=\"assistant\")\n        self.sch.prune()\n\n        self.setTime(200000)\n        self.sch.ping(worker=\"assistant\")\n        self.sch.prune()\n        nurtured_statuses = [RUNNING]\n        not_nurtured_statuses = [DONE, UNKNOWN, DISABLED, PENDING, FAILED]\n\n        for status in nurtured_statuses:\n            self.assertEqual(set([status.lower()]), set(self.sch.task_list(status, \"\")))\n\n        for status in not_nurtured_statuses:\n            self.assertEqual(set([]), set(self.sch.task_list(status, \"\")))\n\n        self.assertEqual(1, len(self.sch.task_list(None, \"\")))  # None == All statuses\n\n    def test_no_crash_on_only_disable_hard_timeout(self):\n        \"\"\"\n        Scheduler shouldn't crash with only disable_hard_timeout\n\n        There was some failure happening when disable_hard_timeout was set but\n        disable_failures was not.\n        \"\"\"\n        self.sch = Scheduler(retry_delay=5, disable_hard_timeout=100)\n        self.setTime(1)\n        self.sch.add_worker(WORKER, [])\n        self.sch.ping(worker=WORKER)\n\n        self.setTime(2)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"B\", deps=[\"A\"])\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.setTime(10)\n        self.sch.prune()\n        
self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n\n    def test_assistant_running_task_dont_disappear(self):\n        \"\"\"\n        Tasks run by an assistant shouldn't be pruned\n        \"\"\"\n        self.setTime(1)\n        self.sch.add_worker(WORKER, [])\n        self.sch.ping(worker=WORKER)\n\n        self.setTime(2)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"B\")\n        self.sch.add_worker(\"assistant\", [(\"assistant\", True)])\n        self.sch.ping(worker=\"assistant\")\n        self.assertEqual(self.sch.get_work(worker=\"assistant\", assistant=True)[\"task_id\"], \"B\")\n\n        self.setTime(100000)\n        # Here, lets say WORKER disconnects (doesnt ping)\n        self.sch.ping(worker=\"assistant\")\n        self.sch.prune()\n\n        self.setTime(200000)\n        self.sch.ping(worker=\"assistant\")\n        self.sch.prune()\n        self.assertEqual({\"B\"}, set(self.sch.task_list(RUNNING, \"\")))\n        self.assertEqual({\"B\"}, set(self.sch.task_list(\"\", \"\")))\n\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_batch_failure_emails(self, BatchNotifier):\n        scheduler = Scheduler(batch_emails=True)\n        scheduler.add_task(worker=WORKER, status=FAILED, task_id=\"T(a=5, b=6)\", family=\"T\", params={\"a\": \"5\", \"b\": \"6\"}, expl='\"bad thing\"')\n        BatchNotifier().add_failure.assert_called_once_with(\n            \"T(a=5, b=6)\",\n            \"T\",\n            {\"a\": \"5\", \"b\": \"6\"},\n            \"bad thing\",\n            None,\n        )\n        BatchNotifier().add_disable.assert_not_called()\n\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_send_batch_email_on_dump(self, BatchNotifier):\n        scheduler = Scheduler(batch_emails=True)\n\n        BatchNotifier().send_email.assert_not_called()\n        
scheduler.dump()\n        BatchNotifier().send_email.assert_called_once_with()\n\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_do_not_send_batch_email_on_dump_without_batch_enabled(self, BatchNotifier):\n        scheduler = Scheduler(batch_emails=False)\n        scheduler.dump()\n\n        BatchNotifier().send_email.assert_not_called()\n\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_handle_bad_expl_in_failure_emails(self, BatchNotifier):\n        scheduler = Scheduler(batch_emails=True)\n        scheduler.add_task(worker=WORKER, status=FAILED, task_id=\"T(a=5, b=6)\", family=\"T\", params={\"a\": \"5\", \"b\": \"6\"}, expl=\"bad thing\")\n        BatchNotifier().add_failure.assert_called_once_with(\n            \"T(a=5, b=6)\",\n            \"T\",\n            {\"a\": \"5\", \"b\": \"6\"},\n            \"bad thing\",\n            None,\n        )\n        BatchNotifier().add_disable.assert_not_called()\n\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_scheduling_failure(self, BatchNotifier):\n        scheduler = Scheduler(batch_emails=True)\n        scheduler.announce_scheduling_failure(worker=WORKER, task_name=\"T(a=1, b=2)\", family=\"T\", params={\"a\": \"1\", \"b\": \"2\"}, expl=\"error\", owners=(\"owner\",))\n        BatchNotifier().add_scheduling_fail.assert_called_once_with(\"T(a=1, b=2)\", \"T\", {\"a\": \"1\", \"b\": \"2\"}, \"error\", (\"owner\",))\n\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_scheduling_failure_without_batcher(self, BatchNotifier):\n        scheduler = Scheduler(batch_emails=False)\n        scheduler.announce_scheduling_failure(worker=WORKER, task_name=\"T(a=1, b=2)\", family=\"T\", params={\"a\": \"1\", \"b\": \"2\"}, expl=\"error\", owners=(\"owner\",))\n        BatchNotifier().add_scheduling_fail.assert_not_called()\n\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_batch_failure_emails_with_task_batcher(self, BatchNotifier):\n        
scheduler = Scheduler(batch_emails=True)\n        scheduler.add_task_batcher(worker=WORKER, task_family=\"T\", batched_args=[\"a\"])\n        scheduler.add_task(worker=WORKER, status=FAILED, task_id=\"T(a=5, b=6)\", family=\"T\", params={\"a\": \"5\", \"b\": \"6\"}, expl='\"bad thing\"')\n        BatchNotifier().add_failure.assert_called_once_with(\n            \"T(a=5, b=6)\",\n            \"T\",\n            {\"b\": \"6\"},\n            \"bad thing\",\n            None,\n        )\n        BatchNotifier().add_disable.assert_not_called()\n\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_scheduling_failure_with_task_batcher(self, BatchNotifier):\n        scheduler = Scheduler(batch_emails=True)\n        scheduler.add_task_batcher(worker=WORKER, task_family=\"T\", batched_args=[\"a\"])\n        scheduler.announce_scheduling_failure(worker=WORKER, task_name=\"T(a=1, b=2)\", family=\"T\", params={\"a\": \"1\", \"b\": \"2\"}, expl=\"error\", owners=(\"owner\",))\n        BatchNotifier().add_scheduling_fail.assert_called_once_with(\"T(a=1, b=2)\", \"T\", {\"b\": \"2\"}, \"error\", (\"owner\",))\n\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_batch_failure_email_with_owner(self, BatchNotifier):\n        scheduler = Scheduler(batch_emails=True)\n        scheduler.add_task(\n            worker=WORKER,\n            status=FAILED,\n            task_id=\"T(a=5, b=6)\",\n            family=\"T\",\n            params={\"a\": \"5\", \"b\": \"6\"},\n            expl='\"bad thing\"',\n            owners=[\"a@test.com\", \"b@test.com\"],\n        )\n        BatchNotifier().add_failure.assert_called_once_with(\n            \"T(a=5, b=6)\",\n            \"T\",\n            {\"a\": \"5\", \"b\": \"6\"},\n            \"bad thing\",\n            [\"a@test.com\", \"b@test.com\"],\n        )\n        BatchNotifier().add_disable.assert_not_called()\n\n    @mock.patch(\"luigi.scheduler.notifications\")\n    
@mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_batch_disable_emails(self, BatchNotifier, notifications):\n        scheduler = Scheduler(batch_emails=True, retry_count=1)\n        scheduler.add_task(worker=WORKER, status=FAILED, task_id=\"T(a=5, b=6)\", family=\"T\", params={\"a\": \"5\", \"b\": \"6\"}, expl='\"bad thing\"')\n        BatchNotifier().add_failure.assert_called_once_with(\n            \"T(a=5, b=6)\",\n            \"T\",\n            {\"a\": \"5\", \"b\": \"6\"},\n            \"bad thing\",\n            None,\n        )\n        BatchNotifier().add_disable.assert_called_once_with(\n            \"T(a=5, b=6)\",\n            \"T\",\n            {\"a\": \"5\", \"b\": \"6\"},\n            None,\n        )\n        notifications.send_error_email.assert_not_called()\n\n    @mock.patch(\"luigi.scheduler.notifications\")\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_batch_disable_email_with_owner(self, BatchNotifier, notifications):\n        scheduler = Scheduler(batch_emails=True, retry_count=1)\n        scheduler.add_task(\n            worker=WORKER, status=FAILED, task_id=\"T(a=5, b=6)\", family=\"T\", params={\"a\": \"5\", \"b\": \"6\"}, expl='\"bad thing\"', owners=[\"a@test.com\"]\n        )\n        BatchNotifier().add_failure.assert_called_once_with(\n            \"T(a=5, b=6)\",\n            \"T\",\n            {\"a\": \"5\", \"b\": \"6\"},\n            \"bad thing\",\n            [\"a@test.com\"],\n        )\n        BatchNotifier().add_disable.assert_called_once_with(\n            \"T(a=5, b=6)\",\n            \"T\",\n            {\"a\": \"5\", \"b\": \"6\"},\n            [\"a@test.com\"],\n        )\n        notifications.send_error_email.assert_not_called()\n\n    @mock.patch(\"luigi.scheduler.notifications\")\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_batch_disable_emails_with_task_batcher(self, BatchNotifier, notifications):\n        scheduler = Scheduler(batch_emails=True, 
retry_count=1)\n        scheduler.add_task_batcher(worker=WORKER, task_family=\"T\", batched_args=[\"a\"])\n        scheduler.add_task(worker=WORKER, status=FAILED, task_id=\"T(a=5, b=6)\", family=\"T\", params={\"a\": \"5\", \"b\": \"6\"}, expl='\"bad thing\"')\n        BatchNotifier().add_failure.assert_called_once_with(\n            \"T(a=5, b=6)\",\n            \"T\",\n            {\"b\": \"6\"},\n            \"bad thing\",\n            None,\n        )\n        BatchNotifier().add_disable.assert_called_once_with(\n            \"T(a=5, b=6)\",\n            \"T\",\n            {\"b\": \"6\"},\n            None,\n        )\n        notifications.send_error_email.assert_not_called()\n\n    @mock.patch(\"luigi.scheduler.notifications\")\n    def test_send_normal_disable_email(self, notifications):\n        scheduler = Scheduler(batch_emails=False, retry_count=1)\n        notifications.send_error_email.assert_not_called()\n        scheduler.add_task(worker=WORKER, status=FAILED, task_id=\"T(a=5, b=6)\", family=\"T\", params={\"a\": \"5\", \"b\": \"6\"}, expl='\"bad thing\"')\n        self.assertEqual(1, notifications.send_error_email.call_count)\n\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_no_batch_notifier_without_batch_emails(self, BatchNotifier):\n        Scheduler(batch_emails=False)\n        BatchNotifier.assert_not_called()\n\n    @mock.patch(\"luigi.scheduler.BatchNotifier\")\n    def test_update_batcher_on_prune(self, BatchNotifier):\n        scheduler = Scheduler(batch_emails=True)\n        BatchNotifier().update.assert_not_called()\n        scheduler.prune()\n        BatchNotifier().update.assert_called_once_with()\n\n    def test_forgive_failures(self):\n        # Try to build A but fails, forgive failures and will retry before 100s\n        self.setTime(0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n        self.sch.add_task(worker=WORKER, 
task_id=\"A\", status=FAILED)\n        self.setTime(1)\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], None)\n        self.setTime(2)\n        self.sch.forgive_failures(task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n\n    def test_you_can_forgive_failures_twice(self):\n        # Try to build A but fails, forgive failures two times and will retry before 100s\n        self.setTime(0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.setTime(1)\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], None)\n        self.setTime(2)\n        self.sch.forgive_failures(task_id=\"A\")\n        self.sch.forgive_failures(task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n\n    def test_mark_running_as_done_works(self):\n        # Adding a task, it runs, then force-commiting it sends it to DONE\n        self.setTime(0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n        self.setTime(1)\n        self.assertEqual({\"A\"}, set(self.sch.task_list(RUNNING, \"\").keys()))\n        self.sch.mark_as_done(task_id=\"A\")\n        self.assertEqual({\"A\"}, set(self.sch.task_list(DONE, \"\").keys()))\n\n    def test_mark_failed_as_done_works(self):\n        # Adding a task, saying it failed, then force-commiting it sends it to DONE\n        self.setTime(0)\n        self.sch.add_task(worker=WORKER, task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=WORKER)[\"task_id\"], \"A\")\n        self.sch.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n        self.setTime(1)\n        self.assertEqual(set(), set(self.sch.task_list(RUNNING, \"\").keys()))\n        
self.assertEqual({\"A\"}, set(self.sch.task_list(FAILED, \"\").keys()))\n        self.sch.mark_as_done(task_id=\"A\")\n        self.assertEqual({\"A\"}, set(self.sch.task_list(DONE, \"\").keys()))\n\n    @mock.patch(\"luigi.metrics.NoMetricsCollector\")\n    def test_collector_metrics_on_task_started(self, MetricsCollector):\n        from luigi.metrics import MetricsCollectors\n\n        s = Scheduler(metrics_collector=MetricsCollectors.none)\n        s.add_task(worker=WORKER, task_id=\"A\", status=PENDING)\n        s.get_work(worker=WORKER)\n\n        task = s._state.get_task(\"A\")\n        MetricsCollector().handle_task_started.assert_called_once_with(task)\n\n    @mock.patch(\"luigi.metrics.NoMetricsCollector\")\n    def test_collector_metrics_on_task_disabled(self, MetricsCollector):\n        from luigi.metrics import MetricsCollectors\n\n        s = Scheduler(metrics_collector=MetricsCollectors.none, retry_count=0)\n        s.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        task = s._state.get_task(\"A\")\n        MetricsCollector().handle_task_disabled.assert_called_once_with(task, s._config)\n\n    @mock.patch(\"luigi.metrics.NoMetricsCollector\")\n    def test_collector_metrics_on_task_failed(self, MetricsCollector):\n        from luigi.metrics import MetricsCollectors\n\n        s = Scheduler(metrics_collector=MetricsCollectors.none)\n        s.add_task(worker=WORKER, task_id=\"A\", status=FAILED)\n\n        task = s._state.get_task(\"A\")\n        MetricsCollector().handle_task_failed.assert_called_once_with(task)\n\n    @mock.patch(\"luigi.metrics.NoMetricsCollector\")\n    def test_collector_metrics_on_task_done(self, MetricsCollector):\n        from luigi.metrics import MetricsCollectors\n\n        s = Scheduler(metrics_collector=MetricsCollectors.none)\n        s.add_task(worker=WORKER, task_id=\"A\", status=DONE)\n\n        task = s._state.get_task(\"A\")\n        MetricsCollector().handle_task_done.assert_called_once_with(task)\n"
  },
  {
    "path": "test/scheduler_message_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\nimport tempfile\nimport time\n\nfrom helpers import LuigiTestCase, RunOnceTask\n\nimport luigi\nimport luigi.scheduler\nimport luigi.worker\n\n\ndef fast_worker(scheduler, **kwargs):\n    kwargs.setdefault(\"ping_interval\", 0.5)\n    kwargs.setdefault(\"force_multiprocessing\", True)\n    return luigi.worker.Worker(scheduler=scheduler, **kwargs)\n\n\nclass WriteMessageToFile(luigi.Task):\n    path = luigi.Parameter()\n\n    accepts_messages = True\n\n    def output(self):\n        return luigi.LocalTarget(self.path)\n\n    def run(self):\n        msg = \"\"\n\n        time.sleep(1)\n        if not self.scheduler_messages.empty():\n            msg = self.scheduler_messages.get().content\n\n        with self.output().open(\"w\") as f:\n            f.write(msg + \"\\n\")\n\n\nclass SchedulerMessageTest(LuigiTestCase):\n    def test_scheduler_methods(self):\n        sch = luigi.scheduler.Scheduler(send_messages=True)\n        sch.add_task(task_id=\"foo-task\", worker=\"foo-worker\")\n\n        res = sch.send_scheduler_message(\"foo-worker\", \"foo-task\", \"message content\")\n        message_id = res[\"message_id\"]\n        self.assertTrue(len(message_id) > 0)\n        self.assertIn(\"-\", message_id)\n\n        sch.add_scheduler_message_response(\"foo-task\", message_id, \"message response\")\n        res = 
sch.get_scheduler_message_response(\"foo-task\", message_id)\n        response = res[\"response\"]\n        self.assertEqual(response, \"message response\")\n\n    def test_receive_messsage(self):\n        sch = luigi.scheduler.Scheduler(send_messages=True)\n        with fast_worker(sch) as w:\n            with tempfile.NamedTemporaryFile() as tmp:\n                if os.path.exists(tmp.name):\n                    os.remove(tmp.name)\n\n                task = WriteMessageToFile(path=tmp.name)\n                w.add(task)\n\n                sch.send_scheduler_message(w._id, task.task_id, \"test\")\n                w.run()\n\n                self.assertTrue(os.path.exists(tmp.name))\n                with open(tmp.name, \"r\") as f:\n                    self.assertEqual(str(f.read()).strip(), \"test\")\n\n    def test_receive_messages_disabled(self):\n        sch = luigi.scheduler.Scheduler(send_messages=True)\n        with fast_worker(sch, force_multiprocessing=False) as w:\n\n            class MyTask(RunOnceTask):\n                def run(self):\n                    self.had_queue = self.scheduler_messages is not None\n                    super(MyTask, self).run()\n\n            task = MyTask()\n            w.add(task)\n\n            sch.send_scheduler_message(w._id, task.task_id, \"test\")\n            w.run()\n\n            self.assertFalse(task.had_queue)\n\n    def test_send_messages_disabled(self):\n        sch = luigi.scheduler.Scheduler(send_messages=False)\n        with fast_worker(sch) as w:\n            with tempfile.NamedTemporaryFile() as tmp:\n                if os.path.exists(tmp.name):\n                    os.remove(tmp.name)\n\n                task = WriteMessageToFile(path=tmp.name)\n                w.add(task)\n\n                sch.send_scheduler_message(w._id, task.task_id, \"test\")\n                w.run()\n\n                self.assertTrue(os.path.exists(tmp.name))\n                with open(tmp.name, \"r\") as f:\n                    
self.assertEqual(str(f.read()).strip(), \"\")\n"
  },
  {
    "path": "test/scheduler_parameter_visibilities_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport json\nimport time\n\nimport server_test\nfrom helpers import LuigiTestCase, RunOnceTask\n\nimport luigi\nimport luigi.scheduler\nimport luigi.worker\nfrom luigi.parameter import ParameterVisibility\n\n\nclass SchedulerParameterVisibilitiesTest(LuigiTestCase):\n    def test_task_with_deps(self):\n        s = luigi.scheduler.Scheduler(send_messages=True)\n        with luigi.worker.Worker(scheduler=s) as w:\n\n            class DynamicTask(RunOnceTask):\n                dynamic_public = luigi.Parameter(default=\"dynamic_public\")\n                dynamic_hidden = luigi.Parameter(default=\"dynamic_hidden\", visibility=ParameterVisibility.HIDDEN)\n                dynamic_private = luigi.Parameter(default=\"dynamic_private\", visibility=ParameterVisibility.PRIVATE)\n\n            class RequiredTask(RunOnceTask):\n                required_public = luigi.Parameter(default=\"required_param\")\n                required_hidden = luigi.Parameter(default=\"required_hidden\", visibility=ParameterVisibility.HIDDEN)\n                required_private = luigi.Parameter(default=\"required_private\", visibility=ParameterVisibility.PRIVATE)\n\n            class Task(RunOnceTask):\n                a = luigi.Parameter(default=\"a\")\n                b = luigi.Parameter(default=\"b\", visibility=ParameterVisibility.HIDDEN)\n                c = 
luigi.Parameter(default=\"c\", visibility=ParameterVisibility.PRIVATE)\n                d = luigi.Parameter(default=\"d\", visibility=ParameterVisibility.PUBLIC)\n\n                def requires(self):\n                    return required_task\n\n                def run(self):\n                    yield dynamic_task\n\n            dynamic_task = DynamicTask()\n            required_task = RequiredTask()\n            task = Task()\n\n            w.add(task)\n            w.run()\n\n            time.sleep(1)\n            task_deps = s.dep_graph(task_id=task.task_id)\n            required_task_deps = s.dep_graph(task_id=required_task.task_id)\n            dynamic_task_deps = s.dep_graph(task_id=dynamic_task.task_id)\n\n            self.assertEqual(\"Task(a=a, d=d)\", task_deps[task.task_id][\"display_name\"])\n            self.assertEqual(\"RequiredTask(required_public=required_param)\", required_task_deps[required_task.task_id][\"display_name\"])\n            self.assertEqual(\"DynamicTask(dynamic_public=dynamic_public)\", dynamic_task_deps[dynamic_task.task_id][\"display_name\"])\n\n            self.assertEqual({\"a\": \"a\", \"d\": \"d\"}, task_deps[task.task_id][\"params\"])\n            self.assertEqual({\"required_public\": \"required_param\"}, required_task_deps[required_task.task_id][\"params\"])\n            self.assertEqual({\"dynamic_public\": \"dynamic_public\"}, dynamic_task_deps[dynamic_task.task_id][\"params\"])\n\n    def test_public_and_hidden_params(self):\n        s = luigi.scheduler.Scheduler(send_messages=True)\n        with luigi.worker.Worker(scheduler=s) as w:\n\n            class Task(RunOnceTask):\n                a = luigi.Parameter(default=\"a\")\n                b = luigi.Parameter(default=\"b\", visibility=ParameterVisibility.HIDDEN)\n                c = luigi.Parameter(default=\"c\", visibility=ParameterVisibility.PRIVATE)\n                d = luigi.Parameter(default=\"d\", visibility=ParameterVisibility.PUBLIC)\n\n            task = 
Task()\n\n            w.add(task)\n            w.run()\n\n            time.sleep(1)\n            t = s._state.get_task(task.task_id)\n            self.assertEqual({\"b\": \"b\"}, t.hidden_params)\n            self.assertEqual({\"a\": \"a\", \"d\": \"d\"}, t.public_params)\n            self.assertEqual({\"a\": 0, \"b\": 1, \"d\": 0}, t.param_visibilities)\n\n\nclass Task(RunOnceTask):\n    a = luigi.Parameter(default=\"a\")\n    b = luigi.Parameter(default=\"b\", visibility=ParameterVisibility.HIDDEN)\n    c = luigi.Parameter(default=\"c\", visibility=ParameterVisibility.PRIVATE)\n    d = luigi.Parameter(default=\"d\", visibility=ParameterVisibility.PUBLIC)\n\n\nclass RemoteSchedulerParameterVisibilitiesTest(server_test.ServerTestBase):\n    def test_public_params(self):\n        task = Task()\n        luigi.build(tasks=[task], workers=2, scheduler_port=self.get_http_port())\n\n        time.sleep(1)\n\n        response = self.fetch(\"/api/graph\")\n\n        body = response.body\n        decoded = body.decode(\"utf8\").replace(\"'\", '\"')\n        data = json.loads(decoded)\n\n        self.assertEqual({\"a\": \"a\", \"d\": \"d\"}, data[\"response\"][task.task_id][\"params\"])\n"
  },
  {
    "path": "test/scheduler_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\nimport pickle\nimport shutil\nimport tempfile\nimport time\nfrom multiprocessing import Process\n\nfrom helpers import unittest, with_config\n\nimport luigi.configuration\nimport luigi.scheduler\nimport luigi.server\nfrom luigi.target import FileAlreadyExists\n\n\nclass SchedulerIoTest(unittest.TestCase):\n    def test_pretty_id_unicode(self):\n        scheduler = luigi.scheduler.Scheduler()\n        scheduler.add_task(worker=\"A\", task_id=\"1\", params={\"foo\": \"\\u2192bar\"})\n        [task] = list(scheduler._state.get_active_tasks())\n        task.pretty_id\n\n    def test_load_old_state(self):\n        tasks = {}\n        active_workers = {\"Worker1\": 1e9, \"Worker2\": time.time()}\n\n        with tempfile.NamedTemporaryFile(delete=True) as fn:\n            with open(fn.name, \"wb\") as fobj:\n                state = (tasks, active_workers)\n                pickle.dump(state, fobj)\n\n            state = luigi.scheduler.SimpleTaskState(state_path=fn.name)\n            state.load()\n\n            self.assertEqual(set(state.get_worker_ids()), {\"Worker1\", \"Worker2\"})\n\n    def test_load_broken_state(self):\n        with tempfile.NamedTemporaryFile(delete=True) as fn:\n            with open(fn.name, \"w\") as fobj:\n                print(\"b0rk\", file=fobj)\n\n            state = 
luigi.scheduler.SimpleTaskState(state_path=fn.name)\n            state.load()  # bad if this crashes\n\n            self.assertEqual(list(state.get_worker_ids()), [])\n\n    @with_config({\"scheduler\": {\"retry_count\": \"44\", \"worker_disconnect_delay\": \"55\"}})\n    def test_scheduler_with_config(self):\n        scheduler = luigi.scheduler.Scheduler()\n        self.assertEqual(44, scheduler._config.retry_count)\n        self.assertEqual(55, scheduler._config.worker_disconnect_delay)\n\n        # Override\n        scheduler = luigi.scheduler.Scheduler(retry_count=66, worker_disconnect_delay=77)\n        self.assertEqual(66, scheduler._config.retry_count)\n        self.assertEqual(77, scheduler._config.worker_disconnect_delay)\n\n    @with_config({\"resources\": {\"a\": \"100\", \"b\": \"200\"}})\n    def test_scheduler_with_resources(self):\n        scheduler = luigi.scheduler.Scheduler()\n        self.assertEqual({\"a\": 100, \"b\": 200}, scheduler._resources)\n\n    @with_config({\"scheduler\": {\"record_task_history\": \"True\"}, \"task_history\": {\"db_connection\": \"sqlite:////none/existing/path/hist.db\"}})\n    def test_local_scheduler_task_history_status(self):\n        ls = luigi.interface._WorkerSchedulerFactory().create_local_scheduler()\n        self.assertEqual(False, ls._config.record_task_history)\n\n    def test_load_recovers_tasks_index(self):\n        scheduler = luigi.scheduler.Scheduler()\n        scheduler.add_task(worker=\"A\", task_id=\"1\")\n        scheduler.add_task(worker=\"B\", task_id=\"2\")\n        scheduler.add_task(worker=\"C\", task_id=\"3\")\n        scheduler.add_task(worker=\"D\", task_id=\"4\")\n        self.assertEqual(scheduler.get_work(worker=\"A\")[\"task_id\"], \"1\")\n\n        with tempfile.NamedTemporaryFile(delete=True) as fn:\n\n            def reload_from_disk(scheduler):\n                scheduler._state._state_path = fn.name\n                scheduler.dump()\n                scheduler = 
luigi.scheduler.Scheduler()\n                scheduler._state._state_path = fn.name\n                scheduler.load()\n                return scheduler\n\n            scheduler = reload_from_disk(scheduler=scheduler)\n            self.assertEqual(scheduler.get_work(worker=\"B\")[\"task_id\"], \"2\")\n            self.assertEqual(scheduler.get_work(worker=\"C\")[\"task_id\"], \"3\")\n            scheduler = reload_from_disk(scheduler=scheduler)\n            self.assertEqual(scheduler.get_work(worker=\"D\")[\"task_id\"], \"4\")\n\n    def test_worker_prune_after_init(self):\n        \"\"\"\n        See https://github.com/spotify/luigi/pull/1019\n        \"\"\"\n        worker = luigi.scheduler.Worker(123)\n\n        class TmpCfg:\n            def __init__(self):\n                self.worker_disconnect_delay = 10\n\n        worker.prune(TmpCfg())\n\n    def test_get_empty_retry_policy(self):\n        retry_policy = luigi.scheduler._get_empty_retry_policy()\n        self.assertEqual(3, len(retry_policy))\n        self.assertEqual([\"retry_count\", \"disable_hard_timeout\", \"disable_window\"], list(retry_policy._asdict().keys()))\n        self.assertEqual([None, None, None], list(retry_policy._asdict().values()))\n\n    @with_config({\"scheduler\": {\"retry_count\": \"9\", \"disable_hard_timeout\": \"99\", \"disable_window\": \"999\"}})\n    def test_scheduler_get_retry_policy(self):\n        s = luigi.scheduler.Scheduler()\n        self.assertEqual(luigi.scheduler.RetryPolicy(9, 99, 999), s._config._get_retry_policy())\n\n    @with_config({\"scheduler\": {\"retry_count\": \"9\", \"disable_hard_timeout\": \"99\", \"disable_window\": \"999\"}})\n    def test_generate_retry_policy(self):\n        s = luigi.scheduler.Scheduler()\n\n        try:\n            s._generate_retry_policy({\"inexist_attr\": True})\n            self.assertFalse(True, \"'unexpected keyword argument' error must have been thrown\")\n        except TypeError:\n            self.assertTrue(True)\n\n    
    retry_policy = s._generate_retry_policy({})\n        self.assertEqual(luigi.scheduler.RetryPolicy(9, 99, 999), retry_policy)\n\n        retry_policy = s._generate_retry_policy({\"retry_count\": 1})\n        self.assertEqual(luigi.scheduler.RetryPolicy(1, 99, 999), retry_policy)\n\n        retry_policy = s._generate_retry_policy({\"retry_count\": 1, \"disable_hard_timeout\": 11, \"disable_window\": 111})\n        self.assertEqual(luigi.scheduler.RetryPolicy(1, 11, 111), retry_policy)\n\n    @with_config({\"scheduler\": {\"retry_count\": \"44\"}})\n    def test_per_task_retry_policy(self):\n        cps = luigi.scheduler.Scheduler()\n\n        cps.add_task(worker=\"test_worker1\", task_id=\"test_task_1\", deps=[\"test_task_2\", \"test_task_3\"])\n        tasks = list(cps._state.get_active_tasks())\n        self.assertEqual(3, len(tasks))\n\n        tasks = sorted(tasks, key=lambda x: x.id)\n        task_1 = tasks[0]\n        task_2 = tasks[1]\n        task_3 = tasks[2]\n\n        self.assertEqual(\"test_task_1\", task_1.id)\n        self.assertEqual(\"test_task_2\", task_2.id)\n        self.assertEqual(\"test_task_3\", task_3.id)\n\n        self.assertEqual(luigi.scheduler.RetryPolicy(44, 999999999, 3600), task_1.retry_policy)\n        self.assertEqual(luigi.scheduler.RetryPolicy(44, 999999999, 3600), task_2.retry_policy)\n        self.assertEqual(luigi.scheduler.RetryPolicy(44, 999999999, 3600), task_3.retry_policy)\n\n        cps._state._tasks = {}\n        cps.add_task(\n            worker=\"test_worker2\",\n            task_id=\"test_task_4\",\n            deps=[\"test_task_5\", \"test_task_6\"],\n            retry_policy_dict=luigi.scheduler.RetryPolicy(99, 999, 9999)._asdict(),\n        )\n\n        tasks = list(cps._state.get_active_tasks())\n        self.assertEqual(3, len(tasks))\n\n        tasks = sorted(tasks, key=lambda x: x.id)\n        task_4 = tasks[0]\n        task_5 = tasks[1]\n        task_6 = tasks[2]\n\n        self.assertEqual(\"test_task_4\", 
task_4.id)\n        self.assertEqual(\"test_task_5\", task_5.id)\n        self.assertEqual(\"test_task_6\", task_6.id)\n\n        self.assertEqual(luigi.scheduler.RetryPolicy(99, 999, 9999), task_4.retry_policy)\n        self.assertEqual(luigi.scheduler.RetryPolicy(44, 999999999, 3600), task_5.retry_policy)\n        self.assertEqual(luigi.scheduler.RetryPolicy(44, 999999999, 3600), task_6.retry_policy)\n\n        cps._state._tasks = {}\n        cps.add_task(worker=\"test_worker3\", task_id=\"test_task_7\", deps=[\"test_task_8\", \"test_task_9\"])\n        cps.add_task(worker=\"test_worker3\", task_id=\"test_task_8\", retry_policy_dict=luigi.scheduler.RetryPolicy(99, 999, 9999)._asdict())\n        cps.add_task(worker=\"test_worker3\", task_id=\"test_task_9\", retry_policy_dict=luigi.scheduler.RetryPolicy(11, 111, 1111)._asdict())\n\n        tasks = list(cps._state.get_active_tasks())\n        self.assertEqual(3, len(tasks))\n\n        tasks = sorted(tasks, key=lambda x: x.id)\n        task_7 = tasks[0]\n        task_8 = tasks[1]\n        task_9 = tasks[2]\n\n        self.assertEqual(\"test_task_7\", task_7.id)\n        self.assertEqual(\"test_task_8\", task_8.id)\n        self.assertEqual(\"test_task_9\", task_9.id)\n\n        self.assertEqual(luigi.scheduler.RetryPolicy(44, 999999999, 3600), task_7.retry_policy)\n        self.assertEqual(luigi.scheduler.RetryPolicy(99, 999, 9999), task_8.retry_policy)\n        self.assertEqual(luigi.scheduler.RetryPolicy(11, 111, 1111), task_9.retry_policy)\n\n        # Task 7 which is disable-failures 44 and its has_excessive_failures method returns False under 44\n        for i in range(43):\n            task_7.add_failure()\n        self.assertFalse(task_7.has_excessive_failures())\n        task_7.add_failure()\n        self.assertTrue(task_7.has_excessive_failures())\n\n        # Task 8 which is disable-failures 99 and its has_excessive_failures method returns False under 99\n        for i in range(98):\n            
task_8.add_failure()\n        self.assertFalse(task_8.has_excessive_failures())\n        task_8.add_failure()\n        self.assertTrue(task_8.has_excessive_failures())\n\n        # Task 9 which is disable-failures 11 and its has_excessive_failures method returns False under 11\n        for i in range(10):\n            task_9.add_failure()\n        self.assertFalse(task_9.has_excessive_failures())\n        task_9.add_failure()\n        self.assertTrue(task_9.has_excessive_failures())\n\n    @with_config({\"scheduler\": {\"record_task_history\": \"true\"}})\n    def test_has_task_history(self):\n        cfg = luigi.configuration.get_config()\n        with tempfile.NamedTemporaryFile(suffix=\".db\", delete=True) as fn:\n            cfg.set(\"task_history\", \"db_connection\", \"sqlite:///\" + fn.name)\n            s = luigi.scheduler.Scheduler()\n            self.assertTrue(s.has_task_history())\n\n    @with_config({\"scheduler\": {\"record_task_history\": \"false\"}})\n    def test_has_no_task_history(self):\n        s = luigi.scheduler.Scheduler()\n        self.assertFalse(s.has_task_history())\n\n    @with_config({\"scheduler\": {\"pause_enabled\": \"false\"}})\n    def test_pause_disabled(self):\n        s = luigi.scheduler.Scheduler()\n        self.assertFalse(s.is_pause_enabled()[\"enabled\"])\n        self.assertFalse(s.is_paused()[\"paused\"])\n        s.pause()\n        self.assertFalse(s.is_paused()[\"paused\"])\n\n    def test_default_metrics_collector(self):\n        from luigi.metrics import MetricsCollector\n\n        s = luigi.scheduler.Scheduler()\n        scheduler_state = s._state\n        collector = scheduler_state._metrics_collector\n        self.assertTrue(isinstance(collector, MetricsCollector))\n\n    @with_config({\"scheduler\": {\"metrics_collector\": \"datadog\"}})\n    def test_datadog_metrics_collector(self):\n        from luigi.contrib.datadog_metric import DatadogMetricsCollector\n\n        s = luigi.scheduler.Scheduler()\n        
scheduler_state = s._state\n        collector = scheduler_state._metrics_collector\n        self.assertTrue(isinstance(collector, DatadogMetricsCollector))\n\n    @with_config({\"scheduler\": {\"metrics_collector\": \"prometheus\"}})\n    def test_prometheus_metrics_collector(self):\n        from luigi.contrib.prometheus_metric import PrometheusMetricsCollector\n\n        s = luigi.scheduler.Scheduler()\n        scheduler_state = s._state\n        collector = scheduler_state._metrics_collector\n        self.assertTrue(isinstance(collector, PrometheusMetricsCollector))\n\n    @with_config({\"scheduler\": {\"metrics_collector\": \"custom\", \"metrics_custom_import\": \"luigi.contrib.prometheus_metric.PrometheusMetricsCollector\"}})\n    def test_custom_metrics_collector(self):\n        from luigi.contrib.prometheus_metric import PrometheusMetricsCollector\n\n        s = luigi.scheduler.Scheduler()\n        scheduler_state = s._state\n        collector = scheduler_state._metrics_collector\n        self.assertTrue(isinstance(collector, PrometheusMetricsCollector))\n\n\nclass SchedulerWorkerTest(unittest.TestCase):\n    def get_pending_ids(self, worker, state):\n        return {task.id for task in worker.get_tasks(state, \"PENDING\")}\n\n    def test_get_pending_tasks_with_many_done_tasks(self):\n        sch = luigi.scheduler.Scheduler()\n        sch.add_task(worker=\"NON_TRIVIAL\", task_id=\"A\", resources={\"a\": 1})\n        sch.add_task(worker=\"TRIVIAL\", task_id=\"B\", status=\"PENDING\")\n        sch.add_task(worker=\"TRIVIAL\", task_id=\"C\", status=\"DONE\")\n        sch.add_task(worker=\"TRIVIAL\", task_id=\"D\", status=\"DONE\")\n\n        scheduler_state = sch._state\n        trivial_worker = scheduler_state.get_worker(\"TRIVIAL\")\n        self.assertEqual({\"B\"}, self.get_pending_ids(trivial_worker, scheduler_state))\n\n        non_trivial_worker = scheduler_state.get_worker(\"NON_TRIVIAL\")\n        self.assertEqual({\"A\"}, 
self.get_pending_ids(non_trivial_worker, scheduler_state))\n\n\nclass FailingOnDoubleRunTask(luigi.Task):\n    time_to_check_secs = 1\n    time_to_run_secs = 2\n    output_dir = luigi.Parameter(default=\"\")\n\n    def __init__(self, *args, **kwargs):\n        super(FailingOnDoubleRunTask, self).__init__(*args, **kwargs)\n        self.file_name = os.path.join(self.output_dir, \"AnyTask\")\n\n    def complete(self):\n        time.sleep(self.time_to_check_secs)  # e.g., establish connection\n        exists = os.path.exists(self.file_name)\n        time.sleep(self.time_to_check_secs)  # e.g., close connection\n        return exists\n\n    def run(self):\n        time.sleep(self.time_to_run_secs)\n        if os.path.exists(self.file_name):\n            raise FileAlreadyExists(self.file_name)\n        open(self.file_name, \"w\").close()\n\n\nclass StableDoneCooldownSecsTest(unittest.TestCase):\n    def setUp(self):\n        self.p = tempfile.mkdtemp()\n\n    def tearDown(self):\n        shutil.rmtree(self.p)\n\n    def run_task(self):\n        return luigi.build([FailingOnDoubleRunTask(output_dir=self.p)], detailed_summary=True, parallel_scheduling=True, parallel_scheduling_processes=2)\n\n    @with_config({\"worker\": {\"keep_alive\": \"false\"}})\n    def get_second_run_result_on_double_run(self):\n        server_process = Process(target=luigi.server.run)\n        process = Process(target=self.run_task)\n        try:\n            # scheduler is started\n            server_process.start()\n            # first run is started\n            process.start()\n            time.sleep(FailingOnDoubleRunTask.time_to_run_secs + FailingOnDoubleRunTask.time_to_check_secs)\n            # second run of the same task is started\n            second_run_result = self.run_task()\n            return second_run_result\n        finally:\n            process.join(1)\n            server_process.terminate()\n            server_process.join(1)\n\n    @with_config({\"scheduler\": 
{\"stable_done_cooldown_secs\": \"5\"}})\n    def test_sending_same_task_twice_with_cooldown_does_not_lead_to_double_run(self):\n        second_run_result = self.get_second_run_result_on_double_run()\n        self.assertEqual(second_run_result.scheduling_succeeded, True)\n\n    @with_config({\"scheduler\": {\"stable_done_cooldown_secs\": \"0\"}})\n    def test_sending_same_task_twice_without_cooldown_leads_to_double_run(self):\n        second_run_result = self.get_second_run_result_on_double_run()\n        self.assertEqual(second_run_result.scheduling_succeeded, False)\n"
  },
  {
    "path": "test/scheduler_visualisation_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\nimport tempfile\nimport time\n\nfrom helpers import RunOnceTask, unittest\n\nimport luigi\nimport luigi.notifications\nimport luigi.scheduler\nimport luigi.worker\n\nluigi.notifications.DEBUG = True\n\ntempdir = tempfile.mkdtemp()\n\n\nclass DummyTask(luigi.Task):\n    task_id = luigi.IntParameter()\n\n    def run(self):\n        f = self.output().open(\"w\")\n        f.close()\n\n    def output(self):\n        return luigi.LocalTarget(os.path.join(tempdir, str(self)))\n\n\nclass FactorTask(luigi.Task):\n    product = luigi.IntParameter()\n\n    def requires(self):\n        for factor in range(2, self.product):\n            if self.product % factor == 0:\n                yield FactorTask(factor)\n                yield FactorTask(self.product // factor)\n                return\n\n    def run(self):\n        f = self.output().open(\"w\")\n        f.close()\n\n    def output(self):\n        return luigi.LocalTarget(os.path.join(tempdir, \"luigi_test_factor_%d\" % self.product))\n\n\nclass BadReqTask(luigi.Task):\n    succeed = luigi.BoolParameter()\n\n    def requires(self):\n        assert self.succeed\n        yield BadReqTask(False)\n\n    def run(self):\n        pass\n\n    def complete(self):\n        return False\n\n\nclass FailingTask(luigi.Task):\n    task_namespace = __name__\n    task_id = luigi.IntParameter()\n\n    
def complete(self):\n        return False\n\n    def run(self):\n        raise Exception(\"Error Message\")\n\n\nclass OddFibTask(luigi.Task):\n    n = luigi.IntParameter()\n    done = luigi.BoolParameter(default=True, significant=False)\n\n    def requires(self):\n        if self.n > 1:\n            yield OddFibTask(self.n - 1, self.done)\n            yield OddFibTask(self.n - 2, self.done)\n\n    def complete(self):\n        return self.n % 2 == 0 and self.done\n\n    def run(self):\n        assert False\n\n\nclass SchedulerVisualisationTest(unittest.TestCase):\n    def setUp(self):\n        self.scheduler = luigi.scheduler.Scheduler()\n\n    def tearDown(self):\n        pass\n\n    def _assert_complete(self, tasks):\n        for t in tasks:\n            self.assertTrue(t.complete())\n\n    def _build(self, tasks):\n        with luigi.worker.Worker(scheduler=self.scheduler, worker_processes=1) as w:\n            for t in tasks:\n                w.add(t)\n            w.run()\n\n    def _remote(self):\n        return self.scheduler\n\n    def _test_run(self, workers):\n        tasks = [DummyTask(i) for i in range(20)]\n        self._build(tasks, workers=workers)\n        self._assert_complete(tasks)\n\n    def test_graph(self):\n        start = time.time()\n        tasks = [DummyTask(task_id=1), DummyTask(task_id=2)]\n        self._build(tasks)\n        self._assert_complete(tasks)\n        end = time.time()\n\n        remote = self._remote()\n        graph = remote.graph()\n        self.assertEqual(len(graph), 2)\n        self.assertTrue(DummyTask(task_id=1).task_id in graph)\n        d1 = graph[DummyTask(task_id=1).task_id]\n        self.assertEqual(d1[\"status\"], \"DONE\")\n        self.assertEqual(d1[\"deps\"], [])\n        self.assertGreaterEqual(d1[\"start_time\"], start)\n        self.assertLessEqual(d1[\"start_time\"], end)\n        d2 = graph[DummyTask(task_id=2).task_id]\n        self.assertEqual(d2[\"status\"], \"DONE\")\n        
self.assertEqual(d2[\"deps\"], [])\n        self.assertGreaterEqual(d2[\"start_time\"], start)\n        self.assertLessEqual(d2[\"start_time\"], end)\n\n    def test_large_graph_truncate(self):\n        class LinearTask(luigi.Task):\n            idx = luigi.IntParameter()\n\n            def requires(self):\n                if self.idx > 0:\n                    yield LinearTask(self.idx - 1)\n\n            def complete(self):\n                return False\n\n        root_task = LinearTask(100)\n\n        self.scheduler = luigi.scheduler.Scheduler(max_graph_nodes=10)\n        self._build([root_task])\n\n        graph = self.scheduler.dep_graph(root_task.task_id)\n        self.assertEqual(10, len(graph))\n        expected_nodes = [LinearTask(i).task_id for i in range(100, 90, -1)]\n        self.assertCountEqual(expected_nodes, graph)\n\n    def test_large_inverse_graph_truncate(self):\n        class LinearTask(luigi.Task):\n            idx = luigi.IntParameter()\n\n            def requires(self):\n                if self.idx > 0:\n                    yield LinearTask(self.idx - 1)\n\n            def complete(self):\n                return False\n\n        root_task = LinearTask(100)\n\n        self.scheduler = luigi.scheduler.Scheduler(max_graph_nodes=10)\n        self._build([root_task])\n\n        graph = self.scheduler.inverse_dep_graph(LinearTask(0).task_id)\n        self.assertEqual(10, len(graph))\n        expected_nodes = [LinearTask(i).task_id for i in range(10)]\n        self.assertCountEqual(expected_nodes, graph)\n\n    def test_truncate_graph_with_full_levels(self):\n        class BinaryTreeTask(RunOnceTask):\n            idx = luigi.IntParameter()\n\n            def requires(self):\n                if self.idx < 100:\n                    return map(BinaryTreeTask, (self.idx * 2, self.idx * 2 + 1))\n\n        root_task = BinaryTreeTask(1)\n\n        self.scheduler = luigi.scheduler.Scheduler(max_graph_nodes=10)\n        self._build([root_task])\n\n        
graph = self.scheduler.dep_graph(root_task.task_id)\n        self.assertEqual(10, len(graph))\n        expected_nodes = [BinaryTreeTask(i).task_id for i in range(1, 11)]\n        self.assertCountEqual(expected_nodes, graph)\n\n    def test_truncate_graph_with_multiple_depths(self):\n        class LinearTask(luigi.Task):\n            idx = luigi.IntParameter()\n\n            def requires(self):\n                if self.idx > 0:\n                    yield LinearTask(self.idx - 1)\n                yield LinearTask(0)\n\n            def complete(self):\n                return False\n\n        root_task = LinearTask(100)\n\n        self.scheduler = luigi.scheduler.Scheduler(max_graph_nodes=10)\n        self._build([root_task])\n\n        graph = self.scheduler.dep_graph(root_task.task_id)\n        self.assertEqual(10, len(graph))\n        expected_nodes = [LinearTask(i).task_id for i in range(100, 91, -1)] + [LinearTask(0).task_id]\n        self.maxDiff = None\n        self.assertCountEqual(expected_nodes, graph)\n\n    def _assert_all_done(self, tasks):\n        self._assert_all(tasks, \"DONE\")\n\n    def _assert_all(self, tasks, status):\n        for task in tasks.values():\n            self.assertEqual(task[\"status\"], status)\n\n    def test_dep_graph_single(self):\n        self._build([FactorTask(1)])\n        remote = self._remote()\n        dep_graph = remote.dep_graph(FactorTask(product=1).task_id)\n        self.assertEqual(len(dep_graph), 1)\n        self._assert_all_done(dep_graph)\n\n        d1 = dep_graph.get(FactorTask(product=1).task_id)\n        self.assertEqual(type(d1), type({}))\n        self.assertEqual(d1[\"deps\"], [])\n\n    def test_dep_graph_not_found(self):\n        self._build([FactorTask(1)])\n        remote = self._remote()\n        dep_graph = remote.dep_graph(FactorTask(product=5).task_id)\n        self.assertEqual(len(dep_graph), 0)\n\n    def test_inverse_dep_graph_not_found(self):\n        self._build([FactorTask(1)])\n        remote = 
self._remote()\n        dep_graph = remote.inverse_dep_graph(\"FactorTask(product=5)\")\n        self.assertEqual(len(dep_graph), 0)\n\n    def test_dep_graph_tree(self):\n        self._build([FactorTask(30)])\n        remote = self._remote()\n        dep_graph = remote.dep_graph(FactorTask(product=30).task_id)\n        self.assertEqual(len(dep_graph), 5)\n        self._assert_all_done(dep_graph)\n\n        d30 = dep_graph[FactorTask(product=30).task_id]\n        self.assertEqual(sorted(d30[\"deps\"]), sorted([FactorTask(product=15).task_id, FactorTask(product=2).task_id]))\n\n        d2 = dep_graph[FactorTask(product=2).task_id]\n        self.assertEqual(sorted(d2[\"deps\"]), [])\n\n        d15 = dep_graph[FactorTask(product=15).task_id]\n        self.assertEqual(sorted(d15[\"deps\"]), sorted([FactorTask(product=3).task_id, FactorTask(product=5).task_id]))\n\n        d3 = dep_graph[FactorTask(product=3).task_id]\n        self.assertEqual(sorted(d3[\"deps\"]), [])\n\n        d5 = dep_graph[FactorTask(product=5).task_id]\n        self.assertEqual(sorted(d5[\"deps\"]), [])\n\n    def test_dep_graph_missing_deps(self):\n        self._build([BadReqTask(True)])\n        dep_graph = self._remote().dep_graph(BadReqTask(succeed=True).task_id)\n        self.assertEqual(len(dep_graph), 2)\n\n        suc = dep_graph[BadReqTask(succeed=True).task_id]\n        self.assertEqual(suc[\"deps\"], [BadReqTask(succeed=False).task_id])\n\n        fail = dep_graph[BadReqTask(succeed=False).task_id]\n        self.assertEqual(fail[\"name\"], \"BadReqTask\")\n        self.assertEqual(fail[\"params\"], {\"succeed\": \"False\"})\n        self.assertEqual(fail[\"status\"], \"UNKNOWN\")\n\n    def test_dep_graph_diamond(self):\n        self._build([FactorTask(12)])\n        remote = self._remote()\n        dep_graph = remote.dep_graph(FactorTask(product=12).task_id)\n        self.assertEqual(len(dep_graph), 4)\n        self._assert_all_done(dep_graph)\n\n        d12 = 
dep_graph[FactorTask(product=12).task_id]\n        self.assertEqual(sorted(d12[\"deps\"]), sorted([FactorTask(product=2).task_id, FactorTask(product=6).task_id]))\n\n        d6 = dep_graph[FactorTask(product=6).task_id]\n        self.assertEqual(sorted(d6[\"deps\"]), sorted([FactorTask(product=2).task_id, FactorTask(product=3).task_id]))\n\n        d3 = dep_graph[FactorTask(product=3).task_id]\n        self.assertEqual(sorted(d3[\"deps\"]), [])\n\n        d2 = dep_graph[FactorTask(product=2).task_id]\n        self.assertEqual(sorted(d2[\"deps\"]), [])\n\n    def test_dep_graph_skip_done(self):\n        task = OddFibTask(9)\n        self._build([task])\n        remote = self._remote()\n\n        task_id = task.task_id\n        self.assertEqual(9, len(remote.dep_graph(task_id, include_done=True)))\n\n        skip_done_graph = remote.dep_graph(task_id, include_done=False)\n        self.assertEqual(5, len(skip_done_graph))\n        for task in skip_done_graph.values():\n            self.assertNotEqual(\"DONE\", task[\"status\"])\n            self.assertLess(len(task[\"deps\"]), 2)\n\n    def test_inverse_dep_graph_skip_done(self):\n        self._build([OddFibTask(9, done=False)])\n        self._build([OddFibTask(9, done=True)])\n        remote = self._remote()\n\n        task_id = OddFibTask(1).task_id\n        self.assertEqual(9, len(remote.inverse_dep_graph(task_id, include_done=True)))\n\n        skip_done_graph = remote.inverse_dep_graph(task_id, include_done=False)\n        self.assertEqual(5, len(skip_done_graph))\n        for task in skip_done_graph.values():\n            self.assertNotEqual(\"DONE\", task[\"status\"])\n            self.assertLess(len(task[\"deps\"]), 2)\n\n    def test_task_list_single(self):\n        self._build([FactorTask(7)])\n        remote = self._remote()\n        tasks_done = remote.task_list(\"DONE\", \"\")\n        self.assertEqual(len(tasks_done), 1)\n        self._assert_all_done(tasks_done)\n\n        t7 = 
tasks_done.get(FactorTask(product=7).task_id)\n        self.assertEqual(type(t7), type({}))\n\n        self.assertEqual(remote.task_list(\"\", \"\"), tasks_done)\n        self.assertEqual(remote.task_list(\"FAILED\", \"\"), {})\n        self.assertEqual(remote.task_list(\"PENDING\", \"\"), {})\n\n    def test_dep_graph_root_has_display_name(self):\n        root_task = FactorTask(12)\n        self._build([root_task])\n\n        dep_graph = self._remote().dep_graph(root_task.task_id)\n        self.assertEqual(\"FactorTask(product=12)\", dep_graph[root_task.task_id][\"display_name\"])\n\n    def test_dep_graph_non_root_nodes_lack_display_name(self):\n        root_task = FactorTask(12)\n        self._build([root_task])\n\n        dep_graph = self._remote().dep_graph(root_task.task_id)\n        for task_id, node in dep_graph.items():\n            if task_id != root_task.task_id:\n                self.assertNotIn(\"display_name\", node)\n\n    def test_task_list_failed(self):\n        self._build([FailingTask(8)])\n        remote = self._remote()\n        failed = remote.task_list(\"FAILED\", \"\")\n        self.assertEqual(len(failed), 1)\n\n        f8 = failed.get(FailingTask(task_id=8).task_id)\n        self.assertEqual(f8[\"status\"], \"FAILED\")\n\n        self.assertEqual(remote.task_list(\"DONE\", \"\"), {})\n        self.assertEqual(remote.task_list(\"PENDING\", \"\"), {})\n\n    def test_task_list_upstream_status(self):\n        class A(luigi.ExternalTask):\n            def complete(self):\n                return False\n\n        class B(luigi.ExternalTask):\n            def complete(self):\n                return True\n\n        class C(RunOnceTask):\n            def requires(self):\n                return [A(), B()]\n\n        class F(luigi.Task):\n            def complete(self):\n                return False\n\n            def run(self):\n                raise Exception()\n\n        class D(RunOnceTask):\n            def requires(self):\n                
return [F()]\n\n        class E(RunOnceTask):\n            def requires(self):\n                return [C(), D()]\n\n        self._build([E()])\n        remote = self._remote()\n\n        done = remote.task_list(\"DONE\", \"\")\n        self.assertEqual(len(done), 1)\n        db = done.get(B().task_id)\n        self.assertEqual(db[\"status\"], \"DONE\")\n\n        missing_input = remote.task_list(\"PENDING\", \"UPSTREAM_MISSING_INPUT\")\n        self.assertEqual(len(missing_input), 2)\n\n        pa = missing_input.get(A().task_id)\n        self.assertEqual(pa[\"status\"], \"PENDING\")\n        self.assertEqual(remote._upstream_status(A().task_id, {}), \"UPSTREAM_MISSING_INPUT\")\n\n        pc = missing_input.get(C().task_id)\n        self.assertEqual(pc[\"status\"], \"PENDING\")\n        self.assertEqual(remote._upstream_status(C().task_id, {}), \"UPSTREAM_MISSING_INPUT\")\n\n        upstream_failed = remote.task_list(\"PENDING\", \"UPSTREAM_FAILED\")\n        self.assertEqual(len(upstream_failed), 2)\n        pe = upstream_failed.get(E().task_id)\n        self.assertEqual(pe[\"status\"], \"PENDING\")\n        self.assertEqual(remote._upstream_status(E().task_id, {}), \"UPSTREAM_FAILED\")\n\n        pe = upstream_failed.get(D().task_id)\n        self.assertEqual(pe[\"status\"], \"PENDING\")\n        self.assertEqual(remote._upstream_status(D().task_id, {}), \"UPSTREAM_FAILED\")\n\n        pending = dict(missing_input)\n        pending.update(upstream_failed)\n        self.assertEqual(remote.task_list(\"PENDING\", \"\"), pending)\n        self.assertEqual(remote.task_list(\"PENDING\", \"UPSTREAM_RUNNING\"), {})\n\n        failed = remote.task_list(\"FAILED\", \"\")\n        self.assertEqual(len(failed), 1)\n        fd = failed.get(F().task_id)\n        self.assertEqual(fd[\"status\"], \"FAILED\")\n\n        all = dict(pending)\n        all.update(done)\n        all.update(failed)\n        self.assertEqual(remote.task_list(\"\", \"\"), all)\n        
self.assertEqual(remote.task_list(\"RUNNING\", \"\"), {})\n\n    def test_task_search(self):\n        self._build([FactorTask(8)])\n        self._build([FailingTask(8)])\n        remote = self._remote()\n        all_tasks = remote.task_search(\"Task\")\n        self.assertEqual(len(all_tasks), 2)\n        self._assert_all(all_tasks[\"DONE\"], \"DONE\")\n        self._assert_all(all_tasks[\"FAILED\"], \"FAILED\")\n\n    def test_fetch_error(self):\n        self._build([FailingTask(8)])\n        remote = self._remote()\n        error = remote.fetch_error(FailingTask(task_id=8).task_id)\n        self.assertEqual(error[\"taskId\"], FailingTask(task_id=8).task_id)\n        self.assertTrue(\"Error Message\" in error[\"error\"])\n        self.assertTrue(\"Runtime error\" in error[\"error\"])\n        self.assertTrue(\"Traceback\" in error[\"error\"])\n\n    def test_inverse_deps(self):\n        class X(RunOnceTask):\n            pass\n\n        class Y(RunOnceTask):\n            def requires(self):\n                return [X()]\n\n        class Z(RunOnceTask):\n            id = luigi.IntParameter()\n\n            def requires(self):\n                return [Y()]\n\n        class ZZ(RunOnceTask):\n            def requires(self):\n                return [Z(1), Z(2)]\n\n        self._build([ZZ()])\n        dep_graph = self._remote().inverse_dep_graph(X().task_id)\n\n        def assert_has_deps(task_id, deps):\n            self.assertTrue(task_id in dep_graph, \"%s not in dep_graph %s\" % (task_id, dep_graph))\n            task = dep_graph[task_id]\n            self.assertEqual(sorted(task[\"deps\"]), sorted(deps), \"%s does not have deps %s\" % (task_id, deps))\n\n        assert_has_deps(X().task_id, [Y().task_id])\n        assert_has_deps(Y().task_id, [Z(id=1).task_id, Z(id=2).task_id])\n        assert_has_deps(Z(id=1).task_id, [ZZ().task_id])\n        assert_has_deps(Z(id=2).task_id, [ZZ().task_id])\n        assert_has_deps(ZZ().task_id, [])\n\n    def 
test_simple_worker_list(self):\n        class X(luigi.Task):\n            def run(self):\n                self._complete = True\n\n            def complete(self):\n                return getattr(self, \"_complete\", False)\n\n        task_x = X()\n        self._build([task_x])\n\n        workers = self._remote().worker_list()\n\n        self.assertEqual(1, len(workers))\n        worker = workers[0]\n        self.assertEqual(task_x.task_id, worker[\"first_task\"])\n        self.assertEqual(0, worker[\"num_pending\"])\n        self.assertEqual(0, worker[\"num_uniques\"])\n        self.assertEqual(0, worker[\"num_running\"])\n        self.assertEqual(\"active\", worker[\"state\"])\n        self.assertEqual(1, worker[\"workers\"])\n\n    def test_worker_list_pending_uniques(self):\n        class X(luigi.Task):\n            def complete(self):\n                return False\n\n        class Y(X):\n            def requires(self):\n                return X()\n\n        class Z(Y):\n            pass\n\n        w1 = luigi.worker.Worker(scheduler=self.scheduler, worker_processes=1)\n        w2 = luigi.worker.Worker(scheduler=self.scheduler, worker_processes=1)\n\n        w1.add(Y())\n        w2.add(Z())\n\n        workers = self._remote().worker_list()\n        self.assertEqual(2, len(workers))\n        for worker in workers:\n            self.assertEqual(2, worker[\"num_pending\"])\n            self.assertEqual(1, worker[\"num_uniques\"])\n            self.assertEqual(0, worker[\"num_running\"])\n\n    def test_worker_list_running(self):\n        class X(RunOnceTask):\n            n = luigi.IntParameter()\n\n        w = luigi.worker.Worker(worker_id=\"w\", scheduler=self.scheduler, worker_processes=3)\n        w.add(X(0))\n        w.add(X(1))\n        w.add(X(2))\n        w.add(X(3))\n\n        self.scheduler.get_work(worker=\"w\")\n        self.scheduler.get_work(worker=\"w\")\n        self.scheduler.get_work(worker=\"w\")\n\n        workers = self._remote().worker_list()\n 
       self.assertEqual(1, len(workers))\n        worker = workers[0]\n\n        self.assertEqual(3, worker[\"num_running\"])\n        self.assertEqual(1, worker[\"num_pending\"])\n        self.assertEqual(1, worker[\"num_uniques\"])\n\n    def test_worker_list_disabled_worker(self):\n        class X(RunOnceTask):\n            pass\n\n        with luigi.worker.Worker(worker_id=\"w\", scheduler=self.scheduler) as w:\n            w.add(X())  #\n            workers = self._remote().worker_list()\n            self.assertEqual(1, len(workers))\n            self.assertEqual(\"active\", workers[0][\"state\"])\n            self.scheduler.disable_worker(\"w\")\n            workers = self._remote().worker_list()\n            self.assertEqual(1, len(workers))\n            self.assertEqual(1, len(workers))\n            self.assertEqual(\"disabled\", workers[0][\"state\"])\n"
  },
  {
    "path": "test/server_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport functools\nimport multiprocessing\nimport os\nimport shutil\nimport signal\nimport tempfile\nimport time\nfrom urllib.parse import ParseResult, urlencode\nfrom urllib.parse import quote as urlquote\n\nimport pytest\nimport tornado.ioloop\nfrom helpers import skipOnTravisAndGithubActions, unittest\nfrom tornado.testing import AsyncHTTPTestCase\n\nimport luigi.cmdline\nimport luigi.rpc\nimport luigi.server\nfrom luigi.configuration import get_config\nfrom luigi.scheduler import Scheduler\n\ntry:\n    from unittest import mock\nexcept ImportError:\n    import mock\n\n\ndef _is_running_from_main_thread():\n    \"\"\"\n    Return true if we're the same thread as the one that created the Tornado\n    IOLoop. In practice, the problem is that we get annoying intermittent\n    failures because sometimes the KeepAliveThread jumps in and \"disturbs\" the\n    intended flow of the test case. 
Worse, it fails in the terrible way that\n    the KeepAliveThread is kept alive, bugging the execution of subsequent test\n    cases.\n\n    Oh, I so wish Tornado would explicitly say that you're accessing it from\n    different threads and things will just not work.\n    \"\"\"\n    return tornado.ioloop.IOLoop.current(instance=False)\n\n\nclass ServerTestBase(AsyncHTTPTestCase):\n    def get_app(self):\n        return luigi.server.app(Scheduler())\n\n    def setUp(self):\n        super(ServerTestBase, self).setUp()\n\n        self._old_fetch = luigi.rpc.RemoteScheduler._fetch\n\n        def _fetch(obj, url, body, *args, **kwargs):\n            if _is_running_from_main_thread():\n                body = urlencode(body).encode(\"utf-8\")\n                response = self.fetch(url, body=body, method=\"POST\")\n                if response.code >= 400:\n                    raise luigi.rpc.RPCError(\"Errror when connecting to remote scheduler\")\n                return response.body.decode(\"utf-8\")\n\n        luigi.rpc.RemoteScheduler._fetch = _fetch\n\n    def tearDown(self):\n        super(ServerTestBase, self).tearDown()\n        luigi.rpc.RemoteScheduler._fetch = self._old_fetch\n\n\nclass ServerTest(ServerTestBase):\n    def setUp(self):\n        super(ServerTest, self).setUp()\n        get_config().remove_section(\"cors\")\n        self._default_cors = luigi.server.cors()\n\n        get_config().set(\"cors\", \"enabled\", \"true\")\n        get_config().set(\"cors\", \"allow_any_origin\", \"true\")\n        get_config().set(\"cors\", \"allow_null_origin\", \"true\")\n\n    def tearDown(self):\n        super(ServerTest, self).tearDown()\n        get_config().remove_section(\"cors\")\n\n    def test_visualiser(self):\n        page = self.fetch(\"/\").body\n        self.assertTrue(page.find(b\"<title>\") != -1)\n\n    def _test_404(self, path):\n        response = self.fetch(path)\n        self.assertEqual(response.code, 404)\n\n    def test_404(self):\n        
self._test_404(\"/foo\")\n\n    def test_api_404(self):\n        self._test_404(\"/api/foo\")\n\n    def test_root_redirect(self):\n        response = self.fetch(\"/\", follow_redirects=False)\n        self.assertEqual(response.code, 302)\n        self.assertEqual(response.headers[\"Location\"], \"static/visualiser/index.html\")  # assert that it doesn't begin with a leading slash!\n\n    def test_api_preflight_cors_headers(self):\n        response = self.fetch(\"/api/graph\", method=\"OPTIONS\", headers={\"Origin\": \"foo\"})\n        headers = dict(response.headers)\n\n        self.assertEqual(self._default_cors.allowed_headers, headers[\"Access-Control-Allow-Headers\"])\n        self.assertEqual(self._default_cors.allowed_methods, headers[\"Access-Control-Allow-Methods\"])\n        self.assertEqual(\"*\", headers[\"Access-Control-Allow-Origin\"])\n        self.assertEqual(str(self._default_cors.max_age), headers[\"Access-Control-Max-Age\"])\n        self.assertIsNone(headers.get(\"Access-Control-Allow-Credentials\"))\n        self.assertIsNone(headers.get(\"Access-Control-Expose-Headers\"))\n\n    def test_api_preflight_cors_headers_all_response_headers(self):\n        get_config().set(\"cors\", \"allow_credentials\", \"true\")\n        get_config().set(\"cors\", \"exposed_headers\", \"foo, bar\")\n        response = self.fetch(\"/api/graph\", method=\"OPTIONS\", headers={\"Origin\": \"foo\"})\n        headers = dict(response.headers)\n\n        self.assertEqual(self._default_cors.allowed_headers, headers[\"Access-Control-Allow-Headers\"])\n        self.assertEqual(self._default_cors.allowed_methods, headers[\"Access-Control-Allow-Methods\"])\n        self.assertEqual(\"*\", headers[\"Access-Control-Allow-Origin\"])\n        self.assertEqual(str(self._default_cors.max_age), headers[\"Access-Control-Max-Age\"])\n        self.assertEqual(\"true\", headers[\"Access-Control-Allow-Credentials\"])\n        self.assertEqual(\"foo, bar\", 
headers[\"Access-Control-Expose-Headers\"])\n\n    def test_api_preflight_cors_headers_null_origin(self):\n        response = self.fetch(\"/api/graph\", method=\"OPTIONS\", headers={\"Origin\": \"null\"})\n        headers = dict(response.headers)\n\n        self.assertEqual(self._default_cors.allowed_headers, headers[\"Access-Control-Allow-Headers\"])\n        self.assertEqual(self._default_cors.allowed_methods, headers[\"Access-Control-Allow-Methods\"])\n        self.assertEqual(\"null\", headers[\"Access-Control-Allow-Origin\"])\n        self.assertEqual(str(self._default_cors.max_age), headers[\"Access-Control-Max-Age\"])\n        self.assertIsNone(headers.get(\"Access-Control-Allow-Credentials\"))\n        self.assertIsNone(headers.get(\"Access-Control-Expose-Headers\"))\n\n    def test_api_preflight_cors_headers_disallow_null(self):\n        get_config().set(\"cors\", \"allow_null_origin\", \"false\")\n        response = self.fetch(\"/api/graph\", method=\"OPTIONS\", headers={\"Origin\": \"null\"})\n        headers = dict(response.headers)\n\n        self.assertNotIn(\"Access-Control-Allow-Headers\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Methods\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Origin\", headers)\n        self.assertNotIn(\"Access-Control-Max-Age\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Credentials\", headers)\n        self.assertNotIn(\"Access-Control-Expose-Headers\", headers)\n\n    def test_api_preflight_cors_headers_disallow_any(self):\n        get_config().set(\"cors\", \"allow_any_origin\", \"false\")\n        get_config().set(\"cors\", \"allowed_origins\", '[\"foo\", \"bar\"]')\n        response = self.fetch(\"/api/graph\", method=\"OPTIONS\", headers={\"Origin\": \"foo\"})\n        headers = dict(response.headers)\n\n        self.assertEqual(self._default_cors.allowed_headers, headers[\"Access-Control-Allow-Headers\"])\n        self.assertEqual(self._default_cors.allowed_methods, 
headers[\"Access-Control-Allow-Methods\"])\n        self.assertEqual(\"foo\", headers[\"Access-Control-Allow-Origin\"])\n        self.assertEqual(str(self._default_cors.max_age), headers[\"Access-Control-Max-Age\"])\n        self.assertIsNone(headers.get(\"Access-Control-Allow-Credentials\"))\n        self.assertIsNone(headers.get(\"Access-Control-Expose-Headers\"))\n\n    def test_api_preflight_cors_headers_disallow_any_no_matched_allowed_origins(self):\n        get_config().set(\"cors\", \"allow_any_origin\", \"false\")\n        get_config().set(\"cors\", \"allowed_origins\", '[\"foo\", \"bar\"]')\n        response = self.fetch(\"/api/graph\", method=\"OPTIONS\", headers={\"Origin\": \"foobar\"})\n        headers = dict(response.headers)\n\n        self.assertNotIn(\"Access-Control-Allow-Headers\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Methods\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Origin\", headers)\n        self.assertNotIn(\"Access-Control-Max-Age\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Credentials\", headers)\n        self.assertNotIn(\"Access-Control-Expose-Headers\", headers)\n\n    def test_api_preflight_cors_headers_disallow_any_no_allowed_origins(self):\n        get_config().set(\"cors\", \"allow_any_origin\", \"false\")\n        response = self.fetch(\"/api/graph\", method=\"OPTIONS\", headers={\"Origin\": \"foo\"})\n        headers = dict(response.headers)\n\n        self.assertNotIn(\"Access-Control-Allow-Headers\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Methods\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Origin\", headers)\n        self.assertNotIn(\"Access-Control-Max-Age\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Credentials\", headers)\n        self.assertNotIn(\"Access-Control-Expose-Headers\", headers)\n\n    def test_api_preflight_cors_headers_disabled(self):\n        get_config().set(\"cors\", \"enabled\", \"false\")\n        
response = self.fetch(\"/api/graph\", method=\"OPTIONS\", headers={\"Origin\": \"foo\"})\n        headers = dict(response.headers)\n\n        self.assertNotIn(\"Access-Control-Allow-Headers\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Methods\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Origin\", headers)\n        self.assertNotIn(\"Access-Control-Max-Age\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Credentials\", headers)\n        self.assertNotIn(\"Access-Control-Expose-Headers\", headers)\n\n    def test_api_preflight_cors_headers_no_origin_header(self):\n        response = self.fetch(\"/api/graph\", method=\"OPTIONS\")\n        headers = dict(response.headers)\n\n        self.assertNotIn(\"Access-Control-Allow-Headers\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Methods\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Origin\", headers)\n        self.assertNotIn(\"Access-Control-Max-Age\", headers)\n        self.assertNotIn(\"Access-Control-Allow-Credentials\", headers)\n        self.assertNotIn(\"Access-Control-Expose-Headers\", headers)\n\n    def test_api_cors_headers(self):\n        response = self.fetch(\"/api/graph\", headers={\"Origin\": \"foo\"})\n        headers = dict(response.headers)\n\n        self.assertEqual(\"*\", headers[\"Access-Control-Allow-Origin\"])\n\n    def test_api_cors_headers_null_origin(self):\n        response = self.fetch(\"/api/graph\", headers={\"Origin\": \"null\"})\n        headers = dict(response.headers)\n\n        self.assertEqual(\"null\", headers[\"Access-Control-Allow-Origin\"])\n\n    def test_api_cors_headers_disallow_null(self):\n        get_config().set(\"cors\", \"allow_null_origin\", \"false\")\n        response = self.fetch(\"/api/graph\", headers={\"Origin\": \"null\"})\n        headers = dict(response.headers)\n\n        self.assertIsNone(headers.get(\"Access-Control-Allow-Origin\"))\n\n    def test_api_cors_headers_disallow_any(self):\n 
       get_config().set(\"cors\", \"allow_any_origin\", \"false\")\n        get_config().set(\"cors\", \"allowed_origins\", '[\"foo\", \"bar\"]')\n        response = self.fetch(\"/api/graph\", headers={\"Origin\": \"foo\"})\n        headers = dict(response.headers)\n\n        self.assertEqual(\"foo\", headers[\"Access-Control-Allow-Origin\"])\n\n    def test_api_cors_headers_disallow_any_no_matched_allowed_origins(self):\n        get_config().set(\"cors\", \"allow_any_origin\", \"false\")\n        get_config().set(\"cors\", \"allowed_origins\", '[\"foo\", \"bar\"]')\n        response = self.fetch(\"/api/graph\", headers={\"Origin\": \"foobar\"})\n        headers = dict(response.headers)\n\n        self.assertIsNone(headers.get(\"Access-Control-Allow-Origin\"))\n\n    def test_api_cors_headers_disallow_any_no_allowed_origins(self):\n        get_config().set(\"cors\", \"allow_any_origin\", \"false\")\n        response = self.fetch(\"/api/graph\", headers={\"Origin\": \"foo\"})\n        headers = dict(response.headers)\n\n        self.assertIsNone(headers.get(\"Access-Control-Allow-Origin\"))\n\n    def test_api_cors_headers_disabled(self):\n        get_config().set(\"cors\", \"enabled\", \"false\")\n        response = self.fetch(\"/api/graph\", headers={\"Origin\": \"foo\"})\n        headers = dict(response.headers)\n\n        self.assertIsNone(headers.get(\"Access-Control-Allow-Origin\"))\n\n    def test_api_cors_headers_no_origin_header(self):\n        response = self.fetch(\"/api/graph\")\n        headers = dict(response.headers)\n\n        self.assertIsNone(headers.get(\"Access-Control-Allow-Origin\"))\n\n    def test_api_allow_head_on_root(self):\n        response = self.fetch(\"/\", method=\"HEAD\")\n        self.assertEqual(response.code, 204)\n\n\nclass _ServerTest(unittest.TestCase):\n    \"\"\"\n    Test to start and stop the server in a more \"standard\" way\n    \"\"\"\n\n    server_client_class = \"To be defined by subclasses\"\n\n    def 
start_server(self):\n        self._process = multiprocessing.Process(target=self.server_client.run_server)\n        self._process.start()\n        time.sleep(0.1)  # wait for server to start\n        self.sch = self.server_client.scheduler()\n        self.sch._wait = lambda: None\n\n    def stop_server(self):\n        self._process.terminate()\n        self._process.join(timeout=1)\n        if self._process.is_alive():\n            os.kill(self._process.pid, signal.SIGKILL)\n\n    def setUp(self):\n        self.server_client = self.server_client_class()\n        fd, state_path = tempfile.mkstemp(suffix=self.id())\n        os.close(fd)\n        self.addCleanup(functools.partial(os.unlink, state_path))\n        luigi.configuration.get_config().set(\"scheduler\", \"state_path\", state_path)\n        self.start_server()\n\n    def tearDown(self):\n        self.stop_server()\n\n    @skipOnTravisAndGithubActions(\"https://travis-ci.org/spotify/luigi/jobs/78315794\")\n    def test_ping(self):\n        self.sch.ping(worker=\"xyz\")\n\n    @skipOnTravisAndGithubActions(\"https://travis-ci.org/spotify/luigi/jobs/78023665\")\n    def test_raw_ping(self):\n        self.sch._request(\"/api/ping\", {\"worker\": \"xyz\"})\n\n    @skipOnTravisAndGithubActions(\"https://travis-ci.org/spotify/luigi/jobs/78023665\")\n    def test_raw_ping_extended(self):\n        self.sch._request(\"/api/ping\", {\"worker\": \"xyz\", \"foo\": \"bar\"})\n\n    @skipOnTravisAndGithubActions(\"https://travis-ci.org/spotify/luigi/jobs/166833694\")\n    def test_404(self):\n        with self.assertRaises(luigi.rpc.RPCError):\n            self.sch._request(\"/api/fdsfds\", {\"dummy\": 1})\n\n    @skipOnTravisAndGithubActions(\"https://travis-ci.org/spotify/luigi/jobs/72953884\")\n    def test_save_state(self):\n        self.sch.add_task(worker=\"X\", task_id=\"B\", deps=(\"A\",))\n        self.sch.add_task(worker=\"X\", task_id=\"A\")\n        self.assertEqual(self.sch.get_work(worker=\"X\")[\"task_id\"], 
\"A\")\n        self.stop_server()\n        self.start_server()\n        work = self.sch.get_work(worker=\"X\")[\"running_tasks\"][0]\n        self.assertEqual(work[\"task_id\"], \"A\")\n\n\n@pytest.mark.unixsocket\nclass UNIXServerTest(_ServerTest):\n    class ServerClient:\n        def __init__(self):\n            self.tempdir = tempfile.mkdtemp()\n            self.unix_socket = os.path.join(self.tempdir, \"luigid.sock\")\n\n        def run_server(self):\n            luigi.server.run(unix_socket=self.unix_socket)\n\n        def scheduler(self):\n            url = ParseResult(\n                scheme=\"http+unix\",\n                netloc=urlquote(self.unix_socket, safe=\"\"),\n                path=\"\",\n                params=\"\",\n                query=\"\",\n                fragment=\"\",\n            ).geturl()\n            return luigi.rpc.RemoteScheduler(url)\n\n    server_client_class = ServerClient\n\n    def tearDown(self):\n        super(UNIXServerTest, self).tearDown()\n        shutil.rmtree(self.server_client.tempdir)\n\n\nclass INETServerClient:\n    def __init__(self):\n        # Just some port\n        self.port = 8083\n\n    def scheduler(self):\n        return luigi.rpc.RemoteScheduler(\"http://localhost:\" + str(self.port))\n\n\nclass _INETServerTest(_ServerTest):\n    # HACK: nose ignores class whose name starts with underscore\n    # see: https://github.com/nose-devs/nose/blob/6f9dada1a5593b2365859bab92c7d1e468b64b7b/nose/selector.py#L72\n    # This hack affects derived classes of this class e.g. 
INETProcessServerTest, INETLuigidServerTest, INETLuigidDaemonServerTest.\n    __test__ = False\n\n    def test_with_cmdline(self):\n        \"\"\"\n        Test to run against the server as a normal luigi invocation does\n        \"\"\"\n        params = [\"Task\", \"--scheduler-port\", str(self.server_client.port), \"--no-lock\"]\n        self.assertTrue(luigi.interface.run(params))\n\n\nclass INETProcessServerTest(_INETServerTest):\n    __test__ = True\n\n    class ServerClient(INETServerClient):\n        def run_server(self):\n            luigi.server.run(api_port=self.port, address=\"127.0.0.1\")\n\n    server_client_class = ServerClient\n\n\nclass INETURLLibServerTest(INETProcessServerTest):\n    @mock.patch.object(luigi.rpc, \"HAS_REQUESTS\", False)\n    def start_server(self, *args, **kwargs):\n        super(INETURLLibServerTest, self).start_server(*args, **kwargs)\n\n    @skipOnTravisAndGithubActions(\"https://travis-ci.org/spotify/luigi/jobs/81022689\")\n    def patching_test(self):\n        \"\"\"\n        Check that HAS_REQUESTS patching is meaningful\n        \"\"\"\n        fetcher1 = luigi.rpc.RemoteScheduler()._fetcher\n        with mock.patch.object(luigi.rpc, \"HAS_REQUESTS\", False):\n            fetcher2 = luigi.rpc.RemoteScheduler()._fetcher\n\n        self.assertNotEqual(fetcher1.__class__, fetcher2.__class__)\n\n\nclass INETLuigidServerTest(_INETServerTest):\n    __test__ = True\n\n    class ServerClient(INETServerClient):\n        def run_server(self):\n            # I first tried to things like \"subprocess.call(['luigid', ...]),\n            # But it ended up to be a total mess getting the cleanup to work\n            # unfortunately.\n            luigi.cmdline.luigid([\"--port\", str(self.port)])\n\n    server_client_class = ServerClient\n\n\nclass INETLuigidDaemonServerTest(_INETServerTest):\n    __test__ = True\n\n    class ServerClient(INETServerClient):\n        def __init__(self):\n            
super(INETLuigidDaemonServerTest.ServerClient, self).__init__()\n            self.tempdir = tempfile.mkdtemp()\n\n        @mock.patch(\"daemon.DaemonContext\")\n        def run_server(self, daemon_context):\n            luigi.cmdline.luigid(\n                [\n                    \"--port\",\n                    str(self.port),\n                    \"--background\",  # This makes it a daemon\n                    \"--logdir\",\n                    self.tempdir,\n                    \"--pidfile\",\n                    os.path.join(self.tempdir, \"luigid.pid\"),\n                ]\n            )\n\n    def tearDown(self):\n        super(INETLuigidDaemonServerTest, self).tearDown()\n        shutil.rmtree(self.server_client.tempdir)\n\n    server_client_class = ServerClient\n\n\nclass MetricsHandlerTest(unittest.TestCase):\n    def setUp(self):\n        self.mock_scheduler = mock.MagicMock()\n        self.handler = luigi.server.MetricsHandler(tornado.web.Application(), mock.MagicMock(), scheduler=self.mock_scheduler)\n\n    def test_initialize(self):\n        self.assertIs(self.handler._scheduler, self.mock_scheduler)\n\n    def test_get(self):\n        mock_metrics = mock.MagicMock()\n        self.mock_scheduler._state._metrics_collector.generate_latest.return_value = mock_metrics\n        with mock.patch.object(self.handler, \"write\") as patched_write:\n            self.handler.get()\n            patched_write.assert_called_once_with(mock_metrics)\n            self.mock_scheduler._state._metrics_collector.configure_http_handler.assert_called_once_with(self.handler)\n\n    def test_get_no_metrics(self):\n        self.mock_scheduler._state._metrics_collector.generate_latest.return_value = None\n        with mock.patch.object(self.handler, \"write\") as patched_write:\n            self.handler.get()\n            patched_write.assert_not_called()\n\n\nclass FromUtcTest(unittest.TestCase):\n    def test_with_microseconds(self):\n        \"\"\"Test parsing UTC time string 
with microseconds\"\"\"\n        result = luigi.server.from_utc(\"2021-01-15 10:30:45.123456\")\n        self.assertIsInstance(result, int)\n\n    def test_without_microseconds(self):\n        \"\"\"Test parsing UTC time string without microseconds\"\"\"\n        result = luigi.server.from_utc(\"2021-01-15 10:30:45\")\n        self.assertIsInstance(result, int)\n\n    def test_with_custom_format(self):\n        \"\"\"Test parsing with custom format\"\"\"\n        result = luigi.server.from_utc(\"01/15/2021\", fmt=\"%m/%d/%Y\")\n        self.assertIsInstance(result, int)\n\n    def test_invalid_format_raises_error(self):\n        \"\"\"Test that invalid format raises ValueError\"\"\"\n        with self.assertRaises(ValueError):\n            luigi.server.from_utc(\"invalid-date\")\n\n    def test_custom_format_mismatch_raises_error(self):\n        \"\"\"Test that mismatched custom format raises ValueError\"\"\"\n        with self.assertRaises(ValueError):\n            luigi.server.from_utc(\"2021-01-15\", fmt=\"%m/%d/%Y\")\n"
  },
  {
    "path": "test/set_task_name_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import unittest\n\nimport luigi\n\n\ndef create_class(cls_name):\n    class NewTask(luigi.WrapperTask):\n        pass\n\n    NewTask.__name__ = cls_name\n\n    return NewTask\n\n\ncreate_class(\"MyNewTask\")\n\n\nclass SetTaskNameTest(unittest.TestCase):\n    \"\"\"I accidentally introduced an issue in this commit:\n    https://github.com/spotify/luigi/commit/6330e9d0332e6152996292a39c42f752b9288c96\n\n    This causes tasks not to get exposed if they change name later. Adding a unit test\n    to resolve the issue.\"\"\"\n\n    def test_set_task_name(self):\n        luigi.run([\"--local-scheduler\", \"--no-lock\", \"MyNewTask\"])\n"
  },
  {
    "path": "test/setup_logging_test.py",
    "content": "from helpers import unittest\n\nfrom luigi.configuration import LuigiConfigParser, LuigiTomlParser, get_config\nfrom luigi.setup_logging import DaemonLogging, InterfaceLogging\n\n\nclass TestDaemonLogging(unittest.TestCase):\n    cls = DaemonLogging\n\n    def setUp(self):\n        self.cls._configured = False\n\n    def tearDown(self):\n        self.cls._configured = False\n        self.cls.config = get_config()\n\n    def test_cli(self):\n        opts = type(\"opts\", (), {})\n\n        opts.background = True\n        result = self.cls._cli(opts)\n        self.assertTrue(result)\n\n        opts.background = False\n        opts.logdir = \"./tests/\"\n        result = self.cls._cli(opts)\n        self.assertTrue(result)\n\n        opts.background = False\n        opts.logdir = False\n        result = self.cls._cli(opts)\n        self.assertFalse(result)\n\n    def test_section(self):\n        self.cls.config = {\n            \"logging\": {\n                \"version\": 1,\n                \"disable_existing_loggers\": False,\n                \"formatters\": {\n                    \"mockformatter\": {\n                        \"format\": \"{levelname}: {message}\",\n                        \"style\": \"{\",\n                        \"datefmt\": \"%Y-%m-%d %H:%M:%S\",\n                    },\n                },\n                \"handlers\": {\n                    \"mockhandler\": {\n                        \"class\": \"logging.StreamHandler\",\n                        \"level\": \"INFO\",\n                        \"formatter\": \"mockformatter\",\n                    },\n                },\n                \"loggers\": {\n                    \"mocklogger\": {\n                        \"handlers\": (\"mockhandler\",),\n                        \"level\": \"INFO\",\n                        \"disabled\": False,\n                        \"propagate\": False,\n                    },\n                },\n            },\n        }\n        result = 
self.cls._section(None)\n        self.assertTrue(result)\n\n        self.cls.config = LuigiTomlParser()\n        self.cls.config.read([\"./test/testconfig/luigi_logging.toml\"])\n        result = self.cls._section(None)\n        self.assertTrue(result)\n\n        self.cls.config = {}\n        result = self.cls._section(None)\n        self.assertFalse(result)\n\n    def test_section_cfg(self):\n        self.cls.config = LuigiConfigParser.instance()\n        result = self.cls._section(None)\n        self.assertFalse(result)\n\n    def test_cfg(self):\n        self.cls.config = LuigiTomlParser()\n        self.cls.config.data = {}\n        result = self.cls._conf(None)\n        self.assertFalse(result)\n\n        self.cls.config.data = {\"core\": {\"logging_conf_file\": \"./blah\"}}\n        with self.assertRaises(OSError):\n            self.cls._conf(None)\n\n        self.cls.config.data = {\n            \"core\": {\n                \"logging_conf_file\": \"./test/testconfig/logging.cfg\",\n            }\n        }\n        result = self.cls._conf(None)\n        self.assertTrue(result)\n\n    def test_default(self):\n        result = self.cls._default(None)\n        self.assertTrue(result)\n\n\nclass TestInterfaceLogging(TestDaemonLogging):\n    cls = InterfaceLogging\n\n    def test_cli(self):\n        opts = type(\"opts\", (), {})\n        result = self.cls._cli(opts)\n        self.assertFalse(result)\n\n    # test_section inherited from TestDaemonLogging\n\n    def test_cfg(self):\n        self.cls.config = LuigiTomlParser()\n        self.cls.config.data = {}\n\n        opts = type(\"opts\", (), {})\n        opts.logging_conf_file = \"\"\n        result = self.cls._conf(opts)\n        self.assertFalse(result)\n\n        opts.logging_conf_file = \"./blah\"\n        with self.assertRaises(OSError):\n            self.cls._conf(opts)\n\n        opts.logging_conf_file = \"./test/testconfig/logging.cfg\"\n        result = self.cls._conf(opts)\n        
self.assertTrue(result)\n\n    def test_default(self):\n        opts = type(\"opts\", (), {})\n        opts.log_level = \"INFO\"\n        result = self.cls._default(opts)\n        self.assertTrue(result)\n\n\nclass PatchedLogging(InterfaceLogging):\n    @classmethod\n    def _cli(cls, *args):\n        cls.calls.append(\"_cli\")\n        return \"_cli\" not in cls.patched\n\n    @classmethod\n    def _conf(cls, *args):\n        cls.calls.append(\"_conf\")\n        return \"_conf\" not in cls.patched\n\n    @classmethod\n    def _section(cls, *args):\n        cls.calls.append(\"_section\")\n        return \"_section\" not in cls.patched\n\n    @classmethod\n    def _default(cls, *args):\n        cls.calls.append(\"_default\")\n        return \"_default\" not in cls.patched\n\n\nclass TestSetup(unittest.TestCase):\n    def setUp(self):\n        self.opts = type(\"opts\", (), {})\n        self.cls = PatchedLogging\n        self.cls.calls = []\n        self.cls.config = LuigiTomlParser()\n        self.cls._configured = False\n        self.cls.patched = \"_cli\", \"_conf\", \"_section\", \"_default\"\n\n    def tearDown(self):\n        self.cls.config = get_config()\n\n    def test_configured(self):\n        self.cls._configured = True\n        result = self.cls.setup(self.opts)\n        self.assertEqual(self.cls.calls, [])\n        self.assertFalse(result)\n\n    def test_disabled(self):\n        self.cls.config.data = {\"core\": {\"no_configure_logging\": True}}\n        result = self.cls.setup(self.opts)\n        self.assertEqual(self.cls.calls, [])\n        self.assertFalse(result)\n\n    def test_order(self):\n        self.cls.setup(self.opts)\n        self.assertEqual(self.cls.calls, [\"_cli\", \"_conf\", \"_section\", \"_default\"])\n\n    def test_cli(self):\n        self.cls.patched = ()\n        result = self.cls.setup(self.opts)\n        self.assertTrue(result)\n        self.assertEqual(self.cls.calls, [\"_cli\"])\n\n    def test_conf(self):\n        
self.cls.patched = (\"_cli\",)\n        result = self.cls.setup(self.opts)\n        self.assertTrue(result)\n        self.assertEqual(self.cls.calls, [\"_cli\", \"_conf\"])\n\n    def test_section(self):\n        self.cls.patched = (\"_cli\", \"_conf\")\n        result = self.cls.setup(self.opts)\n        self.assertTrue(result)\n        self.assertEqual(self.cls.calls, [\"_cli\", \"_conf\", \"_section\"])\n\n    def test_default(self):\n        self.cls.patched = (\"_cli\", \"_conf\", \"_section\")\n        result = self.cls.setup(self.opts)\n        self.assertTrue(result)\n        self.assertEqual(self.cls.calls, [\"_cli\", \"_conf\", \"_section\", \"_default\"])\n"
  },
  {
    "path": "test/simulate_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\nimport tempfile\nfrom multiprocessing import Process\n\nfrom helpers import unittest\n\nimport luigi\nfrom luigi.contrib.simulate import RunAnywayTarget\n\n\ndef temp_dir():\n    return os.path.join(tempfile.gettempdir(), \"luigi-simulate\")\n\n\ndef is_writable():\n    d = temp_dir()\n    fn = os.path.join(d, \"luigi-simulate-write-test\")\n    exists = True\n    try:\n        try:\n            os.makedirs(d)\n        except OSError:\n            pass\n        open(fn, \"w\").close()\n        os.remove(fn)\n    except BaseException:\n        exists = False\n\n    return unittest.skipIf(not exists, \"Can't write to temporary directory\")\n\n\nclass TaskA(luigi.Task):\n    i = luigi.IntParameter(default=0)\n\n    def output(self):\n        return RunAnywayTarget(self)\n\n    def run(self):\n        fn = os.path.join(temp_dir(), \"luigi-simulate-test.tmp\")\n        try:\n            os.makedirs(os.path.dirname(fn))\n        except OSError:\n            pass\n\n        with open(fn, \"a\") as f:\n            f.write(\"{0}={1}\\n\".format(self.__class__.__name__, self.i))\n\n        self.output().done()\n\n\nclass TaskB(TaskA):\n    def requires(self):\n        return TaskA(i=10)\n\n\nclass TaskC(TaskA):\n    def requires(self):\n        return TaskA(i=5)\n\n\nclass TaskD(TaskA):\n    def requires(self):\n        return 
[TaskB(), TaskC(), TaskA(i=20)]\n\n\nclass TaskWrap(luigi.WrapperTask):\n    def requires(self):\n        return [TaskA(), TaskD()]\n\n\ndef reset():\n    # Force tasks to be executed again (because multiple pipelines are executed inside of the same process)\n    t = TaskA().output()\n    with t.unique.get_lock():\n        t.unique.value = 0\n\n\nclass RunAnywayTargetTest(unittest.TestCase):\n    @is_writable()\n    def test_output(self):\n        reset()\n\n        fn = os.path.join(temp_dir(), \"luigi-simulate-test.tmp\")\n\n        luigi.build([TaskWrap()], local_scheduler=True)\n        with open(fn, \"r\") as f:\n            data = f.read().strip().split(\"\\n\")\n\n        data.sort()\n        reference = [\"TaskA=0\", \"TaskA=10\", \"TaskA=20\", \"TaskA=5\", \"TaskB=0\", \"TaskC=0\", \"TaskD=0\"]\n        reference.sort()\n\n        os.remove(fn)\n        self.assertEqual(data, reference)\n\n    @is_writable()\n    def test_output_again(self):\n        # Running the test in another process because the PID is used to determine if the target exists\n        p = Process(target=self.test_output)\n        p.start()\n        p.join()\n"
  },
  {
    "path": "test/subtask_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport abc\n\nfrom helpers import unittest\n\nimport luigi\n\n\nclass AbstractTask(luigi.Task):\n    k = luigi.IntParameter()\n\n    @property\n    @abc.abstractmethod\n    def foo(self):\n        raise NotImplementedError\n\n    @abc.abstractmethod\n    def helper_function(self):\n        raise NotImplementedError\n\n    def run(self):\n        return \",\".join([self.foo, self.helper_function()])\n\n\nclass Implementation(AbstractTask):\n    @property\n    def foo(self):\n        return \"bar\"\n\n    def helper_function(self):\n        return \"hello\" * self.k\n\n\nclass AbstractSubclassTest(unittest.TestCase):\n    def test_instantiate_abstract(self):\n        def try_instantiate():\n            AbstractTask(k=1)\n\n        self.assertRaises(TypeError, try_instantiate)\n\n    def test_instantiate(self):\n        self.assertEqual(\"bar,hellohello\", Implementation(k=2).run())\n"
  },
  {
    "path": "test/target_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport random\nimport re\n\nfrom helpers import skipOnTravisAndGithubActions, unittest\nfrom mock import Mock\n\nimport luigi.format\nimport luigi.target\n\n\nclass TestException(Exception):\n    pass\n\n\nclass TargetTest(unittest.TestCase):\n    def test_cannot_instantiate(self):\n        def instantiate_target():\n            luigi.target.Target()\n\n        self.assertRaises(TypeError, instantiate_target)\n\n    def test_abstract_subclass(self):\n        class ExistsLessTarget(luigi.target.Target):\n            pass\n\n        def instantiate_target():\n            ExistsLessTarget()\n\n        self.assertRaises(TypeError, instantiate_target)\n\n    def test_instantiate_subclass(self):\n        class GoodTarget(luigi.target.Target):\n            def exists(self):\n                return True\n\n            def open(self, mode):\n                return None\n\n        GoodTarget()\n\n\nclass FileSystemTargetTestMixin:\n    \"\"\"All Target that take bytes (python2: str) should pass those\n    tests. 
In addition, a test to verify the method `exists`should be added\n    \"\"\"\n\n    def create_target(self, format=None):\n        raise NotImplementedError()\n\n    def assertCleanUp(self, tmp_path=\"\"):\n        pass\n\n    def test_atomicity(self):\n        target = self.create_target()\n\n        fobj = target.open(\"w\")\n        self.assertFalse(target.exists())\n        fobj.close()\n        self.assertTrue(target.exists())\n\n    def test_readback(self):\n        target = self.create_target()\n\n        origdata = \"lol\\n\"\n        fobj = target.open(\"w\")\n        fobj.write(origdata)\n        fobj.close()\n\n        fobj = target.open(\"r\")\n        data = fobj.read()\n        self.assertEqual(origdata, data)\n\n    def test_unicode_obj(self):\n        target = self.create_target()\n\n        origdata = \"lol\\n\"\n        fobj = target.open(\"w\")\n        fobj.write(origdata)\n        fobj.close()\n\n        fobj = target.open(\"r\")\n        data = fobj.read()\n        self.assertEqual(origdata, data)\n\n    def test_with_close(self):\n        target = self.create_target()\n\n        with target.open(\"w\") as fobj:\n            tp = getattr(fobj, \"tmp_path\", \"\")\n            fobj.write(\"hej\\n\")\n\n        self.assertCleanUp(tp)\n        self.assertTrue(target.exists())\n\n    def test_with_exception(self):\n        target = self.create_target()\n\n        a = {}\n\n        def foo():\n            with target.open(\"w\") as fobj:\n                fobj.write(\"hej\\n\")\n                a[\"tp\"] = getattr(fobj, \"tmp_path\", \"\")\n                raise TestException(\"Test triggered exception\")\n\n        self.assertRaises(TestException, foo)\n        self.assertCleanUp(a[\"tp\"])\n        self.assertFalse(target.exists())\n\n    def test_del(self):\n        t = self.create_target()\n        p = t.open(\"w\")\n        print(\"test\", file=p)\n        tp = getattr(p, \"tmp_path\", \"\")\n        del p\n\n        self.assertCleanUp(tp)\n    
    self.assertFalse(t.exists())\n\n    def test_write_cleanup_no_close(self):\n        t = self.create_target()\n\n        def context():\n            f = t.open(\"w\")\n            f.write(\"stuff\")\n            return getattr(f, \"tmp_path\", \"\")\n\n        tp = context()\n        import gc\n\n        gc.collect()  # force garbage collection of f variable\n        self.assertCleanUp(tp)\n        self.assertFalse(t.exists())\n\n    def test_text(self):\n        t = self.create_target(luigi.format.UTF8)\n        a = \"我éçф\"\n        with t.open(\"w\") as f:\n            f.write(a)\n        with t.open(\"r\") as f:\n            b = f.read()\n        self.assertEqual(a, b)\n\n    def test_del_with_Text(self):\n        t = self.create_target(luigi.format.UTF8)\n        p = t.open(\"w\")\n        print(\"test\", file=p)\n        tp = getattr(p, \"tmp_path\", \"\")\n        del p\n\n        self.assertCleanUp(tp)\n        self.assertFalse(t.exists())\n\n    def test_format_injection(self):\n        class CustomFormat(luigi.format.Format):\n            def pipe_reader(self, input_pipe):\n                input_pipe.foo = \"custom read property\"\n                return input_pipe\n\n            def pipe_writer(self, output_pipe):\n                output_pipe.foo = \"custom write property\"\n                return output_pipe\n\n        t = self.create_target(CustomFormat())\n        with t.open(\"w\") as f:\n            self.assertEqual(f.foo, \"custom write property\")\n\n        with t.open(\"r\") as f:\n            self.assertEqual(f.foo, \"custom read property\")\n\n    @skipOnTravisAndGithubActions(\"https://travis-ci.org/spotify/luigi/jobs/73693470\")\n    def test_binary_write(self):\n        t = self.create_target(luigi.format.Nop)\n        with t.open(\"w\") as f:\n            f.write(b\"a\\xf2\\xf3\\r\\nfd\")\n\n        with t.open(\"r\") as f:\n            c = f.read()\n\n        self.assertEqual(c, b\"a\\xf2\\xf3\\r\\nfd\")\n\n    def 
test_writelines(self):\n        t = self.create_target()\n        with t.open(\"w\") as f:\n            f.writelines(\n                [\n                    \"a\\n\",\n                    \"b\\n\",\n                    \"c\\n\",\n                ]\n            )\n\n        with t.open(\"r\") as f:\n            c = f.read()\n\n        self.assertEqual(c, \"a\\nb\\nc\\n\")\n\n    def test_read_iterator(self):\n        t = self.create_target()\n        with t.open(\"w\") as f:\n            f.write(\"a\\nb\\nc\\n\")\n\n        c = []\n        with t.open(\"r\") as f:\n            for x in f:\n                c.append(x)\n\n        self.assertEqual(c, [\"a\\n\", \"b\\n\", \"c\\n\"])\n\n    def test_gzip(self):\n        t = self.create_target(luigi.format.Gzip)\n        p = t.open(\"w\")\n        test_data = b\"test\"\n        p.write(test_data)\n        tp = getattr(p, \"tmp_path\", \"\")\n        self.assertFalse(t.exists())\n        p.close()\n        self.assertCleanUp(tp)\n        self.assertTrue(t.exists())\n\n    def test_gzip_works_and_cleans_up(self):\n        t = self.create_target(luigi.format.Gzip)\n\n        test_data = b\"123testing\"\n        with t.open(\"w\") as f:\n            tp = getattr(f, \"tmp_path\", \"\")\n            f.write(test_data)\n\n        self.assertCleanUp(tp)\n        with t.open() as f:\n            result = f.read()\n\n        self.assertEqual(test_data, result)\n\n    def test_move_on_fs(self):\n        # We're cheating and retrieving the fs from target.\n        # TODO: maybe move to \"filesystem_test.py\" or something\n        t = self.create_target()\n        other_path = t.path + \"-\" + str(random.randint(0, 999999999))\n        t._touchz()\n        fs = t.fs\n        self.assertTrue(t.exists())\n        fs.move(t.path, other_path)\n        self.assertFalse(t.exists())\n\n    def test_rename_dont_move_on_fs(self):\n        # We're cheating and retrieving the fs from target.\n        # TODO: maybe move to \"filesystem_test.py\" 
or something\n        t = self.create_target()\n        other_path = t.path + \"-\" + str(random.randint(0, 999999999))\n        t._touchz()\n        fs = t.fs\n        self.assertTrue(t.exists())\n        fs.rename_dont_move(t.path, other_path)\n        self.assertFalse(t.exists())\n        self.assertRaises(luigi.target.FileAlreadyExists, lambda: fs.rename_dont_move(t.path, other_path))\n\n\nclass TemporaryPathTest(unittest.TestCase):\n    def setUp(self):\n        super(TemporaryPathTest, self).setUp()\n        self.fs = Mock()\n\n        class MyFileSystemTarget(luigi.target.FileSystemTarget):\n            open = None  # Must be implemented due to abc stuff\n            fs = self.fs\n\n        self.target_cls = MyFileSystemTarget\n\n    def test_temporary_path_files(self):\n        target_outer = self.target_cls(\"/tmp/notreal.xls\")\n        target_inner = self.target_cls(\"/tmp/blah.txt\")\n\n        class MyException(Exception):\n            pass\n\n        orig_ex = MyException()\n        try:\n            with target_outer.temporary_path() as tmp_path_outer:\n                self.assertIn(\"notreal\", tmp_path_outer)\n                with target_inner.temporary_path() as tmp_path_inner:\n                    self.assertIn(\"blah\", tmp_path_inner)\n                    with target_inner.temporary_path() as tmp_path_inner_2:\n                        self.assertNotEqual(tmp_path_inner, tmp_path_inner_2)\n                    self.fs.rename_dont_move.assert_called_once_with(tmp_path_inner_2, target_inner.path)\n                self.fs.rename_dont_move.assert_called_with(tmp_path_inner, target_inner.path)\n                self.assertEqual(self.fs.rename_dont_move.call_count, 2)\n                raise orig_ex\n        except MyException as ex:\n            self.assertIs(ex, orig_ex)\n        else:\n            assert False\n        self.assertEqual(self.fs.rename_dont_move.call_count, 2)\n\n    def test_temporary_path_directory(self):\n        target_slash = 
self.target_cls(\"/tmp/dir/\")\n        target_noslash = self.target_cls(\"/tmp/dir\")\n\n        with target_slash.temporary_path() as tmp_path:\n            assert re.match(r\"/tmp/dir-luigi-tmp-\\d{10}/\", tmp_path)\n        self.fs.rename_dont_move.assert_called_once_with(tmp_path, target_slash.path)\n\n        with target_noslash.temporary_path() as tmp_path:\n            assert re.match(r\"/tmp/dir-luigi-tmp-\\d{10}\", tmp_path)\n        self.fs.rename_dont_move.assert_called_with(tmp_path, target_noslash.path)\n\n    def test_windowsish_dir(self):\n        target = self.target_cls(r\"\"\"C:\\my\\folder\"\"\" + \"\\\\\")\n        pattern = r\"\"\"C:\\\\my\\\\folder-luigi-tmp-\\d{10}\"\"\" + r\"\\\\\"\n\n        with target.temporary_path() as tmp_path:\n            assert re.match(pattern, tmp_path)\n        self.fs.rename_dont_move.assert_called_once_with(tmp_path, target.path)\n\n    def test_hadoopish_dir(self):\n        target = self.target_cls(r\"\"\"hdfs:///user/arash/myfile.uids\"\"\")\n\n        with target.temporary_path() as tmp_path:\n            assert re.match(r\"\"\"hdfs:///user/arash/myfile.uids-luigi-tmp-\\d{10}\"\"\", tmp_path)\n        self.fs.rename_dont_move.assert_called_once_with(tmp_path, target.path)\n\n    def test_creates_dir_for_file(self):\n        target = self.target_cls(\"/my/file/is/awesome.txt\")\n\n        with target.temporary_path():\n            self.fs.mkdir.assert_called_once_with(\"/my/file/is\", parents=True, raise_if_exists=False)\n\n    def test_creates_dir_for_dir(self):\n        target = self.target_cls(\"/my/dir/is/awesome/\")\n\n        with target.temporary_path():\n            self.fs.mkdir.assert_called_once_with(\"/my/dir/is\", parents=True, raise_if_exists=False)\n\n    def test_file_in_current_dir(self):\n        target = self.target_cls(\"foo.txt\")\n\n        with target.temporary_path() as tmp_path:\n            self.fs.mkdir.assert_not_called()  # there is no dir to create\n        
self.fs.rename_dont_move.assert_called_once_with(tmp_path, target.path)\n"
  },
  {
    "path": "test/task_bulk_complete_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2016 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import unittest\n\nfrom luigi import Parameter, Task\nfrom luigi.task import MixinNaiveBulkComplete\n\nCOMPLETE_TASKS = [\"A\", \"B\", \"C\"]\n\n\nclass MockTask(MixinNaiveBulkComplete, Task):\n    param_a = Parameter()\n    param_b = Parameter(default=\"Not Mandatory\")\n\n    def complete(self):\n        return self.param_a in COMPLETE_TASKS\n\n\nclass MixinNaiveBulkCompleteTest(unittest.TestCase):\n    \"\"\"\n    Test that the MixinNaiveBulkComplete can handle\n    input as\n     - iterable of parameters (for single param tasks)\n     - iterable of parameter tuples (for multi param tasks)\n     - iterable of parameter dicts (for multi param tasks)\n    \"\"\"\n\n    def test_single_arg_list(self):\n        single_arg_list = [\"A\", \"B\", \"x\"]\n        expected_single_arg_list = {p for p in single_arg_list if p in COMPLETE_TASKS}\n        self.assertEqual(expected_single_arg_list, set(MockTask.bulk_complete(single_arg_list)))\n\n    def test_multiple_arg_tuple(self):\n        multiple_arg_tuple = ((\"A\", \"1\"), (\"B\", \"2\"), (\"X\", \"3\"), (\"C\", \"2\"))\n        expected_multiple_arg_tuple = {p for p in multiple_arg_tuple if p[0] in COMPLETE_TASKS}\n        self.assertEqual(expected_multiple_arg_tuple, set(MockTask.bulk_complete(multiple_arg_tuple)))\n\n    def test_multiple_arg_dict(self):\n        
multiple_arg_dict = ({\"param_a\": \"X\", \"param_b\": \"1\"}, {\"param_a\": \"C\", \"param_b\": \"1\"})\n        expected_multiple_arg_dict = [p for p in multiple_arg_dict if p[\"param_a\"] in COMPLETE_TASKS]\n        self.assertEqual(expected_multiple_arg_dict, MockTask.bulk_complete(multiple_arg_dict))\n"
  },
  {
    "path": "test/task_forwarded_attributes_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import LuigiTestCase, RunOnceTask\n\nimport luigi\nimport luigi.scheduler\nimport luigi.worker\n\nFORWARDED_ATTRIBUTES = set(luigi.worker.TaskProcess.forward_reporter_attributes.values())\n\n\nclass NonYieldingTask(RunOnceTask):\n    # need to accept messages in order for the \"scheduler_message\" attribute to be not None\n    accepts_messages = True\n\n    def gather_forwarded_attributes(self):\n        \"\"\"\n        Returns a set of names of attributes that are forwarded by the TaskProcess and that are not\n        *None*. The tests in this file check if and which attributes are present at different times,\n        e.g. 
while running, or before and after a dynamic dependency was yielded.\n        \"\"\"\n        attrs = set()\n        for attr in FORWARDED_ATTRIBUTES:\n            if getattr(self, attr, None) is not None:\n                attrs.add(attr)\n        return attrs\n\n    def run(self):\n        # store names of forwarded attributes which are only available within the run method\n        self.attributes_while_running = self.gather_forwarded_attributes()\n\n        # invoke the run method of the RunOnceTask which marks this task as complete\n        RunOnceTask.run(self)\n\n\nclass YieldingTask(NonYieldingTask):\n    def run(self):\n        # as TaskProcess._run_get_new_deps handles generators in a specific way, store names of\n        # forwarded attributes before and after yielding a dynamic dependency, so we can explicitly\n        # validate the attribute forwarding implementation\n        self.attributes_before_yield = self.gather_forwarded_attributes()\n        yield RunOnceTask()\n        self.attributes_after_yield = self.gather_forwarded_attributes()\n\n        # invoke the run method of the RunOnceTask which marks this task as complete\n        RunOnceTask.run(self)\n\n\nclass TaskForwardedAttributesTest(LuigiTestCase):\n    def run_task(self, task):\n        sch = luigi.scheduler.Scheduler()\n        with luigi.worker.Worker(scheduler=sch) as w:\n            w.add(task)\n            w.run()\n        return task\n\n    def test_non_yielding_task(self):\n        task = self.run_task(NonYieldingTask())\n\n        self.assertEqual(task.attributes_while_running, FORWARDED_ATTRIBUTES)\n\n    def test_yielding_task(self):\n        task = self.run_task(YieldingTask())\n\n        self.assertEqual(task.attributes_before_yield, FORWARDED_ATTRIBUTES)\n        self.assertEqual(task.attributes_after_yield, FORWARDED_ATTRIBUTES)\n"
  },
  {
    "path": "test/task_history_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import LuigiTestCase\n\nimport luigi\nimport luigi.scheduler\nimport luigi.task_history\nimport luigi.worker\n\nluigi.notifications.DEBUG = True\n\n\nclass SimpleTaskHistory(luigi.task_history.TaskHistory):\n    def __init__(self):\n        self.actions = []\n\n    def task_scheduled(self, task):\n        self.actions.append((\"scheduled\", task.id))\n\n    def task_finished(self, task, successful):\n        self.actions.append((\"finished\", task.id))\n\n    def task_started(self, task, worker_host):\n        self.actions.append((\"started\", task.id))\n\n\nclass TaskHistoryTest(LuigiTestCase):\n    def test_run(self):\n        th = SimpleTaskHistory()\n        sch = luigi.scheduler.Scheduler(task_history_impl=th)\n        with luigi.worker.Worker(scheduler=sch) as w:\n\n            class MyTask(luigi.Task):\n                pass\n\n            task = MyTask()\n            w.add(task)\n            w.run()\n\n            self.assertEqual(th.actions, [(\"scheduled\", task.task_id), (\"started\", task.task_id), (\"finished\", task.task_id)])\n"
  },
  {
    "path": "test/task_progress_percentage_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import LuigiTestCase\n\nimport luigi\nimport luigi.scheduler\nimport luigi.worker\n\n\nclass TaskProgressPercentageTest(LuigiTestCase):\n    def test_run(self):\n        sch = luigi.scheduler.Scheduler()\n        with luigi.worker.Worker(scheduler=sch) as w:\n\n            class MyTask(luigi.Task):\n                def run(self):\n                    self.set_progress_percentage(30)\n\n            task = MyTask()\n            w.add(task)\n            w.run()\n\n            self.assertEqual(sch.get_task_progress_percentage(task.task_id)[\"progressPercentage\"], 30)\n"
  },
  {
    "path": "test/task_register_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2017 VNG Corporation\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nfrom helpers import LuigiTestCase\n\nimport luigi\nfrom luigi.task_register import (\n    Register,\n    TaskClassAmbigiousException,\n    TaskClassNotFoundException,\n)\n\n\nclass TaskRegisterTest(LuigiTestCase):\n    def test_externalize_taskclass(self):\n        with self.assertRaises(TaskClassNotFoundException):\n            Register.get_task_cls(\"scooby.Doo\")\n\n        class Task1(luigi.Task):\n            @classmethod\n            def get_task_family(cls):\n                return \"scooby.Doo\"\n\n        self.assertEqual(Task1, Register.get_task_cls(\"scooby.Doo\"))\n\n        class Task2(luigi.Task):\n            @classmethod\n            def get_task_family(cls):\n                return \"scooby.Doo\"\n\n        with self.assertRaises(TaskClassAmbigiousException):\n            Register.get_task_cls(\"scooby.Doo\")\n\n        class Task3(luigi.Task):\n            @classmethod\n            def get_task_family(cls):\n                return \"scooby.Doo\"\n\n        # There previously was a rare bug where the third installed class could\n        # \"undo\" class ambiguity.\n        with self.assertRaises(TaskClassAmbigiousException):\n            Register.get_task_cls(\"scooby.Doo\")\n"
  },
  {
    "path": "test/task_running_resources_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport multiprocessing\nimport os\nimport signal\nimport time\nfrom contextlib import contextmanager\n\nfrom helpers import RunOnceTask, skipOnGithubActions, unittest, with_config\n\nimport luigi\nimport luigi.server\n\n\nclass ResourceTestTask(RunOnceTask):\n    param = luigi.Parameter()\n    reduce_foo = luigi.BoolParameter()\n\n    def process_resources(self):\n        return {\"foo\": 2}\n\n    def run(self):\n        if self.reduce_foo:\n            self.decrease_running_resources({\"foo\": 1})\n\n        time.sleep(2)\n\n        super(ResourceTestTask, self).run()\n\n\nclass ResourceWrapperTask(RunOnceTask):\n    reduce_foo = ResourceTestTask.reduce_foo\n\n    def requires(self):\n        return [\n            ResourceTestTask(param=\"a\", reduce_foo=self.reduce_foo),\n            ResourceTestTask(param=\"b\"),\n        ]\n\n\nclass LocalRunningResourcesTest(unittest.TestCase):\n    def test_resource_reduction(self):\n        # trivial resource reduction on local scheduler\n        # test the running_task_resources setter and getter\n        sch = luigi.scheduler.Scheduler(resources={\"foo\": 2})\n\n        with luigi.worker.Worker(scheduler=sch) as w:\n            task = ResourceTestTask(param=\"a\", reduce_foo=True)\n\n            w.add(task)\n            w.run()\n\n            
self.assertEqual(sch.get_running_task_resources(task.task_id)[\"resources\"][\"foo\"], 1)\n\n\nclass ConcurrentRunningResourcesTest(unittest.TestCase):\n    @with_config({\"scheduler\": {\"stable_done_cooldown_secs\": \"0\"}})\n    def setUp(self):\n        super(ConcurrentRunningResourcesTest, self).setUp()\n\n        # run the luigi server in a new process and wait for its startup\n        self._process = multiprocessing.Process(target=luigi.server.run)\n        self._process.start()\n        time.sleep(0.5)\n\n        # configure the rpc scheduler, update the foo resource\n        self.sch = luigi.rpc.RemoteScheduler()\n        self.sch.update_resource(\"foo\", 3)\n\n    def tearDown(self):\n        super(ConcurrentRunningResourcesTest, self).tearDown()\n\n        # graceful server shutdown\n        self._process.terminate()\n        self._process.join(timeout=1)\n        if self._process.is_alive():\n            os.kill(self._process.pid, signal.SIGKILL)\n\n    @contextmanager\n    def worker(self, scheduler=None, processes=2):\n        with luigi.worker.Worker(scheduler=scheduler or self.sch, worker_processes=processes) as w:\n            w._config.wait_interval = 0.2\n            w._config.check_unfulfilled_deps = False\n            yield w\n\n    @contextmanager\n    def assert_duration(self, min_duration=0, max_duration=-1):\n        t0 = time.time()\n        try:\n            yield\n        finally:\n            duration = time.time() - t0\n            self.assertGreater(duration, min_duration)\n            if max_duration > 0:\n                self.assertLess(duration, max_duration)\n\n    def test_tasks_serial(self):\n        # serial test\n        # run two tasks that do not reduce the \"foo\" resource\n        # as the total foo resource (3) is smaller than the requirement of two tasks (4),\n        # the scheduler is forced to run them serially which takes longer than 4 seconds\n        with self.worker() as w:\n            
w.add(ResourceWrapperTask(reduce_foo=False))\n\n            with self.assert_duration(min_duration=4):\n                w.run()\n\n    @skipOnGithubActions(\"Temporary skipping on GH actions\")  # TODO: Fix and remove skip\n    def test_tasks_parallel(self):\n        # parallel test\n        # run two tasks and the first one lowers its requirement on the \"foo\" resource, so that\n        # the total \"foo\" resource (3) is sufficient to run both tasks in parallel shortly after\n        # the first task started, so the entire process should not exceed 4 seconds\n        with self.worker() as w:\n            w.add(ResourceWrapperTask(reduce_foo=True))\n\n            with self.assert_duration(max_duration=4):\n                w.run()\n"
  },
  {
    "path": "test/task_serialize_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\"\"\"\nWe want to test that task_id is consistent when generated from:\n\n 1. A real task instance\n 2. The task_family and a dictionary of parameter values (as strings)\n 3. A json representation of #2\n\nWe use the hypothesis package to do property-based tests.\n\n\"\"\"\n\nimport json\nimport string\nfrom datetime import datetime\n\nimport hypothesis as hyp\nfrom hypothesis.strategies import datetimes as hyp_datetimes\n\nimport luigi\n\n_no_value = luigi.parameter._no_value\n\n\ndef _mk_param_strategy(param_cls, param_value_strat, with_default=None):\n    if with_default is None:\n        default = hyp.strategies.one_of(hyp.strategies.just(_no_value), param_value_strat)\n    elif with_default:\n        default = param_value_strat\n    else:\n        default = hyp.strategies.just(_no_value)\n\n    return hyp.strategies.builds(param_cls, description=hyp.strategies.text(alphabet=string.printable), default=default)\n\n\ndef _mk_task(name, params):\n    return type(name, (luigi.Task,), params)\n\n\n# identifiers must be str not unicode in Python2\nidentifiers = hyp.strategies.builds(str, hyp.strategies.text(alphabet=string.ascii_letters, min_size=1, max_size=16))\ntext = hyp.strategies.text(alphabet=string.printable)\n\n# Luigi parameters with a default\nparameters_def = _mk_param_strategy(luigi.Parameter, text, True)\nint_parameters_def = 
_mk_param_strategy(luigi.IntParameter, hyp.strategies.integers(), True)\nfloat_parameters_def = _mk_param_strategy(luigi.FloatParameter, hyp.strategies.floats(min_value=-1e100, max_value=+1e100), True)\nbool_parameters_def = _mk_param_strategy(luigi.BoolParameter, hyp.strategies.booleans(), True)\ndate_parameters_def = _mk_param_strategy(luigi.DateParameter, hyp_datetimes(min_value=datetime(1900, 1, 1)), True)\n\nany_default_parameters = hyp.strategies.one_of(parameters_def, int_parameters_def, float_parameters_def, bool_parameters_def, date_parameters_def)\n\n# Tasks with up to 3 random parameters\ntasks_with_defaults = hyp.strategies.builds(_mk_task, name=identifiers, params=hyp.strategies.dictionaries(identifiers, any_default_parameters, max_size=3))\n\n\ndef _task_to_dict(task):\n    # Generate the parameter value dictionary.  Use each parameter's serialize() method\n    param_dict = {}\n    for key, param in task.get_params():\n        param_dict[key] = param.serialize(getattr(task, key))\n\n    return param_dict\n\n\ndef _task_from_dict(task_cls, param_dict):\n    # Regenerate the task from the dictionary\n    task_params = {}\n    for key, param in task_cls.get_params():\n        task_params[key] = param.parse(param_dict[key])\n\n    return task_cls(**task_params)\n\n\n@hyp.given(tasks_with_defaults)\ndef test_serializable(task_cls):\n    task = task_cls()\n\n    param_dict = _task_to_dict(task)\n    task2 = _task_from_dict(task_cls, param_dict)\n\n    assert task.task_id == task2.task_id\n\n\n@hyp.given(tasks_with_defaults)\ndef test_json_serializable(task_cls):\n    task = task_cls()\n\n    param_dict = _task_to_dict(task)\n\n    param_dict = json.loads(json.dumps(param_dict))\n    task2 = _task_from_dict(task_cls, param_dict)\n\n    assert task.task_id == task2.task_id\n\n\n@hyp.given(tasks_with_defaults)\ndef test_task_id_alphanumeric(task_cls):\n    task = task_cls()\n    task_id = task.task_id\n    valid = string.ascii_letters + string.digits + 
\"_\"\n\n    assert [x for x in task_id if x not in valid] == []\n\n\n# TODO : significant and non-significant parameters\n"
  },
  {
    "path": "test/task_status_message_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nfrom helpers import LuigiTestCase\n\nimport luigi\nimport luigi.scheduler\nimport luigi.worker\n\nluigi.notifications.DEBUG = True\n\n\nclass TaskStatusMessageTest(LuigiTestCase):\n    def test_run(self):\n        message = \"test message\"\n        sch = luigi.scheduler.Scheduler()\n        with luigi.worker.Worker(scheduler=sch) as w:\n\n            class MyTask(luigi.Task):\n                def run(self):\n                    self.set_status_message(message)\n\n            task = MyTask()\n            w.add(task)\n            w.run()\n\n            self.assertEqual(sch.get_task_status_message(task.task_id)[\"statusMessage\"], message)\n"
  },
  {
    "path": "test/task_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport collections\nimport doctest\nimport pickle\nimport warnings\nfrom datetime import datetime, timedelta\n\nfrom helpers import LuigiTestCase, unittest, with_config\n\nimport luigi\nimport luigi.task\nimport luigi.util\nfrom luigi.task_register import load_task\n\n\nclass DummyTask(luigi.Task):\n    param = luigi.Parameter()\n    bool_param = luigi.BoolParameter()\n    int_param = luigi.IntParameter()\n    float_param = luigi.FloatParameter()\n    date_param = luigi.DateParameter()\n    datehour_param = luigi.DateHourParameter()\n    timedelta_param = luigi.TimeDeltaParameter()\n    insignificant_param = luigi.Parameter(significant=False)\n\n\nDUMMY_TASK_OK_PARAMS = dict(\n    param=\"test\",\n    bool_param=True,\n    int_param=666,\n    float_param=123.456,\n    date_param=datetime(2014, 9, 13).date(),\n    datehour_param=datetime(2014, 9, 13, 9),\n    timedelta_param=timedelta(44),  # doesn't support seconds\n    insignificant_param=\"test\",\n)\n\n\nclass DefaultInsignificantParamTask(luigi.Task):\n    insignificant_param = luigi.Parameter(significant=False, default=\"value\")\n    necessary_param = luigi.Parameter(significant=False)\n\n\nclass TaskTest(unittest.TestCase):\n    def test_tasks_doctest(self):\n        doctest.testmod(luigi.task)\n\n    def test_task_to_str_to_task(self):\n        original = 
DummyTask(**DUMMY_TASK_OK_PARAMS)\n        other = DummyTask.from_str_params(original.to_str_params())\n        self.assertEqual(original, other)\n\n    def test_task_from_str_insignificant(self):\n        params = {\"necessary_param\": \"needed\"}\n        original = DefaultInsignificantParamTask(**params)\n        other = DefaultInsignificantParamTask.from_str_params(params)\n        self.assertEqual(original, other)\n\n    def test_task_missing_necessary_param(self):\n        with self.assertRaises(luigi.parameter.MissingParameterException):\n            DefaultInsignificantParamTask.from_str_params({})\n\n    def test_external_tasks_loadable(self):\n        task = load_task(\"luigi\", \"ExternalTask\", {})\n        self.assertTrue(isinstance(task, luigi.ExternalTask))\n\n    def test_getpaths(self):\n        class RequiredTask(luigi.Task):\n            def output(self):\n                return luigi.LocalTarget(\"/path/to/target/file\")\n\n        t = RequiredTask()\n        reqs = {}\n        reqs[\"bare\"] = t\n        reqs[\"dict\"] = {\"key\": t}\n        reqs[\"OrderedDict\"] = collections.OrderedDict([(\"key\", t)])\n        reqs[\"list\"] = [t]\n        reqs[\"tuple\"] = (t,)\n        reqs[\"generator\"] = (t for _ in range(10))\n\n        struct = luigi.task.getpaths(reqs)\n        self.assertIsInstance(struct, dict)\n        self.assertIsInstance(struct[\"bare\"], luigi.Target)\n        self.assertIsInstance(struct[\"dict\"], dict)\n        self.assertIsInstance(struct[\"OrderedDict\"], collections.OrderedDict)\n        self.assertIsInstance(struct[\"list\"], list)\n        self.assertIsInstance(struct[\"tuple\"], tuple)\n        self.assertTrue(hasattr(struct[\"generator\"], \"__iter__\"))\n\n    def test_flatten(self):\n        flatten = luigi.task.flatten\n        self.assertEqual(sorted(flatten({\"a\": \"foo\", \"b\": \"bar\"})), [\"bar\", \"foo\"])\n        self.assertEqual(sorted(flatten([\"foo\", [\"bar\", \"troll\"]])), [\"bar\", \"foo\", 
\"troll\"])\n        self.assertEqual(flatten(\"foo\"), [\"foo\"])\n        self.assertEqual(flatten(42), [42])\n        self.assertEqual(flatten((len(i) for i in [\"foo\", \"troll\"])), [3, 5])\n        self.assertRaises(TypeError, flatten, (len(i) for i in [\"foo\", \"troll\", None]))\n\n    def test_externalized_task_picklable(self):\n        task = luigi.task.externalize(luigi.Task())\n        pickled_task = pickle.dumps(task)\n        self.assertEqual(task, pickle.loads(pickled_task))\n\n    def test_no_unpicklable_properties(self):\n        task = luigi.Task()\n        task.set_tracking_url = lambda tracking_url: tracking_url\n        task.set_status_message = lambda message: message\n        with task.no_unpicklable_properties():\n            pickle.dumps(task)\n        self.assertIsNotNone(task.set_tracking_url)\n        self.assertIsNotNone(task.set_status_message)\n        tracking_url = task.set_tracking_url(\"http://test.luigi.com/\")\n        self.assertEqual(tracking_url, \"http://test.luigi.com/\")\n        message = task.set_status_message(\"message\")\n        self.assertEqual(message, \"message\")\n\n    def test_no_warn_if_param_types_ok(self):\n        with warnings.catch_warnings(record=True) as w:\n            DummyTask(**DUMMY_TASK_OK_PARAMS)\n        self.assertEqual(len(w), 0, msg=\"No warning should be raised when correct parameter types are used\")\n\n    def test_warn_on_non_str_param(self):\n        params = dict(**DUMMY_TASK_OK_PARAMS)\n        params[\"param\"] = 42\n        with self.assertWarnsRegex(UserWarning, 'Parameter \"param\" with value \"42\" is not of type string.'):\n            DummyTask(**params)\n\n    def test_warn_on_non_timedelta_param(self):\n        params = dict(**DUMMY_TASK_OK_PARAMS)\n\n        class MockTimedelta:\n            days = 1\n            seconds = 1\n\n        params[\"timedelta_param\"] = MockTimedelta()\n        with self.assertWarnsRegex(UserWarning, 'Parameter \"timedelta_param\" with value 
\".*\" is not of type timedelta.'):\n            DummyTask(**params)\n\n    def test_disable_window_seconds(self):\n        \"\"\"\n        Deprecated disable_window_seconds param uses disable_window value\n        \"\"\"\n\n        class ATask(luigi.Task):\n            disable_window = 17\n\n        task = ATask()\n        self.assertEqual(task.disable_window_seconds, 17)\n\n    @with_config({\"ATaskWithBadParam\": {\"bad_param\": \"bad_value\"}})\n    def test_bad_param(self):\n        class ATaskWithBadParam(luigi.Task):\n            bad_param = luigi.IntParameter()\n\n        with self.assertRaisesRegex(ValueError, r\"ATaskWithBadParam\\[args=\\(\\), kwargs={}\\]: Error when parsing the default value of 'bad_param'\"):\n            ATaskWithBadParam()\n\n    @with_config(\n        {\n            \"TaskA\": {\n                \"a\": \"a\",\n                \"b\": \"b\",\n                \"c\": \"c\",\n            },\n            \"TaskB\": {\n                \"a\": \"a\",\n                \"b\": \"b\",\n                \"c\": \"c\",\n            },\n        }\n    )\n    def test_unconsumed_params(self):\n        class TaskA(luigi.Task):\n            a = luigi.Parameter(default=\"a\")\n\n        class TaskB(luigi.Task):\n            a = luigi.Parameter(default=\"a\")\n\n        with warnings.catch_warnings(record=True) as w:\n            warnings.filterwarnings(\n                action=\"ignore\",\n                category=Warning,\n            )\n            warnings.simplefilter(\n                action=\"always\",\n                category=luigi.parameter.UnconsumedParameterWarning,\n            )\n\n            TaskA()\n            TaskB()\n\n            assert len(w) == 4\n            expected = [\n                (\"b\", \"TaskA\"),\n                (\"c\", \"TaskA\"),\n                (\"b\", \"TaskB\"),\n                (\"c\", \"TaskB\"),\n            ]\n            for i, (expected_value, task_name) in zip(w, expected):\n                assert 
issubclass(i.category, luigi.parameter.UnconsumedParameterWarning)\n                assert str(i.message) == (\n                    f\"The configuration contains the parameter '{expected_value}' with value '{expected_value}' that is not consumed by the task '{task_name}'.\"\n                )\n\n    @with_config(\n        {\n            \"TaskEdgeCase\": {\n                \"camelParam\": \"camelCase\",\n                \"underscore_param\": \"underscore\",\n                \"dash-param\": \"dash\",\n            },\n        }\n    )\n    def test_unconsumed_params_edge_cases(self):\n        class TaskEdgeCase(luigi.Task):\n            camelParam = luigi.Parameter()\n            underscore_param = luigi.Parameter()\n            dash_param = luigi.Parameter()\n\n        with warnings.catch_warnings(record=True) as w:\n            warnings.filterwarnings(\n                action=\"ignore\",\n                category=Warning,\n            )\n            warnings.simplefilter(\n                action=\"always\",\n                category=luigi.parameter.UnconsumedParameterWarning,\n            )\n\n            task = TaskEdgeCase()\n            assert len(w) == 0\n            assert task.camelParam == \"camelCase\"\n            assert task.underscore_param == \"underscore\"\n            assert task.dash_param == \"dash\"\n\n    @with_config(\n        {\n            \"TaskIgnoreUnconsumed\": {\n                \"a\": \"a\",\n                \"b\": \"b\",\n                \"c\": \"c\",\n            },\n        }\n    )\n    def test_unconsumed_params_ignore_unconsumed(self):\n        class TaskIgnoreUnconsumed(luigi.Task):\n            ignore_unconsumed = {\"b\", \"d\"}\n\n            a = luigi.Parameter()\n\n        with warnings.catch_warnings(record=True) as w:\n            warnings.filterwarnings(\n                action=\"ignore\",\n                category=Warning,\n            )\n            warnings.simplefilter(\n                action=\"always\",\n               
 category=luigi.parameter.UnconsumedParameterWarning,\n            )\n\n            TaskIgnoreUnconsumed()\n            assert len(w) == 1\n\n\nclass TaskFlattenOutputTest(unittest.TestCase):\n    def test_single_task(self):\n        expected = [luigi.LocalTarget(\"f1.txt\"), luigi.LocalTarget(\"f2.txt\")]\n\n        class TestTask(luigi.ExternalTask):\n            def output(self):\n                return expected\n\n        self.assertListEqual(luigi.task.flatten_output(TestTask()), expected)\n\n    def test_wrapper_task(self):\n        expected = [luigi.LocalTarget(\"f1.txt\"), luigi.LocalTarget(\"f2.txt\")]\n\n        class Test1Task(luigi.ExternalTask):\n            def output(self):\n                return expected[0]\n\n        class Test2Task(luigi.ExternalTask):\n            def output(self):\n                return expected[1]\n\n        @luigi.util.requires(Test1Task, Test2Task)\n        class TestWrapperTask(luigi.WrapperTask):\n            pass\n\n        self.assertListEqual(luigi.task.flatten_output(TestWrapperTask()), expected)\n\n    def test_wrapper_tasks_diamond(self):\n        expected = [luigi.LocalTarget(\"file.txt\")]\n\n        class TestTask(luigi.ExternalTask):\n            def output(self):\n                return expected\n\n        @luigi.util.requires(TestTask)\n        class LeftWrapperTask(luigi.WrapperTask):\n            pass\n\n        @luigi.util.requires(TestTask)\n        class RightWrapperTask(luigi.WrapperTask):\n            pass\n\n        @luigi.util.requires(LeftWrapperTask, RightWrapperTask)\n        class MasterWrapperTask(luigi.WrapperTask):\n            pass\n\n        self.assertListEqual(luigi.task.flatten_output(MasterWrapperTask()), expected)\n\n\nclass ExternalizeTaskTest(LuigiTestCase):\n    def test_externalize_taskclass(self):\n        class MyTask(luigi.Task):\n            def run(self):\n                pass\n\n        self.assertIsNotNone(MyTask.run)  # Assert what we believe\n        task_object = 
luigi.task.externalize(MyTask)()\n        self.assertIsNone(task_object.run)\n        self.assertIsNotNone(MyTask.run)  # Check immutability\n        self.assertIsNotNone(MyTask().run)  # Check immutability\n\n    def test_externalize_taskobject(self):\n        class MyTask(luigi.Task):\n            def run(self):\n                pass\n\n        task_object = luigi.task.externalize(MyTask())\n        self.assertIsNone(task_object.run)\n        self.assertIsNotNone(MyTask.run)  # Check immutability\n        self.assertIsNotNone(MyTask().run)  # Check immutability\n\n    def test_externalize_taskclass_readable_name(self):\n        class MyTask(luigi.Task):\n            def run(self):\n                pass\n\n        task_class = luigi.task.externalize(MyTask)\n        self.assertIsNot(task_class, MyTask)\n        self.assertIn(\"MyTask\", task_class.__name__)\n\n    def test_externalize_taskclass_instance_cache(self):\n        class MyTask(luigi.Task):\n            def run(self):\n                pass\n\n        task_class = luigi.task.externalize(MyTask)\n        self.assertIsNot(task_class, MyTask)\n        self.assertIs(MyTask(), MyTask())  # Assert it have enabled the instance caching\n        self.assertIsNot(task_class(), MyTask())  # Now, they should not be the same of course\n\n    def test_externalize_same_id(self):\n        class MyTask(luigi.Task):\n            def run(self):\n                pass\n\n        task_normal = MyTask()\n        task_ext_1 = luigi.task.externalize(MyTask)()\n        task_ext_2 = luigi.task.externalize(MyTask())\n        self.assertEqual(task_normal.task_id, task_ext_1.task_id)\n        self.assertEqual(task_normal.task_id, task_ext_2.task_id)\n\n    def test_externalize_same_id_with_task_namespace(self):\n        # Dependent on the new behavior from spotify/luigi#1953\n        class MyTask(luigi.Task):\n            task_namespace = \"something.domething\"\n\n            def run(self):\n                pass\n\n        
task_normal = MyTask()\n        task_ext_1 = luigi.task.externalize(MyTask())\n        task_ext_2 = luigi.task.externalize(MyTask)()\n        self.assertEqual(task_normal.task_id, task_ext_1.task_id)\n        self.assertEqual(task_normal.task_id, task_ext_2.task_id)\n        self.assertEqual(str(task_normal), str(task_ext_1))\n        self.assertEqual(str(task_normal), str(task_ext_2))\n\n    def test_externalize_same_id_with_luigi_namespace(self):\n        # Dependent on the new behavior from spotify/luigi#1953\n        luigi.namespace(\"lets.externalize\")\n\n        class MyTask(luigi.Task):\n            def run(self):\n                pass\n\n        luigi.namespace()\n\n        task_normal = MyTask()\n        task_ext_1 = luigi.task.externalize(MyTask())\n        task_ext_2 = luigi.task.externalize(MyTask)()\n        self.assertEqual(task_normal.task_id, task_ext_1.task_id)\n        self.assertEqual(task_normal.task_id, task_ext_2.task_id)\n        self.assertEqual(str(task_normal), str(task_ext_1))\n        self.assertEqual(str(task_normal), str(task_ext_2))\n\n    def test_externalize_with_requires(self):\n        class MyTask(luigi.Task):\n            def run(self):\n                pass\n\n        @luigi.util.requires(luigi.task.externalize(MyTask))\n        class Requirer(luigi.Task):\n            def run(self):\n                pass\n\n        self.assertIsNotNone(MyTask.run)  # Check immutability\n        self.assertIsNotNone(MyTask().run)  # Check immutability\n\n    def test_externalize_doesnt_affect_the_registry(self):\n        class MyTask(luigi.Task):\n            pass\n\n        reg_orig = luigi.task_register.Register._get_reg()\n        luigi.task.externalize(MyTask)\n        reg_afterwards = luigi.task_register.Register._get_reg()\n        self.assertEqual(reg_orig, reg_afterwards)\n\n    def test_can_uniquely_command_line_parse(self):\n        class MyTask(luigi.Task):\n            pass\n\n        # This first check is just an assumption rather 
than assertion\n        self.assertTrue(self.run_locally([\"MyTask\"]))\n        luigi.task.externalize(MyTask)\n        # Now we check we don't encounter \"ambiguous task\" issues\n        self.assertTrue(self.run_locally([\"MyTask\"]))\n        # We do this once again, is there previously was a bug like this.\n        luigi.task.externalize(MyTask)\n        self.assertTrue(self.run_locally([\"MyTask\"]))\n\n\nclass TaskNamespaceTest(LuigiTestCase):\n    def setup_tasks(self):\n        class Foo(luigi.Task):\n            pass\n\n        class FooSubclass(Foo):\n            pass\n\n        return (Foo, FooSubclass, self.go_mynamespace())\n\n    def go_mynamespace(self):\n        luigi.namespace(\"mynamespace\")\n\n        class Foo(luigi.Task):\n            p = luigi.IntParameter()\n\n        class Bar(Foo):\n            task_namespace = \"othernamespace\"  # namespace override\n\n        class Baz(Bar):  # inherits namespace for Bar\n            pass\n\n        luigi.namespace()\n        return collections.namedtuple(\"mynamespace\", \"Foo Bar Baz\")(Foo, Bar, Baz)\n\n    def test_vanilla(self):\n        (Foo, FooSubclass, namespace_test_helper) = self.setup_tasks()\n        self.assertEqual(Foo.task_family, \"Foo\")\n        self.assertEqual(str(Foo()), \"Foo()\")\n\n        self.assertEqual(FooSubclass.task_family, \"FooSubclass\")\n        self.assertEqual(str(FooSubclass()), \"FooSubclass()\")\n\n    def test_namespace(self):\n        (Foo, FooSubclass, namespace_test_helper) = self.setup_tasks()\n        self.assertEqual(namespace_test_helper.Foo.task_family, \"mynamespace.Foo\")\n        self.assertEqual(str(namespace_test_helper.Foo(1)), \"mynamespace.Foo(p=1)\")\n\n        self.assertEqual(namespace_test_helper.Bar.task_namespace, \"othernamespace\")\n        self.assertEqual(namespace_test_helper.Bar.task_family, \"othernamespace.Bar\")\n        self.assertEqual(str(namespace_test_helper.Bar(1)), \"othernamespace.Bar(p=1)\")\n\n        
self.assertEqual(namespace_test_helper.Baz.task_namespace, \"othernamespace\")\n        self.assertEqual(namespace_test_helper.Baz.task_family, \"othernamespace.Baz\")\n        self.assertEqual(str(namespace_test_helper.Baz(1)), \"othernamespace.Baz(p=1)\")\n\n    def test_uses_latest_namespace(self):\n        luigi.namespace(\"a\")\n\n        class _BaseTask(luigi.Task):\n            pass\n\n        luigi.namespace(\"b\")\n\n        class _ChildTask(_BaseTask):\n            pass\n\n        luigi.namespace()  # Reset everything\n        child_task = _ChildTask()\n        self.assertEqual(child_task.task_family, \"b._ChildTask\")\n        self.assertEqual(str(child_task), \"b._ChildTask()\")\n\n    def test_with_scope(self):\n        luigi.namespace(\"wohoo\", scope=\"task_test\")\n        luigi.namespace(\"bleh\", scope=\"\")\n\n        class MyTask(luigi.Task):\n            pass\n\n        luigi.namespace(scope=\"task_test\")\n        luigi.namespace(scope=\"\")\n        self.assertEqual(MyTask.get_task_namespace(), \"wohoo\")\n\n    def test_with_scope_not_matching(self):\n        luigi.namespace(\"wohoo\", scope=\"incorrect_namespace\")\n        luigi.namespace(\"bleh\", scope=\"\")\n\n        class MyTask(luigi.Task):\n            pass\n\n        luigi.namespace(scope=\"incorrect_namespace\")\n        luigi.namespace(scope=\"\")\n        self.assertEqual(MyTask.get_task_namespace(), \"bleh\")\n\n\nclass AutoNamespaceTest(LuigiTestCase):\n    this_module = \"task_test\"\n\n    def test_auto_namespace_global(self):\n        luigi.auto_namespace()\n\n        class MyTask(luigi.Task):\n            pass\n\n        luigi.namespace()\n        self.assertEqual(MyTask.get_task_namespace(), self.this_module)\n\n    def test_auto_namespace_scope(self):\n        luigi.auto_namespace(scope=\"task_test\")\n        luigi.namespace(\"bleh\", scope=\"\")\n\n        class MyTask(luigi.Task):\n            pass\n\n        luigi.namespace(scope=\"task_test\")\n        
luigi.namespace(scope=\"\")\n        self.assertEqual(MyTask.get_task_namespace(), self.this_module)\n\n    def test_auto_namespace_not_matching(self):\n        luigi.auto_namespace(scope=\"incorrect_namespace\")\n        luigi.namespace(\"bleh\", scope=\"\")\n\n        class MyTask(luigi.Task):\n            pass\n\n        luigi.namespace(scope=\"incorrect_namespace\")\n        luigi.namespace(scope=\"\")\n        self.assertEqual(MyTask.get_task_namespace(), \"bleh\")\n\n    def test_auto_namespace_not_matching_2(self):\n        luigi.auto_namespace(scope=\"incorrect_namespace\")\n\n        class MyTask(luigi.Task):\n            pass\n\n        luigi.namespace(scope=\"incorrect_namespace\")\n        self.assertEqual(MyTask.get_task_namespace(), \"\")\n\n\nclass InitSubclassTest(LuigiTestCase):\n    def test_task_works_with_init_subclass(self):\n        class ReceivesClassKwargs(luigi.Task):\n            def __init_subclass__(cls, x, **kwargs):\n                super(ReceivesClassKwargs, cls).__init_subclass__()\n                cls.x = x\n\n        class Receiver(ReceivesClassKwargs, x=1):\n            pass\n\n        self.assertEqual(Receiver.x, 1)\n"
  },
  {
    "path": "test/test_sigpipe.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport os\n\nfrom helpers import unittest\n\nfrom luigi.format import InputPipeProcessWrapper\n\nBASH_SCRIPT = \"\"\"\n#!/bin/bash\n\ntrap \"touch /tmp/luigi_sigpipe.marker; exit 141\" SIGPIPE\n\n\nfor i in {1..3}\ndo\n    sleep 0.1\n    echo \"Welcome $i times\"\ndone\n\"\"\"\n\nFAIL_SCRIPT = (\n    BASH_SCRIPT\n    + \"\"\"\nexit 1\n\"\"\"\n)\n\n\nclass TestSigpipe(unittest.TestCase):\n    def setUp(self):\n        with open(\"/tmp/luigi_test_sigpipe.sh\", \"w\") as fp:\n            fp.write(BASH_SCRIPT)\n\n    def tearDown(self):\n        os.remove(\"/tmp/luigi_test_sigpipe.sh\")\n        if os.path.exists(\"/tmp/luigi_sigpipe.marker\"):\n            os.remove(\"/tmp/luigi_sigpipe.marker\")\n\n    def test_partial_read(self):\n        p1 = InputPipeProcessWrapper([\"bash\", \"/tmp/luigi_test_sigpipe.sh\"])\n        self.assertEqual(p1.readline().decode(\"utf8\"), \"Welcome 1 times\\n\")\n        p1.close()\n        self.assertTrue(os.path.exists(\"/tmp/luigi_sigpipe.marker\"))\n\n    def test_full_read(self):\n        p1 = InputPipeProcessWrapper([\"bash\", \"/tmp/luigi_test_sigpipe.sh\"])\n        counter = 1\n        for line in p1:\n            self.assertEqual(line.decode(\"utf8\"), \"Welcome %i times\\n\" % counter)\n            counter += 1\n        p1.close()\n        
self.assertFalse(os.path.exists(\"/tmp/luigi_sigpipe.marker\"))\n\n\nclass TestSubprocessException(unittest.TestCase):\n    def setUp(self):\n        with open(\"/tmp/luigi_test_sigpipe.sh\", \"w\") as fp:\n            fp.write(FAIL_SCRIPT)\n\n    def tearDown(self):\n        os.remove(\"/tmp/luigi_test_sigpipe.sh\")\n        if os.path.exists(\"/tmp/luigi_sigpipe.marker\"):\n            os.remove(\"/tmp/luigi_sigpipe.marker\")\n\n    def test_partial_read(self):\n        p1 = InputPipeProcessWrapper([\"bash\", \"/tmp/luigi_test_sigpipe.sh\"])\n        self.assertEqual(p1.readline().decode(\"utf8\"), \"Welcome 1 times\\n\")\n        p1.close()\n        self.assertTrue(os.path.exists(\"/tmp/luigi_sigpipe.marker\"))\n\n    def test_full_read(self):\n        def run():\n            p1 = InputPipeProcessWrapper([\"bash\", \"/tmp/luigi_test_sigpipe.sh\"])\n            counter = 1\n            for line in p1:\n                self.assertEqual(line.decode(\"utf8\"), \"Welcome %i times\\n\" % counter)\n                counter += 1\n            p1.close()\n\n        self.assertRaises(RuntimeError, run)\n"
  },
  {
    "path": "test/test_ssh.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport subprocess\n\nfrom helpers import unittest\n\nfrom luigi.contrib.ssh import RemoteContext\n\n\nclass TestMockedRemoteContext(unittest.TestCase):\n    def test_subprocess_delegation(self):\n        \"\"\"Test subprocess call structure using mock module\"\"\"\n        orig_Popen = subprocess.Popen\n        self.last_test = None\n\n        def Popen(cmd, **kwargs):\n            self.last_test = cmd\n\n        subprocess.Popen = Popen\n        context = RemoteContext(\"some_host\", username=\"luigi\", key_file=\"/some/key.pub\")\n        context.Popen([\"ls\"])\n        self.assertTrue(\"ssh\" in self.last_test)\n        self.assertTrue(\"-i\" in self.last_test)\n        self.assertTrue(\"/some/key.pub\" in self.last_test)\n        self.assertTrue(\"luigi@some_host\" in self.last_test)\n        self.assertTrue(\"ls\" in self.last_test)\n\n        subprocess.Popen = orig_Popen\n\n    def test_check_output_fail_connect(self):\n        \"\"\"Test check_output to a non-existing host\"\"\"\n        context = RemoteContext(\"__NO_HOST_LIKE_THIS__\", connect_timeout=1)\n        self.assertRaises(subprocess.CalledProcessError, context.check_output, [\"ls\"])\n"
  },
  {
    "path": "test/testconfig/core-site.xml",
    "content": "<?xml version=\"1.0\"?>\n<?xml-stylesheet type=\"text/xsl\" href=\"configuration.xsl\"?>\n\n<configuration>\n  <property>\n    <name>fs.defaultFS</name>\n    <value>hdfs://localhost:50030/</value>\n  </property>\n</configuration>\n"
  },
  {
    "path": "test/testconfig/log4j.properties",
    "content": "hadoop.root.logger=INFO,stderr\nlog4j.logger.org.apache.hadoop=INFO,stderr\nlog4j.logger.org.apache.hadoop.util.NativeCodeLoader=Off\n\nlog4j.appender.stderr = org.apache.log4j.ConsoleAppender\nlog4j.appender.stderr.layout = org.apache.log4j.PatternLayout\nlog4j.appender.stderr.Target = System.err"
  },
  {
    "path": "test/testconfig/logging.cfg",
    "content": "[loggers]\nkeys=root\n\n[handlers]\nkeys=consoleHandler\n\n[formatters]\nkeys=simpleFormatter\n\n[logger_root]\nlevel=DEBUG\nhandlers=consoleHandler\n\n[handler_consoleHandler]\nclass=StreamHandler\nlevel=DEBUG\nformatter=simpleFormatter\nargs=(sys.stdout,)\n\n[formatter_simpleFormatter]\nformat=%(levelname)s: %(message)s\n"
  },
  {
    "path": "test/testconfig/luigi.toml",
    "content": "[core]\nlogging_conf_file = \"test/testconfig/logging.cfg\"\n\n[hdfs]\nclient = \"hadoopcli\"\nsnakebite_autoconfig = false\nnamenode_host = \"must be overridden in local config\"\n\n[SomeTask]\nparam = {key1 = \"value1\", key2 = \"value2\"}\n"
  },
  {
    "path": "test/testconfig/luigi_local.toml",
    "content": "[hdfs]\nnamenode_host = \"localhost\"\nnamenode_port = 50030\n"
  },
  {
    "path": "test/testconfig/luigi_logging.toml",
    "content": "[logging]\nversion = 1\ndisable_existing_loggers = false\n\n[logging.formatters.mockformatter]\nformat = \"{levelname}: {message}\"\nstyle = \"{\"\n\n[logging.handlers.mockhandler]\nclass = \"logging.StreamHandler\"\nlevel = \"INFO\"\nformatter = \"mockformatter\"\n\n[logging.loggers.mocklogger]\nhandlers = [\"mockhandler\"]\nlevel = 'INFO'\ndisabled = false\npropagate = false\n"
  },
  {
    "path": "test/testconfig/pyproject.toml",
    "content": "[tool.mypy]\nplugins = [\"luigi.mypy\"]\nignore_missing_imports = true\n"
  },
  {
    "path": "test/util_previous_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport datetime\n\nfrom helpers import unittest\n\nimport luigi\nimport luigi.date_interval\nfrom luigi.util import get_previous_completed, previous\n\n\nclass DateTaskOk(luigi.Task):\n    date = luigi.DateParameter()\n\n    def complete(self):\n        # test against 2000.03.01\n        return self.date in [datetime.date(2000, 2, 25), datetime.date(2000, 3, 1), datetime.date(2000, 3, 2)]\n\n\nclass DateTaskOkTest(unittest.TestCase):\n    def test_previous(self):\n        task = DateTaskOk(datetime.date(2000, 3, 1))\n        prev = previous(task)\n        self.assertEqual(prev.date, datetime.date(2000, 2, 29))\n\n    def test_get_previous_completed(self):\n        task = DateTaskOk(datetime.date(2000, 3, 1))\n        prev = get_previous_completed(task, 5)\n        self.assertEqual(prev.date, datetime.date(2000, 2, 25))\n\n    def test_get_previous_completed_not_found(self):\n        task = DateTaskOk(datetime.date(2000, 3, 1))\n        prev = get_previous_completed(task, 4)\n        self.assertEqual(None, prev)\n\n\nclass DateHourTaskOk(luigi.Task):\n    hour = luigi.DateHourParameter()\n\n    def complete(self):\n        # test against 2000.03.01T02\n        return self.hour in [datetime.datetime(2000, 2, 29, 22), datetime.datetime(2000, 3, 1, 2), datetime.datetime(2000, 3, 1, 3)]\n\n\nclass DateHourTaskOkTest(unittest.TestCase):\n    
def test_previous(self):\n        task = DateHourTaskOk(datetime.datetime(2000, 3, 1, 2))\n        prev = previous(task)\n        self.assertEqual(prev.hour, datetime.datetime(2000, 3, 1, 1))\n\n    def test_get_previous_completed(self):\n        task = DateHourTaskOk(datetime.datetime(2000, 3, 1, 2))\n        prev = get_previous_completed(task, 4)\n        self.assertEqual(prev.hour, datetime.datetime(2000, 2, 29, 22))\n\n    def test_get_previous_completed_not_found(self):\n        task = DateHourTaskOk(datetime.datetime(2000, 3, 1, 2))\n        prev = get_previous_completed(task, 3)\n        self.assertEqual(None, prev)\n\n\nclass DateMinuteTaskOk(luigi.Task):\n    minute = luigi.DateMinuteParameter()\n\n    def complete(self):\n        # test against 2000.03.01T02H03\n        return self.minute in [datetime.datetime(2000, 3, 1, 2, 0)]\n\n\nclass DateMinuteTaskOkTest(unittest.TestCase):\n    def test_previous(self):\n        task = DateMinuteTaskOk(datetime.datetime(2000, 3, 1, 2, 3))\n        prev = previous(task)\n        self.assertEqual(prev.minute, datetime.datetime(2000, 3, 1, 2, 2))\n\n    def test_get_previous_completed(self):\n        task = DateMinuteTaskOk(datetime.datetime(2000, 3, 1, 2, 3))\n        prev = get_previous_completed(task, 3)\n        self.assertEqual(prev.minute, datetime.datetime(2000, 3, 1, 2, 0))\n\n    def test_get_previous_completed_not_found(self):\n        task = DateMinuteTaskOk(datetime.datetime(2000, 3, 1, 2, 3))\n        prev = get_previous_completed(task, 2)\n        self.assertEqual(None, prev)\n\n\nclass DateSecondTaskOk(luigi.Task):\n    second = luigi.DateSecondParameter()\n\n    def complete(self):\n        return self.second in [datetime.datetime(2000, 3, 1, 2, 3, 4)]\n\n\nclass DateSecondTaskOkTest(unittest.TestCase):\n    def test_previous(self):\n        task = DateSecondTaskOk(datetime.datetime(2000, 3, 1, 2, 3, 7))\n        prev = previous(task)\n        self.assertEqual(prev.second, datetime.datetime(2000, 3, 1, 
2, 3, 6))\n\n    def test_get_previous_completed(self):\n        task = DateSecondTaskOk(datetime.datetime(2000, 3, 1, 2, 3, 7))\n        prev = get_previous_completed(task, 3)\n        self.assertEqual(prev.second, datetime.datetime(2000, 3, 1, 2, 3, 4))\n\n    def test_get_previous_completed_not_found(self):\n        task = DateSecondTaskOk(datetime.datetime(2000, 3, 1, 2, 3))\n        prev = get_previous_completed(task, 2)\n        self.assertEqual(None, prev)\n\n\nclass DateIntervalTaskOk(luigi.Task):\n    interval = luigi.DateIntervalParameter()\n\n    def complete(self):\n        return self.interval in [luigi.date_interval.Week(1999, 48), luigi.date_interval.Week(2000, 1), luigi.date_interval.Week(2000, 2)]\n\n\nclass DateIntervalTaskOkTest(unittest.TestCase):\n    def test_previous(self):\n        task = DateIntervalTaskOk(luigi.date_interval.Week(2000, 1))\n        prev = previous(task)\n        self.assertEqual(prev.interval, luigi.date_interval.Week(1999, 52))\n\n    def test_get_previous_completed(self):\n        task = DateIntervalTaskOk(luigi.date_interval.Week(2000, 1))\n        prev = get_previous_completed(task, 5)\n        self.assertEqual(prev.interval, luigi.date_interval.Week(1999, 48))\n\n    def test_get_previous_completed_not_found(self):\n        task = DateIntervalTaskOk(luigi.date_interval.Week(2000, 1))\n        prev = get_previous_completed(task, 4)\n        self.assertEqual(None, prev)\n\n\nclass ExtendedDateTaskOk(DateTaskOk):\n    param1 = luigi.Parameter()\n    param2 = luigi.IntParameter(default=2)\n\n\nclass ExtendedDateTaskOkTest(unittest.TestCase):\n    def test_previous(self):\n        task = ExtendedDateTaskOk(datetime.date(2000, 3, 1), \"some value\")\n        prev = previous(task)\n        self.assertEqual(prev.date, datetime.date(2000, 2, 29))\n        self.assertEqual(prev.param1, \"some value\")\n        self.assertEqual(prev.param2, 2)\n\n\nclass MultiTemporalTaskNok(luigi.Task):\n    date = luigi.DateParameter()\n    
hour = luigi.DateHourParameter()\n\n\nclass MultiTemporalTaskNokTest(unittest.TestCase):\n    def test_previous(self):\n        task = MultiTemporalTaskNok(datetime.date(2000, 1, 1), datetime.datetime(2000, 1, 1, 1))\n        self.assertRaises(NotImplementedError, previous, task)\n        self.assertRaises(NotImplementedError, get_previous_completed, task)\n\n\nclass NoTemporalTaskNok(luigi.Task):\n    param = luigi.Parameter()\n\n\nclass NoTemporalTaskNokTest(unittest.TestCase):\n    def test_previous(self):\n        task = NoTemporalTaskNok(\"some value\")\n        self.assertRaises(NotImplementedError, previous, task)\n        self.assertRaises(NotImplementedError, get_previous_completed, task)\n"
  },
  {
    "path": "test/util_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2016 VNG Corporation\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nfrom helpers import LuigiTestCase, RunOnceTask\n\nimport luigi\nimport luigi.task\nfrom luigi.util import inherits, requires\n\n\nclass BasicsTest(LuigiTestCase):\n    # following tests using inherits decorator\n    def test_task_ids_using_inherits(self):\n        class ParentTask(luigi.Task):\n            my_param = luigi.Parameter()\n\n        luigi.namespace(\"blah\")\n\n        @inherits(ParentTask)\n        class ChildTask(luigi.Task):\n            def requires(self):\n                return self.clone(ParentTask)\n\n        luigi.namespace(\"\")\n        child_task = ChildTask(my_param=\"hello\")\n        self.assertEqual(str(child_task), \"blah.ChildTask(my_param=hello)\")\n        self.assertIn(ParentTask(my_param=\"hello\"), luigi.task.flatten(child_task.requires()))\n\n    def test_task_ids_using_inherits_2(self):\n        # Here we use this decorator in a unnormal way.\n        # But it should still work.\n        class ParentTask(luigi.Task):\n            my_param = luigi.Parameter()\n\n        decorator = inherits(ParentTask)\n        luigi.namespace(\"blah\")\n\n        class ChildTask(luigi.Task):\n            def requires(self):\n                return self.clone_parent()\n\n        luigi.namespace(\"\")\n        ChildTask = decorator(ChildTask)\n        child_task = ChildTask(my_param=\"hello\")\n        
self.assertEqual(str(child_task), \"blah.ChildTask(my_param=hello)\")\n        self.assertIn(ParentTask(my_param=\"hello\"), luigi.task.flatten(child_task.requires()))\n\n    def test_task_ids_using_inherits_kwargs(self):\n        class ParentTask(luigi.Task):\n            my_param = luigi.Parameter()\n\n        luigi.namespace(\"blah\")\n\n        @inherits(parent=ParentTask)\n        class ChildTask(luigi.Task):\n            def requires(self):\n                return self.clone(ParentTask)\n\n        luigi.namespace(\"\")\n        child_task = ChildTask(my_param=\"hello\")\n        self.assertEqual(str(child_task), \"blah.ChildTask(my_param=hello)\")\n        self.assertIn(ParentTask(my_param=\"hello\"), luigi.task.flatten(child_task.requires()))\n\n    def _setup_parent_and_child_inherits(self):\n        class ParentTask(luigi.Task):\n            my_parameter = luigi.Parameter()\n            class_variable = \"notset\"\n\n            def run(self):\n                self.__class__.class_variable = self.my_parameter\n\n            def complete(self):\n                return self.class_variable == \"actuallyset\"\n\n        @inherits(ParentTask)\n        class ChildTask(RunOnceTask):\n            def requires(self):\n                return self.clone_parent()\n\n        return ParentTask\n\n    def test_inherits_has_effect_run_child(self):\n        ParentTask = self._setup_parent_and_child_inherits()\n        self.assertTrue(self.run_locally_split(\"ChildTask --my-parameter actuallyset\"))\n        self.assertEqual(ParentTask.class_variable, \"actuallyset\")\n\n    def test_inherits_has_effect_run_parent(self):\n        ParentTask = self._setup_parent_and_child_inherits()\n        self.assertTrue(self.run_locally_split(\"ParentTask --my-parameter actuallyset\"))\n        self.assertEqual(ParentTask.class_variable, \"actuallyset\")\n\n    def _setup_inherits_inheritence(self):\n        class InheritedTask(luigi.Task):\n            pass\n\n        class 
ParentTask(luigi.Task):\n            pass\n\n        @inherits(InheritedTask)\n        class ChildTask(ParentTask):\n            pass\n\n        return ChildTask\n\n    def test_inherits_has_effect_MRO(self):\n        ChildTask = self._setup_inherits_inheritence()\n        self.assertNotEqual(str(ChildTask.__mro__[0]), str(ChildTask.__mro__[1]))\n\n    # following tests using requires decorator\n    def test_task_ids_using_requries(self):\n        class ParentTask(luigi.Task):\n            my_param = luigi.Parameter()\n\n        luigi.namespace(\"blah\")\n\n        @requires(ParentTask)\n        class ChildTask(luigi.Task):\n            pass\n\n        luigi.namespace(\"\")\n        child_task = ChildTask(my_param=\"hello\")\n        self.assertEqual(str(child_task), \"blah.ChildTask(my_param=hello)\")\n        self.assertIn(ParentTask(my_param=\"hello\"), luigi.task.flatten(child_task.requires()))\n\n    def test_task_ids_using_requries_2(self):\n        # Here we use this decorator in a unnormal way.\n        # But it should still work.\n        class ParentTask(luigi.Task):\n            my_param = luigi.Parameter()\n\n        decorator = requires(ParentTask)\n        luigi.namespace(\"blah\")\n\n        class ChildTask(luigi.Task):\n            pass\n\n        luigi.namespace(\"\")\n        ChildTask = decorator(ChildTask)\n        child_task = ChildTask(my_param=\"hello\")\n        self.assertEqual(str(child_task), \"blah.ChildTask(my_param=hello)\")\n        self.assertIn(ParentTask(my_param=\"hello\"), luigi.task.flatten(child_task.requires()))\n\n    def _setup_parent_and_child(self):\n        class ParentTask(luigi.Task):\n            my_parameter = luigi.Parameter()\n            class_variable = \"notset\"\n\n            def run(self):\n                self.__class__.class_variable = self.my_parameter\n\n            def complete(self):\n                return self.class_variable == \"actuallyset\"\n\n        @requires(ParentTask)\n        class 
ChildTask(RunOnceTask):\n            pass\n\n        return ParentTask\n\n    def test_requires_has_effect_run_child(self):\n        ParentTask = self._setup_parent_and_child()\n        self.assertTrue(self.run_locally_split(\"ChildTask --my-parameter actuallyset\"))\n        self.assertEqual(ParentTask.class_variable, \"actuallyset\")\n\n    def test_requires_has_effect_run_parent(self):\n        ParentTask = self._setup_parent_and_child()\n        self.assertTrue(self.run_locally_split(\"ParentTask --my-parameter actuallyset\"))\n        self.assertEqual(ParentTask.class_variable, \"actuallyset\")\n\n    def _setup_requires_inheritence(self):\n        class RequiredTask(luigi.Task):\n            pass\n\n        class ParentTask(luigi.Task):\n            pass\n\n        @requires(RequiredTask)\n        class ChildTask(ParentTask):\n            pass\n\n        return ChildTask\n\n    def test_requires_has_effect_MRO(self):\n        ChildTask = self._setup_requires_inheritence()\n        self.assertNotEqual(str(ChildTask.__mro__[0]), str(ChildTask.__mro__[1]))\n\n    def test_kwargs_requires_gives_named_inputs(self):\n        class ParentTask(RunOnceTask):\n            def output(self):\n                return \"Target\"\n\n        @requires(parent_1=ParentTask, parent_2=ParentTask)\n        class ChildTask(RunOnceTask):\n            resulting_input = \"notset\"\n\n            def run(self):\n                self.__class__.resulting_input = self.input()\n\n        self.assertTrue(self.run_locally_split(\"ChildTask\"))\n        self.assertEqual(ChildTask.resulting_input, {\"parent_1\": \"Target\", \"parent_2\": \"Target\"})\n"
  },
  {
    "path": "test/visible_parameters_test.py",
    "content": "import json\n\nfrom helpers import unittest\n\nimport luigi\nfrom luigi.parameter import ParameterVisibility\n\n\nclass TestTask1(luigi.Task):\n    param_one = luigi.Parameter(default=\"1\", visibility=ParameterVisibility.HIDDEN, significant=True)\n    param_two = luigi.Parameter(default=\"2\", significant=True)\n    param_three = luigi.Parameter(default=\"3\", visibility=ParameterVisibility.PRIVATE, significant=True)\n\n\nclass TestTask2(luigi.Task):\n    param_one = luigi.Parameter(default=\"1\", visibility=ParameterVisibility.PRIVATE)\n    param_two = luigi.Parameter(default=\"2\", visibility=ParameterVisibility.PRIVATE)\n    param_three = luigi.Parameter(default=\"3\", visibility=ParameterVisibility.PRIVATE)\n\n\nclass TestTask3(luigi.Task):\n    param_one = luigi.Parameter(default=\"1\", visibility=ParameterVisibility.HIDDEN, significant=True)\n    param_two = luigi.Parameter(default=\"2\", visibility=ParameterVisibility.HIDDEN, significant=False)\n    param_three = luigi.Parameter(default=\"3\", visibility=ParameterVisibility.HIDDEN, significant=True)\n\n\nclass TestTask4(luigi.Task):\n    param_one = luigi.Parameter(default=\"1\", visibility=ParameterVisibility.PUBLIC, significant=True)\n    param_two = luigi.Parameter(default=\"2\", visibility=ParameterVisibility.PUBLIC, significant=False)\n    param_three = luigi.Parameter(default=\"3\", visibility=ParameterVisibility.PUBLIC, significant=True)\n\n\nclass Test(unittest.TestCase):\n    def test_to_str_params(self):\n        task = TestTask1()\n\n        self.assertEqual(task.to_str_params(), {\"param_one\": \"1\", \"param_two\": \"2\"})\n\n        task = TestTask2()\n\n        self.assertEqual(task.to_str_params(), {})\n\n        task = TestTask3()\n\n        self.assertEqual(task.to_str_params(), {\"param_one\": \"1\", \"param_two\": \"2\", \"param_three\": \"3\"})\n\n    def test_all_public_equals_all_hidden(self):\n        hidden = TestTask3()\n        public = TestTask4()\n\n        
self.assertEqual(public.to_str_params(), hidden.to_str_params())\n\n    def test_all_public_equals_all_hidden_using_significant(self):\n        hidden = TestTask3()\n        public = TestTask4()\n\n        self.assertEqual(public.to_str_params(only_significant=True), hidden.to_str_params(only_significant=True))\n\n    def test_private_params_and_significant(self):\n        task = TestTask1()\n\n        self.assertEqual(task.to_str_params(), task.to_str_params(only_significant=True))\n\n    def test_param_visibilities(self):\n        task = TestTask1()\n\n        self.assertEqual(task._get_param_visibilities(), {\"param_one\": 1, \"param_two\": 0})\n\n    def test_incorrect_visibility_value(self):\n        class Task(luigi.Task):\n            a = luigi.Parameter(default=\"val\", visibility=5)\n\n        task = Task()\n\n        self.assertEqual(task._get_param_visibilities(), {\"a\": 0})\n\n    def test_task_id_exclude_hidden_and_private_params(self):\n        task = TestTask1()\n\n        self.assertEqual({\"param_two\": \"2\"}, task.to_str_params(only_public=True))\n\n    def test_json_dumps(self):\n        public = json.dumps(ParameterVisibility.PUBLIC.serialize())\n        hidden = json.dumps(ParameterVisibility.HIDDEN.serialize())\n        private = json.dumps(ParameterVisibility.PRIVATE.serialize())\n\n        self.assertEqual(\"0\", public)\n        self.assertEqual(\"1\", hidden)\n        self.assertEqual(\"2\", private)\n\n        public = json.loads(public)\n        hidden = json.loads(hidden)\n        private = json.loads(private)\n\n        self.assertEqual(0, public)\n        self.assertEqual(1, hidden)\n        self.assertEqual(2, private)\n"
  },
  {
    "path": "test/visualiser/__init__.py",
    "content": "# Tests for visualiser javascript.\n"
  },
  {
    "path": "test/visualiser/phantomjs_test.js",
    "content": "var page = require('webpage').create();\nvar system = require('system');\n\nvar tests = [];\n\n/*\n * Parse command line to get Luigi scheduler URL\n */\nif (system.args.length === 1) {\n    console.log('Usage: phantom_test.js <scheduler-url>');\n    phantom.exit();\n}\n\nvar url = system.args[1];\n\n\n/*\n * Minimal test framework\n */\nfunction do_tests(page) {\n    var ok = true;\n    var retval;\n\n    tests.forEach(function (spec) {\n        var name = spec[0];\n        var test_func = spec[1];\n\n        retval = report(page.evaluate(test_func), name);\n        ok = ok && retval;\n    });\n\n    return ok;\n}\n\nfunction report(retval, func_name) {\n\n    if (retval === true) {\n        console.log('[ OK ]  ' + func_name);\n        return true;\n    }\n    else {\n        console.log('[FAIL]  ' + func_name);\n        console.log(retval);\n        return false;\n    }\n}\n\nphantom.onError = function(msg, trace) {\n    var msgStack = ['PHANTOM ERROR: ' + msg];\n    if (trace && trace.length) {\n        msgStack.push('TRACE:');\n        trace.forEach(function(t) {\n            msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line\n                                    + (t.function ? ' (in function ' + t.function +')' : ''));\n        });\n    }\n    console.error(msgStack.join('\\n'));\n    phantom.exit(1);\n};\n\npage.onError = function(msg, trace) {\n\n  var msgStack = ['ERROR: ' + msg];\n\n  if (trace && trace.length) {\n    msgStack.push('TRACE:');\n    trace.forEach(function(t) {\n      msgStack.push(' -> ' + t.file + ': ' + t.line\n                    + (t.function ? 
' (in function \"' + t.function +'\")' : ''));\n    });\n  }\n\n  console.error(msgStack.join('\\n'));\n\n};\n\n/**\n * def_test: define a test\n * @param test_name: Name of test\n * @param func: A function which will be evaluated within the page and should return\n *    true for success or any other value for failure.\n */\nfunction def_test(test_name, func) {\n    tests.push([test_name, func]);\n}\n\n\n/*\n * Test definitions\n */\ndef_test('failed_info_test', function () {\n    var el = $('#FAILED_info .info-box-number')[0];\n    if (el.textContent === \"4\") {\n        return true;\n    }\n    else {\n        return el.textContent;\n    }\n});\n\ndef_test('done_info_test', function () {\n    var el = $('#DONE_info .info-box-number')[0];\n    if (el.textContent === \"68\") {\n        return true;\n    }\n    else {\n        return el.textContent;\n    }\n});\n\ndef_test('upstream_failure_info_test', function () {\n    var el = $('#UPSTREAM_FAILED_info .info-box-number')[0];\n    if (el.textContent === '45') {\n        return true;\n    }\n    else {\n        return el.textContent;\n    }\n});\n\n\ndef_test('result_count_test', function () {\n    var el = $('#taskTable_info')[0];\n    if (el.textContent.match(/Showing \\d+ to \\d+ of 117 entries/)) {\n        return true;\n    }\n    else {\n        return el.textContent;\n    }\n});\n\ndef_test('filtered_result_count_test1', function () {\n    var ret;\n    var target = $('ul.sidebar-menu li a').first();\n\n    target.click();\n\n    var el = $('#taskTable_info')[0];\n    if (el.textContent.match(/Showing \\d+ to \\d+ of 29 entries.*from 117 total entries/)) {\n        ret = true;\n    }\n    else {\n        ret = el.textContent;\n    }\n\n    target.click();\n    return ret;\n});\n\n\ndef_test('filtered_result_count_test2', function () {\n    var ret;\n    var target = $('#FAILED_info').first();\n\n    target.click();\n    var el = $('#taskTable_info')[0];\n    if (el.textContent.match(/Showing \\d+ to \\d+ of 
4 entries.*from 117 total entries/)) {\n        ret = true;\n    }\n    else {\n        ret = el.textContent;\n    }\n\n    target.click();\n    return ret;\n\n});\n\n\ndef_test('filtered_result_count_test3', function () {\n    var ret;\n    var target = $('#PENDING_info').first();\n\n    target.click();\n    var el = $('#taskTable_info')[0];\n    if (el.textContent.match(/Showing \\d+ to \\d+ of 0 entries.*from 117 total entries/)) {\n        ret = true;\n    }\n    else {\n        ret = el.textContent;\n    }\n\n    target.click();\n    return ret;\n\n});\n\n\ndef_test('filtered_result_count_test4', function () {\n    var ret;\n    var target = $('#RUNNING_info').first();\n\n    target.click();\n    var el = $('#taskTable_info')[0];\n    if (el.textContent.match(/Showing \\d+ to \\d+ of 0 entries.*from 117 total entries/)) {\n        ret = true;\n    }\n    else {\n        ret = el.textContent;\n    }\n\n    target.click();\n    return ret;\n\n});\n\n\ndef_test('filtered_result_count_test5', function () {\n    var ret;\n    var target = $('#DONE_info').first();\n\n    target.click();\n    var el = $('#taskTable_info')[0];\n    if (el.textContent.match(/Showing \\d+ to \\d+ of 68 entries.*from 117 total entries/)) {\n        ret = true;\n    }\n    else {\n        ret = el.textContent;\n    }\n\n    target.click();\n    return ret;\n\n});\n\n\ndef_test('filtered_result_count_test5', function () {\n    var ret;\n    var target = $('#DISABLED_info').first();\n\n    target.click();\n    var el = $('#taskTable_info')[0];\n    if (el.textContent.match(/Showing \\d+ to \\d+ of 0 entries.*from 117 total entries/)) {\n        ret = true;\n    }\n    else {\n        ret = el.textContent;\n    }\n\n    target.click();\n    return ret;\n\n});\n\ndef_test('filtered_result_count_test5', function () {\n    var ret;\n    var target = $('#UPSTREAM_DISABLED_info').first();\n\n    target.click();\n    var el = $('#taskTable_info')[0];\n    if (el.textContent.match(/Showing \\d+ to 
\\d+ of 0 entries.*from 117 total entries/)) {\n        ret = true;\n    }\n    else {\n        ret = el.textContent;\n    }\n\n    target.click();\n    return ret;\n\n});\n\n\ndef_test('filtered_result_count_test5', function () {\n    var ret;\n    var target = $('#UPSTREAM_FAILED_info').first();\n\n    target.click();\n    var el = $('#taskTable_info')[0];\n    if (el.textContent.match(/Showing \\d+ to \\d+ of 45 entries.*from 117 total entries/)) {\n        ret = true;\n    }\n    else {\n        ret = el.textContent;\n    }\n\n    target.click();\n    return ret;\n\n});\n\ndef_test('searched_result_count_test1', function () {\n    var ret;\n    var dt = $('#taskTable').DataTable();\n\n    dt.search('FailingMergeSort_1').draw();\n\n\n    var el = $('#taskTable_info')[0];\n    if (el.textContent.match(/Showing \\d+ to \\d+ of 29 entries.*from 117 total entries/)) {\n        ret = true;\n    }\n    else {\n        ret = el.textContent;\n    }\n\n    dt.search('').draw();\n    return ret;\n\n});\n\ndef_test('searched_result_count_test1', function () {\n    var ret;\n    var target = $('#serverSide label').first();\n    var dt = $('#taskTable').DataTable();\n\n    target.click();\n    dt.search('FailingMergeSort_1').draw();\n\n\n    var el = $('#taskTable_info')[0];\n    if (el.textContent.match(/Showing \\d+ to \\d+ of 29 entries.*from 117 total entries/)) {\n        ret = true;\n    }\n    else {\n        ret = el.textContent;\n    }\n\n    target.click();\n    dt.search('').draw();\n    return ret;\n\n});\n\n\npage.open(url, function(status) {\n    var ok;\n\n    console.log(\"Loaded \" + url + \", status: \" + status);\n    if(status === \"success\") {\n        ok = do_tests(page);\n    }\n    console.log('RESULT: ' + ok);\n    phantom.exit(ok === true ? 0 : -1);\n});\n"
  },
  {
    "path": "test/visualiser/visualiser_test.py",
    "content": "\"\"\"\nTest the visualiser's javascript using PhantomJS.\n\n\"\"\"\n\nimport os\nimport subprocess\nimport sys\nimport threading\nimport time\nimport unittest\n\nfrom selenium import webdriver\n\nimport luigi\n\nhere = os.path.dirname(__file__)\n\n# Patch-up path so that we can import from the directory above this one.r\n# This seems to be necessary because the `test` directory has no __init__.py but\n# adding one makes other tests fail.\nsys.path.append(os.path.join(here, \"..\"))\nfrom server_test import ServerTestBase  # noqa\n\nTEST_TIMEOUT = 40\n\n\n@unittest.skipUnless(os.environ.get(\"TEST_VISUALISER\"), \"PhantomJS tests not requested in TEST_VISUALISER\")\nclass TestVisualiser(ServerTestBase):\n    \"\"\"\n    Builds a medium-sized task tree of MergeSort results then starts\n    phantomjs  as a subprocess to interact with the scheduler.\n\n    \"\"\"\n\n    def setUp(self):\n        super(TestVisualiser, self).setUp()\n\n        x = \"I scream for ice cream\"\n        task = UberTask(base_task=FailingMergeSort, x=x, copies=4)\n        luigi.build([task], workers=1, scheduler_port=self.get_http_port())\n\n        self.done = threading.Event()\n\n        def _do_ioloop():\n            # Enter ioloop for maximum TEST_TIMEOUT.  
Check every 2s whether the test has finished.\n            print(\"Entering event loop in separate thread\")\n\n            for i in range(TEST_TIMEOUT):\n                try:\n                    self.wait(timeout=1)\n                except AssertionError:\n                    pass\n                if self.done.is_set():\n                    break\n\n            print(\"Exiting event loop thread\")\n\n        self.iothread = threading.Thread(target=_do_ioloop)\n        self.iothread.start()\n\n    def tearDown(self):\n        self.done.set()\n        self.iothread.join()\n\n    def test(self):\n        port = self.get_http_port()\n        print(\"Server port is {}\".format(port))\n        print(\"Starting phantomjs\")\n\n        p = subprocess.Popen(\"phantomjs {}/phantomjs_test.js http://localhost:{}\".format(here, port), shell=True, stdin=None)\n\n        # PhantomJS may hang on an error so poll\n        status = None\n        for x in range(TEST_TIMEOUT):\n            status = p.poll()\n            if status is not None:\n                break\n            time.sleep(1)\n\n        if status is None:\n            raise AssertionError(\"PhantomJS failed to complete\")\n        else:\n            print(\"PhantomJS return status is {}\".format(status))\n            assert status == 0\n\n    # tasks tab tests.\n    def test_keeps_entries_after_page_refresh(self):\n        port = self.get_http_port()\n        driver = webdriver.PhantomJS()\n\n        driver.get(\"http://localhost:{}\".format(port))\n\n        length_select = driver.find_element_by_css_selector('select[name=\"taskTable_length\"]')\n        assert length_select.get_attribute(\"value\") == \"10\"\n        assert len(driver.find_elements_by_css_selector(\"#taskTable tbody tr\")) == 10\n\n        # Now change entries select box and check again.\n        clicked = False\n        for option in length_select.find_elements_by_css_selector(\"option\"):\n            if option.text == \"50\":\n                
option.click()\n                clicked = True\n                break\n\n        assert clicked, 'Could not click option with \"50\" entries.'\n\n        assert length_select.get_attribute(\"value\") == \"50\"\n        assert len(driver.find_elements_by_css_selector(\"#taskTable tbody tr\")) == 50\n\n        # Now refresh page and check. Select box should be 50 and table should contain 50 rows.\n        driver.refresh()\n\n        # Once page refreshed we have to find all selectors again.\n        length_select = driver.find_element_by_css_selector('select[name=\"taskTable_length\"]')\n        assert length_select.get_attribute(\"value\") == \"50\"\n        assert len(driver.find_elements_by_css_selector(\"#taskTable tbody tr\")) == 50\n\n    def test_keeps_table_filter_after_page_refresh(self):\n        port = self.get_http_port()\n        driver = webdriver.PhantomJS()\n\n        driver.get(\"http://localhost:{}\".format(port))\n\n        # Check initial state.\n        search_input = driver.find_element_by_css_selector('input[type=\"search\"]')\n        assert search_input.get_attribute(\"value\") == \"\"\n        assert len(driver.find_elements_by_css_selector(\"#taskTable tbody tr\")) == 10\n\n        # Now filter and check filtered table.\n        search_input.send_keys(\"ber\")\n        # UberTask only should be displayed.\n        assert len(driver.find_elements_by_css_selector(\"#taskTable tbody tr\")) == 1\n\n        # Now refresh page and check. 
Filter input should contain 'ber' and table should contain\n        # one row (UberTask).\n        driver.refresh()\n\n        # Once page refreshed we have to find all selectors again.\n        search_input = driver.find_element_by_css_selector('input[type=\"search\"]')\n        assert search_input.get_attribute(\"value\") == \"ber\"\n        assert len(driver.find_elements_by_css_selector(\"#taskTable tbody tr\")) == 1\n\n    def test_keeps_order_after_page_refresh(self):\n        port = self.get_http_port()\n        driver = webdriver.PhantomJS()\n\n        driver.get(\"http://localhost:{}\".format(port))\n\n        # Order by name (asc).\n        column = driver.find_elements_by_css_selector(\"#taskTable thead th\")[1]\n        column.click()\n\n        table_body = driver.find_element_by_css_selector(\"#taskTable tbody\")\n        assert self._get_cell_value(table_body, 0, 1) == \"FailingMergeSort_0\"\n\n        # Ordery by name (desc).\n        column.click()\n        assert self._get_cell_value(table_body, 0, 1) == \"UberTask\"\n\n        # Now refresh page and check. Table should be ordered by name (desc).\n        driver.refresh()\n\n        # Once page refreshed we have to find all selectors again.\n        table_body = driver.find_element_by_css_selector(\"#taskTable tbody\")\n        assert self._get_cell_value(table_body, 0, 1) == \"UberTask\"\n\n    def test_keeps_filter_on_server_after_page_refresh(self):\n        port = self.get_http_port()\n        driver = webdriver.PhantomJS()\n\n        driver.get(\"http://localhost:{}/static/visualiser/index.html#tab=tasks\".format(port))\n\n        # Check initial state.\n        checkbox = driver.find_element_by_css_selector(\"#serverSideCheckbox\")\n        assert checkbox.is_selected() is False\n\n        # Change invert checkbox.\n        checkbox.click()\n\n        # Now refresh page and check. 
Invert checkbox shoud be checked.\n        driver.refresh()\n\n        # Once page refreshed we have to find all selectors again.\n        checkbox = driver.find_element_by_css_selector(\"#serverSideCheckbox\")\n        assert checkbox.is_selected()\n\n    def test_synchronizes_fields_on_tasks_tab(self):\n        # Check fields population if tasks tab was opened by direct link\n        port = self.get_http_port()\n        driver = webdriver.PhantomJS()\n        url = \"http://localhost:{}/static/visualiser/index.html#tab=tasks&length=50&search__search=er&filterOnServer=1&order=1,desc\".format(port)\n\n        driver.get(url)\n\n        length_select = driver.find_element_by_css_selector('select[name=\"taskTable_length\"]')\n        assert length_select.get_attribute(\"value\") == \"50\"\n\n        search_input = driver.find_element_by_css_selector('input[type=\"search\"]')\n        assert search_input.get_attribute(\"value\") == \"er\"\n        assert len(driver.find_elements_by_css_selector(\"#taskTable tbody tr\")) == 50\n\n        # Table is ordered by first column (name)\n        table_body = driver.find_element_by_css_selector(\"#taskTable tbody\")\n        assert self._get_cell_value(table_body, 0, 1) == \"UberTask\"\n\n    # graph tab tests.\n\n    def test_keeps_invert_after_page_refresh(self):\n        port = self.get_http_port()\n        driver = webdriver.PhantomJS()\n\n        driver.get(\"http://localhost:{}/static/visualiser/index.html#tab=graph\".format(port))\n\n        # Check initial state.\n        invert_checkbox = driver.find_element_by_css_selector(\"#invertCheckbox\")\n        assert invert_checkbox.is_selected() is False\n\n        # Change invert checkbox.\n        invert_checkbox.click()\n\n        # Now refresh page and check. 
Invert checkbox shoud be checked.\n        driver.refresh()\n\n        # Once page refreshed we have to find all selectors again.\n        invert_checkbox = driver.find_element_by_css_selector(\"#invertCheckbox\")\n        assert invert_checkbox.is_selected()\n\n    def test_keeps_task_id_after_page_refresh(self):\n        port = self.get_http_port()\n        driver = webdriver.PhantomJS()\n\n        driver.get(\"http://localhost:{}/static/visualiser/index.html#tab=graph\".format(port))\n\n        # Check initial state.\n        task_id_input = driver.find_element_by_css_selector(\"#js-task-id\")\n        assert task_id_input.get_attribute(\"value\") == \"\"\n\n        # Change task id\n        task_id_input.send_keys(\"1\")\n        driver.find_element_by_css_selector(\"#loadTaskForm button[type=submit]\").click()\n\n        # Now refresh page and check. Task ID field should contain 1\n        driver.refresh()\n\n        # Once page refreshed we have to find all selectors again.\n        task_id_input = driver.find_element_by_css_selector(\"#js-task-id\")\n        assert task_id_input.get_attribute(\"value\") == \"1\"\n\n    def test_keeps_hide_done_after_page_refresh(self):\n        port = self.get_http_port()\n        driver = webdriver.PhantomJS()\n\n        driver.get(\"http://localhost:{}/static/visualiser/index.html#tab=graph\".format(port))\n\n        # Check initial state.\n        hide_done_checkbox = driver.find_element_by_css_selector(\"#hideDoneCheckbox\")\n        assert hide_done_checkbox.is_selected() is False\n\n        # Change invert checkbox.\n        hide_done_checkbox.click()\n\n        # Now refresh page and check. 
Invert checkbox shoud be checked.\n        driver.refresh()\n\n        # Once page refreshed we have to find all selectors again.\n        hide_done_checkbox = driver.find_element_by_css_selector(\"#hideDoneCheckbox\")\n        assert hide_done_checkbox.is_selected()\n\n    def test_keeps_visualisation_type_after_page_refresh(self):\n        port = self.get_http_port()\n        driver = webdriver.PhantomJS()\n\n        driver.get(\"http://localhost:{}/static/visualiser/index.html#tab=graph\".format(port))\n\n        # Check initial state.\n        svg_radio = driver.find_element_by_css_selector(\"input[value=svg]\")\n        assert svg_radio.is_selected()\n\n        # Change vistype to d3 by clicking on its label.\n        d3_radio = driver.find_element_by_css_selector(\"input[value=d3]\")\n        d3_radio.find_element_by_xpath(\"..\").click()\n\n        # Now refresh page and check. D3 checkbox shoud be checked.\n        driver.refresh()\n\n        # Once page refreshed we have to find all selectors again.\n        d3_radio = driver.find_element_by_css_selector(\"input[value=d3]\")\n        assert d3_radio.is_selected()\n\n    def test_synchronizes_fields_on_graph_tab(self):\n        # Check fields population if tasks tab was opened by direct link.\n        port = self.get_http_port()\n        driver = webdriver.PhantomJS()\n        url = \"http://localhost:{}/static/visualiser/index.html#tab=graph&taskId=1&invert=1&hideDone=1&visType=svg\".format(port)\n        driver.get(url)\n\n        # Check task id input\n        task_id_input = driver.find_element_by_css_selector(\"#js-task-id\")\n        assert task_id_input.get_attribute(\"value\") == \"1\"\n\n        # Check Show Upstream Dependencies checkbox.\n        invert_checkbox = driver.find_element_by_css_selector(\"#invertCheckbox\")\n        assert invert_checkbox.is_selected()\n\n        # Check Hide Done checkbox.\n        hide_done_checkbox = driver.find_element_by_css_selector(\"#hideDoneCheckbox\")\n     
   assert hide_done_checkbox.is_selected()\n\n        svg_radio = driver.find_element_by_css_selector(\"input[value=svg]\")\n        assert svg_radio.get_attribute(\"checked\")\n\n    def _get_cell_value(self, elem, row, column):\n        tr = elem.find_elements_by_css_selector(\"#taskTable tbody tr\")[row]\n        td = tr.find_elements_by_css_selector(\"td\")[column]\n        return td.text\n\n\n# ---------------------------------------------------------------------------\n# Code for generating a tree of tasks with some failures.\n\n\ndef generate_task_families(task_class, n):\n    \"\"\"\n    Generate n copies of a task with different task_family names.\n\n    :param task_class: a subclass of `luigi.Task`\n    :param n: number of copies of `task_class` to create\n    :return: Dictionary of task_family => task_class\n\n    \"\"\"\n    ret = {}\n    for i in range(n):\n        class_name = \"{}_{}\".format(task_class.task_family, i)\n        ret[class_name] = type(class_name, (task_class,), {})\n\n    return ret\n\n\nclass UberTask(luigi.Task):\n    \"\"\"\n    A task which depends on n copies of a configurable subclass.\n\n    \"\"\"\n\n    _done = False\n\n    base_task = luigi.TaskParameter()\n    x = luigi.Parameter()\n    copies = luigi.IntParameter()\n\n    def requires(self):\n        task_families = generate_task_families(self.base_task, self.copies)\n        for class_name in task_families:\n            yield task_families[class_name](x=self.x)\n\n    def complete(self):\n        return self._done\n\n    def run(self):\n        self._done = True\n\n\ndef popmin(a, b):\n    \"\"\"\n    popmin(a, b) -> (i, a', b')\n\n    where i is min(a[0], b[0]) and a'/b' are the results of removing i from the\n    relevant sequence.\n    \"\"\"\n    if len(a) == 0:\n        return b[0], a, b[1:]\n    elif len(b) == 0:\n        return a[0], a[1:], b\n    elif a[0] > b[0]:\n        return b[0], a, b[1:]\n    else:\n        return a[0], a[1:], b\n\n\nclass 
MemoryTarget(luigi.Target):\n    def __init__(self):\n        self.box = None\n\n    def exists(self):\n        return self.box is not None\n\n\nclass MergeSort(luigi.Task):\n    x = luigi.Parameter(description=\"A string to be sorted\")\n\n    def __init__(self, *args, **kwargs):\n        super(MergeSort, self).__init__(*args, **kwargs)\n\n        self.result = MemoryTarget()\n\n    def requires(self):\n        # Allows us to override behaviour in subclasses\n        cls = self.__class__\n\n        if len(self.x) > 1:\n            p = len(self.x) // 2\n\n            return [cls(self.x[:p]), cls(self.x[p:])]\n\n    def output(self):\n        return self.result\n\n    def run(self):\n        if len(self.x) > 1:\n            list_1, list_2 = (x.box for x in self.input())\n\n            s = []\n            while list_1 or list_2:\n                item, list_1, list_2 = popmin(list_1, list_2)\n                s.append(item)\n        else:\n            s = self.x\n\n        self.result.box = \"\".join(s)\n\n\nclass FailingMergeSort(MergeSort):\n    \"\"\"\n    Simply fail if the string to sort starts with ' '.\n\n    \"\"\"\n\n    fail_probability = luigi.FloatParameter(default=0.0)\n\n    def run(self):\n        if self.x[0] == \" \":\n            raise Exception(\"I failed\")\n        else:\n            return super(FailingMergeSort, self).run()\n\n\nif __name__ == \"__main__\":\n    x = \"I scream for ice cream\"\n    task = UberTask(base_task=FailingMergeSort, x=x, copies=4)\n    luigi.build([task], workers=1, scheduler_port=8082)\n"
  },
  {
    "path": "test/worker_external_task_test.py",
    "content": "# Copyright (c) 2015\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport os\nimport shutil\nimport tempfile\n\nfrom helpers import unittest, with_config\nfrom mock import patch\n\nimport luigi\nimport luigi.server\nimport luigi.task\nimport luigi.worker\nfrom luigi.local_target import LocalTarget\nfrom luigi.scheduler import Scheduler\n\n\nclass TestExternalFileTask(luigi.ExternalTask):\n    \"\"\"Mocking tasks is a pain, so touch a file instead\"\"\"\n\n    path = luigi.Parameter()\n    times_to_call = luigi.IntParameter()\n\n    def __init__(self, *args, **kwargs):\n        super(TestExternalFileTask, self).__init__(*args, **kwargs)\n        self.times_called = 0\n\n    def complete(self):\n        \"\"\"\n        Create the file we need after a number of preconfigured attempts\n        \"\"\"\n        self.times_called += 1\n\n        if self.times_called >= self.times_to_call:\n            open(self.path, \"a\").close()\n\n        return os.path.exists(self.path)\n\n    def output(self):\n        return LocalTarget(path=self.path)\n\n\nclass TestTask(luigi.Task):\n    \"\"\"\n    Requires a single file dependency\n    \"\"\"\n\n    tempdir = luigi.Parameter()\n    complete_after = luigi.IntParameter()\n\n    def __init__(self, *args, **kwargs):\n        super(TestTask, self).__init__(*args, **kwargs)\n        self.output_path = os.path.join(self.tempdir, \"test.output\")\n        self.dep_path = os.path.join(self.tempdir, \"test.dep\")\n   
     self.dependency = TestExternalFileTask(path=self.dep_path, times_to_call=self.complete_after)\n\n    def requires(self):\n        yield self.dependency\n\n    def output(self):\n        return LocalTarget(path=self.output_path)\n\n    def run(self):\n        open(self.output_path, \"a\").close()\n\n\nclass WorkerExternalTaskTest(unittest.TestCase):\n    def setUp(self):\n        self.tempdir = tempfile.mkdtemp(prefix=\"luigi-test-\")\n\n    def tearDown(self):\n        shutil.rmtree(self.tempdir)\n\n    def _assert_complete(self, tasks):\n        for t in tasks:\n            self.assert_(t.complete())\n\n    def _build(self, tasks):\n        with self._make_worker() as w:\n            for t in tasks:\n                w.add(t)\n            w.run()\n\n    def _make_worker(self):\n        self.scheduler = Scheduler(prune_on_get_work=True)\n        return luigi.worker.Worker(scheduler=self.scheduler, worker_processes=1)\n\n    def test_external_dependency_already_complete(self):\n        \"\"\"\n        Test that the test task completes when its dependency exists at the\n        start of the execution.\n        \"\"\"\n        test_task = TestTask(tempdir=self.tempdir, complete_after=1)\n        luigi.build([test_task], local_scheduler=True)\n\n        assert os.path.exists(test_task.dep_path)\n        assert os.path.exists(test_task.output_path)\n\n        # complete() is called once per failure, twice per success\n        assert test_task.dependency.times_called == 2\n\n    @with_config({\"worker\": {\"retry_external_tasks\": \"true\"}, \"scheduler\": {\"retry_delay\": \"0.0\"}})\n    def test_external_dependency_gets_rechecked(self):\n        \"\"\"\n        Test that retry_external_tasks re-checks external tasks\n        \"\"\"\n        assert luigi.worker.worker().retry_external_tasks is True\n\n        test_task = TestTask(tempdir=self.tempdir, complete_after=10)\n        self._build([test_task])\n\n        assert os.path.exists(test_task.dep_path)\n        
assert os.path.exists(test_task.output_path)\n\n        self.assertGreaterEqual(test_task.dependency.times_called, 10)\n\n    @with_config({\"worker\": {\"retry_external_tasks\": \"true\", \"keep_alive\": \"true\", \"wait_interval\": \"0.00001\"}, \"scheduler\": {\"retry_delay\": \"0.01\"}})\n    def test_external_dependency_worker_is_patient(self):\n        \"\"\"\n        Test that worker doesn't \"give up\" with keep_alive option\n\n        Instead, it should sleep for random.uniform() seconds, then ask\n        scheduler for work.\n        \"\"\"\n        assert luigi.worker.worker().retry_external_tasks is True\n\n        with patch(\"random.uniform\", return_value=0.001):\n            test_task = TestTask(tempdir=self.tempdir, complete_after=5)\n            self._build([test_task])\n\n        assert os.path.exists(test_task.dep_path)\n        assert os.path.exists(test_task.output_path)\n\n        self.assertGreaterEqual(test_task.dependency.times_called, 5)\n\n    def test_external_dependency_bare(self):\n        \"\"\"\n        Test ExternalTask without altering global settings.\n        \"\"\"\n        assert luigi.worker.worker().retry_external_tasks is False\n\n        test_task = TestTask(tempdir=self.tempdir, complete_after=5)\n\n        scheduler = luigi.scheduler.Scheduler(retry_delay=0.01, prune_on_get_work=True)\n        with luigi.worker.Worker(retry_external_tasks=True, scheduler=scheduler, keep_alive=True, wait_interval=0.00001, wait_jitter=0) as w:\n            w.add(test_task)\n            w.run()\n\n        assert os.path.exists(test_task.dep_path)\n        assert os.path.exists(test_task.output_path)\n\n        self.assertGreaterEqual(test_task.dependency.times_called, 5)\n\n    @with_config(\n        {\n            \"worker\": {\n                \"retry_external_tasks\": \"true\",\n            },\n            \"scheduler\": {\"retry_delay\": \"0.0\"},\n        }\n    )\n    def test_external_task_complete_but_missing_dep_at_runtime(self):\n 
       \"\"\"\n        Test external task complete but has missing upstream dependency at\n        runtime.\n\n        Should not get \"unfulfilled dependencies\" error.\n        \"\"\"\n        test_task = TestTask(tempdir=self.tempdir, complete_after=3)\n        test_task.run = NotImplemented\n\n        assert len(test_task.deps()) > 0\n\n        # split up scheduling task and running to simulate runtime scenario\n        with self._make_worker() as w:\n            w.add(test_task)\n            # touch output so test_task should be considered complete at runtime\n            open(test_task.output_path, \"a\").close()\n            success = w.run()\n\n        self.assertTrue(success)\n        # upstream dependency output didn't exist at runtime\n        self.assertFalse(os.path.exists(test_task.dep_path))\n"
  },
  {
    "path": "test/worker_keep_alive_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2016 VNG Corporation\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport threading\n\nfrom helpers import LuigiTestCase\n\nimport luigi\nfrom luigi.scheduler import Scheduler\nfrom luigi.worker import Worker\n\n\nclass WorkerKeepAliveUpstreamTest(LuigiTestCase):\n    \"\"\"\n    Tests related to how the worker stays alive after upstream status changes.\n\n    See https://github.com/spotify/luigi/pull/1789\n    \"\"\"\n\n    def run(self, result=None):\n        \"\"\"\n        Common setup code. 
Due to the context manager, we can't use the normal setUp\n        \"\"\"\n        self.sch = Scheduler(retry_delay=0.00000001, retry_count=2)\n\n        with Worker(scheduler=self.sch, worker_id=\"X\", keep_alive=True, wait_interval=0.1, wait_jitter=0) as w:\n            self.w = w\n            super(WorkerKeepAliveUpstreamTest, self).run(result)\n\n    def test_alive_while_has_failure(self):\n        \"\"\"\n        One dependency disables and one fails\n        \"\"\"\n\n        class Disabler(luigi.Task):\n            pass\n\n        class Failer(luigi.Task):\n            did_run = False\n\n            def run(self):\n                self.did_run = True\n\n        class Wrapper(luigi.WrapperTask):\n            def requires(self):\n                return (Disabler(), Failer())\n\n        self.w.add(Wrapper())\n        disabler = Disabler().task_id\n        failer = Failer().task_id\n        self.sch.add_task(disabler, \"FAILED\", worker=\"X\")\n        self.sch.prune()  # Make scheduler unfail the disabled task\n        self.sch.add_task(disabler, \"FAILED\", worker=\"X\")  # Disable it\n        self.sch.add_task(failer, \"FAILED\", worker=\"X\")  # Fail it\n        try:\n            t = threading.Thread(target=self.w.run)\n            t.start()\n            t.join(timeout=1)  # Wait 1 second\n            self.assertTrue(t.is_alive())  # It shouldn't stop trying, the failed task should be retried!\n            self.assertFalse(Failer.did_run)  # It should never have run, the cooldown is longer than a second.\n        finally:\n            self.sch.prune()  # Make it, like die. 
Couldn't find a more forceful way to do this.\n            t.join(timeout=1)  # Wait 1 second\n            assert not t.is_alive()\n\n    def test_alive_while_has_success(self):\n        \"\"\"\n        One dependency disables and one succeeds\n        \"\"\"\n\n        # TODO: Fix copy paste mess\n        class Disabler(luigi.Task):\n            pass\n\n        class Succeeder(luigi.Task):\n            did_run = False\n\n            def run(self):\n                self.did_run = True\n\n        class Wrapper(luigi.WrapperTask):\n            def requires(self):\n                return (Disabler(), Succeeder())\n\n        self.w.add(Wrapper())\n        disabler = Disabler().task_id\n        succeeder = Succeeder().task_id\n        self.sch.add_task(disabler, \"FAILED\", worker=\"X\")\n        self.sch.prune()  # Make scheduler unfail the disabled task\n        self.sch.add_task(disabler, \"FAILED\", worker=\"X\")  # Disable it\n        self.sch.add_task(succeeder, \"DONE\", worker=\"X\")  # Mark it as done\n        try:\n            t = threading.Thread(target=self.w.run)\n            t.start()\n            t.join(timeout=1)  # Wait 1 second\n            self.assertFalse(t.is_alive())  # The worker should think that it should stop ...\n            # ... because in this case the only work remaining depends on DISABLED tasks,\n            # hence it's not worth considering the wrapper task as a PENDING task to\n            # keep the worker alive anymore.\n            self.assertFalse(Succeeder.did_run)  # It should never have run, it succeeded already\n        finally:\n            self.sch.prune()  # This shouldn't be necessary in this version, but whatevs\n            t.join(timeout=1)  # Wait 1 second\n            assert not t.is_alive()\n"
  },
  {
    "path": "test/worker_multiprocess_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport logging\n\nfrom helpers import unittest\nfrom mock import Mock\n\nimport luigi.notifications\nimport luigi.worker\nfrom luigi import Parameter, RemoteScheduler, Task\nfrom luigi.worker import Worker\n\nluigi.notifications.DEBUG = True\n\n\nclass DummyTask(Task):\n    param = Parameter()\n\n    def __init__(self, *args, **kwargs):\n        super(DummyTask, self).__init__(*args, **kwargs)\n        self.has_run = False\n\n    def complete(self):\n        old_value = self.has_run\n        self.has_run = True\n        return old_value\n\n    def run(self):\n        logging.debug(\"%s - setting has_run\", self)\n        self.has_run = True\n\n\nclass MultiprocessWorkerTest(unittest.TestCase):\n    def run(self, result=None):\n        self.scheduler = RemoteScheduler()\n        self.scheduler.add_worker = Mock()\n        self.scheduler.add_task = Mock()\n        with Worker(scheduler=self.scheduler, worker_id=\"X\", worker_processes=2) as worker:\n            self.worker = worker\n            super(MultiprocessWorkerTest, self).run(result)\n\n    def gw_res(self, pending, task_id):\n        return dict(n_pending_tasks=pending, task_id=task_id, running_tasks=0, n_unique_pending=0)\n\n    def test_positive_path(self):\n        a = DummyTask(\"a\")\n        b = DummyTask(\"b\")\n\n        class MultipleRequirementTask(DummyTask):\n         
   def requires(self):\n                return [a, b]\n\n        c = MultipleRequirementTask(\"C\")\n\n        self.assertTrue(self.worker.add(c))\n\n        self.scheduler.get_work = Mock(\n            side_effect=[self.gw_res(3, a.task_id), self.gw_res(2, b.task_id), self.gw_res(1, c.task_id), self.gw_res(0, None), self.gw_res(0, None)]\n        )\n\n        self.assertTrue(self.worker.run())\n        self.assertTrue(c.has_run)\n\n    def test_path_with_task_failures(self):\n        class FailingTask(DummyTask):\n            def run(self):\n                raise Exception(\"I am failing\")\n\n        a = FailingTask(\"a\")\n        b = FailingTask(\"b\")\n\n        class MultipleRequirementTask(DummyTask):\n            def requires(self):\n                return [a, b]\n\n        c = MultipleRequirementTask(\"C\")\n\n        self.assertTrue(self.worker.add(c))\n\n        self.scheduler.get_work = Mock(\n            side_effect=[self.gw_res(3, a.task_id), self.gw_res(2, b.task_id), self.gw_res(1, c.task_id), self.gw_res(0, None), self.gw_res(0, None)]\n        )\n\n        self.assertFalse(self.worker.run())\n\n\nclass SingleWorkerMultiprocessTest(unittest.TestCase):\n    def test_default_multiprocessing_behavior(self):\n        with Worker(worker_processes=1) as worker:\n            task = DummyTask(\"a\")\n            task_process = worker._create_task_process(task)\n            self.assertFalse(task_process.use_multiprocessing)\n\n    def test_force_multiprocessing(self):\n        with Worker(worker_processes=1, force_multiprocessing=True) as worker:\n            task = DummyTask(\"a\")\n            task_process = worker._create_task_process(task)\n            self.assertTrue(task_process.use_multiprocessing)\n"
  },
  {
    "path": "test/worker_parallel_scheduling_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport contextlib\nimport gc\nimport os\nimport pickle\nimport time\n\nimport mock\nimport psutil\nfrom helpers import unittest\n\nimport luigi\nfrom luigi.task_status import UNKNOWN\nfrom luigi.worker import Worker\n\n\ndef running_children():\n    children = set()\n    process = psutil.Process(os.getpid())\n    for child in process.children():\n        if child.is_running():\n            children.add(child.pid)\n    return children\n\n\n@contextlib.contextmanager\ndef pause_gc():\n    if not gc.isenabled():\n        yield\n    try:\n        gc.disable()\n        yield\n    finally:\n        gc.enable()\n\n\nclass SlowCompleteWrapper(luigi.WrapperTask):\n    def requires(self):\n        return [SlowCompleteTask(i) for i in range(4)]\n\n\nclass SlowCompleteTask(luigi.Task):\n    n = luigi.IntParameter()\n\n    def complete(self):\n        time.sleep(0.1)\n        return True\n\n\nclass OverlappingSelfDependenciesTask(luigi.Task):\n    n = luigi.IntParameter()\n    k = luigi.IntParameter()\n\n    def complete(self):\n        return self.n < self.k or self.k == 0\n\n    def requires(self):\n        return [OverlappingSelfDependenciesTask(self.n - 1, k) for k in range(self.k + 1)]\n\n\nclass ExceptionCompleteTask(luigi.Task):\n    def complete(self):\n        assert False\n\n\nclass ExceptionRequiresTask(luigi.Task):\n    def 
requires(self):\n        assert False\n\n\nclass UnpicklableExceptionTask(luigi.Task):\n    def complete(self):\n        class UnpicklableException(Exception):\n            pass\n\n        raise UnpicklableException()\n\n\nclass ParallelSchedulingTest(unittest.TestCase):\n    def setUp(self):\n        self.sch = mock.Mock()\n        self.w = Worker(scheduler=self.sch, worker_id=\"x\")\n\n    def added_tasks(self, status):\n        return [kw[\"task_id\"] for args, kw in self.sch.add_task.call_args_list if kw[\"status\"] == status]\n\n    def test_number_of_processes(self):\n        import multiprocessing\n\n        real_pool = multiprocessing.Pool(1)\n        with mock.patch(\"multiprocessing.Pool\") as mocked_pool:\n            mocked_pool.return_value = real_pool\n            self.w.add(OverlappingSelfDependenciesTask(n=1, k=1), multiprocess=True, processes=1234)\n            mocked_pool.assert_called_once_with(processes=1234)\n\n    def test_zero_processes(self):\n        import multiprocessing\n\n        real_pool = multiprocessing.Pool(1)\n        with mock.patch(\"multiprocessing.Pool\") as mocked_pool:\n            mocked_pool.return_value = real_pool\n            self.w.add(OverlappingSelfDependenciesTask(n=1, k=1), multiprocess=True, processes=0)\n            mocked_pool.assert_called_once_with(processes=None)\n\n    def test_children_terminated(self):\n        before_children = running_children()\n        with pause_gc():\n            self.w.add(\n                OverlappingSelfDependenciesTask(5, 2),\n                multiprocess=True,\n            )\n            self.assertLessEqual(running_children(), before_children)\n\n    def test_multiprocess_scheduling_with_overlapping_dependencies(self):\n        self.w.add(OverlappingSelfDependenciesTask(5, 2), True)\n        self.assertEqual(15, self.sch.add_task.call_count)\n        self.assertEqual(\n            set(\n                (\n                    OverlappingSelfDependenciesTask(n=1, k=1).task_id,\n  
                  OverlappingSelfDependenciesTask(n=2, k=1).task_id,\n                    OverlappingSelfDependenciesTask(n=2, k=2).task_id,\n                    OverlappingSelfDependenciesTask(n=3, k=1).task_id,\n                    OverlappingSelfDependenciesTask(n=3, k=2).task_id,\n                    OverlappingSelfDependenciesTask(n=4, k=1).task_id,\n                    OverlappingSelfDependenciesTask(n=4, k=2).task_id,\n                    OverlappingSelfDependenciesTask(n=5, k=2).task_id,\n                )\n            ),\n            set(self.added_tasks(\"PENDING\")),\n        )\n        self.assertEqual(\n            set(\n                (\n                    OverlappingSelfDependenciesTask(n=0, k=0).task_id,\n                    OverlappingSelfDependenciesTask(n=0, k=1).task_id,\n                    OverlappingSelfDependenciesTask(n=1, k=0).task_id,\n                    OverlappingSelfDependenciesTask(n=1, k=2).task_id,\n                    OverlappingSelfDependenciesTask(n=2, k=0).task_id,\n                    OverlappingSelfDependenciesTask(n=3, k=0).task_id,\n                    OverlappingSelfDependenciesTask(n=4, k=0).task_id,\n                )\n            ),\n            set(self.added_tasks(\"DONE\")),\n        )\n\n    @mock.patch(\"luigi.notifications.send_error_email\")\n    def test_raise_exception_in_complete(self, send):\n        self.w.add(ExceptionCompleteTask(), multiprocess=True)\n        send.check_called_once()\n        self.assertEqual(UNKNOWN, self.sch.add_task.call_args[1][\"status\"])\n        self.assertFalse(self.sch.add_task.call_args[1][\"runnable\"])\n        self.assertTrue(\"assert False\" in send.call_args[0][1])\n\n    @mock.patch(\"luigi.notifications.send_error_email\")\n    def test_raise_unpicklable_exception_in_complete(self, send):\n        # verify exception can't be pickled\n        self.assertRaises(Exception, UnpicklableExceptionTask().complete)\n        try:\n            
UnpicklableExceptionTask().complete()\n        except Exception as e:\n            ex = e\n        self.assertRaises((pickle.PicklingError, AttributeError), pickle.dumps, ex)\n\n        # verify this can run async\n        self.w.add(UnpicklableExceptionTask(), multiprocess=True)\n        send.check_called_once()\n        self.assertEqual(UNKNOWN, self.sch.add_task.call_args[1][\"status\"])\n        self.assertFalse(self.sch.add_task.call_args[1][\"runnable\"])\n        self.assertTrue(\"raise UnpicklableException()\" in send.call_args[0][1])\n\n    @mock.patch(\"luigi.notifications.send_error_email\")\n    def test_raise_exception_in_requires(self, send):\n        self.w.add(ExceptionRequiresTask(), multiprocess=True)\n        send.check_called_once()\n        self.assertEqual(UNKNOWN, self.sch.add_task.call_args[1][\"status\"])\n        self.assertFalse(self.sch.add_task.call_args[1][\"runnable\"])\n"
  },
  {
    "path": "test/worker_scheduler_com_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2017 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport contextlib\nimport os\nimport shutil\nimport tempfile\nimport threading\nimport time\n\nfrom helpers import LuigiTestCase\n\nimport luigi\nfrom luigi.scheduler import Scheduler\nfrom luigi.worker import Worker\n\n\nclass WorkerSchedulerCommunicationTest(LuigiTestCase):\n    \"\"\"\n    Tests related to communication between Worker and Scheduler that is based on the ping polling.\n\n    See https://github.com/spotify/luigi/pull/1993\n    \"\"\"\n\n    def run(self, result=None):\n        self.sch = Scheduler()\n\n        with Worker(scheduler=self.sch, worker_id=\"X\", ping_interval=1, max_reschedules=0) as w:\n            self.w = w\n\n            # also save scheduler's worker struct\n            self.sw = self.sch._state.get_worker(self.w._id)\n\n            super(WorkerSchedulerCommunicationTest, self).run(result)\n\n    def wrapper_task(test_self):\n        tmp = tempfile.mkdtemp()\n\n        class MyTask(luigi.Task):\n            n = luigi.IntParameter()\n            delay = 3\n\n            def output(self):\n                basename = \"%s_%s.txt\" % (self.__class__.__name__, self.n)\n                return luigi.LocalTarget(os.path.join(tmp, basename))\n\n            def run(self):\n                time.sleep(self.delay)\n                with self.output().open(\"w\") as f:\n                    f.write(\"content\\n\")\n\n 
       class Wrapper(MyTask):\n            delay = 0\n\n            def requires(self):\n                return [MyTask(n=n) for n in range(self.n)]\n\n        return Wrapper, tmp\n\n    def test_message_handling(self):\n        # add some messages for that worker\n        for i in range(10):\n            self.sw.add_rpc_message(\"foo\", i=i)\n        self.assertEqual(10, len(self.sw.rpc_messages))\n        self.assertEqual(9, self.sw.rpc_messages[-1][\"kwargs\"][\"i\"])\n\n        # fetch\n        msgs = self.sw.fetch_rpc_messages()\n        self.assertEqual(0, len(self.sw.rpc_messages))\n        self.assertEqual(9, msgs[-1][\"kwargs\"][\"i\"])\n\n    def test_ping_content(self):\n        # add some messages for that worker\n        for i in range(10):\n            self.sw.add_rpc_message(\"foo\", i=i)\n\n        # ping the scheduler and check the result\n        res = self.sch.ping(worker=self.w._id)\n        self.assertIn(\"rpc_messages\", res)\n        msgs = res[\"rpc_messages\"]\n        self.assertEqual(10, len(msgs))\n        self.assertEqual(\"foo\", msgs[-1][\"name\"])\n        self.assertEqual(9, msgs[-1][\"kwargs\"][\"i\"])\n\n        # there should be no message left\n        self.assertEqual(0, len(self.sw.rpc_messages))\n\n    @contextlib.contextmanager\n    def run_wrapper(self, n):\n        # assign the wrapper task to the worker\n        Wrapper, tmp = self.wrapper_task()\n        wrapper = Wrapper(n=n)\n        self.assertTrue(self.w.add(wrapper))\n\n        # check the initial number of worker processes\n        self.assertEqual(1, self.w.worker_processes)\n\n        # run the task in a thread and while running, increase the number of worker processes\n        # via an rpc message\n        t = threading.Thread(target=self.w.run)\n        t.start()\n\n        # yield\n        yield wrapper, t\n\n        # finally, check that thread is done\n        self.assertFalse(t.is_alive())\n\n        # cleanup the tmp dir\n        shutil.rmtree(tmp)\n\n    
def test_dispatch_valid_message(self):\n        with self.run_wrapper(3) as (wrapper, t):\n            # each of the wrapper task's tasks runs 3 seconds, and the ping/message dispatch\n            # interval is 1 second, so it should be safe to wait 1 second here, add the message\n            # which is then fetched by the keep alive thread and dispatched, so after additional 3\n            # seconds, the worker will have a changed number of processes\n            t.join(1)\n            self.sch.set_worker_processes(self.w._id, 2)\n\n            t.join(3)\n            self.assertEqual(2, self.w.worker_processes)\n\n            # after additional 3 seconds, the wrapper task + all required tasks should be completed\n            t.join(3)\n            self.assertTrue(all(task.complete() for task in wrapper.requires()))\n            self.assertTrue(wrapper.complete())\n\n    def test_dispatch_invalid_message(self):\n        # this test is identical to test_dispatch_valid_message, except that the number of processes\n        # is not increased during running as we send an invalid rpc message\n        # in addition, the wrapper will only have two requirements\n        with self.run_wrapper(2) as (wrapper, t):\n            # timing info as above\n            t.join(1)\n            self.sw.add_rpc_message(\"set_worker_processes_not_there\", n=2)\n\n            t.join(3)\n            self.assertEqual(1, self.w.worker_processes)\n\n            # after additional 3 seconds, the wrapper task and all required tasks should be completed\n            t.join(3)\n            self.assertTrue(all(task.complete() for task in wrapper.requires()))\n            self.assertTrue(wrapper.complete())\n\n    def test_dispatch_unregistered_message(self):\n        # this test is identical to test_dispatch_valid_message, except that the number of processes\n        # is not increased during running as we disable the particular callback to work as a\n        # callback, so we want to achieve sth 
like\n        # self.w.set_worker_processes.is_rpc_message_callback = False\n        # but this is not possible in py 2 due to wrapped method lookup, see\n        # http://stackoverflow.com/questions/9523370/adding-attributes-to-instance-methods-in-python\n        set_worker_processes_orig = self.w.set_worker_processes\n\n        def set_worker_processes_replacement(*args, **kwargs):\n            return set_worker_processes_orig(*args, **kwargs)\n\n        self.w.set_worker_processes = set_worker_processes_replacement\n        self.assertFalse(getattr(self.w.set_worker_processes, \"is_rpc_message_callback\", False))\n\n        with self.run_wrapper(2) as (wrapper, t):\n            # timing info as above\n            t.join(1)\n            self.sw.add_rpc_message(\"set_worker_processes\", n=2)\n\n            t.join(3)\n            self.assertEqual(1, self.w.worker_processes)\n\n            # after additional 3 seconds, the wrapper task and all required tasks should be completed\n            t.join(3)\n            self.assertTrue(all(task.complete() for task in wrapper.requires()))\n            self.assertTrue(wrapper.complete())\n"
  },
  {
    "path": "test/worker_task_process_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport multiprocessing\n\nfrom helpers import LuigiTestCase, temporary_unloaded_module\n\nimport luigi\nfrom luigi.worker import Worker\n\n\nclass ContextManagedTaskProcessTest(LuigiTestCase):\n    def _test_context_manager(self, force_multiprocessing):\n        CONTEXT_MANAGER_MODULE = b\"\"\"\nclass MyContextManager:\n    def __init__(self, task_process):\n        self.task = task_process.task\n    def __enter__(self):\n        assert not self.task.run_event.is_set(), \"the task should not have run yet\"\n        self.task.enter_event.set()\n        return self\n    def __exit__(self, exc_type=None, exc_value=None, traceback=None):\n        assert self.task.run_event.is_set(), \"the task should have run\"\n        self.task.exit_event.set()\n\"\"\"\n\n        class DummyEventRecordingTask(luigi.Task):\n            def __init__(self, *args, **kwargs):\n                self.enter_event = multiprocessing.Event()\n                self.exit_event = multiprocessing.Event()\n                self.run_event = multiprocessing.Event()\n                super(DummyEventRecordingTask, self).__init__(*args, **kwargs)\n\n            def run(self):\n                assert self.enter_event.is_set(), \"the context manager should have been entered\"\n                assert not self.exit_event.is_set(), \"the context manager should not have been exited 
yet\"\n                assert not self.run_event.is_set(), \"the task should not have run yet\"\n                self.run_event.set()\n\n            def complete(self):\n                return self.run_event.is_set()\n\n        with temporary_unloaded_module(CONTEXT_MANAGER_MODULE) as module_name:\n            t = DummyEventRecordingTask()\n            w = Worker(task_process_context=module_name + \".MyContextManager\", force_multiprocessing=force_multiprocessing)\n            w.add(t)\n            self.assertTrue(w.run())\n            self.assertTrue(t.complete())\n            self.assertTrue(t.enter_event.is_set())\n            self.assertTrue(t.exit_event.is_set())\n\n    def test_context_manager_without_multiprocessing(self):\n        self._test_context_manager(False)\n\n    def test_context_manager_with_multiprocessing(self):\n        self._test_context_manager(True)\n"
  },
  {
    "path": "test/worker_task_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\nimport multiprocessing\nimport sys\nfrom subprocess import check_call\nfrom time import sleep\n\nimport mock\nfrom helpers import LuigiTestCase, StringContaining\nfrom psutil import Process\n\nimport luigi\nimport luigi.date_interval\nimport luigi.notifications\nfrom luigi.mock import MockTarget\nfrom luigi.scheduler import DONE, FAILED\nfrom luigi.worker import TaskException, TaskProcess\n\nluigi.notifications.DEBUG = True\n\n\nclass WorkerTaskTest(LuigiTestCase):\n    def test_constructor(self):\n        class MyTask(luigi.Task):\n            # Test overriding the constructor without calling the superconstructor\n            # This is a simple mistake but caused an error that was very hard to understand\n\n            def __init__(self):\n                pass\n\n        def f():\n            luigi.build([MyTask()], local_scheduler=True)\n\n        self.assertRaises(TaskException, f)\n\n    def test_run_none(self):\n        def f():\n            luigi.build([None], local_scheduler=True)\n\n        self.assertRaises(TaskException, f)\n\n\nclass TaskProcessTest(LuigiTestCase):\n    def test_update_result_queue_on_success(self):\n        # IMO this test makes no sense as it tests internal behavior and have\n        # already broken once during internal non-changing refactoring\n        class SuccessTask(luigi.Task):\n            def 
on_success(self):\n                return \"test success expl\"\n\n        task = SuccessTask()\n        result_queue = multiprocessing.Queue()\n        task_process = TaskProcess(task, 1, result_queue, mock.Mock())\n\n        with mock.patch.object(result_queue, \"put\") as mock_put:\n            task_process.run()\n            mock_put.assert_called_once_with((task.task_id, DONE, \"test success expl\", [], None))\n\n    def test_update_result_queue_on_failure(self):\n        # IMO this test makes no sense as it tests internal behavior and have\n        # already broken once during internal non-changing refactoring\n        class FailTask(luigi.Task):\n            def run(self):\n                raise BaseException(\"Uh oh.\")\n\n            def on_failure(self, exception):\n                return \"test failure expl\"\n\n        task = FailTask()\n        result_queue = multiprocessing.Queue()\n        task_process = TaskProcess(task, 1, result_queue, mock.Mock())\n\n        with mock.patch.object(result_queue, \"put\") as mock_put:\n            task_process.run()\n            mock_put.assert_called_once_with((task.task_id, FAILED, \"test failure expl\", [], []))\n\n    def test_fail_on_false_complete(self):\n        class NeverCompleteTask(luigi.Task):\n            def complete(self):\n                return False\n\n        task = NeverCompleteTask()\n        result_queue = multiprocessing.Queue()\n        task_process = TaskProcess(task, 1, result_queue, mock.Mock(), check_complete_on_run=True)\n\n        with mock.patch.object(result_queue, \"put\") as mock_put:\n            task_process.run()\n            mock_put.assert_called_once_with((task.task_id, FAILED, StringContaining(\"finished running, but complete() is still returning false\"), [], None))\n\n    def test_fail_on_unfulfilled_dependencies(self):\n        class NeverCompleteTask(luigi.Task):\n            def complete(self):\n                return False\n\n        class A(NeverCompleteTask):\n       
     def output(self):\n                return []\n\n        class B(NeverCompleteTask):\n            def output(self):\n                return MockTarget(\"foo-B\")\n\n        class C(NeverCompleteTask):\n            def output(self):\n                return [MockTarget(\"foo-C1\"), MockTarget(\"foo-C2\")]\n\n        class Main(NeverCompleteTask):\n            def requires(self):\n                return [A(), B(), C()]\n\n        task = Main()\n        result_queue = multiprocessing.Queue()\n        task_process = TaskProcess(task, 1, result_queue, mock.Mock())\n\n        with mock.patch.object(result_queue, \"put\") as mock_put:\n            task_process.run()\n            expected_missing = [A().task_id, f\"{B().task_id} (foo-B)\", f\"{C().task_id} (foo-C1, foo-C2)\"]\n            mock_put.assert_called_once_with(\n                (\n                    task.task_id,\n                    FAILED,\n                    StringContaining(f\"Unfulfilled dependencies at run time: {', '.join(expected_missing)}\"),\n                    expected_missing,\n                    [],\n                )\n            )\n\n    def test_cleanup_children_on_terminate(self):\n        \"\"\"\n        Subprocesses spawned by tasks should be terminated on terminate\n        \"\"\"\n\n        class HangingSubprocessTask(luigi.Task):\n            def run(self):\n                python = sys.executable\n                check_call([python, \"-c\", \"while True: pass\"])\n\n        task = HangingSubprocessTask()\n        queue = mock.Mock()\n        worker_id = 1\n\n        task_process = TaskProcess(task, worker_id, queue, mock.Mock())\n        task_process.start()\n\n        parent = Process(task_process.pid)\n        while not parent.children():\n            # wait for child process to startup\n            sleep(0.01)\n\n        [child] = parent.children()\n        task_process.terminate()\n        child.wait(timeout=1.0)  # wait for terminate to complete\n\n        
self.assertFalse(parent.is_running())\n        self.assertFalse(child.is_running())\n\n    def test_disable_worker_timeout(self):\n        \"\"\"\n        When a task sets worker_timeout explicitly to 0, it should disable the timeout, even if it\n        is configured globally.\n        \"\"\"\n\n        class Task(luigi.Task):\n            worker_timeout = 0\n\n        task_process = TaskProcess(\n            task=Task(),\n            worker_id=1,\n            result_queue=mock.Mock(),\n            status_reporter=mock.Mock(),\n            worker_timeout=10,\n        )\n        self.assertEqual(task_process.worker_timeout, 0)\n"
  },
  {
    "path": "test/worker_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport email.parser\nimport functools\nimport logging\nimport os\nimport shutil\nimport signal\nimport tempfile\nimport threading\nimport time\n\nimport mock\nimport psutil\nfrom helpers import LuigiTestCase, skipOnTravisAndGithubActions, temporary_unloaded_module, unittest, with_config\n\nimport luigi.notifications\nimport luigi.task_register\nimport luigi.worker\nfrom luigi import Event, ExternalTask, RemoteScheduler, Task\nfrom luigi.cmdline import luigi_run\nfrom luigi.mock import MockFileSystem, MockTarget\nfrom luigi.rpc import RPCError\nfrom luigi.scheduler import Scheduler\nfrom luigi.worker import Worker\n\nluigi.notifications.DEBUG = True\n\n\nclass DummyTask(Task):\n    def __init__(self, *args, **kwargs):\n        super(DummyTask, self).__init__(*args, **kwargs)\n        self.has_run = False\n\n    def complete(self):\n        return self.has_run\n\n    def run(self):\n        logging.debug(\"%s - setting has_run\", self)\n        self.has_run = True\n\n\nclass DynamicDummyTask(Task):\n    p = luigi.Parameter()\n    sleep = luigi.FloatParameter(default=0.5, significant=False)\n\n    def output(self):\n        return luigi.LocalTarget(self.p)\n\n    def run(self):\n        with self.output().open(\"w\") as f:\n            f.write(\"Done!\")\n        time.sleep(self.sleep)  # so we can benchmark & see if parallelization 
works\n\n\nclass DynamicDummyTaskWithNamespace(DynamicDummyTask):\n    task_namespace = \"banana\"\n\n\nclass DynamicRequires(Task):\n    p = luigi.Parameter()\n    use_banana_task = luigi.BoolParameter(default=False)\n\n    def output(self):\n        return luigi.LocalTarget(os.path.join(self.p, \"parent\"))\n\n    def run(self):\n        if self.use_banana_task:\n            task_cls = DynamicDummyTaskWithNamespace\n        else:\n            task_cls = DynamicDummyTask\n        dummy_targets = yield [task_cls(os.path.join(self.p, str(i))) for i in range(5)]\n        dummy_targets += yield [task_cls(os.path.join(self.p, str(i))) for i in range(5, 7)]\n        with self.output().open(\"w\") as f:\n            for i, d in enumerate(dummy_targets):\n                for line in d.open(\"r\"):\n                    print(\"%d: %s\" % (i, line.strip()), file=f)\n\n\nclass DynamicRequiresWrapped(Task):\n    p = luigi.Parameter()\n\n    def output(self):\n        return luigi.LocalTarget(os.path.join(self.p, \"parent\"))\n\n    def run(self):\n        reqs = [DynamicDummyTask(p=os.path.join(self.p, \"%s.txt\" % i), sleep=0.0) for i in range(10)]\n\n        # yield again as DynamicRequires\n        yield luigi.DynamicRequirements(reqs)\n\n        # and again with a custom complete function that does base name comparisons\n        def custom_complete(complete_fn):\n            if not complete_fn(reqs[0]):\n                return False\n            paths = [task.output().path for task in reqs]\n            basenames = os.listdir(os.path.dirname(paths[0]))\n            self._custom_complete_called = True\n            self._custom_complete_result = all(os.path.basename(path) in basenames for path in paths)\n            return self._custom_complete_result\n\n        yield luigi.DynamicRequirements(reqs, custom_complete)\n\n        with self.output().open(\"w\") as f:\n            f.write(\"Done!\")\n\n\nclass DynamicRequiresOtherModule(Task):\n    p = luigi.Parameter()\n\n    
def output(self):\n        return luigi.LocalTarget(os.path.join(self.p, \"baz\"))\n\n    def run(self):\n        import other_module\n\n        other_target_foo = yield other_module.OtherModuleTask(os.path.join(self.p, \"foo\"))  # NOQA\n        other_target_bar = yield other_module.OtherModuleTask(os.path.join(self.p, \"bar\"))  # NOQA\n\n        with self.output().open(\"w\") as f:\n            f.write(\"Done!\")\n\n\nclass DummyErrorTask(Task):\n    retry_index = 0\n\n    def run(self):\n        self.retry_index += 1\n        raise Exception(\"Retry index is %s for %s\" % (self.retry_index, self.task_family))\n\n\nclass WorkerTest(LuigiTestCase):\n    def run(self, result=None):\n        self.sch = Scheduler(retry_delay=100, remove_delay=1000, worker_disconnect_delay=10, stable_done_cooldown_secs=0)\n        self.time = time.time\n        with Worker(scheduler=self.sch, worker_id=\"X\") as w, Worker(scheduler=self.sch, worker_id=\"Y\") as w2:\n            self.w = w\n            self.w2 = w2\n            super(WorkerTest, self).run(result)\n\n        if time.time != self.time:\n            time.time = self.time\n\n    def setTime(self, t):\n        time.time = lambda: t\n\n    def test_dep(self):\n        class A(Task):\n            def run(self):\n                self.has_run = True\n\n            def complete(self):\n                return self.has_run\n\n        a = A()\n\n        class B(Task):\n            def requires(self):\n                return a\n\n            def run(self):\n                self.has_run = True\n\n            def complete(self):\n                return self.has_run\n\n        b = B()\n        a.has_run = False\n        b.has_run = False\n\n        self.assertTrue(self.w.add(b))\n        self.assertTrue(self.w.run())\n        self.assertTrue(a.has_run)\n        self.assertTrue(b.has_run)\n\n    def test_external_dep(self):\n        class A(ExternalTask):\n            def complete(self):\n                return False\n\n        a = 
A()\n\n        class B(Task):\n            def requires(self):\n                return a\n\n            def run(self):\n                self.has_run = True\n\n            def complete(self):\n                return self.has_run\n\n        b = B()\n\n        a.has_run = False\n        b.has_run = False\n\n        self.assertTrue(self.w.add(b))\n        self.assertTrue(self.w.run())\n\n        self.assertFalse(a.has_run)\n        self.assertFalse(b.has_run)\n\n    def test_externalized_dep(self):\n        class A(Task):\n            has_run = False\n\n            def run(self):\n                self.has_run = True\n\n            def complete(self):\n                return self.has_run\n\n        a = A()\n\n        class B(A):\n            def requires(self):\n                return luigi.task.externalize(a)\n\n        b = B()\n\n        self.assertTrue(self.w.add(b))\n        self.assertTrue(self.w.run())\n\n        self.assertFalse(a.has_run)\n        self.assertFalse(b.has_run)\n\n    def test_legacy_externalized_dep(self):\n        class A(Task):\n            has_run = False\n\n            def run(self):\n                self.has_run = True\n\n            def complete(self):\n                return self.has_run\n\n        a = A()\n        a.run = NotImplemented\n\n        class B(A):\n            def requires(self):\n                return a\n\n        b = B()\n\n        self.assertTrue(self.w.add(b))\n        self.assertTrue(self.w.run())\n\n        self.assertFalse(a.has_run)\n        self.assertFalse(b.has_run)\n\n    def test_type_error_in_tracking_run_deprecated(self):\n        class A(Task):\n            num_runs = 0\n\n            def complete(self):\n                return False\n\n            def run(self, tracking_url_callback=None):\n                self.num_runs += 1\n                raise TypeError(\"bad type\")\n\n        a = A()\n        self.assertTrue(self.w.add(a))\n        self.assertFalse(self.w.run())\n\n        # Should only run and fail 
once, not retry because of the type error\n        self.assertEqual(1, a.num_runs)\n\n    def test_tracking_url(self):\n        tracking_url = \"http://test_url.com/\"\n\n        class A(Task):\n            has_run = False\n\n            def complete(self):\n                return self.has_run\n\n            def run(self):\n                self.set_tracking_url(tracking_url)\n                self.has_run = True\n\n        a = A()\n        self.assertTrue(self.w.add(a))\n        self.assertTrue(self.w.run())\n        tasks = self.sch.task_list(\"DONE\", \"\")\n        self.assertEqual(1, len(tasks))\n        self.assertEqual(tracking_url, tasks[a.task_id][\"tracking_url\"])\n\n    def test_fail(self):\n        class CustomException(BaseException):\n            def __init__(self, msg):\n                self.msg = msg\n\n        class A(Task):\n            def run(self):\n                self.has_run = True\n                raise CustomException(\"bad things\")\n\n            def complete(self):\n                return self.has_run\n\n        a = A()\n\n        class B(Task):\n            def requires(self):\n                return a\n\n            def run(self):\n                self.has_run = True\n\n            def complete(self):\n                return self.has_run\n\n        b = B()\n\n        a.has_run = False\n        b.has_run = False\n\n        self.assertTrue(self.w.add(b))\n        self.assertFalse(self.w.run())\n\n        self.assertTrue(a.has_run)\n        self.assertFalse(b.has_run)\n\n    def test_unknown_dep(self):\n        # see related test_remove_dep test (grep for it)\n        class A(ExternalTask):\n            def complete(self):\n                return False\n\n        class C(Task):\n            def complete(self):\n                return True\n\n        def get_b(dep):\n            class B(Task):\n                def requires(self):\n                    return dep\n\n                def run(self):\n                    self.has_run = True\n\n  
              def complete(self):\n                    return False\n\n            b = B()\n            b.has_run = False\n            return b\n\n        b_a = get_b(A())\n        b_c = get_b(C())\n\n        self.assertTrue(self.w.add(b_a))\n        # So now another worker goes in and schedules C -> B\n        # This should remove the dep A -> B but will screw up the first worker\n        self.assertTrue(self.w2.add(b_c))\n\n        self.assertFalse(self.w.run())  # should not run anything - the worker should detect that A is broken\n        self.assertFalse(b_a.has_run)\n        # not sure what should happen??\n        # self.w2.run() # should run B since C is fulfilled\n        # self.assertTrue(b_c.has_run)\n\n    def test_unfulfilled_dep(self):\n        class A(Task):\n            def complete(self):\n                return self.done\n\n            def run(self):\n                self.done = True\n\n        def get_b(a):\n            class B(A):\n                def requires(self):\n                    return a\n\n            b = B()\n            b.done = False\n            a.done = True\n            return b\n\n        a = A()\n        b = get_b(a)\n\n        self.assertTrue(self.w.add(b))\n        a.done = False\n        self.w.run()\n        self.assertTrue(a.complete())\n        self.assertTrue(b.complete())\n\n    def test_check_unfulfilled_deps_config(self):\n        class A(Task):\n            i = luigi.IntParameter()\n\n            def __init__(self, *args, **kwargs):\n                super(A, self).__init__(*args, **kwargs)\n                self.complete_count = 0\n                self.has_run = False\n\n            def complete(self):\n                self.complete_count += 1\n                return self.has_run\n\n            def run(self):\n                self.has_run = True\n\n        class B(A):\n            def requires(self):\n                return A(i=self.i)\n\n        # test the enabled features\n        with Worker(scheduler=self.sch, 
worker_id=\"1\") as w:\n            w._config.check_unfulfilled_deps = True\n            a1 = A(i=1)\n            b1 = B(i=1)\n            self.assertTrue(w.add(b1))\n            self.assertEqual(a1.complete_count, 1)\n            self.assertEqual(b1.complete_count, 1)\n            w.run()\n            self.assertTrue(a1.complete())\n            self.assertTrue(b1.complete())\n            self.assertEqual(a1.complete_count, 3)\n            self.assertEqual(b1.complete_count, 2)\n\n        # test the disabled features\n        with Worker(scheduler=self.sch, worker_id=\"2\") as w:\n            w._config.check_unfulfilled_deps = False\n            a2 = A(i=2)\n            b2 = B(i=2)\n            self.assertTrue(w.add(b2))\n            self.assertEqual(a2.complete_count, 1)\n            self.assertEqual(b2.complete_count, 1)\n            w.run()\n            self.assertTrue(a2.complete())\n            self.assertTrue(b2.complete())\n            self.assertEqual(a2.complete_count, 2)\n            self.assertEqual(b2.complete_count, 2)\n\n    def test_cache_task_completion_config(self):\n        class A(Task):\n            i = luigi.IntParameter()\n\n            def __init__(self, *args, **kwargs):\n                super(A, self).__init__(*args, **kwargs)\n                self.complete_count = 0\n                self.has_run = False\n\n            def complete(self):\n                self.complete_count += 1\n                return self.has_run\n\n            def run(self):\n                self.has_run = True\n\n        class B(A):\n            def run(self):\n                yield A(i=self.i + 0)\n                yield A(i=self.i + 1)\n                yield A(i=self.i + 2)\n                self.has_run = True\n\n        # test with enabled cache_task_completion\n        with Worker(scheduler=self.sch, worker_id=\"2\", cache_task_completion=True) as w:\n            b0 = B(i=0)\n            a0 = A(i=0)\n            a1 = A(i=1)\n            a2 = A(i=2)\n            
self.assertTrue(w.add(b0))\n            # a's are required dynamically, so their counts must be 0\n            self.assertEqual(b0.complete_count, 1)\n            self.assertEqual(a0.complete_count, 0)\n            self.assertEqual(a1.complete_count, 0)\n            self.assertEqual(a2.complete_count, 0)\n            w.run()\n            # the complete methods of a's yielded first in b's run method were called equally often\n            self.assertEqual(b0.complete_count, 1)\n            self.assertEqual(a0.complete_count, 2)\n            self.assertEqual(a1.complete_count, 2)\n            self.assertEqual(a2.complete_count, 2)\n\n        # test with disabled cache_task_completion\n        with Worker(scheduler=self.sch, worker_id=\"2\", cache_task_completion=False) as w:\n            b10 = B(i=10)\n            a10 = A(i=10)\n            a11 = A(i=11)\n            a12 = A(i=12)\n            self.assertTrue(w.add(b10))\n            # a's are required dynamically, so their counts must be 0\n            self.assertEqual(b10.complete_count, 1)\n            self.assertEqual(a10.complete_count, 0)\n            self.assertEqual(a11.complete_count, 0)\n            self.assertEqual(a12.complete_count, 0)\n            w.run()\n            # the complete methods of a's yielded first in b's run method were called more often\n            self.assertEqual(b10.complete_count, 1)\n            self.assertEqual(a10.complete_count, 5)\n            self.assertEqual(a11.complete_count, 4)\n            self.assertEqual(a12.complete_count, 3)\n\n        # test with enabled check_complete_on_run\n        with Worker(scheduler=self.sch, worker_id=\"2\", check_complete_on_run=True) as w:\n            b20 = B(i=20)\n            a20 = A(i=20)\n            a21 = A(i=21)\n            a22 = A(i=22)\n            self.assertTrue(w.add(b20))\n            # a's are required dynamically, so their counts must be 0\n            self.assertEqual(b20.complete_count, 1)\n            
self.assertEqual(a20.complete_count, 0)\n            self.assertEqual(a21.complete_count, 0)\n            self.assertEqual(a22.complete_count, 0)\n            w.run()\n            # the complete methods of a's yielded first in b's run method were called more often\n            self.assertEqual(b20.complete_count, 2)\n            self.assertEqual(a20.complete_count, 6)\n            self.assertEqual(a21.complete_count, 5)\n            self.assertEqual(a22.complete_count, 4)\n\n    def test_gets_missed_work(self):\n        class A(Task):\n            done = False\n\n            def complete(self):\n                return self.done\n\n            def run(self):\n                self.done = True\n\n        a = A()\n        self.assertTrue(self.w.add(a))\n\n        # simulate a missed get_work response\n        self.assertEqual(a.task_id, self.sch.get_work(worker=\"X\")[\"task_id\"])\n\n        self.assertTrue(self.w.run())\n        self.assertTrue(a.complete())\n\n    def test_avoid_infinite_reschedule(self):\n        class A(Task):\n            def complete(self):\n                return False\n\n        class B(Task):\n            def complete(self):\n                return False\n\n            def requires(self):\n                return A()\n\n        self.assertTrue(self.w.add(B()))\n        self.assertFalse(self.w.run())\n\n    def test_fails_registering_signal(self):\n        with mock.patch(\"luigi.worker.signal\", spec=[\"signal\"]):\n            # mock will raise an attribute error getting signal.SIGUSR1\n            Worker()\n\n    def test_allow_reschedule_with_many_missing_deps(self):\n        class A(Task):\n            \"\"\"Task that must run twice to succeed\"\"\"\n\n            i = luigi.IntParameter()\n\n            runs = 0\n\n            def complete(self):\n                return self.runs >= 2\n\n            def run(self):\n                self.runs += 1\n\n        class B(Task):\n            done = False\n\n            def requires(self):\n        
        return map(A, range(20))\n\n            def complete(self):\n                return self.done\n\n            def run(self):\n                self.done = True\n\n        b = B()\n        w = Worker(scheduler=self.sch, worker_id=\"X\", max_reschedules=1)\n        self.assertTrue(w.add(b))\n        self.assertFalse(w.run())\n\n        # For b to be done, we must have rescheduled its dependencies to run them twice\n        self.assertTrue(b.complete())\n        self.assertTrue(all(a.complete() for a in b.deps()))\n\n    def test_interleaved_workers(self):\n        class A(DummyTask):\n            pass\n\n        a = A()\n\n        class B(DummyTask):\n            def requires(self):\n                return a\n\n        ExternalB = luigi.task.externalize(B)\n\n        b = B()\n        eb = ExternalB()\n        self.assertEqual(str(eb), \"B()\")\n\n        sch = Scheduler(retry_delay=100, remove_delay=1000, worker_disconnect_delay=10)\n        with Worker(scheduler=sch, worker_id=\"X\") as w, Worker(scheduler=sch, worker_id=\"Y\") as w2:\n            self.assertTrue(w.add(b))\n            self.assertTrue(w2.add(eb))\n            logging.debug(\"RUNNING BROKEN WORKER\")\n            self.assertTrue(w2.run())\n            self.assertFalse(a.complete())\n            self.assertFalse(b.complete())\n            logging.debug(\"RUNNING FUNCTIONAL WORKER\")\n            self.assertTrue(w.run())\n            self.assertTrue(a.complete())\n            self.assertTrue(b.complete())\n\n    def test_interleaved_workers2(self):\n        # two tasks without dependencies, one external, one not\n        class B(DummyTask):\n            pass\n\n        ExternalB = luigi.task.externalize(B)\n\n        b = B()\n        eb = ExternalB()\n\n        self.assertEqual(str(eb), \"B()\")\n\n        sch = Scheduler(retry_delay=100, remove_delay=1000, worker_disconnect_delay=10)\n        with Worker(scheduler=sch, worker_id=\"X\") as w, Worker(scheduler=sch, worker_id=\"Y\") as w2:\n        
    self.assertTrue(w2.add(eb))\n            self.assertTrue(w.add(b))\n\n            self.assertTrue(w2.run())\n            self.assertFalse(b.complete())\n            self.assertTrue(w.run())\n            self.assertTrue(b.complete())\n\n    def test_interleaved_workers3(self):\n        class A(DummyTask):\n            def run(self):\n                logging.debug(\"running A\")\n                time.sleep(0.1)\n                super(A, self).run()\n\n        a = A()\n\n        class B(DummyTask):\n            def requires(self):\n                return a\n\n            def run(self):\n                logging.debug(\"running B\")\n                super(B, self).run()\n\n        b = B()\n\n        sch = Scheduler(retry_delay=100, remove_delay=1000, worker_disconnect_delay=10)\n\n        with Worker(scheduler=sch, worker_id=\"X\", keep_alive=True, count_uniques=True) as w:\n            with Worker(scheduler=sch, worker_id=\"Y\", keep_alive=True, count_uniques=True, wait_interval=0.1, wait_jitter=0.05) as w2:\n                self.assertTrue(w.add(a))\n                self.assertTrue(w2.add(b))\n\n                threading.Thread(target=w.run).start()\n                self.assertTrue(w2.run())\n\n                self.assertTrue(a.complete())\n                self.assertTrue(b.complete())\n\n    def test_die_for_non_unique_pending(self):\n        class A(DummyTask):\n            def run(self):\n                logging.debug(\"running A\")\n                time.sleep(0.1)\n                super(A, self).run()\n\n        a = A()\n\n        class B(DummyTask):\n            def requires(self):\n                return a\n\n            def run(self):\n                logging.debug(\"running B\")\n                super(B, self).run()\n\n        b = B()\n\n        sch = Scheduler(retry_delay=100, remove_delay=1000, worker_disconnect_delay=10)\n\n        with Worker(scheduler=sch, worker_id=\"X\", keep_alive=True, count_uniques=True) as w:\n            with 
Worker(scheduler=sch, worker_id=\"Y\", keep_alive=True, count_uniques=True, wait_interval=0.1, wait_jitter=0.05) as w2:\n                self.assertTrue(w.add(b))\n                self.assertTrue(w2.add(b))\n\n                self.assertEqual(w._get_work()[0], a.task_id)\n                self.assertTrue(w2.run())\n\n                self.assertFalse(a.complete())\n                self.assertFalse(b.complete())\n\n    def test_complete_exception(self):\n        \"Tests that a task is still scheduled if its sister task crashes in the complete() method\"\n\n        class A(DummyTask):\n            def complete(self):\n                raise Exception(\"doh\")\n\n        a = A()\n\n        class C(DummyTask):\n            pass\n\n        c = C()\n\n        class B(DummyTask):\n            def requires(self):\n                return a, c\n\n        b = B()\n        sch = Scheduler(retry_delay=100, remove_delay=1000, worker_disconnect_delay=10)\n        with Worker(scheduler=sch, worker_id=\"foo\") as w:\n            self.assertFalse(w.add(b))\n            self.assertTrue(w.run())\n            self.assertFalse(b.has_run)\n            self.assertTrue(c.has_run)\n            self.assertFalse(a.has_run)\n\n    def test_requires_exception(self):\n        class A(DummyTask):\n            def requires(self):\n                raise Exception(\"doh\")\n\n        a = A()\n\n        class D(DummyTask):\n            pass\n\n        d = D()\n\n        class C(DummyTask):\n            def requires(self):\n                return d\n\n        c = C()\n\n        class B(DummyTask):\n            def requires(self):\n                return c, a\n\n        b = B()\n        sch = Scheduler(retry_delay=100, remove_delay=1000, worker_disconnect_delay=10)\n        with Worker(scheduler=sch, worker_id=\"foo\") as w:\n            self.assertFalse(w.add(b))\n            self.assertTrue(w.run())\n            self.assertFalse(b.has_run)\n            self.assertTrue(c.has_run)\n            
self.assertTrue(d.has_run)\n            self.assertFalse(a.has_run)\n\n    def test_run_csv_batch_job(self):\n        completed = set()\n\n        class CsvBatchJob(luigi.Task):\n            values = luigi.parameter.Parameter(batch_method=\",\".join)\n            has_run = False\n\n            def run(self):\n                completed.update(self.values.split(\",\"))\n                self.has_run = True\n\n            def complete(self):\n                return all(value in completed for value in self.values.split(\",\"))\n\n        tasks = [CsvBatchJob(str(i)) for i in range(10)]\n        for task in tasks:\n            self.assertTrue(self.w.add(task))\n        self.assertTrue(self.w.run())\n\n        for task in tasks:\n            self.assertTrue(task.complete())\n            self.assertFalse(task.has_run)\n\n    def test_run_max_batch_job(self):\n        completed = set()\n\n        class MaxBatchJob(luigi.Task):\n            value = luigi.IntParameter(batch_method=max)\n            has_run = False\n\n            def run(self):\n                completed.add(self.value)\n                self.has_run = True\n\n            def complete(self):\n                return any(self.value <= ran for ran in completed)\n\n        tasks = [MaxBatchJob(i) for i in range(10)]\n        for task in tasks:\n            self.assertTrue(self.w.add(task))\n        self.assertTrue(self.w.run())\n\n        for task in tasks:\n            self.assertTrue(task.complete())\n            # only task number 9 should run\n            self.assertFalse(task.has_run and task.value < 9)\n\n    def test_run_batch_job_unbatched(self):\n        completed = set()\n\n        class MaxNonBatchJob(luigi.Task):\n            value = luigi.IntParameter(batch_method=max)\n            has_run = False\n\n            batchable = False\n\n            def run(self):\n                completed.add(self.value)\n                self.has_run = True\n\n            def complete(self):\n                return 
self.value in completed\n\n        tasks = [MaxNonBatchJob((i,)) for i in range(10)]\n        for task in tasks:\n            self.assertTrue(self.w.add(task))\n        self.assertTrue(self.w.run())\n\n        for task in tasks:\n            self.assertTrue(task.complete())\n            self.assertTrue(task.has_run)\n\n    def test_run_batch_job_limit_batch_size(self):\n        completed = set()\n        runs = []\n\n        class CsvLimitedBatchJob(luigi.Task):\n            value = luigi.parameter.Parameter(batch_method=\",\".join)\n            has_run = False\n\n            max_batch_size = 4\n\n            def run(self):\n                completed.update(self.value.split(\",\"))\n                runs.append(self)\n\n            def complete(self):\n                return all(value in completed for value in self.value.split(\",\"))\n\n        tasks = [CsvLimitedBatchJob(str(i)) for i in range(11)]\n        for task in tasks:\n            self.assertTrue(self.w.add(task))\n        self.assertTrue(self.w.run())\n\n        for task in tasks:\n            self.assertTrue(task.complete())\n\n        self.assertEqual(3, len(runs))\n\n    def test_fail_max_batch_job(self):\n        class MaxBatchFailJob(luigi.Task):\n            value = luigi.IntParameter(batch_method=max)\n            has_run = False\n\n            def run(self):\n                self.has_run = True\n                assert False\n\n            def complete(self):\n                return False\n\n        tasks = [MaxBatchFailJob(i) for i in range(10)]\n        for task in tasks:\n            self.assertTrue(self.w.add(task))\n        self.assertFalse(self.w.run())\n\n        for task in tasks:\n            # only task number 9 should run\n            self.assertFalse(task.has_run and task.value < 9)\n\n        self.assertEqual({task.task_id for task in tasks}, set(self.sch.task_list(\"FAILED\", \"\")))\n\n    def test_gracefully_handle_batch_method_failure(self):\n        class 
BadBatchMethodTask(DummyTask):\n            priority = 10\n            batch_int_param = luigi.IntParameter(batch_method=int.__add__)  # should be sum\n\n        bad_tasks = [BadBatchMethodTask(i) for i in range(5)]\n        good_tasks = [DummyTask()]\n        all_tasks = good_tasks + bad_tasks\n\n        self.assertFalse(any(task.complete() for task in all_tasks))\n\n        worker = Worker(scheduler=Scheduler(retry_count=1), keep_alive=True)\n\n        for task in all_tasks:\n            self.assertTrue(worker.add(task))\n        self.assertFalse(worker.run())\n        self.assertFalse(any(task.complete() for task in bad_tasks))\n\n        # we only get to run the good task if the bad task failures were handled gracefully\n        self.assertTrue(all(task.complete() for task in good_tasks))\n\n    def test_post_error_message_for_failed_batch_methods(self):\n        class BadBatchMethodTask(DummyTask):\n            batch_int_param = luigi.IntParameter(batch_method=int.__add__)  # should be sum\n\n        tasks = [BadBatchMethodTask(1), BadBatchMethodTask(2)]\n\n        for task in tasks:\n            self.assertTrue(self.w.add(task))\n        self.assertFalse(self.w.run())\n\n        failed_ids = set(self.sch.task_list(\"FAILED\", \"\"))\n        self.assertEqual({task.task_id for task in tasks}, failed_ids)\n        self.assertTrue(all(self.sch.fetch_error(task_id)[\"error\"] for task_id in failed_ids))\n\n\nclass WorkerKeepAliveTests(LuigiTestCase):\n    def setUp(self):\n        self.sch = Scheduler()\n        super(WorkerKeepAliveTests, self).setUp()\n\n    def _worker_keep_alive_test(self, first_should_live, second_should_live, task_status=None, **worker_args):\n        worker_args.update(\n            {\n                \"scheduler\": self.sch,\n                \"worker_processes\": 0,\n                \"wait_interval\": 0.01,\n                \"wait_jitter\": 0.0,\n            }\n        )\n        w1 = Worker(worker_id=\"w1\", **worker_args)\n        w2 = 
Worker(worker_id=\"w2\", **worker_args)\n        with w1 as worker1, w2 as worker2:\n            worker1.add(DummyTask())\n            t1 = threading.Thread(target=worker1.run)\n            t1.start()\n\n            worker2.add(DummyTask())\n            t2 = threading.Thread(target=worker2.run)\n            t2.start()\n\n            if task_status:\n                self.sch.add_task(worker=\"DummyWorker\", task_id=DummyTask().task_id, status=task_status)\n\n            # allow workers to run their get work loops a few times\n            time.sleep(0.1)\n\n            try:\n                self.assertEqual(first_should_live, t1.is_alive())\n                self.assertEqual(second_should_live, t2.is_alive())\n\n            finally:\n                # mark the task done so the worker threads will die\n                self.sch.add_task(worker=\"DummyWorker\", task_id=DummyTask().task_id, status=\"DONE\")\n                t1.join()\n                t2.join()\n\n    def test_no_keep_alive(self):\n        self._worker_keep_alive_test(\n            first_should_live=False,\n            second_should_live=False,\n        )\n\n    def test_keep_alive(self):\n        self._worker_keep_alive_test(\n            first_should_live=True,\n            second_should_live=True,\n            keep_alive=True,\n        )\n\n    def test_keep_alive_count_uniques(self):\n        self._worker_keep_alive_test(\n            first_should_live=False,\n            second_should_live=False,\n            keep_alive=True,\n            count_uniques=True,\n        )\n\n    def test_keep_alive_count_last_scheduled(self):\n        self._worker_keep_alive_test(\n            first_should_live=False,\n            second_should_live=True,\n            keep_alive=True,\n            count_last_scheduled=True,\n        )\n\n    def test_keep_alive_through_failure(self):\n        self._worker_keep_alive_test(\n            first_should_live=True,\n            second_should_live=True,\n            
keep_alive=True,\n            task_status=\"FAILED\",\n        )\n\n    def test_do_not_keep_alive_through_disable(self):\n        self._worker_keep_alive_test(\n            first_should_live=False,\n            second_should_live=False,\n            keep_alive=True,\n            task_status=\"DISABLED\",\n        )\n\n\nclass WorkerInterruptedTest(unittest.TestCase):\n    def setUp(self):\n        self.sch = Scheduler(retry_delay=100, remove_delay=1000, worker_disconnect_delay=10)\n\n    requiring_sigusr = unittest.skipUnless(hasattr(signal, \"SIGUSR1\"), \"signal.SIGUSR1 not found on this system\")\n\n    def _test_stop_getting_new_work(self, worker):\n        d = DummyTask()\n        with worker:\n            worker.add(d)  # For assistant its ok that other tasks add it\n            self.assertFalse(d.complete())\n            worker.handle_interrupt(signal.SIGUSR1, None)\n            worker.run()\n            self.assertFalse(d.complete())\n\n    @requiring_sigusr\n    def test_stop_getting_new_work(self):\n        self._test_stop_getting_new_work(Worker(scheduler=self.sch))\n\n    @requiring_sigusr\n    def test_stop_getting_new_work_assistant(self):\n        self._test_stop_getting_new_work(Worker(scheduler=self.sch, keep_alive=False, assistant=True))\n\n    @requiring_sigusr\n    def test_stop_getting_new_work_assistant_keep_alive(self):\n        self._test_stop_getting_new_work(Worker(scheduler=self.sch, keep_alive=True, assistant=True))\n\n    def test_existence_of_disabling_option(self):\n        # any code equivalent of `os.kill(os.getpid(), signal.SIGUSR1)`\n        # seem to give some sort of a \"InvocationError\"\n        Worker(no_install_shutdown_handler=True)\n\n    @with_config({\"worker\": {\"no_install_shutdown_handler\": \"True\"}})\n    def test_can_run_luigi_in_thread(self):\n        class A(DummyTask):\n            pass\n\n        task = A()\n        # Note that ``signal.signal(signal.SIGUSR1, fn)`` can only be called in the main thread.\n    
    # So if we do not disable the shutdown handler, this would fail.\n        t = threading.Thread(target=lambda: luigi.build([task], local_scheduler=True))\n        t.start()\n        t.join()\n        self.assertTrue(task.complete())\n\n\nclass WorkerDisabledTest(LuigiTestCase):\n    def make_sch(self):\n        return Scheduler(retry_delay=100, remove_delay=1000, worker_disconnect_delay=10)\n\n    def _test_stop_getting_new_work_build(self, sch, worker):\n        \"\"\"\n        I got motivated to create this test case when I saw that the\n        execution_summary crashed after my first attempted solution.\n        \"\"\"\n\n        class KillWorkerTask(luigi.Task):\n            did_actually_run = False\n\n            def run(self):\n                sch.disable_worker(\"my_worker_id\")\n                KillWorkerTask.did_actually_run = True\n\n        class Factory:\n            def create_local_scheduler(self, *args, **kwargs):\n                return sch\n\n            def create_worker(self, *args, **kwargs):\n                return worker\n\n        luigi.build([KillWorkerTask()], worker_scheduler_factory=Factory(), local_scheduler=True)\n        self.assertTrue(KillWorkerTask.did_actually_run)\n\n    def _test_stop_getting_new_work_manual(self, sch, worker):\n        d = DummyTask()\n        with worker:\n            worker.add(d)  # For assistant its ok that other tasks add it\n            self.assertFalse(d.complete())\n            sch.disable_worker(\"my_worker_id\")\n            worker.run()  # Note: Test could fail by hanging on this line\n            self.assertFalse(d.complete())\n\n    def _test_stop_getting_new_work(self, **worker_kwargs):\n        worker_kwargs[\"worker_id\"] = \"my_worker_id\"\n\n        sch = self.make_sch()\n        worker_kwargs[\"scheduler\"] = sch\n        self._test_stop_getting_new_work_manual(sch, Worker(**worker_kwargs))\n\n        sch = self.make_sch()\n        worker_kwargs[\"scheduler\"] = sch\n        
self._test_stop_getting_new_work_build(sch, Worker(**worker_kwargs))\n\n    def test_stop_getting_new_work_keep_alive(self):\n        self._test_stop_getting_new_work(keep_alive=True, assistant=False)\n\n    def test_stop_getting_new_work_assistant(self):\n        self._test_stop_getting_new_work(keep_alive=False, assistant=True)\n\n    def test_stop_getting_new_work_assistant_keep_alive(self):\n        self._test_stop_getting_new_work(keep_alive=True, assistant=True)\n\n\nclass DynamicDependenciesTest(LuigiTestCase):\n    n_workers = 1\n    timeout = float(\"inf\")\n\n    def setUp(self):\n        self.p = tempfile.mkdtemp()\n\n    def tearDown(self):\n        shutil.rmtree(self.p)\n\n    def test_dynamic_dependencies(self, use_banana_task=False):\n        t0 = time.time()\n        t = DynamicRequires(p=self.p, use_banana_task=use_banana_task)\n        luigi.build([t], local_scheduler=True, workers=self.n_workers)\n        self.assertTrue(t.complete())\n\n        # loop through output and verify\n        with t.output().open(\"r\") as f:\n            for i in range(7):\n                self.assertEqual(f.readline().strip(), \"%d: Done!\" % i)\n\n        self.assertTrue(time.time() - t0 < self.timeout)\n\n    def test_dynamic_dependencies_with_namespace(self):\n        self.test_dynamic_dependencies(use_banana_task=True)\n\n    def test_dynamic_dependencies_other_module(self):\n        t = DynamicRequiresOtherModule(p=self.p)\n        luigi.build([t], local_scheduler=True, workers=self.n_workers)\n        self.assertTrue(t.complete())\n\n    def test_wrapped_dynamic_requirements(self):\n        t = DynamicRequiresWrapped(p=self.p)\n        luigi.build([t], local_scheduler=True, workers=1)\n        self.assertTrue(t.complete())\n        self.assertTrue(getattr(t, \"_custom_complete_called\", False))\n        self.assertTrue(getattr(t, \"_custom_complete_result\", False))\n\n\nclass DynamicDependenciesWithMultipleWorkersTest(DynamicDependenciesTest):\n    n_workers = 
100\n    timeout = 10.0  # We run 7 tasks that take 0.5s each so it should take less than 3.5s\n\n\nclass WorkerPingThreadTests(unittest.TestCase):\n    def test_ping_retry(self):\n        \"\"\"Worker ping fails once. Ping continues to try to connect to scheduler\n\n        Kind of ugly since it uses actual timing with sleep to test the thread\n        \"\"\"\n        sch = Scheduler(\n            retry_delay=100,\n            remove_delay=1000,\n            worker_disconnect_delay=10,\n        )\n\n        self._total_pings = 0  # class var so it can be accessed from fail_ping\n\n        def fail_ping(worker):\n            # this will be called from within keep-alive thread...\n            self._total_pings += 1\n            raise Exception(\"Some random exception\")\n\n        sch.ping = fail_ping\n\n        with Worker(\n            scheduler=sch,\n            worker_id=\"foo\",\n            ping_interval=0.01,  # very short between pings to make test fast\n        ):\n            # let the keep-alive thread run for a bit...\n            time.sleep(0.1)  # yes, this is ugly but it's exactly what we need to test\n        self.assertTrue(self._total_pings > 1, msg=\"Didn't retry pings (%d pings performed)\" % (self._total_pings,))\n\n    def test_ping_thread_shutdown(self):\n        with Worker(ping_interval=0.01) as w:\n            self.assertTrue(w._keep_alive_thread.is_alive())\n        self.assertFalse(w._keep_alive_thread.is_alive())\n\n\ndef email_patch(test_func, email_config=None):\n    EMAIL_CONFIG = {\"email\": {\"receiver\": \"not-a-real-email-address-for-test-only\", \"force_send\": \"true\"}}\n    if email_config is not None:\n        EMAIL_CONFIG.update(email_config)\n    emails = []\n\n    def mock_send_email(sender, recipients, msg):\n        emails.append(msg)\n\n    @with_config(EMAIL_CONFIG)\n    @functools.wraps(test_func)\n    @mock.patch(\"smtplib.SMTP\")\n    def run_test(self, smtp):\n        smtp().sendmail.side_effect = mock_send_email\n 
       test_func(self, emails)\n\n    return run_test\n\n\ndef custom_email_patch(config):\n    return functools.partial(email_patch, email_config=config)\n\n\nclass WorkerEmailTest(LuigiTestCase):\n    def run(self, result=None):\n        super(WorkerEmailTest, self).setUp()\n        sch = Scheduler(retry_delay=100, remove_delay=1000, worker_disconnect_delay=10)\n        with Worker(scheduler=sch, worker_id=\"foo\") as self.worker:\n            super(WorkerEmailTest, self).run(result)\n\n    @email_patch\n    def test_connection_error(self, emails):\n        sch = RemoteScheduler(\"http://tld.invalid:1337\", connect_timeout=1)\n        sch._rpc_retry_wait = 1  # shorten wait time to speed up tests\n\n        class A(DummyTask):\n            pass\n\n        a = A()\n        self.assertEqual(emails, [])\n        with Worker(scheduler=sch) as worker:\n            try:\n                worker.add(a)\n            except RPCError as e:\n                self.assertTrue(str(e).find(\"Errors (3 attempts)\") != -1)\n                self.assertNotEqual(emails, [])\n                self.assertTrue(emails[0].find(\"Luigi: Framework error while scheduling %s\" % (a,)) != -1)\n            else:\n                self.fail()\n\n    @email_patch\n    def test_complete_error(self, emails):\n        class A(DummyTask):\n            def complete(self):\n                raise Exception(\"b0rk\")\n\n        a = A()\n        self.assertEqual(emails, [])\n        self.worker.add(a)\n        self.assertTrue(emails[0].find(\"Luigi: %s failed scheduling\" % (a,)) != -1)\n        self.worker.run()\n        self.assertTrue(emails[0].find(\"Luigi: %s failed scheduling\" % (a,)) != -1)\n        self.assertFalse(a.has_run)\n\n    @with_config({\"batch_email\": {\"email_interval\": \"0\"}, \"worker\": {\"send_failure_email\": \"False\"}})\n    @email_patch\n    def test_complete_error_email_batch(self, emails):\n        class A(DummyTask):\n            def complete(self):\n                raise 
Exception(\"b0rk\")\n\n        scheduler = Scheduler(batch_emails=True)\n        worker = Worker(scheduler)\n        a = A()\n        self.assertEqual(emails, [])\n        worker.add(a)\n        self.assertEqual(emails, [])\n        worker.run()\n        self.assertEqual(emails, [])\n        self.assertFalse(a.has_run)\n        scheduler.prune()\n        self.assertTrue(\"1 scheduling failure\" in emails[0])\n\n    @with_config({\"batch_email\": {\"email_interval\": \"0\"}, \"worker\": {\"send_failure_email\": \"False\"}})\n    @email_patch\n    def test_complete_error_email_batch_to_owner(self, emails):\n        class A(DummyTask):\n            owner_email = \"a_owner@test.com\"\n\n            def complete(self):\n                raise Exception(\"b0rk\")\n\n        scheduler = Scheduler(batch_emails=True)\n        worker = Worker(scheduler)\n        a = A()\n        self.assertEqual(emails, [])\n        worker.add(a)\n        self.assertEqual(emails, [])\n        worker.run()\n        self.assertEqual(emails, [])\n        self.assertFalse(a.has_run)\n        scheduler.prune()\n        self.assertTrue(any(\"1 scheduling failure\" in email and \"a_owner@test.com\" in email for email in emails))\n\n    @email_patch\n    def test_announce_scheduling_failure_unexpected_error(self, emails):\n\n        class A(DummyTask):\n            owner_email = \"a_owner@test.com\"\n\n            def complete(self):\n                pass\n\n        scheduler = Scheduler(batch_emails=True)\n        worker = Worker(scheduler)\n        a = A()\n\n        with mock.patch.object(worker._scheduler, \"announce_scheduling_failure\", side_effect=Exception(\"Unexpected\")), self.assertRaises(Exception):\n            worker.add(a)\n        self.assertTrue(len(emails) == 2)  # One for `complete` error, one for exception in announcing.\n        self.assertTrue(\"Luigi: Framework error while scheduling\" in emails[1])\n        self.assertTrue(\"a_owner@test.com\" in emails[1])\n\n    
@email_patch\n    def test_requires_error(self, emails):\n        class A(DummyTask):\n            def requires(self):\n                raise Exception(\"b0rk\")\n\n        a = A()\n        self.assertEqual(emails, [])\n        self.worker.add(a)\n        self.assertTrue(emails[0].find(\"Luigi: %s failed scheduling\" % (a,)) != -1)\n        self.worker.run()\n        self.assertFalse(a.has_run)\n\n    @with_config({\"batch_email\": {\"email_interval\": \"0\"}, \"worker\": {\"send_failure_email\": \"False\"}})\n    @email_patch\n    def test_requires_error_email_batch(self, emails):\n        class A(DummyTask):\n            def requires(self):\n                raise Exception(\"b0rk\")\n\n        scheduler = Scheduler(batch_emails=True)\n        worker = Worker(scheduler)\n        a = A()\n        self.assertEqual(emails, [])\n        worker.add(a)\n        self.assertEqual(emails, [])\n        worker.run()\n        self.assertFalse(a.has_run)\n        scheduler.prune()\n        self.assertTrue(\"1 scheduling failure\" in emails[0])\n\n    @email_patch\n    def test_complete_return_value(self, emails):\n        class A(DummyTask):\n            def complete(self):\n                pass  # no return value should be an error\n\n        a = A()\n        self.assertEqual(emails, [])\n        self.worker.add(a)\n        self.assertTrue(emails[0].find(\"Luigi: %s failed scheduling\" % (a,)) != -1)\n        self.worker.run()\n        self.assertTrue(emails[0].find(\"Luigi: %s failed scheduling\" % (a,)) != -1)\n        self.assertFalse(a.has_run)\n\n    @with_config({\"batch_email\": {\"email_interval\": \"0\"}, \"worker\": {\"send_failure_email\": \"False\"}})\n    @email_patch\n    def test_complete_return_value_email_batch(self, emails):\n        class A(DummyTask):\n            def complete(self):\n                pass  # no return value should be an error\n\n        scheduler = Scheduler(batch_emails=True)\n        worker = Worker(scheduler)\n        a = A()\n        
self.assertEqual(emails, [])\n        worker.add(a)\n        self.assertEqual(emails, [])\n        self.worker.run()\n        self.assertEqual(emails, [])\n        self.assertFalse(a.has_run)\n        scheduler.prune()\n        self.assertTrue(\"1 scheduling failure\" in emails[0])\n\n    @email_patch\n    def test_run_error(self, emails):\n        class A(luigi.Task):\n            def run(self):\n                raise Exception(\"b0rk\")\n\n        a = A()\n        luigi.build([a], workers=1, local_scheduler=True)\n        self.assertEqual(1, len(emails))\n        self.assertTrue(emails[0].find(\"Luigi: %s FAILED\" % (a,)) != -1)\n\n    @email_patch\n    def test_run_error_long_traceback(self, emails):\n        class A(luigi.Task):\n            def run(self):\n                raise Exception(\"b0rk\" * 10500)\n\n        a = A()\n        luigi.build([a], workers=1, local_scheduler=True)\n        self.assertTrue(len(emails[0]) < 10000)\n        self.assertTrue(emails[0].find(\"Traceback exceeds max length and has been truncated\"))\n\n    @with_config({\"batch_email\": {\"email_interval\": \"0\"}, \"worker\": {\"send_failure_email\": \"False\"}})\n    @email_patch\n    def test_run_error_email_batch(self, emails):\n        class A(luigi.Task):\n            owner_email = [\"a@test.com\", \"b@test.com\"]\n\n            def run(self):\n                raise Exception(\"b0rk\")\n\n        scheduler = Scheduler(batch_emails=True)\n        worker = Worker(scheduler)\n        worker.add(A())\n        worker.run()\n        scheduler.prune()\n        self.assertEqual(3, len(emails))\n        self.assertTrue(any(\"a@test.com\" in email for email in emails))\n        self.assertTrue(any(\"b@test.com\" in email for email in emails))\n\n    @with_config({\"batch_email\": {\"email_interval\": \"0\"}, \"worker\": {\"send_failure_email\": \"False\"}})\n    @email_patch\n    def test_run_error_batch_email_string(self, emails):\n        class A(luigi.Task):\n            owner_email = 
\"a@test.com\"\n\n            def run(self):\n                raise Exception(\"b0rk\")\n\n        scheduler = Scheduler(batch_emails=True)\n        worker = Worker(scheduler)\n        worker.add(A())\n        worker.run()\n        scheduler.prune()\n        self.assertEqual(2, len(emails))\n        self.assertTrue(any(\"a@test.com\" in email for email in emails))\n\n    @with_config({\"worker\": {\"send_failure_email\": \"False\"}})\n    @email_patch\n    def test_run_error_no_email(self, emails):\n        class A(luigi.Task):\n            def run(self):\n                raise Exception(\"b0rk\")\n\n        luigi.build([A()], workers=1, local_scheduler=True)\n        self.assertFalse(emails)\n\n    @staticmethod\n    def read_email(email_msg):\n        subject_obj, body_obj = email.parser.Parser().parsestr(email_msg).walk()\n        return str(subject_obj[\"Subject\"]), str(body_obj.get_payload(decode=True))\n\n    @email_patch\n    def test_task_process_dies_with_email(self, emails):\n        a = SendSignalTask(signal.SIGKILL)\n        luigi.build([a], workers=2, local_scheduler=True)\n        self.assertEqual(1, len(emails))\n        subject, body = self.read_email(emails[0])\n        self.assertIn(\"Luigi: {} FAILED\".format(a), subject)\n        self.assertIn(\"died unexpectedly with exit code -9\", body)\n\n    @with_config({\"worker\": {\"send_failure_email\": \"False\"}})\n    @email_patch\n    def test_task_process_dies_no_email(self, emails):\n        luigi.build([SendSignalTask(signal.SIGKILL)], workers=2, local_scheduler=True)\n        self.assertEqual([], emails)\n\n    @email_patch\n    def test_task_times_out(self, emails):\n        class A(luigi.Task):\n            worker_timeout = 0.0001\n\n            def run(self):\n                time.sleep(5)\n\n        a = A()\n        luigi.build([a], workers=2, local_scheduler=True)\n        self.assertEqual(1, len(emails))\n        subject, body = self.read_email(emails[0])\n        self.assertIn(\"Luigi: 
%s FAILED\" % (a,), subject)\n        self.assertIn(\"timed out after 0.0001 seconds and was terminated.\", body)\n\n    @with_config({\"worker\": {\"send_failure_email\": \"False\"}})\n    @email_patch\n    def test_task_times_out_no_email(self, emails):\n        class A(luigi.Task):\n            worker_timeout = 0.0001\n\n            def run(self):\n                time.sleep(5)\n\n        luigi.build([A()], workers=2, local_scheduler=True)\n        self.assertEqual([], emails)\n\n    @with_config(dict(worker=dict(retry_external_tasks=\"true\")))\n    @email_patch\n    def test_external_task_retries(self, emails):\n        \"\"\"\n        Test that we do not send error emails on the failures of external tasks\n        \"\"\"\n\n        class A(luigi.ExternalTask):\n            pass\n\n        a = A()\n        luigi.build([a], workers=2, local_scheduler=True)\n        self.assertEqual(emails, [])\n\n    @email_patch\n    def test_no_error(self, emails):\n        class A(DummyTask):\n            pass\n\n        a = A()\n        self.assertEqual(emails, [])\n        self.worker.add(a)\n        self.assertEqual(emails, [])\n        self.worker.run()\n        self.assertEqual(emails, [])\n        self.assertTrue(a.complete())\n\n    @custom_email_patch({\"email\": {\"receiver\": \"not-a-real-email-address-for-test-only\", \"format\": \"none\"}})\n    def test_disable_emails(self, emails):\n        class A(luigi.Task):\n            def complete(self):\n                raise Exception(\"b0rk\")\n\n        self.worker.add(A())\n        self.assertEqual(emails, [])\n\n\nclass RaiseSystemExit(luigi.Task):\n    def run(self):\n        raise SystemExit(\"System exit!!\")\n\n\nclass SendSignalTask(luigi.Task):\n    signal = luigi.IntParameter()\n\n    def run(self):\n        os.kill(os.getpid(), self.signal)\n\n\nclass HangTheWorkerTask(luigi.Task):\n    worker_timeout = luigi.IntParameter(default=None)\n\n    def run(self):\n        while True:\n            pass\n\n    def 
complete(self):\n        return False\n\n\nclass MultipleWorkersTest(LuigiTestCase):\n    @unittest.skip(\"Always skip. There are many intermittent failures\")\n    def test_multiple_workers(self):\n        # Test using multiple workers\n        # Also test generating classes dynamically since this may reflect issues with\n        # various platform and how multiprocessing is implemented. If it's using os.fork\n        # under the hood it should be fine, but dynamic classses can't be pickled, so\n        # other implementations of multiprocessing (using spawn etc) may fail\n        class MyDynamicTask(luigi.Task):\n            x = luigi.Parameter()\n\n            def run(self):\n                time.sleep(0.1)\n\n        t0 = time.time()\n        luigi.build([MyDynamicTask(i) for i in range(100)], workers=100, local_scheduler=True)\n        self.assertTrue(time.time() < t0 + 5.0)  # should ideally take exactly 0.1s, but definitely less than 10.0\n\n    def test_zero_workers(self):\n        d = DummyTask()\n        luigi.build([d], workers=0, local_scheduler=True)\n        self.assertFalse(d.complete())\n\n    def test_system_exit(self):\n        # This would hang indefinitely before this fix:\n        # https://github.com/spotify/luigi/pull/439\n        luigi.build([RaiseSystemExit()], workers=2, local_scheduler=True)\n\n    def test_term_worker(self):\n        luigi.build([SendSignalTask(signal.SIGTERM)], workers=2, local_scheduler=True)\n\n    def test_kill_worker(self):\n        luigi.build([SendSignalTask(signal.SIGKILL)], workers=2, local_scheduler=True)\n\n    def test_purge_multiple_workers(self):\n        w = Worker(worker_processes=2, wait_interval=0.01)\n        t1 = SendSignalTask(signal.SIGTERM)\n        t2 = SendSignalTask(signal.SIGKILL)\n        w.add(t1)\n        w.add(t2)\n\n        w._run_task(t1.task_id)\n        w._run_task(t2.task_id)\n        time.sleep(1.0)\n\n        w._handle_next_task()\n        w._handle_next_task()\n        
w._handle_next_task()\n\n    def test_stop_worker_kills_subprocesses(self):\n        with Worker(worker_processes=2) as w:\n            hung_task = HangTheWorkerTask()\n            w.add(hung_task)\n\n            w._run_task(hung_task.task_id)\n            pids = [p.pid for p in w._running_tasks.values()]\n            self.assertEqual(1, len(pids))\n            pid = pids[0]\n\n            def is_running():\n                return pid in {p.pid for p in psutil.Process().children()}\n\n            self.assertTrue(is_running())\n        self.assertFalse(is_running())\n\n    @mock.patch(\"luigi.worker.time\")\n    def test_no_process_leak_from_repeatedly_running_same_task(self, worker_time):\n        with Worker(worker_processes=2) as w:\n            hung_task = HangTheWorkerTask()\n            w.add(hung_task)\n\n            w._run_task(hung_task.task_id)\n            children = set(psutil.Process().children())\n\n            # repeatedly try to run the same task id\n            for _ in range(10):\n                worker_time.sleep.reset_mock()\n                w._run_task(hung_task.task_id)\n\n                # should sleep after each attempt\n                worker_time.sleep.assert_called_once_with(mock.ANY)\n\n            # only one process should be running\n            self.assertEqual(children, set(psutil.Process().children()))\n\n    def test_time_out_hung_worker(self):\n        luigi.build([HangTheWorkerTask(0.1)], workers=2, local_scheduler=True)\n\n    def test_time_out_hung_single_worker(self):\n        luigi.build([HangTheWorkerTask(0.1)], workers=1, local_scheduler=True)\n\n    @skipOnTravisAndGithubActions(\"https://travis-ci.org/spotify/luigi/jobs/72953986\")\n    @mock.patch(\"luigi.worker.time\")\n    def test_purge_hung_worker_default_timeout_time(self, mock_time):\n        w = Worker(worker_processes=2, wait_interval=0.01, timeout=5)\n        mock_time.time.return_value = 0\n        task = HangTheWorkerTask()\n        w.add(task)\n        
w._run_task(task.task_id)\n\n        mock_time.time.return_value = 5\n        w._handle_next_task()\n        self.assertEqual(1, len(w._running_tasks))\n\n        mock_time.time.return_value = 6\n        w._handle_next_task()\n        self.assertEqual(0, len(w._running_tasks))\n\n    @skipOnTravisAndGithubActions(\"https://travis-ci.org/spotify/luigi/jobs/76645264\")\n    @mock.patch(\"luigi.worker.time\")\n    def test_purge_hung_worker_override_timeout_time(self, mock_time):\n        w = Worker(worker_processes=2, wait_interval=0.01, timeout=5)\n        mock_time.time.return_value = 0\n        task = HangTheWorkerTask(worker_timeout=10)\n        w.add(task)\n        w._run_task(task.task_id)\n\n        mock_time.time.return_value = 10\n        w._handle_next_task()\n        self.assertEqual(1, len(w._running_tasks))\n\n        mock_time.time.return_value = 11\n        w._handle_next_task()\n        self.assertEqual(0, len(w._running_tasks))\n\n\nclass Dummy2Task(Task):\n    p = luigi.Parameter()\n\n    def output(self):\n        return MockTarget(self.p)\n\n    def run(self):\n        f = self.output().open(\"w\")\n        f.write(\"test\")\n        f.close()\n\n\nclass AssistantTest(LuigiTestCase):\n    def run(self, result=None):\n        self.sch = Scheduler(retry_delay=100, remove_delay=1000, worker_disconnect_delay=10)\n        self.assistant = Worker(scheduler=self.sch, worker_id=\"Y\", assistant=True)\n        with Worker(scheduler=self.sch, worker_id=\"X\") as w:\n            self.w = w\n            super(AssistantTest, self).run(result)\n\n    def test_get_work(self):\n        d = Dummy2Task(\"123\")\n        self.w.add(d)\n\n        self.assertFalse(d.complete())\n        self.assistant.run()\n        self.assertTrue(d.complete())\n\n    def test_bad_job_type(self):\n        class Dummy3Task(Dummy2Task):\n            task_family = \"UnknownTaskFamily\"\n\n        d = Dummy3Task(\"123\")\n        self.w.add(d)\n\n        self.assertFalse(d.complete())\n  
      self.assertFalse(self.assistant.run())\n        self.assertFalse(d.complete())\n        self.assertEqual(list(self.sch.task_list(\"FAILED\", \"\").keys()), [d.task_id])\n\n    def test_unimported_job_type(self):\n        MODULE_CONTENTS = b\"\"\"\nimport luigi\n\n\nclass UnimportedTask(luigi.Task):\n    def complete(self):\n        return False\n\"\"\"\n        reg = luigi.task_register.Register._get_reg()\n\n        class UnimportedTask(luigi.Task):\n            task_module = None  # Set it here, so it's generally settable\n\n        luigi.task_register.Register._set_reg(reg)\n\n        task = UnimportedTask()\n\n        # verify that it can't run the task without the module info necessary to import it\n        self.w.add(task)\n        self.assertFalse(self.assistant.run())\n        self.assertEqual(list(self.sch.task_list(\"FAILED\", \"\").keys()), [task.task_id])\n\n        # check that it can import with the right module\n        with temporary_unloaded_module(MODULE_CONTENTS) as task.task_module:\n            self.w.add(task)\n            self.assertTrue(self.assistant.run())\n            self.assertEqual(list(self.sch.task_list(\"DONE\", \"\").keys()), [task.task_id])\n\n    def test_unimported_job_sends_failure_message(self):\n        class NotInAssistantTask(luigi.Task):\n            task_family = \"Unknown\"\n            task_module = None\n\n        task = NotInAssistantTask()\n        self.w.add(task)\n        self.assertFalse(self.assistant.run())\n        self.assertEqual(list(self.sch.task_list(\"FAILED\", \"\").keys()), [task.task_id])\n        self.assertTrue(self.sch.fetch_error(task.task_id)[\"error\"])\n\n\nclass ForkBombTask(luigi.Task):\n    depth = luigi.IntParameter()\n    breadth = luigi.IntParameter()\n    p = luigi.Parameter(default=(0,))  # ehm for some weird reason [0] becomes a tuple...?\n\n    def output(self):\n        return MockTarget(\".\".join(map(str, self.p)))\n\n    def run(self):\n        with self.output().open(\"w\") 
as f:\n            f.write(\"Done!\")\n\n    def requires(self):\n        if len(self.p) < self.depth:\n            for i in range(self.breadth):\n                yield ForkBombTask(self.depth, self.breadth, self.p + (i,))\n\n\nclass TaskLimitTest(unittest.TestCase):\n    def tearDown(self):\n        MockFileSystem().remove(\"\")\n\n    @with_config({\"worker\": {\"task_limit\": \"6\"}})\n    def test_task_limit_exceeded(self):\n        w = Worker()\n        t = ForkBombTask(3, 2)\n        w.add(t)\n        w.run()\n        self.assertFalse(t.complete())\n        leaf_tasks = [ForkBombTask(3, 2, branch) for branch in [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1)]]\n        self.assertEqual(\n            3,\n            sum(t.complete() for t in leaf_tasks),\n            \"should have gracefully completed as much as possible even though the single last leaf didn't get scheduled\",\n        )\n\n    @with_config({\"worker\": {\"task_limit\": \"7\"}})\n    def test_task_limit_not_exceeded(self):\n        w = Worker()\n        t = ForkBombTask(3, 2)\n        w.add(t)\n        w.run()\n        self.assertTrue(t.complete())\n\n    def test_no_task_limit(self):\n        w = Worker()\n        t = ForkBombTask(4, 2)\n        w.add(t)\n        w.run()\n        self.assertTrue(t.complete())\n\n\nclass WorkerConfigurationTest(unittest.TestCase):\n    def test_asserts_for_worker(self):\n        \"\"\"\n        Test that Worker() asserts that it's sanely configured\n        \"\"\"\n        Worker(wait_interval=1)  # This shouldn't raise\n        self.assertRaises(AssertionError, Worker, wait_interval=0)\n\n\nclass WorkerWaitJitterTest(unittest.TestCase):\n    @with_config({\"worker\": {\"wait_jitter\": \"10.0\"}})\n    @mock.patch(\"random.uniform\")\n    @mock.patch(\"time.sleep\")\n    def test_wait_jitter(self, mock_sleep, mock_random):\n        \"\"\"verify configured jitter amount\"\"\"\n        mock_random.return_value = 1.0\n\n        w = Worker()\n        x = 
w._sleeper()\n        next(x)\n        mock_random.assert_called_with(0, 10.0)\n        mock_sleep.assert_called_with(2.0)\n\n        mock_random.return_value = 2.0\n        next(x)\n        mock_random.assert_called_with(0, 10.0)\n        mock_sleep.assert_called_with(3.0)\n\n    @mock.patch(\"random.uniform\")\n    @mock.patch(\"time.sleep\")\n    def test_wait_jitter_default(self, mock_sleep, mock_random):\n        \"\"\"verify default jitter is as expected\"\"\"\n        mock_random.return_value = 1.0\n        w = Worker()\n        x = w._sleeper()\n        next(x)\n        mock_random.assert_called_with(0, 5.0)\n        mock_sleep.assert_called_with(2.0)\n\n        mock_random.return_value = 3.3\n        next(x)\n        mock_random.assert_called_with(0, 5.0)\n        mock_sleep.assert_called_with(4.3)\n\n\nclass KeyboardInterruptBehaviorTest(LuigiTestCase):\n    def test_propagation_when_executing(self):\n        \"\"\"\n        Ensure that keyboard interrupts causes luigi to quit when you are\n        executing tasks.\n\n        TODO: Add a test that tests the multiprocessing (--worker >1) case\n        \"\"\"\n\n        class KeyboardInterruptTask(luigi.Task):\n            def run(self):\n                raise KeyboardInterrupt()\n\n        cmd = \"KeyboardInterruptTask --local-scheduler --no-lock\".split(\" \")\n        self.assertRaises(KeyboardInterrupt, luigi_run, cmd)\n\n    def test_propagation_when_scheduling(self):\n        \"\"\"\n        Test that KeyboardInterrupt causes luigi to quit while scheduling.\n        \"\"\"\n\n        class KeyboardInterruptTask(luigi.Task):\n            def complete(self):\n                raise KeyboardInterrupt()\n\n        class ExternalKeyboardInterruptTask(luigi.ExternalTask):\n            def complete(self):\n                raise KeyboardInterrupt()\n\n        self.assertRaises(KeyboardInterrupt, luigi_run, [\"KeyboardInterruptTask\", \"--local-scheduler\", \"--no-lock\"])\n        
self.assertRaises(KeyboardInterrupt, luigi_run, [\"ExternalKeyboardInterruptTask\", \"--local-scheduler\", \"--no-lock\"])\n\n\nclass WorkerPurgeEventHandlerTest(unittest.TestCase):\n    @mock.patch(\"luigi.worker.ContextManagedTaskProcess\")\n    def test_process_killed_handler(self, task_proc):\n        result = []\n\n        @HangTheWorkerTask.event_handler(Event.PROCESS_FAILURE)\n        def store_task(t, error_msg):\n            self.assertTrue(error_msg)\n            result.append(t)\n\n        w = Worker()\n        task = HangTheWorkerTask()\n        task_process = mock.MagicMock(is_alive=lambda: False, exitcode=-14, task=task)\n        task_proc.return_value = task_process\n\n        w.add(task)\n        w._run_task(task.task_id)\n        w._handle_next_task()\n\n        self.assertEqual(result, [task])\n\n    @mock.patch(\"luigi.worker.time\")\n    def test_timeout_handler(self, mock_time):\n        result = []\n\n        @HangTheWorkerTask.event_handler(Event.TIMEOUT)\n        def store_task(t, error_msg):\n            self.assertTrue(error_msg)\n            result.append(t)\n\n        w = Worker(worker_processes=2, wait_interval=0.01, timeout=5)\n        mock_time.time.return_value = 0\n        task = HangTheWorkerTask(worker_timeout=1)\n        w.add(task)\n        w._run_task(task.task_id)\n\n        mock_time.time.return_value = 3\n        w._handle_next_task()\n\n        self.assertEqual(result, [task])\n\n    @mock.patch(\"luigi.worker.time\")\n    def test_timeout_handler_single_worker(self, mock_time):\n        result = []\n\n        @HangTheWorkerTask.event_handler(Event.TIMEOUT)\n        def store_task(t, error_msg):\n            self.assertTrue(error_msg)\n            result.append(t)\n\n        w = Worker(wait_interval=0.01, timeout=5)\n        mock_time.time.return_value = 0\n        task = HangTheWorkerTask(worker_timeout=1)\n        w.add(task)\n        w._run_task(task.task_id)\n\n        mock_time.time.return_value = 3\n        
w._handle_next_task()\n\n        self.assertEqual(result, [task])\n\n\nclass PerTaskRetryPolicyBehaviorTest(LuigiTestCase):\n    def setUp(self):\n        super(PerTaskRetryPolicyBehaviorTest, self).setUp()\n        self.per_task_retry_count = 3\n        self.default_retry_count = 1\n        self.sch = Scheduler(retry_delay=0.1, retry_count=self.default_retry_count, prune_on_get_work=True)\n\n    def test_with_all_disabled_with_single_worker(self):\n        \"\"\"\n        With this test, a case which has a task (TestWrapperTask), requires two another tasks (TestErrorTask1,TestErrorTask1) which both is failed, is\n        tested.\n\n        Task TestErrorTask1 has default retry_count which is 1, but Task TestErrorTask2 has retry_count at task level as 2.\n\n        This test is running on single worker\n        \"\"\"\n\n        class TestErrorTask1(DummyErrorTask):\n            pass\n\n        e1 = TestErrorTask1()\n\n        class TestErrorTask2(DummyErrorTask):\n            retry_count = self.per_task_retry_count\n\n        e2 = TestErrorTask2()\n\n        class TestWrapperTask(luigi.WrapperTask):\n            def requires(self):\n                return [e2, e1]\n\n        wt = TestWrapperTask()\n\n        with Worker(scheduler=self.sch, worker_id=\"X\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w1:\n            self.assertTrue(w1.add(wt))\n\n            self.assertFalse(w1.run())\n\n            self.assertEqual([wt.task_id], list(self.sch.task_list(\"PENDING\", \"UPSTREAM_DISABLED\").keys()))\n\n            self.assertEqual(sorted([e1.task_id, e2.task_id]), sorted(self.sch.task_list(\"DISABLED\", \"\").keys()))\n\n            self.assertEqual(0, self.sch._state.get_task(wt.task_id).num_failures())\n            self.assertEqual(self.per_task_retry_count, self.sch._state.get_task(e2.task_id).num_failures())\n            self.assertEqual(self.default_retry_count, self.sch._state.get_task(e1.task_id).num_failures())\n\n    def 
test_with_all_disabled_with_multiple_worker(self):\n        \"\"\"\n        With this test, a case which has a task (TestWrapperTask), requires two another tasks (TestErrorTask1,TestErrorTask1) which both is failed, is\n        tested.\n\n        Task TestErrorTask1 has default retry_count which is 1, but Task TestErrorTask2 has retry_count at task level as 2.\n\n        This test is running on multiple worker\n        \"\"\"\n\n        class TestErrorTask1(DummyErrorTask):\n            pass\n\n        e1 = TestErrorTask1()\n\n        class TestErrorTask2(DummyErrorTask):\n            retry_count = self.per_task_retry_count\n\n        e2 = TestErrorTask2()\n\n        class TestWrapperTask(luigi.WrapperTask):\n            def requires(self):\n                return [e2, e1]\n\n        wt = TestWrapperTask()\n\n        with Worker(scheduler=self.sch, worker_id=\"X\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w1:\n            with Worker(scheduler=self.sch, worker_id=\"Y\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w2:\n                with Worker(scheduler=self.sch, worker_id=\"Z\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w3:\n                    self.assertTrue(w1.add(wt))\n                    self.assertTrue(w2.add(e2))\n                    self.assertTrue(w3.add(e1))\n\n                    self.assertFalse(w3.run())\n                    self.assertFalse(w2.run())\n                    self.assertTrue(w1.run())\n\n                    self.assertEqual([wt.task_id], list(self.sch.task_list(\"PENDING\", \"UPSTREAM_DISABLED\").keys()))\n\n                    self.assertEqual(sorted([e1.task_id, e2.task_id]), sorted(self.sch.task_list(\"DISABLED\", \"\").keys()))\n\n                    self.assertEqual(0, self.sch._state.get_task(wt.task_id).num_failures())\n                    self.assertEqual(self.per_task_retry_count, self.sch._state.get_task(e2.task_id).num_failures())\n                    
self.assertEqual(self.default_retry_count, self.sch._state.get_task(e1.task_id).num_failures())\n\n    def test_with_includes_success_with_single_worker(self):\n        \"\"\"\n        With this test, a case which has a task (TestWrapperTask), requires one (TestErrorTask1) FAILED and one (TestSuccessTask1) SUCCESS, is tested.\n\n        Task TestSuccessTask1 will be DONE successfully, but Task TestErrorTask1 will be failed and it has retry_count at task level as 2.\n\n        This test is running on single worker\n        \"\"\"\n\n        class TestSuccessTask1(DummyTask):\n            pass\n\n        s1 = TestSuccessTask1()\n\n        class TestErrorTask1(DummyErrorTask):\n            retry_count = self.per_task_retry_count\n\n        e1 = TestErrorTask1()\n\n        class TestWrapperTask(luigi.WrapperTask):\n            def requires(self):\n                return [e1, s1]\n\n        wt = TestWrapperTask()\n\n        with Worker(scheduler=self.sch, worker_id=\"X\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w1:\n            self.assertTrue(w1.add(wt))\n\n            self.assertFalse(w1.run())\n\n            self.assertEqual([wt.task_id], list(self.sch.task_list(\"PENDING\", \"UPSTREAM_DISABLED\").keys()))\n            self.assertEqual([e1.task_id], list(self.sch.task_list(\"DISABLED\", \"\").keys()))\n            self.assertEqual([s1.task_id], list(self.sch.task_list(\"DONE\", \"\").keys()))\n\n            self.assertEqual(0, self.sch._state.get_task(wt.task_id).num_failures())\n            self.assertEqual(self.per_task_retry_count, self.sch._state.get_task(e1.task_id).num_failures())\n            self.assertEqual(0, self.sch._state.get_task(s1.task_id).num_failures())\n\n    def test_with_includes_success_with_multiple_worker(self):\n        \"\"\"\n        With this test, a case which has a task (TestWrapperTask), requires one (TestErrorTask1) FAILED and one (TestSuccessTask1) SUCCESS, is tested.\n\n        Task TestSuccessTask1 will be DONE 
successfully, but Task TestErrorTask1 will be failed and it has retry_count at task level as 2.\n\n        This test is running on multiple worker\n        \"\"\"\n\n        class TestSuccessTask1(DummyTask):\n            pass\n\n        s1 = TestSuccessTask1()\n\n        class TestErrorTask1(DummyErrorTask):\n            retry_count = self.per_task_retry_count\n\n        e1 = TestErrorTask1()\n\n        class TestWrapperTask(luigi.WrapperTask):\n            def requires(self):\n                return [e1, s1]\n\n        wt = TestWrapperTask()\n\n        with Worker(scheduler=self.sch, worker_id=\"X\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w1:\n            with Worker(scheduler=self.sch, worker_id=\"Y\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w2:\n                with Worker(scheduler=self.sch, worker_id=\"Z\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w3:\n                    self.assertTrue(w1.add(wt))\n                    self.assertTrue(w2.add(e1))\n                    self.assertTrue(w3.add(s1))\n\n                    self.assertTrue(w3.run())\n                    self.assertFalse(w2.run())\n                    self.assertTrue(w1.run())\n\n                    self.assertEqual([wt.task_id], list(self.sch.task_list(\"PENDING\", \"UPSTREAM_DISABLED\").keys()))\n                    self.assertEqual([e1.task_id], list(self.sch.task_list(\"DISABLED\", \"\").keys()))\n                    self.assertEqual([s1.task_id], list(self.sch.task_list(\"DONE\", \"\").keys()))\n\n                    self.assertEqual(0, self.sch._state.get_task(wt.task_id).num_failures())\n                    self.assertEqual(self.per_task_retry_count, self.sch._state.get_task(e1.task_id).num_failures())\n                    self.assertEqual(0, self.sch._state.get_task(s1.task_id).num_failures())\n\n    def test_with_dynamic_dependencies_with_single_worker(self):\n        \"\"\"\n        With this test, a case includes dependency 
tasks(TestErrorTask1,TestErrorTask2) which both are failed.\n\n        Task TestErrorTask1 has default retry_count which is 1, but Task TestErrorTask2 has retry_count at task level as 2.\n\n        This test is running on single worker\n        \"\"\"\n\n        class TestErrorTask1(DummyErrorTask):\n            pass\n\n        e1 = TestErrorTask1()\n\n        class TestErrorTask2(DummyErrorTask):\n            retry_count = self.per_task_retry_count\n\n        e2 = TestErrorTask2()\n\n        class TestSuccessTask1(DummyTask):\n            pass\n\n        s1 = TestSuccessTask1()\n\n        class TestWrapperTask(DummyTask):\n            def requires(self):\n                return [s1]\n\n            def run(self):\n                super(TestWrapperTask, self).run()\n                yield e2, e1\n\n        wt = TestWrapperTask()\n\n        with Worker(scheduler=self.sch, worker_id=\"X\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w1:\n            self.assertTrue(w1.add(wt))\n\n            self.assertFalse(w1.run())\n\n            self.assertEqual([wt.task_id], list(self.sch.task_list(\"PENDING\", \"UPSTREAM_DISABLED\").keys()))\n\n            self.assertEqual(sorted([e1.task_id, e2.task_id]), sorted(self.sch.task_list(\"DISABLED\", \"\").keys()))\n\n            self.assertEqual(0, self.sch._state.get_task(wt.task_id).num_failures())\n            self.assertEqual(0, self.sch._state.get_task(s1.task_id).num_failures())\n            self.assertEqual(self.per_task_retry_count, self.sch._state.get_task(e2.task_id).num_failures())\n            self.assertEqual(self.default_retry_count, self.sch._state.get_task(e1.task_id).num_failures())\n\n    def test_with_dynamic_dependencies_with_multiple_workers(self):\n        \"\"\"\n        With this test, a case includes dependency tasks(TestErrorTask1,TestErrorTask2) which both are failed.\n\n        Task TestErrorTask1 has default retry_count which is 1, but Task TestErrorTask2 has retry_count at task level as 
2.\n\n        This test is running on multiple worker\n        \"\"\"\n\n        class TestErrorTask1(DummyErrorTask):\n            pass\n\n        e1 = TestErrorTask1()\n\n        class TestErrorTask2(DummyErrorTask):\n            retry_count = self.per_task_retry_count\n\n        e2 = TestErrorTask2()\n\n        class TestSuccessTask1(DummyTask):\n            pass\n\n        s1 = TestSuccessTask1()\n\n        class TestWrapperTask(DummyTask):\n            def requires(self):\n                return [s1]\n\n            def run(self):\n                super(TestWrapperTask, self).run()\n                yield e2, e1\n\n        wt = TestWrapperTask()\n\n        with Worker(scheduler=self.sch, worker_id=\"X\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w1:\n            with Worker(scheduler=self.sch, worker_id=\"Y\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w2:\n                self.assertTrue(w1.add(wt))\n                self.assertTrue(w2.add(s1))\n\n                self.assertTrue(w2.run())\n                self.assertFalse(w1.run())\n\n                self.assertEqual([wt.task_id], list(self.sch.task_list(\"PENDING\", \"UPSTREAM_DISABLED\").keys()))\n\n                self.assertEqual(sorted([e1.task_id, e2.task_id]), sorted(self.sch.task_list(\"DISABLED\", \"\").keys()))\n\n                self.assertEqual(0, self.sch._state.get_task(wt.task_id).num_failures())\n                self.assertEqual(0, self.sch._state.get_task(s1.task_id).num_failures())\n                self.assertEqual(self.per_task_retry_count, self.sch._state.get_task(e2.task_id).num_failures())\n                self.assertEqual(self.default_retry_count, self.sch._state.get_task(e1.task_id).num_failures())\n\n    def test_per_task_disable_persist_with_single_worker(self):\n        \"\"\"\n        Ensure that `Task.disable_window` impacts the task retrying policy:\n        - with the scheduler retry policy (disable_window=3), task fails twice and gets disabled\n        
- with the task retry policy (disable_window=0.5) task never gets into the DISABLED state\n        \"\"\"\n\n        class TwoErrorsThenSuccessTask(Task):\n            \"\"\"\n            The task is failing two times and then succeeds, waiting 1s before each try\n            \"\"\"\n\n            retry_index = 0\n            disable_window = None\n\n            def run(self):\n                time.sleep(1)\n                self.retry_index += 1\n                if self.retry_index < 3:\n                    raise Exception(\"Retry index is %s for %s\" % (self.retry_index, self.task_family))\n\n        t = TwoErrorsThenSuccessTask()\n\n        sch = Scheduler(retry_delay=0.1, retry_count=2, prune_on_get_work=True, disable_window=2)\n        with Worker(scheduler=sch, worker_id=\"X\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w:\n            self.assertTrue(w.add(t))\n            self.assertFalse(w.run())\n\n            self.assertEqual(2, t.retry_index)\n            self.assertEqual([t.task_id], list(sch.task_list(\"DISABLED\").keys()))\n            self.assertEqual(2, sch._state.get_task(t.task_id).num_failures())\n\n        t = TwoErrorsThenSuccessTask()\n        t.retry_index = 0\n        t.disable_window = 0.5\n\n        sch = Scheduler(retry_delay=0.1, retry_count=2, prune_on_get_work=True, disable_window=2)\n        with Worker(scheduler=sch, worker_id=\"X\", keep_alive=True, wait_interval=0.1, wait_jitter=0.05) as w:\n            self.assertTrue(w.add(t))\n            # Worker.run return False even if a task failed first but eventually succeeded.\n            self.assertFalse(w.run())\n\n            self.assertEqual(3, t.retry_index)\n            self.assertEqual([t.task_id], list(sch.task_list(\"DONE\").keys()))\n            self.assertEqual(1, len(sch._state.get_task(t.task_id).failures))\n"
  },
  {
    "path": "test/wrap_test.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Copyright 2012-2015 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nimport datetime\n\nfrom helpers import unittest\n\nimport luigi\nimport luigi.notifications\nfrom luigi.mock import MockTarget\nfrom luigi.util import inherits\n\nluigi.notifications.DEBUG = True\n\n\nclass A(luigi.Task):\n    task_namespace = \"wrap\"  # to prevent task name conflict between tests\n\n    def output(self):\n        return MockTarget(\"/tmp/a.txt\")\n\n    def run(self):\n        f = self.output().open(\"w\")\n        print(\"hello, world\", file=f)\n        f.close()\n\n\nclass B(luigi.Task):\n    date = luigi.DateParameter()\n\n    def output(self):\n        return MockTarget(self.date.strftime(\"/tmp/b-%Y-%m-%d.txt\"))\n\n    def run(self):\n        f = self.output().open(\"w\")\n        print(\"goodbye, space\", file=f)\n        f.close()\n\n\ndef XMLWrapper(cls):\n    @inherits(cls)\n    class XMLWrapperCls(luigi.Task):\n        def requires(self):\n            return self.clone_parent()\n\n        def run(self):\n            f = self.input().open(\"r\")\n            g = self.output().open(\"w\")\n            print('<?xml version=\"1.0\" ?>', file=g)\n            for line in f:\n                print(\"<dummy-xml>\" + line.strip() + \"</dummy-xml>\", file=g)\n            g.close()\n\n    return XMLWrapperCls\n\n\nclass AXML(XMLWrapper(A)):\n    def output(self):\n        return MockTarget(\"/tmp/a.xml\")\n\n\nclass 
BXML(XMLWrapper(B)):\n    def output(self):\n        return MockTarget(self.date.strftime(\"/tmp/b-%Y-%m-%d.xml\"))\n\n\nclass WrapperTest(unittest.TestCase):\n    \"\"\"This test illustrates how a task class can wrap another task class by modifying its behavior.\n\n    See instance_wrap_test.py for an example of how instances can wrap each other.\"\"\"\n\n    workers = 1\n\n    def setUp(self):\n        MockTarget.fs.clear()\n\n    def test_a(self):\n        luigi.build([AXML()], local_scheduler=True, no_lock=True, workers=self.workers)\n        self.assertEqual(MockTarget.fs.get_data(\"/tmp/a.xml\"), b'<?xml version=\"1.0\" ?>\\n<dummy-xml>hello, world</dummy-xml>\\n')\n\n    def test_b(self):\n        luigi.build([BXML(datetime.date(2012, 1, 1))], local_scheduler=True, no_lock=True, workers=self.workers)\n        self.assertEqual(MockTarget.fs.get_data(\"/tmp/b-2012-01-01.xml\"), b'<?xml version=\"1.0\" ?>\\n<dummy-xml>goodbye, space</dummy-xml>\\n')\n\n\nclass WrapperWithMultipleWorkersTest(WrapperTest):\n    workers = 7\n"
  },
  {
    "path": "tox.ini",
    "content": "[tox]\nrequires =\n    tox>=4.22 # `dependency_groups` needed\n    tox-uv>=1.19\nenvlist = py{310,311,312,313}-{cdh,hdp,core,contrib,apache,aws,gcloud,mysql,postgres,unixsocket,azureblob,dropbox}, visualiser, docs, lint, typecheck\nskipsdist = True\n\n[pytest]\naddopts = --cov=luigi --cov-report=xml -vv --strict-markers --ignore-glob=\"**/_*\" --fulltrace\ntestpaths = test\nmarkers =\n    contrib: tests related to luigi/contrib\n    apache: tests related to apache\n    aws: tests related to AWS\n    postgres: tests related to postgresql\n    mysql: tests related to mysql\n    scheduler: tests related to scheduler\n    cdh: tests related to cdh\n    hdp: tests related to hdp\n    gcloud: tests related to GCP\n    unixsocket: tests related to unixsocket\n    dropbox: tests related to dropbox\n    azureblob: tests related to azure\n    unmarked: tests with no explicit markers\n\n[testenv]\nrunner = uv-venv-lock-runner\nallowlist_externals = {toxinidir}/scripts/ci/*.sh\ndependency_groups =\n    core: common\n    contrib: common\n    apache: common\n    aws: common\n    postgres: test_postgres\n    mysql: common\n    scheduler: common\n    cdh: test_cdh\n    hdp: test_hdp\n    gcloud: test_gcloud\n    unixsocket: test_unixsocket\n    dropbox: test_dropbox\n    azureblob: common\npassenv = USER, JAVA_HOME, POSTGRES_USER, DATAPROC_TEST_PROJECT_ID, GCS_TEST_PROJECT_ID, GCS_TEST_BUCKET, GOOGLE_APPLICATION_CREDENTIALS, CI, DROPBOX_APP_TOKEN, DOCKERHUB_TOKEN, GITHUB_ACTIONS, OVERRIDE_SKIP_CI_TESTS\nsetenv =\n    LC_ALL = en_US.utf-8\n    cdh: HADOOP_DISTRO=cdh\n    cdh: HADOOP_HOME={toxinidir}/.tox/hadoop-cdh\n    hdp: HADOOP_DISTRO=hdp\n    hdp: HADOOP_HOME={toxinidir}/.tox/hadoop-hdp\n    LUIGI_CONFIG_PATH={toxinidir}/test/testconfig/luigi.cfg\n    COVERAGE_PROCESS_START={toxinidir}/.coveragerc\n    FULL_COVERAGE=true\n    AWS_DEFAULT_REGION=us-east-1\n    AWS_ACCESS_KEY_ID=accesskey\n    AWS_SECRET_ACCESS_KEY=secretkey\n    
AZURITE_ACCOUNT_NAME=devstoreaccount1\n    AZURITE_ACCOUNT_KEY=YXp1cml0ZQ==\n    AZURITE_CUSTOM_DOMAIN=localhost:10000\ncommands =\n    # Setup\n    cdh,hdp: {toxinidir}/scripts/ci/setup_hadoop_env.sh\n    azureblob: {toxinidir}/scripts/ci/install_start_azurite.sh {toxinidir}/scripts/ci\n    {envpython} --version\n    # Test\n    contrib: {envpython} test/runtests.py test/contrib/ -m \"contrib or unmarked\" {posargs:}\n    apache: {envpython} test/runtests.py -m apache {posargs:}\n    aws: {envpython} test/runtests.py -m aws {posargs:}\n    mysql: {envpython} test/runtests.py -m mysql {posargs:}\n    postgres: {envpython} test/runtests.py -m postgres {posargs:}\n    scheduler: {envpython} test/runtests.py -m scheduler {posargs:}\n    cdh,hdp: {envpython} test/runtests.py -m minicluster {posargs:}\n    gcloud: {envpython} test/runtests.py -m gcloud {posargs:}\n    unixsocket: {envpython} test/runtests.py -m unixsocket {posargs:}\n    dropbox: {envpython} test/runtests.py -m dropbox {posargs:}\n    azureblob: {envpython} test/runtests.py -m azureblob {posargs:}\n    core: {envpython} test/runtests.py --doctest-modules -m \"not minicluster and not gcloud and not mysql and not postgres and not unixsocket and not contrib and not apache and not aws and not azureblob and not dropbox\" -n auto --dist=loadfile {posargs:}\n    # Teardown\n    azureblob: {toxinidir}/scripts/ci/stop_azurite.sh\n\n[testenv:visualiser]\nrunner = uv-venv-lock-runner\ndependency_groups = visualizer\npassenv = {[testenv]passenv}\nsetenv =\n    LC_ALL = en_US.utf-8\n    LUIGI_CONFIG_PATH={toxinidir}/test/testconfig/luigi.cfg\n    TEST_VISUALISER=1\ncommands =\n    python --version\n    pytest test/visualiser\n\n[testenv:lint]\ndependency_groups = lint\ncommands =\n    ruff check .\n    ruff format --check .\n\n[testenv:typecheck]\ndependency_groups = common\ncommands =\n    mypy luigi/\n\n[testenv:docs]\n# Python 3.13 required for Sphinx 9.x\nbasepython = py313\n# Build documentation using 
sphinx.\n# Call this using `tox run -e docs`.\ndependency_groups = docs\nsetenv =\n    AWS_DEFAULT_REGION=us-east-1\ncommands =\n    sphinx-build -W -b html -d {envtmpdir}/doctrees doc doc/_build/html\n"
  }
]